In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, sys, warnings
from feature_engineer import *

from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split 

import lightgbm as lgb
import catboost as ctb

# Notebook-wide settings: silence every warning and let pandas render all
# columns of a frame instead of eliding the middle ones.
warnings.filterwarnings(action='ignore')
pd.options.display.max_columns = None

In [3]:
# Columns taken straight from the CSV and fed to the model ('wap' is appended
# to this list at fit time).
raw_features = [
    'stock_id', 'seconds_in_bucket', 'imbalance_buy_sell_flag', 
    'reference_price', 'matched_size', 'far_price', 'near_price', 'bid_price',
    'bid_size', 'ask_price', 'ask_size',
    ]

# Subset of raw_features handed to LightGBM as categorical; these are left unscaled.
category_features = ['stock_id', 'seconds_in_bucket', 'imbalance_buy_sell_flag']

# Every remaining raw feature is standardised.
scale_cols = [x for x in raw_features if x not in category_features]

# Location of the training CSV. Set the TRAIN_CSV environment variable to run
# the notebook on another machine; the default is the original author's path.
TRAIN_CSV = os.environ.get(
    "TRAIN_CSV",
    "/home/lishi/projects/Competition/kaggle_2023/data/train.csv",
)

df = pd.read_csv(TRAIN_CSV)
df = df[df['target'].notna()]  # keep only rows that have a target value

print(df.shape)
print(f"Trading days: {df['date_id'].nunique()}")
print(f"Stocks: {df['stock_id'].nunique()}")

df = df.drop(columns=['time_id', 'row_id'])

# Standardise the continuous features and 'wap' with separate scalers so that
# 'wap' (and therefore 'wap_shift') can be mapped back on its own through
# scaler_wap.inverse_transform.
# NOTE(review): both scalers are fitted on every date before the CV split is
# made, so validation-fold statistics are part of the scaling.
scaler_raw = StandardScaler()
df[scale_cols] = scaler_raw.fit_transform(df[scale_cols])

scaler_wap = StandardScaler()
df[['wap']] = scaler_wap.fit_transform(df[['wap']])

# Regression target: the (scaled) wap six rows later within the same
# (stock_id, date_id) group. The last six rows of each group get NaN.
df['wap_shift'] = df.groupby(['stock_id', 'date_id'])['wap'].shift(-6)

# NOTE(review): dropna() removes rows with a NaN in ANY column, not only the
# rows without a wap_shift. In the recorded output the first surviving row for
# stock 0 / date 0 is at seconds_in_bucket 300 -- confirm that discarding the
# earlier rows is intended before relying on this.
df = df.dropna()

df[(df['stock_id'] == 0)&(df['date_id']==0)][['seconds_in_bucket', 'wap', 'wap_shift']].head(16)

(5237892, 17)
Trading days: 481
Stocks: 200


Unnamed: 0,seconds_in_bucket,wap,wap_shift
5730,300,0.060479,-0.401181
5921,310,-0.048429,-0.285065
6112,320,-0.066447,-0.289069
6303,330,-0.086467,-0.299479
6494,340,-0.302282,-0.297878
6685,350,-0.356336,-0.386766
6876,360,-0.401181,-0.458038
7067,370,-0.285065,-0.426406
7258,380,-0.289069,-0.404384
7449,390,-0.299479,-0.304284


In [None]:
# Number of cross-validation folds.
n_split = 5

# Distinct trading days; the split is done over days, not over rows, so a
# given day is never on both sides of a fold.
dates_list = df['date_id'].unique()

# Unshuffled K-fold: each validation block is a consecutive slice of dates_list.
k_fold = KFold(n_splits=n_split, shuffle=False, random_state=None)
kf_split = k_fold.split(dates_list)

# folds[i] == [train_date_ids, valid_date_ids]
folds = []
for train_idx, valid_idx in kf_split:
    folds.append([dates_list[train_idx], dates_list[valid_idx]])

# Print the date_id range covered by each validation block.
for i, fold in enumerate(folds):
    fold_valid_dates = fold[1]
    print(f"Valid {i}:{fold_valid_dates.min()} - {fold_valid_dates.max()}")

In [None]:
# Hyper-parameters passed to lgb.LGBMRegressor(**lgb_params).
# Values tried earlier are kept in the trailing comments.
lgb_params = dict(
    boosting_type='gbdt',
    learning_rate=0.015,        # previously 0.009, 0.018
    max_depth=12,               # previously 9
    n_estimators=800,           # previously 600
    num_leaves=1024,            # previously 440
    objective='mae',
    random_state=42,
    reg_alpha=0.01,
    reg_lambda=0.01,
    early_stopping_rounds=50,
    num_threads=16,
    importance_type='gain',
    verbose=-1,
)

In [None]:
# Train one LightGBM model per fold and score it on that fold's validation days.
mae_list = []
models = []

# Model inputs: the raw columns plus the scaled wap.
feature_cols = raw_features + ['wap']

# Share of the training days held out to drive early stopping.
early_stop_frac = 0.2

for i, [train_dates, valid_dates] in enumerate(folds):
    print(f"Fold {i}: {valid_dates.min()} - {valid_dates.max()}")

    # Rows belonging to this fold's training days / validation days.
    train_df = df[df['date_id'].isin(train_dates)]
    valid_df = df[df['date_id'].isin(valid_dates)]

    # Early-stopping hold-out: whole days, not random rows. A row-level random
    # split puts rows of the same stock and day, seconds apart, on both sides,
    # so the early-stopping metric closely follows the training loss and
    # stops being a useful signal. The seed is fixed so every run picks the
    # same days.
    shuffled_dates = np.random.RandomState(42).permutation(train_dates)
    n_holdout = int(np.ceil(early_stop_frac * len(shuffled_dates)))
    holdout_dates = shuffled_dates[:n_holdout]
    in_holdout = train_df['date_id'].isin(holdout_dates)

    X_train = train_df.loc[~in_holdout, feature_cols]
    y_train = train_df.loc[~in_holdout, 'wap_shift']

    # "Test" here is the early-stopping set, not a final hold-out.
    X_test = train_df.loc[in_holdout, feature_cols]
    y_test = train_df.loc[in_holdout, 'wap_shift']

    X_valid = valid_df[feature_cols]
    y_valid = valid_df['wap_shift']

    print(f"Train: {X_train.shape}, {y_train.shape}")
    print(f"Valid: {X_valid.shape}, {y_valid.shape}")
    print(f"Test: {X_test.shape}, {y_test.shape}")

    # NOTE(review): LightGBM treats negative values of a categorical feature as
    # missing. If imbalance_buy_sell_flag can be -1 (TODO confirm against the
    # data), that category is merged with "missing"; re-encode it to
    # non-negative integers consistently at train and inference time.
    model = lgb.LGBMRegressor(**lgb_params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='mae',
        feature_name = feature_cols,
        categorical_feature = category_features,
        callbacks=[lgb.callback.log_evaluation(period=100)],
        )

    models.append(model)

    # Out-of-fold score, in the units of the standardised wap.
    y_pred = model.predict(X_valid)
    mae = mean_absolute_error(y_valid, y_pred)
    print(f"MAE: {mae}")
    mae_list.append(mae)

# Cross-validation summary over all folds.
if mae_list:
    print(f"CV MAE: {np.mean(mae_list):.6f} +/- {np.std(mae_list):.6f}")