In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import catboost as ctb 

from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

In [2]:
train_parquet = "/home/lishi/projects/Competition/kaggle_2023/data/train_add_daily_features.parquet"

prices =  ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]
ta_indicators = ['ema', 'rsi', 'cci', 'mfi', 'ad_osc', 'macd', 'macdhist', 'macdsignal']

stock_labels = [0, 1, 2]

df_all = pd.read_parquet(train_parquet)

In [3]:
shenxian = df_all[df_all['stock_label']==1]['stock_id'].unique()
shenxian 

array([ 11,  31,  70,  80,  82,  85,  86,  92, 100, 101, 129, 158, 174,
       177, 180,  78, 153], dtype=int16)

In [4]:
lgb_params = {
    'boosting_type': 'gbdt',
    'learning_rate': 0.015, #0.009,#0.018,
    'max_depth': 12,#9,
    'n_estimators': 800,#600,
    'num_leaves': 1024,#440,
    'objective': 'mae',
    'random_state': 42,
    'reg_alpha': 0.01,
    'reg_lambda': 0.01,
    'early_stopping_rounds': None,
    'num_threads': 16,
    'importance_type': 'gain',
    'verbose': -1,
    }

In [5]:
choose_stock = 153 
n_split = 5

# df = df_all[df_all.stock_id == choose_stock]

df = df_all[~df_all['target'].isnull()].reset_index(drop=True)
df = df.drop(columns=['row_id', 'time_id'])
df['imbalance_buy_sell_flag'] = df['imbalance_buy_sell_flag'].replace({-1: 0, 1: 1})

feature_cols = [x for x in df.columns if x not in ['target', 'date_id', 'row_id'] + ta_indicators]
category_cols = ["stock_id", "seconds_in_bucket", 'imbalance_buy_sell_flag', 'stock_label']

scale_cols = [x for x in feature_cols if x not in category_cols+['wap']]
scaler = StandardScaler().fit(df[scale_cols])
df[scale_cols] = scaler.transform(df[scale_cols])

scaler_wap = StandardScaler().fit(df[['wap']])
df[['wap']] = scaler_wap.transform(df[['wap']])

# scaler_target = StandardScaler().fit(df[['target']])
# df[['target']] = scaler_target.transform(df[['target']])

df['wap_shift'] = df.groupby(['date_id', 'stock_id'])['wap'].shift(-6)
df = df.dropna().reset_index(drop=True)

target_col ='wap_shift'

dates_list = df['date_id'].unique()

k_fold = KFold(n_splits=n_split, shuffle=False, random_state=None)
kf_split = k_fold.split(dates_list)

mae_list = []
lgb_models = []

for fold, (train_idx, valid_idx) in enumerate(kf_split):
    
    print(f"Fold {fold+1}")

    train_dates = dates_list[train_idx]
    
    half_valid = int(len(valid_idx)/2)
    valid_dates_1 = dates_list[valid_idx[:half_valid]]
    valid_dates_2 = dates_list[valid_idx[half_valid:]]
    
    print(f"Valid Dates 1: {valid_dates_1[0]} - {valid_dates_1[-1]}")
    print(f"Valid Dates 2: {valid_dates_2[0]} - {valid_dates_2[-1]}")
    
    # split train and valid set
    df_train_fold = df[df["date_id"].isin(train_dates)].reset_index(drop=True)
    df_valid_fold_1 = df[df["date_id"].isin(valid_dates_1)].reset_index(drop=True)
    df_valid_fold_2 = df[df["date_id"].isin(valid_dates_2)].reset_index(drop=True)

    print("Start training LightGBM...")
    lgb_model = lgb.LGBMRegressor(**lgb_params)
    lgb_model.fit(
        df_train_fold[feature_cols].values, 
        df_train_fold[target_col].values,
        eval_set=[(
            df_valid_fold_1[feature_cols].values, 
            df_valid_fold_1[target_col].values
            )],
        feature_name = feature_cols,
        categorical_feature = category_cols,
        callbacks=[lgb.callback.log_evaluation(period=100)],
        )
    
    valid_pred_lgb = lgb_model.predict(df_valid_fold_2[feature_cols].values)

    # t0_target = df[df['date_id']==valid_dates_1[-1]]['target'].mean()

    # targets = 

    # target_reverse = scaler_target.inverse_transform(df_valid_fold_2[['target']])
    # pred_reverse = scaler_target.inverse_transform(valid_pred_lgb.reshape(-1, 1))

    valid_mae_lgb = mean_absolute_error(df_valid_fold_2[target_col].values, valid_pred_lgb)
    mae_list.append(valid_mae_lgb)
    print(f"Valid MAE: {valid_mae_lgb}")

    lgb_models.append(lgb_model)

  df['wap_shift'] = df.groupby(['date_id', 'stock_id'])['wap'].shift(-6)


Fold 1
Valid Dates 1: 0 - 47
Valid Dates 2: 48 - 96
Start training LightGBM...
[100]	valid_0's l1: 0.122223
[200]	valid_0's l1: 0.111573
[300]	valid_0's l1: 0.111038
[400]	valid_0's l1: 0.111108
[500]	valid_0's l1: 0.111241
[600]	valid_0's l1: 0.111509
[700]	valid_0's l1: 0.111512
[800]	valid_0's l1: 0.111517
Valid MAE: 0.12586312673791855
Fold 2
Valid Dates 1: 97 - 144
Valid Dates 2: 145 - 192
Start training LightGBM...
[100]	valid_0's l1: 0.185495
[200]	valid_0's l1: 0.165648
[300]	valid_0's l1: 0.16492
[400]	valid_0's l1: 0.164833
[500]	valid_0's l1: 0.164898
[600]	valid_0's l1: 0.164916
[700]	valid_0's l1: 0.165009
[800]	valid_0's l1: 0.165039
Valid MAE: 0.15826408476746937
Fold 3
Valid Dates 1: 193 - 240
Valid Dates 2: 241 - 288
Start training LightGBM...


In [None]:
choose_stock = 153 
select_date = df["date_id"].unique()[18]

df_test = df_all[(df_all["date_id"] == select_date) & (df_all.stock_id == choose_stock)].copy().reset_index(drop=True)
df_test = df_test.drop(columns=['date_id'])

test_pred_lgb = np.mean([
    lgb_model.predict(df_test[feature_cols].values) for lgb_model in lgb_models
    ], axis=0)

# test_pred_lgb = scaler_target.inverse_transform(test_pred_lgb.reshape(-1, 1))

df_test['wap_pred'] = test_pred_lgb
df_test['wap_pred_reverse'] = scaler_wap.inverse_transform(df_test[['wap_pred']])
df_test['wap_reverse'] = scaler_wap.inverse_transform(df_test[['wap']])
df_test['wap_shift_reverse'] = scaler_wap.inverse_transform(df_test[['wap_shift']])

fig, ax = plt.subplots(1, 1)

ax.plot(df_test['wap_reverse'], linestyle='dashed', label='wap')
ax.plot(df_test['wap_shift_reverse'], label='wap_shift')
ax.plot(df_test['wap_pred_reverse'], label='wap_pred')
ax.text(0.95, .05, f"MAE: {mean_absolute_error(df_test['wap_reverse'], df_test['wap_pred_reverse']):.4f}", transform=ax.transAxes, ha='right', color='green', fontsize=14)
ax.legend()

plt.show()

In [None]:
import pandas as pd

In [None]:
df_importance = pd.DataFrame()
df_importance['feature'] = lgb_models[0].feature_name_
for i, lgb_model in enumerate(lgb_models):
    df_importance[f'importance_{i}'] = lgb_model.feature_importances_

df_importance['importance'] = df_importance[[f'importance_{i}' for i in range(len(lgb_models))]].mean(axis=1)

df_importance = df_importance.sort_values(by='importance', ascending=False).reset_index(drop=True)

df_importance.head(20)

In [None]:
plt.figure(figsize=(8, 12))
sns.barplot(data=df_importance.head(40), x='importance', y='feature')
plt.axvline(np.percentile(df_importance['importance'], 80), color='red', linestyle='dashed')
plt.show()

In [None]:
thred = np.percentile(df_importance['importance'], 80)
np.argmax(df_importance['importance']<thred)

In [None]:
very_importants = df_importance[df_importance['importance']>=thred]['feature'].values
print(len(very_importants))
very_importants

In [None]:
[ x for x in very_importants if x not in raw_features+v3_features+daily_features ]

In [None]:
raw_features = [x for x in very_importants if x in prices+sizes+category_cols ]
raw_features

In [None]:
v3_features = [x for x in very_importants if x.endswith('_5')] 
v3_features

In [None]:
daily_features = [x for x in very_importants if x.endswith('_5d')]
daily_features

In [None]:
len(raw_features), len(v3_features), len(daily_features)