In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys, warnings
from time import time 
from create_feature import *

import lightgbm as lgb
import joblib, gc
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

from glob import glob

# Suppress every Python warning for the rest of the session. NOTE(review): this
# also hides pandas and library deprecation warnings that may point at real issues.
warnings.filterwarnings('ignore')
# Display all columns, and up to 200 rows, when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

In [2]:

def prepare_data(csv_file, feature_dicts, feature_versions=('v1', 'v2', 'v3'), nrows=None, save_csv=None):
    """Load a CSV, drop rows without a target and add the requested feature sets.

    Parameters
    ----------
    csv_file : str
        Path of the CSV to read. It must contain the columns 'target',
        'date_id' and 'stock_id'.
    feature_dicts : dict
        Must hold the lists 'prices' and 'sizes' (base column names). The
        caller's dict is left untouched; an updated shallow copy is returned.
    feature_versions : iterable of str
        Any of 'v1', 'v2', 'v3'; selects which gen_v*_features helpers run.
    nrows : int or None
        Forwarded to ``pd.read_csv`` to limit the number of rows read.
    save_csv : str or None
        When given, the processed frame is written to this path with
        ``DataFrame.to_csv(index=False)``.

    Returns
    -------
    (DataFrame, dict)
        The processed frame and the feature dictionary extended with
        'v1_features', 'v1_feature_category', 'v2_features' and 'v3_features'
        for the versions that were generated.
    """
    # Shallow copy so the keys added below do not leak into the caller's dict.
    feature_dicts = dict(feature_dicts)

    df = pd.read_csv(csv_file, nrows=nrows)
    # Rows without a target cannot be used; re-index after dropping them.
    df = df[df['target'].notnull()].reset_index(drop=True)

    df = df.fillna(0)
    # reduce_mem_usage comes from create_feature; presumably it downcasts dtypes.
    df = reduce_mem_usage(df, verbose=0)

    print(df.shape)
    print(f"Trading days: {df['date_id'].nunique()}")
    print(f"Stocks: {df['stock_id'].nunique()}")

    if 'v1' in feature_versions:
        df, v1_feat, v1_feat_cat = gen_v1_features(df, feature_dicts['prices'])
        feature_dicts['v1_features'] = v1_feat
        feature_dicts['v1_feature_category'] = v1_feat_cat

    # v2/v3 build on the v1 feature names. Fall back to an empty list (instead
    # of raising KeyError) when v1 was neither generated nor supplied.
    v1_features = feature_dicts.get('v1_features', [])

    if 'v2' in feature_versions:
        v2_feat_cols = feature_dicts['prices'] + feature_dicts['sizes'] + v1_features
        df, v2_features = gen_v2_features(df, v2_feat_cols)
        feature_dicts['v2_features'] = v2_features

    if 'v3' in feature_versions:
        df, v3_features = gen_v3_features(
            df,
            feature_dicts['prices'],
            feature_dicts['sizes'],
            v1_features,
        )
        feature_dicts['v3_features'] = v3_features

    # Generated features may contain NaN / +-inf; neutralise both to 0.
    df = df.fillna(0)
    df = df.replace([np.inf, -np.inf], 0)
    df = reduce_mem_usage(df, verbose=0)

    if save_csv is not None:
        df.to_csv(save_csv, index=False)

    return df, feature_dicts

In [3]:
# Base column groups handed to the feature generators in prepare_data.
feature_dicts = {
    'prices': ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"],
    'sizes':  ["matched_size", "bid_size", "ask_size", "imbalance_size"],
    "category": ["stock_id", "seconds_in_bucket", 'imbalance_buy_sell_flag']
    }

# Feature sets to generate.
feat_version = ['v1', 'v2', 'v3']

# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere without editing it.
train_csv = "/home/lishi/projects/Competition/kaggle_2023/data/train.csv"

# nrows=1000 reads only the first 1000 rows (a single trading day in the
# recorded output), which leaves no 55-day windows for the cells below.
# Use nrows=None for a real run.
df, feature_dicts = prepare_data(train_csv, feature_dicts, feat_version, nrows=1000, save_csv=None)

# gen_feature_cols comes from create_feature; it returns the model input
# columns and the subset of them treated as categorical.
feature_cols, category_cols = gen_feature_cols(feature_dicts)

print(len(feature_cols))
print(len(category_cols))

(1000, 17)
Trading days: 1
Stocks: 191
354
8


In [None]:
# Window settings: each window spans `date_range_len` positions in the list of
# trading days; `valid_len` is consumed later by train_on_subdf.
date_range_len = 55
valid_len = 5

date_ids = df['date_id'].unique()

# Pair every day with the day `date_range_len` positions after it. The two
# offset slices are empty when there are not enough days, so the result is
# then an empty list.
date_ranges = list(zip(date_ids[:-date_range_len], date_ids[date_range_len:]))

print(len(date_ranges))

In [None]:
def train_and_cross_validate(df_train, df_valid, feature_cols, category_cols, lgb_params, model_name, save_dir, scaler_file=None, n_splits=5):
    """Train one LightGBM regressor per date-based fold and score it on the hold-out set.

    The non-categorical features are standardised with a scaler fitted on
    ``df_train``. The distinct ``date_id`` values of ``df_train`` are split
    into ``n_splits`` contiguous folds (no shuffling). For each fold a model is
    fitted on the training dates, with the fold's validation dates passed as
    ``eval_set``. The reported MAE of every fold model is computed on
    ``df_valid`` (the hold-out frame), not on the fold's own validation part.

    Parameters
    ----------
    df_train, df_valid : DataFrame
        Must contain 'date_id', 'target' and every column in ``feature_cols``.
        Neither frame is modified; scaling is applied to internal copies.
    feature_cols : list of str
        Model input columns.
    category_cols : list of str
        Columns passed to LightGBM as categorical and excluded from scaling.
    lgb_params : dict
        Keyword arguments for ``lgb.LGBMRegressor``.
    model_name : str
        Prefix of the saved model files: ``{model_name}_fold_{k}.pkl``.
    save_dir : str
        Directory for the model files; created if it does not exist.
    scaler_file : str or None
        When given, the fitted scaler is dumped to this path.
    n_splits : int
        Number of folds.

    Returns
    -------
    (list, list)
        The fitted models and their hold-out MAE scores, one entry per fold.

    Raises
    ------
    Exception
        If the scaled training frame contains NaN or infinite values.
    """
    # Scale copies: writing into the caller's frames would double-scale them
    # when this function is called again with the same objects.
    df_train = df_train.copy()
    df_valid = df_valid.copy()

    scale_cols = [x for x in feature_cols if x not in category_cols]
    scaler = StandardScaler().fit(df_train[scale_cols])

    if scaler_file:
        joblib.dump(scaler, scaler_file)

    df_train[scale_cols] = scaler.transform(df_train[scale_cols])
    df_valid[scale_cols] = scaler.transform(df_valid[scale_cols])

    # Count NaN / inf per column of the training frame (not of a global frame).
    checked_cols, invalid_counts = [], []
    for col in df_train.columns:
        try:
            counts = [df_train[col].isnull().sum(), np.isinf(df_train[col]).sum()]
        except Exception:
            # np.isinf is undefined for non-numeric dtypes; skip those columns.
            print("Skip column: ", col)
            continue
        checked_cols.append(col)
        invalid_counts.append(counts)

    check_invalids = pd.DataFrame(invalid_counts, index=checked_cols, columns=['null', 'inf'])
    # Keep only the offending columns so the report points straight at them.
    has_invalids = check_invalids[(check_invalids != 0).any(axis=1)]

    if len(has_invalids) > 0:
        print("Invalid values were found in dataframe.")
        print(has_invalids)
        raise Exception("Invalid values in dataframe")

    # joblib.dump does not create missing directories.
    os.makedirs(save_dir, exist_ok=True)

    dates_list = df_train['date_id'].unique()
    k_fold = KFold(n_splits=n_splits, shuffle=False, random_state=None)
    kf_split = k_fold.split(dates_list)

    mae_scores = []
    models = []

    print(dates_list)

    print("Start Cross-validation...")
    for fold, (train_idx, valid_idx) in enumerate(kf_split):

        # Folds are formed on whole days so one day never straddles the split.
        train_dates = dates_list[train_idx]
        valid_dates = dates_list[valid_idx]

        print(f"Fold {fold+1}")
        fold_start = time()

        # split train and valid set
        df_train_fold = df_train[df_train["date_id"].isin(train_dates)]
        df_valid_fold = df_train[df_train["date_id"].isin(valid_dates)]

        print(f"Train : {df_train_fold.shape}, Valid : {df_valid_fold.shape}")

        print(f"Data preparation finished. Start training...")

        training_start = time()

        lgb_model = lgb.LGBMRegressor(**lgb_params)

        # The fold's validation days are only used as LightGBM's eval_set.
        lgb_model.fit(
            df_train_fold[feature_cols],
            df_train_fold['target'],
            eval_set=[(df_valid_fold[feature_cols], df_valid_fold['target'])],
            feature_name = feature_cols,
            categorical_feature = category_cols,
            callbacks=[lgb.callback.log_evaluation(period=100)],
            )

        models.append(lgb_model)

        model_file = f"{save_dir}/{model_name}_fold_{fold+1}.pkl"
        joblib.dump(lgb_model, model_file)

        print(f"Fold {fold+1} Trainning finished. Time elapsed: {time()-training_start:.2f}s")

        # Score on the hold-out frame, which no fold has trained on.
        y_pred_valid = lgb_model.predict(df_valid[feature_cols])
        mae = mean_absolute_error(df_valid['target'].values, y_pred_valid)
        mae_scores.append(mae)

        print(f"Fold {fold+1} MAE: {mae}")
        print(f"Fold {fold+1} Time elapsed: {time()-fold_start:.2f}s")

        del df_train_fold, df_valid_fold, y_pred_valid
        gc.collect()

    return models, mae_scores

In [None]:
def train_on_subdf(df, feature_cols, category_cols, date_range, valid_len, lgb_params,
                   save_dir="../data/small_lgbm/", scaler_file="../data/small_lgb_cv_scaler.pkl", n_splits=5):
    """Cross-validate LightGBM models on one date window of ``df``.

    The window is ``date_range[0] <= date_id < date_range[1]``. Its last
    ``valid_len`` date ids form the hold-out set and the rest is the training
    set. NOTE(review): the split subtracts ``valid_len`` from a ``date_id``
    value, so it assumes date ids are consecutive integers; confirm.

    Parameters
    ----------
    df : DataFrame
        Full feature frame with a 'date_id' column.
    feature_cols, category_cols, lgb_params
        Forwarded unchanged to ``train_and_cross_validate``.
    date_range : tuple
        ``(first_date_id, end_date_id)``; the end is exclusive.
    valid_len : int
        Width of the hold-out window in date-id units.
    save_dir : str
        Directory where the fold models are written.
    scaler_file : str or None
        Path where the fitted scaler is written.
    n_splits : int
        Number of cross-validation folds.

    Returns
    -------
    The ``(models, mae_scores)`` tuple returned by ``train_and_cross_validate``.
    """
    print(date_range)

    window_start, window_end = date_range[0], date_range[1]
    valid_start = window_end - valid_len

    df_train = df[(df['date_id'] >= window_start) & (df['date_id'] < valid_start)]
    df_valid = df[(df['date_id'] >= valid_start) & (df['date_id'] < window_end)]

    print(f"Train days: {df_train['date_id'].nunique()}")
    print(f"Valid days: {df_valid['date_id'].nunique()}")

    # NOTE(review): the default scaler_file is the same for every window, so
    # each call overwrites the previous window's scaler. Pass a window-specific
    # path if the saved models must be reused with their own scaler.
    cv_results = train_and_cross_validate(
        df_train,
        df_valid,
        feature_cols,
        category_cols,
        lgb_params,
        model_name = f"small_lgb_{date_range[0]}_{date_range[1]}_{valid_len}",
        save_dir = save_dir,
        scaler_file = scaler_file,
        n_splits = n_splits)

    return cv_results
    

In [None]:
# set lgb parameters
# The values after '#' on some lines are presumably alternatives tried earlier.
lgb_params = {
    'learning_rate': 0.01,#0.018,
    'max_depth': 7,#9,
    'n_estimators': 500,#600,
    'num_leaves': 70,#440,
    'objective': 'mae',
    'random_state': 42,
    'reg_alpha': 0.01,
    'reg_lambda': 0.01,
    'early_stopping_rounds': 50,
    'num_threads': 6,
    'min_data_in_leaf': 50,
    'importance_type': 'gain',
    'verbose': -1,
    }

# One (models, mae_scores) tuple per trained date window.
csv_results = []

# Skip the first 11 windows and train on every third one after that.
# The plotting cell below uses the same [11::3] slice for its tick labels,
# so the two must be changed together.
for d_range in date_ranges[11::3]:
    csv_result = train_on_subdf(df, feature_cols, category_cols, d_range, valid_len, lgb_params)
    print(f"\n{'-'*10}\nDate range {d_range} finished.")
    # csv_result[1] holds the per-fold hold-out MAE scores of this window.
    print(f"Mean MAE: {np.mean(csv_result[1])}\n{'-'*10}\n")
    csv_results.append(csv_result)

In [None]:
# Mean hold-out MAE of the fold models, one value per trained date window.
mae_list = [np.mean(window_result[1]) for window_result in csv_results]

print(f"Mean MAE: {np.mean(mae_list)}")

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(mae_list)

# Label every third point with "<first day>-<end day>" of its window. The
# slice must match the one used by the training loop.
tick_positions = np.arange(0, len(mae_list), 3)
xtick_labels = [f"{x[0]}-{x[1]}" for x in date_ranges[11::3]]
ax.set_xticks(tick_positions)
ax.set_xticklabels(xtick_labels[0:len(mae_list):3], rotation=45)

ax.set(xlabel="Date range", ylabel="MAE")
ax.grid(True)
plt.show()

In [None]:
# remove incomplete dates
# NOTE: this drops the latest date_id every time the cell runs, so re-running
# it without reloading `df` removes one more day each time.
df = df[df['date_id'] != df['date_id'].max()].reset_index(drop=True)

valid_lenght = 5 # days

# The last `valid_lenght` date ids (latest remaining day included) form the
# validation set. Comparing with `>= max - valid_lenght` would select
# valid_lenght + 1 date ids.
last_train_date = df['date_id'].max() - valid_lenght

df_train = df[df['date_id'] <= last_train_date]
df_valid = df[df['date_id'] > last_train_date]

print(df_train.shape, df_train['date_id'].unique())
print(df_valid.shape, df_valid['date_id'].unique())

In [None]:


# NOTE: this training call is disabled, but the following cells depend on it:
# the MAE plot reads `cv_results`, and the feature-importance cell loads the
# `small_lgb_cv_fold_*.pkl` models this call writes to ../data.
# Uncomment it when running the notebook from a fresh kernel.
# cv_results = train_and_cross_validate(
#     df_train,
#     df_valid,
#     feature_cols, 
#     category_cols, 
#     lgb_params, 
#     model_name = "small_lgb_cv", 
#     save_dir = "../data", 
#     scaler_file="../data/small_lgb_cv_scaler.pkl", 
#     n_splits=5)

In [None]:
# Plot the hold-out MAE of each fold model; the overall mean goes in the title.
plt.figure(figsize=(6, 5))

# Second element of the train_and_cross_validate result: one MAE per fold.
mae_scores = cv_results[1]

plt.plot(
    mae_scores,
    marker='o',
    color='blue',
    label='MAE',
)
plt.xlabel('Fold')
plt.ylabel('MAE')
plt.title(f'MAE Scores(Overall: {np.mean(mae_scores):.4f})')

plt.show()

In [None]:
def calc_feature_importance(model_name, model_dir, feature_cols):
    """Average the feature importances of all saved fold models.

    Parameters
    ----------
    model_name : str
        Prefix of the model files; files matching
        ``{model_dir}/{model_name}_fold_*.pkl`` are loaded.
    model_dir : str
        Directory holding the model files.
    feature_cols : list of str
        Feature names, in the order the models were trained with. The length
        must equal the length of each model's ``feature_importances_``.

    Returns
    -------
    DataFrame
        Columns 'Feature' and 'Value' (mean importance over the models),
        sorted by 'Value' in descending order with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no model file matches the pattern.
    """
    pattern = f"{model_dir}/{model_name}_fold_*.pkl"
    # glob order is arbitrary; sort so models load in a reproducible order.
    model_files = sorted(glob(pattern))
    if not model_files:
        raise ValueError(f"No model files found matching {pattern}")

    # Load one model at a time and keep only its importances.
    # NOTE(review): joblib.load unpickles the file; only load trusted models.
    per_model = [
        pd.DataFrame({
            'Value': joblib.load(model_file).feature_importances_,
            'Feature': feature_cols,
        })
        for model_file in model_files
    ]

    df_importance = pd.concat(per_model)
    df_importance = df_importance.groupby('Feature').mean().reset_index()

    df_importance = df_importance.sort_values(by='Value', ascending=False)
    df_importance = df_importance.reset_index(drop=True)

    return df_importance

In [None]:
# Mean importance per feature over the saved 'small_lgb_cv' fold models.
df_importance = calc_feature_importance('small_lgb_cv', '../data',  feature_cols)

# Features whose mean importance lies below the 20th percentile.
imp_thred = np.percentile(df_importance['Value'].values, 20)
is_less_important = df_importance['Value'] < imp_thred
less_important = df_importance[is_less_important]

print(f"Importance Threshold (20 percentile): {imp_thred}")
print(f"Number of less important features: {len(less_important)}")

fig, axes = plt.subplots(1, 3, figsize=(18, 11))

# Split the weak features by the feature-generation version they belong to.
v1_names = feature_dicts['v1_features']+feature_dicts['v1_feature_category']
less_imp_v1 = less_important[less_important['Feature'].isin(v1_names)]
less_imp_v2 = less_important[less_important['Feature'].isin(feature_dicts['v2_features'])]
less_imp_v3 = less_important[less_important['Feature'].isin(feature_dicts['v3_features'])]

# One horizontal bar chart per feature version.
per_version = [less_imp_v1, less_imp_v2, less_imp_v3]
for i, (ax, df_lss) in enumerate(zip(axes, per_version)):
    sns.barplot(x="Value", y="Feature", data=df_lss, ax=ax)
    ax.grid()
    ax.set_title(f"V{i+1} Features")
# ax.set_xlim(0, 2000)
plt.tight_layout()
plt.show()