# References
+ https://www.kaggle.com/tommy1028/lightgbm-starter-with-feature-engineering-idea
+ https://www.kaggle.com/monolith0456/2xlgbm-fnn-ensemble

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from glob import glob
from category_encoders.cat_boost import CatBoostEncoder
from joblib import Parallel, delayed
import time

In [None]:
# Wall-clock start; the last cell uses it to print the total runtime.
start_time = time.time()
# Debug switch: later cells restrict preprocessing to stock_id 0, use fewer
# boosting rounds and print extra diagnostics when this is True.
is_debug = True
# When True, per-fold gain importances are collected, plotted and saved.
show_feat_imp = True
# Passed to LightGBM through the 'device' entry of `params`.
device='cpu'
    
# Root directory of the competition input files.
input_dir = '../input/optiver-realized-volatility-prediction/'
# Feature columns that are additionally aggregated (mean/std) per stock_id
# and per time_id after the per-stock features have been merged.
agg_feat_list = ['trade_price_mean',
                 'trade_seconds_in_bucket_count_unique']

# Preprocess

In [None]:
def down_cast(df):
    """Downcast the numeric columns of ``df`` to smaller dtypes.

    Columns picked by ``select_dtypes('float')`` are converted with
    ``pd.to_numeric(downcast='float')`` and columns picked by
    ``select_dtypes('int')`` with ``downcast='integer'``.

    ``df`` is modified in place and the same object is returned.
    """
    # Resolve both column sets up front, before any dtype is changed.
    float_columns = df.select_dtypes('float').columns
    int_columns = df.select_dtypes('int').columns

    for columns, target in ((float_columns, 'float'), (int_columns, 'integer')):
        df[columns] = df[columns].apply(pd.to_numeric, downcast=target)

    return df

## Feature Engineering Functions

In [None]:
# for book data
def calc_wap1(df):
    """Return the level-1 weighted average price for every row of ``df``.

    Each side's price is weighted by the size on the opposite side:
    ``(bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1)``.
    """
    bid_size, ask_size = df['bid_size1'], df['ask_size1']
    cross_weighted = df['bid_price1'] * ask_size + df['ask_price1'] * bid_size
    return cross_weighted / (bid_size + ask_size)

def bid_size_sum(df):
    """Return the combined size of the first two bid levels, row by row."""
    level1, level2 = df['bid_size1'], df['bid_size2']
    return level1 + level2
    
def ask_size_sum(df):
    """Return the combined size of the first two ask levels, row by row."""
    level1, level2 = df['ask_size1'], df['ask_size2']
    return level1 + level2

def diff_bid_ask_price(df):
    """Return ``bid_price1 + bid_price2 - ask_price1 - ask_price2`` per row."""
    # Evaluated left to right, one operand at a time.
    running = df['bid_price1'] + df['bid_price2']
    running = running - df['ask_price1']
    return running - df['ask_price2']

def diff_bid_ask_size(df):
    """Return ``bid_size1 + bid_size2 - ask_size1 - ask_size2`` per row."""
    # Evaluated left to right, one operand at a time.
    running = df['bid_size1'] + df['bid_size2']
    running = running - df['ask_size1']
    return running - df['ask_size2']

def diff_bid_price(df):
    """Return the gap between the first and second bid price per row."""
    best, second = df['bid_price1'], df['bid_price2']
    return best - second

def diff_ask_price(df):
    """Return ``ask_price1 - ask_price2`` per row."""
    best, second = df['ask_price1'], df['ask_price2']
    return best - second

def diff_bid_size(df):
    """Return ``bid_size1 - bid_size2`` per row."""
    level1, level2 = df['bid_size1'], df['bid_size2']
    return level1 - level2

def diff_ask_size(df):
    """Return ``ask_size1 - ask_size2`` per row."""
    level1, level2 = df['ask_size1'], df['ask_size2']
    return level1 - level2

# for trade data
def amount(df):
    """Return ``price * size`` for every row of ``df``."""
    # 'size' must be read with [] because ``df.size`` is a DataFrame attribute.
    unit_price, quantity = df['price'], df['size']
    return unit_price * quantity

def count_unique(series):
    """Return how many distinct values ``series`` contains."""
    distinct_values = np.unique(series)
    return distinct_values.size

# common 
def log_return(series):
    """Return the first difference of the natural log of ``series``.

    The first element of the result is NaN because it has no predecessor.
    """
    log_values = np.log(series)
    return log_values.diff()

def realized_volatility(series):
    """Return the square root of the sum of squared log returns of ``series``."""
    returns = log_return(series)
    sum_of_squares = np.sum(returns ** 2)
    return np.sqrt(sum_of_squares)


def get_stats_window(df, fe_dict, seconds_in_bucket, add_suffix=False):
    """Aggregate ``df`` per ``time_id`` over rows at or after a start second.

    Args:
        df: Frame with ``time_id`` and ``seconds_in_bucket`` columns.
        fe_dict: Mapping of column name to a list of aggregations, passed to
            ``DataFrameGroupBy.agg``.
        seconds_in_bucket: Only rows with ``seconds_in_bucket`` greater than or
            equal to this value are aggregated.
        add_suffix: When True, ``_<seconds_in_bucket>`` is appended to every
            output column, including ``time_id``.

    Returns:
        One row per ``time_id`` with flat ``<column>_<aggregation>`` names.
    """
    in_window = df['seconds_in_bucket'] >= seconds_in_bucket
    stats = df[in_window].groupby(['time_id']).agg(fe_dict).reset_index()

    # Flatten the two-level column index; the key column comes out as
    # 'time_id_' because its second level is empty.
    stats.columns = ['_'.join(levels) for levels in stats.columns]
    stats = stats.rename(columns={'time_id_': 'time_id'})

    if not add_suffix:
        return stats
    # The suffix tells the different windows apart after merging.
    return stats.add_suffix(f'_{seconds_in_bucket}')

def agg_feat_groupby_stock_id(df, agg_feat_list):
    """Return the per-``stock_id`` mean and std of the given feature columns.

    Output columns are ``stock_id`` followed by ``<feature>_mean_stock`` and
    ``<feature>_std_stock`` for every name in ``agg_feat_list``.
    """
    per_stock = df.groupby(['stock_id'])[agg_feat_list]
    stats = per_stock.agg(['mean', 'std']).reset_index()

    # Flatten ('feature', 'mean') -> 'feature_mean' and tag with '_stock'.
    stats.columns = ['_'.join(levels) for levels in stats.columns]
    stats = stats.add_suffix('_stock')

    # The key column was flattened and suffixed too; restore its plain name.
    stats = stats.rename(columns={'stock_id__stock': 'stock_id'})
    return stats

def agg_feat_groupby_time_id(df, agg_feat_list):
    """Return the per-``time_id`` mean and std of the given feature columns.

    Output columns are ``time_id`` followed by ``<feature>_mean_time`` and
    ``<feature>_std_time`` for every name in ``agg_feat_list``.
    """
    per_time = df.groupby(['time_id'])[agg_feat_list]
    stats = per_time.agg(['mean', 'std']).reset_index()

    # Flatten ('feature', 'mean') -> 'feature_mean' and tag with '_time'.
    stats.columns = ['_'.join(levels) for levels in stats.columns]
    stats = stats.add_suffix('_time')

    # The key column was flattened and suffixed too; restore its plain name.
    stats = stats.rename(columns={'time_id__time': 'time_id'})
    return stats

def calc_tau(df):
    """Add the ``size_tau*`` and ``size_tau2*`` columns to ``df``.

    ``size_tau<suffix>`` is ``sqrt(1 / trade_seconds_in_bucket_count_unique<suffix>)``
    for the suffixes ``''``, ``'_400'`` and ``'_200'``.  ``size_tau2<suffix>``
    is ``sqrt(k / trade_order_count_sum)`` with ``k`` equal to 1, 0.33 and
    0.66 respectively (the full-window count is used in all three).  The
    ``*_d`` columns hold the ``_400`` value minus the un-suffixed value.

    ``df`` is modified in place and also returned.
    """
    count_col = 'trade_seconds_in_bucket_count_unique'
    for suffix in ('', '_400', '_200'):
        df['size_tau' + suffix] = np.sqrt(1 / df[count_col + suffix])

    # delta tau
    df['size_tau_d'] = df['size_tau_400'] - df['size_tau']

    order_count = 'trade_order_count_sum'
    for suffix, numerator in (('', 1), ('_400', 0.33), ('_200', 0.66)):
        df['size_tau2' + suffix] = np.sqrt(numerator / df[order_count])

    # delta tau
    df['size_tau2_d'] = df['size_tau2_400'] - df['size_tau2']

    return df

## Preprocess of Book Data

In [None]:
def preprocess_book(file_path):
    """Build per-time_id order-book features for one stock.

    Reads the parquet data at ``file_path``, derives price/size columns and
    their per-time_id log returns, aggregates them over the whole bucket and
    over the windows starting at 200 and 400 seconds, and returns one row per
    time_id.

    Args:
        file_path: Path of the form ``.../stock_id=<id>``; the text after the
            first ``=`` is used as the stock id.

    Returns:
        DataFrame with the aggregated features (window features carry a
        ``_200`` / ``_400`` suffix) and a ``row_id`` column of the form
        ``"<stock_id>-<time_id>"``.  ``time_id`` itself is dropped.
    """
    df = pd.read_parquet(file_path)
    # make new columns
    df['wap1'] = calc_wap1(df)
    df['bid_size_sum'] = bid_size_sum(df)
    df['ask_size_sum'] = ask_size_sum(df)
    df['diff_bid_ask_price'] = diff_bid_ask_price(df)
    df['diff_bid_ask_size'] = diff_bid_ask_size(df)
    df['diff_bid_price'] = diff_bid_price(df)
    df['diff_ask_price'] = diff_ask_price(df)
    df['diff_bid_size'] = diff_bid_size(df)
    df['diff_ask_size'] = diff_ask_size(df)

    # log returns
    # NOTE(review): np.log yields NaN for non-positive inputs, which the price
    # difference columns may contain -- confirm this is intended.
    lr_cols = ['wap1',
               'bid_price1',
               'ask_price2',
               'bid_size1',
               'ask_size2',
               'bid_size_sum',
               'ask_size_sum',
               'diff_bid_ask_price',
               'diff_bid_price',
               'diff_ask_price'
              ]
    for col in lr_cols:
        # transform() returns a result aligned to df's own index; apply() can
        # prepend the group key to the index (default in pandas >= 2.0), which
        # breaks the column assignment.
        df[col + '_lr'] = df.groupby(['time_id'])[col].transform(log_return)

    create_feature_dict = {
        'wap1': [realized_volatility],
        'wap1_lr': [np.std, np.max, np.min],
        'bid_price1': [np.std],
        'bid_size1': [np.mean],
        'bid_price2': [np.std],
        'bid_size2': [np.mean],
        'ask_price1': [np.std],
        'ask_price2': [np.std],
        'diff_bid_ask_price': [realized_volatility, np.mean, np.max, np.min],
        'diff_bid_price': [np.mean, np.max, np.min],
        'diff_ask_price': [np.max, np.min],
        'bid_price1_lr': [np.std],
        'ask_price2_lr': [np.std],
        'bid_size_sum_lr': [np.mean],
        'ask_size_sum_lr': [np.mean],
        'diff_bid_price_lr': [np.mean, np.std, np.max, np.min],
    }

    # Stats over the whole bucket, then left-join the stats of each later
    # window and drop that window's redundant (suffixed) time_id key.
    df_feature = get_stats_window(df, create_feature_dict, seconds_in_bucket=0, add_suffix=False)
    for window_start in (200, 400):
        df_window = get_stats_window(df, create_feature_dict,
                                     seconds_in_bucket=window_start, add_suffix=True)
        window_key = f'time_id_{window_start}'
        df_feature = df_feature.merge(df_window, how='left',
                                      left_on='time_id', right_on=window_key)
        df_feature = df_feature.drop(columns=[window_key])

    # row_id
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['time_id'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.drop(columns=['time_id'])

    return df_feature

## Preprocess of Trade Data

In [None]:
def preprocess_trade(file_path):
    """Build per-time_id trade features for one stock.

    Reads the parquet data at ``file_path``, adds the traded amount and the
    per-time_id log returns of price, amount and order count, aggregates them
    over the whole bucket and over the windows starting at 200 and 400
    seconds, and returns one row per time_id.

    Args:
        file_path: Path of the form ``.../stock_id=<id>``; the text after the
            first ``=`` is used as the stock id.

    Returns:
        DataFrame whose feature columns are prefixed with ``trade_`` (window
        features also carry a ``_200`` / ``_400`` suffix) plus a ``row_id``
        column of the form ``"<stock_id>-<time_id>"``.
    """
    df = pd.read_parquet(file_path)
    df['amount'] = amount(df)

    lr_cols = ['price',
               'amount',
               'order_count'
              ]
    for col in lr_cols:
        # transform() returns a result aligned to df's own index; apply() can
        # prepend the group key to the index (default in pandas >= 2.0), which
        # breaks the column assignment.
        df[col + '_lr'] = df.groupby(['time_id'])[col].transform(log_return)

    create_feature_dict = {
        'price': [realized_volatility, np.mean, np.std],
        'price_lr': [np.max, np.min, np.std],
        'seconds_in_bucket': [count_unique],
        'amount_lr': [np.mean, np.std],
        'order_count': [np.sum]
    }

    # Stats over the whole bucket, then left-join the stats of each later
    # window and drop that window's redundant (suffixed) time_id key.
    df_feature = get_stats_window(df, create_feature_dict, seconds_in_bucket=0, add_suffix=False)
    for window_start in (200, 400):
        df_window = get_stats_window(df, create_feature_dict,
                                     seconds_in_bucket=window_start, add_suffix=True)
        window_key = f'time_id_{window_start}'
        df_feature = df_feature.merge(df_window, how='left',
                                      left_on='time_id', right_on=window_key)
        df_feature = df_feature.drop(columns=[window_key])

    # Prefix everything so the names cannot clash with the book features.
    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.drop(columns=['trade_time_id'])

    return df_feature

## Preprocess of All

In [None]:
# Function that runs the per-stock preprocessing (optionally in parallel)
def preprocessor(list_stock_ids, is_train=True, is_parallel=True):
    """Build the merged book/trade feature table for the given stock ids.

    Args:
        list_stock_ids: Stock ids to process.
        is_train: Read the ``*_train.parquet`` data when True, otherwise the
            ``*_test.parquet`` data.
        is_parallel: Process the stocks with joblib when True, otherwise one
            after another in this process.  Both modes process every stock id.

    Returns:
        DataFrame with one row per (stock_id, time_id): the book features,
        the left-joined trade features, and integer ``stock_id`` / ``time_id``
        columns recovered from ``row_id`` (which is dropped).

    Raises:
        ValueError: From ``pd.concat`` when ``list_stock_ids`` is empty.
    """
    split = 'train' if is_train else 'test'

    def for_joblib(stock_id):
        # Locate this stock's book and trade data for the requested split.
        file_path_book = f'{input_dir}book_{split}.parquet/stock_id={stock_id}'
        file_path_trade = f'{input_dir}trade_{split}.parquet/stock_id={stock_id}'

        # Preprocess book and trade data and merge them
        return pd.merge(preprocess_book(file_path_book),
                        preprocess_trade(file_path_trade),
                        on='row_id', how='left')

    if is_parallel:
        # Use the joblib API to run the per-stock work in parallel
        frames = Parallel(n_jobs=-1, verbose=1)(
            delayed(for_joblib)(stock_id) for stock_id in list_stock_ids)
    else:
        # Same per-stock work, sequentially; every stock id contributes a
        # frame (previously only the last one was merged).
        frames = [for_joblib(stock_id) for stock_id in list_stock_ids]

    # Concatenate the per-stock frames
    df = pd.concat(frames, ignore_index=True)

    # row_id is "<stock_id>-<time_id>"; split it back into integer keys.
    df['stock_id'] = df['row_id'].apply(lambda x: int(str(x).split('-')[0]))
    df['time_id'] = df['row_id'].apply(lambda x: int(str(x).split('-')[1]))

    return df.drop('row_id', axis=1)

In [None]:
def extract_list_stock_ids(book_file_exp, trade_file_exp):
    """Return the stock ids that have both book and trade data.

    Args:
        book_file_exp: Glob pattern matching the book paths; each matched path
            must contain ``stock_id=<int>`` with the integer at the end.
        trade_file_exp: Glob pattern matching the trade paths, same format.

    Returns:
        Sorted list of the integer stock ids present under both patterns
        (empty when either pattern matches nothing).
    """
    def _stock_ids(file_exp):
        # Parse the integer that follows 'stock_id=' in every matched path.
        return {int(file_path.split('stock_id=')[1]) for file_path in glob(file_exp)}

    # Set intersection: boolean `and` would simply return one of the two sets.
    return sorted(_stock_ids(book_file_exp) & _stock_ids(trade_file_exp))

In [None]:
# Glob patterns that match one entry per stock in the training book and
# trade data.
book_train_file_exp = input_dir + 'book_train.parquet/stock_id=*'
trade_train_file_exp = input_dir + 'trade_train.parquet/stock_id=*'

# Stock ids to preprocess for training, derived from the two patterns above.
list_train_stock_ids = extract_list_stock_ids(book_train_file_exp, trade_train_file_exp)

In [None]:
# In debug mode only stock 0 is preprocessed to keep the run short.
# NOTE(review): train.csv is still loaded in full, so after the left merge
# the rows of every other stock have NaN features -- confirm that is
# acceptable for debugging.
if is_debug:
    list_train_stock_ids = [0]
df_train_org = pd.read_csv(input_dir + 'train.csv') 
df_train_feature = preprocessor(list_train_stock_ids, is_train=True)
# Attach the engineered features to the rows of train.csv.
df_train = df_train_org.merge(df_train_feature, on=['stock_id', 'time_id'], how='left')

# aggregate features group by stock_id
df_stock_id_feat_train = agg_feat_groupby_stock_id(df_train, agg_feat_list)
df_train = df_train.merge(df_stock_id_feat_train, on='stock_id', how='left')

# aggregate features group by time_id
df_time_id_feat_train = agg_feat_groupby_time_id(df_train, agg_feat_list)
df_train = df_train.merge(df_time_id_feat_train, on='time_id', how='left')

In [None]:
# Add the size_tau* / size_tau2* columns derived from the trade count features.
df_train = calc_tau(df_train)

In [None]:
# Debug aid: list the pairs of features that are too strongly correlated.
if is_debug:
    # Pairwise correlation of the feature columns only (keys and target removed).
    df_corr = df_train.drop(['stock_id', 'time_id', 'target'], axis=1).corr()
    too_match_corr_list = []
    threshold = 0.98
    for col in df_corr:
        for idx in df_corr.index:
            # Keep a pair when its correlation exceeds the threshold, it is not
            # a feature paired with itself, and the mirrored pair has not been
            # recorded already. Only positive correlations can pass this test.
            if (df_corr.loc[idx, col] > threshold) and (col != idx) and ((idx, col) not in too_match_corr_list):
                too_match_corr_list.append((col, idx))
    print(too_match_corr_list)

# Target Encoding

In [None]:
# Train data
# Fit a CatBoost encoder for stock_id against the training target.
cb_encoder = CatBoostEncoder(cols=['stock_id'])
cb_encoder.fit(df_train.drop('target', axis=1), df_train['target'])
# Store the encoded stock_id as an extra feature; the raw stock_id column is kept.
# NOTE(review): transform is called without the target here -- confirm this
# is the intended encoding for the training rows.
df_train['stock_id_cbenc'] = cb_encoder.transform(df_train.drop('target', axis=1))['stock_id']
# Shrink numeric dtypes to reduce memory.
df_train = down_cast(df_train)

In [None]:
from sklearn.model_selection import KFold
import lightgbm as lgbm
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Debug aid: print every column for which any describe() statistic exceeds
# 1,000,000.
if is_debug:
    # One boolean per column: True when any row of describe() is above the limit.
    d_tmp = (df_train.describe() > 1000000).any().reset_index()
    d_tmp.columns = ['colname', 'too_large']
    for i, row in d_tmp.iterrows():
        if row['too_large']:
            print(row['colname'])

# Standard Scaler

In [None]:
# train
# One StandardScaler per feature column, kept so the test set can be
# transformed with the statistics fitted here.
scaler_dict = {}
X = df_train.drop(['stock_id', 'time_id', 'target'], axis=1)
# NOTE(review): fillna returns a new frame and the result is discarded here,
# so NaNs are NOT replaced. The test-set scaling cell does not fill NaNs
# either; if filling is wanted it has to be applied in both places together.
X.fillna(0)
for col in X.columns:
    # 'stock_id' was dropped above, so this condition holds for every column.
    if col != 'stock_id':
        scaler = StandardScaler()
        x = X[col].values.reshape(-1, 1)
        scaler.fit(x)
        X[col] = scaler.transform(x)
        scaler_dict[col] = scaler
y = df_train['target']

In [None]:
# loss functions
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of ``y_pred``.

    Errors are taken relative to ``y_true``, element by element.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

def feval_RMSPE(preds, lgbm_train):
    """Custom evaluation function passed to ``lgbm.train`` as ``feval``.

    Returns the tuple ``('RMSPE', score, False)`` where ``score`` is the RMSPE
    of ``preds`` against the labels of ``lgbm_train``, rounded to 5 decimals.
    The final ``False`` sits in the slot LightGBM reads as "is higher better".
    """
    y_true = lgbm_train.get_label()
    score = rmspe(y_true=y_true, y_pred=preds)
    return 'RMSPE', round(score, 5), False

In [None]:
# feature importance
def calc_model_importance(model, feature_names=None, importance_type='gain'):
    """Return the model's feature importances as a sorted DataFrame.

    Args:
        model: Object exposing ``feature_importance(importance_type=...)``.
        feature_names: Labels used as the index of the result (optional).
        importance_type: Forwarded to ``model.feature_importance``.

    Returns:
        Single-column (``importance``) DataFrame sorted in ascending order.
    """
    raw_importance = model.feature_importance(importance_type=importance_type)
    frame = pd.DataFrame(raw_importance, index=feature_names, columns=['importance'])
    return frame.sort_values('importance')


def plot_importance(importance_df, title='',
                    save_filepath=None, figsize=(16, 32)):
    """Draw ``importance_df`` as a horizontal bar chart.

    Args:
        importance_df: DataFrame to plot.
        title: Chart title; nothing is set when empty.
        save_filepath: When given, the figure is saved there instead of being
            shown.
        figsize: Figure size handed to ``plt.subplots``.
    """
    fig, ax = plt.subplots(figsize=figsize)
    importance_df.plot.barh(ax=ax)
    if title:
        ax.set_title(title)
    fig.tight_layout()

    # Either display the chart or write it to disk, never both.
    if save_filepath is not None:
        fig.savefig(save_filepath)
    else:
        plt.show()
    plt.close(fig)
    
    
def calc_mean_importance(importance_df_list):
    """Average per-fold feature importances, feature by feature.

    Every input frame is sorted by its own importance, so their rows are in
    different orders.  The values are therefore aligned on the index (the
    feature label) before averaging instead of being averaged by position.

    Args:
        importance_df_list: Non-empty list of DataFrames with an
            ``importance`` column, indexed by feature.

    Returns:
        DataFrame with one ``importance`` column holding the mean over the
        frames, indexed by feature and sorted in ascending order.

    Raises:
        ValueError: If ``importance_df_list`` is empty.
    """
    if not importance_df_list:
        raise ValueError('importance_df_list must contain at least one DataFrame')

    # Column-wise concat aligns the folds on the feature index.
    per_fold = pd.concat([df['importance'] for df in importance_df_list], axis=1)
    mean_df = per_fold.mean(axis=1).to_frame('importance')
    return mean_df.sort_values('importance')

In [None]:
# LightGBM parameters shared by every fold
params = {
      "objective": "rmse", 
      "metric": "rmse", 
      "boosting_type": "gbdt",
      # early-stopping patience, in boosting rounds
      'early_stopping_rounds': 50,
      'learning_rate': 0.01,
      # L1 / L2 regularisation strengths
      'lambda_l1': 1,
      'lambda_l2': 1,
      # feature subsampling ratios
      'feature_fraction': 0.8,
      'feature_fraction_bynode': 0.8,
      # taken from the `device` setting at the top of the notebook
      'device': device
  }
# list of models (one trained booster is appended per fold)
models = []
# validation score (accumulates the mean of the per-fold RMSPE values)
scores = 0.0
# per-fold gain importance frames, filled when show_feat_imp is True
gain_importance_list = []
# NOTE(review): never filled by the training loop in this notebook.
split_importance_list = []

# Train Model

In [None]:
# K-fold training: one LightGBM model per fold, scored with RMSPE on the
# held-out part.
n_splits = 5
# Debug runs use far fewer boosting rounds.
if is_debug:
    num_boost_round = 100
else:
    num_boost_round = 5000
    
kf = KFold(n_splits=n_splits, random_state=20210101, shuffle=True)
for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"Fold : {fold + 1}")
    # NOTE(review): kf.split yields positional indices while .loc / y[...] are
    # label based; this relies on X and y having a default 0..n-1 index --
    # confirm, or switch to .iloc.
    X_train, y_train = X.loc[trn_idx], y[trn_idx]
    X_valid, y_valid = X.loc[val_idx], y[val_idx]
    
    # RMSPE weight
    # Weighting each squared error by 1 / y^2 turns it into a squared
    # relative error, matching the RMSPE metric.
    weights = 1 / np.square(y_train)
    lgbm_train = lgbm.Dataset(X_train, y_train, weight=weights)

    weights = 1 / np.square(y_valid)
    lgbm_valid = lgbm.Dataset(X_valid, y_valid, reference=lgbm_train, weight=weights)
    
    # train model
    model = lgbm.train(params=params,
                      train_set=lgbm_train,
                      valid_sets=[lgbm_train, lgbm_valid],
                      num_boost_round=num_boost_round,         
                      feval=feval_RMSPE,
                      verbose_eval=100             
                     )
    
    # validation 
    y_pred = model.predict(X_valid, num_iteration=model.best_iteration)

    RMSPE = round(rmspe(y_true=y_valid, y_pred=y_pred), 3)
    print(f'RMSPE: {RMSPE}')

    # Running mean of the (rounded) fold scores.
    # NOTE(review): `scores` is accumulated but never printed in this notebook.
    scores += RMSPE / n_splits
    models.append(model)
    print("*" * 100)
    if show_feat_imp:    
        # Keep this fold's gain importance for the averaged plot below.
        feature_names = X_train.columns.values.tolist()
        gain_importance_df = calc_model_importance(
            model, feature_names=feature_names, importance_type='gain')
        gain_importance_list.append(gain_importance_df)

In [None]:
# Average the per-fold gain importances, plot them and save them as CSV.
if show_feat_imp:
    mean_gain_df = calc_mean_importance(gain_importance_list)
    plot_importance(mean_gain_df, title='Model feature importance by gain')
    # Move the feature labels from the index into a 'feature_names' column.
    mean_gain_df = mean_gain_df.reset_index().rename(columns={'index': 'feature_names'})
    mean_gain_df.to_csv('gain_importance_mean.csv', index=False)

# Test Set

In [None]:
# Glob patterns that match one entry per stock in the test book and trade data.
book_test_file_exp  = input_dir + 'book_test.parquet/stock_id=*'
trade_test_file_exp = input_dir + 'trade_test.parquet/stock_id=*'
# Stock ids to preprocess for the test set, derived from the two patterns above.
list_test_stock_ids = extract_list_stock_ids(book_test_file_exp, trade_test_file_exp)

In [None]:
# Build the test feature table with the same steps used for training.
df_test_org = pd.read_csv(input_dir + 'test.csv') 
df_test_feature = preprocessor(list_test_stock_ids, is_train=False)

# Attach the engineered features to the rows of test.csv.
df_test = df_test_org.merge(df_test_feature, on=['stock_id', 'time_id'], how='left')

# aggregate groupby stock_id
# These statistics are computed from the test rows themselves, not reused
# from the training set.
df_stock_id_feat_test = agg_feat_groupby_stock_id(df_test, agg_feat_list)
df_test = df_test.merge(df_stock_id_feat_test, on='stock_id', how='left')

# aggregate groupby time_id
df_time_id_feat_test = agg_feat_groupby_time_id(df_test, agg_feat_list)
df_test = df_test.merge(df_time_id_feat_test, on='time_id', how='left')

# Add the size_tau* / size_tau2* columns.
df_test = calc_tau(df_test)

In [None]:
# row_id is removed before encoding/scaling and rebuilt from stock_id and
# time_id when the submission is written.
df_test = df_test.drop('row_id', axis=1)
# target encoding
# Uses the encoder fitted on the training data.
df_test['stock_id_cbenc'] = cb_encoder.transform(df_test)['stock_id']

# standard scaler
# Apply the per-column scalers fitted on the training features.
X_test = df_test.drop(['stock_id', 'time_id'], axis=1)
for col in X_test.columns:
    # 'stock_id' was dropped above, so this condition holds for every column.
    if col != 'stock_id':
        x = X_test[col].values.reshape(-1, 1)
        X_test[col] = scaler_dict[col].transform(x)

# NOTE(review): this downcasts df_test only; X_test was created above and is
# not affected.
df_test = down_cast(df_test)

# Prediction

In [None]:
# prediction with light gbm models
# The final prediction is the plain average over the fold models.
target = np.zeros(len(X_test))
for model in models:
    # X.columns fixes the feature order to the one used during training.
    pred = model.predict(X_test[X.columns], num_iteration=model.best_iteration)
    target += pred / len(models)
    
# Rebuild row_id as "<stock_id>-<time_id>" for the submission file.
df_test['row_id'] = df_test['stock_id'].astype(str) + '-' + df_test['time_id'].astype(str)
df_submission = df_test[['row_id']].assign(target=target)
df_submission.to_csv('submission.csv', index=False)
# Report the total wall-clock time since the configuration cell ran.
end_time = time.time()
print(f'total time: {round(end_time - start_time)} seconds')