In [None]:
import numpy as np
import pandas as pd
import glob
import time
from contextlib import contextmanager
from joblib import Parallel, delayed
from sklearn.model_selection import GroupKFold
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Turn off pandas' chained-assignment (SettingWithCopy) check for the whole notebook.
pd.set_option('mode.chained_assignment', None)

In [None]:
# Root folder that train.csv and the trade parquet folders are read from.
path = '/kaggle/input/optiver-realized-volatility-prediction/'
# Every entry found inside the training order-book parquet folder
# (the folder name suggests a pre-forward-filled copy of the book data -- confirm).
list_order_book_file_train = glob.glob('/kaggle/input/optiver-comp/ffill/ffill/book_train.parquet/*')

# train.csv plus a "<stock_id>-<time_id>" key column.
train = pd.read_csv(f'{path}train.csv')
train['row_id'] = train['stock_id'].astype(str).str.cat(train['time_id'].astype(str), sep='-')

In [None]:
@contextmanager
def timer(name):
    """
    Context manager that prints how long the wrapped block took.

    name: label shown between square brackets in the printed message.

    The message is printed only when the block finishes without raising;
    the elapsed wall-clock time is reported in minutes, rounded to 2 decimals.
    """
    started_at = time.time()
    yield
    elapsed_minutes = round((time.time() - started_at) / 60, 2)
    print(f'\n[{name}] done in {elapsed_minutes} Minutes\n')


def log_return(list_stock_prices):
    """
    Log returns of a price series: difference between consecutive natural logs.

    list_stock_prices: object for which ``np.log(...)`` yields something with
        a ``.diff()`` method (a pandas Series in this notebook).
    Returns the differenced log prices; the first element has no predecessor
    and is therefore NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()


def order_updates(order_info):
    """
    Change of each entry relative to the entry just before it.

    order_info: pandas object exposing ``.diff()``.
    Returns the one-step difference (first entry is NaN).
    """
    step_changes = order_info.diff(periods=1)
    return step_changes


def order_shift(order_info):
    """
    Lag the input by one row, so each position holds the previous row's value.

    order_info: pandas object exposing ``.shift()`` (Series or DataFrame).
    Returns the lagged object; the first row is filled with missing values.
    """
    previous_rows = order_info.shift(periods=1)
    return previous_rows


def abs_sum_diff(order_info):
    """
    Total absolute movement of a series: sum of |x[i] - x[i-1]|.

    order_info: pandas Series (used as a groupby aggregator in this notebook).
    Returns a scalar; the leading NaN produced by ``diff`` is skipped by the sum.
    """
    step_sizes = order_info.diff().abs()
    return step_sizes.sum()


def realized_volatility(series_log_return):
    """
    Realized volatility: square root of the sum of squared log returns.

    series_log_return: numeric array-like supporting ``** 2``
        (a pandas Series of log returns in this notebook).
    Returns a scalar.
    """
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)


def rmspe(y_true, y_pred):
    """
    Root mean squared percentage error between targets and predictions.

    y_true: actual values (used as the denominator of the relative error).
    y_pred: predicted values, same shape as ``y_true``.
    Returns sqrt(mean(((y_true - y_pred) / y_true) ** 2)).
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)


def ffill(data_df):
    """
    Expand a frame to one row per (time_id, second) for seconds 0..599,
    forward-filling the seconds that are not present.

    data_df: DataFrame with 'time_id' and 'seconds_in_bucket' columns.
    Returns a new DataFrame with those two columns restored as regular columns.

    NOTE(review): the forward fill is applied over the combined
    (time_id, seconds_in_bucket) index, so presumably a time_id whose first
    row is not at second 0 gets its leading seconds from the preceding
    time_id -- confirm. `feature_engineering` calls `rebase` first.
    """
    key_cols = ['time_id', 'seconds_in_bucket']
    indexed = data_df.set_index(key_cols)
    # Every time_id present in the data crossed with seconds 0..599.
    full_grid = pd.MultiIndex.from_product(
        [indexed.index.levels[0], np.arange(0, 600)],
        names=key_cols,
    )
    return indexed.reindex(full_grid, method='ffill').reset_index()


def rebase(df_book, df_trade):
    """
    Shift 'seconds_in_bucket' so that every time_id of the book starts at second 0.

    For each time_id the smallest book 'seconds_in_bucket' is taken as the
    offset; that offset is subtracted from the book rows and from the trade
    rows of the same time_id. time_ids that appear only in the trade table
    have no offset and are left unchanged.

    df_book:  DataFrame with 'time_id' and 'seconds_in_bucket' columns.
    df_trade: DataFrame with 'time_id' and 'seconds_in_bucket' columns.

    Both frames are modified in place and also returned as (df_book, df_trade).

    The offsets are computed with a single groupby instead of scanning both
    frames once per time_id, so the cost no longer grows with
    (number of rows) x (number of time_ids).
    """
    # Earliest book second of every time_id: the amount that bucket is shifted by.
    first_second = df_book.groupby('time_id')['seconds_in_bucket'].min()

    # Cast the offsets back to the column dtype so the column keeps its original type.
    book_offset = df_book['time_id'].map(first_second)
    df_book['seconds_in_bucket'] = (
        df_book['seconds_in_bucket'] - book_offset.astype(df_book['seconds_in_bucket'].dtype)
    )

    # Trades in a time_id unknown to the book get offset 0, i.e. stay untouched.
    trade_offset = df_trade['time_id'].map(first_second).fillna(0)
    df_trade['seconds_in_bucket'] = (
        df_trade['seconds_in_bucket'] - trade_offset.astype(df_trade['seconds_in_bucket'].dtype)
    )

    return df_book, df_trade


def calc_tick(df):
    """
    Estimate a tick size as the smallest strictly positive absolute
    row-to-row change found anywhere in `df`.

    df: numeric pandas object exposing ``.diff()`` and ``.where()``.
    Returns a scalar. The minimum is taken over all values at once (no
    per-column axis is passed), so the final ``nanmean`` averages a single number.
    """
    abs_moves = df.diff().abs()
    # Zero moves (and the leading NaN row) are masked out before taking the minimum.
    positive_moves = abs_moves.where(lambda x: x > 0)
    smallest_move = np.nanmin(positive_moves)
    return np.nanmean(smallest_move)

In [None]:
def feature_engineering(file_path_order_book, file_path_trade, train_test=True):
    """
    Build one row of features per time_id for a single stock.

    file_path_order_book: parquet path of the stock's order book; the text after
        the first '=' in this path is used as the stock id.
    file_path_trade: parquet path of the stock's trades.
    train_test: when False, the book/trade seconds are rebased with `rebase` and
        the book is expanded to every second with `ffill` before any feature is
        computed; when True the data is used as read.

    Returns a DataFrame holding the book aggregates (one row per time_id found in
    the book) left-merged with the trade aggregates on 'row_id'
    ("<stock_id>-<time_id>"), plus 'quote_updated_ex_cancel'.
    """
    # read two tables
    df_book_data = pd.read_parquet(file_path_order_book)
    df_trade_data = pd.read_parquet(file_path_trade)
    stock_id = file_path_order_book.split('=')[1]
    
    if train_test == False:
        df_book_data, df_trade_data = rebase(df_book_data, df_trade_data)
        df_book_data = ffill(df_book_data)

    # ------------------------- BOOK-ONLY FEATURES --------------------------
    # return_square, ba_size_direction
    # tick size (questionable calculation)
    # tick_size = df_book_data[['time_id', 'ask_price1', 'ask_price2', 'bid_price1', 'bid_price2']].groupby('time_id').apply(calc_tick)

    # calculate weighted average price
    # (level-1 prices weighted by the size on the opposite side of the book)
    df_book_data['wap'] = (df_book_data['bid_price1'] * df_book_data['ask_size1'] + df_book_data['ask_price1'] *
                           df_book_data['bid_size1']) / (df_book_data['bid_size1'] + df_book_data['ask_size1'])
    # NOTE(review): this and the similar assignments below rely on groupby(...).apply(...)
    # returning a Series aligned to the original row index; presumably pandas >= 2.0
    # (group_keys handling) changes that -- confirm against the installed version,
    # `.transform(...)` may be required there.
    df_book_data['log_return'] = df_book_data.groupby(['time_id'])['wap'].apply(log_return)
    df_book_data['log_return_sq'] = np.square(df_book_data['log_return'])

    # same weighted price / log return, built from the level-2 quotes
    df_book_data['wap2'] = (df_book_data['bid_price2'] * df_book_data['ask_size2'] + df_book_data['ask_price2'] *
                            df_book_data['bid_size2']) / (df_book_data['bid_size2'] + df_book_data['ask_size2'])
    df_book_data['log_return2'] = df_book_data.groupby(['time_id'])['wap2'].apply(log_return)
    df_book_data['log_return2_sq'] = np.square(df_book_data['log_return2'])

    # bid price update
    df_book_data['bp_update'] = df_book_data.groupby(['time_id'])['bid_price1'].apply(order_updates)
    # ask price update
    df_book_data['ap_update'] = df_book_data.groupby(['time_id'])['ask_price1'].apply(order_updates)
    # bid size update (absolute, and relative to the previous row's size)
    df_book_data['bs_update'] = df_book_data.groupby(['time_id'])['bid_size1'].apply(order_updates)
    df_book_data['bid_size1_lag'] = df_book_data.groupby(['time_id'])['bid_size1'].apply(order_shift)
    df_book_data['bs_update_relative'] = df_book_data['bs_update'] / df_book_data['bid_size1_lag']
    # ask size update (absolute, and relative to the previous row's size)
    df_book_data['as_update'] = df_book_data.groupby(['time_id'])['ask_size1'].apply(order_updates)
    df_book_data['ask_size1_lag'] = df_book_data.groupby(['time_id'])['ask_size1'].apply(order_shift)
    df_book_data['as_update_relative'] = df_book_data['as_update'] / df_book_data['ask_size1_lag']
    # small orders: the best ask did not fall while its size dropped by >= 90%,
    # or the best bid did not rise while its size dropped by >= 90%
    df_book_data['small_orders'] = ((df_book_data['ap_update'] >= 0) & (df_book_data['as_update_relative'] <= -0.9)) | \
                                   ((df_book_data['bp_update'] <= 0) & (df_book_data['bs_update_relative'] <= -0.9))
    
    # total order size
    
    df_book_data['total_order'] = df_book_data['ask_size1'] + df_book_data['bid_size1']
    df_book_data['total_bid_order'] = df_book_data['bid_size1'] + df_book_data['bid_size2']
    df_book_data['total_ask_order'] = df_book_data['ask_size1'] + df_book_data['ask_size2']

    # volume_order_imbalance
    # bid side: full level-1 size if the bid price rose, 0 if it fell, otherwise the size change;
    # ask side mirrors this with the price directions swapped
    df_book_data['bid_increment'] = np.where(df_book_data['bp_update'] > 0, df_book_data['bid_size1'],
                                             np.where(df_book_data['bp_update'] < 0, 0, df_book_data['bs_update']))
    df_book_data['ask_increment'] = np.where(df_book_data['ap_update'] < 0, df_book_data['ask_size1'],
                                             np.where(df_book_data['ap_update'] > 0, 0, df_book_data['as_update']))
    df_book_data['VOI'] = df_book_data['bid_increment'] - df_book_data['ask_increment']

    # ba spread (relative to the level-1 mid price)
    df_book_data['ba_spread'] = 2 * (df_book_data['ask_price1'] - df_book_data['bid_price1']) / \
                                (df_book_data['ask_price1'] + df_book_data['bid_price1'])
    # absolute ba spread (raw price difference; no tick-size scaling is applied here)
    df_book_data['abs_ba_spread'] = df_book_data['ask_price1'] - df_book_data['bid_price1']
    
    # bid spread (gap between bid levels 1 and 2, divided by ask_price1 + bid_price1)
    df_book_data['bid_spread'] = (df_book_data['bid_price1'] - df_book_data['bid_price2']) / \
                                 (df_book_data['ask_price1'] + df_book_data['bid_price1'])
    df_book_data['bid_spread_vs_ba'] = df_book_data['bid_spread'] / df_book_data['ba_spread']
    
    # ask spread (gap between ask levels 2 and 1, divided by ask_price1 + bid_price1)
    df_book_data['ask_spread'] = (df_book_data['ask_price2'] - df_book_data['ask_price1']) / \
                                 (df_book_data['ask_price1'] + df_book_data['bid_price1'])
    df_book_data['ask_spread_vs_ba'] = df_book_data['ask_spread'] / df_book_data['ba_spread']

    # Dispersion (level gap weighted by the level-2 size)
    df_book_data['dispersion_bid'] = df_book_data['bid_spread'] * df_book_data['bid_size2']
    df_book_data['dispersion_ask'] = df_book_data['ask_spread'] * df_book_data['ask_size2']
    
    # order book imbalance (should be measured in other ways)
    df_book_data['volume_imbalance'] = abs((df_book_data['bid_size1'] + df_book_data['bid_size2']) \
                                           - (df_book_data['ask_size1'] + df_book_data['ask_size2']))
    
    df_book_data['total_order_per_ba_spread'] = df_book_data['total_order'] / df_book_data['ba_spread']
    
    # move in b/a price versus ba spread
    df_book_data['bid_price_chg_vs_ba'] = np.abs(df_book_data['bp_update']) / df_book_data['abs_ba_spread']
    df_book_data['ask_price_chg_vs_ba'] = np.abs(df_book_data['ap_update']) / df_book_data['abs_ba_spread']
    
    # number of quote updates
    # (level-1 quotes packed into tuples so a row can be compared with the previous one in one step)
    df_book_data['best_quote'] = list(zip(df_book_data['ask_price1'], df_book_data['ask_size1'],
                                          df_book_data['bid_price1'], df_book_data['bid_size1']))
    df_book_data['best_price'] = list(zip(df_book_data['ask_price1'], df_book_data['bid_price1']))

    # calculate update interval & count order updates
    # A row is flagged when its tuple differs from the previous row of the same time_id.
    # NOTE(review): the first row of each time_id has no predecessor, so it is
    # presumably flagged as an update as well -- confirm.
    shift_ = df_book_data['best_quote'] == (df_book_data.groupby('time_id')['best_quote'].apply(order_shift))
    df_book_data['order_update'] = (~shift_)
    shift_ = df_book_data['best_price'] == (df_book_data.groupby('time_id')['best_price'].apply(order_shift))
    df_book_data['order_price_update'] = (~shift_)
    
    # time weights (rows later in the bucket get weights closer to 1)
    df_book_data['time_weights'] = 0.995 ** (600 - df_book_data['seconds_in_bucket'])
    df_book_data['time_weighted_return'] = df_book_data['log_return'] * df_book_data['time_weights']
    df_book_data['time_weighted_sqr_return'] = df_book_data['log_return_sq'] * df_book_data['time_weights']
    df_book_data['time_weighted_sqr_return2'] = df_book_data['log_return2_sq'] * df_book_data['time_weights']
    df_book_data['time_weighted_ba_spread'] = df_book_data['ba_spread'] * df_book_data['time_weights']

    # split a time bucket to 4 splits
    # (150-second quarters; anything not matched by the three conditions below keeps split 1)
    df_book_data['time_split'] = 1
    df_book_data.loc[(df_book_data['seconds_in_bucket'] >= 150) & (df_book_data['seconds_in_bucket'] < 300), 'time_split'] = 2
    df_book_data.loc[(df_book_data['seconds_in_bucket'] >= 300) & (df_book_data['seconds_in_bucket'] < 450), 'time_split'] = 3
    df_book_data.loc[(df_book_data['seconds_in_bucket'] >= 450) & (df_book_data['seconds_in_bucket'] < 600), 'time_split'] = 4
    time_split_stats = df_book_data.groupby(['time_id', 'time_split']).agg(time_split_rv=('log_return', realized_volatility),
                                                                           time_split_ba_spread_min=('ba_spread', 'min'),
                                                                           time_split_ba_spread_max=('ba_spread', 'max'),
                                                                           time_split_ba_spread_mean=('ba_spread', 'mean'),
                                                                           ).reset_index()

    feature_cols = ['time_id', 'time_split_rv',
                    'time_split_ba_spread_min', 'time_split_ba_spread_max', 'time_split_ba_spread_mean']

    # one frame per quarter, each indexed by time_id
    first_time_split = time_split_stats.loc[time_split_stats['time_split'] == 1, feature_cols].set_index('time_id')
    # first_time_split = first_time_split.add_suffix('_first')

    second_time_split = time_split_stats.loc[time_split_stats['time_split'] == 2, feature_cols].set_index('time_id')
    # second_time_split = second_time_split.add_suffix('_second')

    third_time_split = time_split_stats.loc[time_split_stats['time_split'] == 3, feature_cols].set_index('time_id')
    # third_time_split = third_time_split.add_suffix('_third')

    last_time_split = time_split_stats.loc[time_split_stats['time_split'] == 4, feature_cols].set_index('time_id')
    # last_time_split = last_time_split.add_suffix('_last')

    # Max spread of one quarter relative to the largest max spread of the other three.
    # NOTE(review): np.max over a list of Series combines them by position, not by
    # time_id label; this presumably requires every time_id to have rows in all four
    # quarters (same length and order) -- confirm.
    last_time_split['time_split_ba_is_max'] = last_time_split['time_split_ba_spread_max'] / np.max([first_time_split['time_split_ba_spread_max'],
                                                                                                   second_time_split['time_split_ba_spread_max'],
                                                                                                   third_time_split['time_split_ba_spread_max']], axis=0)
    first_time_split['time_split_ba_is_max_first'] = first_time_split['time_split_ba_spread_max'] / np.max([last_time_split['time_split_ba_spread_max'],
                                                                                                            second_time_split['time_split_ba_spread_max'],
                                                                                                            third_time_split['time_split_ba_spread_max']], axis=0)
    uni_cols = ['time_split_rv']
    # every column of the last quarter gets the '_last' suffix (including 'time_split_ba_is_max')
    last_time_split = last_time_split.add_suffix('_last')
    time_split_feature_cols = ['time_split_rv_last', 'time_split_ba_spread_min_last', 'time_split_ba_spread_max_last', 'time_split_ba_spread_mean_last',
                               'time_split_ba_is_max_last']

    
    # group by time id
    # df_book_data = df_book_data[~df_book_data['log_return'].isnull()]
    df_order_features = df_book_data.groupby(['time_id']).agg(ba_spread_mean=('ba_spread', 'mean'), ba_spread_min=('ba_spread', 'min'),
                                                              ba_spread_std=('ba_spread', 'std'),  ba_spread_max=('ba_spread', 'max'),
                                                              # abs_ba_spread_std=('abs_ba_spread', 'std'),
                                                              total_order_per_ba_spread_mean=('total_order_per_ba_spread', 'mean'),
                                                              total_order_per_ba_spread_min=('total_order_per_ba_spread', 'min'),
                                                              num_small_orders=('small_orders', 'sum'),
                                                              ba_spread_abs_change=('ba_spread', abs_sum_diff),
                                                              # as_update_vol=('as_update_relative', realized_volatility),
                                                              # bs_update_vol=('bs_update_relative', realized_volatility),
                                                              # wap_std=('wap', 'std'),
                                                              log_return_sq_sum=('log_return_sq', 'sum'),
                                                              # bid_price_chg_vs_ba_mean=('bid_price_chg_vs_ba', 'mean'),
                                                              # ask_price_chg_vs_ba_mean=('ask_price_chg_vs_ba', 'mean'),
                                                              log_return_sq_large10_sum=('log_return_sq', lambda x: x.nlargest(10).sum()),
                                                              weights_sum=('time_weights', 'sum'),
                                                              weighted_log_return_mean=('time_weighted_return', 'sum'),  # weighted_log_return_max=('time_weighted_return', 'max'),
                                                              weighted_ba_spread_mean=('time_weighted_ba_spread', 'sum'),  # weighted_ba_spread_max=('time_weighted_ba_spread', 'max'),
                                                              # weighted_ba_spread_std=('time_weighted_ba_spread', 'std'),
                                                              # weighted_ba_spread_min=('time_weighted_ba_spread', 'min'),
                                                              bid_spread_vs_ba_mean=('bid_spread_vs_ba', 'mean'),
                                                              ask_spread_vs_ba_mean=('ask_spread_vs_ba', 'mean'),
                                                              time_weighted_rv=('time_weighted_sqr_return', 'sum'),
                                                              time_weighted_rv2=('time_weighted_sqr_return2', 'sum'),
                                                              log_return_max=('log_return', 'max'), log_return_min=('log_return', 'min'),
                                                              # log_return_std=('log_return', 'std'),
                                                              log_return_sum=('log_return', 'sum'),
                                                              realized_vol=('log_return', realized_volatility),
                                                              realized_vol_l2=('log_return2', realized_volatility),
                                                              num_quote_updates=('order_update', 'sum'),
                                                              num_price_updates=('order_price_update', 'sum'),
                                                              bid_order_max=('total_bid_order', 'max'),
                                                              total_bid_order1=('bid_size1', 'sum'),
                                                              total_bid_order2=('bid_size2', 'sum'),
                                                              total_ask_order1=('ask_size1', 'sum'),
                                                              total_ask_order2=('ask_size2', 'sum'),
                                                              bid_dispersion_calc=('dispersion_bid', 'sum'),
                                                              ask_dispersion_calc=('dispersion_ask', 'sum'),
                                                              # bid_spread_min=('bid_spread', 'min'),  # bid_spread_max=('bid_spread', 'max'),
                                                              # ask_spread_min=('ask_spread', 'min'),  # ask_spread_max=('ask_spread', 'max'),
                                                              VOI_min=('VOI', 'min'), VOI_max=('VOI', 'max'),
                                                              )  # .reset_index()
    # the two "weighted ... mean" columns were aggregated as weighted sums; divide by the total weight
    df_order_features['weighted_log_return_mean'] = df_order_features['weighted_log_return_mean'] / df_order_features['weights_sum']
    df_order_features['weighted_ba_spread_mean'] = df_order_features['weighted_ba_spread_mean'] / df_order_features['weights_sum']
    # share of the summed squared returns that comes from the 10 largest squared returns
    df_order_features['rv_spikes'] = df_order_features['log_return_sq_large10_sum'] / df_order_features['log_return_sq_sum']
    # size-weighted average level gap, averaged over the bid and ask sides
    df_order_features['LDispersion'] = (df_order_features['bid_dispersion_calc'] / df_order_features['total_bid_order2'] +
                                        df_order_features['ask_dispersion_calc'] / df_order_features['total_ask_order2']) / 2
    
    # df_order_features is still indexed by time_id here, so the per-quarter frames
    # (also indexed by time_id) are attached by index alignment
    for data, suffix in ((first_time_split, '_first'), (second_time_split, '_second'), (third_time_split, '_third')):
        add_suffix = [col + suffix for col in uni_cols]
        df_order_features[add_suffix] = data[uni_cols]
    df_order_features['time_split_ba_is_max_first'] = first_time_split['time_split_ba_is_max_first']
    df_order_features[time_split_feature_cols] = last_time_split[time_split_feature_cols]
    # squared ratio of each quarter's realized volatility to the whole-bucket realized volatility
    df_order_features['first_time_split_rv_impact'] = (df_order_features['time_split_rv_first'] / df_order_features['realized_vol']) ** 2
    df_order_features['second_time_split_rv_impact'] = (df_order_features['time_split_rv_second'] / df_order_features['realized_vol']) ** 2
    df_order_features['third_time_split_rv_impact'] = (df_order_features['time_split_rv_third'] / df_order_features['realized_vol']) ** 2
    df_order_features['last_time_split_rv_impact'] = (df_order_features['time_split_rv_last'] / df_order_features['realized_vol']) ** 2
    # df_order_features['rv_slope_relative'] = (df_order_features['last_time_split_rv_impact'] - df_order_features['first_time_split_rv_impact']) / df_order_features['first_time_split_rv_impact']

    # add row add stock id + time id
    df_order_features.reset_index(inplace=True)
    df_order_features['row_id'] = df_order_features['time_id'].apply(lambda x: f'{stock_id}-{x}')

    # ------------------------- TRADE-ONLY FEATURES --------------------------
    # trade log return
    df_trade_data['trade_return'] = df_trade_data.groupby('time_id')['price'].apply(log_return)
    # df_trade_data['trade_batch_wait_time'] = df_trade_data.groupby('time_id')['seconds_in_bucket'].apply(order_updates)
    # cancel order
    # From here on df_trade_data is the book left-joined with the trades: one row per
    # book row, with the trade columns missing where no trade matched that second.
    df_trade_data = df_book_data.merge(df_trade_data, on=['time_id', 'seconds_in_bucket'], how='left')
    df_trade_data['trade_perc_bid'] = df_trade_data['size'] / df_trade_data['bid_size1_lag']
    df_trade_data['trade_perc_ask'] = df_trade_data['size'] / df_trade_data['ask_size1_lag']
    
    # previous row's quotes within the same time_id
    trade_shift_ = df_trade_data.groupby(['time_id'])[['ask_price1', 'ask_size1', 'ask_price2', 'ask_size2',
                                                       'bid_price1', 'bid_size1', 'bid_price2', 'bid_size2']].apply(order_shift)
    # conditions 1-4 look at level-2 changes while the level-1 price is unchanged
    # (1/2: ask side, 3/4: bid side).
    # NOTE(review): in all four, the current level-1 size is compared with the
    # previous row's level-2 size (e.g. ask_size1 == trade_shift_['ask_size2']);
    # this may have been meant to be the previous level-1 size -- confirm.
    condition1 = ((df_trade_data['ask_price2'] == trade_shift_['ask_price2']) & (df_trade_data['ask_size2'] < trade_shift_['ask_size2'])) & (
                (df_trade_data['ask_price1'] == trade_shift_['ask_price1']) & (df_trade_data['ask_size1'] == trade_shift_['ask_size2']))
    size1 = trade_shift_['ask_size2'] - df_trade_data['ask_size2']
    condition2 = (df_trade_data['ask_price2'] > trade_shift_['ask_price2']) & ((df_trade_data['ask_price1'] == trade_shift_['ask_price1']) & (df_trade_data['ask_size1'] == trade_shift_['ask_size2']))
    size2 = trade_shift_['ask_size2']
    condition3 = ((df_trade_data['bid_price2'] == trade_shift_['bid_price2']) & (df_trade_data['bid_size2'] < trade_shift_['bid_size2'])) & (
                (df_trade_data['bid_price1'] == trade_shift_['bid_price1']) & (df_trade_data['bid_size1'] == trade_shift_['bid_size2']))
    size3 = trade_shift_['bid_size2'] - df_trade_data['bid_size2']
    condition4 = (df_trade_data['bid_price2'] < trade_shift_['bid_price2']) & ((df_trade_data['bid_price1'] == trade_shift_['bid_price1']) & (df_trade_data['bid_size1'] == trade_shift_['bid_size2']))
    size4 = trade_shift_['bid_size2']
    # conditions 5-8 require that no trade matched the row (price is missing) and that
    # level 1 moved away or shrank (5/6: ask side, 7/8: bid side)
    condition5 = (df_trade_data['price'].isna()) & (df_trade_data['ask_price1'] > trade_shift_['ask_price1'])
    size5 = trade_shift_['ask_size1']
    condition6 = (df_trade_data['price'].isna()) & ((df_trade_data['ask_price1'] == trade_shift_['ask_price1']) & (df_trade_data['ask_size1'] < trade_shift_['ask_size1']))
    size6 = trade_shift_['ask_size1'] - df_trade_data['ask_size1']
    condition7 = (df_trade_data['price'].isna()) & (df_trade_data['bid_price1'] < trade_shift_['bid_price1'])
    size7 = trade_shift_['bid_size1']
    condition8 = (df_trade_data['price'].isna()) & ((df_trade_data['bid_price1'] == trade_shift_['bid_price1']) & (df_trade_data['bid_size1'] < trade_shift_['bid_size1']))
    size8 = trade_shift_['bid_size1'] - df_trade_data['bid_size1']
    df_trade_data['cancel_order'] = condition1 | condition2 | condition3 | condition4 | condition5 | condition6 | condition7 | condition8
    # each boolean condition acts as a 0/1 multiplier on its associated size
    df_trade_data['cancel_order_size'] = condition1 * size1 + condition2 * size2 + condition3 * size3 + \
                                         condition4 * size4 + condition5 * size5 + condition6 * size6 + \
                                         condition7 * size7 + condition8 * size8
    
    # sensitivity
#     avg_size = df_trade_data['size'].mean()
#     df_trade_data['ask_size_new'] = np.where(df_trade_data['ask_size1'] > avg_size, df_trade_data['ask_size1'] - avg_size,
#                                              np.where(df_trade_data['ask_size2'] > (avg_size - df_trade_data['ask_size1']),
#                                                       df_trade_data['ask_size1'] + df_trade_data['ask_size2'] - avg_size,
#                                                       np.nan))
#     df_trade_data['bid_size_new'] = np.where(df_trade_data['bid_size1'] > avg_size, df_trade_data['bid_size1'] - avg_size,
#                                              np.where(df_trade_data['bid_size2'] > (avg_size - df_trade_data['bid_size1']),
#                                                       df_trade_data['bid_size1'] + df_trade_data['bid_size2'] - avg_size,
#                                                       np.nan))
#     df_trade_data['ask_price_new'] = np.where(df_trade_data['ask_size1'] > avg_size, df_trade_data['ask_price1'],
#                                               np.where(df_trade_data['ask_size2'] > (avg_size - df_trade_data['ask_size1']),
#                                                        df_trade_data['ask_price2'], np.nan))
#     df_trade_data['bid_price_new'] = np.where(df_trade_data['bid_size1'] > avg_size, df_trade_data['bid_price1'],
#                                               np.where(df_trade_data['bid_size2'] > (avg_size - df_trade_data['bid_size1']),
#                                                        df_trade_data['bid_price2'], np.nan))
#     df_trade_data['wap_bid'] = (df_trade_data['bid_price_new'] * df_trade_data['ask_size1'] + df_trade_data['ask_price1'] *
#                                df_trade_data['bid_size_new']) / (df_trade_data['bid_size_new'] + df_trade_data['ask_size1'])
#     df_trade_data['wap_ask'] = (df_trade_data['bid_price1'] * df_trade_data['ask_size_new'] + df_trade_data['ask_price_new'] *
#                                df_trade_data['bid_size1']) / (df_trade_data['bid_size1'] + df_trade_data['ask_size_new'])
#     df_trade_data['wap_bid_change'] = df_trade_data['wap_bid'] - df_trade_data['wap']
#     df_trade_data['wap_ask_change'] = df_trade_data['wap_ask'] - df_trade_data['wap']
#     df_trade_data['wap_imbalance'] = df_trade_data['wap_ask'] - df_trade_data['wap_bid']
    
    # size, order_count, size_per_order
    # ('order_batches' counts the rows that carry a non-missing order_count)
    df_trade_features = df_trade_data.groupby(['time_id']).agg(size=('size', 'sum'),
                                                               order_batches=('order_count', 'count'),
                                                               order_count=('order_count', 'sum'),
                                                               trade_rv=('trade_return', realized_volatility),
                                                               # trade_return_mean=('trade_return', 'mean'),
                                                               trade_return_std=('trade_return', 'std'),
                                                               # trade_return_min=('trade_return', 'min'),
                                                               # trade_outside_ba=('trade_outside_ba', 'sum')
                                                               cancel_order=('cancel_order', 'sum'),
                                                               cancel_order_size=('cancel_order_size', 'sum'),
                                                               # trade_batch_wait_time_mean=('trade_batch_wait_time', 'mean'),
                                                               # trade_perc_bid_median=('trade_perc_bid', 'median'),
                                                               # trade_perc_ask_median=('trade_perc_ask', 'median'),
                                                               trade_perc_ask_20=('trade_perc_ask', lambda x: x.quantile(0.2)),
                                                               # trade_perc_bid_20=('trade_perc_bid', lambda x: x.quantile(0.2)),
                                                               ).reset_index()

    df_trade_features['size_per_order'] = df_trade_features['size'] / df_trade_features['order_count']
    # df_trade_features['avg_trade_size'] = df_trade_features['size'] / df_trade_features['order_batches']

    # add row add stock id + time id
    df_trade_features['row_id'] = df_trade_features['time_id'].apply(lambda x: f'{stock_id}-{x}')
    
    # ------------------------- MERGING --------------------------
    # NOTE(review): both frames carry a 'time_id' column and the merge key is only
    # 'row_id', so the result presumably holds 'time_id_x' / 'time_id_y' rather than
    # 'time_id' -- confirm that downstream code expects this.
    df = df_order_features.merge(df_trade_features, on=['row_id'], how='left')
    df['quote_updated_ex_cancel'] = df['num_quote_updates'] - df['cancel_order']
    return df

In [None]:
def joblib_func(book_file, train_test=True):
    """
    Run `feature_engineering` for one order-book entry.

    book_file: path of the stock's order-book data; its last '/'-separated
        component is reused to locate the matching trade data under `path`.
    train_test: True reads from 'trade_train.parquet/', False from
        'trade_test.parquet/'; the flag is also forwarded to `feature_engineering`.
    Returns the feature DataFrame produced for that stock.
    """
    stock_folder = book_file.rsplit('/', 1)[-1]
    trade_folder = 'trade_train.parquet/' if train_test else 'trade_test.parquet/'
    return feature_engineering(book_file, path + trade_folder + stock_folder, train_test)


# function: get the aggregated dataframe
def paralle_fe(book_list_file, train_test=True):
    """
    Compute the features of every listed order-book entry in parallel and stack them.

    book_list_file: iterable of order-book paths, one per stock.
    train_test: forwarded unchanged to `joblib_func`.
    Returns a single DataFrame with the per-stock results concatenated and
    re-indexed from 0.
    """
    jobs = (delayed(joblib_func)(book_file, train_test) for book_file in book_list_file)
    # n_jobs=-1 lets joblib use all available cores
    per_stock_frames = Parallel(n_jobs=-1, verbose=0)(jobs)
    return pd.concat(per_stock_frames, ignore_index=True)

In [None]:
def fe_group(df, corr_source=None):
    """Add market-level (per ``time_id``) and correlated-stock features.

    Three groups of columns are added:

    * log transforms of the realized-volatility columns;
    * cross-sectional statistics (mean / std / median / quantiles) of several
      columns over all rows that share a ``time_id``, plus standardized and
      difference features derived from them;
    * for every stock, the id (``corr_stock_id``) and correlation
      (``corr_stock_value``) of its best-correlated partner stock, and that
      partner's ``ba_spread_min`` at the same ``time_id``
      (``ba_spread_min_corr``).

    ``ba_spread_min_corr`` is set to NaN for every stock whose best partner
    correlation is below 0.5.  The mask is keyed on the stock's *own*
    ``corr_stock_value``; keying it on the partner's id lets a weakly
    correlated stock keep the feature whenever its partner happens to have a
    strong partner of its own.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (stock_id, time_id).  Must contain the columns read below
        (``realized_vol``, ``time_weighted_rv``, ``time_weighted_rv2``,
        ``order_count``, ``ba_spread_min``, ``log_return_max/min``,
        ``VOI_max/min``, the four ``*_time_split_rv_impact`` columns,
        ``stock_id`` and ``time_id``).  The frame is modified in place before
        the final merges.
    corr_source : pandas.DataFrame, optional
        Frame with ``time_id``, ``stock_id`` and ``log_return_sum`` used to
        compute the stock-to-stock correlation matrix.  Defaults to the
        notebook-level ``train_df`` so existing calls keep their behaviour.

    Returns
    -------
    pandas.DataFrame
        ``df`` with all new columns; a new object because of the merges.
    """
    if corr_source is None:
        # Backward-compatible default: correlations always come from the
        # notebook-level training frame unless the caller says otherwise.
        corr_source = train_df

    min_partner_corr = 0.5  # below this the partner's spread is not used

    def market(column, how):
        # Statistic of `column` over all rows of a time_id, broadcast to each row.
        return df.groupby('time_id')[column].transform(how)

    def market_quantile(column, q):
        return df.groupby('time_id')[column].transform(lambda x: x.quantile(q))

    # log transform of rv
    df['log_realized_vol'] = np.log(df['realized_vol'])
    df['log_tw_realized_vol'] = np.log(df['time_weighted_rv'])
    df['log_tw_realized_vol2'] = np.log(df['time_weighted_rv2'])

    # order count
    df['market_order_count_median'] = market('order_count', 'median')
    df['market_order_count_20'] = market_quantile('order_count', 0.2)
    df['market_order_count_10'] = market_quantile('order_count', 0.1)
    df['market_order_count_80'] = market_quantile('order_count', 0.8)

    # min ba spread
    df['market_min_ba_spread_mean'] = market('ba_spread_min', 'mean')
    df['market_min_ba_spread_std'] = market('ba_spread_min', 'std')
    df['market_min_ba_spread_median'] = market('ba_spread_min', 'median')
    df['market_min_ba_spread_20'] = market_quantile('ba_spread_min', 0.2)
    df['market_min_ba_spread_80'] = market_quantile('ba_spread_min', 0.8)

    # standardized realized vol on market level
    df['market_rv_mean'] = market('log_realized_vol', 'mean')
    df['market_rv_std'] = market('log_realized_vol', 'std')
    df['market_rv_median'] = market('log_realized_vol', 'median')
    df['market_rv_20'] = market_quantile('log_realized_vol', 0.2)
    df['market_rv_80'] = market_quantile('log_realized_vol', 0.8)
    df['standardized_rv_market'] = (df['log_realized_vol'] - df['market_rv_mean']) / df['market_rv_std']

    # standardized time weighted realized vol (level 2) on market level
    df['market_tw_rv2_mean'] = market('log_tw_realized_vol2', 'mean')
    df['market_tw_rv2_std'] = market('log_tw_realized_vol2', 'std')
    df['market_tw_rv2_median'] = market('log_tw_realized_vol2', 'median')
    df['market_tw_rv2_20'] = market_quantile('log_tw_realized_vol2', 0.2)
    df['market_tw_rv2_80'] = market_quantile('log_tw_realized_vol2', 0.8)
    df['standardized_tw_rv2_market'] = (df['log_tw_realized_vol2'] - df['market_tw_rv2_mean']) / df['market_tw_rv2_std']

    # diff fe calculation
    df['log_return_diff'] = df['log_return_max'] - df['log_return_min']
    df['VOI_diff'] = df['VOI_max'] - df['VOI_min']

    df['rv_impact_diff_3_4'] = df['last_time_split_rv_impact'] - df['third_time_split_rv_impact']
    df['rv_impact_diff_1_4'] = df['last_time_split_rv_impact'] - df['first_time_split_rv_impact']
    df['rv_impact_diff_2_4'] = df['last_time_split_rv_impact'] - df['second_time_split_rv_impact']
    df['market_relative_slope_mean'] = market('rv_impact_diff_1_4', 'mean')

    # corr features
    df_pivot = corr_source.pivot(index='time_id', columns='stock_id', values='log_return_sum')
    corr = df_pivot.corr()
    max_stock, max_corr = [], []
    for i in corr.index:
        # The largest entry is taken to be the stock's self-correlation, so the
        # second-largest identifies its best partner.
        top_two = corr[i].nlargest(2)
        max_stock.append(top_two.index[1])
        max_corr.append(top_two.iloc[1])

    df_corr_manual = pd.DataFrame({'stock_id': corr.index, 'corr_stock_id': max_stock, 'corr_stock_value': max_corr})
    corr_stock_list = df_corr_manual['corr_stock_id'].unique()
    corr_columns = ['stock_id', 'time_id', 'ba_spread_min']
    # Partner table: ba_spread_min of every stock that is somebody's partner,
    # re-keyed so it can be joined on (corr_stock_id, time_id).
    df_corr_temp = df.loc[df['stock_id'].isin(corr_stock_list), corr_columns].rename(
        columns={'stock_id': 'corr_stock_id', 'ba_spread_min': 'ba_spread_min_corr'})
    df = df.merge(df_corr_manual, on='stock_id', how='left')
    df = df.merge(df_corr_temp, on=['corr_stock_id', 'time_id'], how='left')
    # Drop the partner spread for stocks that have no sufficiently correlated partner.
    df.loc[df['corr_stock_value'] < min_partner_corr, 'ba_spread_min_corr'] = np.nan

    return df

In [None]:
# running fe in multi-processing
with timer("Feature Engineering..."):
    # Per-stock features for every training order-book file, stacked into one frame.
    train_df = paralle_fe(book_list_file=list_order_book_file_train)
    # how='right' keeps every engineered row_id; columns coming from `train`
    # are NaN for any row_id that train.csv does not contain.
    train_df = train.merge(train_df, on=['row_id'], how='right')
    # fe_group reads the notebook-level `train_df` for its correlation
    # features, so the merged frame must already be bound to that name here.
    train_df = fe_group(train_df)


In [None]:
# Columns selected from the feature frame as model inputs (used as
# `train_df[FEATS]` / `test_df[FEATS]`). Entries that are commented out are
# candidate features currently excluded from the set.
FEATS = ['realized_vol',
         'realized_vol_l2',  
         # 'time_weighted_rv',
         'time_weighted_rv2',
         'standardized_rv_market', 'standardized_tw_rv2_market',
         'market_rv_mean', 'market_rv_std',
         'market_tw_rv2_mean', 'market_tw_rv2_std',
         # 'order_batches',
         # 'size',
         'order_count',
         'size_per_order',
         'trade_rv',  # 'trade_return_std',
         'ba_spread_mean', 'ba_spread_min',  # 'ba_spread_std',
         # 'ba_spread_max',
         'log_return_max',  # 'log_return_std',
         'log_return_min',  # 'log_return_mean',
         'weighted_log_return_mean',
         'num_quote_updates',
         'LDispersion',
         'stock_id',
         'VOI_max',  # 'VOI_mean', 'VOI_std',
         'VOI_diff',
         'log_return_diff',  # 'weighted_log_return_diff',
         'time_split_rv_first',  # 'last_90_rv',
         'time_split_rv_second',
         'time_split_rv_third',
         'time_split_rv_last',
         'num_price_updates',
         'time_split_ba_is_max_last',
         # 'time_split_ba_mean_compare_last',
         'time_split_ba_spread_min_last',
         'total_order_per_ba_spread_mean', 'total_order_per_ba_spread_min',
         'cancel_order_size',
         'quote_updated_ex_cancel',
         'market_min_ba_spread_mean',
         'market_min_ba_spread_std',
         # 'standardized_min_ba_spread_market',
         'weighted_ba_spread_mean',  # 'weighted_ba_spread_max',
         'market_order_count_median',
         # 'bid_spread_vs_ba_mean',
         'ask_spread_vs_ba_mean',
         'ba_spread_abs_change',
         'ba_spread_min_corr',
         'rv_spikes',
         'rv_impact_diff_1_4',
         'rv_impact_diff_2_4',
         'market_relative_slope_mean',
         'time_split_ba_is_max_first',
         'market_order_count_20', 'market_order_count_80', 'market_order_count_10',
         ]



# Columns cast to pandas 'category' dtype on the training frame; the same list
# is reused for the test frame in the prediction cell.
# NOTE(review): presumably this is what makes LightGBM treat stock_id as a
# categorical feature — confirm against the installed LightGBM version.
categorical_features = ['stock_id']
for col in categorical_features:
    train_df[col] = train_df[col].astype('category')

# Parameter dict handed to lgb.train in the modeling cell. The two
# commented-out entries are sub-sampling options that are currently disabled.
lgb_params = {'objective': 'regression',
              'metric': 'rmse',
              'learning_rate': 0.05,
              'max_depth': 5,
              'num_leaves': 20,
              'verbose': -1,
              # 'feature_fraction': 0.9,
              # 'bagging_fraction': 0.9
              }

In [None]:
# Function to early stop with root mean squared percentage error
def feval_rmspe(y_pred, lgb_train):
    """Custom LightGBM evaluation function reporting RMSPE.

    Parameters
    ----------
    y_pred : array-like
        Predictions for the dataset being evaluated.
    lgb_train : object exposing ``get_label()``
        The dataset under evaluation; despite the name it is whichever
        dataset is passed in, not necessarily the training set.

    Returns
    -------
    tuple
        ``('RMSPE', value, False)`` — metric name, score, and a flag saying
        that higher values are *not* better.
    """
    score = rmspe(lgb_train.get_label(), y_pred)
    higher_is_better = False
    return 'RMSPE', score, higher_is_better

# Trained boosters keyed by 'lgb <fold index>'; read again by the prediction cell.
Model_Dict = {}
# Number of cross-validation folds (the prediction cell also divides by this).
cv_splits = 5
with timer('Modeling and Prediction...'):
    # Per-fold scores; initialised with ones and overwritten inside the loop.
    RMSPE_in_sample_array, RMSPE_oos_array = np.ones(cv_splits), np.ones(cv_splits)
    # Folds are grouped by time_id, so all rows of one time_id fall on the
    # same side of each train/validation split.
    CV = GroupKFold(n_splits=cv_splits)

    X = train_df[FEATS]
    y = train_df.target
    # Same column as `y`: no target transformation is applied in this cell,
    # so scoring and fitting use identical values.
    original_y = train_df.target

    for cv_id, (train_idx, val_idx) in enumerate(CV.split(X=train_df, y=y, groups=train_df['time_id'])):
        print(f'Running Fold # {cv_id + 1}...')
        train_X, train_y = X.iloc[train_idx, :], y.iloc[train_idx]
        val_X, val_y = X.iloc[val_idx, :], y.iloc[val_idx]
        eval_train_y, eval_val_y = original_y.iloc[train_idx], original_y.iloc[val_idx]

        # Weighting each row by 1 / y^2 turns a squared error (y - y_hat)^2
        # into a squared relative error ((y - y_hat) / y)^2, matching RMSPE.
        train_weights = 1 / np.square(train_y)
        val_weights = 1 / np.square(val_y)

        print('Train Data:', len(train_X), 'Validation Data:', len(val_X))
        lgb_train = lgb.Dataset(train_X, label=train_y, weight=train_weights, feature_name=FEATS)
        lgb_valid = lgb.Dataset(val_X, label=val_y, weight=val_weights, feature_name=FEATS)

        # =========================================================================
        start = time.time()
        # NOTE(review): `verbose_eval` and `early_stopping_rounds` are keyword
        # arguments of older lgb.train signatures; newer LightGBM releases
        # expect callbacks instead — confirm against the installed version.
        model = lgb.train(lgb_params,
                          train_set=lgb_train,
                          num_boost_round=1000,
                          valid_sets=[lgb_train, lgb_valid],  # list of data to be evaluated on during training
                          verbose_eval=100,
                          early_stopping_rounds=50,
                          feval=feval_rmspe)
        end = time.time()
        print('LGB training time elapsed: {}'.format(end - start))
        # =========================================================================
        train_prediction = model.predict(train_X)
        val_prediction = model.predict(val_X)

        # In-sample and out-of-sample RMSPE of this fold.
        RMSPE_in_sample_array[cv_id] = rmspe(eval_train_y, train_prediction)
        RMSPE_oos_array[cv_id] = rmspe(eval_val_y, val_prediction)
        print('\n')
        Model_Dict[f'lgb {cv_id}'] = model

# Mean out-of-fold RMSPE across folds. The message below says "naive
# prediction", but the value is the score of the LightGBM models fitted above.
RMSPE = round(RMSPE_oos_array.mean(), 4)
print(f'Performance of the naive prediction: RMSPE: {RMSPE}')

# Mean in-sample RMSPE across folds, for comparison with the value above.
train_RMSPE = round(RMSPE_in_sample_array.mean(), 4)
print(f'Performance of the naive prediction in sample: RMSPE: {train_RMSPE}')

# lgb.plot_importance(Model_Dict['lgb 0'], importance_type='gain', ignore_zero=False)

In [None]:
# for name, model in Model_Dict.items():
#     print(f'Feature Importance of Fold {name}')
#     lgb.plot_importance(model, importance_type='gain', ignore_zero=False)
#     print('\n')

## Submission

In [None]:
def single_stock_fe(book_list_file, train_test=True):
    """Sequential (single-process) counterpart of ``paralle_fe``.

    Each book file is processed by ``joblib_func`` — the same per-stock helper
    the parallel path uses — so the trade-file path logic lives in one place.
    The per-stock frames are collected in a list and concatenated once,
    instead of re-copying a growing frame on every iteration.

    Parameters
    ----------
    book_list_file : iterable of str
        Order-book file paths, one per stock.
    train_test : bool, default True
        Forwarded to ``joblib_func`` (True: train trade files, False: test).

    Returns
    -------
    pandas.DataFrame
        Per-stock frames stacked in input order with their original index
        labels kept (no ``ignore_index``); an empty frame for empty input.
    """
    frames = [joblib_func(book_file, train_test) for book_file in book_list_file]
    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(frames)

In [None]:
# Rows to predict, and the order-book files available for the test set.
test = pd.read_csv(path + 'test.csv')
list_order_book_file_test = glob.glob(path + 'book_test.parquet/*')

# running fe in multi-processing
with timer("Feature Engineering..."):
    # train_test=False makes the per-stock helper read the test trade files.
    test_df = paralle_fe(book_list_file=list_order_book_file_test, train_test=False)
    # how='left' keeps every row of test.csv; rows without engineered
    # features get NaN in the feature columns.
    test_df = test.merge(test_df, on=['row_id'], how='left')
    # fe_group takes its stock-correlation pivot from the notebook-level
    # `train_df`, so the training feature cell must have been run first.
    test_df = fe_group(test_df)

In [None]:
# Cast the categorical columns the same way as was done for the training frame.
for col in categorical_features:
    test_df[col] = test_df[col].astype('category')
test_X = test_df[FEATS]

# A submission built from zero models would be silently meaningless.
if not Model_Dict:
    raise ValueError('Model_Dict is empty: run the modeling cell before predicting.')

# One prediction vector per fold model, averaged over the models that actually
# exist (rather than dividing by cv_splits, which can disagree with the number
# of trained models after a partial or interrupted run).
pred = [model.predict(test_X) for model in Model_Dict.values()]
test_df['target'] = np.mean(pred, axis=0)

df_pred_test = test_df[['row_id', 'target']]
df_pred_test.to_csv('submission.csv', index=False)