In [None]:
import os
import gc

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from joblib import Parallel, delayed
from multiprocessing import Pool
from tqdm import tqdm

from numba import njit
from numba import jit

from sklearn.model_selection import KFold

import lightgbm as lgb

pd.options.display.max_columns = 300

In [None]:
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of ``y_pred`` against ``y_true``.

    Each error is expressed relative to the true value before squaring,
    averaging and taking the square root.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

def feval_rmspe(y_pred, lgb_train):
    """Custom evaluation function passed to ``lgb.train`` via ``feval``.

    Reads the labels from the dataset object through ``get_label()`` and
    scores the predictions with ``rmspe``.

    Returns a ``(name, value, flag)`` tuple; the trailing ``False`` is the
    "higher is better" flag, i.e. smaller RMSPE is better.
    """
    score = rmspe(lgb_train.get_label(), y_pred)
    return 'RMSPE', score, False

# Idea
Create features from the order-book data up to the point where each real trade occurs. This reduces the book data to one row per trade for each stock; the resulting trade-level rows are then aggregated by time id.

# Load train and test data

In [None]:
# Read the train/test tables, then give each row a "<stock_id>-<time_id>" key.
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')

train['row_id'] = train['stock_id'].astype(str).str.cat(train['time_id'].astype(str), sep='-')
test['row_id'] = test['stock_id'].astype(str).str.cat(test['time_id'].astype(str), sep='-')

In [None]:
# Path prefixes of the per-stock parquet partitions. The stock id is appended
# as a string (prefix + str(stock_id)) before calling pd.read_parquet.
book_train_path = "../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id="
trade_train_path = '../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id='

# Same layout for the test split.
book_test_path = "../input/optiver-realized-volatility-prediction/book_test.parquet/stock_id="
trade_test_path = '../input/optiver-realized-volatility-prediction/trade_test.parquet/stock_id='

In [None]:
# Distinct stock ids of each split, in order of first appearance.
train_stock_ids = pd.unique(train['stock_id'])
test_stock_ids = pd.unique(test['stock_id'])

In [None]:
# Number of distinct time ids that train.csv holds for every stock.
n_timeids = [
    train.loc[train['stock_id'] == i, 'time_id'].nunique()
    for i in train_stock_ids
]
stock_timeids = pd.DataFrame(
    zip(train_stock_ids, n_timeids),
    columns=['stock', 'unique_time_ids'],
)
stock_timeids = stock_timeids.sort_values(by='unique_time_ids')

# Show the stocks that have fewer than 3830 time ids.
stock_timeids.loc[stock_timeids['unique_time_ids'] < 3830]

# Some time ids are missing from some stocks' trade data
Some stocks have no trades in certain time ids for which volatility still has to be predicted.

For example, stock 31 has no trades in time ids [ 985 3987 5539 5629 6197 8753 8840 9208 12011 13377 13663 15010 20017 22498 28186 32174 ], for which volatility still has to be predicted.

**When aggregating the trade data, these time ids must still be included.**

In [None]:
# For every stock, collect the distinct time ids found in its book file and in
# its trade file, so stocks where the two sets differ in size can be listed.
n_timeids_book, time_ids_book = [], []
n_timeids_trade, time_ids_trade = [], []

for i in tqdm(train_stock_ids):
    book_train = pd.read_parquet(book_train_path + str(i))
    trade_train = pd.read_parquet(trade_train_path + str(i))

    book_time_ids = book_train['time_id'].unique()
    trade_time_ids = trade_train['time_id'].unique()

    time_ids_book.append(book_time_ids)
    n_timeids_book.append(len(book_time_ids))
    time_ids_trade.append(trade_time_ids)
    n_timeids_trade.append(len(trade_time_ids))

book_trade_timeids = pd.DataFrame(
    zip(train_stock_ids, n_timeids_book, time_ids_book, n_timeids_trade, time_ids_trade),
    columns=['stock_id', 'n_timeids_book', 'time_ids_book', 'n_timeids_trade', 'time_ids_trade'],
)

# Keep only the stocks whose book and trade files disagree on the count.
unequal_time_ids = book_trade_timeids.loc[
    book_trade_timeids['n_timeids_book'] != book_trade_timeids['n_timeids_trade']
]

unequal_time_ids

In [None]:
# For each mismatching stock, print the time ids present in the book data but
# absent from the trade data.
for i, j in zip(unequal_time_ids['time_ids_book'], unequal_time_ids['time_ids_trade']):
    book_ids = np.array(i)
    trade_ids = np.array(j)
    print(np.setdiff1d(book_ids, trade_ids))

# Feature Creation Numpy

In [None]:
# Column names of the merged book + trade table, in the order used when the
# feature array is turned back into a DataFrame. calculate_features reads the
# array by position, so list positions here correspond to those indices
# (e.g. 'price' sits at position 10).
original_cols = ['time_id', 'seconds_in_bucket', 'bid_price1', 'ask_price1',
       'bid_price2', 'ask_price2', 'bid_size1', 'ask_size1', 'bid_size2',
       'ask_size2', 'price', 'size', 'order_count']
        
# Names of the 16 feature columns that calculate_features appends, in the
# order it writes them.
cols = ['hbuy1', 'lsell1', 'mid1', 
            'hbuy2','lsell2', 'mid2', 
            'mid1_sub_bp1', 'mid1_sub_ap1',
            'mid2_sub_bp2', 'mid2_sub_ap2', 
            'hbuy1_lsell1_spread', 'hbuy2_lsell2_spread', 
            'wap_high_low1', 'wap_high_low2',
            'log_wap_high_low1_std', 'log_wap_high_low2_std'
            ]

In [None]:
@njit
def calculate_features(np_arr, indices):
    """Compute order-book features per segment and append them as 16 columns.

    Parameters
    ----------
    np_arr : 2-D float array
        Rows of the merged book/trade table. Columns are read by position
        (matching ``original_cols``): 2/3 = bid/ask price level 1,
        4/5 = bid/ask price level 2, 6/7 = bid/ask size level 1,
        8/9 = bid/ask size level 2.
    indices : 1-D int array
        Ascending segment boundaries; segment ``k`` covers rows
        ``indices[k]:indices[k + 1]``.

    Returns
    -------
    2-D float array
        ``np_arr`` with 16 extra columns, in the order of the module-level
        ``cols`` list. Rows not covered by any segment keep zeros there.
    """
    n_features = 16  # change dimension as you have more features
    zeros = np.zeros((np_arr.shape[0], n_features))
    for ind in range(len(indices) - 1):
        start, end = indices[ind], indices[ind + 1]  # rows up to the point where a trade occurs
        if end <= start:
            # An empty segment has no max/min; leave its (non-existent) rows alone.
            continue
        arr = np_arr[start:end].transpose()  # one row per column, for easy selection

        hbuy1 = arr[2].max()  # highest buy 1
        lsell1 = arr[3].min()  # lowest sell 1
        mid1 = (hbuy1 + lsell1) / 2  # average of highest buy 1 and lowest sell 1
        zeros[start:end, 0] = hbuy1
        zeros[start:end, 1] = lsell1
        zeros[start:end, 2] = mid1

        hbuy2 = arr[4].max()  # highest buy 2
        lsell2 = arr[5].min()  # lowest sell 2
        mid2 = (hbuy2 + lsell2) / 2  # average of highest buy 2 and lowest sell 2
        zeros[start:end, 3] = hbuy2
        zeros[start:end, 4] = lsell2
        zeros[start:end, 5] = mid2

        # Per-row distance of the level-1 quotes from the segment mid price.
        zeros[start:end, 6] = mid1 - arr[2]
        zeros[start:end, 7] = mid1 - arr[3]

        # Per-row distance of the level-2 quotes from the segment mid price.
        zeros[start:end, 8] = mid2 - arr[4]
        zeros[start:end, 9] = mid2 - arr[5]

        zeros[start:end, 10] = hbuy1 - lsell1  # high buy low sell spread 1
        zeros[start:end, 11] = hbuy2 - lsell2  # high buy low sell spread 2

        # Size-weighted price built from the segment extremes. Level 1 is
        # weighted by the level-1 sizes (columns 6/7); level 2 must use the
        # level-2 sizes (columns 8/9), not the level-1 sizes again.
        wap_high_low1 = (hbuy1 * arr[7] + lsell1 * arr[6]) / (arr[6] + arr[7])
        wap_high_low2 = (hbuy2 * arr[9] + lsell2 * arr[8]) / (arr[8] + arr[9])

        # Log returns inside the segment, with a leading 0 for the first row.
        log_wap_high_low1 = np.append(np.array(0), np.diff(np.log(wap_high_low1)))
        log_wap_high_low2 = np.append(np.array(0), np.diff(np.log(wap_high_low2)))
        zeros[start:end, 12] = wap_high_low1
        zeros[start:end, 13] = wap_high_low2
        zeros[start:end, 14] = log_wap_high_low1.std()
        zeros[start:end, 15] = log_wap_high_low2.std()

    trade_book_arr = np.append(np_arr, zeros, axis=1)
    return trade_book_arr

> Mistake in version 1 of this notebook
> * The trade index selection was wrong; it has now been corrected.

In [None]:
%%time
def get_calculated_features_trade(stock, train=True):
    """Build one row of aggregated book/trade features per time id for a stock.

    Steps:
      1. Read the stock's book and trade parquet partitions and left-merge
         trades onto book rows by ``time_id`` and ``seconds_in_bucket``.
      2. Cut the merged rows into segments that end at each trade row and
         never cross a ``time_id`` boundary, and compute per-segment features
         with ``calculate_features``.
      3. Keep the rows where a trade occurred, plus every row of time ids that
         have no trade at all (so each time id still yields an output row).
      4. Aggregate per ``time_id``.

    Parameters
    ----------
    stock : int
        Stock id; appended to the parquet path prefixes.
    train : bool, default True
        Read from the train paths when True, otherwise from the test paths.

    Returns
    -------
    pandas.DataFrame
        Columns: ``time_id``, ``stock_id``, the last value of every feature,
        first/max/min/last of ``price``, mean/std/median/var/sem of every
        feature, and ``row_id`` ("<stock_id>-<time_id>").
    """
    if train:
        trade_path, book_path = trade_train_path, book_train_path
    else:
        trade_path, book_path = trade_test_path, book_test_path

    trade_df = pd.read_parquet(trade_path + str(stock))
    book_df = pd.read_parquet(book_path + str(stock))

    # time ids where there is no trade transaction
    time_ids_not_in_trade = np.setdiff1d(book_df['time_id'].unique(), trade_df['time_id'].unique())

    merged_df = book_df.merge(trade_df, on=['time_id', 'seconds_in_bucket'], how='left')
    n_rows = merged_df.shape[0]

    # Segment boundaries: a segment ends right after each trade row. A new
    # segment also starts wherever time_id changes, so the book rows of two
    # different time buckets are never mixed in one segment.
    trade_rows = np.flatnonzero(merged_df['price'].notna().to_numpy())
    time_id_values = merged_df['time_id'].to_numpy()
    time_id_starts = np.flatnonzero(time_id_values[1:] != time_id_values[:-1]) + 1
    segment_bounds = np.unique(
        np.concatenate([[0], trade_rows + 1, time_id_starts, [n_rows]])
    ).astype(np.int64)

    # features calculation
    features_arr = calculate_features(merged_df.to_numpy(), segment_bounds)

    # Select the rows where a trade occurred (price is not NaN).
    price_col = original_cols.index('price')
    has_trade = ~np.isnan(features_arr[:, price_col])
    if has_trade.sum() <= 1:
        # With at most one trade row, fall back to every book row. Those rows
        # already include the no-trade time ids, so nothing is appended.
        trade_arr = features_arr
    else:
        # Also keep all book rows of time ids without any trade, so that every
        # time id is present for prediction.
        in_no_trade_time_id = np.isin(features_arr[:, 0], time_ids_not_in_trade)
        trade_arr = features_arr[has_trade | in_no_trade_time_id]

    df = pd.DataFrame(trade_arr, columns=original_cols + cols)
    df['time_id'] = df['time_id'].astype('int64')

    features = [f for f in df.columns if f not in ['time_id', 'seconds_in_bucket']]
    by_time_id = df.groupby('time_id')

    grouped_df = by_time_id[features].agg(['mean', 'std', 'median', 'var', 'sem'])
    grouped_df.columns = ['_'.join(col) for col in grouped_df.columns]

    # extract the last transaction of book order data for which trade occurred
    last_book = by_time_id[features].agg(['last'])
    last_book.columns = [name for name, _ in last_book.columns]

    # Open high low close for trade data only
    ohlc_trade = by_time_id[['price']].agg(['first', 'max', 'min', 'last'])
    ohlc_trade.columns = ['_'.join(col) for col in ohlc_trade.columns]

    df_all = pd.concat([last_book, ohlc_trade, grouped_df], axis=1)
    df_all.insert(0, 'stock_id', stock)
    df_all = df_all.reset_index()
    df_all['row_id'] = df_all['stock_id'].astype(str) + "-" + df_all['time_id'].astype(str)
    return df_all

# Build the per-stock feature frames in parallel, one task per stock id.
train_trade_book = Parallel(n_jobs=-1, verbose=1)(
    delayed(get_calculated_features_trade)(stock_id)
    for stock_id in train_stock_ids
)

test_trade_book = Parallel(n_jobs=-1, verbose=1)(
    delayed(get_calculated_features_trade)(stock_id, train=False)
    for stock_id in test_stock_ids
)

In [None]:
# Stack the per-stock feature frames row-wise into one frame per split.
df_train = pd.concat(train_trade_book, axis=0)
df_test = pd.concat(test_trade_book, axis=0)

In [None]:
# Preview the first rows of the training feature frame.
df_train.head()

In [None]:
# Preview the first rows of the test feature frame.
df_test.head()

# Train LGBM 

In [None]:
# Model inputs: every column except the identifiers.
features = [f for f in df_train.columns if f not in ['row_id', 'time_id']]

# Look the target up by row_id instead of relying on `train` and `df_train`
# having the same row order: df_train is assembled from the parquet files, so
# positional pairing would silently mismatch if the two orders ever differ.
y = train.set_index('row_id')['target'].reindex(df_train['row_id']).values
if np.isnan(y).any():
    raise ValueError('Some rows of df_train have no matching target in train (matched on row_id).')

In [None]:
# 5-fold cross-validated LightGBM training.
# Each fold fills its slice of the out-of-fold predictions and adds its test
# predictions to a running sum that is averaged after the loop.
seed = 60
params = {
    'objective': 'rmse',
    'boosting_type': 'gbdt',
    'max_depth': -1,
    'categorical_column': [0],  # position 0 of `features`
    'seed': seed,
    'feature_fraction_seed': seed,
    'bagging_seed': seed,
    'drop_seed': seed,
    'data_random_seed': seed,
    'n_jobs': -1,
    'verbose': -1}

oof_predictions = np.zeros(df_train.shape[0])

test_preds = np.zeros(df_test.shape[0])

# Reuse `seed` so the split and the model share one source of randomness.
kfold = KFold(n_splits=5, random_state=seed, shuffle=True)
for fold, (trn_ind, val_ind) in enumerate(kfold.split(df_train)):
    print(f'Training fold {fold + 1}')
    x_train, x_val = df_train[features].iloc[trn_ind], df_train[features].iloc[val_ind]
    y_train, y_val = y[trn_ind], y[val_ind]

    # Weighting squared error by 1 / y^2 makes the RMSE objective behave like
    # a squared *percentage* error, in line with the RMSPE metric.
    train_weights = 1 / np.square(y_train)
    val_weights = 1 / np.square(y_val)

    train_dataset = lgb.Dataset(x_train, y_train, weight=train_weights)
    val_dataset = lgb.Dataset(x_val, y_val, weight=val_weights)
    model = lgb.train(params=params,
                      num_boost_round=1200,
                      train_set=train_dataset,
                      valid_sets=[train_dataset, val_dataset],
                      verbose_eval=250,
                      early_stopping_rounds=30,
                      feval=feval_rmspe)

    oof_predictions[val_ind] = model.predict(x_val)
    oof_rmspe = rmspe(y_val, oof_predictions[val_ind])
    # Report the same 1-based fold number as the "Training fold" message.
    print(f'Fold {fold + 1} RMSPE is {oof_rmspe}')
    lgb.plot_importance(model, figsize=(5, 2.5), max_num_features=10)
    plt.show()

    test_preds += model.predict(df_test[features])

test_preds = test_preds / kfold.n_splits

In [None]:
# Overall out-of-fold score, then actual vs. predicted values plotted by row.
overall_rmspe = rmspe(y, oof_predictions)
print(f"RMSPE on overall oof predictions : {overall_rmspe}")
res = pd.DataFrame({'actual': y, 'predicted': oof_predictions})
res.plot(figsize=(15, 6))
plt.show()

# Submission

In [None]:

# Attach predictions to the test rows by row_id. The predictions follow the
# row order of df_test, which is built from the parquet files and may contain
# fewer rows than `test` or a different order; pairing by position could put
# a prediction on the wrong row. Test rows without a prediction get 0.
pred_by_row = pd.DataFrame({'row_id': df_test['row_id'].to_numpy(), 'target': test_preds})
test_preds = test.merge(pred_by_row, on='row_id', how='left').fillna(0)
test_preds[['row_id', 'target']].to_csv('submission.csv', index=False)
test_preds.head()