# Data Pre-Processing


Defining some useful functions for later processing:

In [None]:
def log_return(list_stock_prices):
    """Return the first difference of the natural log of the prices.

    The first element of the result is NaN because it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)

def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of y_pred against y_true."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

def feval_RMSPE(preds, train_data):
    """Custom evaluation callback returning ('RMSPE', score, False).

    The score is the RMSPE of ``preds`` against ``train_data.get_label()``,
    rounded to 5 decimals. The trailing False is presumably LightGBM's
    "higher is better" flag - confirm against the caller.
    """
    score = rmspe(y_true=train_data.get_label(), y_pred=preds)
    return 'RMSPE', round(score, 5), False

def ffill(data_df):
    """Expand each time_id to seconds 0..599, forward-filling missing rows.

    The frame is indexed by (time_id, seconds_in_bucket), reindexed onto the
    full product of the observed time_ids and np.arange(0, 600) with
    method='ffill', and returned with the index restored as columns.
    """
    index_names = ['time_id', 'seconds_in_bucket']
    indexed = data_df.set_index(index_names)
    full_grid = pd.MultiIndex.from_product(
        [indexed.index.levels[0], np.arange(0, 600)],
        names=index_names,
    )
    filled = indexed.reindex(full_grid, method='ffill')
    return filled.reset_index()

def renamecol(col):
    """Flatten a tuple column label into an underscore-joined string.

    Non-tuple labels are returned unchanged.
    """
    if not isinstance(col, tuple):
        return col
    return '_'.join(map(str, col))

def Price_Open(price_data):
    """Return the first element (by position) of price_data."""
    return price_data.iloc[0]

def Price_Close(price_data):
    """Return the last element (by position) of price_data."""
    return price_data.iat[-1]


def CloseToClose_estimator(close, window=1, trading_periods=1, clean=True):
    """Rolling standard deviation of close-to-close log returns.

    The rolling std over ``window`` rows is multiplied by
    sqrt(trading_periods). With ``clean`` true, NaN rows are dropped.

    NOTE(review): with the default window=1 the sample standard deviation of
    a single observation is presumably NaN for every row - confirm before
    relying on the default.
    """
    returns = np.log(close / close.shift(1))
    rolling_std = returns.rolling(window=window, center=False).std()
    scaled = rolling_std * math.sqrt(trading_periods)
    return scaled.dropna() if clean else scaled
    
    
def Parkinson_estimator(High,Low, window=1, trading_periods=1, clean=True):
    """Rolling Parkinson (high/low range) volatility estimate.

    Parameters:
        High, Low: pandas Series of high and low prices, aligned row by row.
        window: number of rows averaged per estimate.
        trading_periods: scaling factor applied to the mean variance term
            before the square root is taken.
        clean: when true, NaN rows are dropped from the result.

    Returns:
        Series of sqrt(trading_periods * rolling mean of
        ln(High/Low)^2 / (4 ln 2)).
    """
    # Per-row variance term: ln(High/Low)^2 / (4 ln 2).
    rs = (1.0 / (4.0 * math.log(2.0))) * ((High / Low).apply(np.log))**2.0

    def f(v):
        # Scale inside the square root so volatility grows with
        # sqrt(trading_periods), matching GarmanKlass_estimator and
        # CloseToClose_estimator. The previous form took the root first and
        # therefore scaled linearly.
        return (trading_periods * v.mean())**0.5

    result = rs.rolling(
        window=window,
        center=False
    ).apply(func=f)

    if clean:
        return result.dropna()
    else:
        return result
    

def GarmanKlass_estimator(High,Low,Close,Open, window=1, trading_periods=1, clean=True):
    """Rolling Garman-Klass volatility estimate.

    Per row the variance term is
    0.5 * ln(High/Low)^2 - (2 ln 2 - 1) * ln(Close/Open)^2.
    The result is sqrt(trading_periods * rolling mean of that term) over
    ``window`` rows; NaN rows are dropped when ``clean`` is true.
    """
    range_sq = np.log(High / Low) ** 2
    body_sq = np.log(Close / Open) ** 2
    per_row = 0.5 * range_sq - (2 * math.log(2) - 1) * body_sq

    def _scaled_root(chunk):
        return (trading_periods * chunk.mean()) ** 0.5

    rolled = per_row.rolling(window=window, center=False).apply(func=_scaled_root)
    return rolled.dropna() if clean else rolled


    
def RogerSatchell_estimator(High,Low,Close,Open, window=1, trading_periods=1, clean=True):
    """Rolling Rogers-Satchell volatility estimate.

    Parameters:
        High, Low, Close, Open: pandas Series of prices, aligned row by row.
        window: number of rows averaged per estimate.
        trading_periods: scaling factor applied to the mean variance term
            before the square root is taken.
        clean: when true, NaN rows are dropped from the result.

    Returns:
        Series of sqrt(trading_periods * rolling mean of
        ln(H/O) * (ln(H/O) - ln(C/O)) + ln(L/O) * (ln(L/O) - ln(C/O))).
    """
    log_ho = (High/ Open).apply(np.log)
    log_lo = (Low / Open).apply(np.log)
    log_co = (Close /Open).apply(np.log)

    rs = log_ho * (log_ho - log_co) + log_lo * (log_lo - log_co)

    def f(v):
        # Scale inside the square root so volatility grows with
        # sqrt(trading_periods), matching GarmanKlass_estimator. The previous
        # form took the root first and therefore scaled linearly.
        return (trading_periods * v.mean())**0.5

    result = rs.rolling(
        window=window,
        center=False
    ).apply(func=f)

    if clean:
        return result.dropna()
    else:
        return result
    
    
    
def YangZhang_estimator(High,Low,Close,Open, window=1, trading_periods=1, clean=True):
    """Rolling Yang-Zhang style volatility estimate with a fixed weight.

    Combines three rolling sums over ``window`` rows: squared
    open-vs-previous-close log returns, squared close-to-close log returns,
    and the Rogers-Satchell term. The weight k is fixed at 0.34 and the sums
    are not normalised by the window length. The square root of the
    combination is multiplied by sqrt(trading_periods); NaN rows are dropped
    when ``clean`` is true.
    """
    previous_close = Close.shift(1)

    log_ho = np.log(High / Open)
    log_lo = np.log(Low / Open)
    log_co = np.log(Close / Open)

    open_gap_sq = np.log(Open / previous_close) ** 2
    close_move_sq = np.log(Close / previous_close) ** 2
    rs = log_ho * (log_ho - log_co) + log_lo * (log_lo - log_co)

    def _window_sum(series):
        return series.rolling(window=window, center=False).sum() * (1.0)

    close_vol = _window_sum(close_move_sq)
    open_vol = _window_sum(open_gap_sq)
    window_rs = _window_sum(rs)

    k = 0.34 / (1)
    variance = open_vol + k * close_vol + (1 - k) * window_rs
    result = np.sqrt(variance) * math.sqrt(trading_periods)

    return result.dropna() if clean else result
    
def count_unique(series):
    """Return the number of distinct values in series."""
    distinct_values = np.unique(series)
    return len(distinct_values)

# 25th percentile
def q25(x):
    """Return the 0.25 quantile of x via its ``quantile`` method."""
    return x.quantile(q=0.25)

# 50th percentile (median)
def q50(x):
    """Return the 0.5 quantile of x via its ``quantile`` method."""
    return x.quantile(q=0.5)

# 75th percentile
def q75(x):
    """Return the 0.75 quantile of x via its ``quantile`` method."""
    return x.quantile(q=0.75)

Loading libraries and the challenge data


In [None]:
#Import Libraries and setting correct file path
import numpy as np # Math
import pandas as pd # data processing
import glob 
import os
import gc
import math
import scipy


from joblib import Parallel, delayed # Parallel processing

from sklearn import preprocessing, model_selection # Model evaluation
import lightgbm as lgb # Boosting Models

from sklearn.metrics import r2_score # Model evaluation

import matplotlib.pyplot as plt # Data Visualization
import seaborn as sns # Data Visualization

import pyarrow
import tqdm
# Notebook-level random seed.
seed = 42 

# Directory with the competition input: train/test CSVs and the book/trade
# parquet datasets are read from here.
path_data = '../input/optiver-realized-volatility-prediction'
# Directory of a pre-processed training set; only referenced from a
# commented-out read_csv call in the visible cells.
path_train = '../input/traindata-pre-processing-part2'

# Output location for submissions (not used in the visible cells).
path_submissions = '/'

# Name of the label column and a container for per-fold scores
# (presumably filled by later cells - confirm).
target_name = 'target'
scores_folds = {}

In [None]:
# Core function that extracts and transforms the data for a single stock
def _flatten_group_stats(df, feature_dict, min_second=None):
    """Aggregate df per (stock_id, time_id) and flatten the column labels.

    When ``min_second`` is given, only rows whose ``seconds_in_bucket`` is at
    least that value are aggregated.
    """
    if min_second is not None:
        df = df[df['seconds_in_bucket'] >= min_second]
    stat = df.groupby(by=['stock_id', 'time_id']).agg(feature_dict).reset_index()
    # agg() produces (column, function) tuples; join them as "column_function".
    stat.columns = [renamecol(col) for col in stat.columns]
    # The group keys come out of the flattening with a trailing underscore.
    return stat.rename({'stock_id_': 'stock_id', 'time_id_': 'time_id'}, axis=1)


def _windowed_group_stats(df, feature_dict, window_starts=(150, 300, 450)):
    """Full-bucket stats plus the same stats restricted to each late window.

    For every start second S the aggregation is repeated on rows with
    ``seconds_in_bucket`` >= S and left-joined with an ``_S`` column suffix.
    """
    keys = ['stock_id', 'time_id']
    stat = _flatten_group_stats(df, feature_dict)
    for start in window_starts:
        suffix = '_{}'.format(start)
        suffixed_keys = [key + suffix for key in keys]
        window_stat = _flatten_group_stats(df, feature_dict, start).add_suffix(suffix)
        stat = stat.merge(window_stat, how='left', left_on=keys, right_on=suffixed_keys)
        stat = stat.drop(suffixed_keys, axis=1)
    return stat


def get_stock_stat(stock_id : int,dataType = 'train'):
    """Build one row of book and trade features per time_id for one stock.

    Reads the book and trade parquet partitions of ``stock_id`` under
    ``path_data``, derives per-row features, aggregates them per
    (stock_id, time_id) over the full bucket and over the windows starting at
    seconds 150, 300 and 450, and left-joins the trade features onto the book
    features. Missing values in the joined frame are replaced with 0.

    Parameters:
        stock_id: identifier used to locate the parquet partitions.
        dataType: dataset name inserted into the parquet paths ('train' by
            default).

    Returns:
        DataFrame keyed by stock_id and time_id.
    """
    key = ['stock_id', 'time_id', 'seconds_in_bucket']

    #Book Data
    df_book = pd.read_parquet(os.path.join(path_data, 'book_{}.parquet/stock_id={}/'.format(dataType, stock_id)))
    df_book['stock_id'] = stock_id
    cols = key + [col for col in df_book.columns if col not in key]
    df_book = df_book[cols]

    #Book data Seconds in buckets features

    df_book['wap1'] = (df_book['bid_price1'] * df_book['ask_size1'] +
                                    df_book['ask_price1'] * df_book['bid_size1']) / (df_book['bid_size1'] + df_book['ask_size1'])
    df_book['wap2'] = (df_book['bid_price2'] * df_book['ask_size2'] +
                                    df_book['ask_price2'] * df_book['bid_size2']) / (df_book['bid_size2'] + df_book['ask_size2'])

    df_book['wap_balance'] = abs(df_book['wap1'] - df_book['wap2'])
    df_book['bid_spread'] = df_book['bid_price1'] - df_book['bid_price2']
    df_book['ask_spread'] = df_book['ask_price1'] - df_book['ask_price2']
    df_book['total_volume'] = (df_book['ask_size1'] + df_book['ask_size2']) + (df_book['bid_size1'] + df_book['bid_size2'])
    df_book['volume_imbalance'] = abs((df_book['ask_size1'] + df_book['ask_size2']) - (df_book['bid_size1'] + df_book['bid_size2']))

    df_book['Bid_Ask_Spread'] = df_book['ask_price1'] - df_book['bid_price1']
    df_book['Bid_Ask_Spread_Pct'] = ((df_book['ask_price1']/ df_book['bid_price1'])-1)*100
    df_book['Quoted_Spread'] = (df_book['ask_price1']-df_book['bid_price1'])/((df_book['ask_price1']+df_book['bid_price1'])/2)*100
    df_book['Bid_Ask_Balance'] = (df_book['ask_size1']- df_book['bid_size1'])
    df_book['Bid_Ask_Balance_Pct'] = ((df_book['ask_size1']/df_book['bid_size1'])-1)*100
    df_book['Market_Depth'] = (df_book['bid_size1']+df_book['ask_size1']+df_book['bid_size2']+df_book['ask_size2'])
    df_book['Order_Volume_bid'] = (df_book['bid_size1']+df_book['bid_size2'])
    df_book['Order_Volume_ask'] = (df_book['ask_size1']+df_book['ask_size2'])
    df_book['BestPrice_Difference_bid'] = (df_book['ask_price1'] - df_book['bid_price2'])
    # NOTE(review): the two "_ask" features below are built from bid prices
    # only (a sum, and a bid1/bid2 ratio). Kept unchanged because downstream
    # feature lists reference them - confirm whether this is intended.
    df_book['BestPrice_Difference_ask'] = (df_book['bid_price1']+df_book['bid_price2'])
    df_book['BestPrice_Difference_bid_pct'] = ((df_book['ask_price1']/df_book['bid_price2'])-1)*100
    df_book['BestPrice_Difference_ask_pct'] = ((df_book['bid_price1']/df_book['bid_price2'])-1)*100

    # transform() keeps the original row index on every pandas version;
    # apply() prepends the group key on pandas >= 2.0, which breaks the
    # column assignment.
    df_book['log_return1'] = df_book.groupby(by = ['time_id'])['wap1'].transform(log_return).fillna(0)
    df_book['log_return2'] = df_book.groupby(by = ['time_id'])['wap2'].transform(log_return).fillna(0)

    #Book data time id aggregation

    # Reusable aggregation lists; the order fixes the output column order.
    basic = [np.mean, np.var, np.max, np.min]
    open_close = basic + [Price_Open, Price_Close]
    distribution = open_close + [scipy.stats.kurtosis, scipy.stats.skew, q25, q50, q75]

    #dict for aggregate
    create_feature_dict = {
        'log_return1': [realized_volatility],
        'log_return2': [realized_volatility],
        'wap1': distribution,
        'wap2': distribution,
    }
    for col in ['Bid_Ask_Spread', 'wap_balance', 'bid_spread', 'ask_spread',
                'total_volume', 'volume_imbalance', 'Bid_Ask_Spread_Pct',
                'Quoted_Spread', 'Bid_Ask_Balance', 'Bid_Ask_Balance_Pct',
                'Market_Depth', 'Order_Volume_bid', 'Order_Volume_ask',
                'BestPrice_Difference_bid', 'BestPrice_Difference_ask',
                'BestPrice_Difference_bid_pct', 'BestPrice_Difference_ask_pct']:
        create_feature_dict[col] = basic
    for col in ['ask_price1', 'ask_price2', 'bid_price1', 'bid_price2']:
        create_feature_dict[col] = open_close

    stock_stat = _windowed_group_stats(df_book, create_feature_dict)

    #Trade data
    df_trade =  pd.read_parquet(os.path.join(path_data,'trade_{}.parquet/stock_id={}'.format(dataType, stock_id)))
    df_trade = df_trade.sort_values(by=['time_id', 'seconds_in_bucket']).reset_index(drop=True)
    df_trade['stock_id'] = stock_id
    cols = key + [col for col in df_trade.columns if col not in key]
    df_trade = df_trade[cols]

    #Trade data Seconds in buckets features
    df_trade['trade_log_return1'] = df_trade.groupby(by = ['time_id'])['price'].transform(log_return).fillna(0)

    #Trade data time id aggregation
    create_feature_dict_trade = {
        'trade_log_return1': [realized_volatility],
        'seconds_in_bucket': [np.count_nonzero, count_unique],
        'size': basic + [np.sum],
        'order_count': open_close + [np.sum],
        'price': distribution,
    }

    trade_stat = _windowed_group_stats(df_trade, create_feature_dict_trade)

    #Joining book and trade feature
    stock_stat = stock_stat.merge(trade_stat, on=['stock_id', 'time_id'], how='left').fillna(0)

    return stock_stat

# Speed up the extraction by processing several stocks in parallel

def get_dataSet(stock_ids : list,dataType = 'train'):
    """Run get_stock_stat for every stock id and stack the results.

    The calls are dispatched through joblib's Parallel with n_jobs=-1 and the
    per-stock frames are concatenated with a fresh integer index.
    """
    jobs = (delayed(get_stock_stat)(stock_id, dataType) for stock_id in stock_ids)
    per_stock_frames = Parallel(n_jobs=-1)(jobs)
    return pd.concat(per_stock_frames, ignore_index = True)

Applying the main function for extracting and processing the data


In [None]:
%%time
# Load the training labels and attach the per-stock engineered features.
train = pd.read_csv(os.path.join(path_data, 'train.csv'))
train_stock_stat_df = get_dataSet(
    stock_ids=train['stock_id'].unique(),
    dataType='train',
)
train = train.merge(train_stock_stat_df, how='left', on=['stock_id', 'time_id'])
# Optional steps kept disabled: report the shape and persist the frame.
#print('Train shape: {}'.format(train.shape))
#train.to_csv('TrainingWithFeatures.csv',index=False)
display(train.head(2))

In [None]:
#Import Libraries and setting correct file path
import numpy as np # Math
import pandas as pd # data processing
import glob 
import os
import gc
import math
import scipy


from joblib import Parallel, delayed # Parallel processing

from sklearn import preprocessing, model_selection # Model evaluation
import lightgbm as lgb # Boosting Models

from sklearn.metrics import r2_score # Model evaluation

import matplotlib.pyplot as plt # Data Visualization
import seaborn as sns # Data Visualization

import pyarrow
import tqdm
# Notebook-level random seed.
seed = 42 

# Directory with the competition input: train/test CSVs and the book/trade
# parquet datasets are read from here.
path_data = '../input/optiver-realized-volatility-prediction'
# Directory of a pre-processed training set; only referenced from a
# commented-out read_csv call in the visible cells.
path_train = '../input/traindata-pre-processing-window'

# Output location for submissions (not used in the visible cells).
path_submissions = '/'

# Name of the label column and a container for per-fold scores
# (presumably filled by later cells - confirm).
target_name = 'target'
scores_folds = {}

## Train and test datasets

In [None]:
# Feature engineering for the test set

test = pd.read_csv(os.path.join(path_data, 'test.csv'))
test_stock_stat_df = get_dataSet(stock_ids=test['stock_id'].unique(), dataType='test')
test = test.merge(test_stock_stat_df, how='left', on=['stock_id', 'time_id'])
# Remove any leftover columns whose name contains "index__".
test = test.drop(columns=list(test.filter(regex='index__').columns))

# Range-based volatility estimators computed from the aggregated columns.
# NOTE(review): the *_amin column is passed as High and *_amax as Low; this
# mapping is kept exactly as it was - confirm it is intended.
test['Parkinson_estimator'] = Parkinson_estimator(
    High=test['wap1_amin'], Low=test['wap1_amax'])
test['GarmanKlass_estimator'] = GarmanKlass_estimator(
    High=test['wap1_amin'], Low=test['wap1_amax'],
    Close=test['wap1_Price_Close'], Open=test['wap1_Price_Open'])
test['RogerSatchell_estimator'] = RogerSatchell_estimator(
    High=test['wap1_amin'], Low=test['wap1_amax'],
    Close=test['wap1_Price_Close'], Open=test['wap1_Price_Open'])
test['YangZhang_estimator'] = YangZhang_estimator(
    High=test['wap1_amin'], Low=test['wap1_amax'],
    Close=test['wap1_Price_Close'], Open=test['wap1_Price_Open'])
test['Parkinson_estimator_trade'] = Parkinson_estimator(
    High=test['price_amin'], Low=test['price_amax'])
test['GarmanKlass_estimator_trade'] = GarmanKlass_estimator(
    High=test['price_amin'], Low=test['price_amax'],
    Close=test['price_Price_Close'], Open=test['price_Price_Open'])
test['Parkinson_estimator_2'] = Parkinson_estimator(
    High=test['wap2_amin'], Low=test['wap2_amax'])
test['GarmanKlass_estimator_2'] = GarmanKlass_estimator(
    High=test['wap2_amin'], Low=test['wap2_amax'],
    Close=test['wap2_Price_Close'], Open=test['wap2_Price_Open'])

In [None]:
# Alternative entry point (disabled): load the pre-processed training data
#train = pd.read_csv(os.path.join(path_train, 'TrainingWithFeatures.csv'))

# Range-based volatility estimators computed from the aggregated columns.
# NOTE(review): the *_amin column is passed as High and *_amax as Low; this
# mapping is kept exactly as it was - confirm it is intended.
train['Parkinson_estimator'] = Parkinson_estimator(
    High=train['wap1_amin'], Low=train['wap1_amax'])
train['GarmanKlass_estimator'] = GarmanKlass_estimator(
    High=train['wap1_amin'], Low=train['wap1_amax'],
    Close=train['wap1_Price_Close'], Open=train['wap1_Price_Open'])
train['RogerSatchell_estimator'] = RogerSatchell_estimator(
    High=train['wap1_amin'], Low=train['wap1_amax'],
    Close=train['wap1_Price_Close'], Open=train['wap1_Price_Open'])
train['YangZhang_estimator'] = YangZhang_estimator(
    High=train['wap1_amin'], Low=train['wap1_amax'],
    Close=train['wap1_Price_Close'], Open=train['wap1_Price_Open'])
train['Parkinson_estimator_trade'] = Parkinson_estimator(
    High=train['price_amin'], Low=train['price_amax'])
train['GarmanKlass_estimator_trade'] = GarmanKlass_estimator(
    High=train['price_amin'], Low=train['price_amax'],
    Close=train['price_Price_Close'], Open=train['price_Price_Open'])
train['Parkinson_estimator_2'] = Parkinson_estimator(
    High=train['wap2_amin'], Low=train['wap2_amax'])
train['GarmanKlass_estimator_2'] = GarmanKlass_estimator(
    High=train['wap2_amin'], Low=train['wap2_amax'],
    Close=train['wap2_Price_Close'], Open=train['wap2_Price_Open'])


In [None]:
# Market-wide features: the mean over all stocks within each time_id of
# selected per-stock columns. Each pair is (output column, source column);
# the list order fixes the order of the resulting columns.
_market_feature_sources = [
    ('realized_volatility_market1', 'log_return1_realized_volatility'),
    ('realized_volatility_market2', 'log_return2_realized_volatility'),
    ('Parkinson_estimator_market', 'Parkinson_estimator'),
    ('GarmanKlass_estimator_market', 'GarmanKlass_estimator'),
    ('RogerSatchell_estimator_market', 'RogerSatchell_estimator'),
    ('YangZhang_estimator_market', 'YangZhang_estimator'),
    ('total_volume_mean_market', 'total_volume_mean'),
    ('order_count_sum_market', 'order_count_sum'),
    ('size_sum_market', 'size_sum'),
    ('Bid_Ask_Spread_market', 'Bid_Ask_Spread_mean'),
]
# Columns that also exist per late window (suffixes _150, _300, _450).
_windowed_feature_sources = [
    ('realized_volatility_market1', 'log_return1_realized_volatility'),
    ('realized_volatility_market2', 'log_return2_realized_volatility'),
    ('total_volume_mean_market', 'total_volume_mean'),
    ('order_count_sum_market', 'order_count_sum'),
    ('size_sum_market', 'size_sum'),
    ('Bid_Ask_Spread_market', 'Bid_Ask_Spread_mean'),
]
_market_feature_sources += [
    ('{}_{}'.format(out_name, start), '{}_{}'.format(src_name, start))
    for start in (150, 300, 450)
    for out_name, src_name in _windowed_feature_sources
]
_market_aggregations = {
    out_name: (src_name, np.mean)
    for out_name, src_name in _market_feature_sources
}

MarketAverageTrain = train.groupby(by=['time_id']).agg(**_market_aggregations).reset_index()
MarketAverageTest = test.groupby(by=['time_id']).agg(**_market_aggregations).reset_index()

# Attach the market averages to every row of the matching time_id.
train = train.merge(MarketAverageTrain, how='left', on=['time_id']).fillna(0)
test = test.merge(MarketAverageTest, how='left', on=['time_id']).fillna(0)

In [None]:
# Drop the intermediate market-average frames and report the final shapes.
import gc

del MarketAverageTrain, MarketAverageTest
gc.collect()
print(f'Test shape: {test.shape}')
print(f'Train shape: {train.shape}')

# Boosting Model

In [None]:
# Seed shared by every seed-type entry of the parameter dictionary below.
seed0=42
# LightGBM parameter dictionary (presumably the ``params`` argument given to
# train_and_evaluate_lgb - confirm against the calling cell).
params0 = {
    'objective': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    # Index 0 of the feature matrix is treated as categorical
    # (presumably stock_id - confirm against the feature order).
    'categorical_column':[0],
    'seed':seed0,
    'feature_fraction_seed': seed0,
    'bagging_seed': seed0,
    'drop_seed': seed0,
    'data_random_seed': seed0,
    'n_jobs':-1,
    'verbose': -1,
    #Optuna Optimization
    # The values below are labelled as the outcome of an Optuna search.
    'max_bin': 95,
    'num_leaves': 323,
    'bagging_fraction': 0.6949437575460405,
    'max_depth': 10,
    'feature_fraction_bynode': 0.7708016312491508,
    'bagging_freq': 3,
    'min_data_in_leaf': 296,
    'reg_alpha': 9.093379148472092,
    'reg_lambda': 5.395451741231765,
    'feature_fraction': 0.42414747362283134

}



In [None]:
# Column names left out of the model's feature list: the identifier and label
# columns ("time_id", "target", "row_id") plus a fixed set of engineered
# feature columns. train_and_evaluate_lgb builds its features from every
# train column not in this set.
# NOTE(review): the feature names were presumably chosen by an earlier
# feature-importance analysis - confirm before editing the list.
ColToExclude={"time_id",
 "target",
 "row_id",
 'bid_spread_var',
 'wap2_skew_150',
 'ask_spread_amin_150',
 'BestPrice_Difference_bid_amin_150',
 'ask_spread_mean_150',
 'Quoted_Spread_var_300',
 'Market_Depth_amin_150',
 'size_mean_450',
 'BestPrice_Difference_bid_pct_var_300',
 'bid_spread_mean_450',
 'bid_spread_mean_300',
 'BestPrice_Difference_ask_pct_var_150',
 'Order_Volume_ask_mean',
 'Market_Depth_amax',
 'price_kurtosis_300',
 'order_count_Price_Open',
 'Bid_Ask_Balance_amin',
 'Bid_Ask_Spread_Pct_amax_300',
 'Bid_Ask_Balance_var_150',
 'price_var_150',
 'BestPrice_Difference_bid_amax_150',
 'wap2_skew_300',
 'Bid_Ask_Balance_Pct_var_150',
 'Market_Depth_var',
 'BestPrice_Difference_ask_pct_mean_300',
 'bid_price1_var_300',
 'order_count_amax_450',
 'bid_price1_Price_Close_300',
 'volume_imbalance_mean',
 'Bid_Ask_Balance_Pct_var_300',
 'Bid_Ask_Spread_Pct_amin_150',
 'ask_price1_var_150',
 'Bid_Ask_Spread_amin_450',
 'Bid_Ask_Balance_Pct_var',
 'BestPrice_Difference_ask_pct_var_300',
 'size_var_450',
 'ask_spread_var_300',
 'bid_price2_Price_Close_150',
 'BestPrice_Difference_bid_pct_amin_150',
 'ask_price1_amin',
 'bid_price2_var_150',
 'price_kurtosis_450',
 'Market_Depth_amin_450',
 'Order_Volume_bid_amax_300',
 'bid_price2_var',
 'Order_Volume_ask_mean_300',
 'BestPrice_Difference_bid_pct_amax_150',
 'Quoted_Spread_var_150',
 'ask_price2_var',
 'Order_Volume_ask_mean_150',
 'Quoted_Spread_var_450',
 'ask_price2_amax_150',
 'Bid_Ask_Spread_Pct_var_300',
 'ask_price2_amax_300',
 'bid_spread_mean',
 'wap2_kurtosis_300',
 'BestPrice_Difference_ask_pct_mean',
 'BestPrice_Difference_bid_pct_var_150',
 'wap2_Price_Open_300',
 'Market_Depth_var_150',
 'BestPrice_Difference_bid_pct_amin_300',
 'Bid_Ask_Balance_Pct_var_450',
 'ask_price2_Price_Open_450',
 'Order_Volume_bid_amax_150',
 'BestPrice_Difference_ask_pct_mean_150',
 'wap1_var',
 'Bid_Ask_Balance_amin_450',
 'Bid_Ask_Spread_amax_150',
 'Order_Volume_ask_var_450',
 'wap2_Price_Close_150',
 'ask_price2_Price_Close',
 'ask_price1_var',
 'wap2_Price_Open_150',
 'bid_price2_amax',
 'Bid_Ask_Balance_mean_450',
 'bid_spread_var_150',
 'ask_spread_mean_300',
 'volume_imbalance_var',
 'volume_imbalance_amax_450',
 'price_Price_Close_150',
 'wap1_Price_Close_150',
 'ask_price2_amax',
 'volume_imbalance_mean_150',
 'volume_imbalance_amax',
 'order_count_Price_Open_300',
 'ask_price1_Price_Open_150',
 'total_volume_amax_150',
 'Bid_Ask_Spread_var_450',
 'bid_spread_var_450',
 'total_volume_mean_450',
 'BestPrice_Difference_ask_var_150',
 'Order_Volume_ask_amax',
 'Bid_Ask_Balance_amax_300',
 'Order_Volume_ask_amax_450',
 'BestPrice_Difference_bid_amin_300',
 'volume_imbalance_mean_450',
 'volume_imbalance_mean_300',
 'BestPrice_Difference_bid_var_150',
 'Market_Depth_mean_300',
 'Order_Volume_ask_mean_450',
 'size_sum_150',
 'Bid_Ask_Balance_Pct_amax_150',
 'volume_imbalance_var_450',
 'price_Price_Open_150',
 'ask_price2_Price_Open_300',
 'Bid_Ask_Balance_amin_300',
 'order_count_Price_Close',
 'volume_imbalance_amax_150',
 'bid_price1_Price_Open_150',
 'total_volume_amax',
 'Order_Volume_ask_amax_150',
 'order_count_Price_Open_150',
 'wap2_Price_Close_300',
 'Bid_Ask_Balance_amax_450',
 'bid_price1_var_150',
 'volume_imbalance_var_150',
 'ask_price2_amin_300',
 'ask_price2_mean',
 'Market_Depth_var_300',
 'bid_price1_Price_Close_450',
 'Order_Volume_ask_amax_300',
 'ask_price1_amax_450',
 'ask_price1_amax_150',
 'wap2_q25',
 'Order_Volume_bid_amax_450',
 'Bid_Ask_Balance_amax',
 'bid_price2_Price_Open_150',
 'Bid_Ask_Balance_amax_150',
 'ask_price2_amin',
 'Order_Volume_ask_var_300',
 'ask_price2_amax_450',
 'bid_price1_amax',
 'price_amax',
 'total_volume_var_150',
 'volume_imbalance_var_300',
 'wap1_amin',
 'price_Price_Close',
 'total_volume_var_300',
 'Market_Depth_mean_450',
 'wap2_q25_300',
 'bid_price1_Price_Open_300',
 'BestPrice_Difference_ask_var',
 'BestPrice_Difference_ask_amin_450',
 'Order_Volume_ask_var_150',
 'order_count_Price_Close_150',
 'bid_price2_amin_450',
 'ask_price1_Price_Close',
 'order_count_Price_Close_450',
 'wap1_Price_Open_150',
 'ask_price1_mean',
 'price_Price_Close_300',
 'price_mean_450',
 'bid_price1_var',
 'wap1_Price_Close',
 'order_count_Price_Close_300',
 'Market_Depth_amax_450',
 'wap1_amax',
 'Bid_Ask_Spread_Pct_amin_450',
 'ask_price2_Price_Close_300',
 'total_volume_amax_450',
 'wap2_Price_Close_450',
 'total_volume_amax_300',
 'ask_price2_amin_450',
 'wap1_q50',
 'wap2_amin',
 'wap2_q75_150',
 'price_amin_450',
 'price_Price_Open_450',
 'price_amax_150',
 'price_amin',
 'wap1_q25',
 'price_Price_Open_300',
 'bid_price2_mean_300',
 'price_q75_450',
 'bid_price2_amin_150',
 'ask_price1_Price_Open_300',
 'bid_price2_Price_Open_300',
 'Bid_Ask_Balance_var_450',
 'wap1_amax_300',
 'bid_price2_amax_450',
 'bid_price1_amin_450',
 'ask_price1_mean_150',
 'ask_price2_amin_150',
 'price_q25',
 'wap1_q75_450',
 'ask_price1_amax',
 'bid_price1_amin_300',
 'wap2_Price_Open_450',
 'bid_price2_amax_300',
 'wap2_q50_450',
 'ask_price1_Price_Open_450',
 'bid_price2_Price_Open_450',
 'bid_price2_amin',
 'bid_price2_amax_150',
 'bid_price2_Price_Close_300',
 'wap2_q50_150',
 'price_q25_150',
 'wap2_mean_300',
 'ask_price2_mean_450',
 'ask_price1_Price_Close_150',
 'ask_price1_amax_300',
 'wap1_Price_Close_300',
 'price_q25_450',
 'wap2_amin_300',
 'wap1_amax_150',
 'price_q50_300',
 'bid_price2_mean',
 'bid_price1_amin',
 'BestPrice_Difference_ask_amax',
 'wap2_q50_300',
 'bid_price2_Price_Close_450',
 'price_q50_450',
 'Market_Depth_amax_300',
 'ask_price1_amin_450',
 'wap1_mean',
 'bid_price2_amin_300',
 'price_amin_300',
 'wap1_q50_150',
 'wap1_Price_Open_300',
 'price_q75',
 'wap2_amax',
 'wap2_q25_150',
 'BestPrice_Difference_ask_amax_450',
 'wap1_mean_450',
 'Market_Depth_var_450',
 'BestPrice_Difference_ask_amin',
 'wap2_amax_300',
 'ask_price2_Price_Close_150',
 'bid_price1_amax_150',
 'wap1_q25_300',
 'wap1_amin_300',
 'wap2_q75_450',
 'bid_price1_amin_150',
 'ask_price2_Price_Close_450',
 'wap2_q25_450',
 'bid_price1_Price_Open_450',
 'Market_Depth_amax_150',
 'price_q50',
 'BestPrice_Difference_ask_amin_300',
 'wap2_amin_150',
 'price_q25_300',
 'ask_price1_amin_150',
 'ask_price1_mean_300',
 'wap1_q25_450',
 'ask_price2_mean_150',
 'price_q50_150',
 'wap2_amax_450',
 'wap2_amax_150',
 'ask_price1_mean_450',
 'bid_price1_amax_450',
 'wap1_q75_150',
 'wap1_q75',
 'price_amin_150',
 'wap2_amin_450',
 'wap2_q75',
 'BestPrice_Difference_ask_amax_300',
 'BestPrice_Difference_ask_mean_150',
 'price_q75_300',
 'BestPrice_Difference_ask_amin_150',
 'bid_price1_amax_300',
 'bid_price2_mean_150',
 'BestPrice_Difference_ask_amax_150',
 'wap2_q75_300',
 'wap1_amin_450',
 'price_q75_150',
 'ask_price1_Price_Close_300',
 'BestPrice_Difference_ask_mean',
 'ask_price1_amin_300',
 'bid_price1_mean_300',
 'wap1_Price_Close_450',
 'ask_price2_mean_300',
 'wap1_Price_Open_450',
 'wap1_q50_300',
 'wap1_q50_450',
 'price_amax_450',
 'wap1_q25_150',
 'price_mean_150',
 'bid_price1_mean',
 'BestPrice_Difference_ask_mean_300',
 'BestPrice_Difference_ask_mean_450',
 'bid_price1_mean_450',
 'wap2_q50',
 'price_mean',
 'ask_price1_Price_Close_450',
 'wap2_mean_450',
 'wap2_mean_150',
 'wap2_mean',
 'bid_price1_mean_150',
 'wap1_amax_450',
 'wap1_q75_300',
 'price_mean_300',
 'price_amax_300',
 'bid_price2_mean_450',
 'wap1_mean_300',
 'order_count_amin_450',
 'order_count_amin',
 'wap1_amin_150',
 'order_count_amin_150',
 'order_count_amin_300',
 'wap1_mean_150'
             }

In [None]:
from sklearn.model_selection import GroupKFold

import lightgbm as lgb


# Root mean squared percentage error, used as the early-stopping metric
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of ``y_pred``.

    Each residual ``y_true - y_pred`` is divided by the matching ``y_true``
    value before squaring and averaging. There is no guard against zeros
    in ``y_true``.
    """
    relative_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(relative_error)))

def feval_rmspe(y_pred, lgb_train):
    """Custom evaluation function passed to ``lgb.train`` via ``feval``.

    Parameters
    ----------
    y_pred : predictions for the dataset being evaluated.
    lgb_train : dataset object; its labels are read with ``get_label()``.

    Returns
    -------
    tuple
        ``('RMSPE', score, False)`` -- metric name, metric value, and a
        flag saying that higher values are NOT better.
    """
    score = rmspe(lgb_train.get_label(), y_pred)
    return 'RMSPE', score, False

def train_and_evaluate_lgb(train, test, params, n_splits=5):
    """Train LightGBM with grouped K-fold CV and return averaged test predictions.

    Parameters
    ----------
    train : DataFrame containing the feature columns plus ``target`` and
        ``time_id`` (``time_id`` is the grouping key for the folds).
    test : DataFrame containing the same feature columns.
    params : dict of LightGBM parameters forwarded to ``lgb.train``.
    n_splits : number of CV folds; also the divisor used to average the
        per-fold test predictions (default 5, the previous fixed value).

    Returns
    -------
    numpy.ndarray
        Mean of the per-fold predictions for ``test``.

    Side effects: prints per-fold progress and the out-of-fold RMSPE, and
    plots the feature importance of the model trained on the last fold.
    Feature columns are every column of ``train`` not listed in the
    module-level ``ColToExclude``.
    """
    #features = [col for col in train.columns if col not in {"time_id", "target", "row_id"}]
    features = [col for col in train.columns if col not in ColToExclude]
    y = train['target']
    # Out-of-fold predictions: each row is predicted by the model that did
    # not see it during training.
    oof_predictions = np.zeros(train.shape[0])
    # Running average of the per-fold test predictions.
    test_predictions = np.zeros(test.shape[0])
    # Grouping by time_id keeps all rows of one time bucket in the same fold.
    kfold = GroupKFold(n_splits=n_splits)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, groups=train['time_id'])):
        print(f'Training fold {fold + 1}')
        x_train, x_val = train.iloc[trn_ind], train.iloc[val_ind]
        y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
        # Weighting each row by 1 / target**2 turns a squared-error loss into
        # a squared *percentage* error (assumes params selects such a loss).
        train_weights = 1 / np.square(y_train)
        val_weights = 1 / np.square(y_val)
        train_dataset = lgb.Dataset(x_train[features], y_train, weight = train_weights)
        val_dataset = lgb.Dataset(x_val[features], y_val, weight = val_weights)
        # NOTE: verbose_eval / early_stopping_rounds are lgb.train keyword
        # arguments in LightGBM 3.x; LightGBM 4.0 replaced them with the
        # log_evaluation / early_stopping callbacks.
        model = lgb.train(params = params,
                          num_boost_round=1500,
                          train_set = train_dataset,
                          valid_sets = [train_dataset, val_dataset],
                          verbose_eval = 250,
                          early_stopping_rounds=50,
                          feval = feval_rmspe)
        oof_predictions[val_ind] = model.predict(x_val[features])
        # Divide by the fold count so the accumulated sum ends up as a mean.
        test_predictions += model.predict(test[features]) / n_splits
    rmspe_score = rmspe(y, oof_predictions)
    print(f'Our out of folds RMSPE is {rmspe_score}')
    # Only the model from the final fold is plotted.
    lgb.plot_importance(model,max_num_features=20)
    #model.save_model('lgb.txt', num_iteration=model.best_iteration)
    return test_predictions

In [None]:
# Fit LightGBM with the params0 configuration and keep its test predictions.
predictions_lgb_0 = train_and_evaluate_lgb(train=train, test=test, params=params0)

# Disabled experiment: two models trained on separate subsets (train1 /
# train2) and selected per row by a realized-volatility threshold.
#predictions_lgb_1= train_and_evaluate_lgb(train1, test,params1)
#predictions_lgb_2= train_and_evaluate_lgb(train2, test,params2)

#test['target'] = predictions_lgb_0
#test['predictions_lgb_1'] = predictions_lgb_1
#test['predictions_lgb_2'] = predictions_lgb_2
#test['target'] = np.where(test['realized_volatility_market1']<= 0.0098, test['predictions_lgb_1'], test['predictions_lgb_2'])

#test[['row_id', 'target']].to_csv('submission.csv',index = False)

In [None]:
#Features Importance
#model = lgb.Booster(model_file='lgb.txt')
#importances = pd.DataFrame({'Feature': model.feature_name(), 
#                           'Importance': model.feature_importance(importance_type='gain')})
#importances.sort_values(by = 'Importance', inplace=True)
#importances2 = importances.nsmallest(300,'Importance', keep='first').sort_values(by='Importance', ascending=False)
#importances2.Feature.tolist()

# TabNet

In [None]:
!pip -q install ../input/pytorchtabnet/pytorch_tabnet-3.1.1-py3-none-any.whl


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy.matlib

import matplotlib.gridspec as gridspec
from matplotlib.ticker import MaxNLocator

from scipy import stats
from scipy.stats import norm
from joblib import Parallel, delayed

import shutil
import glob

from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold

from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

import torch
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts


# Global plotting configuration

plt.style.use('ggplot')
# Warm palette ordered light to dark; entry 3 colours the grid lines below.
orange_black = [
    '#fdc029',
    '#df861d',
    '#FF6347',
    '#aa3d01',
    '#a30e15',
    '#800000',
    '#171820',
]
# Apply all figure/axes/grid defaults in one call.
plt.rcParams.update({
    'figure.figsize': (16, 9),
    'figure.facecolor': '#FFFACD',
    'axes.facecolor': '#FFFFE0',
    'axes.grid': True,
    'grid.color': orange_black[3],
    'grid.alpha': 0.5,
    'grid.linestyle': '--',
})


import warnings
warnings.filterwarnings("ignore")

In [None]:
# Show the CPU count reported by psutil (the notebook displays the value of
# the cell's last expression).
import psutil
psutil.cpu_count()

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
print(gpu_info)

In [None]:
# Inputs for the TabNet section.
# NOTE: X and X_test are plain references to train/test (no copy is made),
# so any in-place column assignment on X / X_test also changes train / test.
X, X_test = train, test
y = train['target']
# Model inputs: every column of train that is not listed in ColToExclude.
features = [name for name in train.columns if name not in ColToExclude]


In [None]:
# Run a garbage-collection pass, then report the size of the training inputs.
gc.collect()
print('X shape: {}'.format(X.shape), 'y shape: {}'.format(y.shape), sep='\n')

In [None]:
# Per-column summaries of X (not referenced again in the cells shown here).
nunique = X.nunique()
types = X.dtypes

categorical_columns = []
categorical_dims =  {}

# Prepare the model inputs in place: 'stock_id' is label-encoded and treated
# as categorical; every other feature is standardised. Encoders/scalers are
# fitted on the training frame and only applied to the test frame.
#for col in X.columns:
for col in features:
    if  col == 'stock_id':
        l_enc = LabelEncoder()
        X[col] = l_enc.fit_transform(X[col].values)
        X_test[col] = l_enc.transform(X_test[col].values)
        categorical_columns.append(col)
        # Number of distinct categories = size of the embedding vocabulary.
        categorical_dims[col] = len(l_enc.classes_)
    else:
        scaler = StandardScaler()
        X[col] = scaler.fit_transform(X[col].values.reshape(-1, 1))
        X_test[col] = scaler.transform(X_test[col].values.reshape(-1, 1))

# The network is fed X[features].values, so categorical positions must be
# indices into `features`. Indexing into X.columns (which still holds the
# excluded columns) can point at the wrong input column.
cat_idxs = [ i for i, f in enumerate(features) if f in categorical_columns]

cat_dims = [ categorical_dims[f] for f in features if f in categorical_columns]

In [None]:
# Keyword arguments for TabNetRegressor; unpacked with ** when the model is
# constructed for each fold.
tabnet_params = {
    # Positions and cardinalities of the categorical input columns.
    'cat_idxs': cat_idxs,
    'cat_dims': cat_dims,
    'cat_emb_dim': 1,
    # Network size.
    'n_d': 16,
    'n_a': 16,
    'n_steps': 2,
    'gamma': 2,
    'n_independent': 2,
    'n_shared': 2,
    'lambda_sparse': 0,
    # Optimiser and learning-rate schedule.
    'optimizer_fn': Adam,
    'optimizer_params': {'lr': 2e-2},
    'mask_type': "entmax",
    'scheduler_params': {
        'T_0': 200,
        'T_mult': 1,
        'eta_min': 1e-4,
        'last_epoch': -1,
        'verbose': False,
    },
    'scheduler_fn': CosineAnnealingWarmRestarts,
    'seed': 42,
    'verbose': 10,
}

In [None]:
class RMSPE(Metric):
    """Evaluation metric for TabNet: root mean squared percentage error.

    Registered under the name "rmspe"; lower values are better
    (``_maximize`` is False).
    """

    def __init__(self):
        self._maximize = False
        self._name = "rmspe"

    def __call__(self, y_true, y_score):
        """Return sqrt(mean(((y_true - y_score) / y_true) ** 2))."""
        relative_error = (y_true - y_score) / y_true
        return np.sqrt(np.mean(np.square(relative_error)))
    


def RMSPELoss(y_pred, y_true):
    """Root mean squared percentage error as a torch loss.

    Note the argument order: predictions first, targets second. The result
    is a copy (``clone``) of the scalar tensor
    ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = torch.mean(relative_error ** 2)
    return torch.sqrt(mean_squared).clone()

In [None]:
# Grouped cross-validation for TabNet: rows sharing a time_id always land in
# the same fold. The fold count is defined once so that the splitter and the
# test-prediction average cannot drift apart.
#kfold = KFold(n_splits = 5, random_state = 42, shuffle = True)
n_folds = 5
kfold = GroupKFold(n_splits = n_folds)

# Out-of-fold predictions are stored as a column vector because the output of
# clf.predict is assigned to it directly (presumably shape (n, 1)).
oof_predictions = np.zeros((X.shape[0], 1))
# Running mean of the per-fold test predictions.
test_predictions = np.zeros(X_test.shape[0])
#feature_importances = pd.DataFrame()
#feature_importances["feature"] = X[features].columns.tolist()
#stats = pd.DataFrame()
explain_matrices = []
masks_ =[]

#for fold, (trn_ind, val_ind) in enumerate(kfold.split(X)):
for fold, (trn_ind, val_ind) in enumerate(kfold.split(X,groups=X['time_id'])):
    print(f'Training fold {fold + 1}')
    X_train, X_val = X[features].iloc[trn_ind].values, X[features].iloc[val_ind].values
    # Targets are reshaped to column vectors before being passed to fit.
    y_train, y_val = y.iloc[trn_ind].values.reshape(-1,1), y.iloc[val_ind].values.reshape(-1,1)

    # A fresh model per fold, optimised and monitored on RMSPE.
    clf =  TabNetRegressor(**tabnet_params)
    clf.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        max_epochs = 200,
        patience = 50,
        batch_size = 1024*20,
        virtual_batch_size = 128*20,
        num_workers = 4,
        drop_last = False,
        eval_metric=[RMSPE],
        loss_fn=RMSPELoss
    )

    # Persist the fitted model of this fold.
    saving_path_name = f"./fold{fold}"
    saved_filepath = clf.save_model(saving_path_name)

    # Keep the explanation matrix and every per-step mask for this fold.
    # Iterating the returned keys in sorted order reproduces masks[0],
    # masks[1] for n_steps = 2 and keeps working if n_steps changes.
    explain_matrix, masks = clf.explain(X_val)
    explain_matrices.append(explain_matrix)
    masks_.extend(masks[step] for step in sorted(masks))

    oof_predictions[val_ind] = clf.predict(X_val)
    # Dividing by the fold count turns the accumulated sum into a mean.
    test_predictions += clf.predict(X_test[features].values).flatten() / n_folds
    #feature_importances[f"importance_fold{fold}+1"] = clf.feature_importances_

    #stats[f'fold{fold+1}_train_rmspe']=clf.history['loss']
    #stats[f'fold{fold+1}_val_rmspe']=clf.history['val_0_rmspe']

print(f'OOF score across folds: {rmspe(y, oof_predictions.flatten())}')

In [None]:
# Final blend: 60% TabNet + 40% LightGBM, written out as the submission file.
test['target'] = 0.6 * test_predictions + 0.4 * predictions_lgb_0
test.loc[:, ['row_id', 'target']].to_csv('submission.csv', index=False)