In [None]:
#optiver volatility prediction
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import glob
import warnings
import seaborn as sns
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", 50)

In [None]:
# Load the train/test CSVs and, for exploration, only the stock_id=0
# partition of the training order book and trade book parquet datasets.
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
order_book = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0')
trade_book = pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0')

In [None]:
# Preview the first ten rows of the train table.
train.head(10)

In [None]:
# Preview the first three rows of the order book partition for stock_id=0.
order_book.head(3)

In [None]:
# Preview the first three rows of the trade book partition for stock_id=0.
trade_book.head(3)

In [None]:
#log returns: difference between consecutive log prices
def logDiff(stock_prices):
    """Return the first difference of the natural log of ``stock_prices``.

    ``stock_prices`` must be something whose ``np.log`` result has a
    ``.diff()`` method (e.g. a pandas Series). The first element of the
    result is NaN because it has no predecessor.
    """
    log_prices = np.log(stock_prices)
    return log_prices.diff()

In [None]:
#realized volatility of a series of log differences
def realized_vol(log_diffs):
    """Return the square root of the sum of squared ``log_diffs``.

    The function name is significant: when passed to ``groupby().agg`` it
    becomes part of the resulting column name (``..._realized_vol``).
    """
    sum_of_squares = np.sum(log_diffs ** 2)
    return np.sqrt(sum_of_squares)

In [None]:
#process the order book file
def preprocess_order(orderPath):
    """Load one order-book parquet partition and add per-row features.

    Parameters
    ----------
    orderPath : str
        Path of a ``stock_id=<id>`` partition. The text after the first
        ``=`` is stored (as a string) in the ``stock_id`` column.

    Returns
    -------
    pandas.DataFrame
        The rows read from ``orderPath`` plus the columns ``stock_id``,
        ``wap``, ``wap2``, ``logDifferences``, ``logDifferences2``,
        ``volume_imbalance1``, ``volume_imbalance2``, ``spread``,
        ``bid_spread`` and ``ask_spread``.
    """
    stock = pd.read_parquet(orderPath)
    stock['stock_id'] = orderPath.split('=')[1]

    # Weighted average price per book level: each side's price is weighted
    # by the size quoted on the opposite side.
    stock['wap'] = (
        stock['bid_price1'] * stock['ask_size1'] + stock['ask_price1'] * stock['bid_size1']
    ) / (stock['bid_size1'] + stock['ask_size1'])
    stock['wap2'] = (
        stock['bid_price2'] * stock['ask_size2'] + stock['ask_price2'] * stock['bid_size2']
    ) / (stock['bid_size2'] + stock['ask_size2'])

    # Log differences restart inside every time_id. groupby().diff() keeps
    # the frame's own index, whereas groupby().apply(...) prepends time_id
    # to the index on pandas 2.x and the result can then no longer be
    # assigned back as a column.
    stock['logDifferences'] = np.log(stock['wap']).groupby(stock['time_id']).diff()
    stock['logDifferences2'] = np.log(stock['wap2']).groupby(stock['time_id']).diff()

    stock['volume_imbalance1'] = stock['bid_size1'] / stock['ask_size1']
    stock['volume_imbalance2'] = stock['bid_size2'] / stock['ask_size2']
    stock['spread'] = stock['ask_price1'] - stock['bid_price1']
    stock['bid_spread'] = stock['bid_price1'] - stock['bid_price2']
    stock['ask_spread'] = stock['ask_price2'] - stock['ask_price1']

    return stock

In [None]:
#collect the partition paths of all order book and trade files.
#glob.glob returns paths in arbitrary order, so both lists are sorted: the
#first entry is then deterministic and the i-th book path lines up with the
#i-th trade path when the two directories hold the same stock ids.
orderPath = sorted(glob.glob('../input/optiver-realized-volatility-prediction/book_train.parquet/*'))
tradePath = sorted(glob.glob('../input/optiver-realized-volatility-prediction/trade_train.parquet/*'))
stock = preprocess_order(orderPath[0])
#let's see first few rows of the first partition after preprocessing
stock.head()

In [None]:
#preprocess the orderbook with aggregate stats
def preprocess_order_agg(stk):
    """Aggregate a preprocessed order book frame to one row per ``time_id``.

    Parameters
    ----------
    stk : pandas.DataFrame
        Frame with a ``time_id`` column and the feature columns named in
        ``agg_stats`` below.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id``. Columns are flattened to
        ``<feature>_<statistic>``; the grouping key becomes ``time_id_``
        because its second column level is empty.
    """
    # String aliases are used instead of np.mean/np.std/np.min/np.max:
    # pandas already maps those callables to its own group reductions, has
    # deprecated relying on that, and the resulting column names
    # (e.g. 'amin' vs 'min') depend on the installed numpy version.
    agg_stats = {
        'logDifferences': [realized_vol],
        'logDifferences2': [realized_vol],
        'wap': ['mean', 'std'],
        'wap2': ['mean', 'std'],
        'volume_imbalance1': ['mean', 'std'],
        'spread': ['mean', 'std', 'min', 'max'],
        'bid_spread': ['mean', 'std'],
        'ask_spread': ['mean', 'std'],
    }

    df_agg = stk.groupby(['time_id']).agg(agg_stats).reset_index()
    # flatten the two-level (feature, statistic) column index
    df_agg.columns = ['_'.join(col) for col in df_agg.columns]

    return df_agg

In [None]:
# Aggregate the preprocessed order book frame `stock` to one row per time_id.
agg_stats = preprocess_order_agg(stock)

In [None]:
# Preview the per-time_id order book aggregates.
agg_stats.head()

In [None]:
def preprocess_trade(tradePath):
    """Load one trade parquet partition and aggregate it per ``time_id``.

    Parameters
    ----------
    tradePath : str
        Path of the trade partition to read.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with the columns ``time_id_``,
        ``price_mean``, ``price_std``, ``price_min``, ``price_max``,
        ``size_sum`` and ``order_count_sum``.
    """
    stk = pd.read_parquet(tradePath)

    # String aliases instead of np.mean/np.std/np.min/np.max/np.sum:
    # passing the numpy callables is deprecated in recent pandas and gives
    # numpy-version-dependent column names ('amin' vs 'min').
    # The stock_id column the original added here was discarded by the
    # aggregation, so it is no longer created.
    agg_stats = {
        'price': ['mean', 'std', 'min', 'max'],
        'size': ['sum'],
        'order_count': ['sum'],
    }

    df_agg = stk.groupby(['time_id']).agg(agg_stats).reset_index()
    # flatten the two-level (feature, statistic) column index
    df_agg.columns = ['_'.join(col) for col in df_agg.columns]

    return df_agg

In [None]:
# Aggregate the first trade partition to one row per time_id.
agg_stats2 = preprocess_trade(tradePath[0])

In [None]:
# Preview the per-time_id trade aggregates.
agg_stats2.head()

In [None]:
#time stats

def time_stats_agg(stk, time_in_seconds):
    """Aggregate order book features per ``time_id`` for the tail of each bucket.

    Parameters
    ----------
    stk : pandas.DataFrame
        Preprocessed order book frame with ``time_id``,
        ``seconds_in_bucket`` and the feature columns named in
        ``agg_stats`` below.
    time_in_seconds : int
        Only rows with ``seconds_in_bucket`` strictly greater than this
        value are aggregated.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` that still has rows after filtering. Every
        column, including the key, is suffixed with ``_<time_in_seconds>``
        (the key becomes ``time_id__<time_in_seconds>``).
    """
    # String aliases instead of np.mean/np.std/np.min/np.max: passing the
    # numpy callables is deprecated in recent pandas and gives
    # numpy-version-dependent column names ('amin' vs 'min').
    agg_stats = {
        'logDifferences': [realized_vol],
        'logDifferences2': [realized_vol],
        'wap': ['mean', 'std'],
        'wap2': ['mean', 'std'],
        'volume_imbalance1': ['mean', 'std'],
        'spread': ['mean', 'std', 'min', 'max'],
        'bid_spread': ['mean', 'std'],
        'ask_spread': ['mean', 'std'],
    }

    # NOTE(review): the comparison is strict, so time_in_seconds=0 leaves
    # out rows at seconds_in_bucket == 0 - confirm this is intended.
    late_rows = stk.query(f'seconds_in_bucket > {time_in_seconds}')
    time_df = late_rows.groupby(['time_id']).agg(agg_stats).reset_index()
    # flatten the two-level (feature, statistic) column index
    time_df.columns = ['_'.join(col) for col in time_df.columns]

    return time_df.add_suffix('_' + str(time_in_seconds))

In [None]:
#aggregate the order book over four tail windows of each bucket
#(rows after second 0, 150, 300 and 450)
time_stats_0, time_stats_150, time_stats_300, time_stats_450 = [
    time_stats_agg(stock, time_in_seconds = window_start)
    for window_start in (0, 150, 300, 450)
]

In [None]:
# Preview the aggregates for the window starting after second 0.
time_stats_0.head()

In [None]:
#left-join the later windows onto the 0-second window, one at a time
time_stats = time_stats_0
for window_df, window_key in (
    (time_stats_150, 'time_id__150'),
    (time_stats_300, 'time_id__300'),
    (time_stats_450, 'time_id__450'),
):
    time_stats = time_stats.merge(window_df, how = 'left', left_on = 'time_id__0', right_on = window_key)

In [None]:
# List the columns of the merged window aggregates.
time_stats.columns

In [None]:
#add row_id ("<stock_id>-<time_id>") and drop the per-window time_id key columns
stock_id = orderPath[0].split('=')[1]
time_stats['row_id'] = [f'{stock_id}-{x}' for x in time_stats['time_id__0']]
time_id_keys = ['time_id__0','time_id__150', 'time_id__300', 'time_id__450']
time_stats.drop(columns = time_id_keys, inplace = True)

In [None]:
# Columns after adding row_id and dropping the time_id key columns.
time_stats.columns

In [None]:
# Preview the merged feature frame.
time_stats.head()

In [None]:
# Sanity check: (rows, columns) of the merged feature frame.
time_stats.shape

In [None]:
#loop through all stocks and build one feature frame for the training data.
#glob.glob returns paths in arbitrary order, so both lists are sorted before
#being paired, and every pair is checked to belong to the same stock.
stock_frames = []

for i, (order, trade) in enumerate(zip(sorted(orderPath), sorted(tradePath)), start = 1):

    stock_id = order.split('=')[1]
    if trade.split('=')[1] != stock_id:
        raise ValueError(f'book/trade partitions do not match: {order} vs {trade}')

    stock = preprocess_order(order)
    trade_agg = preprocess_trade(trade)

    #start from the 0-second window and left-join the later windows onto it
    time_stats = time_stats_agg(stock, time_in_seconds = 0)
    for window_start in (150, 300, 450):
        time_stats = time_stats.merge(
            time_stats_agg(stock, time_in_seconds = window_start),
            how = 'left', left_on = 'time_id__0', right_on = f'time_id__{window_start}')

    #add the trade aggregates; buckets without trades get NaN trade columns
    df = time_stats.merge(trade_agg, how = 'left', left_on = 'time_id__0', right_on = 'time_id_')

    df['stock_id'] = int(stock_id)
    df['row_id'] = df['time_id__0'].apply(lambda x: f'{stock_id}-{x}')

    df = df.drop(['time_id__0','time_id__150', 'time_id__300', 'time_id__450'], axis = 1)

    stock_frames.append(df)

    #progress counter: print every tenth stock
    if i%10 == 0:
        print (i)

#concatenate once at the end instead of re-copying the growing frame on
#every iteration
df_final = pd.concat(stock_frames, axis = 0) if stock_frames else pd.DataFrame()

In [None]:
# (rows, columns) of the combined feature frame.
df_final.shape

In [None]:
# Preview the first five rows of the combined feature frame.
df_final.head(5)

In [None]:
# Distinct stock ids present in the combined feature frame.
df_final['stock_id'].unique()

In [None]:
# Number of columns that contain at least one null value.
df_final.isnull().any().sum()

In [None]:
#lgb model
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split

#fixed seed so the hold-out split, and every score computed from it, is the
#same on each run
SPLIT_SEED = 42

#features: every column except the target and the row identifier.
#NOTE(review): the target is the realized volatility computed from the same
#order book rows as the features, not a column of `train` - confirm this is
#intended.
X = df_final.drop(['logDifferences_realized_vol_0', 'row_id'], axis = 1)
y = df_final['logDifferences_realized_vol_0']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = SPLIT_SEED)
print('Shape of X_test is {}'.format(X_test.shape))
print('Shape of X_train is {}'.format(X_train.shape))
print('Shape of y_test is {}'.format(y_test.shape))
print('Shape of y_train is {}'.format(y_train.shape))

In [None]:
# Fit a LightGBM regressor with default hyperparameters on the training split.
model_lgb = lgb.LGBMRegressor()
model_lgb.fit(X_train, y_train)

In [None]:
# Predict the target for the hold-out split.
ypreds = model_lgb.predict(X_test)

In [None]:
#root mean squared percentage error
def RMSPE(vols, truth):
    """Return the root mean squared percentage error of ``vols`` against ``truth``.

    The error of each element is taken relative to ``truth``, so ``truth``
    must not contain zeros.
    """
    relative_error = (vols - truth) / truth
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(np.sum(mean_squared))

In [None]:
#check preds against values: RMSPE of the default-parameter model on the
#hold-out split
RMSPE(ypreds, y_test)

# NOTE(review): no cell marker precedes the grid search below, so as written
# it runs in the same cell as the score above (which is then not the cell's
# last expression). Presumably it was pasted from a separate markdown/raw
# cell - confirm whether it is meant to execute.
from sklearn.model_selection import GridSearchCV
# candidate values tried for each hyperparameter
params = {
    'num_leaves': [7, 14, 21, 28, 31, 50],
    'learning_rate': [0.1, 0.03, 0.003],
    'max_depth': [-1, 3, 5],
    'n_estimators': [50, 100, 200, 500],
}

# exhaustive search over `params` with 5-fold cross-validation; candidates
# are ranked by r2, not by the RMSPE metric used for the hold-out score
grid = GridSearchCV(model_lgb, params, scoring='r2', cv = 5)
grid.fit(X_train, y_train)

# rebind `params` from the search grid to the best combination found
params = grid.best_params_

# NOTE(review): bare dict literal - a no-op expression, presumably the pasted
# value of grid.best_params_ from an earlier run.
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 500, 'num_leaves': 28}

In [None]:
# Hyperparameters hard-coded from the grid search result, so the model can be
# rebuilt without re-running the search.
params = {
    'learning_rate': 0.1, 
    'max_depth': 5, 
    'n_estimators': 500, 
    'num_leaves': 28
}

In [None]:
# Refit on the training split with the tuned hyperparameters; this replaces
# the default-parameter model bound to `model_lgb`.
model_lgb = lgb.LGBMRegressor(**params)
model_lgb.fit(X_train, y_train)

In [None]:
# Predict the hold-out split again with the tuned model.
ypreds = model_lgb.predict(X_test)

In [None]:
# RMSPE of the tuned model on the hold-out split.
RMSPE(ypreds, y_test)

In [None]:
#create preds for test set
#loop through all stocks
#collect the partition paths of all test order book and trade files.
#glob.glob returns paths in arbitrary order, so both lists are sorted before
#being paired, and every pair is checked to belong to the same stock.
orderTest = sorted(glob.glob('../input/optiver-realized-volatility-prediction/book_test.parquet/*'))
tradeTest = sorted(glob.glob('../input/optiver-realized-volatility-prediction/trade_test.parquet/*'))

test_frames = []

for (order, trade) in zip(orderTest, tradeTest):
    #take the id from the current path; the original row_id reused the
    #stock_id variable left over from the training loop
    stock_id = order.split('=')[1]
    if trade.split('=')[1] != stock_id:
        raise ValueError(f'book/trade partitions do not match: {order} vs {trade}')

    stock = preprocess_order(order)
    trade_agg = preprocess_trade(trade)

    #start from the 0-second window and left-join the later windows onto it
    time_stats = time_stats_agg(stock, time_in_seconds = 0)
    for window_start in (150, 300, 450):
        time_stats = time_stats.merge(
            time_stats_agg(stock, time_in_seconds = window_start),
            how = 'left', left_on = 'time_id__0', right_on = f'time_id__{window_start}')

    #add the trade aggregates; buckets without trades get NaN trade columns
    df = time_stats.merge(trade_agg, how = 'left', left_on = 'time_id__0', right_on = 'time_id_')
    df['stock_id'] = int(stock_id)
    df['row_id'] = df['time_id__0'].apply(lambda x: f'{stock_id}-{x}')

    df = df.drop(['time_id__0','time_id__150', 'time_id__300', 'time_id__450'], axis = 1)

    test_frames.append(df)

#concatenate once at the end instead of re-copying the growing frame on
#every iteration
df_final = pd.concat(test_frames, axis = 0) if test_frames else pd.DataFrame()


In [None]:
#build the feature matrix for the test set: same columns as in training,
#i.e. everything except the target column and the row identifier
non_feature_cols = ['logDifferences_realized_vol_0', 'row_id']
X = df_final.drop(columns = non_feature_cols)
X.head()

In [None]:
#LGB model
#predict from the feature columns only, so re-running this cell does not feed
#the 'target'/'row_id' columns added below back into the model
feature_cols = [col for col in X.columns if col not in ('target', 'row_id')]
X['target'] = model_lgb.predict(X[feature_cols])

#'time_id_' is the key of the left-joined trade aggregates: it is NaN for
#buckets without trades, which also turns the whole column into floats
#('4.0'). The time id is therefore taken from the suffix of
#df_final['row_id'], which was built from the order book's time_id; X was
#derived from df_final, so the rows line up by position.
book_time_id = df_final['row_id'].str.rsplit('-', n = 1).str[-1]
X['row_id'] = [f'{sid}-{tid}' for sid, tid in zip(X['stock_id'], book_time_id)]

In [None]:
# Keep only the two columns written to the submission file.
submission = X[['row_id', 'target']]

In [None]:
# Preview the first three submission rows.
submission.head(3)

In [None]:
# Write the submission to submission.csv without the index column.
submission.to_csv('submission.csv', index = False)