In [None]:
#optiver volatility prediction
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import glob
import warnings
import seaborn as sns
warnings.filterwarnings('ignore')

In [None]:
# Load the competition's train / test index tables in one tuple assignment.
train, test = (
    pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv'),
    pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv'),
)

In [None]:
# Text preview of the first rows of both index tables (one per line block).
print(train.head(), test.head(), sep='\n')

In [None]:
#preprocess a file
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# orderPath[0] and any positional use of this list reproducible between runs.
orderPath = sorted(glob.glob('../input/optiver-realized-volatility-prediction/book_train.parquet/*'))
def preprocess_order(path, predictionColumn, second=300):
    """Aggregate one stock's order-book file into one feature row per time_id.

    Parameters
    ----------
    path : str
        Location handed to ``pd.read_parquet``. The stock id is taken as the
        text after the first ``'='`` in this string.
    predictionColumn : str
        Name given to the realized-volatility column computed over the whole
        bucket; the late-window counterpart is named
        ``f'{predictionColumn}_{second}'``.
    second : int, default 300
        Rows with ``seconds_in_bucket >= second`` form the "late window" whose
        aggregates are appended with a ``_{second}`` suffix.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``predictionColumn``, the nine ``*_mean`` features,
        ``stock_id`` (str), ``row_id`` (``'<stock_id>-<time_id>'``), then the
        ten late-window columns. A time_id without late-window rows gets NaN
        in the late-window columns.

    Notes
    -----
    Relies on the notebook-level helpers ``logDiff`` and ``realized_vol``.
    The first row of every time_id has no log difference and is excluded from
    all aggregates.
    """
    #read file
    book = pd.read_parquet(path)
    #create stock_id
    stock_id = path.split('=')[1]

    #size-weighted average price for the top two levels of the book
    book['wt_avg1'] = ((book['bid_price1'] * book['ask_size1'] + book['ask_price1'] * book['bid_size1'])
                       / (book['bid_size1'] + book['ask_size1']))
    book['wt_avg2'] = ((book['bid_price2'] * book['ask_size2'] + book['ask_price2'] * book['bid_size2'])
                       / (book['bid_size2'] + book['ask_size2']))

    #create spread
    book['spread1'] = book['ask_price1'] - book['bid_price1']
    book['spread2'] = book['ask_price2'] - book['bid_price2']
    book['bid_spread'] = (book['bid_price1'] - book['bid_price2']).abs()
    book['ask_spread'] = (book['ask_price1'] - book['ask_price2']).abs()

    #total volume and volume imbalances
    book['total_volume'] = book['bid_size1'] + book['bid_size2'] + book['ask_size1'] + book['ask_size2']
    book['volume_imbalance'] = book['bid_size1'] / book['ask_size1']
    book['volume_imbalance2'] = book['bid_size2'] / book['ask_size2']

    #create logdifferences; transform keeps the original row index so the
    #result aligns with `book` regardless of pandas' group-key handling
    book['logDifferences'] = book.groupby('time_id')['wt_avg1'].transform(logDiff)
    #copy so later work never operates on a view of the unfiltered frame
    book = book[book['logDifferences'].notnull()].copy()

    #one aggregation per source column (insertion order fixes the output column order)
    feature_aggs = {
        'logDifferences': realized_vol,
        'wt_avg1': 'mean',
        'wt_avg2': 'mean',
        'spread1': 'mean',
        'spread2': 'mean',
        'bid_spread': 'mean',
        'ask_spread': 'mean',
        'volume_imbalance': 'mean',
        'volume_imbalance2': 'mean',
        'total_volume': 'mean',
    }
    column_names = {source: f'{source}_mean' for source in feature_aggs}
    column_names['logDifferences'] = predictionColumn

    #agg stats over the whole bucket
    full_window = book.groupby('time_id').agg(feature_aggs).rename(columns=column_names)

    #stats from the late part of the bucket (seconds_in_bucket >= second)
    late_window = (
        book[book['seconds_in_bucket'] >= second]
        .groupby('time_id')
        .agg(feature_aggs)
        .rename(columns=column_names)
        .add_suffix(f'_{second}')
    )

    #add stock_id / row_id
    df_agg = full_window.reset_index()
    df_agg['stock_id'] = str(stock_id)
    df_agg['row_id'] = df_agg['time_id'].apply(lambda x: f'{stock_id}-{x}')

    #join on time_id (not on row position) so a bucket lacking late-window
    #rows cannot shift other buckets' features onto the wrong row
    df_agg = df_agg.merge(late_window.reset_index(), on='time_id', how='left')

    return df_agg.drop(columns=['time_id'])

In [None]:
#create log diffs
def logDiff(stock_prices):
    """Return the element-to-element difference of the natural log of ``stock_prices``.

    The input must expose ``.diff()`` after ``np.log`` (e.g. a pandas Series);
    the first element of the result is NaN.
    """
    log_prices = np.log(stock_prices)
    return log_prices.diff()

In [None]:
#create realized vols for each time / stock price
def realized_vol(log_diffs):
    """Return the square root of the sum of the squared entries of ``log_diffs``."""
    squared = log_diffs ** 2
    sum_of_squares = np.sum(squared)
    return np.sqrt(sum_of_squares)

In [None]:
# Smoke-run the order-book feature builder on the first book file.
df_order = preprocess_order(path=orderPath[0], predictionColumn='target')

In [None]:
# First five feature rows (head() defaults to 5).
df_order.head()

In [None]:
# Number of columns that contain at least one missing value.
df_order.isna().any().sum()

In [None]:
# Peek at the generated row identifiers.
df_order['row_id'].head()

In [None]:
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# tradePath[0] and any positional use of this list reproducible between runs.
tradePath = sorted(glob.glob('../input/optiver-realized-volatility-prediction/trade_train.parquet/*'))

def preprocess_trade(path):
    """Aggregate one stock's trade file into one feature row per time_id.

    Parameters
    ----------
    path : str
        Location handed to ``pd.read_parquet``. The stock id is taken as the
        text after the first ``'='`` in this string.

    Returns
    -------
    pandas.DataFrame
        Columns ``price_std`` (sample standard deviation of ``price``),
        ``size_sum``, ``order_count_sum`` and ``row_id``
        (``'<stock_id>-<time_id>'``), one row per time_id present in the file.
    """
    #read file
    trades = pd.read_parquet(path)
    #create stock_id
    stock_id = path.split('=')[1]

    #create agg stats; string aggregation names pin pandas' own groupby
    #std/sum instead of relying on numpy callables being aliased to them
    df_agg = (
        trades.groupby('time_id')
        .agg(
            price_std=('price', 'std'),
            size_sum=('size', 'sum'),
            order_count_sum=('order_count', 'sum'),
        )
        .reset_index()
    )

    #create row_id
    df_agg['row_id'] = df_agg['time_id'].apply(lambda x: f'{stock_id}-{x}')

    return df_agg.drop(columns=['time_id'])

In [None]:
# Smoke-run the trade feature builder on the first trade file.
df_trade = preprocess_trade(path=tradePath[0])

In [None]:
# First five trade-feature rows (head() defaults to 5).
df_trade.head()

In [None]:
# Number of columns that contain at least one missing value.
df_trade.isna().any().sum()

In [None]:
import glob
import lightgbm as lgb
#loop thru all stocks make test submission
# glob returns matches in arbitrary, filesystem-dependent order; sort both
# lists so the book and trade paths are listed in the same, reproducible order.
orderTest = sorted(glob.glob('../input/optiver-realized-volatility-prediction/book_test.parquet/*'))
tradeTest = sorted(glob.glob('../input/optiver-realized-volatility-prediction/trade_test.parquet/*'))

# Shared estimator; final_vol_calc fits this instance in place and returns it.
model_lgb = lgb.LGBMRegressor()

#create model function

def final_vol_calc(orderPath, tradePath, predictionColumn):
    """Build the training matrix from every stock and fit ``model_lgb`` on it.

    Parameters
    ----------
    orderPath, tradePath : list of str
        Per-stock book / trade paths. Files are paired by the stock id found
        after the first ``'='`` in each path, not by list position.
    predictionColumn : str
        Accepted for backward compatibility. The label column is always
        ``'target'`` because the prediction cells of this notebook drop a
        column of that name.

    Returns
    -------
    The notebook-level ``model_lgb`` instance, fitted in place.

    Raises
    ------
    ValueError
        If a book file has no trade file for the same stock, or if
        ``orderPath`` is empty.

    Notes
    -----
    NOTE(review): the label fitted here is the ``'target'`` column produced by
    ``preprocess_order`` (realized volatility of the book window itself); no
    label is read from train.csv in this function. Confirm that is intended.
    """
    #pair files by stock id: two independent glob listings are not guaranteed
    #to come back in the same order
    trade_by_stock = {trade.split('=')[1]: trade for trade in tradePath}

    frames = []
    #loop though the training stocks
    for order in orderPath:
        stock_id = order.split('=')[1]
        if stock_id not in trade_by_stock:
            raise ValueError(f'no trade file found for stock_id={stock_id}')
        #process order book
        df_order = preprocess_order(order, 'target')
        #process trade
        df_trade = preprocess_trade(trade_by_stock[stock_id])
        #keep every book row (each one has a label); buckets without trades
        #get NaN trade features instead of being dropped or left unlabeled
        df = pd.merge(df_order, df_trade, on=['row_id'], how='left')
        df['stock_id'] = int(stock_id)
        frames.append(df)

    if not frames:
        raise ValueError('orderPath is empty: nothing to train on')

    #single concat instead of growing a frame inside the loop
    df_final = pd.concat(frames, axis=0, ignore_index=True)

    #fit model
    X = df_final.drop(['target', 'row_id'], axis=1)
    y = df_final['target']
    model_lgb.fit(X, y)

    return model_lgb

In [None]:
# Fit the regressor on features built from every training stock.
model = final_vol_calc(orderPath=orderPath, tradePath=tradePath, predictionColumn='target')

In [None]:
# Show the estimator returned by final_vol_calc as the cell output.
model

In [None]:
#loop through test set
def test_loop(orderTest, tradeTest, model):
    """Build the feature matrix for every test stock.

    Parameters
    ----------
    orderTest, tradeTest : list of str
        Per-stock book / trade paths. Files are paired by the stock id found
        after the first ``'='`` in each path, not by list position.
    model : object
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per (stock, time_id) present in the book files, with a unique
        0..n-1 index. Empty if ``orderTest`` is empty.

    Raises
    ------
    ValueError
        If a book file has no trade file for the same stock.
    """
    #pair files by stock id: two independent glob listings are not guaranteed
    #to come back in the same order
    trade_by_stock = {trade.split('=')[1]: trade for trade in tradeTest}

    frames = []
    #loop through the test stocks
    for order in orderTest:
        stock_id = order.split('=')[1]
        if stock_id not in trade_by_stock:
            raise ValueError(f'no trade file found for stock_id={stock_id}')
        #process order book
        test_order = preprocess_order(order, 'target')
        #process trade
        test_trade = preprocess_trade(trade_by_stock[stock_id])
        #keep every book row so buckets without trades still get a prediction
        #(their trade features are NaN)
        df_test = pd.merge(test_order, test_trade, on=['row_id'], how='left')
        df_test['stock_id'] = int(stock_id)
        frames.append(df_test)

    if not frames:
        return pd.DataFrame()

    #single concat with a fresh index: no duplicated labels across stocks
    return pd.concat(frames, axis=0, ignore_index=True)

In [None]:
# Build the feature matrix for the test stocks.
test_final = test_loop(orderTest=orderTest, tradeTest=tradeTest, model=model)

In [None]:
# First five rows of the test feature matrix.
test_final.head(5)

In [None]:
#create preds
def create_preds(final_df, model):
    """Predict with ``model`` and return a ``row_id`` / ``target`` frame.

    Parameters
    ----------
    final_df : pandas.DataFrame
        Must contain ``'row_id'`` and ``'target'``; every other column is
        passed to the model as a feature.
    model : object
        Anything exposing ``predict(X)`` that returns one value per row.

    Returns
    -------
    pandas.DataFrame
        Columns ``row_id`` and ``target`` (the predictions), in the row order
        of ``final_df``, with a fresh 0..n-1 index.
    """
    X = final_df.drop(['target', 'row_id'], axis = 1)
    preds = model.predict(X)
    #pair ids and predictions by position: final_df's index may contain
    #duplicate labels, so index-based alignment is not safe here
    return pd.DataFrame({
        'row_id': final_df['row_id'].to_numpy(),
        'target': np.asarray(preds),
    })

In [None]:
# Score the test feature matrix with the fitted model.
submission = create_preds(final_df=test_final, model=model)

In [None]:
# Preview only the first rows rather than rendering the whole submission frame.
submission.head()

In [None]:
#create final prediction submission
# Write row_id / target pairs without the DataFrame index column.
submission.to_csv(path_or_buf='./submission.csv', index=False)