# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import gc
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Key functions and Features

In [None]:
# Load the parquet data of a single stock into a DataFrame
def read_data(path):
    """Return the parquet data stored at ``path`` as a pandas DataFrame."""
    return pd.read_parquet(path)

def RMSPE(y_true, y_pred):
    """Root mean squared percentage error between ``y_true`` and ``y_pred``.

    The error of each observation is expressed relative to ``y_true``, then
    squared, averaged along axis 0 and square-rooted.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error), axis=0)
    return np.sqrt(mean_squared)

def WAP1(df):
    """Weighted average price computed from the level-1 quotes.

    Each price is weighted by the size quoted on the opposite side of the
    book (bid price by ask size, ask price by bid size).
    """
    bid_size = df['bid_size1']
    ask_size = df['ask_size1']
    cross_weighted = df['bid_price1'] * ask_size + df['ask_price1'] * bid_size
    return cross_weighted / (bid_size + ask_size)

def WAP2(df):
    """Weighted average price computed from the level-2 quotes.

    Each price is weighted by the size quoted on the opposite side of the
    book (bid price by ask size, ask price by bid size).
    """
    bid_size = df['bid_size2']
    ask_size = df['ask_size2']
    cross_weighted = df['bid_price2'] * ask_size + df['ask_price2'] * bid_size
    return cross_weighted / (bid_size + ask_size)

def log_return(WAP):
    """Return the row-to-row difference of the natural log of ``WAP``.

    The first element has no predecessor and is therefore NaN.
    """
    log_price = np.log(WAP)
    return log_price.diff()

def realized_volatility(log_r):
    """Return the square root of the sum of squared log returns."""
    sum_of_squares = (log_r ** 2).sum()
    return np.sqrt(sum_of_squares)

#book features
def consol_book_df(path):
    """Aggregate one stock's order-book snapshots into per-time_id features.

    Parameters
    ----------
    path : str
        Location of the stock's order-book parquet data. The stock id is
        parsed from the text that follows the first ``=`` in the path
        (e.g. a path ending in ``stock_id=<n>``).

    Returns
    -------
    pandas.DataFrame
        One row per (stock_id, time_id) with the columns ``real_vol_1``,
        ``real_vol_2``, ``price_spread_mean``, ``bid_spread``,
        ``ask_spread``, ``total_vol_mean`` and ``vol_imbal_mean``.
    """
    #read stock pq file
    df = read_data(path)

    #add stock-id column (the integer that follows '=' in the path)
    df['stock_id'] = int(path.split("=")[1])

    #Spread: level-1 spread relative to the mid price, and the absolute gap
    #between the first and second price level on each side
    mid_price = (df['ask_price1'] + df['bid_price1']) / 2
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / mid_price
    df['bid_spread'] = (df['bid_price1'] - df['bid_price2']).abs()
    df['ask_spread'] = (df['ask_price1'] - df['ask_price2']).abs()

    #Volume features (computed once; sizes of both levels are summed per side)
    ask_volume = df['ask_size1'] + df['ask_size2']
    bid_volume = df['bid_size1'] + df['bid_size2']
    df['total_volume'] = ask_volume + bid_volume
    df['volume_imbalance'] = (ask_volume - bid_volume).abs()

    #WAP
    df['WAP1'] = WAP1(df)
    df['WAP2'] = WAP2(df)

    #log return within each time_id.
    #transform() returns a result aligned to df's own index; apply() adds the
    #group key as an extra index level on pandas >= 2.0, which breaks the
    #column assignment. The first row of each time_id has no previous price,
    #so its NaN return is replaced by 0.
    wap_by_time = df.groupby('time_id')
    df['book_log_ret1'] = wap_by_time['WAP1'].transform(log_return).fillna(0)
    df['book_log_ret2'] = wap_by_time['WAP2'].transform(log_return).fillna(0)

    #Book features
    final_book = df.groupby(['stock_id', 'time_id']).agg(
        real_vol_1=('book_log_ret1', realized_volatility),
        real_vol_2=('book_log_ret2', realized_volatility),
        price_spread_mean=('price_spread', 'mean'),
        bid_spread=('bid_spread', 'mean'),
        ask_spread=('ask_spread', 'mean'),
        total_vol_mean=('total_volume', 'mean'),
        vol_imbal_mean=('volume_imbalance', 'mean'),
    ).reset_index()
    return final_book

#Trade features
def consol_trade_df(path):
    """Aggregate one stock's trades into per-time_id features.

    Parameters
    ----------
    path : str
        Location of the stock's trade parquet data. The stock id is parsed
        from the text that follows the first ``=`` in the path
        (e.g. a path ending in ``stock_id=<n>``).

    Returns
    -------
    pandas.DataFrame
        One row per (time_id, stock_id) with the columns ``total_size``,
        ``position_size_mean``, ``avg_ord_size_mean``, ``mean_order``,
        ``max_sec_in_bucket``, ``min_sec_in_bucket`` and ``real_vol_trade``.
    """
    #read stock pq file
    df = read_data(path)

    #add stock-id column (the integer that follows '=' in the path)
    df['stock_id'] = int(path.split("=")[1])

    #trade log return within each time_id.
    #transform() returns a result aligned to df's own index; apply() adds the
    #group key as an extra index level on pandas >= 2.0, which breaks the
    #column assignment. The first trade of each time_id has no previous
    #price, so its NaN return is replaced by 0.
    df['trade_log_ret'] = df.groupby('time_id')['price'].transform(log_return).fillna(0)

    #position size = price * size
    df['position_size'] = df['price'] * df['size']

    #average order size (an order_count of 0 yields inf/NaN here)
    df['average_ord_size'] = df['size'] / df['order_count']

    #Trade features
    final_trade = df.groupby(['time_id', 'stock_id']).agg(
        total_size=('size', 'sum'),
        position_size_mean=('position_size', 'mean'),
        avg_ord_size_mean=('average_ord_size', 'mean'),
        mean_order=('order_count', 'mean'),
        max_sec_in_bucket=('seconds_in_bucket', 'max'),
        min_sec_in_bucket=('seconds_in_bucket', 'min'),
        real_vol_trade=('trade_log_ret', realized_volatility),
    ).reset_index()

    return final_trade



# Joint Dataset
Create an aggregate dataset that contains both the book and the trade features for modelling.

In [None]:
def _stock_id_from_path(path):
    """Return the integer stock id that follows the first '=' in ``path``."""
    return int(path.split("=")[1])


def create_dataSet(df, book_paths, trade_paths):
    """Build the joint book + trade feature table for a set of stocks.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame keyed by ``stock_id`` and ``time_id`` (the train/test csv) whose
        remaining columns are attached to the features.
    book_paths : iterable of str
        Order-book parquet paths, one per stock.
    trade_paths : iterable of str
        Trade parquet paths, one per stock.

    Returns
    -------
    pandas.DataFrame
        Book features left-joined with the trade features and then with
        ``df``, stacked over all book paths. Empty if ``book_paths`` is empty.

    Notes
    -----
    Book and trade files are paired by the stock id parsed from their paths,
    not by their position in the two lists: directory listings come back in
    no guaranteed order, so positional pairing can join the book of one stock
    to the trades of another. A stock without a trade file keeps its book
    features; its trade columns end up as NaN after concatenation.
    """
    trade_path_by_stock = {_stock_id_from_path(p): p for p in trade_paths}

    # Collect the per-stock frames and concatenate once at the end;
    # concatenating inside the loop re-copies the accumulated frame each time.
    frames = []
    for book_path in tqdm(book_paths):
        features = consol_book_df(book_path)

        trade_path = trade_path_by_stock.get(_stock_id_from_path(book_path))
        if trade_path is not None:
            trade = consol_trade_df(trade_path)
            features = features.merge(trade, on=['stock_id', 'time_id'], how='left')

        frames.append(features.merge(df, on=['stock_id', 'time_id'], how='left'))
        gc.collect()

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [None]:
# glob returns paths in arbitrary order; sort both listings so that the book
# and trade entries of the same stock sit at the same position when the two
# lists are walked in parallel.
order_book_training = sorted(glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*'))
trade_training = sorted(glob.glob('/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/*'))
train_df = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')

# Joint book + trade features with the columns of train.csv attached
train_set = create_dataSet(train_df, order_book_training, trade_training)

## Linear Regression

In [None]:
# Treat +/-inf as missing and drop every row that has a missing value
train_set_final = train_set.replace([np.inf, -np.inf], np.nan).dropna()

# Features: every column except the target and the time_id key
X = train_set_final.drop(columns=['target', 'time_id'])

# Response
y = train_set_final['target']

model = LinearRegression().fit(X, y)

# Features Importance

In [None]:
# Absolute correlation of every column with the target, weakest first
train_set.corr().loc['target'].abs().sort_values()

In [None]:
# Full pairwise correlation matrix of the columns of the joint training set
train_set.corr()

In [None]:
# Scatter the target against a few features, one figure per feature
for feature in ('real_vol_1', 'price_spread_mean', 'total_vol_mean'):
    plt.scatter(train_set['target'], train_set[feature])
    plt.show()

In [None]:
# One bar per fitted coefficient, positioned by coefficient index
importance = model.coef_
plt.bar(range(len(importance)), importance)

Feature 4 seems to be causing an issue

In [None]:
# Index of the smallest (most negative) fitted coefficient
print(np.argmin(model.coef_))
# Display the feature matrix so that index can be matched to a column
X

## Prepare test set

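The test files live under a directory path of a different length, but the stock id is parsed from the `stock_id=<n>` part of each path, so the same `create_dataSet` pipeline is reused for the test set.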

In [None]:
# glob returns paths in arbitrary order; sort both listings so that the book
# and trade entries of the same stock sit at the same position when the two
# lists are walked in parallel.
order_book_test = sorted(glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*'))
trade_test = sorted(glob.glob('/kaggle/input/optiver-realized-volatility-prediction/trade_test.parquet/*'))
test_df = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')

# Joint book + trade features with the columns of test.csv attached
test_set = create_dataSet(test_df, order_book_test, trade_test)

## Prediction

In [None]:
# No row is dropped here: +/-inf and missing values are all replaced by 0
test_set = test_set.replace([np.inf, -np.inf], np.nan).fillna(0.0)

# Features: every column except the row_id label and the time_id key
X = test_set.drop(columns=['row_id', 'time_id'])



## Submission File

In [None]:
# Predict with the fitted model and write one (row_id, target) line per row
predictions = model.predict(X)
submission = pd.DataFrame({"row_id": test_set['row_id'], "target": predictions})
submission.to_csv('submission.csv', index=False)


In [None]:
# Display the submission frame that was written to submission.csv
submission