# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import gc
from tqdm import tqdm

# Key functions and Features

In [None]:
#read each stock id pq file and store as dataframe  
def read_data(path):
    """Load the parquet file found at ``path`` and hand it back as a DataFrame."""
    return pd.read_parquet(path)

def RMSPE(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` against ``y_true``.

    Each error is expressed relative to its true value, squared, averaged
    along axis 0 and finally square-rooted.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error), axis=0)
    return np.sqrt(mean_squared)

def WAP1(df):
    """Weighted average price from the level-1 book columns of ``df``.

    Each price is weighted by the size on the opposite side of the book
    (bid price by ask size, ask price by bid size) and the sum is divided
    by the total level-1 size.
    """
    bid_price, ask_price = df['bid_price1'], df['ask_price1']
    bid_size, ask_size = df['bid_size1'], df['ask_size1']
    cross_weighted = bid_price * ask_size + ask_price * bid_size
    return cross_weighted / (bid_size + ask_size)

def WAP2(df):
    """Weighted average price from the level-2 book columns of ``df``.

    Each price is weighted by the size on the opposite side of the book
    (bid price by ask size, ask price by bid size) and the sum is divided
    by the total level-2 size.
    """
    bid_price, ask_price = df['bid_price2'], df['ask_price2']
    bid_size, ask_size = df['bid_size2'], df['ask_size2']
    cross_weighted = bid_price * ask_size + ask_price * bid_size
    return cross_weighted / (bid_size + ask_size)

def log_return(WAP):
    """Difference of consecutive natural logs of ``WAP``.

    The first element has no predecessor, so it comes back as NaN.
    """
    log_prices = np.log(WAP)
    return log_prices.diff()

def realized_volatility(log_r):
    """Square root of the sum of squared entries of ``log_r``."""
    squared = log_r ** 2
    total = squared.sum()
    return np.sqrt(total)

def consol_book_df(path):
    """Build realized-volatility features from one stock's order-book file.

    Parameters
    ----------
    path : str
        Path of a single book parquet partition. The stock id is parsed
        from the text that follows ``=`` in the path.

    Returns
    -------
    pandas.DataFrame
        One row per (stock_id, time_id) with the columns ``stock_id``,
        ``time_id``, ``real_vol_1`` and ``real_vol_2`` (realized volatility
        of the WAP1 and WAP2 log returns respectively).
    """
    # read stock pq file
    df = read_data(path)

    # add stock-id column, extracted from the partition name in the path
    df['stock_id'] = int(path.split("=")[1])

    # WAP
    df['WAP1'] = WAP1(df)
    df['WAP2'] = WAP2(df)

    # Log returns are computed inside each time_id so that no return spans
    # two time buckets; the first row of each bucket has no predecessor and
    # is set to 0.
    # transform() is used instead of apply(): it always returns a result
    # aligned to df's own index, whereas apply() prepends the group key to
    # the index on pandas >= 2.0 and the column assignment then fails.
    by_time = df.groupby('time_id')
    df['book_log_ret1'] = by_time['WAP1'].transform(log_return).fillna(0)
    df['book_log_ret2'] = by_time['WAP2'].transform(log_return).fillna(0)

    # Book features
    final_book = df.groupby(['stock_id', 'time_id']).agg(
        real_vol_1=('book_log_ret1', realized_volatility),
        real_vol_2=('book_log_ret2', realized_volatility),
    ).reset_index()
    return final_book

def consol_trade_df(path):
    """Build realized-volatility features from one stock's trade file.

    Parameters
    ----------
    path : str
        Path of a single trade parquet partition. The stock id is parsed
        from the text that follows ``=`` in the path.

    Returns
    -------
    pandas.DataFrame
        One row per (time_id, stock_id) with the columns ``time_id``,
        ``stock_id`` and ``real_vol_trade`` (realized volatility of the
        trade-price log returns).
    """
    # read stock pq file
    df = read_data(path)

    # add stock-id column, extracted from the partition name in the path
    df['stock_id'] = int(path.split("=")[1])

    # Trade log returns are computed inside each time_id; the first row of
    # each bucket has no predecessor and is set to 0.
    # transform() is used instead of apply(): it always returns a result
    # aligned to df's own index, whereas apply() prepends the group key to
    # the index on pandas >= 2.0 and the column assignment then fails.
    df['trade_log_ret'] = (
        df.groupby('time_id')['price'].transform(log_return).fillna(0)
    )

    # Trade features
    final_trade = df.groupby(['time_id', 'stock_id']).agg(
        real_vol_trade=('trade_log_ret', realized_volatility),
    ).reset_index()

    return final_trade



# Joint Dataset
Create an aggregate dataset that contains both the book and trade features for modelling.

In [None]:
def _stock_id_from_path(path):
    """Return the integer stock id that follows ``=`` in a partition path."""
    return int(path.split("=")[1])


def create_dataSet(df, book_paths, trade_paths):
    """Assemble the joint book + trade feature table for a set of stocks.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame keyed by ``stock_id`` and ``time_id`` whose remaining columns
        are left-joined onto the features.
    book_paths : iterable of str
        Order-book parquet partition paths, one per stock.
    trade_paths : iterable of str
        Trade parquet partition paths, one per stock.

    Returns
    -------
    pandas.DataFrame
        The per-stock merged frames stacked in ascending stock-id order
        (each stock keeps its own row index), or an empty DataFrame when
        there is nothing to process.
    """
    # The path lists are built with glob, which gives no ordering guarantee.
    # Sort both by stock id so every book file is paired with the trade file
    # of the same stock; a mismatched pair would left-merge into all-NaN
    # trade features.
    book_paths = sorted(book_paths, key=_stock_id_from_path)
    trade_paths = sorted(trade_paths, key=_stock_id_from_path)
    n_pairs = min(len(book_paths), len(trade_paths))

    # Collect the per-stock frames and concatenate once at the end instead
    # of re-copying the growing result on every iteration.
    frames = []
    for book_path, trade_path in tqdm(zip(book_paths, trade_paths), total=n_pairs):
        book = consol_book_df(book_path)
        trade = consol_trade_df(trade_path)
        merged_df = (pd.merge(book, trade, on=['stock_id', 'time_id'], how='left')
                     .merge(df, on=['stock_id', 'time_id'], how='left'))
        frames.append(merged_df)
        # release the per-stock intermediates before loading the next file
        gc.collect()

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [None]:
# Load the training table that is joined onto the features, then list the
# per-stock book and trade parquet partitions.
train_df = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
trade_training = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/*')
order_book_training = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')

# Joint book + trade feature table for training.
train_set = create_dataSet(
    df=train_df,
    book_paths=order_book_training,
    trade_paths=trade_training,
)

## Prepare test set

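The same consol functions are reused for the test files: the stock id is parsed from the text after `=` in each path, so the different length of the directory string for the test files does not matter.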

In [None]:
# Load the test table that is joined onto the features, then list the
# per-stock book and trade parquet partitions.
test_df = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
trade_test = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/trade_test.parquet/*')
order_book_test = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*')

# Joint book + trade feature table for the test data.
test_set = create_dataSet(
    df=test_df,
    book_paths=order_book_test,
    trade_paths=trade_test,
)

In [None]:
import xgboost
from sklearn.model_selection import train_test_split

# Separate the regression target from the remaining columns.
y = train_set['target'].values
X = train_set.drop(columns=['target'])

# Hold out part of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# Gradient-boosted regressor with 150 trees. The keyword must be spelled
# ``n_estimators``; a misspelled name is not applied and the model silently
# trains with the library default instead.
reg = xgboost.XGBRegressor(n_estimators=150)
reg.fit(X_train, y_train)

In [None]:
# Score the fitted model on the held-out split; in a notebook the returned
# value is displayed as the cell output.
# NOTE(review): presumably this is R^2 via the scikit-learn regressor API —
# confirm. The RMSPE helper defined in this file is not used here.
reg.score(X_test, y_test)

In [None]:
# row_id only identifies submission rows, so it is removed before the frame
# is passed to the model.
test = test_set.drop('row_id', axis=1)
final_pred = reg.predict(test)

#final_pred = test_set.real_vol_trade + 0.001211

In [None]:
# Pair every row_id with its prediction and write the submission file
# without the DataFrame index.
submission = pd.DataFrame(
    data={
        "row_id": test_set['row_id'],
        "target": final_pred,
    }
)
submission.to_csv('submission.csv', index=False)