In [None]:
import warnings 
warnings.filterwarnings('ignore')

import os, gc
import glob
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from joblib import Parallel, delayed
import xgboost as xgb
from sklearn.model_selection import train_test_split
print(xgb.__version__)

In [None]:
# Read the competition's test index file; later cells keep only its `row_id` column.
test_csv_path = '../input/optiver-realized-volatility-prediction/test.csv'
test = pd.read_csv(test_csv_path)
test.head()

In [None]:
def log_return(list_stock_prices):
    """Return the element-to-element change in the natural log of the prices.

    The first element of the result has no predecessor and is therefore NaN.
    The input must yield an object with a ``.diff()`` method after ``np.log``
    (for example a pandas Series).
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)

def calculate_wap(df):
    """Return a size-weighted average price built from the `*1` and `*2` book columns.

    For each of the two column sets, the bid price is weighted by the ask size
    and the ask price by the bid size; the two weighted sums are added and
    divided by the total of all four size columns.
    """
    weighted_first = df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']
    weighted_second = df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']
    total_size = df['bid_size1'] + df['ask_size1'] + df['bid_size2'] + df['ask_size2']
    numerator = weighted_first + weighted_second
    return numerator / total_size

def realized_volatility_per_time_id(file_path, prediction_column_name):
    """Compute one realized-volatility value per `time_id` for a single book file.

    Parameters
    ----------
    file_path : str
        Path to a parquet file/partition of order-book rows. The text after the
        first '=' in the path is used as the stock id when building `row_id`.
    prediction_column_name : str
        Name given to the realized-volatility column in the result.

    Returns
    -------
    pandas.DataFrame
        Two columns: `row_id` (formatted as '<stock_id>-<time_id>') and
        `prediction_column_name`.
    """
    df_book_data = pd.read_parquet(file_path)
    df_book_data['wap'] = calculate_wap(df_book_data)

    # `transform` returns a result indexed like the input frame. The previous
    # `groupby(...).apply(log_return)` prepends the group key to the index on
    # pandas >= 2.0, which makes the column assignment fail.
    df_book_data['log_return'] = df_book_data.groupby('time_id')['wap'].transform(log_return)

    # The first row of every time_id has no previous price, hence a NaN return.
    df_book_data = df_book_data[df_book_data['log_return'].notnull()]

    df_realized_vol_per_stock = (
        df_book_data.groupby('time_id')['log_return']
        .agg(realized_volatility)
        .reset_index(name=prediction_column_name)
    )

    stock_id = file_path.split('=')[1]
    # Vectorised string build instead of a per-row Python lambda.
    df_realized_vol_per_stock['row_id'] = (
        stock_id + '-' + df_realized_vol_per_stock['time_id'].astype(str)
    )
    return df_realized_vol_per_stock[['row_id', prediction_column_name]]

def past_realized_volatility_per_stock(list_file, prediction_column_name):
    """Stack the per-file realized-volatility tables for every path in `list_file`.

    Parameters
    ----------
    list_file : iterable of str
        Book file paths, each passed to `realized_volatility_per_time_id`.
    prediction_column_name : str
        Name of the realized-volatility column in the result.

    Returns
    -------
    pandas.DataFrame
        Columns `row_id` and `prediction_column_name`. Each file's rows keep
        their own index (the index is not reset). When `list_file` is empty an
        empty frame with those two columns is returned.
    """
    # Collect first and concatenate once: concatenating inside the loop copies
    # the accumulated frame on every iteration.
    per_file_frames = [
        realized_volatility_per_time_id(file, prediction_column_name)
        for file in tqdm(list_file)
    ]
    if not per_file_frames:
        # pd.concat raises on an empty list; return an empty, correctly-shaped frame.
        return pd.DataFrame(columns=['row_id', prediction_column_name])
    return pd.concat(per_file_frames)

In [None]:
# Keyword arguments handed to xgb.XGBRegressor before the saved model is loaded.
# 'hist' is the active tree method; the notebook previously kept 'gpu_hist'
# around as a commented-out alternative.
params = dict(
    n_estimators=10000,
    learning_rate=0.01,
    colsample_bytree=0.8,
    max_depth=5,
    subsample=0.8,
    objective='reg:squarederror',
    eval_metric='rmse',
    tree_method='hist',
    n_jobs=4,
    seed=42,
)

In [None]:
def rmspe(predictions, targets):
    """Return the root of the mean squared relative error of `predictions`.

    The relative error is (predictions - targets) / targets, so `targets`
    is expected to be non-zero.
    """
    relative_error = (predictions - targets) / targets
    mean_squared = (relative_error ** 2).mean()
    return np.sqrt(mean_squared)

In [None]:
# Gather the book and trade entries and order each list by the text that follows
# the first '=' in the path, so the two lists line up when they hold the same stocks.
list_order_book_file_test = sorted(
    glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*'),
    key=lambda path: path.split('=')[1],
)
list_order_trade_file_test = sorted(
    glob.glob('/kaggle/input/optiver-realized-volatility-prediction/trade_test.parquet/*'),
    key=lambda path: path.split('=')[1],
)

# Reduce `test` to its key column, then attach the realized volatility computed
# from the book files as the `pred` column (NaN where no match exists).
test = test[['row_id']]
df_naive_pred_test = past_realized_volatility_per_stock(
    list_file=list_order_book_file_test,
    prediction_column_name='pred',
)
test = test.merge(df_naive_pred_test[['row_id', 'pred']], on='row_id', how='left')
test.head()

In [None]:
n_splits = 5

# Column groups that define the feature matrix fed to the saved models.
book_feats = ['bid_price1', 'ask_price1', 'bid_price2', 'ask_price2',
              'bid_size1', 'ask_size1', 'bid_size2', 'ask_size2', 'wap', 'log_return']
book_transforms = ['rank', 'cumcount', 'cummax', 'cummin', 'cumsum']
trade_feats = ['price', 'size', 'order_count']
trade_stats = ['mean', 'std', 'max', 'min']

# Feature order matters to the loaded models, so it is fixed here once:
# raw columns, then <feat>_<transform> (transform-major), then
# trade_<feat>_<stat> (stat-major).
cols = ['seconds_in_bucket', *book_feats, 'pred']
cols += [f'{feat}_{trans}' for trans in book_transforms for feat in book_feats]
cols += [f'trade_{feat}_{stat}' for stat in trade_stats for feat in trade_feats]

per_stock_preds = []
for b, t in tqdm(zip(list_order_book_file_test, list_order_trade_file_test),
                 total=len(list_order_book_file_test)):
    stock_id = b.split('=')[1]
    assert stock_id == t.split('=')[1], f'book/trade file mismatch: {b} vs {t}'

    book_data = pd.read_parquet(b)
    trade_data = pd.read_parquet(t)

    book_data['row_id'] = stock_id + '-' + book_data['time_id'].astype(str)
    book_data['wap'] = calculate_wap(book_data)
    # `transform` keeps the frame's own index; `groupby(...).apply(...)` prepends
    # the group key on pandas >= 2.0 and the column assignment then fails.
    book_data['log_return'] = (
        book_data.groupby('time_id')['wap'].transform(log_return).fillna(0)
    )

    # Per-time_id running/rank transforms of every book feature, computed from a
    # single groupby and attached in one concat.
    by_time = book_data.groupby('time_id')
    book_transformed = pd.DataFrame({
        f'{feat}_{trans}': by_time[feat].transform(trans)
        for trans in book_transforms
        for feat in book_feats
    })
    book_data = pd.concat([book_data, book_transformed], axis=1)

    # One aggregate row per time_id for the trade statistics, merged once
    # (time_ids without trades get NaN).
    trade_agg = trade_data.groupby('time_id')[trade_feats].agg(trade_stats)
    trade_agg.columns = [f'trade_{feat}_{stat}' for feat, stat in trade_agg.columns]
    book_data = book_data.merge(trade_agg.reset_index(), on='time_id', how='left')

    # Brings in the `pred` column from `test`.
    book_data = book_data.merge(test, on='row_id', how='left')

    # Average the predictions of the n_splits saved fold models for this stock.
    features = book_data[cols]
    target = np.zeros(len(book_data))
    for fold in range(n_splits):
        clf = xgb.XGBRegressor(**params)
        clf.load_model(f'../input/optiver-cv-xgb/XGB_{stock_id}_{fold}')
        target += clf.predict(features) / n_splits

        del clf
        gc.collect()

    per_stock_preds.append(pd.DataFrame({'row_id': book_data['row_id'], 'target': target}))

    del book_data, trade_data, book_transformed, trade_agg, features
    gc.collect()

# Single concatenation instead of growing a frame inside the loop; fall back to
# an empty, correctly-typed frame when there were no input files.
if per_stock_preds:
    xgb_preds = pd.concat(per_stock_preds)
else:
    xgb_preds = pd.DataFrame({'row_id': pd.Series(dtype='object'),
                              'target': pd.Series(dtype='float64')})

In [None]:
# Collapse the per-book-row predictions to a single mean `target` per row_id.
# Only row_ids that occurred in the book data are present in this frame.
xgb_preds = xgb_preds.groupby('row_id', as_index=False)['target'].mean()
xgb_preds.head()

In [None]:
# Write the row_id/target table as the submission file, without the index column.
xgb_preds.to_csv(path_or_buf='submission.csv', index=False)