In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
from joblib import Parallel, delayed
import xgboost as xgb
import copy
from xgboost.sklearn import XGBRegressor
import os
from sklearn.linear_model import LinearRegression
import warnings
import joblib
from sklearn.model_selection import train_test_split, KFold
import lightgbm as lgb
import datatable as dt
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, RobustScaler, StandardScaler
warnings.filterwarnings(action='ignore', category=UserWarning)

# Training notebook: https://www.kaggle.com/yus002/realized-volatility-prediction-lgbm-train

# References: 
* https://www.kaggle.com/mayunnan/realized-volatility-prediction-code-template by Ma Yunnan
* https://www.kaggle.com/thanish/randomforest-starter-submission by Thanish Batcha

In [None]:
def my_metrics(y_true, y_pred):
    """Return the root mean squared percentage error of ``y_pred``.

    Each residual is divided by the matching ``y_true`` value before it is
    squared, averaged and square-rooted.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_relative_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_relative_error)
def rmspe(y_true, y_pred):
    """Wrap ``my_metrics`` as a three-element evaluation result.

    Shape of the callable, as noted by the original author:
    f(y_true: array, y_pred: array) -> name: string, eval_result: float, is_higher_better: bool

    Returns:
        The tuple ``('rmspe', score, False)``; ``False`` marks the metric as
        lower-is-better.
    """
    return 'rmspe', my_metrics(y_true, y_pred), False
def log_return(list_stock_prices):
    """Return the first difference of the natural log of the given prices.

    The input must expose ``.diff()`` after ``np.log`` (e.g. a pandas Series);
    the first element of the result is NaN because it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()
def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)

def get_stock_stat(stock_id : int, dataType = 'train'):
    """Build per-``time_id`` order-book features for one stock.

    Args:
        stock_id: Stock whose ``book_{dataType}.parquet/stock_id={stock_id}/``
            partition is read.
        dataType: Which book dataset to read; interpolated into the path
            (``'train'`` by default, the notebook also passes ``'test'``).

    Returns:
        DataFrame with one row per ``time_id`` and the columns ``time_id``,
        ``log_return`` (realized volatility of the WAP log returns), ``bas``
        (mean bid-ask spread) and ``stock_id``.
    """
    book = pd.read_parquet(f'../input/optiver-realized-volatility-prediction/book_{dataType}.parquet/stock_id={stock_id}/')
    # sort_values returns a new frame; the result must be kept, otherwise the
    # sort is a no-op. Row order matters because log returns are a row-to-row diff.
    book = book.sort_values(by=['time_id', 'seconds_in_bucket']).reset_index(drop=True)

    # Spread: best (lowest) ask over best (highest) bid across both levels, minus 1.
    best_ask = book[['ask_price1', 'ask_price2']].min(axis = 1)
    best_bid = book[['bid_price1', 'bid_price2']].max(axis = 1)
    book['bas'] = best_ask / best_bid - 1

    # Level-1 weighted average price: each side's price weighted by the opposite side's size.
    book['wap'] = (book['bid_price1'] * book['ask_size1'] +
                   book['ask_price1'] * book['bid_size1']) / (
                   book['bid_size1'] + book['ask_size1'])

    # transform() returns a result indexed like `book`, so the assignment is
    # aligned by index instead of relying on positional order after a reset_index.
    # The first row of every time_id has no predecessor and becomes 0.
    book['log_return'] = (book.groupby(by = ['time_id'])['wap'].
                          transform(log_return).
                          fillna(0)
                         )

    stock_stat = pd.merge(
        book.groupby(by = ['time_id'])['log_return'].agg(realized_volatility).reset_index(),
        book.groupby(by = ['time_id'], as_index = False)['bas'].mean(),
        on = ['time_id'],
        how = 'left'
    )
    stock_stat['stock_id'] = stock_id
    return stock_stat
def get_dataSet(stock_ids : list, dataType = 'train'):
    """Run ``get_stock_stat`` for every stock id and stack the results.

    The per-stock calls are dispatched through joblib with ``n_jobs=-1``; the
    returned frame is the row-wise concatenation with a fresh index.
    """
    jobs = (delayed(get_stock_stat)(stock_id, dataType) for stock_id in stock_ids)
    per_stock_frames = Parallel(n_jobs=-1)(jobs)
    return pd.concat(per_stock_frames, ignore_index = True)

In [None]:
# Switch read by the later cells: a truthy value loads X_131_features.csv, adds
# one-hot stock_id_* columns to the test features and uses the models in
# 629-rvp-131-features-model; a falsy value loads X.csv and uses the models in
# 629rvpstock-id-as-continous.
keep_stock_id = 1
# keep_stock_id = 0


In [None]:
# train -------------------------
if keep_stock_id:
    # Read the CSV through datatable, then convert to pandas.
    td = dt.fread('../input/mytrain/X_131_features.csv')
    X = td.to_pandas()
    del td
else:
    X = pd.read_csv("../input/mytrain/X.csv")
    X = X.drop("stock_id", axis = 1)
y = pd.read_csv("../input/mytrain/y.csv")

# to_test ----------------------------------------------------
test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
test_stock_stat_df = get_dataSet(stock_ids = test['stock_id'].unique(), dataType = 'test')
test_dataSet = pd.merge(test, test_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left')
# Keep the row ids aside; predictions are attached to this frame later.
final_pred1 = test_dataSet[['row_id']]
# Rows of test.csv without book data get 0 for the book features.
to_test = test_dataSet.drop(['row_id'], axis = 1).fillna(0)
if keep_stock_id:
    train = pd.read_csv("../input/optiver-realized-volatility-prediction/train.csv")
    # Build the id list once so column names and indicator columns share one order.
    train_stock_ids = list(set(train.stock_id))
    cols = [f'stock_id_{c}' for c in train_stock_ids]
    # One indicator column per stock id seen in train.csv (row i, column j is 1
    # when row i belongs to train_stock_ids[j]).
    one_hot = (to_test['stock_id'].to_numpy()[:, None]
               == np.asarray(train_stock_ids)[None, :]).astype('int')
    to_test = pd.concat(
        [to_test, pd.DataFrame(one_hot, columns = cols, index = to_test.index)],
        axis = 1
    )
# NOTE: 'stock_id' is deliberately kept in to_test in both branches. The
# trade-feature cell merges on ['stock_id', 'time_id'] and drops 'stock_id'
# afterwards; dropping it here made that merge fail when keep_stock_id is falsy.

In [None]:
# File locations. 'input_path' is a prefix: read_data appends
# '<data_type>.parquet/stock_id=<id>/*' to it.
config = dict(
    input_path="../input/optiver-realized-volatility-prediction/trade_",
    train_path='../input/optiver-realized-volatility-prediction/train.csv',
    test_path='../input/optiver-realized-volatility-prediction/test.csv',
)
test_df = pd.read_csv(config['test_path'])
def read_data(stock_id, data_type):
    """Read the trade parquet data of one stock.

    The glob pattern is ``config['input_path']`` followed by
    ``{data_type}.parquet/stock_id={stock_id}/*``. Only the first match is
    read; an ``IndexError`` is raised when nothing matches.
    """
    pattern = config['input_path']+f'{data_type}.parquet/stock_id={stock_id}/*'
    matches = glob.glob(pattern)
    return pd.read_parquet(matches[0])
def get_final_df(df, data_type):
    """Load the trade data of every stock listed in ``df`` into one frame.

    Args:
        df: Frame with a ``stock_id`` column; each unique id is loaded once.
        data_type: Passed through to ``read_data`` to select the dataset.

    Returns:
        The per-stock frames stacked row-wise with a fresh 0..n-1 index and an
        added ``stock_id`` column. An empty DataFrame when ``df`` lists no stock.
    """
    frames = []
    for stock_id in tqdm(df['stock_id'].unique().tolist()):
        stock_df = read_data(stock_id=stock_id, data_type=data_type)
        stock_df['stock_id'] = stock_id
        frames.append(stock_df)
    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    # One concat at the end avoids re-copying the accumulated frame on every
    # iteration; ignore_index performs the index reset whose result was
    # previously discarded.
    return pd.concat(frames, ignore_index=True)
def get_agg_info(df):
    """Aggregate trade rows to one row per (stock_id, time_id).

    For each of seconds_in_bucket, price, size and order_count the mean, max,
    min and median are computed. Output columns are named
    ``<stat>_<suffix>`` with the suffixes sec_in_bucket, price, size and order,
    grouped by statistic in the order mean, max, min, median.
    """
    statistics = ('mean', 'max', 'min', 'median')
    # (output suffix, source column)
    sources = (
        ('sec_in_bucket', 'seconds_in_bucket'),
        ('price', 'price'),
        ('size', 'size'),
        ('order', 'order_count'),
    )
    named_aggregations = {
        f'{stat}_{suffix}': (column, stat)
        for stat in statistics
        for suffix, column in sources
    }
    grouped = df.groupby(['stock_id', 'time_id'])
    return grouped.agg(**named_aggregations).reset_index()
# Raw trade rows for every stock in test.csv, then one aggregate row per (stock_id, time_id).
test_final_df = get_final_df(df=test_df, data_type='test')
test_agg = get_agg_info(df=test_final_df)

# Left-join the aggregates onto test.csv; missing aggregates are marked with -999.
test_final_df = (
    pd.merge(test_df, test_agg, on=['stock_id', 'time_id'], how='left')
    .fillna(-999)
    .drop("row_id", axis = 1)
)

# Append the trade features to the book features; stock_id is only a join key
# here and is removed afterwards.
to_test = (
    to_test.merge(test_final_df, on=['stock_id', 'time_id'], how='left')
    .fillna(-999)
    .drop("stock_id", axis = 1)
)

# Load in models

In [None]:
# Directory holding the pickled models for the selected feature set.
model_dir = ('../input/629-rvp-131-features-model' if keep_stock_id
             else '../input/629rvpstock-id-as-continous')

# NOTE: joblib.load unpickles the files, which can execute arbitrary code;
# only point model_dir at model files you trust.
output = []
# Sorted so the models are always averaged in the same order.
for filepath in sorted(glob.glob(f'{model_dir}/*.pkl')):
    model = joblib.load(filepath)
    output.append(model.predict(to_test, num_iteration = model.best_iteration_))
    del model  # release the model before loading the next one

if not output:
    # Without this check the average below fails with a bare ZeroDivisionError.
    raise FileNotFoundError(f'No model files (*.pkl) found in {model_dir}')

# Simple average of the per-model predictions.
y_pred = sum(output) / len(output)

In [None]:
# Attach the averaged model predictions to the row ids kept from test.csv.
final_pred1 = final_pred1.assign(target=y_pred)
final_pred1

# Best naive baseline: 0.29082
# Reference: https://www.kaggle.com/pratibha9/a-quick-model

In [None]:
def realized_volatility_per_time_id(file_path, prediction_column_name):
    """Compute the realized volatility of every ``time_id`` in one book file.

    Args:
        file_path: Parquet path of one stock's order book. The text after the
            first ``'='`` in the path is used as the stock id in ``row_id``.
        prediction_column_name: Name given to the realized-volatility column.

    Returns:
        DataFrame with the columns ``row_id`` (``'<stock_id>-<time_id>'``) and
        ``prediction_column_name``.
    """
    df_book_data = pd.read_parquet(file_path)

    # Weighted average price per book level: each side's price is weighted by
    # the opposite side's size. The final WAP is the mean of both levels.
    wap_level1 = (df_book_data['bid_price1'] * df_book_data['ask_size1'] +
                  df_book_data['ask_price1'] * df_book_data['bid_size1']) / (
                  df_book_data['bid_size1'] + df_book_data['ask_size1'])
    wap_level2 = (df_book_data['bid_price2'] * df_book_data['ask_size2'] +
                  df_book_data['ask_price2'] * df_book_data['bid_size2']) / (
                  df_book_data['bid_size2'] + df_book_data['ask_size2'])
    df_book_data['wap'] = (wap_level1 + wap_level2) / 2

    # transform() keeps the frame's own index. groupby.apply() can prepend the
    # group key to the index (pandas >= 2), which breaks the column assignment.
    df_book_data['log_return'] = df_book_data.groupby(['time_id'])['wap'].transform(log_return)
    # The first row of each time_id has no previous price, hence a NaN return.
    df_book_data = df_book_data[~df_book_data['log_return'].isnull()]

    df_realized_vol_per_stock = (
        df_book_data.groupby(['time_id'])['log_return']
        .agg(realized_volatility)
        .reset_index()
        .rename(columns = {'log_return': prediction_column_name})
    )
    stock_id = file_path.split('=')[1]
    df_realized_vol_per_stock['row_id'] = df_realized_vol_per_stock['time_id'].apply(lambda x:f'{stock_id}-{x}')
    return df_realized_vol_per_stock[['row_id',prediction_column_name]]
def past_realized_volatility_per_stock(list_file,prediction_column_name):
    """Stack ``realized_volatility_per_time_id`` results for several book files.

    Args:
        list_file: Iterable of book parquet paths, one per stock.
        prediction_column_name: Forwarded to ``realized_volatility_per_time_id``.

    Returns:
        Row-wise concatenation of the per-file frames (each frame keeps its own
        index), or an empty DataFrame when ``list_file`` is empty.
    """
    frames = [
        realized_volatility_per_time_id(file, prediction_column_name)
        for file in list_file
    ]
    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    # A single concat avoids re-copying the accumulated frame for every file.
    return pd.concat(frames)

In [None]:
# Naive prediction: realized volatility computed from every entry under book_test.parquet.
list_order_book_file_test = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*')
final_pred2 = past_realized_volatility_per_stock(
    list_file=list_order_book_file_test,
    prediction_column_name='target',
)
final_pred2

In [None]:
output = final_pred1.merge(final_pred2, on = "row_id", how = "outer")
# x: my pred, y: naive
# A row_id present in only one frame has NaN on the other side after the outer
# merge. Fall back to the prediction that exists instead of blending with 0,
# which would halve it; 0 is used only when both are missing.
model_pred = output["target_x"].fillna(output["target_y"]).fillna(0)
naive_pred = output["target_y"].fillna(output["target_x"]).fillna(0)
my_pred_weight = 0.5
output["target"] = model_pred * my_pred_weight + naive_pred * (1 - my_pred_weight)
output = output[["row_id", "target"]]
output.to_csv('submission.csv',index = False)
print(output)