In [None]:
import os
from sklearn.metrics import r2_score
import glob
import numpy as np
import pandas as pd


# One path per entry directly under book_train.parquet/. glob.glob returns
# matches in arbitrary (filesystem-dependent) order, so sort them to make the
# processing order -- and the row order of the concatenated result --
# reproducible between runs.
list_order_book_file_train = sorted(
    glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')
)

In [None]:
def realized_volatility_per_time_id(file_path):
    """Build per-``time_id`` features from one order-book parquet path.

    Parameters
    ----------
    file_path : str
        Path readable by ``pd.read_parquet``. It must contain an ``=``; the
        text after the first ``=`` is used as the stock id (e.g.
        ``.../stock_id=0`` -> ``"0"``).

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with columns ``row_id``
        (``"<stock_id>-<time_id>"``), ``realized_volatility`` and ``bas``.
    """
    df_book_data = pd.read_parquet(file_path)

    # Level-1 size-weighted price: each side's price is weighted by the
    # opposite side's size.
    total_size = df_book_data['bid_size1'] + df_book_data['ask_size1']
    df_book_data['wap'] = (
        df_book_data['bid_price1'] * df_book_data['ask_size1']
        + df_book_data['ask_price1'] * df_book_data['bid_size1']
    ) / total_size

    # 'bas': lowest ask over highest bid (across the two book levels), minus 1.
    lowest_ask = df_book_data[['ask_price1', 'ask_price2']].min(axis=1)
    highest_bid = df_book_data[['bid_price1', 'bid_price2']].max(axis=1)
    df_book_data['bas'] = lowest_ask / highest_bid - 1

    # transform() always returns a result aligned to the original row index.
    # groupby(...).apply(...) prepends the group key to the index on
    # pandas >= 2.0, which makes the column assignment fail.
    df_book_data['log_return'] = (
        df_book_data.groupby('time_id')['wap'].transform(log_return)
    )
    # The first row of every time_id has no previous price, hence a NaN return.
    df_book_data = df_book_data[df_book_data['log_return'].notna()]

    df_realized_vol_per_stock = (
        df_book_data.groupby('time_id')
        .agg({'log_return': realized_volatility, 'bas': 'mean'})
        .reset_index()
        .rename(columns={'log_return': 'realized_volatility'})
    )

    stock_id = file_path.split('=')[1]
    # Vectorized equivalent of formatting f'{stock_id}-{time_id}' row by row.
    df_realized_vol_per_stock['row_id'] = (
        stock_id + '-' + df_realized_vol_per_stock['time_id'].astype(str)
    )
    return df_realized_vol_per_stock[['row_id', 'realized_volatility', 'bas']]

In [None]:
from tqdm import tqdm

def log_return(list_stock_prices):
    """Return the first difference of the natural log of the prices.

    The input must yield an object with a ``.diff()`` method after
    ``np.log`` (e.g. a pandas Series); the first element of the result is NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    sum_of_squares = np.sum(squared_returns)
    return np.sqrt(sum_of_squares)

def past_realized_volatility_per_stock(list_file):
    """Compute the per-``time_id`` features for every file and stack them.

    Parameters
    ----------
    list_file : iterable of str
        Paths passed one by one to ``realized_volatility_per_time_id``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-file results (original indices are
        kept); an empty DataFrame when ``list_file`` is empty.
    """
    # Collect first and concatenate once: calling pd.concat inside the loop
    # recopies the accumulated frame on every iteration (quadratic cost).
    per_file_frames = [realized_volatility_per_time_id(file) for file in tqdm(list_file)]
    if not per_file_frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(per_file_frames)

# Build the feature table for every training order-book path found above.
# Each path is read with pd.read_parquet, so this is likely the slow cell.
df_past_realized_train = past_realized_volatility_per_stock(list_file=list_order_book_file_train)

In [None]:
# Load the training table that holds the target column.
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')

# Build the "<stock_id>-<time_id>" key so it matches the row_id produced by
# realized_volatility_per_time_id, then left-join the book features so every
# training row is kept even when no features were computed for it.
train['row_id'] = train['stock_id'].astype(str).str.cat(train['time_id'].astype(str), sep='-')
df_joined = pd.merge(train, df_past_realized_train, on='row_id', how='left')

In [None]:
# Display the merged frame (train columns plus realized_volatility and bas).
df_joined

In [None]:
!pip install lofo-importance

In [None]:
from lofo import Dataset, LOFOImportance, plot_importance
from sklearn.metrics import make_scorer

def rmspe(y_true, y_pred):
    """Root mean squared percentage error: sqrt(mean(((y_true - y_pred) / y_true) ** 2))."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared_relative_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_relative_error)


# RMSPE is an error (lower is better), so greater_is_better=False makes the
# scorer return its negation. needs_proba / needs_threshold are left out:
# both were passed as False (their default) and are deprecated in
# scikit-learn 1.4, so omitting them keeps the same behaviour without the
# deprecation warning or a later failure.
scorer = make_scorer(rmspe, greater_is_better=False)

def get_importance():
    """Run LOFO importance on the notebook-level ``df_joined`` frame.

    Uses ``target`` as the label and ``stock_id``, ``time_id``,
    ``realized_volatility`` and ``bas`` as features, with 4-fold CV and the
    notebook-level ``scorer``. Returns whatever
    ``LOFOImportance.get_importance()`` returns.
    """
    feature_names = ["stock_id", "time_id", "realized_volatility", "bas"]
    lofo_dataset = Dataset(df=df_joined, target="target", features=feature_names)
    # NOTE(review): fit_params is presumably forwarded to the model's fit()
    # so stock_id is treated as categorical -- confirm against the lofo docs.
    model_fit_params = {"categorical_feature": ["stock_id"]}
    importance_runner = LOFOImportance(
        lofo_dataset,
        cv=4,
        scoring=scorer,
        fit_params=model_fit_params,
    )
    return importance_runner.get_importance()

# Compute the importances once and keep them so later cells can re-plot
# without re-running the cross-validation.
importance_df = get_importance()
plot_importance(importance_df, figsize=(8, 4), kind="default")

In [None]:
# Same importance table as above, rendered with kind="box".
plot_importance(importance_df, figsize=(8, 4), kind="box")