# Weighted Regression Baseline

The idea of this notebook is to show that a simple weight helps optimise the RMSPE, as discussed here: https://www.kaggle.com/c/optiver-realized-volatility-prediction/discussion/250324

I got the idea of fitting one regression per stock, and realised the importance of taking the custom evaluation metric into account, while working on my EDA notebook: https://www.kaggle.com/lucasmorin/target-error-exploration-stock-time-clustering


In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
# Load the training table and preview its first rows.
train_csv_path = '../input/optiver-realized-volatility-prediction/train.csv'
train = pd.read_csv(train_csv_path)
train.head()

In [None]:
# Load the order book and the trades of stock 0 and keep a single bucket
# (time_id == 5) to illustrate the feature computation step by step.
stock_id = '0'
book_example = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0')
trade_example =  pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0')

# Take explicit copies of the filtered rows: columns are added to these frames
# below, and writing into a filtered slice triggers SettingWithCopyWarning.
book_example = book_example[book_example['time_id']==5].copy()
book_example['stock_id'] = stock_id
trade_example = trade_example[trade_example['time_id']==5].copy()
trade_example['stock_id'] = stock_id

In [None]:
# Weighted average price of the first book level: each side's price is
# weighted by the size quoted on the opposite side.
bid_px, ask_px = book_example['bid_price1'], book_example['ask_price1']
bid_sz, ask_sz = book_example['bid_size1'], book_example['ask_size1']
book_example['wap'] = (bid_px * ask_sz + ask_px * bid_sz) / (bid_sz + ask_sz)

In [None]:
# Line chart of the WAP against the time elapsed in the bucket.
fig = px.line(
    data_frame=book_example,
    x="seconds_in_bucket",
    y="wap",
    title='WAP of stock_id_0, time_id_5',
)
fig.show()

In [None]:
def log_return(list_stock_prices):
    """Return the log returns of consecutive prices.

    Despite the parameter name, the input must provide ``.diff()`` once the
    logarithm is taken (e.g. a pandas Series). The first element of the result
    is NaN because it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

In [None]:
# Attach the WAP log returns, then drop the rows where the return is null.
wap_log_returns = log_return(book_example['wap'])
book_example.loc[:, 'log_return'] = wap_log_returns
book_example = book_example[book_example['log_return'].notnull()]

In [None]:
# Line chart of the log returns against the time elapsed in the bucket.
fig = px.line(
    data_frame=book_example,
    x="seconds_in_bucket",
    y="log_return",
    title='Log return of stock_id_0, time_id_5',
)
fig.show()

In [None]:
def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    sum_of_squares = np.sum(squared_returns)
    return np.sqrt(sum_of_squares)
# Realized volatility of the example bucket, computed from its log returns.
example_log_returns = book_example['log_return']
realized_vol = realized_volatility(example_log_returns)
print(f'Realized volatility for stock_id 0 on time_id 5 is {realized_vol}')

In [None]:
import os
from sklearn.metrics import r2_score
import glob

# One path per `stock_id=<id>` partition of the training order book.
# glob returns paths in arbitrary order, so sort them to process the stocks in
# the same order on every run.
list_order_book_file_train = sorted(glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*'))

In [None]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures


# Build the `<stock_id>-<time_id>` key used to join predictions with targets.
train['row_id'] = train['stock_id'].astype(str).str.cat(train['time_id'].astype(str), sep='-')

# Fitted model of each stock, keyed by the stock id taken from the file path.
model_dict = dict()

def realized_volatility_per_time_id_linear(file_path, prediction_column_name, train_test = True):
    """Predict the target of one stock from its per-bucket realized volatility.

    The order book stored at ``file_path`` is reduced to one realized
    volatility per ``time_id``. That value is expanded into degree-3 polynomial
    features and passed through a per-stock kernel ridge model.

    Parameters
    ----------
    file_path : str
        Path of one ``stock_id=<id>`` parquet partition; the stock id is the
        text that follows the last ``=``.
    prediction_column_name : str
        Name of the output column that holds the predictions.
    train_test : bool, default True
        If True, fit a model against the ``target`` column of the module-level
        ``train`` frame (joined on ``row_id``), select ``alpha`` by 5-fold grid
        search and store the fitted search object in the module-level
        ``model_dict`` under the stock id. If False, reuse the model already
        stored in ``model_dict`` for this stock.

    Returns
    -------
    pandas.DataFrame
        Columns ``row_id`` (``<stock_id>-<time_id>``) and
        ``prediction_column_name``, one row per ``time_id`` that has at least
        one non-null log return.

    Raises
    ------
    ValueError
        In training mode, when no row of this stock has a usable target.
    KeyError
        In prediction mode, when no model was fitted for this stock.
    """
    df_book_data = pd.read_parquet(file_path)

    # Weighted average price of the first book level.
    bid_px, ask_px = df_book_data['bid_price1'], df_book_data['ask_price1']
    bid_sz, ask_sz = df_book_data['bid_size1'], df_book_data['ask_size1']
    df_book_data['wap'] = (bid_px * ask_sz + ask_px * bid_sz) / (bid_sz + ask_sz)

    # `transform` keeps the original row index. `groupby(...).apply(...)`
    # prepends the group key to the index on pandas >= 2.0, which makes the
    # column assignment fail.
    df_book_data['log_return'] = df_book_data.groupby('time_id')['wap'].transform(log_return)
    df_book_data = df_book_data[df_book_data['log_return'].notnull()]

    df_realized_vol_per_stock = (
        df_book_data.groupby('time_id')['log_return']
        .agg(realized_volatility)
        .reset_index()
        .rename(columns={'log_return': prediction_column_name})
    )

    # Take the id after the LAST '=' so an '=' earlier in the path is harmless.
    stock_id = file_path.rsplit('=', 1)[1]
    df_realized_vol_per_stock['row_id'] = stock_id + '-' + df_realized_vol_per_stock['time_id'].astype(str)

    # The polynomial expansion does not depend on the data values, so the same
    # feature matrix serves both fitting and prediction.
    poly = PolynomialFeatures(degree=3)
    X_ = poly.fit_transform(df_realized_vol_per_stock[[prediction_column_name]].to_numpy())

    if train_test:
        # Left-join the targets so the rows stay aligned with X_. `validate`
        # fails loudly if `train` holds duplicated row ids.
        labelled = df_realized_vol_per_stock[['row_id']].merge(
            train[['row_id', 'target']], on='row_id', how='left', validate='many_to_one')
        y = labelled['target'].to_numpy(dtype=float)

        # Fit only on rows with a usable target: NaN targets would make the fit
        # fail, and a zero target has no defined percentage error.
        usable = np.isfinite(y) & (y != 0)
        if not usable.any():
            raise ValueError(f'no usable training target for stock_id {stock_id}')
        y_fit = y[usable]

        # RMSPE averages ((y - pred) / y) ** 2, i.e. the squared error weighted
        # by 1 / y ** 2, so use those weights when fitting.
        # NOTE(review): the absolute scale of the weights may change how strongly
        # `alpha` regularises - re-tune the grid if needed.
        weights = 1.0 / np.square(y_fit)

        parameters = {'alpha': list(range(1, 11))}
        grid_kr = GridSearchCV(
            estimator=KernelRidge(),
            cv=5,
            param_grid=parameters
        )
        kr = grid_kr.fit(X_[usable], y_fit, sample_weight=weights)
        model_dict[stock_id] = kr
    else:
        if stock_id not in model_dict:
            raise KeyError(f'no model fitted for stock_id {stock_id}; run the training pass first')
        kr = model_dict[stock_id]

    # Replace the raw realized volatility by the model prediction for every row.
    df_realized_vol_per_stock[prediction_column_name] = kr.predict(X_)

    return df_realized_vol_per_stock[['row_id',prediction_column_name]]

In [None]:
def past_realized_volatility_per_stock_linear(list_file,prediction_column_name, train_test = True):
    """Run the per-stock prediction on every file and stack the results.

    Parameters
    ----------
    list_file : iterable of str
        Paths passed one by one to ``realized_volatility_per_time_id_linear``.
    prediction_column_name : str
        Name of the prediction column, forwarded unchanged.
    train_test : bool, default True
        Forwarded unchanged (fit new models vs. reuse stored ones).

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-file results (original indices are
        kept); an empty frame when ``list_file`` is empty.
    """
    # Collect the frames first and concatenate once: calling pd.concat inside
    # the loop re-copies the accumulated rows on every iteration.
    frames = [
        realized_volatility_per_time_id_linear(file, prediction_column_name, train_test)
        for file in list_file
    ]
    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(frames)

# Fit one model per training file and collect the in-sample predictions.
df_past_realized_train = past_realized_volatility_per_stock_linear(
    list_file=list_order_book_file_train,
    prediction_column_name='pred',
)

In [None]:
# Display the stacked training predictions (columns `row_id` and `pred`).
df_past_realized_train

In [None]:
# Attach the predictions to the training rows; every row of `train` is kept.
train_predictions = df_past_realized_train[['row_id', 'pred']]
df_joined = train.merge(train_predictions, how='left', on=['row_id'])

In [None]:
# Preview the training rows together with their merged `pred` column.
df_joined.head()

In [None]:
from sklearn.metrics import r2_score
def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared_relative_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_relative_error)
# In-sample scores of the predictions, rounded to three decimals for display.
r2_unrounded = r2_score(y_true=df_joined['target'], y_pred=df_joined['pred'])
rmspe_unrounded = rmspe(y_true=df_joined['target'], y_pred=df_joined['pred'])
R2 = round(r2_unrounded, 3)
RMSPE = round(rmspe_unrounded, 3)
print(f'Performance of the kernel ridge regression prediction: R2 score: {R2}, RMSPE: {RMSPE}')

Submission

In [None]:
# Predict the test set with the models fitted during training and write the
# submission file (columns `row_id` and `target`).
# glob returns paths in arbitrary order, so sort them for reproducible output.
list_order_book_file_test = sorted(glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*'))

# Only bind the test predictions to their own name: the previous chained
# assignment also overwrote `df_past_realized_train` with test predictions.
df_naive_pred_test = past_realized_volatility_per_stock_linear(
    list_file=list_order_book_file_test,
    prediction_column_name='target',
    train_test=False,
)
df_naive_pred_test.to_csv('submission.csv',index = False)