In [None]:
import numpy as np 
import pandas as pd
import os
from sklearn.metrics import r2_score
import glob

# Checking format of dataset

In [None]:
# Load the three competition CSV files: the sample submission, the training
# table (its stock_id, time_id and target columns are used further down) and
# the test table.
sub = pd.read_csv("/kaggle/input/optiver-realized-volatility-prediction/sample_submission.csv")
train = pd.read_csv("/kaggle/input/optiver-realized-volatility-prediction/train.csv")
test = pd.read_csv("/kaggle/input/optiver-realized-volatility-prediction/test.csv")

In [None]:
# Display the first rows of the training table to inspect its columns.
train.head()

In [None]:
# CHECKING UNIQUE STOCK_IDS
# .nunique() counts the distinct stock_ids; .unique().sum() would instead add
# the id values together, which is not a count.
train["stock_id"].nunique()

In [None]:
# LOADING PARQUET
# Collect every entry directly under the book and trade training parquet
# directories (entries are named like "stock_id=0", as read in the next cell).
import glob
list_order_book_file_train = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')
list_order_trade_file_train = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/*')

In [None]:
# Load the book and trade data of stock_id=0 and keep only time_id 5.
# The worked example below (WAP -> log returns -> realized volatility) reports
# its result "for stock_id 0 on time_id 5"; without this filter the log-return
# diff would run across time_id boundaries and the volatility would be summed
# over every time bucket of the stock.
book_example = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0')
# .copy() because new columns are added to this frame in later cells.
book_example = book_example[book_example['time_id'] == 5].copy()
trade_example = pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0')
trade_example = trade_example[trade_example['time_id'] == 5].copy()

In [None]:
# Number of paths matched by the book_train glob.
len(list_order_book_file_train)

In [None]:
# Number of paths matched by the trade_train glob.
len(list_order_trade_file_train)

To compare consecutive prices we use log returns: we take the logarithm of each price and then the difference between consecutive values, i.e. log(p_t) - log(p_(t-1)).


In [None]:
def log_return(list_stock_prices):
    """Return the log returns of a sequence of prices.

    The log return at position t is log(p_t) - log(p_(t-1)), i.e. the
    difference of consecutive log prices.

    Parameters
    ----------
    list_stock_prices : pandas.Series, pandas.DataFrame, list or array-like
        Prices in the order in which they should be differenced. A pandas
        object is used as-is, so its index is preserved; any other input is
        wrapped in a pandas Series first.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        Same length as the input. The first element is NaN because it has no
        predecessor.
    """
    prices = list_stock_prices
    if not isinstance(prices, (pd.Series, pd.DataFrame)):
        # np.log on a list / ndarray returns an ndarray, which has no .diff().
        prices = pd.Series(prices)
    return np.log(prices).diff()

To calculate volatility (which is our target), we will use the Weighted Average Price (WAP), computed from the bid size, ask size, bid price and ask price.
The formula we will use is as follows:

* a = (BidPrice1∗AskSize1+AskPrice1∗BidSize1) / (BidSize1+AskSize1)
* b = (BidPrice2∗AskSize2+AskPrice2∗BidSize2) / (BidSize2+AskSize2)
* WAP = (a + b) / 2

In [None]:
# Shorthand for the order-book columns of both price levels.
bid_px1, ask_px1 = book_example['bid_price1'], book_example['ask_price1']
bid_sz1, ask_sz1 = book_example['bid_size1'], book_example['ask_size1']
bid_px2, ask_px2 = book_example['bid_price2'], book_example['ask_price2']
bid_sz2, ask_sz2 = book_example['bid_size2'], book_example['ask_size2']

# Per-level weighted price: each side's price is weighted by the size on the
# opposite side, then normalised by the total size at that level.
a = (bid_px1 * ask_sz1 + ask_px1 * bid_sz1) / (bid_sz1 + ask_sz1)
b = (bid_px2 * ask_sz2 + ask_px2 * bid_sz2) / (bid_sz2 + ask_sz2)

# WAP is the plain average of the two levels.
book_example['wap'] = (a + b) / 2

In [None]:
# Log returns of the WAP series. The first row has no predecessor, so its
# log return is NaN; rows without a log return are dropped.
book_example['log_return'] = log_return(book_example['wap'])
book_example = book_example[book_example['log_return'].notna()]

In [None]:
def realized_volatility(series_log_return):
    """Return the realized volatility of a series of log returns.

    Realized volatility is the square root of the sum of squared log returns.

    Parameters
    ----------
    series_log_return : pandas.Series or numpy.ndarray
        Log returns of one time window.

    Returns
    -------
    float
        sqrt(sum(r_t ** 2)).
    """
    squared_returns = series_log_return ** 2
    sum_of_squares = np.sum(squared_returns)
    return np.sqrt(sum_of_squares)
# Realized volatility over every log return currently held in book_example.
# NOTE(review): the message names time_id 5; that is only accurate if
# book_example has been restricted to that single time_id - confirm.
realized_vol = realized_volatility(book_example['log_return'])
print(f'Realized volatility for stock_id 0 on time_id 5 is {realized_vol}')

In [None]:
# Re-list the book training partitions (same glob pattern as in the
# "LOADING PARQUET" cell) so this part of the notebook has the list at hand.
list_order_book_file_train = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')

Calculating Volatility per time_id

In [None]:
def realized_volatility_per_time_id(file_path, prediction_column_name):
    """Compute the realized volatility of every time_id in one order-book file.

    Parameters
    ----------
    file_path : str
        Path of a parquet file/partition readable by ``pd.read_parquet``. The
        text after the first '=' in the path is used as the stock id (e.g.
        ``.../book_train.parquet/stock_id=0`` -> ``'0'``). The data must
        contain ``time_id`` and the level 1/2 bid/ask price and size columns.
    prediction_column_name : str
        Name given to the realized-volatility column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per time_id with the columns ``row_id``
        (``'<stock_id>-<time_id>'``) and ``prediction_column_name``.
    """
    df_book_data = pd.read_parquet(file_path)

    # Per-level weighted price: each side's price weighted by the opposite
    # side's size; WAP is the average of the two levels.
    wap_level1 = (df_book_data['bid_price1'] * df_book_data['ask_size1'] +
                  df_book_data['ask_price1'] * df_book_data['bid_size1']) / (
                  df_book_data['bid_size1'] + df_book_data['ask_size1'])
    wap_level2 = (df_book_data['bid_price2'] * df_book_data['ask_size2'] +
                  df_book_data['ask_price2'] * df_book_data['bid_size2']) / (
                  df_book_data['bid_size2'] + df_book_data['ask_size2'])
    df_book_data['wap'] = (wap_level1 + wap_level2) / 2

    # transform (not apply) returns a Series aligned to the original row
    # index, so it can be assigned back as a column. With apply, pandas >= 2.0
    # prepends the group key to the index and the assignment fails.
    df_book_data['log_return'] = df_book_data.groupby('time_id')['wap'].transform(log_return)

    # The first row of every time_id has no predecessor -> NaN log return.
    df_book_data = df_book_data[df_book_data['log_return'].notna()]

    df_realized_vol_per_stock = (
        df_book_data.groupby('time_id')['log_return']
        .agg(realized_volatility)
        .reset_index()
        .rename(columns={'log_return': prediction_column_name})
    )

    stock_id = file_path.split('=')[1]
    # Vectorized string concatenation instead of a row-wise apply.
    df_realized_vol_per_stock['row_id'] = (
        stock_id + '-' + df_realized_vol_per_stock['time_id'].astype(str)
    )
    return df_realized_vol_per_stock[['row_id', prediction_column_name]]

Volatility per stock_id.

In [None]:
def past_realized_volatility_per_stock(list_file,prediction_column_name):
    """Compute the realized volatility per time_id for a list of order-book files.

    Parameters
    ----------
    list_file : iterable of str
        Paths handed one by one to ``realized_volatility_per_time_id``.
    prediction_column_name : str
        Name of the realized-volatility column in the result.

    Returns
    -------
    pandas.DataFrame
        The per-file results stacked in the order of ``list_file`` (each
        file's own index is kept, as before). For an empty ``list_file`` an
        empty frame with the columns ``row_id`` and ``prediction_column_name``
        is returned.
    """
    # Collect the frames first and concatenate once: calling pd.concat inside
    # the loop re-copies all previous rows on every iteration.
    per_file_frames = [
        realized_volatility_per_time_id(file, prediction_column_name)
        for file in list_file
    ]
    if not per_file_frames:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=['row_id', prediction_column_name])
    return pd.concat(per_file_frames)
# Realized volatility of every (stock, time_id) found in the training book
# files, stored in the column 'pred'.
df_past_realized_train = past_realized_volatility_per_stock(list_file=list_order_book_file_train,
                                                           prediction_column_name='pred')

The submission file has two columns: row_id and target. A row_id is built by concatenating the stock_id value and the time_id value with "-".

In [None]:
# Build the "<stock_id>-<time_id>" key used to join targets and predictions.
# Guarded so the cell can be re-run: after the first run `train` only keeps
# row_id and target, so stock_id / time_id are no longer available.
if 'row_id' not in train.columns:
    train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train = train[['row_id','target']]
# Left join: every training row is kept, with the naive prediction where one exists.
df_joined = train.merge(df_past_realized_train[['row_id','pred']], on = ['row_id'], how = 'left')

Calculating the R² and RMSPE scores

In [None]:
from sklearn.metrics import r2_score
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error.

    Parameters
    ----------
    y_true : array-like supporting element-wise arithmetic
        Ground-truth values (used as the denominator of the relative error).
    y_pred : array-like supporting element-wise arithmetic
        Predicted values.

    Returns
    -------
    float
        sqrt(mean(((y_true - y_pred) / y_true) ** 2)).
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_relative_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_relative_error)
# Score the naive prediction (column 'pred') against the target, rounding
# both metrics to three decimals for display.
R2 = round(r2_score(y_true = df_joined['target'], y_pred = df_joined['pred']),3)
RMSPE = round(rmspe(y_true = df_joined['target'], y_pred = df_joined['pred']),3)
print(f'Performance of the naive prediction: R2 score: {R2}, RMSPE: {RMSPE}')

Creating Submission File.

In [None]:
# Naive test prediction: the realized volatility computed from each test book
# file, written under the column name 'target'.
list_order_book_file_test = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*')
# Stored only in df_naive_pred_test so that the training-set predictions held
# in df_past_realized_train are not overwritten by the test predictions.
df_naive_pred_test = past_realized_volatility_per_stock(list_file=list_order_book_file_test,
                                                        prediction_column_name='target')
df_naive_pred_test.to_csv('submission.csv',index = False)

Reference Notebook:
https://www.kaggle.com/jiashenliu/introduction-to-financial-concepts-and-data

# WORK IN PROGRESS......

# IF YOU FIND THIS HELPFUL PLEASE UPVOTE .