# A basic starter for the Optiver Realized Volatility Prediction Competition  

## Introduction 
This notebook is for the exploratory analysis of the **Optiver Competition** , the aim of this analysis is to better understand the data we are wrking with in order to spot patterns and trends.


### Notebook setup 

In [None]:
#importing libraries 
import os
import pandas as pd # pandas 
import numpy as np #numpy 
import plotly.express as px 
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split, KFold
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import r2_score
import glob
from tqdm import tqdm
sns.set_theme(style="dark")


**Loading the data**

In [None]:
# loading the data 
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
sample= pd.read_csv('../input/optiver-realized-volatility-prediction/sample_submission.csv')


**Checking the Train data**

In [None]:
train.head() # printing the head of the train data 

In [None]:
train.shape # shape of the data 

In [None]:
train.info() # info about the data 

In [None]:
# basic statistic 
train.describe()

In [None]:
train['stock_id'].value_counts() # values of stock id on the train set 

In [None]:
train['time_id'].value_counts() #values of th time id on the train data 

In [None]:



corrMatrix= train.corr() # correlation between columns on the train data 

In [None]:
sns.heatmap(corrMatrix, cmap="YlGnBu",annot=True)
plt.show()

In [None]:
#checking the target distrbution 
sns.distplot(train['target'], color = 'b', label = 'target distribution')

**Checking the book and trade parquet**

In [None]:
# we load the data from book and train where stock id=0  and time id = 5 
book_example = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0')
trade_example =  pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0')
stock_id = '0'
book_example = book_example[book_example['time_id']==5]
book_example.loc[:,'stock_id'] = stock_id
trade_example = trade_example[trade_example['time_id']==5]
trade_example.loc[:,'stock_id'] = stock_id

In [None]:
book_example.head()

In [None]:
book_example.shape #302, 11

In [None]:
trade_example.head()

In [None]:
trade_example.shape # 40,6

In [None]:
#ploting the histogram for features in the book example 
book_example.hist(figsize=(16,12))
plt.show()

In [None]:
#ploting the histogram for features in the trade data 
trade_example.hist(figsize=(16,12))


#### Realized volatility calculation

In [None]:
a = (book_example['bid_price1'] * book_example['ask_size1'] +
                                book_example['ask_price1'] * book_example['bid_size1']) / (
                                       book_example['bid_size1']+ book_example['ask_size1'])

b = (book_example['bid_price2'] * book_example['ask_size2'] +
                                book_example['ask_price2'] * book_example['bid_size2']) / (
                                       book_example['bid_size2']+ book_example['ask_size2'])

book_example['wap'] = (a + b) / 2

In [None]:
sns.set(rc={"figure.figsize":(8, 6)})
sns.lineplot(data=book_example, x="seconds_in_bucket" , y="wap")

**Compute the log return**

In [None]:
def log_return(list_stock_prices):
    return np.log(list_stock_prices).diff() 
book_example.loc[:,'log_return'] = log_return(book_example['wap'])
book_example = book_example[~book_example['log_return'].isnull()]

**Realized volatility for stock_id 0 on time_id 5**

In [None]:
def realized_volatility(series_log_return):
    return np.sqrt(np.sum(series_log_return**2))
realized_vol = realized_volatility(book_example['log_return'])
print(f'Realized volatility for stock_id 0 on time_id 5 is {realized_vol}')

**Calculating the past realized volatility across the training set**

In [None]:

list_order_book_file_train = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')
list_order_trade_file_train = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/*')
def realized_volatility_per_time_id(file_path, prediction_column_name):
    df_book_data = pd.read_parquet(file_path)
    df_book_data['wap'] =(df_book_data['bid_price1'] * df_book_data['ask_size1']+df_book_data['ask_price1'] * df_book_data['bid_size1'])  / (
                                      df_book_data['bid_size1']+ df_book_data[
                                  'ask_size1'])
    df_book_data['log_return'] = df_book_data.groupby(['time_id'])['wap'].apply(log_return)
    df_book_data = df_book_data[~df_book_data['log_return'].isnull()]
    df_realized_vol_per_stock =  pd.DataFrame(df_book_data.groupby(['time_id'])['log_return'].agg(realized_volatility)).reset_index()
    df_realized_vol_per_stock = df_realized_vol_per_stock.rename(columns = {'log_return':prediction_column_name})
    stock_id = file_path.split('=')[1]
    df_realized_vol_per_stock['row_id'] = df_realized_vol_per_stock['time_id'].apply(lambda x:f'{stock_id}-{x}')
    return df_realized_vol_per_stock[['row_id',prediction_column_name]]

In [None]:
len(list_order_book_file_train)
len(list_order_trade_file_train)

**getting the past realized volatility as prediction for each individual stocks.**

In [None]:
def past_realized_volatility_per_stock(list_file,prediction_column_name):
    df_past_realized = pd.DataFrame()
    for file in list_file:
        df_past_realized = pd.concat([df_past_realized,
                                     realized_volatility_per_time_id(file,prediction_column_name)])
    return df_past_realized
df_past_realized_train = past_realized_volatility_per_stock(list_file=list_order_book_file_train,
                                                           prediction_column_name='pred')

**joining the output dataframe with train.csv to see the performance of the naive prediction**

In [None]:
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train = train[['row_id','target']]
df_joined = train.merge(df_past_realized_train[['row_id','pred']], on = ['row_id'], how = 'left')

**Evaluate the naive prediction result by two metrics: RMSPE and R squared.**

In [None]:
def rmspe(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))
R2 = round(r2_score(y_true = df_joined['target'], y_pred = df_joined['pred']),3)
RMSPE = round(rmspe(y_true = df_joined['target'], y_pred = df_joined['pred']),3)
print(f'Performance of the naive prediction: R2 score: {R2}, RMSPE: {RMSPE}')

**Submission**


In [None]:
list_order_book_file_test = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*')
df_naive_pred_test = df_past_realized_train = past_realized_volatility_per_stock(list_file=list_order_book_file_test,
                                                           prediction_column_name='target')
df_naive_pred_test.to_csv('submission.csv',index = False)

#### Note 
This notebook is based on the tutorial provided by the team of the Optiver and which can be find here [Introduction to financial concepts and data.](https://www.kaggle.com/jiashenliu/introduction-to-financial-concepts-and-data/notebook#Market-making-and-market-efficiency)

### We will be back 
> If you Appreciate thsi notebook please upvote.
