# Optiver Realized Volatility Prediction : EDA + Linear Regression

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
from sklearn.metrics import r2_score
import glob

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

import warnings
warnings.filterwarnings('ignore')

## Dataset

### book_[train/test].parquet: 
A parquet file partitioned by stock_id. Provides order book data on the most competitive buy and sell orders entered into the market. The top two levels of the book are shared. The first level of the book will be more competitive in price terms, it will then receive execution priority over the second level.

* <b>stock_id - </b>ID code for the stock. Not all stock IDs exist in every time bucket. Parquet coerces this column to the categorical data type when loaded; you may wish to convert it to int8.
* <b>time_id - </b>ID code for the time bucket. Time IDs are not necessarily sequential but are consistent across all stocks.
* <b>seconds_in_bucket - </b>Number of seconds from the start of the bucket, always starting from 0.
* <b>bid_price[1/2] - </b>Normalized prices of the most/second most competitive buy level.
* <b>ask_price[1/2] - </b>Normalized prices of the most/second most competitive sell level.
* <b>bid_size[1/2] - </b>The number of shares on the most/second most competitive buy level.
* <b>ask_size[1/2] - </b>The number of shares on the most/second most competitive sell level.

### trade_[train/test].parquet:
A parquet file partitioned by stock_id. Contains data on trades that actually executed. Usually, in the market, there are more passive buy/sell intention updates (book updates) than actual trades, therefore one may expect this file to be more sparse than the order book.

* <b>stock_id - </b>Same as above.
* <b>time_id - </b>Same as above.
* <b>seconds_in_bucket - </b>Same as above. Note that since trade and book data are taken from the same time window and trade data is more sparse in general, this field is not necessarily starting from 0.
* <b>price - </b>The average price of executed transactions happening in one second. Prices have been normalized and the average has been weighted by the number of shares traded in each transaction.
* <b>size - </b>The sum number of shares traded.
* <b>order_count - </b>The number of unique trade orders taking place.

### train.csv:
The ground truth values for the training set.
* <b>stock_id - </b>Same as above, but since this is a csv the column will load as an integer instead of categorical.
* <b>time_id - </b>Same as above.
* <b>target - </b>The realized volatility computed over the 10 minute window following the feature data under the same stock/time_id. There is no overlap between feature and target data. You can find more info in our tutorial notebook.

### test.csv
Provides the mapping between the other data files and the submission file. As with other test files, most of the data is only available to your notebook upon submission with just the first few rows available for download.

* <b>stock_id - </b>Same as above.
* <b>time_id - </b>Same as above.
* <b>row_id - </b>Unique identifier for the submission row. There is one row for each existing time ID/stock ID pair. Each time window is not necessarily containing every individual stock.

### sample_submission.csv
A sample submission file in the correct format.

* <b>row_id - </b>Same as in test.csv.
* <b>target - </b>Same definition as in train.csv. The benchmark is using the median target value from train.csv.


In [None]:
# Load the ground-truth targets (train) and the submission mapping (test),
# then preview the first training rows.
train = pd.read_csv(
    filepath_or_buffer="../input/optiver-realized-volatility-prediction/train.csv"
)
test = pd.read_csv(
    filepath_or_buffer="../input/optiver-realized-volatility-prediction/test.csv"
)
train.head(n=5)

In [None]:
# Preview the first rows of the test mapping.
test.head(n=5)

## Target Distribution:
The target is concentrated on the left (near zero) with a long tail to the right, i.e. it is right-skewed.

In [None]:
# Summarise the target: print its mean, then plot its distribution.
mean = train['target'].mean()
print(f"Mean : {mean}")

plt.figure(figsize=(8, 5))
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# draws the same histogram-plus-curve figure and matches the histplot calls
# used elsewhere in this notebook.
sns.histplot(train['target'], bins=50, kde=True, stat='density')
plt.title('Target Distribution')
plt.show()

### Let's see how many values are greater than or equal to 0.02

In [None]:
# Count the targets at or above the 0.02 threshold once, then report the count
# and its share of all training rows.
high_target_count = train['target'][train['target'] >= 0.02].count()
print(f"Target count greater than 0.02 : {high_target_count}")
print(f"Percentage of total: {(high_target_count / train.shape[0]) * 100} %")

In [None]:
# Report the size of the training table and the number of distinct values in
# every column. train.shape[0] is the row count of the table, so it is labelled
# as rows rather than shares.
print(f"Number of rows: {train.shape[0]}")
for col in train.columns:
    # dropna=False counts a missing value as its own entry, like len(unique()).
    print(f" {col}: {train[col].nunique(dropna=False)}")

<b> So there are 112 distinct stock ids, 3830 distinct time ids and 414287 distinct target values. </b>

In [None]:
# Aggregate the target per stock, then show how the per-stock mean and the
# per-stock sum are distributed across stocks.
stock = train.groupby('stock_id')['target'].agg(['mean', 'sum']).reset_index()
print(f"Mean: {stock['mean'].mean()}")
# This value is the average of the per-stock sums, so it is labelled as such
# rather than as a maximum.
print(f"Mean of per-stock sum: {stock['sum'].mean()}")
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
sns.histplot(x=stock['mean'], ax=ax1)
sns.histplot(x=stock['sum'], ax=ax2)
ax1.set_title('Target mean distribution', size=12)
ax2.set_title('Target sum distribution', size=12)
# No legend is drawn: neither histogram carries a label, so there is nothing
# for a legend to list.
plt.show()

<b> So the average per-stock mean target is about 0.003, which is close to 0, and the average per-stock target sum is about 14.8.</b>

In [None]:
# Read the stock_id=0 partition of each parquet dataset: order book and trades,
# for both the train and the test split.
book_train = pd.read_parquet(
    path="../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0"
)
book_test = pd.read_parquet(
    path="../input/optiver-realized-volatility-prediction/book_test.parquet/stock_id=0"
)

trade_train = pd.read_parquet(
    path="../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0"
)
trade_test = pd.read_parquet(
    path="../input/optiver-realized-volatility-prediction/trade_test.parquet/stock_id=0"
)

book_train.head(n=5)

In [None]:
# Order-book rows of time bucket 5 for the loaded stock.
# Take an explicit copy: later cells add 'wap' and 'log_return' columns to
# df_book, and writing into a filtered slice of book_train is a chained
# assignment that pandas warns about.
df_book = book_train[book_train['time_id'] == 5].copy()
df_book.head()

In [None]:
# Plot the four quoted price levels of the selected bucket against the
# seconds elapsed in the bucket.
fig, ax = plt.subplots(figsize=(15, 5))
for price_col in ('bid_price1', 'bid_price2', 'ask_price1', 'ask_price2'):
    sns.lineplot(data=df_book, x='seconds_in_bucket', y=price_col, label=price_col, ax=ax)
ax.legend()
plt.show()

In [None]:
# Trades of time bucket 5 for the loaded stock.
in_bucket_5 = trade_train['time_id'] == 5
df_trade = trade_train.loc[in_bucket_5]
df_trade.head(n=5)

In [None]:
# Overlay the executed trade price (thick black line) on the four quoted
# price levels of the same bucket.
fig, ax = plt.subplots(figsize=(15, 5))
for price_col in ('bid_price1', 'bid_price2', 'ask_price1', 'ask_price2'):
    sns.lineplot(data=df_book, x='seconds_in_bucket', y=price_col, label=price_col, ax=ax)

sns.lineplot(
    data=df_trade,
    x='seconds_in_bucket',
    y='price',
    linewidth=3,
    color='black',
    label='price',
    ax=ax,
)
ax.legend()
plt.show()

In [None]:
# Weighted average price of the top book level: each side's price is weighted
# by the size resting on the opposite side.
bid_weighted = df_book['bid_price1'] * df_book['ask_size1']
ask_weighted = df_book['ask_price1'] * df_book['bid_size1']
top_level_size = df_book['bid_size1'] + df_book['ask_size1']
df_book['wap'] = (bid_weighted + ask_weighted) / top_level_size

def log_return(list_stock_prices):
    """Return the first difference of the natural log of the prices.

    The input must support ``.diff()`` after ``np.log`` (e.g. a pandas
    Series); the first element of the result is NaN because it has no
    predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

# Log returns of the WAP; rows whose return is null (the first row has no
# predecessor) are dropped.
df_book.loc[:, 'log_return'] = log_return(df_book['wap'])
has_return = df_book['log_return'].notnull()
df_book = df_book[has_return]

In [None]:
def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    sum_of_squares = np.sum(np.square(series_log_return))
    return np.sqrt(sum_of_squares)
# Realized volatility of the selected bucket, computed from its log returns.
realized_vol = realized_volatility(series_log_return=df_book['log_return'])
print(f'Realized volatility for stock_id 0 on time_id 5 is {realized_vol}')

In [None]:
# Every entry directly under the training order-book parquet directory
# (the dataset is partitioned by stock_id).
list_order_book_file_train = glob.glob(
    pathname='/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*'
)

In [None]:
# Key every training row as "<stock_id>-<time_id>", the same format the
# per-stock feature frames use for their row_id column.
train['row_id'] = train['stock_id'].astype(str).str.cat(train['time_id'].astype(str), sep='-')

# Fitted regressors, keyed by the stock id string parsed from the file path.
model_dict = dict()

def realized_volatility_per_time_id_linear(file_path, prediction_column_name, train_test = True):
    """Predict the target of one stock from its past realized volatility.

    Reads one stock's order-book partition, computes the realized volatility
    of the WAP log returns for every ``time_id``, expands it into degree-3
    polynomial features and maps it to a prediction with a per-stock
    ``LinearRegression``.

    Args:
        file_path: Path of the stock's book partition; the text after the
            last ``=`` is used as the stock id.
        prediction_column_name: Name of the output column that holds the
            prediction.
        train_test: When true, fit a new model on the rows that have a
            target in the module-level ``train`` frame and store it in
            ``model_dict``; when false, reuse the model already stored in
            ``model_dict`` for this stock.

    Returns:
        A DataFrame with the columns ``row_id`` and
        ``prediction_column_name``, one row per ``time_id`` of the file.

    Raises:
        KeyError: If ``train_test`` is false and no model has been fitted
            for this stock.
    """
    df_book_data = pd.read_parquet(file_path)

    bid_weighted = df_book_data['bid_price1'] * df_book_data['ask_size1']
    ask_weighted = df_book_data['ask_price1'] * df_book_data['bid_size1']
    df_book_data['wap'] = (bid_weighted + ask_weighted) / (
        df_book_data['bid_size1'] + df_book_data['ask_size1'])

    # transform keeps the original row index, so the result aligns with the
    # frame on every pandas version; groupby.apply can prepend the group key
    # to the index, which makes the column assignment fail.
    df_book_data['log_return'] = df_book_data.groupby('time_id')['wap'].transform(log_return)
    df_book_data = df_book_data[df_book_data['log_return'].notnull()]

    df_realized_vol_per_stock = (
        df_book_data.groupby('time_id')['log_return']
        .agg(realized_volatility)
        .reset_index()
        .rename(columns={'log_return': prediction_column_name})
    )

    # Split on the last '=' only, so an '=' elsewhere in the path is harmless.
    stock_id = file_path.rstrip('/\\').rsplit('=', 1)[1]
    df_realized_vol_per_stock['row_id'] = (
        f'{stock_id}-' + df_realized_vol_per_stock['time_id'].astype(str))

    # The polynomial expansion is identical for fitting and for prediction.
    raw_volatility = df_realized_vol_per_stock[[prediction_column_name]].to_numpy()
    features = PolynomialFeatures(degree=3).fit_transform(raw_volatility)

    if train_test:
        # Left join from the feature frame keeps one row per time_id in the
        # same order as `features`; time_ids without a target come back NaN.
        joined = df_realized_vol_per_stock[['row_id']].merge(
            train[['row_id', 'target']], on='row_id', how='left')
        target = joined['target'].to_numpy(dtype=float)
        has_target = ~np.isnan(target)

        # Weighting each squared error by 1/target^2 makes the least-squares
        # fit minimise the squared relative error.
        weights = 1 / np.square(target[has_target])

        reg = LinearRegression().fit(
            features[has_target], target[has_target], sample_weight=weights)
        model_dict[stock_id] = reg
    else:
        try:
            reg = model_dict[stock_id]
        except KeyError:
            raise KeyError(
                f'no fitted model for stock_id {stock_id!r}; '
                'run the training pass for this stock first') from None

    df_realized_vol_per_stock[prediction_column_name] = reg.predict(features)

    return df_realized_vol_per_stock[['row_id', prediction_column_name]]

In [None]:
def past_realized_volatility_per_stock_linear(list_file,prediction_column_name, train_test = True):
    """Run the per-stock prediction on every file and stack the results.

    Args:
        list_file: Iterable of book partition paths, one per stock.
        prediction_column_name: Name of the prediction column in the output.
        train_test: Forwarded unchanged to
            ``realized_volatility_per_time_id_linear``.

    Returns:
        The per-file frames concatenated in input order (original indices
        are kept), or an empty DataFrame when ``list_file`` is empty.
    """
    # Collect first and concatenate once: calling pd.concat inside the loop
    # re-copies all accumulated rows on every iteration.
    per_stock_frames = [
        realized_volatility_per_time_id_linear(file, prediction_column_name, train_test)
        for file in list_file
    ]
    if not per_stock_frames:
        return pd.DataFrame()
    return pd.concat(per_stock_frames)

# Fit one model per training stock and collect the in-sample predictions.
df_past_realized_train = past_realized_volatility_per_stock_linear(
    list_order_book_file_train,
    'pred',
)

In [None]:
# Keep only the key and the target, then attach the in-sample prediction of
# every training row.
train = train.loc[:, ['row_id', 'target']]
df_joined = pd.merge(
    train,
    df_past_realized_train[['row_id', 'pred']],
    how='left',
    on='row_id',
)

In [None]:
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of the predictions."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared_relative_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_relative_error)
# Score the in-sample predictions against the training targets, rounded to
# three decimals.
actual, predicted = df_joined['target'], df_joined['pred']
R2 = round(r2_score(y_true=actual, y_pred=predicted), 3)
RMSPE = round(rmspe(y_true=actual, y_pred=predicted), 3)
print(f'Performance of the naive prediction: R2 score: {R2}, RMSPE: {RMSPE}')

## Submission

In [None]:
# Predict the test stocks with the models fitted during training and write
# the submission file (columns: row_id, target).
list_order_book_file_test = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*')
# Bind the result to the test variable only: also assigning it to
# df_past_realized_train would overwrite the training predictions with the
# test predictions.
df_naive_pred_test = past_realized_volatility_per_stock_linear(list_file=list_order_book_file_test,
                                                           prediction_column_name='target', train_test = False)
df_naive_pred_test.to_csv('submission.csv',index = False)