In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load train.csv (the cells below use its stock_id, time_id and target columns)
# and show the first rows.
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas and seaborn deprecation / copy warnings.
warnings.filterwarnings(action='ignore')
import seaborn as sns
import plotly.express as px
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
train.head()

In [None]:
# Number of training rows for each stock_id.
train['stock_id'].value_counts()

In [None]:
# Bar chart of the number of training rows per stock_id.
sns.countplot(x="stock_id", data=train)

In [None]:
# Number of training rows for each time_id.
train['time_id'].value_counts()

In [None]:
# Interactive scatter of target (y) against time_id (x) for all training rows.
fig = px.scatter(train, y="target", x="time_id")
fig.show()

In [None]:
# (rows, columns) of the training table.
train.shape

In [None]:
import seaborn as sns
# Density-normalised histogram of the target with a KDE overlay.
# `sns.distplot` is deprecated; this is its documented `histplot` equivalent.
sns.histplot(train['target'], color='r', label='target distribution',
             kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Histogram of the target values, drawn in red.
sns.histplot(train['target'], color = 'r', label = 'target distribution')

In [None]:
# Density-normalised histogram of time_id with a KDE overlay.
# `sns.distplot` is deprecated; this is its documented `histplot` equivalent.
sns.histplot(train['time_id'], color='y', label='time distribution',
             kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Read the order book and trades of stock_id 0 and keep only the time_id 5 bucket.
book_example = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0')
trade_example = pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0')
stock_id = '0'
# .copy() makes each filtered frame independent of the full table, so the column
# assignments here (and in later cells) write to a real frame instead of a slice.
book_example = book_example[book_example['time_id'] == 5].copy()
book_example['stock_id'] = stock_id
trade_example = trade_example[trade_example['time_id'] == 5].copy()
trade_example['stock_id'] = stock_id


In [None]:
# First rows of the order-book example (already filtered to time_id 5).
book_example.head()

In [None]:
# First rows of the trade example (already filtered to time_id 5).
trade_example.head()

In [None]:
import matplotlib.pyplot as plt

import matplotlib as mpl
%matplotlib inline

# One histogram per column of the order-book example, on an 18x10 inch figure.
book_example.hist(figsize=(18,10))
plt.show()

In [None]:
# One histogram per column of the trade example, drawn in red.
trade_example.hist(figsize=(18,10), color = 'r')
plt.show()

In [None]:
def log_return(list_stock_prices):
    """Return the first difference of the natural log of the given prices.

    The input must still offer ``.diff()`` after ``np.log`` is applied to it
    (e.g. a pandas Series). The first element of the result is NaN because it
    has no predecessor to be differenced against.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

In [None]:
# Our target is short-term realized volatility. Volatility is a statistical
# measure of a stock's price changes, so we first need a stock valuation at a
# fixed interval (1 second). Here that valuation is the weighted average price
# (WAP): each side's best price weighted by the size on the opposite side.
book_example['wap'] = (
    book_example['bid_price1'].mul(book_example['ask_size1'])
    .add(book_example['ask_price1'].mul(book_example['bid_size1']))
    .div(book_example['bid_size1'].add(book_example['ask_size1']))
)

fig = px.line(book_example, title='WAP of stock_id_0, time_id_5', x="seconds_in_bucket", y="wap")
fig.show()

In [None]:
# Histogram of seconds_in_bucket for the order-book example, using 30 bins.
plt.hist(book_example["seconds_in_bucket"], bins = 30)

In [None]:
# Density-normalised histogram of the WAP with a KDE overlay.
# `sns.distplot` is deprecated; this is its documented `histplot` equivalent.
sns.histplot(book_example['wap'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Add the log return of the WAP and discard rows where it is undefined
# (the first row has no previous price to difference against).
book_example['log_return'] = log_return(book_example['wap'])
book_example = book_example.dropna(subset=['log_return'])

In [None]:
# First rows of the order-book example, now including the wap and log_return columns.
book_example.head()

In [None]:

# Density-normalised histogram of the log returns with a KDE overlay.
# `sns.distplot` is deprecated; this is its documented `histplot` equivalent.
sns.histplot(book_example['log_return'], color='r', label='log_return distribution',
             kde=True, stat='density', kde_kws=dict(cut=3))


In [None]:
# Density-normalised histogram of bid_price1 with a KDE overlay.
# `sns.distplot` is deprecated; this is its documented `histplot` equivalent.
sns.histplot(book_example['bid_price1'], color='b', kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Density-normalised histogram of bid_price2 with a KDE overlay.
# `sns.distplot` is deprecated; this is its documented `histplot` equivalent.
sns.histplot(book_example['bid_price2'], color='y', kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Density-normalised histogram of ask_price1 with a KDE overlay.
# `sns.distplot` is deprecated; this is its documented `histplot` equivalent.
sns.histplot(book_example['ask_price1'], color='g', kde=True, stat='density', kde_kws=dict(cut=3));

In [None]:
# Density-normalised histogram of ask_price2 with a KDE overlay.
# `sns.distplot` is deprecated; this is its documented `histplot` equivalent.
sns.histplot(book_example['ask_price2'], color='m', kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Interactive line plot of the log return against seconds_in_bucket.
fig = px.line(book_example, x="seconds_in_bucket", y="log_return", title='Log return of stock_id_0, time_id_5')
fig.show()

In [None]:
#realized vol of stock 0 in this feature bucket
def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    sum_of_squares = np.sum(squared_returns)
    return np.sqrt(sum_of_squares)
# Realized volatility of the example bucket, computed from its log_return column.
realized_vol = realized_volatility(book_example['log_return'])
print(f'Realized volatility for stock_id 0 on time_id 5 is {realized_vol}')

In [None]:
import os
import glob
# Every entry directly under book_train.parquet. Sorted because glob returns
# matches in arbitrary order, which would make downstream row order vary by run.
list_order_book_file_train = sorted(glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*'))

In [None]:
def realized_volatility_per_time_id(file_path, prediction_column_name):
    """Compute the realized volatility of every ``time_id`` in one order-book file.

    Parameters
    ----------
    file_path : str
        Path of the parquet data to read. The text after the last ``=`` in the
        path is used as the stock id (e.g. ``.../stock_id=0`` gives ``"0"``).
    prediction_column_name : str
        Name given to the realized-volatility column of the result.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``row_id`` (``"<stock_id>-<time_id>"``) and
        ``prediction_column_name``, with one row per ``time_id`` that has at
        least one defined log return.

    Raises
    ------
    IndexError
        If ``file_path`` contains no ``=``.
    """
    df_book_data = pd.read_parquet(file_path)
    # Weighted average price: each side's best price weighted by the opposite size.
    df_book_data['wap'] = (
        df_book_data['bid_price1'] * df_book_data['ask_size1']
        + df_book_data['ask_price1'] * df_book_data['bid_size1']
    ) / (df_book_data['bid_size1'] + df_book_data['ask_size1'])
    # transform() returns a result aligned to the frame's own index. groupby.apply()
    # prepends the group key on pandas >= 2, which makes the column assignment fail.
    df_book_data['log_return'] = df_book_data.groupby('time_id')['wap'].transform(log_return)
    # The first row of every time_id has no previous price, so its return is NaN.
    df_book_data = df_book_data[df_book_data['log_return'].notnull()]
    df_realized_vol_per_stock = (
        df_book_data.groupby('time_id')['log_return']
        .agg(realized_volatility)
        .reset_index(name=prediction_column_name)
    )
    # Take the text after the LAST '=' so an '=' elsewhere in the path or a
    # trailing path separator cannot corrupt the stock id.
    stock_id = file_path.rstrip('/\\').rsplit('=', 1)[1]
    df_realized_vol_per_stock['row_id'] = (
        stock_id + '-' + df_realized_vol_per_stock['time_id'].astype(str)
    )
    return df_realized_vol_per_stock[['row_id', prediction_column_name]]

In [None]:
def past_realized_volatility_per_stock(list_file,prediction_column_name):
    """Stack the per-``time_id`` realized volatility of several order-book files.

    Parameters
    ----------
    list_file : iterable of str
        Paths passed one by one to ``realized_volatility_per_time_id``.
    prediction_column_name : str
        Name of the realized-volatility column in the result.

    Returns
    -------
    pandas.DataFrame
        The per-file results concatenated in input order, with columns
        ``row_id`` and ``prediction_column_name``. An empty ``list_file`` gives
        an empty frame that still carries those two columns.
    """
    # Collect first and concatenate once; concatenating inside the loop re-copies
    # every previously accumulated row on each iteration.
    per_file_frames = [
        realized_volatility_per_time_id(file, prediction_column_name)
        for file in list_file
    ]
    if not per_file_frames:
        # pd.concat raises on an empty list, so build the empty result directly.
        return pd.DataFrame(columns=['row_id', prediction_column_name])
    return pd.concat(per_file_frames)

In [None]:
def rmspe(y_true, y_pred):
    """Root mean square percentage error of ``y_pred`` relative to ``y_true``."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

In [None]:
# First training rows whose stock_id is 0.
train.loc[train.stock_id == 0].head()

In [None]:

# Realized volatility per time_id for every training order-book file, stored in a
# column named 'pred'. Each listed file is read from disk by this call.
df_past_realized_train = past_realized_volatility_per_stock(list_file=list_order_book_file_train,
                                                           prediction_column_name='pred')

In [None]:
# Display the naive per-row predictions computed from the training order books.
df_past_realized_train

In [None]:
# Density-normalised histogram of the naive predictions with a KDE overlay.
# `sns.distplot` is deprecated; this is its documented `histplot` equivalent.
sns.histplot(df_past_realized_train['pred'], color='k', kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Histogram of the naive predictions, drawn in magenta.
plt.hist(df_past_realized_train['pred'], color = 'm')

In [None]:
# Build the "<stock_id>-<time_id>" key used by the prediction frame, then attach
# each row's naive prediction to its target. A left join keeps every training row.
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
# Select the two columns for the join without overwriting `train`: replacing
# `train` with the reduced frame made this cell fail with a KeyError on
# 'stock_id' when run a second time.
df_joined = train[['row_id', 'target']].merge(df_past_realized_train[['row_id', 'pred']],
                                              on=['row_id'], how='left')

In [None]:
# Display targets side by side with the naive predictions.
df_joined

In [None]:
# Scatter of pred (y) against target (x) with a fitted regression line.
sns.regplot(x='target', y ='pred', data = df_joined)

In [None]:
from sklearn.metrics import r2_score
# R2 and RMSPE of the naive prediction against the target, rounded to 3 decimals.
# NOTE(review): df_joined comes from a left merge, so a training row without a
# matching prediction would carry NaN in 'pred' here — confirm none exist.
R2 = round(r2_score(y_true = df_joined['target'], y_pred = df_joined['pred']),3)
RMSPE = round(rmspe(y_true = df_joined['target'], y_pred = df_joined['pred']),3)

In [None]:
# Report both scores of the naive prediction.
print(f'Performance of the naive prediction: R2 score: {R2}, RMSPE: {RMSPE}')

In [None]:
# Naive prediction for the test order books, written as the submission file.
# Sorted because glob returns matches in arbitrary order, which would make the
# row order of submission.csv vary between runs.
list_order_book_file_test = sorted(glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*'))
# Assign only to df_naive_pred_test. The previous chained assignment also
# overwrote df_past_realized_train (the training predictions, column 'pred')
# with this test frame (column 'target'), breaking any re-run of earlier cells.
df_naive_pred_test = past_realized_volatility_per_stock(list_file=list_order_book_file_test,
                                                        prediction_column_name='target')
df_naive_pred_test.to_csv('submission.csv',index = False)

## Upvote if you like it.