# Optiver Realized Volatility Prediction

In [None]:
# Importing basic necessary libraries before starting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
#from pandas_profiling import ProfileReport
import glob

# Sweetviz is a statistical-summary and visualization library, similar to pandas profiling
!pip install sweetviz
import sweetviz as sv

In [None]:
# Read the training table from the competition input folder and preview its first rows
train = pd.read_csv(
    '../input/optiver-realized-volatility-prediction/train.csv'
)
train.head()

In [None]:
# Build a Sweetviz report for the train table and render it inline in the notebook
profile = sv.analyze(train)
profile.show_notebook()

In [None]:
# Report how many distinct values (missing values counted as one) each train column holds
for column_name, column_values in train.items():
    distinct_count = column_values.nunique(dropna=False)
    print("There are", distinct_count, "unique values in", column_name, "column")

In [None]:
# Summary statistics of the train table, followed by its (rows, columns) shape
print("Here is some statistical analysis of the train data:", train.describe(), sep="\n")
print("\n")
print("Here are the number of rows and columns in the dataset:", train.shape, sep="\n")

In [None]:
# Read the test table from the competition input folder and preview its first rows
test = pd.read_csv(
    '../input/optiver-realized-volatility-prediction/test.csv'
)
test.head()

In [None]:
# Load the sample submission file to see the layout of the expected predictions
sample_predict = pd.read_csv(
    "../input/optiver-realized-volatility-prediction/sample_submission.csv"
)
sample_predict.head()

#### Looking at the data for the orders entered into the market (not necessarily executed)

In [None]:
# Load the order-book training data for a single partition (stock_id=0 only, not every stock)
book_train = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0')
print("Here are top few records in book data:")
print("---------------------------------------")
print(book_train.head())
print("\n")
print("Here are last few rows in book data:")
print("---------------------------------------")
print(book_train.tail())
print("\n")
print("Let's also look at few random rows in the data:")
print("---------------------------------------")
# A fixed random_state keeps the sampled rows identical across Restart & Run All
print(book_train.sample(n=5, random_state=42))

In [None]:
# (rows, columns) of the stock_id=0 order-book frame loaded above
book_train.shape

In [None]:
# Build a Sweetviz report for the order-book (book_train) data and render it inline
book_report = sv.analyze(book_train)
book_report.show_notebook()

In [None]:
# Load the order-book test data from the competition input folder and preview it
book_test_main = pd.read_parquet(
    '../input/optiver-realized-volatility-prediction/book_test.parquet'
)
book_test_main.head()

In [None]:
# Build a Sweetviz report for the book_test data and render it inline in the notebook
book_test_report = sv.analyze(book_test_main)
book_test_report.show_notebook()

#### Looking at the data for which trade has been actually executed (either bought or sold)

In [None]:
# Load the trade training data for the stock_id=0 partition only (kept small because of data volume)
trade_train = pd.read_parquet(
    '../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0'
)
trade_train.head()

In [None]:
# (rows, columns) of the stock_id=0 trade frame loaded above
trade_train.shape

In [None]:
# Build a Sweetviz report for the trade_train data and render it inline in the notebook
trade_profile = sv.analyze(trade_train)
trade_profile.show_notebook()

In [None]:
# Load the trade test data from the competition input folder and preview it
trade_test_main = pd.read_parquet(
    '../input/optiver-realized-volatility-prediction/trade_test.parquet'
)
trade_test_main.head()

In [None]:
# Build a Sweetviz report for the trade_test data and render it inline in the notebook
trade_test_profile = sv.analyze(trade_test_main)
trade_test_profile.show_notebook()

#### Looking at the price fluctuations (individually and combined) while the orders were in the bucket

In [None]:
# Thanks Chumajin for this cell and the next one (I had the same approach).
# Despite its name, book_test is a slice of book_train: the rows whose time_id equals 5.
book_test = book_train.loc[book_train["time_id"].eq(5)]
book_test.head()

Thanks [Chumajin](http://www.kaggle.com/chumajin/optiver-realized-eda-for-starter-english-version) for below line of code

In [None]:
# Price columns of the order book that are plotted here and in later cells
samples = ["bid_price1","bid_price2","ask_price1","ask_price2"]

# One figure holding one stacked panel per price column.
# The figure is created once, outside the loop: creating it inside the loop opens a
# new figure per column, so every panel would end up alone in its own figure.
plt.figure(figsize=(20, 5 * len(samples)))
for num, a in enumerate(samples):
    plt.subplot(len(samples), 1, num + 1)
    plt.plot(book_test["seconds_in_bucket"], book_test[a])
    plt.title(a)
plt.tight_layout()
plt.show()

# Second figure: all price columns overlaid on shared axes for direct comparison
plt.figure(figsize=(20,5))
for a in samples:
    plt.plot(book_test["seconds_in_bucket"], book_test[a], label=a)
plt.legend(fontsize=12)
plt.show()

#### Now let's compare the quoted prices while in the bucket with the prices at which trades were actually executed

In [None]:
# Despite its name, trade_test is a slice of trade_train: the rows whose time_id equals 5
trade_test = trade_train.loc[trade_train["time_id"].eq(5)]
trade_test.head()

Thanks [Chumajin](http://www.kaggle.com/chumajin/optiver-realized-eda-for-starter-english-version) for below line of code

In [None]:
# Overlay the executed trade prices on the quoted book prices to see how
# transactions happened relative to the quotes
plt.figure(figsize=(20,5))

for price_column in samples:
    plt.plot(book_test["seconds_in_bucket"], book_test[price_column], label=price_column)

# Thicker line so the executed trades stand out from the quotes
plt.plot(trade_test["seconds_in_bucket"], trade_test["price"], label="trade_parquet", lw=4)
plt.legend(fontsize=12)

#### Purple line above is the actual transaction (trade) that was executed.

## Diving Deeper into Analysis and Prediction

In [None]:
# Importing necessary libraries
from sklearn.metrics import r2_score
import os
import glob
from tqdm import tqdm

#### Looking at the Weighted Average Price for Stock ID 0 and Time ID 5\
Thanks [Jiashen](https://www.kaggle.com/jiashenliu/introduction-to-financial-concepts-and-data)

#### Little more comprehensive visualization

In [None]:
# Compare the level-1 bid and ask prices to see the gap between them.
# No x values are passed, so each Series is drawn against its own index (row position),
# which is why the x-axis is labelled as a row index rather than a time_id.
plt.plot(book_train['bid_price1'], c = 'blue', label = 'Bid Price')
plt.plot(book_train['ask_price1'], c = 'red', label = 'Ask Price', alpha=0.7)
plt.title('Analysis of Best Bid Price and Best Ask Price')
plt.xlabel('Row Index')
plt.ylabel('Price')
plt.legend()
plt.show()

In [None]:
# Compare the level-2 bid and ask prices to see the gap between them.
# No x values are passed, so each Series is drawn against its own index (row position),
# which is why the x-axis is labelled as a row index rather than a time_id.
plt.plot(book_train['bid_price2'], c = 'blue', label = 'Bid Price')
plt.plot(book_train['ask_price2'], c = 'red', label = 'Ask Price', alpha=0.7)
plt.title('L2 Analysis of Best Bid Price and Best Ask Price')
plt.xlabel('Row Index')
plt.ylabel('Price')
plt.legend()
plt.show()

At a glance, the bid and ask prices do not look Gaussian; \
let's dive a little deeper.

In [None]:
# Distribution of level-1 and level-2 bid prices for stock 0.
# A histogram has the binned value (price) on x and the number of rows per bin on y.
plt.hist(book_train['bid_price1'], bins='auto', label='Best Bids')
plt.hist(book_train['bid_price2'], bins='auto', label='L2 Bids', alpha=0.7)
plt.title('Bids on Stock 0 Distribution')
plt.xlabel('Bid Price')
plt.ylabel('Frequency')
plt.legend()
plt.show()

In [None]:
# Same bid-price histogram with a logarithmic y-axis so sparsely populated bins stay visible.
# A histogram has the binned value (price) on x and the number of rows per bin on y.
plt.hist(book_train['bid_price1'], bins='auto', label='Best Bids')
plt.hist(book_train['bid_price2'], bins='auto', label='L2 Bids', alpha=0.7)
plt.title('Bids on Stock 0 Distribution')
plt.xlabel('Bid Price')
plt.ylabel('Frequency (log scale)')
plt.yscale('log')
plt.legend()
plt.show()

In [None]:
# Distribution of level-1 and level-2 ask prices for stock 0.
# A histogram has the binned value (price) on x and the number of rows per bin on y.
plt.hist(book_train['ask_price1'], bins='auto', label='Best Ask')
plt.hist(book_train['ask_price2'], bins='auto', label='L2 Ask', alpha=0.7)
plt.title('Asks on Stock 0 Distribution')
plt.xlabel('Ask Price')
plt.ylabel('Frequency')
plt.legend()
plt.show()

In [None]:
# Same ask-price histogram with a logarithmic y-axis so sparsely populated bins stay visible.
# A histogram has the binned value (price) on x and the number of rows per bin on y.
plt.hist(book_train['ask_price1'], bins='auto', label='Best Ask')
plt.hist(book_train['ask_price2'], bins='auto', label='L2 Ask', alpha=0.7)
plt.title('Asks on Stock 0 Distribution')
plt.xlabel('Ask Price')
plt.ylabel('Frequency (log scale)')
plt.yscale('log')
plt.legend()
plt.show()

### Compute Statistics for Orderbook

In [None]:
# Log return helper
def log_return(list_stock_prices):
    """Return the element-to-element change of the natural log of the prices.

    Parameters
    ----------
    list_stock_prices :
        Prices for which ``np.log`` yields an object with a ``.diff()`` method
        (for example a pandas Series).

    Returns
    -------
    The result of ``.diff()`` on the logged prices; for a pandas Series the
    first element is NaN because it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

In [None]:
# Weighted average price (WAP) from the level-1 quotes of the loaded stock_id=0 data:
# each side's price is weighted by the size quoted on the opposite side.
book_train['wap'] = (
    book_train['bid_price1'] * book_train['ask_size1']
    + book_train['ask_price1'] * book_train['bid_size1']
) / (book_train['bid_size1'] + book_train['ask_size1'])

# Take the log return within each time_id. A plain diff over the whole frame would
# difference the first row of a time_id against the last row of the previous one.
book_train['log_return'] = book_train.groupby('time_id')['wap'].transform(log_return)

# The first row of every time_id has no predecessor, so its return is NaN; drop those rows.
# .copy() makes the result an independent frame, so re-running this cell does not
# assign into a slice of the previous frame.
book_train = book_train[book_train['log_return'].notna()].copy()

book_train.head()

In [None]:
# Realized volatility helper
def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns.

    Parameters
    ----------
    series_log_return :
        Log returns supporting ``** 2`` (for example a pandas Series or a
        NumPy array).

    Returns
    -------
    ``sqrt(sum(r ** 2))`` over the supplied returns.
    """
    squared_returns = series_log_return ** 2
    total_squared = np.sum(squared_returns)
    return np.sqrt(total_squared)

# Restrict the log returns to time_id == 5 so that the value matches the printed message;
# without the filter the figure would cover every time_id of the loaded stock.
realized_vol = realized_volatility(
    book_train.loc[book_train['time_id'] == 5, 'log_return']
)
print(f'Realized volatility for the taken sample data with time_id 05 & stock_id 0 is {realized_vol}')

### Getting to understand the volatility further
Thanks [Chumajin](https://www.kaggle.com/chumajin/optiver-realized-eda-for-starter-english-version)

In [None]:
# Find the time_id at which stock 0 has its smallest target
stock_0 = train[train['stock_id']==0]
# idxmin() returns an index *label*, so it must be looked up with .loc (label-based).
# Positional .iloc only gives the right row while labels happen to equal row positions.
min_index = stock_0['target'].idxmin()
# Scalar .loc access also keeps time_id as an integer instead of upcasting it to float
min_time_id = stock_0.loc[min_index, 'time_id']
print("min index is",min_time_id,"min target is",stock_0.loc[min_index, "target"])

In [None]:
# Order-book and trade rows belonging to the time_id stored in min_time_id
book_test_min = book_train.loc[book_train["time_id"].eq(min_time_id)]
trade_test_min = trade_train.loc[trade_train["time_id"].eq(min_time_id)]

# Quoted prices with the executed trade prices drawn on top
plt.figure(figsize=(20,5))

for price_column in samples:
    plt.plot(book_test_min["seconds_in_bucket"], book_test_min[price_column], label=price_column)

plt.plot(trade_test_min["seconds_in_bucket"], trade_test_min["price"], label="trade_parquet", lw=4)
plt.legend(fontsize=12)

Visualizing the highest volatility of stock_id 0

In [None]:
# Find the time_id at which stock 0 has its largest target
stock_0 = train[train["stock_id"]==0]
# idxmax() returns an index *label*, so it must be looked up with .loc (label-based).
# Positional .iloc only gives the right row while labels happen to equal row positions.
max_index = stock_0["target"].idxmax()
# Scalar .loc access also keeps time_id as an integer instead of upcasting it to float
max_time_id = stock_0.loc[max_index, "time_id"]
print("max index is",max_time_id,"max target is",stock_0.loc[max_index, "target"])

In [None]:
# Order-book and trade rows belonging to the time_id stored in max_time_id
book_test_max = book_train.loc[book_train["time_id"].eq(max_time_id)]
trade_test_max = trade_train.loc[trade_train["time_id"].eq(max_time_id)]

# Quoted prices with the executed trade prices drawn on top
plt.figure(figsize=(20,5))

for price_column in samples:
    plt.plot(book_test_max["seconds_in_bucket"], book_test_max[price_column], label=price_column)

plt.plot(trade_test_max["seconds_in_bucket"], trade_test_max["price"], label="trade_parquet", lw=5)
plt.legend(fontsize=12)

As the scale of vertical axis is different, comparing the actual transactions

In [None]:
# Draw the executed trade prices of both buckets on shared axes so their scales are comparable
plt.figure(figsize=(20,5))
for bucket_trades, bucket_label in (
    (trade_test_min, "minimum_volatility_time"),
    (trade_test_max, "maximum_volatility_time"),
):
    plt.plot(bucket_trades["seconds_in_bucket"], bucket_trades["price"], lw=5, label=bucket_label)
plt.legend(fontsize=15)

The price of a stock fluctuates considerably when the volatility is high, and vice versa.

### Submission file

Creating a dictionary of median values for each stock

In [None]:
# Per-stock summary of the target: mean, median, std, count and sum, with stock_id as a column
stock = (
    train
    .groupby("stock_id")["target"]
    .agg(["mean", "median", "std", "count", "sum"])
    .reset_index()
)
stock.head()

In [None]:
# Keep only the median column, indexed by stock_id
stock2 = stock.set_index("stock_id")[["median"]]
stock2

In [None]:
# Nested dict keyed by column name, then by stock_id: {"median": {stock_id: median target}}
stock_dict = stock2.to_dict(orient="dict")

# Example: median target of stock_id 0
stock_dict["median"][0]

In [None]:
# Display the sample submission frame as loaded, before any column is added to it
sample_predict

In [None]:
# The stock id is the part of row_id before the first "-"; it is kept as a string here
sample_predict["stock_id"] = sample_predict["row_id"].apply(
    lambda row_id: row_id.split("-")[0]
)
sample_predict

In [None]:
# Predict, for every row, the median training target of its stock.
# stock_id is a string at this point, so it is cast to int for the dictionary lookup.
sample_predict["target"] = sample_predict["stock_id"].apply(
    lambda stock_id: stock_dict["median"][int(stock_id)]
)
sample_predict

In [None]:
# Remove the helper stock_id column now that the target has been filled in
sample_predict = sample_predict.drop(columns="stock_id")
sample_predict

In [None]:
# Write the predictions to submission.csv without the DataFrame index column
sample_predict.to_csv("submission.csv",index=False)

### Thanks for exploring full notebook!

Please upvote if you liked the work!