The purpose of this competition is to predict volatility from the order book. More precisely, the goal is to predict the volatility implied by the buy (bid) and sell (ask) offers, so that buy and sell transactions can be based on these predictions.

In [None]:
import pandas as pd
import numpy as np

import os 
import glob

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
# Default figure size (16 x 8) for the plots in this notebook.
sns.set(rc={'figure.figsize':(16,8)})
# Scale seaborn's font sizes by a factor of 1.3.
sns.set(font_scale=1.3)
# Apply matplotlib's 'fivethirtyeight' style sheet.
plt.style.use('fivethirtyeight')

import warnings
warnings.filterwarnings('ignore')

# Input Data

In [None]:
# Root folder of the competition data set; every input below lives under it.
input_dir = '../input/optiver-realized-volatility-prediction'

# Flat CSV tables: training targets, test rows and the sample submission.
train = pd.read_csv(f'{input_dir}/train.csv')
test = pd.read_csv(f'{input_dir}/test.csv')
submission = pd.read_csv(f'{input_dir}/sample_submission.csv')

# Order-book and trade parquet data; only the stock_id=0 partition is read.
book_train = pd.read_parquet(f'{input_dir}/book_train.parquet/stock_id=0')
trade_train = pd.read_parquet(f'{input_dir}/trade_train.parquet/stock_id=0')
trade_test = pd.read_parquet(f'{input_dir}/trade_test.parquet/stock_id=0')

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Display the test table.
test

In [None]:
# Display the sample submission as loaded from disk.
submission

In [None]:
# Preview the first rows of the stock_id=0 order book.
book_train.head()

In [None]:
# Preview the first rows of the stock_id=0 training trades.
trade_train.head()

In [None]:
# Display the stock_id=0 test trades.
trade_test

# Functions and Statistics

In [None]:
def calc_log(vl):
    """Return the row-to-row change of the natural logarithm of ``vl``.

    ``vl`` must yield an object with a ``.diff()`` method after ``np.log``
    (e.g. a pandas Series). The first element has no predecessor, so its
    result is NaN.
    """
    log_values = np.log(vl)
    return log_values.diff()

def calc_volatility(vl):
    """Return the square root of the sum of the squared values in ``vl``."""
    sum_of_squares = np.sum(vl ** 2)
    return np.sqrt(sum_of_squares)

def calc_dif(data):
    """Add spread, WAP, log-return and volatility columns to an order book.

    Parameters
    ----------
    data : pandas.DataFrame
        Order-book rows with the columns ``bid_price1``, ``bid_price2``,
        ``ask_price1``, ``ask_price2``, ``bid_size1``, ``bid_size2``,
        ``ask_size1`` and ``ask_size2``. An optional ``time_id`` column
        identifies the bucket each row belongs to.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with the extra columns
        ``size_spread_1/2``, ``price_spread_1/2``, ``wap``, ``log`` and
        ``volatility``. Rows whose log return is NaN are dropped.

    Notes
    -----
    When ``time_id`` is present, log returns and volatility are computed
    per ``time_id``: a return never spans two buckets, the first row of
    every bucket is dropped, and ``volatility`` is constant within a bucket.
    Without ``time_id`` the whole frame is treated as a single series.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    data = data.copy()

    data['size_spread_1'] = data['ask_size1'] - data['bid_size1']
    data['size_spread_2'] = data['ask_size2'] - data['bid_size2']
    data['price_spread_1'] = data['ask_price1'] - data['bid_price1']
    data['price_spread_2'] = data['ask_price2'] - data['bid_price2']

    # Weighted average price: each side's price is weighted by the size
    # quoted on the opposite side.
    data['wap'] = (
        data['bid_price1'] * data['ask_size1']
        + data['ask_price1'] * data['bid_size1']
    ) / (data['bid_size1'] + data['ask_size1'])

    has_buckets = 'time_id' in data.columns

    # Log return = diff of log(wap), the same formula as calc_log. With
    # buckets the diff restarts at every time_id boundary.
    log_wap = np.log(data['wap'])
    if has_buckets:
        data['log'] = log_wap.groupby(data['time_id']).diff()
    else:
        data['log'] = log_wap.diff()

    # Explicit copy so the column assignment below targets an independent
    # frame rather than a slice.
    data = data[data['log'].notna()].copy()

    # Volatility = sqrt(sum(log ** 2)), the same formula as calc_volatility,
    # evaluated per bucket when buckets exist.
    squared_returns = data['log'] ** 2
    if has_buckets:
        per_bucket_sum = squared_returns.groupby(data['time_id']).transform('sum')
        data['volatility'] = np.sqrt(per_bucket_sum)
    else:
        data['volatility'] = np.sqrt(squared_returns.sum())
    return data

In [None]:
# Derive the spread, WAP, log-return and volatility columns for the stock_id=0 order book.
book = calc_dif(book_train)

# EDA

In [None]:
# Preview the enriched order book returned by calc_dif.
book.head()

In [None]:
# Pairwise correlations between the columns of `book`, drawn as an annotated heatmap.
correlations = book.corr()
plt.figure(figsize=(18,10))
sns.heatmap(correlations, annot=True, cmap=plt.cm.Blues);

In [None]:
# Summary statistics of `book`, one row per column, styled with a bar on the
# mean and colour gradients on the standard deviation and the median.
book_summary = book.describe().T
(
    book_summary.style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

In [None]:
# Summary statistics of `train`, one row per column, styled with a bar on the
# mean and colour gradients on the standard deviation and the median.
train_summary = train.describe().T
(
    train_summary.style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

# Subset

In [None]:
# Restrict the enriched order book to the rows with time_id 5.
in_bucket_5 = book['time_id'] == 5
book_5 = book.loc[in_bucket_5]
book_5.head()

In [None]:
# Trades with time_id 5. Despite its name, `book_exp` holds rows of
# `trade_train`, not order-book rows.
trade_in_bucket_5 = trade_train['time_id'] == 5
book_exp = trade_train.loc[trade_in_bucket_5]
book_exp.head()

In [None]:
# Price columns of the order book to visualise.
samples = ["bid_price1","bid_price2","ask_price1","ask_price2"]

# First figure: one stacked panel per price column. The figure is created once,
# before the loop; creating it inside the loop would open a new figure per
# column and leave every other subplot slot empty.
plt.figure(figsize=(18,18))
for num, column in enumerate(samples):
    plt.subplot(len(samples), 1, num + 1)
    plt.plot(book_5["seconds_in_bucket"], book_5[column])
    plt.title(column)
plt.show()

# Second figure: all price columns overlaid on a single axis for comparison.
plt.figure(figsize=(18,6))
for column in samples:
    plt.plot(book_5["seconds_in_bucket"], book_5[column], label=column)
plt.legend(fontsize=12);

In [None]:
# Overlay every price column listed in `samples` with the traded price.
plt.figure(figsize=(18,6))

seconds = book_5["seconds_in_bucket"]
for column in samples:
    plt.plot(seconds, book_5[column], label=column)

# Traded prices come from `book_exp` and are drawn thicker, in magenta.
plt.plot(book_exp["seconds_in_bucket"], book_exp["price"], label="trade_parquet", lw=5, color='magenta')
plt.legend(fontsize=12);

In [None]:
# Log return of the WAP for the selected rows. `assign` builds a new frame
# instead of writing into the filtered slice `book_5` via `.loc`, which is the
# chained-assignment pattern pandas warns about.
# NOTE(review): calc_dif already stores this quantity in the `log` column and
# drops its NaN rows, so taking the diff again discards one more row here -
# confirm that this is intended.
book_5 = book_5.assign(log_return=calc_log(book_5['wap']))
book_5 = book_5[book_5['log_return'].notna()]

In [None]:
# Interactive line chart of the log return against the seconds in the bucket.
fig = px.line(
    book_5,
    x="seconds_in_bucket",
    y="log_return",
    title='Log return of stock_id_0, time_id_5',
)
fig.show()

# Volatility

In [None]:
# Volatility of the selected rows: square root of the sum of squared log returns.
volatility = calc_volatility(book_5['log_return'])
print(f'Volatility: {volatility}')

In [None]:
# Per-stock summary statistics of the training target, with stock_id kept
# as a regular column.
target_stats = ["mean","median","std","count","sum"]
target_by_stock = train.groupby("stock_id")["target"]
stock = target_by_stock.agg(target_stats).reset_index()
stock

In [None]:
# Median target per stock, indexed by stock_id. The column selection and the
# re-indexing are chained so the selection takes effect; previously it was
# overwritten by a second assignment built from `stock`.
result = stock[['stock_id','median']].set_index("stock_id")
result

In [None]:
# Turn the table into nested dicts of the form {column: {stock_id: value}}.
# `result` is rebound from a DataFrame to a dict here, so re-running this
# cell on its own fails (a dict has no `to_dict`).
result = result.to_dict()
# Median target of stock_id 0.
result["median"][0]

In [None]:
# Show the submission frame before the target column is overwritten.
submission

In [None]:
# The stock id is the part of row_id in front of the first "-"; it is kept
# as a string here.
submission["stock_id"] = [row_id.partition("-")[0] for row_id in submission["row_id"]]
submission

In [None]:
# Predict, for every row, the median training target of its stock
# (stock ids are converted to int to match the keys of the lookup).
submission["target"] = [result["median"][stock_id] for stock_id in map(int, submission["stock_id"])]
submission

In [None]:
# Remove the helper column again so only the submission columns remain.
submission = submission.drop(columns="stock_id")
submission

In [None]:
# Write the predictions to submission.csv without the index column.
submission.to_csv("submission.csv",index=False)