In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import os
import glob
import warnings

In [None]:
# Read train.csv from the competition input directory and display it.
train = pd.read_csv(
    "../input/optiver-realized-volatility-prediction/train.csv",
    header=0,
)
train

In [None]:
# Directories that hold the order-book and trade data (one entry per stock,
# named "stock_id=<id>" -- see get_book_data / get_trade_data).
book_filepath = "../input/optiver-realized-volatility-prediction/book_train.parquet"
trade_filepath = "../input/optiver-realized-volatility-prediction/trade_train.parquet"

# os.listdir returns entries in arbitrary order; sort them so positional
# lookups such as book_filenames[0] give the same entry on every run.
book_filenames = sorted(os.listdir(book_filepath))
trade_filenames = sorted(os.listdir(trade_filepath))

print(book_filenames[:15])

In [None]:
# Read the first listed order-book entry and preview its first 530 rows.
sample = pd.read_parquet(
    os.path.join(book_filepath, book_filenames[0])
)
sample.head(530)

In [None]:
# Read the trade entry at position 43 of the listing and preview its first 50 rows.
sample2 = pd.read_parquet(
    os.path.join(trade_filepath, trade_filenames[43])
)
sample2.head(50)

In [None]:
def log_return(list_stock_prices):
    """Return the first difference of the natural log of the given prices.

    The argument must expose ``.diff()`` after ``np.log`` is applied (for
    example a pandas Series). The first element of the result is NaN because
    it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    return np.sqrt(np.sum(squared_returns))

In [None]:
def get_book_data(stock_id, filepath):
    """Load one stock's order book and add WAP and log-return columns.

    Parameters
    ----------
    stock_id
        Identifier used to build the ``stock_id=<id>`` entry name.
    filepath
        Directory that contains the ``stock_id=<id>`` entries.

    Returns
    -------
    pandas.DataFrame
        The parquet contents with ``stock_id`` inserted as the first column,
        plus ``wap1``, ``wap2``, ``log_return1`` and ``log_return2``. Rows
        whose log return is missing (such as the first row of every
        ``time_id``) are dropped.
    """
    path = os.path.join(filepath, f"stock_id={stock_id}")
    data = pd.read_parquet(path)
    data.insert(0, 'stock_id', stock_id)

    # Weighted average price per book level: each side's price is weighted by
    # the size quoted on the opposite side.
    for level in (1, 2):
        bid_price = data[f'bid_price{level}']
        ask_price = data[f'ask_price{level}']
        bid_size = data[f'bid_size{level}']
        ask_size = data[f'ask_size{level}']
        data[f'wap{level}'] = (bid_price * ask_size + ask_price * bid_size) / (bid_size + ask_size)

    # transform() keeps the frame's original index, so the result can be
    # assigned straight back as a column. groupby().apply() prepends the group
    # key to the index on pandas >= 2.0, which breaks the assignment.
    for level in (1, 2):
        data[f'log_return{level}'] = data.groupby('time_id')[f'wap{level}'].transform(log_return)

    return data.dropna(subset=['log_return1', 'log_return2'])

def get_book_volatility(data):
    """Aggregate log returns into one realized volatility per stock and time_id.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``stock_id``, ``time_id``, ``log_return1`` and
        ``log_return2`` columns (as produced by ``get_book_data``).

    Returns
    -------
    pandas.DataFrame
        One row per ``(stock_id, time_id)`` with columns ``stock_id``,
        ``time_id``, ``volatility1`` and ``volatility2``.
    """
    # Columns must be selected with a list: selecting with a bare tuple
    # (['a', 'b'] without the inner list) raises on pandas >= 2.0.
    volatility = (
        data.groupby(['stock_id', 'time_id'])[['log_return1', 'log_return2']]
        .agg(realized_volatility)
        .reset_index()
    )
    return volatility.rename(columns={'log_return1': 'volatility1', 'log_return2': 'volatility2'})

def get_trade_data(stock_id, filepath):
    """Load one stock's trade data and tag it with its stock id.

    Parameters
    ----------
    stock_id
        Identifier used to build the ``stock_id=<id>`` entry name.
    filepath
        Directory that contains the ``stock_id=<id>`` entries.

    Returns
    -------
    pandas.DataFrame
        The parquet contents with ``stock_id`` inserted as the first column.
    """
    # The f-string is already fully formatted; no extra .format() call needed.
    path = os.path.join(filepath, f"stock_id={stock_id}")
    data = pd.read_parquet(path)
    data.insert(0, 'stock_id', stock_id)
    return data

# **compare stock_id 0 and stock_id 43**

In [None]:
# Build the order-book and trade frames for the two stocks being compared (0 and 43).
book_0, book_43 = (get_book_data(sid, book_filepath) for sid in (0, 43))

trade_0, trade_43 = (get_trade_data(sid, trade_filepath) for sid in (0, 43))

In [None]:
# Slice each stock's order book down to the two time buckets studied below.
# .copy() makes every slice an independent frame, so adding columns to it in
# later cells does not trigger SettingWithCopyWarning.
book_0_time_5 = book_0[book_0["time_id"] == 5].copy()
book_0_time_16 = book_0[book_0["time_id"] == 16].copy()

book_43_time_5 = book_43[book_43["time_id"] == 5].copy()
book_43_time_16 = book_43[book_43["time_id"] == 16].copy()

Compare ask and bid prices

In [None]:
# Plot the level-1 ask and bid prices of stock 0 during time_id 5.
fig, ax = plt.subplots(figsize = (8,5))
sns.lineplot(data = book_0_time_5, x = "seconds_in_bucket", y = "ask_price1", label="ask", ax = ax)
sns.lineplot(data = book_0_time_5, x = "seconds_in_bucket", y = "bid_price1", label="bid", ax = ax)

ax.set_title("bid and askprice", fontsize=20, loc="left")
ax.set_ylabel("price")
# plt.plot() with no arguments draws nothing; plt.show() renders the figure.
plt.show()

# Bid-ask spread at level 1 (ask minus bid). assign() returns a new frame, so
# no column is written onto a slice of book_0.
book_0_time_5 = book_0_time_5.assign(
    spread = book_0_time_5["ask_price1"] - book_0_time_5["bid_price1"]
)

Compare the difference between bid and ask sizes

In [None]:
def add_net_size(book):
    """Return a copy of ``book`` with ``net_size`` and ``net`` columns added.

    ``net_size`` is ``bid_size1 - ask_size1``; ``net`` is ``"neg"`` where that
    difference is negative and ``"pos"`` otherwise.
    """
    net_size = book["bid_size1"] - book["ask_size1"]
    return book.assign(net_size=net_size, net=np.where(net_size < 0, "neg", "pos"))


# assign() builds new frames, so no column is written onto a slice of book_0.
book_0_time_5 = add_net_size(book_0_time_5)
book_0_time_16 = add_net_size(book_0_time_16)

In [None]:
def net_size_data(data, stock_id, time_id):
    """Scatter ``wap1`` against ``seconds_in_bucket``, coloured by the ``net`` column.

    ``data`` must contain ``seconds_in_bucket``, ``wap1`` and ``net`` columns;
    ``stock_id`` and ``time_id`` are only used in the figure title.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(
        data=data,
        x="seconds_in_bucket",
        y="wap1",
        hue="net",
        hue_order=["neg", "pos"],
        ax=ax,
    )

    # Keep only the bottom spine.
    for side in ('left', 'right', 'top'):
        ax.spines[side].set_visible(False)

    ax.set_title(f'Stock {stock_id} - time {time_id}: WAP fluctuation', fontsize=20, pad=20, loc='left')

    plt.show()

In [None]:
# Stock 0, time_id 5: wap1 over time, coloured by the sign of bid_size1 - ask_size1.
net_size_data(book_0_time_5, 0, 5)

In [None]:
# Stock 0, time_id 16: wap1 over time, coloured by the sign of bid_size1 - ask_size1.
net_size_data(book_0_time_16, 0, 16)

In [None]:
# WAP1 (left panel) and WAP2 (right panel) of stock 0 for both time buckets.
book_0_list = [book_0_time_5, book_0_time_16]
# Real time_id of each frame in book_0_list, used for the legend labels
# (the enumerate index would mislabel them as time_1 / time_2).
book_0_time_ids = [5, 16]
fig, axes = plt.subplots(figsize = (16, 4), ncols = 2)

for time_id, dat in zip(book_0_time_ids, book_0_list):
    sns.lineplot(data = dat, x = 'seconds_in_bucket', y = 'wap1', label = f'time_{time_id}', ax = axes[0],
                 legend = False)

for time_id, dat in zip(book_0_time_ids, book_0_list):
    sns.lineplot(data = dat, x = 'seconds_in_bucket', y = 'wap2', label = f'time_{time_id}', ax = axes[1])

axes[0].set_title('WAP1', fontsize = 20, pad = 20, loc = 'left')
axes[1].set_title('WAP2', fontsize = 20, pad = 20, loc = 'left')
plt.show()

In [None]:
# Regression plot of wap2 against wap1 for stock 0, time_id 5.
fig, ax = plt.subplots(figsize=(6, 6))
sns.regplot(data=book_0_time_5, x='wap1', y='wap2', ax=ax)
ax.set_title('Correlation: wap1 & wap2', fontsize=20, loc='left', pad=20)
plt.show()


In [None]:
# Compare the level-1 WAP of stock 0 (top) and stock 43 (bottom) over the same
# two time buckets. Both panels plot wap1 so the stocks are compared on the
# same measure.
time_ids = [5, 16]
book_0_list = [book_0_time_5, book_0_time_16]
book_43_list = [book_43_time_5, book_43_time_16]
fig, axes = plt.subplots(figsize = (12, 8), nrows = 2)

for ax, book_list in zip(axes, (book_0_list, book_43_list)):
    # Label each line with its real time_id rather than its list position.
    for time_id, dat in zip(time_ids, book_list):
        sns.lineplot(data = dat, x = 'seconds_in_bucket', y = 'wap1', label = f'time_{time_id}', ax = ax)
    ax.grid(axis='y', linestyle='--', alpha = 0.6)

axes[0].set_title('Stock 0: WAP', fontsize = 20, pad = 20, loc = 'left')
axes[1].set_title('Stock 43: WAP', fontsize = 20, pad = 20, loc = 'left')
plt.tight_layout()
plt.show()

trade data

In [None]:
# Trades of each stock restricted to time_id 5 and time_id 16.
trade0_time_5 = trade_0.loc[trade_0["time_id"].eq(5)]
trade0_time_16 = trade_0.loc[trade_0["time_id"].eq(16)]

trade43_time_5 = trade_43.loc[trade_43["time_id"].eq(5)]
trade43_time_16 = trade_43.loc[trade_43["time_id"].eq(16)]

In [None]:
# Realized volatility per (stock_id, time_id) for both stocks.
book0_vol, book43_vol = map(get_book_volatility, (book_0, book_43))

In [None]:
def plot_price_fluc(data1, data2, stock_id, time_ids=(5, 16)):
    """Scatter trade prices of two time buckets of one stock on a shared axis.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        Trade frames with ``seconds_in_bucket``, ``price`` and ``size``
        columns. Marker area is ``size / 4``.
    stock_id
        Only used in the figure title.
    time_ids : tuple, optional
        The time_id of ``data1`` and ``data2`` respectively, used for the
        legend labels. Defaults to ``(5, 16)``, which reproduces the labels
        that were previously hard-coded.
    """
    fig, ax = plt.subplots(figsize = (12, 5))
    for dat, time_id in zip((data1, data2), time_ids):
        sns.scatterplot(data = dat, x = 'seconds_in_bucket', y = 'price', s = dat['size']/4,
                        label = f'time_id {time_id}', alpha = 0.6, ax = ax)
    ax.set_title(f'Stock {stock_id}: price fluctuation', fontsize = 20, pad = 20, loc = 'left')
    plt.show()

In [None]:
# Trade prices for time_id 5 vs time_id 16, one figure per stock (0, then 43).
plot_price_fluc(trade0_time_5, trade0_time_16, 0)
plot_price_fluc(trade43_time_5, trade43_time_16, 43)

Compare the volatility of stock 0 and stock 43

In [None]:
# Volatility rows of stock 0 for time_id 5 and 16.
book0_vol.loc[book0_vol['time_id'].isin([5, 16])]

In [None]:
# Volatility rows of stock 43 for time_id 5 and 16.
book43_vol.loc[book43_vol['time_id'].isin([5, 16])]

In [None]:
# Overlay the distributions of volatility1 for stock 0 and stock 43.
fig, ax = plt.subplots(figsize=(8, 5))
for vol_frame, stock_label in ((book0_vol, 'stock 0'), (book43_vol, 'stock 43')):
    sns.kdeplot(vol_frame['volatility1'], fill=True, alpha=0.6, label=stock_label, ax=ax)
ax.set_xlabel('Volatility')
ax.set_title('Stock 0 vs Stock 43: Volatility', fontsize=20, loc='left', pad=20)
ax.legend(frameon=False)
plt.show()