In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
%%time
# Read the training trade records from the competition parquet file.
trade_df = pd.read_parquet(
    '../input/optiver-realized-volatility-prediction/trade_train.parquet'
)
# Preview the first rows as the cell's rich output.
trade_df.head()

In [None]:
# Read the training table; later cells merge it in and use its `target` column.
train_df = pd.read_csv(
    '../input/optiver-realized-volatility-prediction/train.csv'
)
# Preview the first rows as the cell's rich output.
train_df.head()

In [None]:
# Total number of individual trade rows.
print("Number Of Trade Records:", len(trade_df))
# Number of distinct (stock_id, time_id) pairs that have at least one trade row.
print(
    "Number Of Distinct (stock_id, time_id) Pairs:",
    len(trade_df[['stock_id', 'time_id']].drop_duplicates()),
)



In [None]:
# Number of slots assumed per (stock_id, time_id) window when measuring density.
# NOTE(review): 600 presumably means one slot per second of a 10-minute window — confirm.
SLOTS_PER_WINDOW = 600

num_stocks = trade_df.stock_id.nunique()
num_times = trade_df.time_id.nunique()
# Fraction of slots that have no trade row. The denominator treats every stock
# as present in every time_id — TODO confirm that holds for this data.
scarce_percent = 1 - len(trade_df) / (num_stocks * num_times * SLOTS_PER_WINDOW)

print("Number Of Stocks:", num_stocks)
# scarce_percent is stored as a fraction; the % format renders it as a real percentage.
print("Scarcity Percentage:{:.2%}".format(scarce_percent))
# (1 - scarce_percent) * SLOTS_PER_WINDOW == len(trade_df) / (num_stocks * num_times),
# so this is an average per (stock, time_id) window, not per stock.
print(
    "Expected Number of trade records per (stock, time_id) window:{:.4f}".format(
        (1 - scarce_percent) * SLOTS_PER_WINDOW
    )
)

In [None]:
# Count the trade rows recorded for stock_id 1 in time_id 5 via a boolean mask.
int(((trade_df['stock_id'] == 1) & (trade_df['time_id'] == 5)).sum())

In [None]:
# Filter the (stock_id=20, time_id=5) slice once instead of re-scanning the
# full frame for every column.
window_trades = trade_df[(trade_df.stock_id == 20) & (trade_df.time_id == 5)]
trade_price = window_trades['price'].values
trade_volume = window_trades['size'].values
trade_order_count = window_trades['order_count'].values

fig, ax = plt.subplots(1, 3, figsize=(12, 5))
ax[0].plot(trade_price, label='Trade Price')
ax[1].plot(trade_volume, label='Trade Volume')
ax[2].plot(trade_order_count, label='Trade Order Counts')

# plt.legend() only draws on the current (last-created) axes, which left the
# first two panels without their labels; attach a legend to every panel.
for panel in ax:
    panel.legend(loc='best')
plt.show()

In [None]:
# Trades of stock_id 20 within time_id 5.
df = trade_df[(trade_df.stock_id == 20) & (trade_df.time_id == 5)]
x = df['size'].values
y = df['order_count'].values

# Scatter of per-trade size against order_count; label the axes with the
# plotted column names so the figure can be read on its own.
plt.scatter(x, y)
plt.xlabel('size')
plt.ylabel('order_count')
plt.show()

order_count vs target

In [None]:
# Mean order_count per (stock_id, time_id), joined with the training table.
# The join keys are spelled out rather than left to pandas' implicit
# "all shared column names" rule, so an extra shared column cannot silently
# change the join.
# NOTE(review): assumes train_df carries both stock_id and time_id — confirm.
order_count_df = (
    trade_df.groupby(['stock_id', 'time_id'])[['order_count']]
    .mean()
    .reset_index()
    .merge(train_df, on=['stock_id', 'time_id'], how='inner')
)
# Natural log of the mean order count; later cells histogram and bin this column.
order_count_df['log_order_count'] = np.log(order_count_df['order_count'])

order_count_df.head()

In [None]:
# Histogram (left) and box plot (right) of the mean order count per window.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
# plt.title() would only title the last-created panel; suptitle labels the whole figure.
fig.suptitle("Mean Order Counts:")
ax[0].hist(order_count_df.order_count.values, bins=100)
sns.boxplot(y=order_count_df.order_count.values, ax=ax[1])
plt.show()

In [None]:
# Density histogram (left) and box plot (right) of the log mean order count.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
# plt.title() would only title the last-created panel; suptitle labels the whole figure.
fig.suptitle("Mean Order Counts Normalized")
ax[0].hist(order_count_df.log_order_count.values, bins=100, density=True)
sns.boxplot(y=order_count_df.log_order_count.values, ax=ax[1])
plt.show()

In [None]:
# Summary statistics of the log mean order count, shown as the cell output.
order_count_df['log_order_count'].describe()

In [None]:
def get_order_count_bin(x, bin_width=0.2):
    """Return the index of the fixed-width bin that ``x`` falls into.

    Parameters
    ----------
    x : number or array-like supporting ``//``
        Value(s) to bin; the notebook passes log mean order counts.
    bin_width : float, default 0.2
        Width of each bin. The default matches the previously hard-coded width.

    Returns
    -------
    Same kind as ``x``
        ``x // bin_width``: the bin index as a float for float input
        (negative values map to negative indices).
    """
    # Floor division is kept as-is so existing bin assignments do not change.
    return x // bin_width

In [None]:
# Tag every (stock_id, time_id) row with the bin of its log mean order count.
order_count_df['bin'] = order_count_df['log_order_count'].apply(
    get_order_count_bin
)
order_count_df.head()

In [None]:
# Histogram of the bin index; drawn on the current axes, which is then titled.
sns.histplot(data=order_count_df['bin'])
plt.title("Order Count Bin Distribution")
plt.show()

In [None]:
# Preview the frame again now that the `bin` column is present.
order_count_df.head(n=5)

In [None]:
# Per-bin summary of `target`: mean plus the 50th / 75th / 90th percentiles.
# Native groupby reductions replace the earlier approach of collecting each
# group into a Python list, applying NumPy four times, and dropping the list
# column in place.
# NOTE(review): pandas reductions skip NaN, whereas np.mean / np.quantile on a
# list propagate it; results match as long as `target` has no NaN — confirm.
target_by_bin = order_count_df.groupby('bin')['target']
order_bin_stat = pd.DataFrame({
    'avg': target_by_bin.mean(),
    'q_50': target_by_bin.quantile(0.5),
    'q_75': target_by_bin.quantile(0.75),
    'q_90': target_by_bin.quantile(0.9),
}).reset_index()

order_bin_stat.head()

In [None]:
# One line per summary statistic of `target`, plotted against the bin index.
order_bin_stat.plot(x='bin', y=['avg', 'q_50', 'q_75', 'q_90'])

1. As the mean order count increases, the realized volatility increases in most cases.
2. Most of the mean order counts fall in the bin range [4-8].

In [None]:
# Mean trade size per (stock_id, time_id), joined with the training table.
# The join keys are spelled out rather than left to pandas' implicit
# "all shared column names" rule, so an extra shared column cannot silently
# change the join.
# NOTE(review): assumes train_df carries both stock_id and time_id — confirm.
share_df = (
    trade_df.groupby(['stock_id', 'time_id'])[['size']]
    .mean()
    .reset_index()
    .merge(train_df, on=['stock_id', 'time_id'], how='inner')
)
# Natural log of the mean size; later cells histogram and bin this column.
share_df['log_size'] = np.log(share_df['size'])
share_df.head()

In [None]:
# Histogram (left) and box plot (right) of the mean trade size per window.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
# plt.title() would only title the last-created panel; suptitle labels the whole figure.
fig.suptitle("Mean Order Sizes:")
ax[0].hist(share_df['size'].values, bins=100)
sns.boxplot(y=share_df['size'].values, ax=ax[1])
plt.show()

In [None]:
# Histogram (left) and box plot (right) of the log mean trade size per window.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
# plt.title() would only title the last-created panel; suptitle labels the whole figure.
fig.suptitle("Mean Order Log Sizes:")
ax[0].hist(share_df['log_size'].values, bins=100)
sns.boxplot(y=share_df['log_size'].values, ax=ax[1])
plt.show()

In [None]:
def get_shares_count_bin(x, bin_width=0.5):
    """Return the index of the fixed-width bin that ``x`` falls into.

    Parameters
    ----------
    x : number or array-like supporting ``//``
        Value(s) to bin; the notebook passes log mean trade sizes.
    bin_width : float, default 0.5
        Width of each bin. The default matches the previously hard-coded width.

    Returns
    -------
    Same kind as ``x``
        ``x // bin_width``: the bin index as a float for float input
        (negative values map to negative indices).
    """
    # Floor division is kept as-is so existing bin assignments do not change.
    return x // bin_width

In [None]:
# Tag every (stock_id, time_id) row with the bin of its log mean size,
# then show how the rows are distributed across bins.
share_df['bins'] = share_df['log_size'].apply(get_shares_count_bin)
sns.histplot(data=share_df['bins'])
plt.title("Total Shares Bin Distribution")
plt.show()

In [None]:
# Per-bin summary of `target`: mean plus the 50th / 75th / 90th percentiles.
# Native groupby reductions replace the earlier approach of collecting each
# group into a Python list, applying NumPy four times, and dropping the list
# column in place.
# NOTE(review): pandas reductions skip NaN, whereas np.mean / np.quantile on a
# list propagate it; results match as long as `target` has no NaN — confirm.
target_by_bins = share_df.groupby('bins')['target']
share_bin_stat = pd.DataFrame({
    'avg': target_by_bins.mean(),
    'q_50': target_by_bins.quantile(0.5),
    'q_75': target_by_bins.quantile(0.75),
    'q_90': target_by_bins.quantile(0.9),
}).reset_index()

share_bin_stat.head()

In [None]:
# One line per summary statistic of `target`, plotted against the bin index.
share_bin_stat.plot(x='bins', y=['avg', 'q_50', 'q_75', 'q_90'])

1. As the number of shares traded increases, the volatility tends to decrease. This could be due to highly volatile activity in the current 10-minute window being followed by lower volatility.