Reference: https://www.kaggle.com/munumbutt/wip-eda-on-the-orderbook

# Import Libraries

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Default size (width, height in inches) for every figure created below.
plt.rc("figure", figsize=(10, 8))

In [None]:
# Load the order-book and trade partitions for stock_id=0.
book = pd.read_parquet(
    path='../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0'
)
trade = pd.read_parquet(
    path='../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0'
)

In [None]:
# Re-index both frames by time_id (set_index returns new frames).
book, trade = (frame.set_index('time_id') for frame in (book, trade))

In [None]:
# Peek at the first five order-book rows.
book.head(n=5)

In [None]:
# Peek at the first five trade rows.
trade.head(n=5)

In [None]:
# Bare expression: as the last statement of a notebook cell it renders the
# order-book frame (indexed by time_id at this point).
book

# Calculate Orderbook Statistics

In [None]:
def log_return(list_stock_prices):
    """Return the first difference of the natural log of the given prices.

    The argument must become an object with a ``.diff()`` method after
    ``np.log`` is applied (e.g. a pandas Series). The first element of the
    result has no predecessor, so ``diff`` leaves it as NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)

# More to come...
def calc_stats(df):
    """Add order-book statistics and a per-``time_id`` realized volatility.

    Args:
        df: Order-book DataFrame with the columns ``bid_price1``,
            ``ask_price1``, ``bid_price2``, ``ask_price2``, ``bid_size1``,
            ``ask_size1``, ``bid_size2`` and ``ask_size2``. Rows are bucketed
            by a ``time_id`` column if one exists, otherwise by an index level
            named ``time_id``; with neither, the whole frame is one bucket.

    Returns:
        A new DataFrame (the input is left untouched) with these columns
        added: ``size_spread_l1``, ``size_spread_l2``, ``price_spread_l1``,
        ``price_spread_l2``, ``wap``, ``log_return`` and ``realized_vol``.
        Rows whose ``log_return`` is null are dropped, i.e. at least the
        first row of every bucket.
    """
    # Copy first so the caller's frame is never mutated and later column
    # assignments never hit a view of another frame.
    stats = df.copy()

    stats['size_spread_l1'] = stats['ask_size1'] - stats['bid_size1']
    stats['size_spread_l2'] = stats['ask_size2'] - stats['bid_size2']

    stats['price_spread_l1'] = stats['ask_price1'] - stats['bid_price1']
    stats['price_spread_l2'] = stats['ask_price2'] - stats['bid_price2']

    # Level-1 prices cross-weighted by the opposite side's size.
    stats['wap'] = (
        stats['bid_price1'] * stats['ask_size1']
        + stats['ask_price1'] * stats['bid_size1']
    ) / (stats['bid_size1'] + stats['ask_size1'])

    # Positional bucket labels; kept as a plain array so a non-unique index
    # (many rows per time_id) cannot cause alignment surprises.
    if 'time_id' in stats.columns:
        bucket = stats['time_id'].to_numpy()
    elif 'time_id' in stats.index.names:
        bucket = stats.index.get_level_values('time_id').to_numpy()
    else:
        bucket = np.zeros(len(stats), dtype=np.int64)

    # Difference log prices inside each bucket only: a return spanning two
    # different time_id values is not a real price move.
    log_wap = np.log(stats['wap'])
    stats['log_return'] = log_wap.groupby(bucket).diff().to_numpy()

    keep = stats['log_return'].notna().to_numpy()
    stats = stats[keep].copy()
    bucket = bucket[keep]

    # sqrt(sum(r^2)) per bucket, broadcast back onto that bucket's rows.
    squared = stats['log_return'] ** 2
    stats['realized_vol'] = np.sqrt(
        squared.groupby(bucket).transform('sum').to_numpy()
    )

    return stats

In [None]:
# Replace the raw order book with its statistics-enriched version.
book = book.pipe(calc_stats)

In [None]:
# Bare expression: renders the frame returned by calc_stats when it is the
# last statement of a notebook cell.
book

In [None]:
# Summary statistics for every numeric column of the enriched order book.
book.describe()

# Size Spread Plots

In [None]:
# Layer-1 size spread (ask size minus bid size), drawn against the frame's index.
ax = plt.gca()
ax.plot(book['size_spread_l1'])
ax.set_title('Layer 1 Size Spread on Stock 0')
ax.set_xlabel('Time ID')
ax.set_ylabel('Spread')
plt.show()

In [None]:
# Distribution of the layer-1 size spread with log-scaled counts.
plt.hist(book['size_spread_l1'], bins='auto')
plt.title('Layer 1 Size Spread on Stock 0 Distribution')
plt.yscale('log')
# A histogram's x-axis holds the binned values and its y-axis the bin counts.
plt.xlabel('Spread')
plt.ylabel('Count (log scale)')
plt.show()

In [None]:
# Distribution of the layer-1 size spread on a linear count axis.
plt.hist(book['size_spread_l1'], bins='auto')
plt.title('Layer 1 Size Spread on Stock 0 Distribution')
# A histogram's x-axis holds the binned values and its y-axis the bin counts.
plt.xlabel('Spread')
plt.ylabel('Count')
plt.show()

In [None]:
# Summary statistics of the layer-1 size spread.
book.loc[:, 'size_spread_l1'].describe()

In [None]:
# Layer-2 size spread (ask size minus bid size), drawn against the frame's index.
ax = plt.gca()
ax.plot(book['size_spread_l2'])
ax.set_title('Layer 2 Size Spread on Stock 0')
ax.set_xlabel('Time ID')
ax.set_ylabel('Spread')
plt.show()

In [None]:
# Distribution of the layer-2 size spread with log-scaled counts.
plt.hist(book['size_spread_l2'], bins='auto')
plt.title('Layer 2 Size Spread on Stock 0 Distribution')
plt.yscale('log')
# A histogram's x-axis holds the binned values and its y-axis the bin counts.
plt.xlabel('Spread')
plt.ylabel('Count (log scale)')
plt.show()

In [None]:
# Distribution of the layer-2 size spread on a linear count axis.
plt.hist(book['size_spread_l2'], bins='auto')
plt.title('Layer 2 Size Spread on Stock 0 Distribution')
# A histogram's x-axis holds the binned values and its y-axis the bin counts.
plt.xlabel('Spread')
plt.ylabel('Count')
plt.show()

In [None]:
# Summary statistics of the layer-2 size spread.
book.loc[:, 'size_spread_l2'].describe()

# Price Spread Plots

In [None]:
# Peek at the first five rows before plotting prices.
book.head(n=5)

In [None]:
# Level-1 bid and ask prices over the whole frame; the ask line is slightly
# transparent so the bid line underneath stays visible.
ax = plt.gca()
ax.plot(book['bid_price1'], c='b', label='Bid Price')
ax.plot(book['ask_price1'], c='r', label='Ask Price', alpha=0.7)
ax.set_title('Best Bid/Ask Prices')
ax.set_xlabel('Time ID')
ax.set_ylabel('Price')
ax.legend()
plt.show()

In [None]:
# Same level-1 bid/ask plot, restricted to the first 100,000 rows.
leading_rows = book.iloc[:100000]
ax = plt.gca()
ax.plot(leading_rows['bid_price1'], c='b', label='Bid Price')
ax.plot(leading_rows['ask_price1'], c='r', label='Ask Price', alpha=0.7)
ax.set_title('Best Bid/Ask Prices')
ax.set_xlabel('Time ID')
ax.set_ylabel('Price')
ax.legend()
plt.show()

In [None]:
# Level-2 bid and ask prices over the whole frame.
ax = plt.gca()
ax.plot(book['bid_price2'], c='b', label='Bid Price')
ax.plot(book['ask_price2'], c='r', label='Ask Price', alpha=0.7)
ax.set_title('L2 Bid/Ask Prices')
ax.set_xlabel('Time ID')
ax.set_ylabel('Price')
ax.legend()
plt.show()

In [None]:
# Same level-2 bid/ask plot, restricted to the first 100,000 rows.
leading_rows = book.iloc[:100000]
ax = plt.gca()
ax.plot(leading_rows['bid_price2'], c='b', label='Bid Price')
ax.plot(leading_rows['ask_price2'], c='r', label='Ask Price', alpha=0.7)
ax.set_title('L2 Bid/Ask Prices')
ax.set_xlabel('Time ID')
ax.set_ylabel('Price')
ax.legend()
plt.show()

In [None]:
# Overlaid distributions of level-1 and level-2 bid prices.
plt.hist(book['bid_price1'], bins='auto', label='Best Bids')
plt.hist(book['bid_price2'], bins='auto', label='L2 Bids', alpha=0.7)
plt.title('Bids on Stock 0 Distribution')
# A histogram's x-axis holds the binned prices and its y-axis the bin counts.
plt.xlabel('Bid Value')
plt.ylabel('Count')
plt.legend()
plt.show()

In [None]:
# Overlaid distributions of level-1 and level-2 bid prices, log-scaled counts.
plt.hist(book['bid_price1'], bins='auto', label='Best Bids')
plt.hist(book['bid_price2'], bins='auto', label='L2 Bids', alpha=0.7)
plt.title('Bids on Stock 0 Distribution')
# A histogram's x-axis holds the binned prices and its y-axis the bin counts.
plt.xlabel('Bid Value')
plt.ylabel('Count (log scale)')
plt.yscale('log')
plt.legend()
plt.show()

In [None]:
# Overlaid distributions of level-1 and level-2 ask prices.
plt.hist(book['ask_price1'], bins='auto', label='Best Ask')
plt.hist(book['ask_price2'], bins='auto', label='L2 Ask', alpha=0.7)
plt.title('Asks on Stock 0 Distribution')
# A histogram's x-axis holds the binned prices and its y-axis the bin counts.
plt.xlabel('Ask Value')
plt.ylabel('Count')
plt.legend()
plt.show()

In [None]:
# Overlaid distributions of level-1 and level-2 ask prices, log-scaled counts.
plt.hist(book['ask_price1'], bins='auto', label='Best Ask')
plt.hist(book['ask_price2'], bins='auto', label='L2 Ask', alpha=0.7)
plt.title('Asks on Stock 0 Distribution')
# A histogram's x-axis holds the binned prices and its y-axis the bin counts.
plt.xlabel('Ask Value')
plt.ylabel('Count (log scale)')
plt.yscale('log')
plt.legend()
plt.show()

# Returns Analysis

In [None]:
# Log returns of the weighted price, drawn against the frame's index.
ax = plt.gca()
ax.plot(book['log_return'], label='Log Returns')
ax.set_title('Log Returns on Stock 0')
ax.set_xlabel('Time ID')
ax.set_ylabel('Returns')
ax.legend()
plt.show()