In [None]:
# Setup cell: imports, plotting defaults, a listing of the input directory,
# warning suppression and a start timestamp.

# --- Numerics, tabular data and regular expressions ---
import numpy as np # linear algebra
import pandas as pd
import re

# --- Static plotting (seaborn / matplotlib) and notebook display helper ---
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

# --- Interactive plotting (plotly) ---
from plotly.offline import iplot
import plotly.express as px

# --- cufflinks configuration ---
# NOTE(review): go_offline() is immediately followed by
# set_config_file(offline = False, ...) -- confirm which offline setting is intended.
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline = False, world_readable = True)

from sklearn.model_selection import train_test_split

# Notebook-wide matplotlib defaults (figure size, axes title font size) and seaborn palette.
plt.rcParams["figure.figsize"] = (12, 8)
plt.rcParams['axes.titlesize'] = 16
sns.set_palette('Set3_r')

import os

# Print the entries of the competition input directory.
print(os.listdir('/kaggle/input/optiver-realized-volatility-prediction/'))

# Ignore every warning raised from here on.
import warnings
warnings.simplefilter('ignore')

# Remember when the notebook started and print the current local timestamp.
from time import time, strftime, gmtime
start = time()
import datetime
print(str(datetime.datetime.now()))

In [None]:
# Root directory of the competition data; later cells build file paths by
# string concatenation, so the trailing slash is required.
base_dir = '/kaggle/input/optiver-realized-volatility-prediction/'

In [None]:
# Read the training table, report its (rows, columns) and preview the first rows.
train = pd.read_csv(f'{base_dir}train.csv')
print(train.shape)
train.head()

In [None]:
# Distinct stock_id values present in the training table.
train['stock_id'].unique()

In [None]:
# Read the test table, report its (rows, columns) and preview the first rows.
test = pd.read_csv(f'{base_dir}test.csv')
print(test.shape)
test.head()

In [None]:
# Read the sample submission, report its (rows, columns) and preview the first rows.
sub = pd.read_csv(f'{base_dir}sample_submission.csv')
print(sub.shape)
sub.head()

In [None]:
# Cardinality of the two identifier columns in the training table.
print(f"Number of unique stocks is {train['stock_id'].nunique()}")
print(f"Number of unique time_id is {train['time_id'].nunique()}")

# Number of entries inside each parquet directory (train first, then test).
for parquet_dir in ('trade_train.parquet', 'book_train.parquet',
                    'trade_test.parquet', 'book_test.parquet'):
    n_files = len(os.listdir(base_dir + parquet_dir))
    print(f"Number of files in {parquet_dir}: {n_files}")

In [None]:
# Kernel density estimate of the training target, with vertical markers for
# its mean and median. Both statistics are computed once and reused for the
# line position and the legend label.
target_mean = train['target'].mean()
target_median = train['target'].median()

plt.title('Target Distribution')
# `fill` is the current name of the deprecated `shade` keyword of sns.kdeplot.
sns.kdeplot(train['target'], fill = True, color = 'crimson')
plt.axvline(target_mean, color = 'blue', linestyle = ':', label = f"Mean: {target_mean}")
plt.axvline(target_median, color = 'green', linestyle = '--', label = f"Median: {target_median}")
plt.legend()
plt.show()

### Check the book and trade parquets for stock code 10

In [None]:
# Load the order-book and trade partitions of one stock and tag both frames
# with its identifier (kept as a string, as written in the partition path).
stock_id = '10'
book_10 = pd.read_parquet(f'{base_dir}book_train.parquet/stock_id={stock_id}')
trade_10 = pd.read_parquet(f'{base_dir}trade_train.parquet/stock_id={stock_id}')

# To study a single bucket only, filter either frame on time_id,
# e.g. book_10[book_10['time_id'] == 5].
for frame in (book_10, trade_10):
    frame.loc[:, 'stock_id'] = stock_id

In [None]:
# Preview the first rows of the stock-10 order-book frame.
book_10.head()

In [None]:
# Preview the first rows of the stock-10 trade frame.
trade_10.head()

In [None]:
# Distinct time_id values in the book frame and in the trade frame, shown as a pair.
book_10['time_id'].unique(), trade_10['time_id'].unique()

In [None]:
# One figure per book level; each figure holds four boxplots
# (bid price, ask price, bid size, ask size) drawn from book_10.
bid_ask = [
    ('bid_price1', 'ask_price1', 'bid_size1', 'ask_size1'),
    ('bid_price2', 'ask_price2', 'bid_size2', 'ask_size2'),
]
for ba in bid_ask:
    fig, ax = plt.subplots(1, 4, figsize = (16, 8))
    ax = ax.ravel()
    for axis, column in zip(ax, ba):
        boxplot_axis = sns.boxplot(data = book_10, y = column, ax = axis)
        boxplot_axis.set_title(column)
        # The column name is already in the title, so blank the y label.
        axis.set_ylabel('')

#### Utils

In [None]:
def log_return(list_stock_prices):
    """Return the first difference of the natural log of the given prices.

    The input must yield an object with a ``.diff()`` method after
    ``np.log`` (for example a pandas Series); the first element of the
    result is NaN because it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()


def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    total = np.sum(squared_returns)
    return np.sqrt(total)


def get_log_return_df_per_time_id(file_path):
    """Load one stock's book parquet and attach per-``time_id`` WAP log returns.

    Parameters
    ----------
    file_path : str
        Path passed to ``pd.read_parquet``. The text after the first ``=``
        is used as the stock id (e.g. ``.../stock_id=10`` -> ``'10'``).

    Returns
    -------
    pandas.DataFrame
        The book rows with three added columns: ``wap`` (level-1 weighted
        average price), ``log_return`` (log difference of ``wap`` inside each
        ``time_id``) and ``row_id`` (``'<stock_id>-<time_id>'``). Rows whose
        ``log_return`` is NaN (the first row of every ``time_id``) are dropped.
    """
    df_book_data = pd.read_parquet(file_path)

    # Level-1 weighted average price: each side's price weighted by the
    # opposite side's size.
    df_book_data['wap'] = (
        df_book_data['bid_price1'] * df_book_data['ask_size1']
        + df_book_data['ask_price1'] * df_book_data['bid_size1']
    ) / (df_book_data['bid_size1'] + df_book_data['ask_size1'])

    # A grouped diff keeps the original row index. groupby(...).apply(...)
    # prepends the group key to the index on pandas >= 2.0, which makes the
    # result impossible to assign back as a column.
    time_ids = df_book_data['time_id'].to_numpy()
    df_book_data['log_return'] = np.log(df_book_data['wap']).groupby(time_ids).diff()

    # .copy() so that adding row_id below writes to an independent frame
    # rather than to a slice of the unfiltered one.
    df_book_data = df_book_data[df_book_data['log_return'].notna()].copy()

    stock_id = file_path.split('=')[1]
    df_book_data['row_id'] = stock_id + '-' + df_book_data['time_id'].astype(str)

    return df_book_data


def get_realized_volatility_df_per_time_id(file_path):
    """Load one stock's book parquet and attach realized volatility per ``time_id``.

    Parameters
    ----------
    file_path : str
        Path passed to ``pd.read_parquet``. The text after the first ``=``
        is used as the stock id (e.g. ``.../stock_id=10`` -> ``'10'``).

    Returns
    -------
    pandas.DataFrame
        The book rows with added columns ``wap``, ``log_return``,
        ``realized_volatility`` and ``row_id``. ``realized_volatility`` is the
        square root of the sum of squared log returns of the row's
        ``time_id``, repeated on every row of that ``time_id``; use
        ``drop_duplicates('row_id')`` for one value per bucket. Rows with a
        NaN ``log_return`` (the first row of every ``time_id``) are dropped.
    """
    df_book_data = pd.read_parquet(file_path)

    # Level-1 weighted average price: each side's price weighted by the
    # opposite side's size.
    df_book_data['wap'] = (
        df_book_data['bid_price1'] * df_book_data['ask_size1']
        + df_book_data['ask_price1'] * df_book_data['bid_size1']
    ) / (df_book_data['bid_size1'] + df_book_data['ask_size1'])

    # Grouped diff keeps the row index (groupby(...).apply(...) would prepend
    # the group key on pandas >= 2.0 and break the column assignment).
    df_book_data['log_return'] = (
        np.log(df_book_data['wap']).groupby(df_book_data['time_id'].to_numpy()).diff()
    )
    df_book_data = df_book_data[df_book_data['log_return'].notna()].copy()

    # transform('sum') broadcasts the per-time_id total back onto each row.
    # Assigning the per-group scalars directly would align a time_id-indexed
    # Series against the row index and leave the column misaligned / NaN.
    squared_returns = df_book_data['log_return'] ** 2
    df_book_data['realized_volatility'] = np.sqrt(
        squared_returns.groupby(df_book_data['time_id'].to_numpy()).transform('sum')
    )
    df_book_data = df_book_data[df_book_data['realized_volatility'].notna()].copy()

    stock_id = file_path.split('=')[1]
    df_book_data['row_id'] = stock_id + '-' + df_book_data['time_id'].astype(str)

    return df_book_data

def calculate_stats(df):
    """Add spread, WAP and log-return columns to an order-book frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``bid_price1/2``, ``ask_price1/2``, ``bid_size1/2`` and
        ``ask_size1/2``. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``size_spread1/2`` (ask size - bid size),
        ``price_spread1/2`` (ask price - bid price), ``wap`` (level-1
        weighted average price) and ``log_return``. Rows whose
        ``log_return`` is NaN are dropped.

    Notes
    -----
    When a ``time_id`` column is present the log return is differenced
    inside each ``time_id``, so the first row of a bucket is never compared
    with the last price of the previous bucket. Without that column the
    whole frame is treated as one sequence.
    """
    # Work on a copy so the input frame keeps its original columns and
    # re-running the calling cell does not see half-processed data.
    df = df.copy()

    df['size_spread1'] = df['ask_size1'] - df['bid_size1']
    df['size_spread2'] = df['ask_size2'] - df['bid_size2']

    df['price_spread1'] = df['ask_price1'] - df['bid_price1']
    df['price_spread2'] = df['ask_price2'] - df['bid_price2']

    # Each side's price is weighted by the opposite side's size.
    df['wap'] = (
        df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']
    ) / (df['bid_size1'] + df['ask_size1'])

    log_wap = np.log(df['wap'])
    if 'time_id' in df.columns:
        # Group by the raw values so the diff restarts at every bucket while
        # the result keeps the frame's own row index.
        df['log_return'] = log_wap.groupby(df['time_id'].to_numpy()).diff()
    else:
        df['log_return'] = log_wap.diff()

    return df[df['log_return'].notna()]

# What Is a Bid-Ask Spread?
- A bid-ask spread is the amount by which the ask price exceeds the bid price for an asset in the market. The bid-ask spread is essentially the difference between the highest price that a buyer is willing to pay for an asset and the lowest price that a seller is willing to accept. An individual looking to sell will receive the bid price while one looking to buy will pay the ask price.
- The spread is the transaction cost. Price takers buy at the ask price and sell at the bid price, but the market maker buys at the bid price and sells at the ask price.
- The bid represents demand and the ask represents supply for an asset.
- The bid-ask spread can be considered a measure of the supply and demand for a particular asset, because the bid can be said to represent demand and the ask to represent supply.
- The bid-ask spread is the de facto measure of market liquidity: more liquid markets have lower spreads, while less liquid markets have higher spreads.
- Typically, a security with a narrow bid-ask spread will have high demand. By contrast, a security with a wide bid-ask spread may illustrate a low volume of demand, therefore influencing wider discrepancies in its price.
- Example: Stocks that are traded heavily, such as Google, Apple, and Microsoft will have a smaller bid-ask spread.

# Weighted Average Price (WAP)

- Indicates whether the market is bearish or bullish: the market is bullish when the price is below the WAP and bearish when the price is above the WAP.

More Info: https://www.kaggle.com/jiashenliu/introduction-to-financial-concepts-and-data

In [None]:
#Calculate spread and WAP for Stock 10
# NOTE(review): book_10 is replaced by the filtered output of calculate_stats,
# so re-running this cell feeds already-processed rows back into the function.
# Confirm the cell is executed only once per kernel session.

book_10 = calculate_stats(book_10)
print(book_10.shape)
book_10.head()

In [None]:
# Summary statistics of book_10, transposed so each column becomes a row.
book_10.describe().T

In [None]:
# Side-by-side line plots of the price spread at book level 1 and level 2,
# drawn against time_id over every row of book_10.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (16, 8))
price_panels = [
    (ax1, 'price_spread1', 'orange', 'Price Spread of Stock 10 - Layer 1'),
    (ax2, 'price_spread2', 'teal', 'Price Spread of Stock 10 - Layer 2'),
]
for panel_ax, spread_column, line_color, panel_title in price_panels:
    panel_ax.plot(book_10['time_id'], book_10[spread_column], color = line_color)
    panel_ax.set_xlabel('Time_id')
    panel_ax.set_ylabel('Price Spread')
    panel_ax.set_title(panel_title)
plt.show()

In [None]:
# Side-by-side line plots of the size spread at book level 1 and level 2,
# drawn against time_id over every row of book_10.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (16, 8))
size_panels = [
    (ax1, 'size_spread1', 'orange', 'Size Spread of Stock 10 - Layer 1'),
    (ax2, 'size_spread2', 'teal', 'Size Spread of Stock 10 - Layer 2'),
]
for panel_ax, spread_column, line_color, panel_title in size_panels:
    panel_ax.plot(book_10['time_id'], book_10[spread_column], color = line_color)
    panel_ax.set_xlabel('Time_id')
    panel_ax.set_ylabel('Size Spread')
    panel_ax.set_title(panel_title)
plt.show()

In [None]:
# Line plot of WAP against time_id over every row of book_10, drawn on
# pyplot's implicit current figure (sized by the rcParams set in the setup cell).
plt.plot(book_10['time_id'], book_10['wap'])
plt.xlabel('Time_id')
plt.ylabel('WAP')
plt.title('Weighted Averaged Price (WAP) of Stock 10')
plt.show()

In [None]:
# WAP within a bucket for several time_ids of stock 10, one trace per time_id.
first_time_id = 5
first_bucket = book_10[book_10['time_id'] == first_time_id]

# `labels` keys must be column names; the previous {'x': ..., 'y': ...}
# mapping matched no column and therefore never relabelled the y axis.
fig = px.line(first_bucket, x = 'seconds_in_bucket', y = 'wap',
              labels = {'wap': 'WAP'},
              title = 'WAP for Stock 10')
# Give the initial trace a legend entry so it can be told apart from the
# traces added below.
fig.update_traces(name = str(first_time_id), showlegend = True)

for t in book_10['time_id'].unique()[1:8]:
    # Filter once per time_id and reuse the slice for both coordinates.
    bucket = book_10[book_10['time_id'] == t]
    fig.add_scatter(x = bucket['seconds_in_bucket'], y = bucket['wap'], name = str(t))
fig.show()

In [None]:
# One interactive log-return chart for each of the first five time_ids in book_10.
for t in book_10['time_id'].unique()[:5]:
    bucket_rows = book_10[book_10['time_id'] == t]
    chart_title = f'Log_return for Stock 10 at time_id {t}'
    fig = px.line(bucket_rows, x = 'seconds_in_bucket', y = 'log_return', title = chart_title)
    fig.show()

#### Plotting the bid/ask price with trade price

In [None]:
def plot_bidaskprice(time_id):
    """Plot both bid/ask price levels and the trade price for one ``time_id``.

    Reads the notebook-level frames ``book_10`` and ``trade_10`` and shows a
    single matplotlib figure; nothing is returned.
    """
    plt.figure(figsize = (16, 6))
    book_bucket = book_10[book_10['time_id'] == time_id]
    trade_bucket = trade_10[trade_10['time_id'] == time_id]

    # Book price column -> line colour, in drawing order.
    price_colors = {
        'bid_price1': 'y',
        'bid_price2': 'r',
        'ask_price1': 'b',
        'ask_price2': 'g',
    }
    for price_column, line_color in price_colors.items():
        plt.plot(book_bucket['seconds_in_bucket'], book_bucket[price_column],
                 color = line_color, label = price_column)

    # Executed trades are overlaid as a dashed black line.
    plt.plot(trade_bucket['seconds_in_bucket'], trade_bucket['price'],
             linestyle = '--', color = 'black', lw = 2, label = 'Trade price')
    plt.legend()
    plt.title(f"Bid/Ask Price for Stock 10 at time_id {time_id}")
    plt.show()

In [None]:
# Draw the bid/ask/trade price chart for the first five time_ids found in book_10.
for t in book_10['time_id'].unique()[:5]:
    plot_bidaskprice(t)

In [None]:
def plot_price_vol(time_id):
    """Plot trade size and trade price for one ``time_id`` on twin y axes.

    Reads the notebook-level frame ``trade_10``; size is drawn on the left
    axis, price on the right axis. Shows the figure and returns nothing.
    """
    bucket_trades = trade_10[trade_10['time_id'] == time_id]
    seconds = bucket_trades['seconds_in_bucket']

    fig = plt.figure(figsize = (16, 6))
    size_ax = fig.add_subplot(1, 1, 1)
    size_ax.plot(seconds, bucket_trades['size'], ':g', label = 'Size')
    size_ax.legend(loc = 'upper left')

    # Second y axis sharing the same x axis, used for the price series.
    price_ax = size_ax.twinx()
    price_ax.plot(seconds, bucket_trades['price'], '-r', label = 'Price')
    price_ax.legend()

    plt.title(f"Price/Size of Stock 10 at time_id: {time_id}")
    plt.show()

In [None]:
# Draw the trade price/size chart for the first five time_ids found in book_10.
for t in book_10['time_id'].unique()[:5]:
    plot_price_vol(t)

In [None]:
# Per-stock summary statistics of the target, then the distribution across
# stocks of the per-stock mean and standard deviation.
stock_target_stats = train.groupby('stock_id')['target'].agg(['mean', 'median', 'std', 'sum'])

fig, axes = plt.subplots(1, 2, figsize = (16, 8))
stat_panels = [('mean', 'Target Mean'), ('std', 'Target STD')]
for panel_ax, (stat_column, panel_title) in zip(axes, stat_panels):
    # sns.distplot is deprecated; a density histogram with a KDE overlay is
    # the replacement suggested by the seaborn documentation.
    sns.histplot(stock_target_stats[stat_column], kde = True, stat = 'density',
                 kde_kws = dict(cut = 3), ax = panel_ax)
    panel_ax.set_title(panel_title)
plt.show()

# WIP

Ref: https://www.investopedia.com/terms/b/bid-askspread.asp