In [None]:
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# ML
from sklearn.metrics import r2_score

In [None]:
# Load the competition training table from the Kaggle input directory and
# preview its first two rows.
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')  
train.head(2)

In [None]:
# Locations of the trade (PATH_T) and order-book (PATH_B) parquet datasets
# that the loading cells below pass to pq.ParquetDataset.
PATH_T = '../input/optiver-realized-volatility-prediction/trade_train.parquet/'
PATH_B = '../input/optiver-realized-volatility-prediction/book_train.parquet/'

In [None]:
# Row filter handed to pq.ParquetDataset(filters=...) to load only a subset of stocks.
# NOTE(review): this name shadows Python's builtin `filter` for the rest of the notebook.
# NOTE(review): the bound is the *string* '95'; whether the comparison is lexical or
# numeric depends on how pyarrow types the `stock_id` key — TODO confirm.
filter = [('stock_id', '>', '95')]

In [None]:
%%time
# Read the trade dataset restricted by `filter`, then convert the Arrow table
# to a pandas DataFrame.
dataset = pq.ParquetDataset(PATH_T, filters = filter)  
table = dataset.read()
trades = table.to_pandas()
# Cast stock_id to a compact integer dtype.
# NOTE(review): int8 tops out at 127 — assumes every stock id fits; TODO confirm.
trades['stock_id'] = trades['stock_id'].astype(np.int8)
trades.info()

In [None]:
%%time
# Read the order-book dataset restricted by `filter` (rebinding `dataset`).
dataset = pq.ParquetDataset(PATH_B, filters = filter) 
books = dataset.read()
books = books.to_pandas()  # I overwrite the pyarrow table object here to save memory
# Cast stock_id to a compact integer dtype.
# NOTE(review): int8 tops out at 127 — assumes every stock id fits; TODO confirm.
books['stock_id'] = books['stock_id'].astype(np.int8)
books.info()

In [None]:
# Report how many distinct time buckets and stocks were loaded.
# Only `books` is inspected here; the counts are not cross-checked against `trades`.
print(f'Found {books.time_id.nunique()} unique time ids in trades/books.')
print(f'Found {books.stock_id.nunique()} unique stock ids in trades/books.')

In [None]:
def append_wap1(df):
    """Add a level-1 weighted average price column, ``wap1``, to ``df``.

    Each price is weighted by the size on the *opposite* side of the book:
    ``(bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1)``.

    The frame is modified in place and the same object is returned.
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    df['wap1'] = cross_weighted / (bid_sz + ask_sz)
    return df

In [None]:
def append_wap2(df):
    """Add a level-2 weighted average price column, ``wap2``, to ``df``.

    Each price is weighted by the size on the *opposite* side of the book:
    ``(bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2)``.

    The frame is modified in place and the same object is returned.
    """
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    df['wap2'] = cross_weighted / (bid_sz + ask_sz)
    return df

In [None]:
# Add the level-1 and level-2 weighted average price columns (`wap1`, `wap2`).
# Both helpers write into the frame they are given and return that same frame.
books = append_wap1(books)
books = append_wap2(books)

In [None]:
def plot_wap(books_df, time_id, stock_id_arr):
    """Plot level-1 WAP, ask and bid prices per stock for one time bucket.

    One subplot row is drawn per entry of ``stock_id_arr``. The left axis shows
    ``wap1`` (blue), ``ask_price1`` (red) and ``bid_price1`` (green); a twin
    right axis shows ``bid_size1 + ask_size1`` (orange).

    Parameters
    ----------
    books_df : DataFrame with columns ``stock_id``, ``time_id``,
        ``seconds_in_bucket``, ``wap1``, ``ask_price1``, ``bid_price1``,
        ``bid_size1`` and ``ask_size1``.
    time_id : the time bucket to plot.
    stock_id_arr : sequence of stock ids; nothing is drawn when it is empty.
    """
    n_stocks = len(stock_id_arr)
    if n_stocks == 0:
        # plt.subplots rejects zero rows; there is nothing to draw anyway.
        return

    # squeeze=False keeps the axes 2-D, so a single stock can be indexed too
    # (by default subplots(1, 1) returns a bare Axes, not an array).
    fig, axes = plt.subplots(n_stocks, 1, figsize=(18, 16), squeeze=False)
    fig.tight_layout(pad=2.0)

    for ax, stock_id in zip(axes[:, 0], stock_id_arr):
        mask = np.logical_and(books_df["stock_id"] == stock_id, books_df["time_id"] == time_id)
        bucket = books_df[mask]  # filter once instead of once per plotted series
        seconds = bucket["seconds_in_bucket"]
        ax2 = ax.twinx()

        sns.lineplot(x=seconds, y=bucket["wap1"], ax=ax, color='blue')
        sns.lineplot(x=seconds, y=bucket["ask_price1"], ax=ax, color='red')
        sns.lineplot(x=seconds, y=bucket["bid_price1"], ax=ax, color='green')
        sns.lineplot(x=seconds, y=bucket["bid_size1"] + bucket["ask_size1"], ax=ax2, color='orange')

        ax.set_title(f'Stock_id: {stock_id}', loc='left', fontweight='bold')
        ax.legend(['wap', 'ask_price1', 'bid_price1'], bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        ax2.legend(["bid_size1 + ask_size1"], bbox_to_anchor=(1.05, 0.78), loc=2, borderaxespad=0.)
    plt.show()

In [None]:
# Plot WAP / ask / bid for every stock present in `books`, for time_id 5.
stock_id_arr = books["stock_id"].unique()
plot_wap(books, 5, stock_id_arr)

In [None]:
def wap_balance(books_df):
    """Add ``wap_balance``: the absolute gap between ``wap1`` and ``wap2``.

    The frame is modified in place and the same object is returned.
    """
    gap = books_df['wap1'] - books_df['wap2']
    books_df['wap_balance'] = gap.abs()
    return books_df

In [None]:
# Add the |wap1 - wap2| column; wap_balance writes into `books` and returns it.
books = wap_balance(books)

In [None]:
def plot_wap_balance(books_df, time_id, stock_id_arr):
    """Plot ``wap1`` and ``wap2`` against their absolute gap for one time bucket.

    One subplot row is drawn per entry of ``stock_id_arr``. The left axis shows
    ``wap1`` (blue) and ``wap2`` (green); a twin right axis shows
    ``wap_balance`` (orange).

    Parameters
    ----------
    books_df : DataFrame with columns ``stock_id``, ``time_id``,
        ``seconds_in_bucket``, ``wap1``, ``wap2`` and ``wap_balance``.
    time_id : the time bucket to plot.
    stock_id_arr : sequence of stock ids; nothing is drawn when it is empty.
    """
    n_stocks = len(stock_id_arr)
    if n_stocks == 0:
        # plt.subplots rejects zero rows; there is nothing to draw anyway.
        return

    # squeeze=False keeps the axes 2-D, so a single stock can be indexed too
    # (by default subplots(1, 1) returns a bare Axes, not an array).
    fig, axes = plt.subplots(n_stocks, 1, figsize=(18, 16), squeeze=False)
    fig.tight_layout(pad=2.0)

    for ax, stock_id in zip(axes[:, 0], stock_id_arr):
        mask = np.logical_and(books_df["stock_id"] == stock_id, books_df["time_id"] == time_id)
        bucket = books_df[mask]  # filter once instead of once per plotted series
        seconds = bucket["seconds_in_bucket"]
        ax2 = ax.twinx()

        sns.lineplot(x=seconds, y=bucket["wap1"], ax=ax, color='blue')
        sns.lineplot(x=seconds, y=bucket["wap2"], ax=ax, color='green')
        sns.lineplot(x=seconds, y=bucket["wap_balance"], ax=ax2, color='orange')

        ax.set_title(f'Stock_id: {stock_id}', loc='left', fontweight='bold')
        ax.legend(['wap1', 'wap2'], bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        ax2.legend(["wap_balance"], bbox_to_anchor=(1.05, 0.78), loc=2, borderaxespad=0.)
    plt.show()

In [None]:
# Plot wap1 / wap2 and their gap for every stock present in `books`, for time_id 5.
stock_id_arr = books["stock_id"].unique()
plot_wap_balance(books, 5, stock_id_arr)

In [None]:
def log_return(list_stock_prices):
    """Return the first difference of the natural log of a price series.

    The input must yield an object with a ``.diff()`` method after ``np.log``
    (e.g. a pandas Series); the first element of the result is NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()


In [None]:
# Per (stock_id, time_id) bucket, compute the log returns of both WAP series.
#
# Both columns are computed on the full frame *before* any row is dropped:
# filtering on log_return1 first would make the log_return2 diff start one row
# later, so each bucket would lose a second row (and a valid log_return1 with it).
#
# `transform` returns a result aligned to the original index. `apply` on newer
# pandas versions may return a result indexed by the group keys, which does not
# align on assignment.
books['log_return1'] = books.groupby(['stock_id', 'time_id'])['wap1'].transform(log_return)
books['log_return2'] = books.groupby(['stock_id', 'time_id'])['wap2'].transform(log_return)
# The first row of every bucket has no previous price, so its returns are NaN.
books = books.dropna(subset=['log_return1', 'log_return2'])

In [None]:
def wap_vs_logretrun(books_df, time_id, stock_id_arr):
    """Plot ``wap1`` against its log return per stock for one time bucket.

    One subplot row is drawn per entry of ``stock_id_arr``. The left axis shows
    ``wap1`` (blue); a twin right axis shows ``log_return1`` (orange).

    Parameters
    ----------
    books_df : DataFrame with columns ``stock_id``, ``time_id``,
        ``seconds_in_bucket``, ``wap1`` and ``log_return1``.
    time_id : the time bucket to plot.
    stock_id_arr : sequence of stock ids; nothing is drawn when it is empty.
    """
    n_stocks = len(stock_id_arr)
    if n_stocks == 0:
        # plt.subplots rejects zero rows; there is nothing to draw anyway.
        return

    # squeeze=False keeps the axes 2-D, so a single stock can be indexed too
    # (by default subplots(1, 1) returns a bare Axes, not an array).
    fig, axes = plt.subplots(n_stocks, 1, figsize=(18, 16), squeeze=False)
    fig.tight_layout(pad=2.0)

    for ax, stock_id in zip(axes[:, 0], stock_id_arr):
        mask = np.logical_and(books_df["stock_id"] == stock_id, books_df["time_id"] == time_id)
        bucket = books_df[mask]  # filter once instead of once per plotted series
        seconds = bucket["seconds_in_bucket"]
        ax2 = ax.twinx()

        sns.lineplot(x=seconds, y=bucket["wap1"], ax=ax, color='blue')
        sns.lineplot(x=seconds, y=bucket["log_return1"], ax=ax2, color='orange')

        ax.set_title(f'Stock_id: {stock_id}', loc='left', fontweight='bold')
        ax.legend(['wap1'], bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        ax2.legend(["log_return1"], bbox_to_anchor=(1.05, 0.9), loc=2, borderaxespad=0.)
    plt.show()

In [None]:
# Plot wap1 against log_return1 for every stock present in `books`, for time_id 5.
stock_id_arr = books["stock_id"].unique()
wap_vs_logretrun(books, 5, stock_id_arr)

In [None]:
def realized_volatility(books_df, time_id, stock_id_arr):
    """Print the realized volatility of both log-return series per stock.

    For each stock in ``stock_id_arr`` the rows of ``books_df`` matching that
    stock and ``time_id`` are selected, and the square root of the sum of
    squared ``log_return1`` / ``log_return2`` values is printed. Nothing is
    returned.
    """
    for stock_id in stock_id_arr:
        is_stock = books_df["stock_id"] == stock_id
        is_bucket = books_df["time_id"] == time_id
        rows = books_df[is_stock & is_bucket]

        squared_sum1 = np.sum(rows['log_return1'] ** 2)
        squared_sum2 = np.sum(rows['log_return2'] ** 2)
        realized_vol1, realized_vol2 = np.sqrt(squared_sum1), np.sqrt(squared_sum2)

        print(f'Realized volatility1 for stock_id {stock_id} on time_id {time_id} is {realized_vol1}')
        print(f'Realized volatility2 for stock_id {stock_id} on time_id {time_id} is {realized_vol2}')

In [None]:
# Print realized volatility for every stock present in `books`, for time_id 5.
stock_id_arr = books["stock_id"].unique()
realized_volatility(books, 5, stock_id_arr)

In [None]:
def log_return_hist(books_df, time_id, stock_id_arr):
    """Draw a histogram of ``log_return1`` per stock for one time bucket.

    One subplot row is drawn per entry of ``stock_id_arr``. Each title reports
    the realized volatility (square root of the sum of squared log returns) and
    the standard deviation of the same values, both rounded to 5 decimals.

    Parameters
    ----------
    books_df : DataFrame with columns ``stock_id``, ``time_id`` and
        ``log_return1``.
    time_id : the time bucket to plot.
    stock_id_arr : sequence of stock ids; nothing is drawn when it is empty.
    """
    n_stocks = len(stock_id_arr)
    if n_stocks == 0:
        # plt.subplots rejects zero rows; there is nothing to draw anyway.
        return

    # squeeze=False keeps the axes 2-D, so a single stock can be indexed too
    # (by default subplots(1, 1) returns a bare Axes, not an array).
    fig, axes = plt.subplots(n_stocks, 1, figsize=(8, 20), squeeze=False)
    fig.tight_layout(pad=4.0)

    for ax, stock_id in zip(axes[:, 0], stock_id_arr):
        mask = np.logical_and(books_df["stock_id"] == stock_id, books_df["time_id"] == time_id)
        # Read from the frame that was passed in, not from a notebook global,
        # so the histogram and the statistics always describe the same data.
        returns = books_df[mask]["log_return1"]
        realized_vol1 = np.sqrt(np.sum(returns ** 2))
        _std = np.std(returns)

        sns.histplot(data=returns, ax=ax)
        ax.set_title(f'Realized volatility1 for stock_id {stock_id} on time_id {time_id} is {round(realized_vol1, 5)}, the std is: {round(_std, 5)}',
                     loc='left', fontweight='bold')

    plt.show()

In [None]:
# Draw the log_return1 histogram for every stock present in `books`, for time_id 5.
stock_id_arr = books["stock_id"].unique()
log_return_hist(books, 5, stock_id_arr)

In [None]:
# Per (stock_id, time_id) bucket, compute the log return of the trade price.
# `transform` returns a result aligned to the original index. `apply` on newer
# pandas versions may return a result indexed by the group keys, which does not
# align on assignment.
trades['log_return'] = trades.groupby(['stock_id', 'time_id'])['price'].transform(log_return)
# The first trade of every bucket has no previous price, so its return is NaN.
# .copy() makes the result an independent frame, so later column assignments
# do not trigger SettingWithCopyWarning.
trades = trades.dropna(subset=['log_return']).copy()

In [None]:
# Add an `amount` column: the product of each row's `price` and `size`.
trades.loc[:,'amount'] = trades['price']*trades['size']


In [None]:
# Display the trades frame with the added `log_return` and `amount` columns.
trades

In [None]:
def amount_vs_logretrun(books_df, time_id, stock_id_arr):
    """Plot ``amount`` against ``log_return`` per stock for one time bucket.

    One subplot row is drawn per entry of ``stock_id_arr``. The left axis shows
    ``amount`` (blue); a twin right axis shows ``log_return`` (orange).

    Parameters
    ----------
    books_df : DataFrame with columns ``stock_id``, ``time_id``,
        ``seconds_in_bucket``, ``amount`` and ``log_return``. Despite the
        parameter name, the notebook passes the trades frame here.
    time_id : the time bucket to plot.
    stock_id_arr : sequence of stock ids; nothing is drawn when it is empty.
    """
    n_stocks = len(stock_id_arr)
    if n_stocks == 0:
        # plt.subplots rejects zero rows; there is nothing to draw anyway.
        return

    # squeeze=False keeps the axes 2-D, so a single stock can be indexed too
    # (by default subplots(1, 1) returns a bare Axes, not an array).
    fig, axes = plt.subplots(n_stocks, 1, figsize=(18, 16), squeeze=False)
    fig.tight_layout(pad=2.0)

    for ax, stock_id in zip(axes[:, 0], stock_id_arr):
        mask = np.logical_and(books_df["stock_id"] == stock_id, books_df["time_id"] == time_id)
        bucket = books_df[mask]  # filter once instead of once per plotted series
        seconds = bucket["seconds_in_bucket"]
        ax2 = ax.twinx()

        sns.lineplot(x=seconds, y=bucket["amount"], ax=ax, color='blue')
        sns.lineplot(x=seconds, y=bucket["log_return"], ax=ax2, color='orange')

        ax.set_title(f'Stock_id: {stock_id}', loc='left', fontweight='bold')
        ax.legend(['amount'], bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        # Label matches the plotted column (`log_return`, not `log_return1`).
        ax2.legend(["log_return"], bbox_to_anchor=(1.05, 0.9), loc=2, borderaxespad=0.)
    plt.show()

In [None]:
# Plot amount against log_return for every stock present in `trades`, for time_id 5.
# The ids come from `trades` itself, so the plot does not depend on a variable
# left over from the books cells.
trades_id_arr = trades["stock_id"].unique()
amount_vs_logretrun(trades, 5, trades_id_arr)

In [None]:
def tendency(price, vol):
    """Return the volume-weighted sum of percentage price moves.

    Each consecutive price difference is divided by the *later* price and
    scaled by 100, multiplied by the volume at that later position, and the
    products are summed. ``price`` and ``vol`` must be sliceable sequences of
    equal length (the notebook passes NumPy arrays).
    """
    price_steps = np.diff(price)
    pct_moves = (price_steps / price[1:]) * 100
    return np.sum(pct_moves * vol[1:])

In [None]:
# Evaluate `tendency` on the trades of stock 98 in time bucket 5, passing the
# price and size columns as plain NumPy arrays.
mask = np.logical_and(trades["stock_id"]==98, trades["time_id"]==5)
tendency(trades[mask]["price"].values, trades[mask]['size'].values)