In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Let's start with exploratory data analysis (EDA)

In [None]:
#import lib
import pandas as pd
import numpy as np
import matplotlib as mpl
%matplotlib inline
import plotly.express as px
import plotly.graph_objects as gr_ob
import seaborn as sns
from matplotlib import pyplot as plt


In [None]:
# Load the competition's train.csv and report its dimensions.
TRAIN_CSV = '../input/optiver-realized-volatility-prediction/train.csv'
train = pd.read_csv(TRAIN_CSV)
print("train shape:", train.shape)
# Last expression of the cell: render five randomly chosen rows.
train.sample(n=5)

Group by stock ID and look at the target distribution.

In [None]:
# Number of rows per stock. Only the last expression of a cell is rendered
# automatically, so the table has to be displayed explicitly.
display(train.groupby('stock_id').size())

# Histogram of the prediction target over all rows of train.csv.
plt.figure(figsize=(14, 7))
sns.histplot(train['target'], label='Target distribution')
plt.title('Target distribution')
plt.show()

# time id distribution

In [None]:
# Distribution of time_id. `sns.distplot` is deprecated; `histplot` with a KDE
# overlay and density scaling is its documented replacement, and it is the
# same function already used for the target histogram.
plt.figure(figsize=(14, 7))
sns.histplot(train['time_id'], kde=True, stat='density', label='Time_id distribution')
plt.title('Time_id distribution')
plt.show()

# time_id vs. target
plt.figure(figsize=(14, 7))
sns.scatterplot(data=train, x="time_id", y="target")
plt.show()

# Let's read the order books. For more about order books, see: https://www.kaggle.com/jiashenliu/introduction-to-financial-concepts-and-data

In [None]:
# Read the order book and the trades of one stock and keep a single time bucket.
# stock_id stays a string: it is interpolated into the partition path and
# stored as a label column.
stock_id = '1'
time_id = 5

# Full order book / trade history of the selected stock.
book = pd.read_parquet(f'../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id={stock_id}')
trade = pd.read_parquet(f'../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id={stock_id}')

# Keep only the selected bucket. `.assign` returns a new frame, so the label
# column is not written into a filtered view of the full table (assigning with
# `.loc` on the filtered frame raises SettingWithCopyWarning).
book = book[book['time_id'] == time_id].assign(stock_id=stock_id)
trade = trade[trade['time_id'] == time_id].assign(stock_id=stock_id)

# Show the first rows of the selected order-book slice.
book.head(10)

# see the book histogram

In [None]:
# Draw a grid of histograms for the columns of the selected order-book slice.
book_hist_size = (16, 10)
book.hist(figsize=book_hist_size)
plt.show()

# Let's investigate using pair plots and histograms

In [None]:
# Pairwise relationships between time, prices and sizes:
# first for the ask side, then for the bid side of the book.
ask_columns = ['seconds_in_bucket', 'ask_price1', 'ask_price2', 'ask_size1', 'ask_size2']
bid_columns = ['seconds_in_bucket', 'bid_price1', 'bid_price2', 'bid_size1', 'bid_size2']
sns.pairplot(book[ask_columns], diag_kind="hist", height=4)
sns.pairplot(book[bid_columns], diag_kind="hist", height=4)

# Let's investigate time-series plots

In [None]:
# Level-1 and level-2 ask prices against seconds_in_bucket, on one figure.
ask_price_columns = ['ask_price1', 'ask_price2']
plt.figure(figsize=(10, 5))
for column in ask_price_columns:
    sns.lineplot(data=book, x="seconds_in_bucket", y=column)
plt.legend(labels=ask_price_columns)
plt.show()

In [None]:
# One figure for the ask sizes, then one for the bid sizes; each figure shows
# level 1 and level 2 against seconds_in_bucket.
for size_columns in (['ask_size1', 'ask_size2'], ['bid_size1', 'bid_size2']):
    plt.figure(figsize=(10, 5))
    for column in size_columns:
        sns.lineplot(data=book, x="seconds_in_bucket", y=column)
    plt.legend(labels=size_columns)
    plt.show()


# Let's investigate the trade book

In [None]:
# Draw a grid of histograms for the columns of the selected trade slice.
trade_hist_size = (16, 12)
trade.hist(figsize=trade_hist_size)
plt.show()

In [None]:
# Pairwise view of trade time, price and size.
trade_columns = ['seconds_in_bucket', 'price', 'size']
sns.pairplot(trade[trade_columns], diag_kind="hist", height=3)

# Trade size, then trade price, against seconds_in_bucket (one figure each).
for column in ('size', 'price'):
    plt.figure(figsize=(10, 5))
    sns.lineplot(data=trade, x="seconds_in_bucket", y=column)
    plt.legend(labels=[column])
    plt.show()

# Let's analyze the order book by WAP (weighted average price)

In [None]:
# Weighted average price (WAP) of the top book level: each side's price is
# weighted by the size quoted on the opposite side.
# `.assign` builds a new frame, so no column is written into a filtered slice
# (which would raise SettingWithCopyWarning).
top_level_size = book['bid_size1'] + book['ask_size1']
book = book.assign(
    wap=(book['bid_price1'] * book['ask_size1'] + book['ask_price1'] * book['bid_size1']) / top_level_size
)

# WAP against seconds_in_bucket; the title follows the selected stock / bucket.
plt.figure(figsize=(10, 5))
sns.lineplot(data=book, x="seconds_in_bucket", y="wap")
plt.legend(labels=['wap'])
plt.title(f"WAP of stock_id_{stock_id}, time_id_{time_id}")
plt.show()


# Log Returns

In [None]:
# Log return of the WAP between consecutive order-book rows.
book = book.assign(log_return=np.log(book['wap']).diff())
# Rows without a return (at least the first row, which has no predecessor) are dropped.
book = book[book['log_return'].notnull()]
# Not the last expression of the cell, so it must be displayed explicitly.
display(book.head(5))

# Log returns against seconds_in_bucket; the title follows the selected stock / bucket.
plt.figure(figsize=(10, 5))
sns.lineplot(data=book, x="seconds_in_bucket", y="log_return")
plt.legend(labels=['log_return'])
plt.title(f"log_return of stock_id_{stock_id}, time_id_{time_id}")
plt.show()

### Now we try to build the prediction model

In [None]:
#important lib
import plotly.express as px
from sklearn.metrics import r2_score
import os
import glob
from tqdm import tqdm
import lightgbm as lgbm
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge

from joblib import Parallel, delayed

In [None]:
#some impfunctions
#https://www.kaggle.com/konradb/naive-optuna-tuned-stacked-ensemble-model
class CFG:
    """Notebook-wide settings shared by the data-loading and modelling cells."""

    # Number of cross-validation folds.
    nfolds = 5
    # Folder holding the competition files; note the trailing slash.
    data_dir = '../input/optiver-realized-volatility-prediction/'
    
def log_return(list_stock_prices):
    """Return the row-to-row difference of the natural log of a price series.

    The input must expose `.diff()` after `np.log` (e.g. a pandas Series);
    the first element of the result is NaN because it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def rv(series_log_return):
    """Realized volatility: square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    total = np.sum(squared_returns)
    return np.sqrt(total)


def rv2(series_log_return):
    """Square root of the sum of squared log returns.

    Computes exactly the same quantity as `rv`.
    """
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)


# taken from https://www.kaggle.com/yus002/realized-volatility-prediction-lgbm-train
def my_metrics(y_true, y_pred):
    """Root mean squared percentage error of `y_pred` relative to `y_true`.

    The error of each element is divided by the matching `y_true` value
    before squaring and averaging.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)
def rmspe(y_true, y_pred):
    """Wrap `my_metrics` in the tuple shape used for a custom eval metric.

    Returns the metric name, the metric value and `False`; the last item is
    presumably LightGBM's "higher is better" flag, since this function is
    passed as `eval_metric` when the model is fitted.
    """
    return 'rmspe', my_metrics(y_true, y_pred), False
def get_stock_stat(stock_id : int, dataType = 'train'):
    """Build per-time_id order-book features for one stock.

    Parameters
    ----------
    stock_id : int
        Stock whose partition of the book data set is read.
    dataType : str, default 'train'
        Selects the `book_{dataType}.parquet` data set under `CFG.data_dir`
        (the notebook calls this with 'train' and 'test').

    Returns
    -------
    pandas.DataFrame
        One row per time_id with the columns, in this order: time_id,
        v_spread_a, v_spread_b, h_spread_l2, h_spread_l1 (maximum per bucket),
        vol_wap4, vol_wap3, vol_wap2, vol_wap1 (`rv` of the log returns per
        bucket), bas (mean per bucket) and stock_id.
    """
    df_book = pd.read_parquet(f'{CFG.data_dir}book_{dataType}.parquet/stock_id={stock_id}/')
    # `sort_values` returns a new frame, so the result must be kept. The rows
    # are renumbered so that "previous row" below means "previous snapshot of
    # the same bucket".
    df_book = df_book.sort_values(by=['time_id', 'seconds_in_bucket']).reset_index(drop=True)

    bid_p1, ask_p1 = df_book['bid_price1'], df_book['ask_price1']
    bid_p2, ask_p2 = df_book['bid_price2'], df_book['ask_price2']
    bid_s1, ask_s1 = df_book['bid_size1'], df_book['ask_size1']
    bid_s2, ask_s2 = df_book['bid_size2'], df_book['ask_size2']

    # Size-weighted prices: each side's price is weighted by the opposite side's size.
    num1, den1 = bid_p1 * ask_s1 + ask_p1 * bid_s1, bid_s1 + ask_s1
    num2, den2 = bid_p2 * ask_s2 + ask_p2 * bid_s2, bid_s2 + ask_s2
    df_book['wap1'] = num1 / den1                      # level 1 only
    df_book['wap2'] = num2 / den2                      # level 2 only
    df_book['wap3'] = (num1 + num2) / (den1 + den2)    # both levels pooled
    df_book['wap4'] = (df_book['wap1'] + df_book['wap2']) / 2  # mean of the two levels

    # Log return of every WAP variant inside each time bucket. Grouping the
    # logged series and differencing keeps the result aligned with df_book's
    # index; the first row of each bucket has no predecessor and is set to 0.
    for wap_name in ('wap1', 'wap2', 'wap3', 'wap4'):
        df_book[f'vol_{wap_name}'] = (
            np.log(df_book[wap_name]).groupby(df_book['time_id']).diff().fillna(0)
        )

    # Relative gap between the lowest ask and the highest bid of the two levels.
    lowest_ask = df_book[['ask_price1', 'ask_price2']].min(axis = 1)
    highest_bid = df_book[['bid_price1', 'bid_price2']].max(axis = 1)
    df_book['bas'] = lowest_ask / highest_bid - 1

    # Spreads: "h" compares ask with bid on one level, "v" compares the two
    # levels on one side.
    df_book['h_spread_l1'] = ask_p1 - bid_p1
    df_book['h_spread_l2'] = ask_p2 - bid_p2
    df_book['v_spread_b'] = bid_p1 - bid_p2
    # NOTE(review): this used to be ask_price1 - bid_price2, which mixes the two
    # sides and looks like a copy-paste slip. The ask-side gap is written as
    # level 2 minus level 1 so that, like v_spread_b, it measures the distance
    # between the two levels - confirm this is the intended feature.
    df_book['v_spread_a'] = ask_p2 - ask_p1

    # One aggregation pass per bucket. The keyword order fixes the output
    # column order, which the model cell relies on for its feature order.
    stock_stat = df_book.groupby('time_id', as_index = False).agg(
        v_spread_a = ('v_spread_a', 'max'),
        v_spread_b = ('v_spread_b', 'max'),
        h_spread_l2 = ('h_spread_l2', 'max'),
        h_spread_l1 = ('h_spread_l1', 'max'),
        vol_wap4 = ('vol_wap4', rv),
        vol_wap3 = ('vol_wap3', rv),
        vol_wap2 = ('vol_wap2', rv),
        vol_wap1 = ('vol_wap1', rv),
        bas = ('bas', 'mean'),
    )

    stock_stat['stock_id'] = stock_id
    return stock_stat
def get_dataSet(stock_ids : list, dataType = 'train'):
    """Compute `get_stock_stat` for every stock and stack the results.

    The per-stock calls are dispatched through joblib with `n_jobs=-1`; the
    resulting frames are concatenated with a fresh 0..n-1 index.
    """
    jobs = [delayed(get_stock_stat)(one_stock, dataType) for one_stock in stock_ids]
    per_stock_frames = Parallel(n_jobs=-1)(jobs)
    return pd.concat(per_stock_frames, ignore_index = True)


In [None]:
# Reload the training targets.
train = pd.read_csv(CFG.data_dir + 'train.csv')
# Quick look at the first rows of stock 0. This is not the last expression of
# the cell, so it has to be displayed explicitly to show up.
display(train.loc[train.stock_id == 0].head(3))

# Prepare the training data set: compute the book features of every stock and
# attach them to the targets by (stock_id, time_id).
train_stat_df = get_dataSet(stock_ids = train['stock_id'].unique(), dataType = 'train')
train_dataSet = pd.merge(train, train_stat_df, on = ['stock_id', 'time_id'], how = 'left')

In [None]:
# Build the test table: read test.csv, compute the book features of its
# stocks and attach them by (stock_id, time_id).
test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')

test_stock_ids = test['stock_id'].unique()
test_stat_df = get_dataSet(stock_ids=test_stock_ids, dataType='test')
test_dataSet = test.merge(test_stat_df, on=['stock_id', 'time_id'], how='left')

# Build the model

In [None]:
# Every column except the bucket key and the label is a model input
# (stock_id is therefore used as a feature).
covariates = [f for f in train_dataSet.columns if f not in ['time_id', 'target']]

# `rmspe`, the custom evaluation metric used below, is defined once in the
# helper cell earlier in the notebook; it is deliberately not redefined here.

# prval: out-of-fold predictions for the training rows.
# prfull: test predictions averaged over the folds.
prval = np.zeros((train_dataSet.shape[0], 1))
prfull = np.zeros((test_dataSet.shape[0], 1))

xdat = train_dataSet[covariates].copy()
ydat = train_dataSet['target'].copy()
xtest = test_dataSet[covariates].copy()

params = {'metric': 'rmse','reg_alpha': 0.9,  'reg_lambda': 5.61,
          'num_leaves': 56, 'learning_rate': 0.08,
          'max_depth': 5, 'n_estimators': 1000, 'min_child_weight': 0.11,
          'subsample': 0.7, 'colsample_bytree': 0.8,  'min_child_samples': 28}

kf = KFold(n_splits = CFG.nfolds, shuffle = True, random_state = 42)
for id0, id1 in kf.split(xdat):
    # KFold yields positional row numbers, so rows are selected by position
    # rather than by index label.
    x0, x1 = xdat.iloc[id0], xdat.iloc[id1]
    y0, y1 = ydat.iloc[id0], ydat.iloc[id1]

    model = lgbm.LGBMRegressor(**params)
    # NOTE(review): `early_stopping_rounds` and `verbose` were removed from
    # fit() in LightGBM 4.0. On newer versions pass
    # callbacks=[lgbm.early_stopping(50), lgbm.log_evaluation(250)] instead.
    model.fit(x0, y0, eval_set=[(x0, y0), (x1, y1)], eval_metric = rmspe,
              early_stopping_rounds= 50,  verbose= 250)
    prval[id1, 0] = model.predict(x1)
    prfull[:, 0] += model.predict(xtest) / CFG.nfolds

del x0, x1, y0, y1, id0, id1

# Feature importance of the model fitted in the last fold.
lgbm.plot_importance(model, max_num_features= 25)


In [None]:
# Put the true targets and the out-of-fold predictions side by side.
xref = pd.DataFrame({'ydat': ydat})
xref['prval'] = prval
del xdat, ydat

# Relative error per row, then R2 and RMSPE of the out-of-fold predictions.
relative_error = (xref['ydat'] - xref['prval']) / xref['ydat']
R2 = round(r2_score(y_true=xref['ydat'], y_pred=xref['prval']), 3)
RMSPE = np.round(np.sqrt(np.mean(np.square(relative_error))), 4)
print(f'Performance of the naive prediction: R2 score: {R2}, RMSPE: {RMSPE}')

# My Submission

In [None]:
# Store the fold-averaged predictions on the test table, then write the two
# submission columns to submission.csv without the index.
test_dataSet['target'] = prfull
submission_columns = ['row_id', 'target']
test_dataSet[submission_columns].to_csv('submission.csv', index=False)

Ref:
https://www.kaggle.com/damoonshahhosseini/volatility-prediction