# Optimal Realized Volatility Prediction

This notebook walks through several techniques used in the modeling process. If you are interested in an EDA of this data, please check [here](https://www.kaggle.com/hyewon328/understand-and-visualize-volatility-data)!

## Process
1. [Preprocessing](#pre)
2. [Feature Selection](#fs)
3. [Bayesian Optimization](#opt)
4. [Modeling](#model)
5. [Prediction & Submission](#pred)

# Load package & data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import os
import glob
from joblib import Parallel, delayed

import eli5
from eli5.sklearn import PermutationImportance
from bayes_opt import BayesianOptimization

from tqdm import tqdm
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor


import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the three competition CSVs in one pass: training targets, the test
# row listing, and the sample submission template.
train, test, sub = map(
    pd.read_csv,
    (
        "../input/optiver-realized-volatility-prediction/train.csv",
        "../input/optiver-realized-volatility-prediction/test.csv",
        "../input/optiver-realized-volatility-prediction/sample_submission.csv",
    ),
)

In [None]:
# Locations of the order-book and trade parquet datasets (train and test).
# Each one is listed as a directory below and joined with a per-stock
# sub-path in pre_data.
book_train_filepath = "/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet"
trade_train_filepath = "/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet"
book_test_filepath = "/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet"
trade_test_filepath = "/kaggle/input/optiver-realized-volatility-prediction/trade_test.parquet"

# Directory listings of the four datasets, in the same order as the paths above
(book_train_filenames,
 trade_train_filenames,
 book_test_filenames,
 trade_test_filenames) = (
    os.listdir(parquet_dir)
    for parquet_dir in (book_train_filepath,
                        trade_train_filepath,
                        book_test_filepath,
                        trade_test_filepath)
)

## Data Preprocessing <a class="anchor" id="pre"></a>

In the **preprocessing** step, we generate additional features for the LightGBM model.

Reference:
<https://www.kaggle.com/manels/lgb-starter>

In [None]:
def log_return(list_stock_prices):
    """Return the first difference of the natural log of a price series.

    The input must support ``.diff()`` after ``np.log`` (e.g. a pandas
    Series); the first element of the result is NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    sum_of_squares = np.sum(squared_returns)
    return np.sqrt(sum_of_squares)

def RMSPE(true, pred):
    """Root mean squared percentage error of ``pred`` relative to ``true``."""
    pct_error = (true - pred) / true
    mean_sq_pct_error = np.mean(np.square(pct_error))
    return np.sqrt(mean_sq_pct_error)

def feval_RMSPE(preds, train_data):
    """Custom evaluation callback passed as ``feval`` to ``lgb.train``.

    Returns a ``(name, value, is_higher_better)`` tuple: the metric name
    ``'RMSPE'``, the RMSPE of ``preds`` against the dataset labels rounded
    to 5 decimals, and ``False``.
    """
    score = RMSPE(true = train_data.get_label(), pred = preds)
    return 'RMSPE', round(score, 5), False

In [None]:
def pre_data(stock_id, train = True):
    """Build per-(stock_id, time_id) features from one stock's book and trade data.

    Parameters
    ----------
    stock_id : int
        Stock whose ``stock_id=<id>`` sub-path is read from the book and
        trade parquet locations.
    train : bool, default True
        Read from the train parquet locations when truthy, otherwise from
        the test ones.

    Returns
    -------
    pandas.DataFrame
        One row per (stock_id, time_id) found in the book data, with
        ``stock_id`` and ``time_id`` as regular columns, followed by the book
        aggregates (``wap_balance_mean``, ``volatility1``, ``volatility2``,
        ``spread1_mean``, ``spread2_mean``, ``net_size1_mean``,
        ``net_size2_mean``) and the trade aggregates (``trade_price_mean``,
        ``trade_size_mean``, ``trade_order_mean``). Trade columns are NaN for
        time_ids that have book rows but no trade rows (left merge).
    """
    if train:
        book_filepath = book_train_filepath
        trade_filepath = trade_train_filepath
    else:
        book_filepath = book_test_filepath
        trade_filepath = trade_test_filepath

    # ---- order book ----
    book_path = os.path.join(book_filepath, f"stock_id={stock_id}")
    data = pd.read_parquet(book_path)
    data['stock_id'] = stock_id

    # Weighted average price for book levels 1 and 2: each side's price is
    # weighted by the opposite side's size.
    data['wap1'] = (data['bid_price1'] * data['ask_size1'] + data['ask_price1'] * data['bid_size1']) / (data['bid_size1'] + data['ask_size1'])
    data['wap2'] = (data['bid_price2'] * data['ask_size2'] + data['ask_price2'] * data['bid_size2']) / (data['bid_size2'] + data['ask_size2'])
    data['wap_balance'] = data['wap1'] - data['wap2']

    # Log returns are computed within each time_id so the diff never crosses
    # a bucket boundary. transform() keeps the original row index, so the
    # result can be assigned back regardless of the pandas version
    # (groupby.apply prepends the group key to the index on pandas >= 2.0).
    data['log_return1'] = data.groupby('time_id')['wap1'].transform(log_return)
    data['log_return2'] = data.groupby('time_id')['wap2'].transform(log_return)

    # Relative bid-ask spread at both levels. The parentheses matter: without
    # them the level-2 expression collapses to ask_price2 - 1.
    data['spread1'] = (data['ask_price1'] - data['bid_price1']) / data['bid_price1']
    data['spread2'] = (data['ask_price2'] - data['bid_price2']) / data['bid_price2']

    data['net_size1'] = data['bid_size1'] - data['ask_size1']
    data['net_size2'] = data['bid_size2'] - data['ask_size2']

    # reset_index() turns the group keys into ordinary columns so the merge
    # below joins column-to-column.
    data = data.groupby(['stock_id', 'time_id']).agg(wap_balance_mean = ('wap_balance', 'mean'),
                                                     volatility1 = ('log_return1', realized_volatility),
                                                     volatility2 = ('log_return2', realized_volatility),
                                                     spread1_mean = ('spread1', 'mean'),
                                                     spread2_mean = ('spread2', 'mean'),
                                                     net_size1_mean = ('net_size1', 'mean'),
                                                     net_size2_mean = ('net_size2', 'mean')).reset_index()

    # ---- trades ----
    trade_path = os.path.join(trade_filepath, f"stock_id={stock_id}")
    trade_data = pd.read_parquet(trade_path)
    trade_data['stock_id'] = stock_id

    trade_data = trade_data.groupby(['stock_id', 'time_id']).agg(trade_price_mean = ('price', 'mean'),
                                                                 trade_size_mean = ('size', 'mean'),
                                                                 trade_order_mean = ('order_count', 'mean')).reset_index()

    # Left merge: keep every book bucket even when it has no trades
    final = data.merge(trade_data, how = 'left', on = ['stock_id', 'time_id'])

    return final


def get_dataset(id_list, train = True):
    """Run ``pre_data`` for every stock in ``id_list`` in parallel and stack the results.

    Parameters
    ----------
    id_list : iterable of int
        Stock ids to process.
    train : bool, default True
        Forwarded to ``pre_data`` to select the train or test parquet data.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-stock frames with a fresh index.
    """
    stock = Parallel(n_jobs=-1)(
        # Forward the caller's flag; hard-coding True here would silently
        # read train data even when test features were requested.
        delayed(pre_data)(stock_id, train = train)
        for stock_id in id_list
    )

    stock_df = pd.concat(stock, ignore_index = True)

    return stock_df

In [None]:
# Train: features for every stock in train.csv, joined onto the target rows
stock_train_df = get_dataset(train['stock_id'].unique(), train = True)
train_df = train.merge(stock_train_df, how = 'left', on = ['stock_id', 'time_id'])

# Test: build features for every stock listed in test.csv rather than stock 0
# only; otherwise every other stock's rows would consist purely of the -999
# filler below. pre_data is called directly with train = False so the test
# parquet data is the one being read.
stock_test_df = pd.concat(
    Parallel(n_jobs=-1)(
        delayed(pre_data)(stock_id, train = False)
        for stock_id in test['stock_id'].unique()
    ),
    ignore_index = True,
)
# NOTE(review): test rows without features are filled with -999 while train
# keeps NaN for the same situation — confirm this asymmetry is intended.
test_df = test.merge(stock_test_df, how = 'left', on = ['stock_id', 'time_id']).fillna(-999)

In [None]:
# Lower-triangle correlation heatmap of every column in train_df
plt.figure(figsize = (8, 8))
corr = train_df.corr()

# Hide the diagonal and upper triangle (mirror of the lower one).
# The builtin bool is used because the np.bool alias was removed from NumPy.
mask = np.triu(np.ones(corr.shape, dtype = bool))

sns.heatmap(corr, cmap = 'coolwarm', mask = mask, linewidth = 0.5, vmin = -1, vmax = 1,
           cbar_kws = {'shrink': .5})
plt.title('Correlation Heatmap of train features', fontsize = 20, fontweight = 'bold')
plt.show()

In [None]:
# Model inputs (everything except the target) and the target column
target = train_df['target']
features = train_df.drop(columns = ['target'])

# Shuffled 5-fold splitter, consumed by the K-fold training loop
kfold = KFold(n_splits = 5, shuffle = True, random_state = 0)

# LightGBM regressor with default hyperparameters
lgbm = LGBMRegressor(random_state = 0)

# Single 80/20 hold-out split
X_train, X_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size = 0.2,
    random_state = 0,
)

## Hyperparameter Tuning: Bayesian Optimization <a class="anchor" id="opt"></a>
To select the optimal parameters, we need to conduct hyperparameter tuning.

**GridSearchCV** takes too long because it tests every possible combination of parameters. **RandomizedSearchCV** takes less time, but because it samples parameter sets at random (it does not test every combination), the selected parameters may not be optimal. Neither algorithm makes use of information from previous evaluations.

**Bayesian Optimization** keeps track of past evaluation results and uses them to form a probabilistic model that maps hyperparameters to a probability of a score on the objective function. It is also faster than GridSearchCV and more precise than RandomizedSearchCV.
(Ref: <https://towardsdatascience.com/a-conceptual-explanation-of-bayesian-model-based-hyperparameter-optimization-for-machine-learning-b8172278050f>)


In [None]:
# 80/20 hold-out split read by the lgbm_cv objective below
X_train, X_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size = 0.2,
    random_state = 0,
)


def lgbm_cv(learning_rate, n_estimators, max_depth, num_leaves, subsample, min_data_in_leaf, silent = True):
    """Objective for Bayesian optimization: train LightGBM once and score the hold-out split.

    Reads the module-level ``X_train`` / ``X_val`` / ``y_train`` / ``y_val``
    split. The optimizer proposes floats, so the integer hyperparameters are
    truncated with ``int()`` before being handed to LightGBM.

    Parameters
    ----------
    learning_rate, subsample : float
        Passed to LightGBM as given.
    n_estimators, max_depth, num_leaves, min_data_in_leaf : float
        Truncated to int before use.
    silent : bool, default True
        Unused; kept so existing callers keep working.

    Returns
    -------
    float
        The **negative** RMSPE on the validation split. ``BayesianOptimization``
        maximizes its target, so the error is negated to make "higher is
        better"; returning the raw RMSPE would steer the search toward the
        worst parameters.
    """
    params = {'learning_rate': learning_rate,
          'n_estimators': int(n_estimators),
          'max_depth': int(max_depth),
          'num_leaves': int(num_leaves),
          'subsample': subsample,
          'min_data_in_leaf': int(min_data_in_leaf),
          'verbose': -1,
          'force_col_wise': True
             }

    # Weighting each sample by 1 / target**2 turns squared error into squared
    # percentage error: w * (y - p)**2 == ((y - p) / y)**2.
    lgbm_train = lgb.Dataset(X_train, label = y_train, categorical_feature = ['stock_id'], weight = 1/np.square(y_train))
    lgbm_val = lgb.Dataset(X_val, label = y_val, categorical_feature = ['stock_id'], weight = 1/np.square(y_val))

    model = lgb.train(params = params,
                      train_set = lgbm_train,
                      valid_sets = [lgbm_train, lgbm_val],
                      feval = feval_RMSPE,
                      verbose_eval = 1,
                      early_stopping_rounds = 100,
                      num_boost_round = 1000)

    pred = model.predict(X_val)
    rmspe_score = RMSPE(y_val, pred)

    # Negated so that maximizing the objective minimizes RMSPE
    return -rmspe_score

In [None]:
# Search bounds (low, high) for each hyperparameter explored by the optimizer
pbounds = dict(
    learning_rate = (0.005, 0.1),
    n_estimators = (1000, 2000),
    max_depth = (10, 20),
    num_leaves = (7, 14),
    subsample = (0.5, 0.9),
    min_data_in_leaf = (5, 20),
)

lgbm_bo = BayesianOptimization(
    f = lgbm_cv,
    pbounds = pbounds,
    verbose = 2,
    random_state = 0,
)

# init_points: number of initial random-search evaluations
# n_iter:      number of optimization iterations
# acq:         acquisition function ('ei' is used here)
# xi:          exploration parameter
lgbm_bo.maximize(
    init_points = 2,
    n_iter = 10,
    acq = 'ei',
    xi = 0.01,
)

In [None]:
# Best point found by the optimizer; its 'params' entry feeds the final model
opt_params = lgbm_bo.max['params']

print(lgbm_bo.max)

In [None]:
# GridSearchCV

# scoring function
#rmspe = make_scorer(RMSPE, greater_is_better = False)

#X_train, X_val, y_train, y_val = train_test_split(new_features, target, test_size = 0.2, random_state = 0)

#grid_model = GridSearchCV(lgbm, param_grid = params, cv = 5, scoring = rmspe)
#grid_model.fit(X_train, y_train)

#print('Best parameter: ', grid_model.best_params_)
#print('Best score: ', grid_model.best_score_)

## LGBM Modeling <a class="anchor" id="model"></a>

In [None]:
# Final hyperparameters taken from the Bayesian optimization result.
# Integer parameters are truncated with int(), exactly as lgbm_cv does, so
# the models trained here use the same integer values the optimizer scored
# (rounding could, for instance, turn 13.7 leaves into 14 instead of 13).
params = {'learning_rate': opt_params['learning_rate'],
          'objective': 'regression',
          'n_estimators': int(opt_params['n_estimators']),
          'max_depth': int(opt_params['max_depth']),
          'num_leaves': int(opt_params['num_leaves']),
          'min_data_in_leaf': int(opt_params['min_data_in_leaf']),
          'subsample': opt_params['subsample'],
          'force_col_wise': True,
          'verbose': -1
          }


# K-fold training: one model per fold, scored by RMSPE on the held-out fold
rmspe_list = []
model_list = []
for i, (train_idx, val_idx) in enumerate(kfold.split(features)):
    print(f'################# {i+1}th Fold #################')
    # kfold.split yields positional indices, so both frames are sliced with
    # .iloc; label-based target[...] is only right for a default RangeIndex.
    X_train, X_val = features.iloc[train_idx, :], features.iloc[val_idx, :]
    y_train, y_val = target.iloc[train_idx], target.iloc[val_idx]

    # 1 / target**2 weights make the squared-error objective a squared
    # percentage error.
    train_set = lgb.Dataset(X_train, label = y_train, categorical_feature = ['stock_id'], weight = 1/np.square(y_train))
    val_set = lgb.Dataset(X_val, label = y_val, categorical_feature = ['stock_id'], weight = 1/np.square(y_val))

    model = lgb.train(params = params,
                      train_set = train_set,
                      valid_sets = [train_set, val_set],
                      feval = feval_RMSPE,
                      early_stopping_rounds = 100,
                      num_boost_round = 1000,
                      verbose_eval = 100)

    model_list.append(model)

    pred = model.predict(X_val)
    rmspe_score = RMSPE(y_val, pred)

    print(f'RMSPE: {np.round(rmspe_score, 4)}')
    rmspe_list.append(rmspe_score)

In [None]:
# Gain-based feature importance of `model`, sorted from most to least important
feat_imp = (
    pd.DataFrame({
        'feature': features.columns.tolist(),
        'importance': model.feature_importance(importance_type = 'gain'),
    })
    .sort_values(by = ['importance'], ascending = False)
    .reset_index(drop = True)
)
feat_imp

In [None]:
# Horizontal bar chart of the feature-importance table
plt.figure(figsize = (8, 6))
ax = sns.barplot(
    data = feat_imp,
    x = 'importance',
    y = 'feature',
    color = '#006699',
    edgecolor = 'black',
)

# Drop the right and top frame lines
for i in ('right', 'top'):
    ax.spines[i].set_visible(False)

label_font = dict(fontsize = 10, fontweight = 'bold')
plt.title('Feature Importance', fontsize = 20, fontweight = 'bold')
plt.xlabel('Importance', **label_font)
plt.ylabel('Feature', **label_font)
plt.show()

In [None]:
# RMSPE of each fold with the cross-fold mean as a dotted reference line.
# Fold positions and y-limits are derived from rmspe_list so the figure stays
# correct for any number of folds and any score range.
fold_ids = range(1, len(rmspe_list) + 1)
mean_rmspe = np.mean(rmspe_list)

# Vertical margin around the scores: 20% of their spread, at least 0.005
y_pad = max(0.2 * (max(rmspe_list) - min(rmspe_list)), 0.005)

ax = plt.subplot(1, 1, 1)
ax.fill_between(fold_ids, 0, rmspe_list, alpha = 0.4, color = '#d9e6f2')
ax.scatter(fold_ids, rmspe_list, color = '#006699')
plt.axhline(y = mean_rmspe, color = '#cc0000', linestyle = ':', linewidth = 2)
plt.text(1, mean_rmspe + 0.3 * y_pad, 'mean',
         bbox = dict(facecolor ='#cc0000', edgecolor='#cc0000', boxstyle='round', alpha = 0.2))

ax.set_ylim([min(rmspe_list) - y_pad, max(rmspe_list) + y_pad])
for side in ['left', 'right', 'top']:
    ax.spines[side].set_visible(False)

ax.set_xlabel('Fold', fontweight = 'bold')
ax.set_ylabel('RMSPE', fontweight = 'bold')
ax.set_xticks(fold_ids)
plt.title('RMSPE in each fold', fontsize = 15, fontweight = 'bold', pad = 20)
plt.show()

## Prediction & Submission <a class="anchor" id="pred"></a>

In [None]:
# Model input for the test rows: everything except the row identifier
test_df_new = test_df.drop(columns = ['row_id'])

In [None]:
# Average the predictions of all fold models (equal weight per model)
n_models = len(model_list)
test_pred = np.zeros(len(test_df_new))
for model in model_list:
    pred = model.predict(test_df_new)
    test_pred = test_pred + pred / n_models

In [None]:
# Build the submission frame without mutating test_df: adding a 'target'
# column to test_df in place would leak into test_df_new (and break
# model.predict) if the cell that derives it from test_df were re-run.
sub = test_df[['row_id']].assign(target = test_pred)
sub

In [None]:
# Write the submission file without the index column
sub.to_csv(
    'submission.csv',
    index = False,
)