 # Volatility Prediction

## Introduction
 
**Task**: To predict volatility

**Volatility**: The rate at which the price of a security increases or decreases over a given set of returns.

> A higher volatility means that a security's value can potentially be spread out over a larger range of values. This means that the price of the security can change dramatically over a short time period in either direction.

> A lower volatility means that a security's value does not fluctuate dramatically, and tends to be more steady.

**Bid Price**: The bid price is the amount of money a buyer is willing to pay for a security.

**Ask Price**: The ask price is the lowest price a seller of a stock is willing to accept for a share of that given stock.

**Bid Size**: Bid size represents the quantity of a security that investors are willing to purchase at a specified bid price. 

**Ask Size**: The ask size is the amount of a security that a market maker is offering to sell at the ask price.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.style as style
style.use('fivethirtyeight')
import plotly.express as px
import seaborn as sns
sns.set_style('whitegrid')

import os
import warnings
warnings.filterwarnings('ignore')
from joblib import Parallel, delayed

from xgboost import XGBRegressor
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.metrics import r2_score
import lightgbm as lgbm
from lightgbm import LGBMRegressor

In [None]:
# Load the competition inputs.
# CSV tables: train/test row listings and the sample submission file.
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
submission = pd.read_csv('../input/optiver-realized-volatility-prediction/sample_submission.csv')

# Order-book parquet datasets.
book_train = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet')
book_test = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_test.parquet')

# Trade parquet datasets.
trade_train = pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_train.parquet')
trade_test = pd.read_parquet('../input/optiver-realized-volatility-prediction/trade_test.parquet')

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the order-book data.
book_train.head()

In [None]:
# Preview the first rows of the trade data.
trade_train.head()

## Data Visualization

In [None]:
# Number of training rows available for each stock_id.
train['stock_id'].value_counts()

In [None]:
# Histogram of the training target; the trailing semicolon keeps the
# notebook from echoing the returned Axes object.
sns.histplot(train['target']);

### Bid price and Ask Price

In [None]:
def visualize_bid_ask(df, groupby, bid_col, ask_col):
    """Scatter-plot the per-group mean of a bid column against an ask column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``groupby``, ``bid_col`` and ``ask_col`` columns.
    groupby : str
        Name of the column to group by before averaging.
    bid_col : str
        Column plotted on the x axis (averaged per group).
    ask_col : str
        Column plotted on the y axis (averaged per group).

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Select the two columns with a list: indexing a GroupBy with a bare
    # tuple (``[bid_col, ask_col]``) is rejected by recent pandas releases.
    group = df.groupby([groupby])[[bid_col, ask_col]].mean()

    # Plot bid vs ask
    fig = px.scatter(group, x=bid_col, y=ask_col, template='plotly_dark+presentation', width=1000, height=400)
    fig.show()

In [None]:
# Level-1 quotes: mean bid vs mean ask per stock.
visualize_bid_ask(book_train, groupby='stock_id', bid_col='bid_price1', ask_col='ask_price1')

In [None]:
# Level-2 quotes: mean bid vs mean ask per stock.
visualize_bid_ask(book_train, groupby='stock_id', bid_col='bid_price2', ask_col='ask_price2')

## High-Volatility and Low-Volatility Stocks

In [None]:
# Training rows ordered from the highest target to the lowest.
volatility = train.sort_values(by='target', ascending=False)

In [None]:
# Bar plot of the 20 rows with the largest target values.
top_rows = volatility.head(20)
plt.figure(figsize=(10, 5))
sns.barplot(data=top_rows, x='stock_id', y='target');
plt.title('High Volatile Stocks');

In [None]:
# Bar plot of the 50 rows with the smallest target values.
bottom_rows = volatility.tail(50)
plt.figure(figsize=(10, 5))
sns.barplot(data=bottom_rows, x='stock_id', y='target');
plt.title('Low Volatile Stocks');

## Preprocessing

In [None]:
def log_return(list_stock_prices):
    """Return the first difference of the natural log of a price series.

    The input must support ``.diff()`` after ``np.log`` (e.g. a pandas
    Series); the first element of the result is NaN because it has no
    predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

In [None]:
def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)

In [None]:
def get_stock_stat_df(stock_id, dataset):
    """Build per-(stock_id, time_id) order-book and trade features for one stock.

    Reads the book and trade parquet partitions of ``stock_id`` from the
    ``dataset`` split, derives weighted average prices (WAP), log returns and
    bid/ask spreads, and aggregates everything per ``time_id``.

    Parameters
    ----------
    stock_id
        Identifier of the stock; selects the ``stock_id=<id>`` partition.
    dataset : str
        Split name interpolated into the parquet file names (this notebook
        passes ``'train'`` or ``'test'``).

    Returns
    -------
    pandas.DataFrame
        One row per ``(stock_id, time_id)`` found in the book data, with the
        columns ``sec_in_bucket``, ``bid_ask_spread_1``, ``bid_ask_spread_2``,
        ``log_return_1``, ``log_return_2``, ``Price``, ``Size``,
        ``order_count``, ``trade_count`` and ``Trade_log``. The trade columns
        are NaN for buckets without trades because the merge is a left join.
    """
    book = pd.read_parquet(f'../input/optiver-realized-volatility-prediction/book_{dataset}.parquet/stock_id={stock_id}/')
    book['stock_id'] = stock_id

    # Calculate WAP: each price is weighted by the size on the opposite side.
    book['wap1'] = (book.bid_price1 * book.ask_size1 + book.ask_price1 * book.bid_size1) / (book.bid_size1 + book.ask_size1)
    book['wap2'] = (book.bid_price2 * book.ask_size2 + book.ask_price2 * book.bid_size2) / (book.bid_size2 + book.ask_size2)

    # Log returns are computed inside each time_id so that no return spans
    # two different buckets (consistent with the trade log return below).
    # ``transform`` keeps the original row index, so the result can be
    # assigned straight back as a column.
    book['log_return1'] = book.groupby('time_id')['wap1'].transform(log_return)
    book['log_return2'] = book.groupby('time_id')['wap2'].transform(log_return)

    # Bid Ask Spread
    book['bid_ask_spread1'] = abs(book['bid_price1'] - book['ask_price1'])
    book['bid_ask_spread2'] = abs(book['bid_price2'] - book['ask_price2'])

    # Drop rows without a defined log return (the first row of each bucket).
    has_returns = book['log_return1'].notnull() & book['log_return2'].notnull()
    book = book[has_returns]

    stock_stat = book.groupby(['stock_id', 'time_id']).agg(sec_in_bucket=('seconds_in_bucket', 'mean'),
                                                           bid_ask_spread_1=('bid_ask_spread1', 'mean'),
                                                           bid_ask_spread_2=('bid_ask_spread2', 'mean'),
                                                           log_return_1=('log_return1', realized_volatility),
                                                           log_return_2=('log_return2', realized_volatility)).reset_index()

    # Trade
    trade = pd.read_parquet(f'../input/optiver-realized-volatility-prediction/trade_{dataset}.parquet/stock_id={stock_id}/')
    trade['stock_id'] = stock_id
    # ``transform`` (not ``apply``) so the result stays aligned with
    # ``trade``'s index; the undefined first return of each bucket becomes 0.
    trade['trade_log_return'] = trade.groupby('time_id')['price'].transform(log_return).fillna(0)

    trade_stat = trade.groupby(['stock_id', 'time_id']).agg(Price=('price', 'mean'),
                                                            Size=('size', 'sum'),
                                                            order_count=('order_count', 'sum'),
                                                            trade_count=('order_count', 'count'),
                                                            Trade_log=('trade_log_return', realized_volatility)).reset_index()

    # Merge book and trade dataframe
    stats = stock_stat.merge(trade_stat, on=['stock_id', 'time_id'], how='left')

    return stats

def get_dataSet(stock_ids : list, dataType = 'train'):
    """Compute the per-stock statistics for every id and stack them.

    Each stock is processed by ``get_stock_stat_df`` in a joblib worker
    (``n_jobs=-1``); the resulting frames are concatenated with a fresh
    integer index.
    """
    jobs = (delayed(get_stock_stat_df)(one_id, dataType) for one_id in stock_ids)
    per_stock_frames = Parallel(n_jobs=-1)(jobs)
    return pd.concat(per_stock_frames, ignore_index=True)

In [None]:
%%time
# Distinct stock ids in the training table, in order of first appearance.
stock_ids = list(train.stock_id.unique())
train_stock_stats_df = get_dataSet(stock_ids, dataType='train')

In [None]:
# Attach the engineered per-bucket features to every training row.
data = train.merge(train_stock_stats_df, how='left', on=['stock_id', 'time_id'])

In [None]:
# List the column names of the merged training frame.
data.columns

In [None]:
# data=data.rename(columns={('bid_ask_spread1', 'mean'):'bid_ask_spread1', 
#              ('bid_ask_spread2', 'mean'):'bid_ask_spread2',
#              ('log_return1', 'realized_volatility'):'log_return1',
#              ('log_return2', 'realized_volatility'):'log_return2',
#              ('price', 'mean'):'Price',
#              ('size', 'sum'):'Size',
#              ('order_count', 'sum'):'Order_count',
#              ('trade_log_return', 'realized_volatility'):'Trade_log_return'
#             })

In [None]:
# Replace every missing value in the merged frame with 0.
data = data.fillna(0)

In [None]:
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of ``y_pred``.

    Each error is divided by the corresponding ``y_true`` value before
    squaring and averaging.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

In [None]:
# Split the frame into the feature matrix and the regression target.
y = data['target']
X = data.drop(columns='target')

## XGBoost

In [None]:
def feval_RMSPE(preds, train_data):
    """Custom evaluation callback reporting RMSPE rounded to 5 decimals.

    Labels are read from ``train_data.get_label()``. Returns the tuple
    ``('RMSPE', value, False)``; the trailing ``False`` presumably flags
    the metric as lower-is-better for the caller -- confirm against the
    LightGBM ``feval`` contract.
    """
    y_true = train_data.get_label()
    score = rmspe(y_true=y_true, y_pred=preds)
    return 'RMSPE', round(score, 5), False

In [None]:
def Xgboost(X, y, params, folds=5):
    """Train an ``XGBRegressor`` per K-fold split and return the best one.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``iloc``.
    y : pandas.Series
        Target values aligned with ``X``.
    params : dict
        Keyword arguments forwarded to ``XGBRegressor``.
    folds : int, default 5
        Number of shuffled K-fold splits (``random_state=42``).

    Returns
    -------
    XGBRegressor
        The fitted model of the fold with the lowest validation RMSPE.
    """
    best_model = None
    best_rmspe = None

    kfold = KFold(n_splits=folds, shuffle=True, random_state=42)
    for fold, (tr_idx, ts_idx) in enumerate(kfold.split(X)):
        print(f'Fold: {fold}')

        x_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
        x_ts, y_ts = X.iloc[ts_idx], y.iloc[ts_idx]

        model = XGBRegressor(**params)
        # NOTE(review): newer xgboost releases expect early_stopping_rounds
        # on the estimator constructor rather than fit() -- confirm against
        # the installed version.
        model.fit(x_tr, y_tr,
                  eval_set=[(x_ts, y_ts)],
                  early_stopping_rounds=100,
                  verbose=False)

        pred = model.predict(x_ts)
        error = rmspe(y_ts, pred)
        print(f'RMSPE: {error}')
        print('-'*50)

        # Identity test for the "no best yet" sentinel instead of `== None`.
        if best_rmspe is None or error < best_rmspe:
            best_rmspe = error
            best_model = model

    return best_model

In [None]:
# XGBoost estimator settings forwarded to XGBRegressor(**params).
# GPU training stays disabled; to enable it, add
# tree_method='gpu_hist' and gpu_id=0.
xgb_params = dict(
    objective='reg:squarederror',
    n_jobs=-1,
    eval_metric="rmse",
)

In [None]:
# Cross-validate XGBoost over 5 folds and keep the model Xgboost() returns.
model_xgb = Xgboost(X, y, xgb_params, folds=5)

In [None]:
# Plot the feature importances of the selected XGBoost model.
xgb.plot_importance(model_xgb)

## LGBM

In [None]:
def LGBM(X, y, params, folds=5, print_error=True):
    """Train a LightGBM booster per K-fold split and return the best one.

    Every sample is weighted by ``1 / y**2`` in both the training and the
    validation ``Dataset``, and ``feval_RMSPE`` is used as the evaluation
    metric.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; must contain the ``stock_id`` column, which is
        declared as categorical.
    y : pandas.Series
        Target values aligned with ``X``.
    params : dict
        LightGBM parameters passed to ``lgbm.train``.
    folds : int, default 5
        Number of shuffled K-fold splits (``random_state=42``).
    print_error : bool, default True
        Whether to print the validation RMSPE of each fold.

    Returns
    -------
    lightgbm.Booster
        The booster of the fold with the lowest validation RMSPE.
    """
    best_model = None
    best_rmspe = None

    cat_col = ['stock_id']
    kfold = KFold(n_splits=folds, shuffle=True, random_state=42)
    for fold, (tr_idx, ts_idx) in enumerate(kfold.split(X)):

        x_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
        x_ts, y_ts = X.iloc[ts_idx], y.iloc[ts_idx]

        # Local names chosen so they do not shadow the notebook-level
        # `train` / `test` frames.
        train_set = lgbm.Dataset(x_tr, label=y_tr, categorical_feature=cat_col, weight=1/np.power(y_tr, 2))
        valid_set = lgbm.Dataset(x_ts, label=y_ts, categorical_feature=cat_col, weight=1/np.power(y_ts, 2))

        # NOTE(review): newer lightgbm releases configure early stopping and
        # log verbosity through callbacks instead of these keyword
        # arguments -- confirm against the installed version.
        model = lgbm.train(params,
                           train_set,
                           3000,
                           valid_sets=valid_set,
                           feval=feval_RMSPE,
                           early_stopping_rounds=500,
                           verbose_eval=False
                           )

        pred = model.predict(x_ts)
        error = rmspe(y_ts, pred)

        if print_error:
            print(f'Fold: {fold}')
            print(f'RMSPE: {error}')
            print('-'*50)

        # Track the running best. The original assigned to a misspelled
        # name here, so the best score was never updated and the last
        # fold's model was always returned.
        if best_rmspe is None or error < best_rmspe:
            best_rmspe = error
            best_model = model

    return best_model

In [None]:
# LightGBM training configuration handed to LGBM() in this cell.
lgbm_params = dict(
    task='train',
    boosting_type='gbdt',
    learning_rate=0.01,
    objective='regression',
    metric='None',
    max_depth=-1,
    n_jobs=-1,
    feature_fraction=0.7,
    bagging_fraction=0.7,
    lambda_l2=1,
    verbose=-1,
)

# Cross-validate LightGBM over 5 folds and keep the model LGBM() returns.
model_lgbm = LGBM(X, y, lgbm_params, folds=5)

In [None]:
# Plot the feature importances of the selected LightGBM model; the
# trailing semicolon keeps the notebook from echoing the returned object.
lgbm.plot_importance(model_lgbm);

## Submission

In [None]:
%%time
# Distinct stock ids in the test table, in order of first appearance.
test_stock_ids = list(test.stock_id.unique())
test_stock_stats_df = get_dataSet(test_stock_ids, dataType='test')

In [None]:
# Attach the engineered features to the test rows, then zero-fill the gaps.
test = test.merge(test_stock_stats_df, how='left', on=['stock_id', 'time_id'])
test = test.fillna(0)

In [None]:
# test=test.rename(columns={('bid_ask_spread1', 'mean'):'bid_ask_spread1', 
#              ('bid_ask_spread2', 'mean'):'bid_ask_spread2',
#              ('log_return1', 'realized_volatility'):'log_return1',
#              ('log_return2', 'realized_volatility'):'log_return2',
#              ('price', 'mean'):'Price',
#              ('size', 'sum'):'Size',
#              ('order_count', 'sum'):'Order_count',
#              ('trade_log_return', 'realized_volatility'):'Trade_log_return'
#             })

In [None]:
# Remove the row_id column from the test frame before predicting.
test_df = test.drop(columns='row_id')

In [None]:
# Score the test features with both trained models.
pred_lgbm = model_lgbm.predict(test_df)
pred_xgb = model_xgb.predict(test_df)

In [None]:
# Only the LightGBM predictions are submitted.
submission = submission.assign(target=pred_lgbm)

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)