# Go Deeper and Deeper! [Optimise RV]

How far can we go without any machine learning (ML) model, using only the realised volatility (RV) function? The answer: much further than you might think.

In this notebook, I show how to optimise, for each stock, the point in the bucket (in `seconds_in_bucket`) from which the order-book data is used to calculate the RV value. The insight is that older data within a bucket may carry less information about the future RV value and can therefore be ignored.

You may consider using this as a feature in your model.

References:

[We need to go deeper - and validate!][1]

[1]: https://www.kaggle.com/konradb/we-need-to-go-deeper-and-validate

In [None]:
import numpy as np
import pandas as pd
import glob, gc

from joblib import Parallel, delayed
from tqdm.auto import tqdm
import os

In [None]:
# Root directory of the competition input files (the book parquet partitions are read from here).
data_path = "../input/optiver-realized-volatility-prediction/"

In [None]:
# Training labels; the code below reads its `stock_id`, `time_id` and `target` columns.
# Build the path from `data_path` so the input location is configured in one place only.
train = pd.read_csv(data_path + 'train.csv')
train.head()

In [None]:
def calculate_wap(df):
    """Compute two weighted-average-price (WAP) series from an order-book frame.

    Parameters
    ----------
    df : pd.DataFrame
        Must provide the columns ``bid_price1``, ``ask_price1``, ``bid_size1``,
        ``ask_size1`` and the same four columns for level 2.

    Returns
    -------
    tuple
        ``(pooled, averaged)``: ``pooled`` divides the summed numerators of
        both levels by the summed sizes of both levels; ``averaged`` is the
        plain mean of the level-1 WAP and the level-2 WAP.
    """
    def numerator_and_size(bid_px, ask_px, bid_sz, ask_sz):
        # Each price is weighted by the size quoted on the opposite side.
        return bid_px * ask_sz + ask_px * bid_sz, bid_sz + ask_sz

    num1, size1 = numerator_and_size(
        df['bid_price1'], df['ask_price1'], df['bid_size1'], df['ask_size1']
    )
    num2, size2 = numerator_and_size(
        df['bid_price2'], df['ask_price2'], df['bid_size2'], df['ask_size2']
    )

    pooled = (num1 + num2) / (size1 + size2)
    averaged = (num1 / size1 + num2 / size2) / 2
    return pooled, averaged

def log_return(list_stock_prices):
    """Return the first difference of the natural log of a price series.

    The input must support ``.diff()`` after ``np.log`` (e.g. a pandas
    Series); the first element of the result is NaN because it has no
    predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    total = np.sum(squared_returns)
    return np.sqrt(total)

In [None]:
def rmspe(predictions, targets):
    """Root mean squared percentage error of ``predictions`` against ``targets``.

    The error of each element is taken relative to its target, squared,
    averaged with ``.mean()`` and square-rooted.
    """
    relative_error = (predictions - targets) / targets
    mean_squared_error = (relative_error ** 2).mean()
    return np.sqrt(mean_squared_error)

# Optimise the Portion Number

In [None]:
def optimise_num(df : pd.DataFrame, stock_id : int, dataType = 'train'):
    """Find the window start (in seconds) whose realised volatility best matches the target.

    For every candidate start ``sec`` in ``0, 10, ..., 590`` the realised
    volatility of each ``time_id`` is computed from the book rows with
    ``seconds_in_bucket >= sec`` and scored against ``target`` with RMSPE.

    Parameters
    ----------
    df : pd.DataFrame
        Label frame with ``stock_id``, ``time_id`` and ``target`` columns.
    stock_id : int
        Stock whose book partition is read from ``data_path``.
    dataType : str
        Dataset name used in the parquet path (``book_{dataType}.parquet``).

    Returns
    -------
    int
        The start second with the lowest RMSPE. Falls back to ``0`` (use the
        whole bucket) when no candidate produces a comparable score, e.g.
        when every score is NaN.
    """
    book = pd.read_parquet(data_path + f'book_{dataType}.parquet/stock_id={stock_id}/')
    # sort_values returns a new frame, so the result has to be kept; the index is
    # rebuilt so that row labels are a clean 0..n-1 in (time_id, second) order.
    book = book.sort_values(by = ['time_id', 'seconds_in_bucket']).reset_index(drop = True)
    book['wap1'], book['wap2'] = calculate_wap(book)

    # transform() returns values aligned on the frame's own index, so the result
    # does not depend on row order or on how groupby.apply labels its output.
    # The first row of every time_id has no predecessor and gets a return of 0.
    book['log_return1'] = book.groupby('time_id')['wap1'].transform(log_return).fillna(0)
    book['log_return2'] = book.groupby('time_id')['wap2'].transform(log_return).fillna(0)

    # Loop invariants: this stock's targets and the full list of its time_ids.
    targets = df.loc[df['stock_id'] == stock_id, ['time_id', 'target']]
    all_time_ids = pd.Index(book['time_id'].unique(), name = 'time_id')

    best_rmspe = np.inf
    best_sec = 0  # defined up front so the function never hits an unbound name
    for sec in tqdm(np.arange(0, 600, 10), leave = False):
        window = book[book['seconds_in_bucket'] >= sec]
        by_time = window.groupby('time_id')
        stock_stat = pd.concat([
            by_time['log_return1'].agg(realized_volatility).rename('rv1_new'),
            by_time['log_return2'].agg(realized_volatility).rename('rv2_new'),
            ],
            axis = 1,
        )
        # A time_id with no book rows inside the window has an empty sum of
        # squares, i.e. a realised volatility of 0. Keep it in the score instead
        # of silently dropping it, otherwise late starts look better than they are.
        stock_stat = stock_stat.reindex(all_time_ids, fill_value = 0.0).reset_index()
        stock_stat['rv_new'] = (stock_stat['rv1_new'] + stock_stat['rv2_new']) / 2
        stock_stat = stock_stat.merge(targets, on = 'time_id', how = 'left')
        rmspe_score = rmspe(stock_stat['rv_new'], stock_stat['target'])
        if rmspe_score < best_rmspe:
            best_rmspe = rmspe_score
            best_sec = sec
    print(stock_id, best_sec, best_rmspe)
    return best_sec

In [None]:
def generate_stock_nums(df : pd.DataFrame, stock_ids : list, dataType = 'train', parallel = False):
    """Run ``optimise_num`` for every stock and collect the results.

    Parameters
    ----------
    df : pd.DataFrame
        Label frame forwarded to ``optimise_num``.
    stock_ids : list
        Stock ids to process.
    dataType : str
        Dataset name forwarded to ``optimise_num``.
    parallel : bool
        When true, the per-stock calls are dispatched through joblib with
        ``n_jobs = -1``; otherwise they run one after another.

    Returns
    -------
    dict
        Maps each stock id to the value returned by ``optimise_num``.
    """
    progress = tqdm(stock_ids, total = len(stock_ids))
    if not parallel:
        best_secs = [optimise_num(df, sid, dataType) for sid in progress]
    else:
        jobs = (delayed(optimise_num)(df, sid, dataType) for sid in progress)
        best_secs = Parallel(n_jobs = -1)(jobs)
    return dict(zip(stock_ids, best_secs))

In [None]:
# Search the best window start for every stock that appears in the training labels.
stock_secs = generate_stock_nums(
    df = train,
    stock_ids = train['stock_id'].unique(),
    dataType = 'train',
    parallel = True,
)

In [None]:
# Show the chosen window start (in seconds) for each stock id.
print(stock_secs)

In [None]:
# Force a garbage-collection pass; the return value is bound to `x` so the cell prints nothing.
x = gc.collect()

# Validate on the Train Set

In [None]:
def get_stock_stat(stock_id : int, dataType = 'train'):
    """Compute the windowed realised volatility of one stock for every ``time_id``.

    The window of a stock starts at ``stock_secs[stock_id]`` seconds into the
    bucket; only book rows with ``seconds_in_bucket`` at or after that start
    contribute.

    Parameters
    ----------
    stock_id : int
        Stock whose book partition is read from ``data_path``.
    dataType : str
        Dataset name used in the parquet path (``book_{dataType}.parquet``).

    Returns
    -------
    pd.DataFrame
        Columns ``stock_id``, ``time_id`` and ``rv_new``, with one row per
        ``time_id`` present in the book data.
    """
    book = pd.read_parquet(data_path + f'book_{dataType}.parquet/stock_id={stock_id}/')
    # sort_values returns a new frame, so the result has to be kept; the index is
    # rebuilt so that row labels are a clean 0..n-1 in (time_id, second) order.
    book = book.sort_values(by = ['time_id', 'seconds_in_bucket']).reset_index(drop = True)
    book['wap1'], book['wap2'] = calculate_wap(book)

    # transform() returns values aligned on the frame's own index, so the result
    # does not depend on row order or on how groupby.apply labels its output.
    # The first row of every time_id has no predecessor and gets a return of 0.
    book['log_return1'] = book.groupby('time_id')['wap1'].transform(log_return).fillna(0)
    book['log_return2'] = book.groupby('time_id')['wap2'].transform(log_return).fillna(0)

    # A stock that was never optimised falls back to the whole bucket (start 0)
    # instead of raising KeyError inside a worker.
    start_sec = stock_secs.get(stock_id, 0)
    all_time_ids = pd.Index(book['time_id'].unique(), name = 'time_id')

    window = book[book['seconds_in_bucket'] >= start_sec]
    by_time = window.groupby('time_id')
    stock_stat = pd.concat([
        by_time['log_return1'].agg(realized_volatility).rename('rv1_new'),
        by_time['log_return2'].agg(realized_volatility).rename('rv2_new'),
        ],
        axis = 1,
    )
    # A time_id with no book rows inside the window has an empty sum of squares,
    # i.e. a realised volatility of 0. Emit that row explicitly so the later left
    # merge does not turn it into a NaN prediction.
    stock_stat = stock_stat.reindex(all_time_ids, fill_value = 0.0).reset_index()
    stock_stat['rv_new'] = (stock_stat['rv1_new'] + stock_stat['rv2_new']) / 2
    stock_stat['stock_id'] = stock_id
    return stock_stat[['stock_id', 'time_id', 'rv_new']]

In [None]:
def get_dataSet(stock_ids : list, dataType = 'train', parallel = False):
    """Stack the ``get_stock_stat`` frames of all requested stocks.

    Parameters
    ----------
    stock_ids : list
        Stock ids to process.
    dataType : str
        Dataset name forwarded to ``get_stock_stat``.
    parallel : bool
        When true, the per-stock calls are dispatched through joblib with
        ``n_jobs = -1``; otherwise they run one after another.

    Returns
    -------
    pd.DataFrame
        Row-wise concatenation of the per-stock frames with a fresh index.
    """
    progress = tqdm(stock_ids, total = len(stock_ids))
    if not parallel:
        per_stock_frames = [get_stock_stat(sid, dataType) for sid in progress]
    else:
        jobs = (delayed(get_stock_stat)(sid, dataType) for sid in progress)
        per_stock_frames = Parallel(n_jobs = -1)(jobs)
    return pd.concat(per_stock_frames, ignore_index = True)

In [None]:
# Windowed RV for every training stock, attached to the training labels.
train_dataSet = get_dataSet(
    stock_ids = train['stock_id'].unique(),
    dataType = 'train',
    parallel = True,
)
train_dataSet = train.merge(train_dataSet, on = ['stock_id', 'time_id'], how = 'left')

In [None]:
# Overall RMSPE of the windowed RV on the training labels.
# NOTE(review): rows whose `rv_new` is NaN after the left merge are skipped by
# `.mean()` inside rmspe, so they do not count against the score.
print(rmspe(train_dataSet['rv_new'], train_dataSet['target']))

In [None]:
# The merged training frame is no longer needed; drop it and collect garbage
# (the return value is bound to `x` so the cell prints nothing).
del train_dataSet
x = gc.collect()

# Predict on the Test Set

In [None]:
# Test rows to predict; the code below reads its `stock_id`, `time_id` and `row_id` columns.
# Build the path from `data_path` so the input location is configured in one place only.
test = pd.read_csv(data_path + 'test.csv')
test.head()

In [None]:
# Windowed RV for every test stock, attached to the test rows.
test_dataSet = get_dataSet(
    stock_ids = test['stock_id'].unique(),
    dataType = 'test',
    parallel = True,
)
test_dataSet = test.merge(test_dataSet, on = ['stock_id', 'time_id'], how = 'left')

In [None]:
# Force a garbage-collection pass; the return value is bound to `x` so the cell prints nothing.
x = gc.collect()

# Submit

In [None]:
# Submission frame: keep the row identifier and expose the windowed RV as `target`.
sub = test_dataSet[['row_id', 'rv_new']].rename(columns = {'rv_new': 'target'})
sub.head()

In [None]:
# Write the submission file without the frame index.
sub.to_csv('submission.csv', index = False)