In [1]:
from fastai.vision.all import *

from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import notebook

# Root directory of the Optiver realized-volatility competition input data.
PATH = Path('../input/optiver-realized-volatility-prediction/')

In [2]:
# Training targets, plus the distinct stock ids found in them.
train = pd.read_csv(PATH / 'train.csv')
list_stocks = train['stock_id'].unique().tolist()

In [3]:
def log_return(list_stock_prices):
    """Return the successive differences of the natural log of a price series.

    The input must expose ``.diff()`` after ``np.log`` (e.g. a pandas Series);
    the first element of the result has no predecessor and is therefore NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    total = np.sum(squared_returns)
    return np.sqrt(total)

In [4]:
def add_wap(df):
    """Add a level-1 weighted average price column ``wap`` to ``df`` in place.

    Each side's price is weighted by the size on the opposite side:
    ``(bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``bid_price1``, ``ask_price1``, ``bid_size1`` and
        ``ask_size1``. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the call can be used with
        ``.pipe`` or chained.
    """
    total_size = df['bid_size1'] + df['ask_size1']
    df['wap'] = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / total_size
    return df

In [5]:
def realized_volatility_per_time_id(file_path, prediction_column_name):
    """Compute the realized volatility of every ``time_id`` in one book file.

    Parameters
    ----------
    file_path : str or os.PathLike
        Parquet location for a single stock. The stock id is the text between
        the first and second ``=`` of the path (e.g. ``.../stock_id=125``).
    prediction_column_name : str
        Name given to the realized-volatility column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``row_id`` (``"<stock_id>-<time_id>"``) and
        ``prediction_column_name``; one row per ``time_id`` that has at least
        one non-null log return.
    """
    df_book_data = pd.read_parquet(file_path)
    df_book_data['wap'] = (
        df_book_data['bid_price1'] * df_book_data['ask_size1']
        + df_book_data['ask_price1'] * df_book_data['bid_size1']
    ) / (df_book_data['bid_size1'] + df_book_data['ask_size1'])

    # A per-group diff keeps the frame's own row index. `groupby(...).apply`
    # with a same-length result prepends the group key to the index on
    # pandas >= 2.0, which makes assigning the result back to the frame fail.
    df_book_data['log_return'] = (
        np.log(df_book_data['wap']).groupby(df_book_data['time_id']).diff()
    )
    # The first row of every time_id has no previous price to compare with.
    df_book_data = df_book_data[df_book_data['log_return'].notna()]

    df_realized_vol_per_stock = (
        df_book_data.groupby('time_id')['log_return']
        .agg(lambda returns: np.sqrt(np.sum(returns ** 2)))
        .reset_index()
        .rename(columns={'log_return': prediction_column_name})
    )

    # str() lets pathlib.Path inputs work as well as plain strings.
    stock_id = str(file_path).split('=')[1]
    df_realized_vol_per_stock['row_id'] = (
        f'{stock_id}-' + df_realized_vol_per_stock['time_id'].astype(str)
    )
    return df_realized_vol_per_stock[['row_id', prediction_column_name]]

In [6]:
# Build the pattern from PATH so the dataset root is defined in one place only.
list_order_book_file_train = glob.glob(str(PATH / 'book_train.parquet' / '*'))

In [7]:
%%time
def past_realized_volatility_per_stock(list_file,prediction_column_name):
    """Stack the per-``time_id`` realized volatility of several book files.

    Parameters
    ----------
    list_file : iterable
        Book file paths, each passed to ``realized_volatility_per_time_id``.
    prediction_column_name : str
        Name of the volatility column in the result.

    Returns
    -------
    pandas.DataFrame
        The per-file frames concatenated in input order. Each keeps its own
        row index (the index is not reset). An empty DataFrame is returned
        when ``list_file`` is empty.
    """
    # Collect first and concatenate once: concatenating inside the loop
    # re-copies everything accumulated so far on every iteration.
    frames = [realized_volatility_per_time_id(file, prediction_column_name)
              for file in list_file]
    if not frames:
        # pd.concat raises on an empty list; keep the original empty result.
        return pd.DataFrame()
    return pd.concat(frames)
# Naive baseline: realized volatility of every training book file, stored in column 'pred'.
df_past_realized_train = past_realized_volatility_per_stock(list_file=list_order_book_file_train,
                                                           prediction_column_name='pred')

CPU times: user 2min 59s, sys: 7.71 s, total: 3min 7s
Wall time: 4min 24s


In [8]:
# Preview the baseline predictions, one row per "<stock_id>-<time_id>" key.
df_past_realized_train

Unnamed: 0,row_id,pred
0,125-5,0.001873
1,125-11,0.000858
2,125-16,0.001049
3,125-31,0.000998
4,125-62,0.001150
...,...,...
3825,114-32751,0.003184
3826,114-32753,0.001562
3827,114-32758,0.003165
3828,114-32763,0.002105


In [9]:
# Build the "<stock_id>-<time_id>" key once. The guard keeps this cell
# re-runnable: after the first run `train` only holds ['row_id', 'target'],
# so rebuilding the key from 'stock_id'/'time_id' would raise a KeyError.
if 'row_id' not in train.columns:
    train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train = train[['row_id','target']]
# Left join keeps every training row even when no prediction exists for it.
df_joined = train.merge(df_past_realized_train[['row_id','pred']], on = ['row_id'], how = 'left')

In [10]:
# Preview the targets joined with the naive predictions.
df_joined

Unnamed: 0,row_id,target,pred
0,0-5,0.004136,0.004499
1,0-11,0.001445,0.001204
2,0-16,0.002168,0.002369
3,0-31,0.002195,0.002574
4,0-62,0.001747,0.001894
...,...,...,...
428927,126-32751,0.003461,0.003691
428928,126-32753,0.003113,0.004104
428929,126-32758,0.004070,0.003118
428930,126-32763,0.003357,0.003661


In [11]:
from sklearn.metrics import r2_score
def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)
# Score the naive baseline against the targets, rounded to three decimals.
R2 = round(r2_score(df_joined['target'], df_joined['pred']), 3)
RMSPE = round(rmspe(df_joined['target'], df_joined['pred']), 3)
print(f'Performance of the naive prediction: R2 score: {R2}, RMSPE: {RMSPE}')

Performance of the naive prediction: R2 score: 0.628, RMSPE: 0.341


In [12]:
# Load the first globbed book file to inspect its raw contents.
file_path = list_order_book_file_train[0]
df_book_data = pd.read_parquet(file_path)

In [13]:
# Path of the sample book file; the stock id follows the '='.
file_path

'../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=125'

In [14]:
# Raw order-book rows of the sample file.
df_book_data

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2
0,5,0,1.000094,1.000281,0.999906,1.000468,300,900,815,917
1,5,1,1.000094,1.000281,0.999906,1.000468,530,402,815,800
2,5,2,1.000094,1.000281,0.999906,1.000468,430,601,815,1075
3,5,3,1.000094,1.000281,0.999906,1.000468,530,302,815,1075
4,5,4,1.000094,1.000281,0.999906,1.000468,530,400,815,901
...,...,...,...,...,...,...,...,...,...,...
1891259,32767,587,1.001343,1.001522,1.001164,1.001701,1270,1101,970,800
1891260,32767,589,1.001343,1.001522,1.001164,1.001701,900,1001,970,900
1891261,32767,590,1.001343,1.001522,1.001164,1.001701,900,1001,970,1000
1891262,32767,593,1.001343,1.001522,1.001164,1.001701,900,1101,970,1000


In [15]:
def pd_to_pytorch(df, columns, device=None):
    """Convert DataFrame columns into a list of tensors on a single device.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    columns : iterable
        Column labels to convert, in the order the tensors are returned.
    device : str or torch.device, optional
        Target device. When omitted, CUDA is used if it is available and the
        CPU otherwise, so the helper also runs on machines without a GPU.

    Returns
    -------
    list of torch.Tensor
        One tensor per entry of ``columns``.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    return [torch.tensor(df[col].values, device=device) for col in columns]

In [16]:
def realized_volatility_numpy(file_path):
    """Realized volatility per ``time_id`` of one book file, using NumPy only.

    Rows belonging to one ``time_id`` are expected to be contiguous in the
    file; the runs may appear in any order. Unlike the pandas implementation,
    a ``time_id`` with a single row is kept and gets a volatility of 0.

    Parameters
    ----------
    file_path : str or os.PathLike
        Parquet location for a single stock. The stock id is the text between
        the first and second ``=`` of the path (e.g. ``.../stock_id=125``).

    Returns
    -------
    pandas.DataFrame
        Columns ``row_id`` (``"<stock_id>-<time_id>"``) and ``pred``, one row
        per contiguous run of ``time_id`` in file order.
    """
    df_book_data = pd.read_parquet(file_path)
    # str() lets pathlib.Path inputs work as well as plain strings.
    stock_id = str(file_path).split('=')[1]
    time_ids, bpr, bsz, apr, asz = (
        df_book_data[col].values
        for col in ['time_id', 'bid_price1', 'bid_size1', 'ask_price1', 'ask_size1']
    )
    if len(time_ids) == 0:
        return pd.DataFrame([], columns=['row_id', 'pred'])

    wap = (bpr * asz + apr * bsz) / (asz + bsz)
    log_wap = np.log(wap)

    # Split where time_id changes rather than at np.unique's first-occurrence
    # indices: those come back ordered by id value, which mis-slices the data
    # whenever the runs are not stored in ascending time_id order.
    boundaries = np.flatnonzero(time_ids[1:] != time_ids[:-1]) + 1
    ids = time_ids[np.concatenate(([0], boundaries))]
    splits = np.split(log_wap, boundaries)

    ret = []
    for time_id, x in zip(ids, splits):
        log_ret = np.diff(x)
        volatility = np.sqrt((log_ret ** 2).sum())
        ret.append((f'{stock_id}-{time_id}', volatility.item()))
    return pd.DataFrame(ret, columns=['row_id', 'pred'])

In [17]:
def realized_volatility_pytorch(file_path):
    """Realized volatility per ``time_id`` of one book file, using PyTorch.

    Rows belonging to one ``time_id`` are expected to be contiguous in the
    file; the runs may appear in any order. A ``time_id`` with a single row
    is kept and gets a volatility of 0.

    Parameters
    ----------
    file_path : str or os.PathLike
        Parquet location for a single stock. The stock id is the text between
        the first and second ``=`` of the path (e.g. ``.../stock_id=125``).

    Returns
    -------
    pandas.DataFrame
        Columns ``row_id`` (``"<stock_id>-<time_id>"``) and ``pred``, one row
        per contiguous run of ``time_id`` in file order.
    """
    df_book_data = pd.read_parquet(file_path)
    # str() lets pathlib.Path inputs work as well as plain strings.
    stock_id = str(file_path).split('=')[1]
    time_ids, bpr, bsz, apr, asz = pd_to_pytorch(
        df_book_data, ['time_id', 'bid_price1', 'bid_size1', 'ask_price1', 'ask_size1'])
    wap = (bpr * asz + apr * bsz) / (asz + bsz)
    log_wap = wap.log()

    # unique_consecutive reports run lengths in file order, so the split below
    # lines up with the data even if time_ids are not stored in ascending order
    # (torch.unique sorts the ids, which only matches for sorted input).
    ids, counts = torch.unique_consecutive(time_ids, return_counts=True)
    if ids.numel() == 0:
        return pd.DataFrame([], columns=['row_id', 'pred'])

    splits = log_wap.split(counts.tolist())
    # Keep the per-group results as tensors and convert them with a single
    # .tolist() instead of calling .item() once per time_id.
    volatilities = torch.stack([((x[1:] - x[:-1]) ** 2).sum().sqrt() for x in splits])
    row_ids = [f'{stock_id}-{time_id}' for time_id in ids.tolist()]
    return pd.DataFrame({'row_id': row_ids, 'pred': volatilities.tolist()})

In [18]:
%%time
# Time the PyTorch implementation on a single book file.
vals = realized_volatility_pytorch(list_order_book_file_train[0])

CPU times: user 3.22 s, sys: 409 ms, total: 3.63 s
Wall time: 3.59 s


In [19]:
%%time
# Time the NumPy implementation on the same single book file.
vals = realized_volatility_numpy(list_order_book_file_train[0])

CPU times: user 191 ms, sys: 32.7 ms, total: 224 ms
Wall time: 217 ms


In [20]:
# Preview the single-file result.
vals

Unnamed: 0,row_id,pred
0,125-5,0.001873
1,125-11,0.000858
2,125-16,0.001049
3,125-31,0.000998
4,125-62,0.001150
...,...,...
3825,125-32751,0.001449
3826,125-32753,0.001215
3827,125-32758,0.000980
3828,125-32763,0.001505


In [21]:
%%time
# Time the PyTorch implementation over every training book file.
all_results = pd.concat( [realized_volatility_pytorch(file) for file in list_order_book_file_train])


CPU times: user 2min 36s, sys: 5.4 s, total: 2min 42s
Wall time: 2min 32s


In [22]:
%%time
# Time the NumPy implementation over every training book file (overwrites all_results).
all_results = pd.concat( [realized_volatility_numpy(file) for file in list_order_book_file_train])


CPU times: user 19.5 s, sys: 4.58 s, total: 24.1 s
Wall time: 20.4 s


In [23]:
# Preview the combined results; each file keeps its own row index after concat.
all_results

Unnamed: 0,row_id,pred
0,125-5,0.001873
1,125-11,0.000858
2,125-16,0.001049
3,125-31,0.000998
4,125-62,0.001150
...,...,...
3825,114-32751,0.003184
3826,114-32753,0.001562
3827,114-32758,0.003165
3828,114-32763,0.002105


In [24]:
# Left-join the targets with the recomputed predictions on the row key.
df_joined = train.merge(all_results[['row_id','pred']], on = ['row_id'], how = 'left')

In [25]:
# Re-score the recomputed predictions against the targets, rounded to three decimals.
R2 = round(r2_score(df_joined['target'], df_joined['pred']), 3)
RMSPE = round(rmspe(df_joined['target'], df_joined['pred']), 3)
print(f'Performance of the naive prediction: R2 score: {R2}, RMSPE: {RMSPE}')

Performance of the naive prediction: R2 score: 0.628, RMSPE: 0.341
