In [None]:
import sys
# Copy the rapids.21.06 archive from the input dataset into the conda envs
# directory under the name rapids.tar.gz, then unpack it there. tar's verbose
# file listing is redirected to /dev/null to keep the cell output short.
!cp ../input/rapids/rapids.21.06 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Prepend the unpacked environment's directories to sys.path so imports are
# resolved against it before the default locations. Each statement prepends,
# so the last one (".../rapids/lib") ends up first in sys.path.
# NOTE: re-running this cell prepends the same entries again.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the environment's libxgboost shared library into the base conda lib
# directory (presumably so the dynamic loader can find it -- TODO confirm).
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/


In [None]:
import cudf
import cupy as cp
import plotly.express as px
import os
import glob
from tqdm import tqdm

# Show the version of the cuDF package that was actually imported, as the
# cell's output.
cudf.__version__

In [None]:
# Root directory of the competition input data, used as the prefix for every
# file read in this notebook.
# NOTE: the value already ends with '/', and it is interpolated as
# f'{PATH}/...', so the resulting paths contain a double slash.
PATH = '../input/optiver-realized-volatility-prediction/'

# Functions

In [None]:
def log_return(list_stock_prices):
    """Return the consecutive differences of the natural log of the prices.

    Args:
        list_stock_prices: Array-like sequence of prices accepted by
            ``cp.log``.

    Returns:
        The result of ``cp.diff`` applied to the log prices; it has one
        element fewer than the input along the last axis.
    """
    log_prices = cp.log(list_stock_prices)
    return cp.diff(log_prices)

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns.

    Args:
        series_log_return: Array-like of log returns supporting ``** 2``.

    Returns:
        The scalar result of ``cp.sqrt(cp.sum(...))`` over the squared values.
    """
    squared_returns = series_log_return ** 2
    total = cp.sum(squared_returns)
    return cp.sqrt(total)


def calc_realized_volatility(list_stock_prices):
    """Compute realized volatility directly from a sequence of prices.

    Chains ``log_return`` and ``realized_volatility``: the prices are turned
    into log returns, which are then reduced to a single volatility value.

    Args:
        list_stock_prices: Array-like sequence of prices.

    Returns:
        The value returned by ``realized_volatility`` for the log returns.
    """
    returns = log_return(list_stock_prices)
    return realized_volatility(returns)


def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of the predictions.

    The error of each element is taken relative to ``y_true``; no guard is
    applied for zero values in ``y_true``.

    Args:
        y_true: Array-like of reference values.
        y_pred: Array-like of predicted values, aligned with ``y_true``.

    Returns:
        The square root of the mean of the squared relative errors.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = cp.mean(cp.square(relative_error))
    return cp.sqrt(mean_squared)

def calculate_wap(df):
    """Return a size-weighted average price combining book levels 1 and 2.

    For each level the bid price is weighted by the ask size and the ask
    price by the bid size. The two levels' weighted sums are added together
    and divided by the total size over both levels and both sides.

    Args:
        df: Object indexable by the column names ``bid_price1``,
            ``ask_price1``, ``bid_size1``, ``ask_size1`` and their level-2
            counterparts (e.g. an order-book DataFrame).

    Returns:
        The result of the element-wise arithmetic on those columns.
    """
    level1_weighted = df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']
    level2_weighted = df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']
    total_size = df['bid_size1'] + df['ask_size1'] + df['bid_size2'] + df['ask_size2']
    return (level1_weighted + level2_weighted) / total_size


def realized_volatility_per_time_id(file_path, prediction_column_name):
    """Compute the realized volatility of the WAP for each time_id in one file.

    Args:
        file_path: Path of an order-book parquet file. The stock id is taken
            from the text between the first ``=`` in the path and the next
            ``/`` after it.
        prediction_column_name: Name given to the volatility column in the
            returned frame.

    Returns:
        A cuDF DataFrame with the columns ``row_id`` (formatted as
        ``"<stock_id>-<time_id>"``) and ``prediction_column_name``.
    """
    book = cudf.read_parquet(file_path)
    book['wap'] = calculate_wap(book)

    # One realized-volatility value per time_id, computed from that group's WAP.
    vol_per_time_id = book.groupby(['time_id'])['wap'].apply(calc_realized_volatility)
    vol_per_time_id = vol_per_time_id.reset_index()
    # Assumes the groupby-apply result column is labelled 0 -- TODO confirm
    # against the cuDF version in use.
    vol_per_time_id = vol_per_time_id.rename(columns = {0: prediction_column_name})

    stock_id = file_path.split('=')[1].split("/")[0]
    vol_per_time_id['row_id'] = stock_id + "-" + vol_per_time_id['time_id'].astype(str)
    return vol_per_time_id[['row_id', prediction_column_name]]

In [None]:
def past_realized_volatility_per_stock(list_file, prediction_column_name):
    """Compute per-time_id realized volatility for every order-book file.

    Args:
        list_file: Iterable of order-book parquet file paths, one per stock.
        prediction_column_name: Name given to the volatility column.

    Returns:
        A cuDF DataFrame with the rows of every file's
        ``realized_volatility_per_time_id`` result stacked in input order
        (columns ``row_id`` and ``prediction_column_name``). An empty
        ``cudf.DataFrame`` is returned when ``list_file`` is empty.
    """
    # Collect the per-file frames and concatenate once; re-concatenating the
    # accumulated frame on every iteration copies all earlier rows each time.
    frames = [
        realized_volatility_per_time_id(file, prediction_column_name)
        for file in tqdm(list_file)
    ]
    if not frames:
        # Nothing to concatenate: keep the empty-frame result for empty input.
        return cudf.DataFrame()
    return cudf.concat(frames)

# Data

In [None]:
# Load the training file from the competition directory onto the GPU.
train = cudf.read_csv(f'{PATH}/train.csv')
# Preview the first three rows belonging to stock_id 0.
train[train['stock_id'] == 0].head(3)

# Naive prediction: using past realized volatility as target

A commonly known fact about volatility is that it tends to be autocorrelated. We can use this property to implement a naive model that just "predicts" realized volatility by using whatever the realized volatility was in the initial 10 minutes.

Let's calculate the past realized volatility across the training set to see how predictive a single naive signal can be.

In [None]:
# glob returns matches in a filesystem-dependent order; sort them so the files
# are processed (and their rows stacked) in the same order on every run.
list_train = sorted(glob.glob(f'{PATH}/book_train.parquet/*/*'))
# Number of training order-book files found.
len(list_train)

In [None]:
# Realized volatility of every training order-book file, one row per
# stock/time_id pair, stored in the 'pred' column.
df_past_realized_train = past_realized_volatility_per_stock(
    list_file=list_train,
    prediction_column_name='pred',
)

In [None]:
# Build the "<stock_id>-<time_id>" join key. `train` is narrowed to
# ['row_id', 'target'] below, so on a re-run of this cell stock_id/time_id are
# already gone; skip the construction in that case instead of raising KeyError.
if 'row_id' not in train.columns:
    train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train = train[['row_id','target']]
# Left join keeps every training target, attaching the naive prediction
# computed from the order book where a matching row_id exists.
df_joined = train.merge(df_past_realized_train[['row_id','pred']],
                        on = ['row_id'], how = 'left')

We will evaluate the naive prediction with two metrics: RMSPE (root mean squared percentage error) and R² (the coefficient of determination).

In [None]:
# r2_score comes from cuML's regression metrics; rmspe is the helper defined
# in this notebook.
from cuml.metrics.regression import r2_score

# Score the naive prediction ('pred') against the true target, rounding both
# metrics to three decimals for display.
R2 = cp.round(r2_score(df_joined['target'], df_joined['pred']),3)
RMSPE = cp.round(rmspe(y_true = df_joined['target'], y_pred = df_joined['pred']),3)
print(f'Performance of the naive prediction: R2 score: {R2}, RMSPE: {RMSPE}')

The performance of the naive model is not amazing, but it is a reasonable starting point as a benchmark.

# Submission

In [None]:
# glob returns matches in a filesystem-dependent order; sort them so the rows
# of submission.csv come out in the same order on every run.
list_order_book_file_test = sorted(glob.glob(f'{PATH}/book_test.parquet/*/*'))

# The naive prediction for the test set is the realized volatility computed
# from each test order-book file, written under the 'target' column.
df_naive_pred_test = past_realized_volatility_per_stock(list_file=list_order_book_file_test,
                                                           prediction_column_name='target')
df_naive_pred_test.to_csv('submission.csv',index=False)
df_naive_pred_test.head()