In [None]:
# This notebook fits a linear regression, in log space, that predicts the target volatility from realized volatilities computed over several windows of each 600-second bucket (the first 100, 200 and 300 seconds, plus the whole bucket).

# Functions
import sys
import pandas as pd
import numpy as np
import glob
import re

# Directory holding the competition files read below (book_*.parquet, train.csv).
data_path = '/kaggle/input/optiver-realized-volatility-prediction/'
# Second input directory; not referenced by the visible cells.
data = '/kaggle/input/optiver/'

def add_score_time_ids(df):
    """Return a copy of ``df`` keyed by an ``id_row`` column placed first.

    ``id_row`` is the string ``"<stock_id>-<time_id>"`` built from the
    ``stock_id`` and ``time_id`` columns of ``df``.

    The input frame is left untouched: the column is added to a copy, so
    calling this on a filtered slice neither mutates the caller's data nor
    triggers pandas' SettingWithCopyWarning.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``stock_id`` and ``time_id`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ordered as ``id_row, stock_id, time_id`` followed by every
        other column in its original order.
    """
    id_row = df['stock_id'].astype(str) + '-' + df['time_id'].astype(str)
    # `.values` keeps the assignment positional (no index alignment).
    keyed = df.assign(id_row=id_row.values)
    leading = ['id_row', 'stock_id', 'time_id']
    trailing = [c for c in keyed.columns if c not in leading]
    return keyed[leading + trailing]

def log_return(list_stock_prices):
    """Return the first difference of the natural log of the given prices.

    The input must support ``.diff()`` on the result of ``np.log`` (e.g. a
    pandas Series); the first element of the output is NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    total = np.sum(squared_returns)
    return np.sqrt(total)

def add_wap_lr(df):
    """Add level-1 weighted average price and its per-``time_id`` log return.

    ``wap`` is ``(bid_price1 * ask_size1 + ask_price1 * bid_size1) /
    (bid_size1 + ask_size1)``. ``log_return`` is the difference of
    ``log(wap)`` between consecutive rows of the same ``time_id``.
    Rows whose ``log_return`` is null (the first row of every ``time_id``)
    are dropped from the result.

    The log return is computed with ``groupby(...).diff()`` instead of
    ``groupby(...).apply(...)``: the result always carries the original row
    index, whereas ``apply`` prepends the group key on pandas >= 2.0 and the
    column assignment then fails. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``time_id``, ``bid_price1``, ``ask_price1``, ``bid_size1`` and
        ``ask_size1`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``wap`` and ``log_return`` columns, restricted to
        rows with a non-null ``log_return``.
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    wap = (bid_px * ask_sz + ask_px * bid_sz) / (bid_sz + ask_sz)
    # Difference of log prices inside each time_id; first row per group is NaN.
    log_ret = np.log(wap).groupby(df['time_id']).diff()
    enriched = df.assign(wap=wap, log_return=log_ret)
    return enriched[enriched['log_return'].notnull()]

def add_past_vol(df):
    """Append a ``past_vol`` column with the realized volatility of each row's bucket.

    Rows with a null ``log_return`` are removed first. The volatility is
    computed per (``time_id``, ``stock_id``) and looked up through the frame's
    index, so the index of ``df`` is expected to hold the ``id_row`` keys
    ("<stock_id>-<time_id>").
    """
    valid = df[df['log_return'].notnull()]
    per_bucket = valid.groupby(['time_id','stock_id'])['log_return'].agg(realized_volatility)
    vol = add_score_time_ids(pd.DataFrame(per_bucket).reset_index()).set_index('id_row')
    # Re-index the per-bucket volatility onto the (repeated) row keys of the frame.
    past_vol = pd.DataFrame({'past_vol': vol['log_return']}, index=valid.index)
    return pd.concat((valid, past_vol), axis=1)

def treatment(df):
    """Add WAP / log-return columns and index the result by ``id_row``."""
    return add_score_time_ids(add_wap_lr(df)).set_index('id_row')

def lighten(df,cols):
    """Return ``df`` without the columns named in ``cols`` (input is not modified)."""
    slimmed = df.drop(cols, axis=1)
    return slimmed

def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

# Raw order-book columns (plus the derived 'wap') dropped once log returns are computed.
cols = [
    'bid_price1', 'ask_price1',
    'bid_price2', 'ask_price2',
    'bid_size1', 'ask_size1',
    'bid_size2', 'ask_size2',
    'wap',
]
def make_light_book(file_,cols):
    """Load one book parquet file and reduce it to the columns needed downstream.

    The stock id is taken from the ``stock_id=<id>`` component of ``file_``
    and stored as a string column. The frame then goes through ``treatment``
    (WAP, log returns, ``id_row`` index) and ``lighten`` (drop ``cols``).

    Parameters
    ----------
    file_ : str
        Path to a parquet file located under a ``stock_id=<id>`` directory.
    cols : list of str
        Columns to drop from the treated book.

    Returns
    -------
    pandas.DataFrame
        The treated book without ``cols``.

    Raises
    ------
    IndexError
        If ``file_`` contains no ``stock_id=<id>`` component.
    """
    book = pd.read_parquet(file_)
    # Capture only the id itself: stop at the first path separator (either
    # kind) instead of greedily matching up to the last '/'.
    book['stock_id'] = re.findall(r'stock_id=([^/\\]+)', file_)[0]
    # Return the frame directly; concatenating it onto an empty DataFrame
    # added nothing and only invites dtype/deprecation surprises.
    return lighten(treatment(book), cols)

def make_volatility(df,Train,lags=4):
    """Build per-(time_id, stock_id) realized-volatility features.

    Columns ``t1`` ... ``t<lags-1>`` hold the realized volatility of the rows
    with ``seconds_in_bucket < 100 * k``; ``t_all`` uses every row of the
    bucket. The result is indexed by ``id_row`` ("<stock_id>-<time_id>").

    The frame is indexed from the full-bucket groups, so a bucket that has no
    rows inside a short window keeps its row (with NaN in that window's
    column) instead of disappearing from the output.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``time_id``, ``stock_id``, ``seconds_in_bucket`` and
        ``log_return`` columns.
    Train : bool
        When truthy, ``train.csv`` is read from ``data_path`` and its
        ``target`` column is attached by ``id_row``.
    lags : int, default 4
        Windows are built for ``k = 1 .. lags - 1``.

    Returns
    -------
    pandas.DataFrame
        ``stock_id``, ``time_id``, the window columns, ``t_all`` and, when
        ``Train`` is truthy, ``target``.
    """
    keys = ['time_id','stock_id']
    full_bucket = df.groupby(keys)['log_return'].agg(realized_volatility)

    # Anchor the index on every bucket present in the data; the windowed
    # series below are aligned onto it.
    volatilities = pd.DataFrame(index=full_bucket.index)
    for k in range(1,lags):
        window = df[df['seconds_in_bucket'] < 100 * k]
        volatilities['t'+str(k)] = window.groupby(keys)['log_return'].agg(realized_volatility)
    volatilities['t_all'] = full_bucket
    # Keep the features floating point even if a window matched no rows at all.
    volatilities = volatilities.astype('float64')

    volatilities = add_score_time_ids(volatilities.reset_index()).set_index('id_row')

    if Train:
        y = add_score_time_ids(pd.read_csv(data_path+'train.csv')).set_index('id_row')
        volatilities['target'] = y.loc[volatilities.index, 'target']

    return volatilities


In [None]:
# Make Dataset
from joblib import Parallel, delayed
# One parquet partition per stock. glob returns paths in arbitrary order, so
# sort them to make the row order of `train` / `test` reproducible.
book_files_train = sorted(glob.glob(data_path+'book_train.parquet/stock_id=*/*'))
book_files_test  = sorted(glob.glob(data_path+'book_test.parquet/stock_id=*/*'))

def _features_for_file(file_, is_train):
    """Load one book file and compute its volatility features."""
    return make_volatility(make_light_book(file_, cols), Train=is_train)

def _build_dataset(book_files, is_train):
    """Compute the features of every file in parallel and stack them into one frame.

    Raises ValueError when ``book_files`` is empty (nothing to concatenate).
    """
    if not book_files:
        raise ValueError('No book files found; check data_path: ' + data_path)
    frames = Parallel(n_jobs = -1, verbose = 1)(
        delayed(_features_for_file)(file_, is_train) for file_ in book_files
    )
    # ignore_index drops the per-file id_row index; stock_id / time_id remain as columns.
    return pd.concat(frames, ignore_index = True)

train = _build_dataset(book_files_train, is_train=True)
test = _build_dataset(book_files_test, is_train=False)

In [None]:
# Model Estimation
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# The regression is fitted in log space: every float column of `train`
# (the volatility features and the target) is log-transformed, and rows with a
# missing or infinite value are discarded.
float_cols = train.select_dtypes(include=[np.float64,np.float32]).columns
log_train = train[float_cols].apply(np.log)
log_train = log_train.replace([np.inf,-np.inf],np.nan).dropna()

# Pin the feature list once so the test matrix uses exactly the same columns,
# even if this cell is re-run after `test['target']` has been added.
feature_cols = [c for c in log_train.columns if c != 'target']
X = log_train[feature_cols]
y = log_train['target']

lr = LinearRegression()
lr.fit(X,y)

# Compute score over the train set (back on the original scale)
y_true = np.exp(y.values)
y_pred = np.exp(lr.predict(X))
score = rmspe(y_true=y_true,y_pred=y_pred)
print('\nScore obtained for linear regression is {:.3}.\nR2 is {:.3}.'.format(score,r2_score(y_true=y_true,y_pred=y_pred)))

# Prediction
X_test = test[feature_cols].apply(np.log)
X_test = X_test.replace([np.inf,-np.inf],np.nan)
# Fill unusable log-features with the train median of that feature. A fill
# value of 0.0 would stand for a volatility of exp(0) = 1.
X_test = X_test.fillna(X.median())

test['target'] = np.exp(lr.predict(X_test))
test["row_id"] = test["stock_id"].astype(str) + "-" + test["time_id"].astype(str)
print(test[['row_id', 'target']].head(3))
test[['row_id', 'target']].to_csv('submission.csv',index = False)