## Optiver Realized Volatility Prediction Linear Regression

This notebook attempts to exploit correlations between current and future stock volatility. There are two types of dependency considered:

 - volatility of stock i at time t -> volatility of stock i at time t+10min
 - volatility of stock i at time t -> volatility of stock j at time t+10min
 
Volatility of stock i at time t denotes the realized volatility of stock i, computed from WAP(ask1, bid1) over the first 10 minutes of the time slot in time bucket t. In short: volatility(stock_id=i, time_id=t)

Volatility of stock j at time t+10min denotes the realized volatility of stock j over the last 10 minutes of the time slot in time bucket t. In short: target_volatility(stock_id=j, time_id=t)


## Aggregation along time axis

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import pyarrow.parquet as pq
from sklearn.metrics import r2_score
from sklearn import linear_model
import os
import pickle
import matplotlib.pyplot as plt


# Directory holding the competition input files read by later cells
# (book_train.parquet, book_test.parquet, train.csv, test.csv).
data_path=Path('../input/optiver-realized-volatility-prediction')

In [None]:
def log_return(list_stock_prices):
    """Return the first difference of the natural log of a price series.

    The result of ``np.log`` must expose ``.diff()``, so the input is expected
    to be a pandas Series (or DataFrame). The first element of the output is
    NaN because it has no predecessor to be differenced against.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    return np.sqrt(np.sum(squared_returns))

def realized_volatility_feature(df):
    """Realized volatility of the level-1 weighted average price of one book slice.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``seconds_in_bucket``, ``bid_price1``,
        ``ask_price1``, ``bid_size1`` and ``ask_size1``.

    Returns
    -------
    The value of ``realized_volatility`` applied to the non-NaN float32 log
    returns of the weighted average price.
    """
    book = df.set_index(['seconds_in_bucket'])
    bid_px, ask_px = book['bid_price1'], book['ask_price1']
    bid_sz, ask_sz = book['bid_size1'], book['ask_size1']

    # WAP: each price is weighted by the size quoted on the opposite side.
    wap = (bid_px * ask_sz + ask_px * bid_sz) / (bid_sz + ask_sz)

    returns = log_return(wap).to_numpy(dtype=np.float32)
    # Drop NaN entries, such as the leading one produced by diff().
    valid_returns = returns[~np.isnan(returns)]
    return realized_volatility(valid_returns)

def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``.

    Each error is divided by the corresponding true value, so zeros in
    ``y_true`` lead to a division by zero.
    """
    relative_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(relative_error)))

def compute_metrics(y_true, y_pred):
    """Return ``(R2, RMSPE)`` over the positions where ``y_true`` is not NaN.

    Only NaNs in ``y_true`` are filtered out; ``y_pred`` is masked with the same
    positions but is not itself checked for NaN.
    """
    keep = ~np.isnan(y_true)
    observed, predicted = y_true[keep], y_pred[keep]
    return r2_score(observed, predicted), rmspe(observed, predicted)
    

In [None]:
# Copy a previously saved train_data.csv from an attached dataset into the working
# directory; the next cell loads this file if it exists instead of recomputing it.
!cp ../input/optiver-volatility-by-stock-and-buckett/train_data.csv .

In [None]:
%%time
if not os.path.isfile("train_data.csv"):
    columns = ['stock_id','time_id','seconds_in_bucket','bid_price1', 'ask_price1', 'bid_size1', 'ask_size1']
    book_df = pq.read_table(data_path / f"book_train.parquet", columns = columns).to_pandas()\
        .groupby(['stock_id','time_id'])\
        .apply(lambda x: realized_volatility_feature(x))\
        .rename('volatility')

    train_df = pd.read_csv(data_path/f"train.csv").set_index(['stock_id','time_id'])['target']
    train_data_df = pd.concat([book_df, train_df], axis=1).reset_index()
    train_data_df.to_csv('train_data.csv', index=False)
else:
    train_data_df = pd.read_csv('train_data.csv')
    
train_data_df.head()

## Vectorization

In [None]:
# Turn the long table into one Series per time_id, indexed by stock_id 0..126.
# reindex() inserts NaN for every stock_id that has no row in a given bucket.
volatility = [
    group.set_index("stock_id")["volatility"].reindex(np.arange(127))
    for _, group in train_data_df.groupby('time_id')
]

target = [
    group.set_index("stock_id")["target"].reindex(np.arange(127))
    for _, group in train_data_df.groupby('time_id')
]

In [None]:
# Stack the per-time_id Series into 2-D arrays: one row per time_id group,
# one column per stock_id position.
x, y = np.array(volatility), np.array(target)

In [None]:
# Sanity check: features and targets should share the same (n_time_ids, 127) shape.
x.shape, y.shape

## Linear regression

In [None]:
# Multi-output least squares without intercept: each stock's target volatility is
# modelled as a linear combination of all stocks' current volatilities.
reg = linear_model.LinearRegression(fit_intercept=False)

# Remember which targets are genuinely missing BEFORE imputing. The mask is taken
# from the untouched `target` list (not from `y`, which is overwritten below) so
# re-running this cell yields the same mask.
y_missing = np.isnan(np.array(target))

# Fill gaps with the global mean so the estimator receives no NaN values.
x = np.where(np.isnan(x), np.nanmean(x), x)
y = np.where(np.isnan(y), np.nanmean(y), y)

# Hold out the last `ntest` rows (time buckets) for evaluation.
ntest = 300
x_train, y_train = x[:-ntest], y[:-ntest]
x_test, y_test = x[-ntest:], y[-ntest:]

reg.fit(x_train,y_train)

y_pred = reg.predict(x_test)

# Score only real observations: imputed target cells are set back to NaN so that
# compute_metrics drops them instead of counting predictions of the fill value.
y_test_observed = np.where(y_missing[-ntest:], np.nan, y_test)
R2, RMSPE = compute_metrics(y_test_observed.reshape(-1), y_pred.reshape(-1))
print(f'Performance on test set: \nR2={R2:.3f}, RMSPE={RMSPE:.3f}')

## Visualization for a given stock

In [None]:
stock_id = 101

# Predicted vs. true target volatility for one stock on the held-out rows.
plt.rcParams['figure.figsize'] = (6,6)
plt.scatter(y_test[:,stock_id], y_pred[:,stock_id],  color='black')
# Identity line: points lying on it are perfect predictions.
plt.plot(y_test[:,stock_id], y_test[:,stock_id], color='blue', linewidth=3)
plt.axis('equal')
plt.title(f'Stock ID={stock_id}')
# scatter() above places the true values on x and the predictions on y,
# so the labels must follow that order.
plt.xlabel('True Volatility')
plt.ylabel('Predicted Volatility')
plt.grid()
plt.show()

R2, RMSPE = compute_metrics(y_test[:,stock_id].reshape(-1), y_pred[:,stock_id].reshape(-1))
print(f'Performance for stock_id={stock_id}: \nR2={R2:.3f}, RMSPE={RMSPE:.3f}')

## Fit all data

In [None]:
# Refit the same estimator on all rows of x and y (training and held-out alike)
# before it is used to predict the competition test set.
reg.fit(x,y)

## Submission

In [None]:
# Realized-volatility feature per (stock_id, time_id) for the test order book,
# computed the same way as for the training book.
columns = ['stock_id','time_id','seconds_in_bucket','bid_price1', 'ask_price1', 'bid_size1', 'ask_size1']
book_df = (
    pq.read_table(data_path / "book_test.parquet", columns=columns)
    .to_pandas()
    .groupby(['stock_id', 'time_id'])
    .apply(realized_volatility_feature)
    .rename('volatility')
)

book_df

In [None]:
# row_id keyed by (stock_id, time_id), then joined column-wise with the
# volatility feature on that same index.
test_df = (
    pd.read_csv(data_path / "test.csv")
    .set_index(['stock_id', 'time_id'])['row_id']
)
test_data_df = pd.concat([book_df, test_df], axis=1).reset_index()

In [None]:
# Inspect the joined test table (stock_id, time_id, volatility, row_id).
test_data_df

In [None]:
# Same layout as the training matrix: one row per time_id, one column per
# stock_id 0..126, with NaN where a stock has no row in the bucket.
x_test = np.array([
    group.set_index("stock_id")["volatility"].reindex(np.arange(127))
    for _, group in test_data_df.groupby('time_id')
])
# Fill missing features with the mean of the available test features.
x_test = np.where(np.isnan(x_test), np.nanmean(x_test), x_test)

# Map each time_id to its row of per-stock predictions. Rows of x_test were built
# in the same groupby order, so zipping the two keeps them aligned.
y_test = {
    time_id: predicted_row
    for (time_id, _), predicted_row in zip(test_data_df.groupby('time_id'), reg.predict(x_test))
}



In [None]:
def predict(stock_id, time_id, row_id):
    """Look up the precomputed prediction for one (stock_id, time_id) pair.

    Reads the module-level ``y_test`` mapping, indexing it first by ``time_id``
    and then by ``stock_id``, prints the looked-up value and returns it.
    ``row_id`` is accepted so that a whole row can be unpacked into the call,
    but it is not used.
    """
    value = y_test[time_id][stock_id]
    print('time_id', time_id, 'stock_id', stock_id, value)
    return value

In [None]:
# Each row of the reset frame is (stock_id, time_id, row_id) and is unpacked
# positionally into predict().
predictions = (
    test_df
    .reset_index()
    .apply(lambda row: predict(*row), axis=1)
)

In [None]:
# `predictions` was computed row by row from test_df.reset_index(), so row_id must
# come from that same frame. test_data_df is an outer join with the book features:
# its row order and length can differ from test.csv, which would pair row_ids with
# the wrong targets or add rows with a missing row_id/target.
submission = pd.DataFrame({
    'row_id': test_df.reset_index()['row_id'],
    'target': predictions,
})
submission.to_csv('submission.csv', index=False)
submission

In [None]:
# Print the written file to confirm its header and contents.
!cat submission.csv