In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

First things first: let's read the training set.

It contains 3 columns:
1. stock_id: ID code for the stock
2. time_id: ID code for the time bucket
3. target: The realized volatility computed over the 10 minute window following the feature data under the same stock/time_id.

We want to predict the `target`, i.e. the realized volatility over the following ten-minute window.

In [None]:
# Training labels with the columns stock_id, time_id and target.
train_csv_path = '../input/optiver-realized-volatility-prediction/train.csv'
train = pd.read_csv(train_csv_path)
train.head()

Let's also list the order-book parquet files of the training set (they are read one by one further below).

In [None]:
import glob

# Every entry under book_train.parquet; the stock id is later parsed from the
# text following "=" in each returned path.
book_train_pattern = '/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*'
order_book_training = glob.glob(book_train_pattern)

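Here are two helper functions: `calc_wap`, which (despite its name) computes the realized volatility of a price series, and `rel_vol_time_id`, which computes the **WAP** (weighted average price) of an order book and its realized volatility for every `time_id`.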

In [None]:
def calc_wap(df):
    """Return the realized volatility of a price series.

    Despite its name, this function does not compute a weighted average
    price: it takes the log of ``df``, differences consecutive values
    (log returns) and returns the square root of the sum of their squares.

    Parameters
    ----------
    df : pandas.Series
        Price values in the order in which returns should be taken. It must
        support ``.diff()`` after ``np.log`` is applied.

    Returns
    -------
    float
        ``sqrt(sum(log_return ** 2))``. The first difference is NaN and is
        skipped by the pandas sum.
    """
    log_returns = np.log(df).diff()
    squared_returns = log_returns**2
    sum_of_squares = np.sum(squared_returns)
    return np.sqrt(sum_of_squares)

def rel_vol_time_id(path):
    """Compute the realized volatility of the WAP per ``time_id`` for one order book.

    Parameters
    ----------
    path : str
        Location passed to ``pd.read_parquet``. The data must provide the
        columns ``time_id``, ``bid_price1``, ``ask_price1``, ``bid_size1``
        and ``ask_size1``.

    Returns
    -------
    pandas.Series
        Indexed by ``time_id``; each value is ``calc_wap`` applied to the
        WAP values of that ``time_id``.
    """
    book = pd.read_parquet(path)
    bid_price, ask_price = book['bid_price1'], book['ask_price1']
    bid_size, ask_size = book['bid_size1'], book['ask_size1']

    # Weighted average price: each side's price is weighted by the size
    # quoted on the opposite side.
    book['wap'] = (bid_price*ask_size + ask_price*bid_size) / (bid_size + ask_size)
    return book.groupby('time_id')['wap'].agg(calc_wap)

The following code chunk will take a while: for each `stock_id`, it computes the realized volatility for every `time_id` of that stock.


In [None]:
%%time 
# Parallel lists: entry k of each list describes the same (stock, time bucket) pair.
stock_id, time_id, relvol = [], [], []
for book_path in order_book_training:
    # the stock id is the integer that follows "=" in the path
    current_stock = int(book_path.split("=")[1])
    # realized volatility for every time_id of this stock
    stock_relvol = rel_vol_time_id(book_path)
    stock_id.extend([current_stock] * stock_relvol.shape[0])
    time_id.extend(stock_relvol.index)
    relvol.extend(stock_relvol)

Create a pandas df

In [None]:
# Assemble the three parallel lists into one frame, one row per list entry.
volatility_columns = {"stock_id": stock_id, "time_id": time_id, "volatility": relvol}
past_volatility = pd.DataFrame(volatility_columns)

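Let's calculate the baseline R2 and RMSPE (root mean squared percentage error), using the past realized volatility directly as the prediction: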

In [None]:
from sklearn.metrics import r2_score

# Left join: keep every row of `train` and attach the volatility computed
# for the same (stock_id, time_id) pair.
joined = train.merge(past_volatility, how = "left", on = ["stock_id","time_id"])
raw_r2 = r2_score(y_true = joined['target'], y_pred = joined['volatility'])
R2 = round(raw_r2, 3)
print(f'R2 score: {R2}')

In [None]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error between ``y_true`` and ``y_pred``.

    Each error is taken relative to the true value,
    ``(y_true - y_pred) / y_true``, then squared, averaged and
    square-rooted.

    Parameters
    ----------
    y_true, y_pred
        Scalars or array-likes that support element-wise arithmetic
        (e.g. numpy arrays or pandas Series).

    Returns
    -------
    float
        The RMSPE value.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

# Keep the score under its own name: assigning it to `rmspe` would rebind the
# metric function to a float, so any later `rmspe(...)` call would fail with
# "object is not callable".
rmspe_score = rmspe(joined['target'], joined['volatility'])
print(f'RMSPE: {rmspe_score}')

After all the preprocessing, it's time for our baseline model: we'll use `PolynomialFeatures` from sklearn's preprocessing module.

Polynomial regression extends the linear model by adding extra predictors, obtained by raising each of the original predictors to a power. For example, a cubic regression uses three variables, $X$, $X^2$, and $X^3$, as predictors. This approach provides a simple way to obtain a non-linear fit to the data.

**The degree is a parameter to be tuned.**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# for training
def linear_training(X,y,degree):
    """Fit a weighted polynomial regression of ``y`` on a single predictor ``X``.

    Parameters
    ----------
    X : array-like
        Values of the single predictor; reshaped into one column.
    y : array-like
        Target values. They also define the sample weights, so they must be
        non-zero for the weights to be finite.
    degree : int
        Degree handed to ``PolynomialFeatures``.

    Returns
    -------
    LinearRegression
        The fitted estimator. Because the target is passed as a column, its
        ``predict`` output is two-dimensional.
    """
    # Weighting each sample by 1 / y^2 makes the least-squares fit minimise
    # squared *relative* errors rather than absolute ones.
    sample_weights = 1/np.square(y)
    predictor_column = np.array(X).reshape(-1,1)
    target_column = np.array(y).reshape(-1,1)
    # expand the single column into its polynomial terms
    design_matrix = PolynomialFeatures(degree = degree).fit_transform(predictor_column)
    regressor = LinearRegression()
    regressor.fit(design_matrix, target_column, sample_weight = sample_weights)
    return regressor


degree = 2 # polynomial degree shared by training and inference
stock_id_train = train.stock_id.unique() # every stock_id present in the train set
models = {} # stock_id -> regression fitted on that stock's rows only
for current_stock in stock_id_train:
    stock_rows = joined[joined["stock_id"]==current_stock]
    models[current_stock] = linear_training(stock_rows["volatility"],
                                            stock_rows["target"],
                                            degree)

In [None]:
# collect the paths of all test order books
book_test_pattern = '/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*'
order_book_test = glob.glob(book_test_pattern)

In [None]:
%%time 
# Test-specific accumulators. Reusing the names `stock_id` / `time_id` /
# `relvol` here would overwrite the lists built from the training books, so
# re-running the cell that builds `past_volatility` afterwards would silently
# produce test data.
test_stock_id = []
test_time_id = []
test_relvol = []
for book_path in order_book_test:
    # the stock id is the integer that follows "=" in the path
    current_stock = int(book_path.split("=")[1])
    # realized volatility for every time_id of this stock
    stock_relvol = rel_vol_time_id(book_path)
    test_stock_id.extend([current_stock] * stock_relvol.shape[0])
    test_time_id.extend(stock_relvol.index)
    test_relvol.extend(stock_relvol)

# one row per (stock_id, time_id) pair found in the test order books
past_test_volatility = pd.DataFrame({"stock_id": test_stock_id,
                                     "time_id": test_time_id,
                                     "volatility": test_relvol})

The last two steps are making the predictions and writing the submission file.

In [None]:
# for inference
def linear_inference(models, stock_id, past_volatility, degree):
    """Predict the target for one observation with that stock's fitted model.

    Parameters
    ----------
    models : dict
        Maps a stock id to a fitted estimator exposing ``predict``.
    stock_id
        Key used to look up the estimator in ``models``.
    past_volatility : float
        Single predictor value for the observation.
    degree : int
        Polynomial degree used to expand the predictor; it has to match the
        degree the estimator was trained with.

    Returns
    -------
    The scalar at position ``[0][0]`` of the estimator's prediction.
    """
    stock_model = models[stock_id]
    # expand the single value into the same polynomial terms used for training
    expander = PolynomialFeatures(degree = degree)
    features = expander.fit_transform([[past_volatility]])
    prediction = stock_model.predict(features)
    # one sample, one output column -> unwrap the only entry
    return prediction[0][0]

In [None]:
def _row_id(row):
    # submission key: "<stock_id>-<time_id>", both rendered as integers
    return str(int(row.stock_id)) + '-' + str(int(row.time_id))

def _predict_target(row):
    # prediction from the model of this row's stock, fed with its past volatility
    return linear_inference(models, row.stock_id, row.volatility, degree)

submission = pd.DataFrame({'row_id' : [], 'target' : []})
submission['row_id'] = past_test_volatility.apply(_row_id, axis=1)
submission['target'] = past_test_volatility.apply(_predict_target, axis = 1)

In [None]:
# Write the predictions to 'submission.csv'; index = False keeps the frame's
# index out of the file so only the frame's columns are written.
submission.to_csv('submission.csv',index = False)

Hope you liked this basic notebook and hope it would be helpful, more advanced are coming! Please upvote! :)  