* <a href="#eda">EDA</a>

* <a href="#func">Functions</a>

* <a href="#model">Model</a>

* <a href="#sub">Submission</a>

In [None]:
import numpy as np
import pandas as pd
import glob

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(16,8)})
sns.set(font_scale=1.3)
plt.style.use('fivethirtyeight')

from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training table. Later cells read its `stock_id` and `target` columns
# and join it with the computed volatility on (`stock_id`, `time_id`).
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
train.head()

In [None]:
# One path per stock partition of the training order book.
# glob returns matches in arbitrary (filesystem) order, so sort them to make the
# row order of everything derived from this list reproducible between runs.
book_train = sorted(glob.glob('../input/optiver-realized-volatility-prediction/book_train.parquet/*'))

<a name="eda"></a>
    
# <p style="background-color:#1c56c9; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  25px; color:#ffffff; padding-top:5px; padding-bottom:5px;">EDA</p>

In [None]:
# Order book of stock 0, narrowed down to the single time bucket with time_id == 5.
book_0 = pd.read_parquet(
    '../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0'
)
book_5 = book_0.loc[book_0['time_id'] == 5]
book_5.head()

In [None]:
# Correlate only numeric columns that actually vary. `book_5` is filtered to a
# single `time_id`, so that column is constant and its correlation is undefined
# (NaN), which would render as an empty row/column in the heatmap.
book_5_numeric = book_5.select_dtypes(include='number')
book_5_varying = book_5_numeric.loc[:, book_5_numeric.nunique() > 1]

fig, ax = plt.subplots(figsize=(18, 10))
sns.heatmap(book_5_varying.corr(), annot=True, cmap=plt.cm.Blues, ax=ax);

In [None]:
# Summary statistics (one row per column) with visual cues:
# bars on the mean, a red gradient on std and a diverging gradient on the median.
(
    book_5.describe().T.style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

In [None]:
samples = ["bid_price1","bid_price2","ask_price1","ask_price2"]

# One figure with a stacked panel per price series. The figure is created once,
# outside the loop; creating it per iteration would put every series into its
# own, mostly empty, figure.
fig, axes = plt.subplots(len(samples), 1, figsize=(18, 18), squeeze=False)
for ax, col in zip(axes.ravel(), samples):
    ax.plot(book_5["seconds_in_bucket"], book_5[col])
    ax.set_title(col)
fig.tight_layout()
plt.show()

# Second figure: all four series overlaid on shared axes for direct comparison.
plt.figure(figsize=(18,6));
for col in samples:
    plt.plot(book_5["seconds_in_bucket"], book_5[col], label=col)
plt.legend(fontsize=12);

In [None]:
# Trades of stock 0, restricted to the time bucket with time_id == 5.
# (The variable keeps its `book_1` name because later cells refer to it.)
book_1 = pd.read_parquet(
    '../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0'
).loc[lambda trades: trades['time_id'] == 5]
book_1.head()

In [None]:
# Overlay the traded price on top of the four order-book price series.
plt.figure(figsize=(18,6))

seconds = book_5["seconds_in_bucket"]
for col in samples:
    plt.plot(seconds, book_5[col], label=col)

plt.plot(
    book_1["seconds_in_bucket"],
    book_1["price"],
    label="trade_parquet",
    lw=5,
    color='magenta',
)
plt.legend(fontsize=12);

<a name="func"></a>
    
# <p style="background-color:#1c56c9; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  25px; color:#ffffff; padding-top:5px; padding-bottom:5px;">Functions</p>

In [None]:
def calc_volume(df):
    """Return the square root of the sum of squared log differences of ``df``.

    ``df`` is a pandas object (the notebook passes one group of the ``wap``
    column). The first difference is NaN and is skipped by the sum. The notebook
    stores the result in a column named ``volatility``.
    """
    log_returns = np.log(df).diff()
    squared_total = np.sum(np.square(log_returns))
    return np.sqrt(squared_total)

def calc_wap(path):
    """Read an order book from ``path`` and aggregate its WAP per ``time_id``.

    The weighted average price crosses sizes and prices of the first level:
    ``(bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1)``.
    It is stored as a new ``wap`` column and then reduced with ``calc_volume``
    for every ``time_id``.

    Returns a Series indexed by ``time_id``.
    """
    book = pd.read_parquet(path)

    bid_price, ask_price = book['bid_price1'], book['ask_price1']
    bid_size, ask_size = book['bid_size1'], book['ask_size1']

    # Each price is weighted by the size on the opposite side of the book.
    book['wap'] = (bid_price * ask_size + ask_price * bid_size) / (bid_size + ask_size)

    return book.groupby('time_id')['wap'].agg(calc_volume)

In [None]:
# Collect, for every training book partition, the per-time_id value returned by
# calc_wap together with the stock id parsed from the ".../stock_id=<n>" path.
stock_id, time_id, relvol = [], [], []
for book_path in book_train:
    parsed_stock = int(book_path.split("=")[1])
    per_time_vol = calc_wap(book_path)
    stock_id.extend([parsed_stock] * len(per_time_vol))
    time_id.extend(per_time_vol.index)
    relvol.extend(per_time_vol)

In [None]:
# Assemble the three parallel lists into one frame: one row per
# (stock_id, time_id) pair with the value computed by calc_wap as `volatility`.
past_volatility = pd.DataFrame({'stock_id': stock_id, 'time_id': time_id, 'volatility': relvol})

In [None]:
# Attach the computed volatility to every training row (left join keeps all of
# `train`) and score it directly against the target as a naive baseline.
joined = pd.merge(train, past_volatility, how='left', on=['stock_id', 'time_id'])
R2 = round(r2_score(y_true=joined['target'], y_pred=joined['volatility']), 3)
print(f'R2: {R2}')

In [None]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``.

    Each error is divided by the corresponding true value before squaring.
    """
    relative_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(relative_error)))

# Keep the score under its own name: assigning it to `rmspe` would replace the
# metric function with a float and make any later `rmspe(...)` call fail.
rmspe_score = rmspe(joined['target'], joined['volatility'])
print(f'RMSPE: {rmspe_score}')

<a name="model"></a>
    
# <p style="background-color:#1c56c9; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  25px; color:#ffffff; padding-top:5px; padding-bottom:5px;">Model</p>

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

def training(X, y, degree):
    """Fit a weighted polynomial regression of ``y`` on the single feature ``X``.

    ``X`` is reshaped to one column and expanded with ``PolynomialFeatures`` of
    the given ``degree``. Each sample is weighted by ``1 / y**2``. The target is
    passed as a column vector, so the fitted model predicts 2-D arrays.

    Returns the fitted ``LinearRegression``.
    """
    feature_column = np.array(X).reshape(-1, 1)
    design_matrix = PolynomialFeatures(degree=degree).fit_transform(feature_column)

    target_column = np.array(y).reshape(-1, 1)
    weights = 1 / np.square(y)

    regressor = LinearRegression()
    return regressor.fit(design_matrix, target_column, sample_weight=weights)


# Fit one polynomial model per stock, mapping computed volatility -> target.
stock_id_train = train.stock_id.unique()
degree = 4
models = {}
for sid in stock_id_train:
    stock_rows = joined.loc[joined['stock_id'] == sid]
    models[sid] = training(stock_rows['volatility'], stock_rows['target'], degree)

In [None]:
# One path per stock partition of the test order book.
# glob returns matches in arbitrary (filesystem) order, so sort them to keep the
# row order of the resulting submission reproducible between runs.
book_test = sorted(glob.glob('../input/optiver-realized-volatility-prediction/book_test.parquet/*'))

In [None]:
# Same aggregation as for the training books, applied to the test partitions:
# the stock id is parsed from the ".../stock_id=<n>" path and calc_wap supplies
# one value per time_id.
stock_id, time_id, relvol = [], [], []
for book_path in book_test:
    parsed_stock = int(book_path.split("=")[1])
    per_time_vol = calc_wap(book_path)
    stock_id.extend([parsed_stock] * len(per_time_vol))
    time_id.extend(per_time_vol.index)
    relvol.extend(per_time_vol)

past_test_volatility = pd.DataFrame(
    {'stock_id': stock_id, 'time_id': time_id, 'volatility': relvol}
)

In [None]:
def linear_inference(models, stock_id, past_volatility, degree):
    """Predict the target for one scalar ``past_volatility`` of ``stock_id``.

    Looks up the model stored under ``stock_id`` in ``models``, expands the
    single value with ``PolynomialFeatures`` of the given ``degree`` and returns
    the scalar at position ``[0][0]`` of the model's 2-D prediction.
    """
    stock_model = models[stock_id]
    expanded = PolynomialFeatures(degree=degree).fit_transform([[past_volatility]])
    prediction = stock_model.predict(expanded)
    return prediction[0][0]

<a name="sub"></a>
    
# <p style="background-color:#1c56c9; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  25px; color:#ffffff; padding-top:5px; padding-bottom:5px;">Submission</p>

In [None]:
# Build the submission in one step instead of filling an empty frame column by
# column. Row-wise `DataFrame.apply` returns a DataFrame (not a Series) when the
# input has no rows, which would break the column assignment; the vectorised
# string concatenation and the plain loop below behave the same for any length.

# row_id has the form "<stock_id>-<time_id>" with both parts as integers.
row_ids = (
    past_test_volatility['stock_id'].astype(int).astype(str)
    + '-'
    + past_test_volatility['time_id'].astype(int).astype(str)
)

# One prediction per row, using the model fitted for that row's stock.
targets = [
    linear_inference(models, sid, vol, degree)
    for sid, vol in zip(past_test_volatility['stock_id'],
                        past_test_volatility['volatility'])
]

submission = pd.DataFrame({
    'row_id': row_ids,
    'target': pd.Series(targets, index=past_test_volatility.index, dtype='float64'),
})

In [None]:
# Display the submission frame for a quick visual check before it is written out.
submission

In [None]:
# Write the predictions to submission.csv, leaving out the DataFrame index.
submission.to_csv('submission.csv',index = False)