In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Training labels: stock_id, randomly shuffled time_id, and the realized
# volatility for the next 10 minutes (the `target` column scored further down).
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
# Preview the first rows.
train.head()

In [None]:
def calc_vol(df):
    """Return the square root of the summed squared log-differences of `df`.

    In this notebook it is applied per group to the `wap` column. The first
    difference is NaN and is skipped by the sum, so a single-element input
    yields 0.0.
    """
    log_returns = np.log(df).diff()
    squared_sum = np.sum(np.square(log_returns))
    return np.sqrt(squared_sum)

def rel_vol_time_id(path):
    """Read one order-book parquet at `path` and return volatility per time_id.

    A weighted average price (WAP) is computed from the level-1 bid/ask prices
    and sizes, then `calc_vol` is applied to the WAP within each `time_id`.

    Returns:
        The result of the groupby aggregation: one value per `time_id`,
        indexed by `time_id`.
    """
    book = pd.read_parquet(path)

    bid_price, ask_price = book['bid_price1'], book['ask_price1']
    bid_size, ask_size = book['bid_size1'], book['ask_size1']

    # WAP: each price is weighted by the size on the opposite side of the book.
    weighted_prices = bid_price * ask_size + ask_price * bid_size
    total_size = bid_size + ask_size
    book['wap'] = weighted_prices / total_size

    wap_by_time_id = book.groupby('time_id')['wap']
    return wap_by_time_id.agg(calc_vol)

In [None]:
%%time 

# glob returns matches in arbitrary order; sort them so the row order of
# past_volatility (and the preview below) is the same on every run.
order_book_training = sorted(
    glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')
)
stock_id = []
time_id = []
relvol = []
for book_path in order_book_training:
    # The stock id is the integer after "=" in the path. Split on the LAST "="
    # so an "=" in any other path component cannot break the parse.
    temp_stock = int(book_path.rsplit("=", 1)[1])
    # Realized volatility for every time_id of this stock, indexed by time_id.
    temp_relvol = rel_vol_time_id(book_path)
    # Repeat the stock id once per time_id so the three lists stay aligned.
    stock_id.extend([temp_stock] * len(temp_relvol))
    time_id.extend(temp_relvol.index)
    relvol.extend(temp_relvol)
past_volatility = pd.DataFrame({"stock_id": stock_id, "time_id": time_id, "volatility": relvol})
past_volatility.head()

In [None]:
from sklearn.metrics import r2_score

# Attach the past-volatility estimate to the labelled rows; the left join keeps
# every row of `train`. Then score the estimate against the target.
joined = pd.merge(train, past_volatility, how="left", on=["stock_id", "time_id"])
R2 = round(r2_score(joined['target'], joined['volatility']), 3)
print(f'R2 score: {R2}')

In [None]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error of `y_pred` relative to `y_true`.

    Each error is divided by the corresponding true value before squaring,
    averaging and taking the square root.
    """
    pct_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(pct_error ** 2))

# Score the past-volatility estimate against the training target with RMSPE.
train_rmspe = rmspe(y_true=joined['target'], y_pred=joined['volatility'])
print(f'RMSPE: {train_rmspe}')

In [None]:
# Order-book paths for the test set. glob returns matches in arbitrary order,
# so sort them to make downstream row order reproducible between runs.
book_test = sorted(glob.glob('../input/optiver-realized-volatility-prediction/book_test.parquet/*'))

In [None]:

stock_id = []
time_id = []
relvol = []
for book_path in book_test:
    # The stock id is the integer after "=" in the path. Split on the LAST "="
    # so an "=" in any other path component cannot break the parse.
    temp_stock = int(book_path.rsplit("=", 1)[1])
    # Realized volatility for every time_id of this stock, indexed by time_id.
    temp_relvol = rel_vol_time_id(book_path)
    # Repeat the stock id once per time_id so the three lists stay aligned.
    stock_id.extend([temp_stock] * len(temp_relvol))
    time_id.extend(temp_relvol.index)
    relvol.extend(temp_relvol)

past_test_volatility = pd.DataFrame({'stock_id': stock_id, 'time_id': time_id, 'volatility': relvol})

In [None]:
# Build the submission in one vectorised pass instead of a row-wise apply.
# row_id is "<stock_id>-<time_id>"; both ids are cast to int first so they are
# rendered without a decimal point even if the columns were upcast to float.
submission = pd.DataFrame({
    'row_id': (
        past_test_volatility['stock_id'].astype(int).astype(str)
        + '-'
        + past_test_volatility['time_id'].astype(int).astype(str)
    ),
    # The past realized volatility is used directly as the prediction.
    'target': past_test_volatility['volatility'],
})

submission.to_csv('submission.csv', index=False)


In [None]:
# Preview the first rows of the submission frame that was written to submission.csv.
submission.head()