In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found beneath it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Columns: stock id, randomly shuffled time_id, and the target
# (realized volatility for the next 10 minutes).
train = pd.read_csv(
    filepath_or_buffer='../input/optiver-realized-volatility-prediction/train.csv'
)
train.head()

In [None]:
def calc_vol(df):
    """Return the square root of the sum of squared log-differences of ``df``.

    ``df`` is expected to be a pandas object (it must expose ``.diff()`` after
    ``np.log``); in this notebook it is a price Series for one group. The
    leading NaN produced by ``diff()`` is skipped by the pandas sum, so a
    single-element Series yields 0.0.
    """
    return np.sqrt(np.sum(np.log(df).diff() ** 2))

In [None]:
%%time
order_book_training = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')
# One parallel list per output column; each is extended once per stock file.
stock_id, time_id = [], []
wap1_val, wap2_val = [], []
wap1_total_val, wap2_total_val = [], []
for book_path in order_book_training:
    # The stock id is whatever follows the "=" in the path (e.g. ".../stock_id=0").
    temp_stock = int(book_path.split("=")[1])
    book = pd.read_parquet(book_path)

    # Level-1 WAP: each side's price is weighted by the opposite side's size.
    level1_num = book['bid_price1'] * book['ask_size1'] + book['ask_price1'] * book['bid_size1']
    level1_den = book['bid_size1'] + book['ask_size1']
    book['wap1'] = level1_num / level1_den

    # Level-2 WAP: same weighting applied to the second book level.
    bid2_term = book['bid_price2'] * book['ask_size2']
    ask2_term = book['ask_price2'] * book['bid_size2']
    book['wap2'] = (bid2_term + ask2_term) / (book['bid_size2'] + book['ask_size2'])

    # Two blends of both levels: the plain average of the two WAPs, and a
    # single WAP pooled over all four price/size pairs.
    book['wap_total1'] = (book['wap1'] + book['wap2']) / 2
    book['wap_total2'] = (level1_num + bid2_term + ask2_term) / (level1_den + book['bid_size2'] + book['ask_size2'])

    # Reduce every WAP column to one calc_vol value per time_id.
    by_bucket = book.groupby('time_id')
    vol = {
        column: by_bucket[column].agg(calc_vol)
        for column in ('wap1', 'wap2', 'wap_total1', 'wap_total2')
    }

    stock_id.extend([temp_stock] * len(vol['wap1']))
    time_id.extend(vol['wap1'].index)
    wap1_val.extend(vol['wap1'])
    wap2_val.extend(vol['wap2'])
    wap1_total_val.extend(vol['wap_total1'])
    wap2_total_val.extend(vol['wap_total2'])
past_volatility = pd.DataFrame({
    "stock_id": stock_id,
    "time_id": time_id,
    "wap1_val": wap1_val,
    "wap2_val": wap2_val,
    "wap1_total_val": wap1_total_val,
    "wap2_total_val": wap2_total_val,
})
past_volatility.head()

In [None]:
def get_agg_info(df):
    """Summarise ``df`` per (stock_id, time_id) group.

    For each of the columns seconds_in_bucket, price, size and order_count the
    mean, max, min and median are computed, giving 16 statistic columns named
    ``<stat>_<suffix>`` (suffixes: sec_in_bucket, price, size, order).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns stock_id, time_id, seconds_in_bucket, price,
        size and order_count.

    Returns
    -------
    pandas.DataFrame
        One row per (stock_id, time_id), with both keys restored as ordinary
        columns by ``reset_index``.
    """
    suffix_by_column = {
        'seconds_in_bucket': 'sec_in_bucket',
        'price': 'price',
        'size': 'size',
        'order_count': 'order',
    }
    # Statistic is the outer loop so the output columns come out grouped by
    # statistic (all means, then all maxes, ...).
    named_aggs = {
        f"{stat}_{suffix}": (column, stat)
        for stat in ('mean', 'max', 'min', 'median')
        for column, suffix in suffix_by_column.items()
    }
    grouped = df.groupby(['stock_id', 'time_id'])
    return grouped.agg(**named_aggs).reset_index()

In [None]:
trade_book_training = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/*')
# Collect one aggregated frame per stock and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and calling it inside the loop
# re-copied the whole accumulator on every iteration.
trade_frames = []
for i in trade_book_training:
    # The stock id is whatever follows the "=" in the path (e.g. ".../stock_id=0").
    temp_stock = int(i.split("=")[1])
    trade = pd.read_parquet(i)
    # get_agg_info groups on stock_id, so the column is added before aggregating.
    trade["stock_id"] = temp_stock
    trade_frames.append(get_agg_info(trade))
if not trade_frames:
    # pd.concat refuses an empty list; fail with a message that names the cause.
    raise FileNotFoundError("no trade parquet files matched the trade_train.parquet glob")
# The per-stock row labels carry no meaning, so a fresh RangeIndex is used.
trade_stat = pd.concat(trade_frames, ignore_index=True)
# Left merge: every (stock_id, time_id) with book features is kept even when
# there are no trade statistics for it.
stats = past_volatility.merge(trade_stat, on=["stock_id", "time_id"], how="left")
stats.head()

In [None]:
# Attach the engineered features to the training targets, then drop every row
# that has a missing value in any column.
joined = pd.merge(train, stats, how="left", on=["stock_id", "time_id"]).dropna()
joined.head()

In [None]:
# Target vector, and the feature matrix made of every other column of `joined`.
y = joined["target"]
X = joined.drop(columns="target")

In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

def rmspe(y_true, y_pred):
    """Root mean squared percentage error: sqrt(mean(((y_true - y_pred) / y_true) ** 2)).

    Both arguments are array-likes that support element-wise arithmetic
    (NumPy arrays or pandas Series).
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

# Shared 5-fold splitter; shuffling with a fixed seed keeps the folds repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=12)

In [None]:
# Baseline: a single linear model fitted on every column of X, scored per fold.
scores = []
for fold, (trn_idx, val_idx) in enumerate(kf.split(X)):

    # create dataset
    X_train, X_valid = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_valid = y.iloc[trn_idx], y.iloc[val_idx]

    model = LinearRegression()
    model.fit(X_train, y_train)
    # validation
    y_pred = model.predict(X_valid)
    RMSPE = round(rmspe(y_true=y_valid, y_pred=y_pred), 3)
    scores.append(RMSPE)
print(scores)
# Average over the folds actually scored rather than a literal 5, so the mean
# stays correct if the splitter's n_splits is changed.
print(sum(scores) / len(scores))

In [None]:
# Stock-by-stock training using only wap1_val
# Out-of-fold predictions are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and was quadratic inside the loop.
fold_frames = []
for stock, stock_joined in joined.groupby("stock_id"):
    # Per-stock names are used so the notebook-level X / y are not overwritten.
    X_stock = stock_joined[["wap1_val"]]
    y_stock = stock_joined["target"]
    for fold, (trn_idx, val_idx) in enumerate(kf.split(X_stock)):
        # create dataset
        X_train, X_valid = X_stock.iloc[trn_idx], X_stock.iloc[val_idx]
        y_train, y_valid = y_stock.iloc[trn_idx], y_stock.iloc[val_idx]

        model = LinearRegression()
        model.fit(X_train, y_train)
        # validation
        y_pred = model.predict(X_valid)
        # y_valid is a Series, so the frame keeps the original row labels.
        fold_frames.append(pd.DataFrame({
            "y_valid": y_valid,
            "y_pred": y_pred,
            "stock": stock,
            "fold": fold,
        }))
all_df = pd.concat(fold_frames)

# Score each fold over the pooled predictions of all stocks.
scores = []
for fold in range(kf.get_n_splits()):
    fold_score = all_df[all_df["fold"] == fold]
    RMSPE = round(rmspe(y_true=fold_score["y_valid"], y_pred=fold_score["y_pred"]), 3)
    scores.append(RMSPE)
print(scores)
print(sum(scores) / len(scores))

In [None]:
# Stock-by-stock training using all WAP features
# Out-of-fold predictions are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and was quadratic inside the loop.
wap_features = ["wap1_val", "wap2_val", "wap1_total_val", "wap2_total_val"]
fold_frames = []
for stock, stock_joined in joined.groupby("stock_id"):
    # Per-stock names are used so the notebook-level X / y are not overwritten.
    X_stock = stock_joined[wap_features]
    y_stock = stock_joined["target"]
    for fold, (trn_idx, val_idx) in enumerate(kf.split(X_stock)):
        # create dataset
        X_train, X_valid = X_stock.iloc[trn_idx], X_stock.iloc[val_idx]
        y_train, y_valid = y_stock.iloc[trn_idx], y_stock.iloc[val_idx]

        model = LinearRegression()
        model.fit(X_train, y_train)
        # validation
        y_pred = model.predict(X_valid)
        # y_valid is a Series, so the frame keeps the original row labels.
        fold_frames.append(pd.DataFrame({
            "y_valid": y_valid,
            "y_pred": y_pred,
            "stock": stock,
            "fold": fold,
        }))
all_df = pd.concat(fold_frames)

# Score each fold over the pooled predictions of all stocks.
scores = []
for fold in range(kf.get_n_splits()):
    fold_score = all_df[all_df["fold"] == fold]
    RMSPE = round(rmspe(y_true=fold_score["y_valid"], y_pred=fold_score["y_pred"]), 3)
    scores.append(RMSPE)
print(scores)
print(sum(scores) / len(scores))

In [None]:
# Stock-by-stock training using all features
# Out-of-fold predictions are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and was quadratic inside the loop.
fold_frames = []
for stock, stock_joined in joined.groupby("stock_id"):
    # Per-stock names are used so the notebook-level X / y are not overwritten.
    # The target and both id columns are excluded from the features.
    X_stock = stock_joined.drop(columns=["target", "stock_id", "time_id"])
    y_stock = stock_joined["target"]
    for fold, (trn_idx, val_idx) in enumerate(kf.split(X_stock)):
        # create dataset
        X_train, X_valid = X_stock.iloc[trn_idx], X_stock.iloc[val_idx]
        y_train, y_valid = y_stock.iloc[trn_idx], y_stock.iloc[val_idx]

        model = LinearRegression()
        model.fit(X_train, y_train)
        # validation
        y_pred = model.predict(X_valid)
        # y_valid is a Series, so the frame keeps the original row labels.
        fold_frames.append(pd.DataFrame({
            "y_valid": y_valid,
            "y_pred": y_pred,
            "stock": stock,
            "fold": fold,
        }))
all_df = pd.concat(fold_frames)

# Score each fold over the pooled predictions of all stocks.
scores = []
for fold in range(kf.get_n_splits()):
    fold_score = all_df[all_df["fold"] == fold]
    RMSPE = round(rmspe(y_true=fold_score["y_valid"], y_pred=fold_score["y_pred"]), 3)
    scores.append(RMSPE)
print(scores)
print(sum(scores) / len(scores))

In [None]:
# Display the raw order-book partition for stock_id=0.
pd.read_parquet(
    path="/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0"
)

In [None]:
# Display the raw trade partition for stock_id=0.
pd.read_parquet(
    path="/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0"
)