In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training labels. Per the original note the columns are: stock id,
# randomly shuffled time_id, and the realized volatility for the next 10 minutes
# (presumably the `target` column used by later cells - confirm against the CSV).
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
# Preview the first rows.
train.head()

In [None]:
def calc_vol(df):
    """Return the realized volatility of a price series.

    The series is log-transformed and differenced to obtain consecutive log
    returns; the result is the square root of the sum of their squares.
    """
    log_returns = np.log(df).diff()
    squared_returns = log_returns ** 2
    return np.sqrt(np.sum(squared_returns))

In [None]:
%%time

order_book_training = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')

# Aggregations applied per time_id to every stock's order book. The dict does
# not depend on the stock, so it is built once outside the loop. Key order fixes
# the output column order. The aggregator name becomes the column suffix
# ("<col>_mean", "<col>_calc_vol"); calc_vol is defined in an earlier cell.
create_feature_dict = {
    'bid_price1': ['mean'],
    'bid_size1': ['mean'],
    'ask_price1': ['mean'],
    'ask_size1': ['mean'],
    'bid_price2': ['mean'],
    'bid_size2': ['mean'],
    'ask_price2': ['mean'],
    'ask_size2': ['mean'],
    'bid_volume1': ['mean'],
    'bid_volume2': ['mean'],
    'ask_volume1': ['mean'],
    'ask_volume2': ['mean'],
    'bid_volume': ['mean'],
    'ask_volume': ['mean'],
    'wap1': [calc_vol],
    'wap2': [calc_vol],
    'wap_total1': [calc_vol],
    'wap_total2': [calc_vol],
    'wap_balance': ['mean'],
    'price_spread': ['mean'],
    'bid_spread': ['mean'],
    'ask_spread': ['mean'],
    'total_size': ['mean'],
    'total_volume': ['mean'],
    'size_imbalance': ['mean'],
    'size_imbalance_spread': ['mean'],
    'volume_imbalance': ['mean'],
    'volume_imbalance_spread': ['mean'],
}

# Collect one feature frame per stock and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the whole accumulated frame on every iteration.
order_ft_parts = []
for book_path in order_book_training:
    # The stock id is the integer that follows "=" in the partition path.
    temp_stock = int(book_path.split("=")[1])
    book = pd.read_parquet(book_path)

    # Price * size at each of the two book levels, per side and in total.
    book['bid_volume1'] = book['bid_price1'] * book['bid_size1']
    book['bid_volume2'] = book['bid_price2'] * book['bid_size2']
    book['bid_volume'] = book['bid_volume1'] + book['bid_volume2']
    book['ask_volume1'] = book['ask_price1'] * book['ask_size1']
    book['ask_volume2'] = book['ask_price2'] * book['ask_size2']
    book['ask_volume'] = book['ask_volume1'] + book['ask_volume2']

    # Weighted average prices: each side's price is weighted by the size
    # quoted on the opposite side.
    book['wap1'] = (book['bid_price1']*book['ask_size1'] + book['ask_price1']*book['bid_size1']) / (book['bid_size1'] + book['ask_size1'])
    book['wap2'] = (book['bid_price2']*book['ask_size2'] + book['ask_price2']*book['bid_size2']) / (book['bid_size2'] + book['ask_size2'])
    book['wap_total1'] = (book['wap1'] + book['wap2']) / 2
    book['wap_total2'] = (book['bid_price1'] * book['ask_size1'] + book['ask_price1'] * book['bid_size1'] + book['bid_price2'] * book['ask_size2'] + book['ask_price2'] * book['bid_size2']) / (book['bid_size1'] + book['ask_size1'] + book['bid_size2']+ book['ask_size2'])
    book['wap_balance'] = (book['wap1'] - book['wap2']).abs()

    # Spreads, and size / volume imbalance between the ask and bid sides.
    ask_size = book['ask_size1'] + book['ask_size2']
    bid_size = book['bid_size1'] + book['bid_size2']
    book['price_spread'] = (book['ask_price1'] - book['bid_price1']) / (book['ask_price1'] + book['bid_price1'])
    book['bid_spread'] = book['bid_price1'] - book['bid_price2']
    book['ask_spread'] = book['ask_price1'] - book['ask_price2']
    book['total_size'] = ask_size + bid_size
    book['total_volume'] = book['ask_volume'] + book['bid_volume']
    book['size_imbalance'] = (ask_size - bid_size).abs()
    book['size_imbalance_spread'] = book['size_imbalance'] / book['total_size']
    book['volume_imbalance'] = (book['ask_volume'] - book['bid_volume']).abs()
    book['volume_imbalance_spread'] = book['volume_imbalance'] / book['total_volume']

    df_feature = book.groupby(['time_id']).agg(create_feature_dict).reset_index()
    df_feature['stock_id'] = temp_stock
    order_ft_parts.append(df_feature)

# An empty glob result yields an empty frame, as the original accumulator did.
order_ft = pd.concat(order_ft_parts) if order_ft_parts else pd.DataFrame()
# Flatten the two-level (column, aggregator) labels into "column_aggregator".
# The key columns have an empty second level, hence the trailing "_" renamed below.
order_ft.columns = ['_'.join(col) for col in order_ft.columns.values]
order_ft = order_ft.rename(columns={"time_id_": "time_id", "stock_id_": "stock_id"})
order_ft.head()

In [None]:
def get_agg_info(df):
    df["size_all"] = df["size"] * df["order_count"]
    df["volume"] = df["price"] * df["size_all"]
    agg_df = df.groupby(['time_id']).agg(mean_sec_in_bucket = ('seconds_in_bucket', 'mean'), 
                                                     mean_price = ('price', 'mean'),
                                                     mean_size = ('size', 'mean'),
                                                     mean_order = ('order_count', 'mean'),
                                                     mean_size_all = ('size_all', 'mean'),
                                                     mean_volume = ('volume', 'mean'),
                                                     max_sec_in_bucket = ('seconds_in_bucket', 'max'), 
                                                     max_price = ('price', 'max'),
                                                     max_size = ('size', 'max'),
                                                     max_order = ('order_count', 'max'),
                                                     max_size_all = ('size_all', 'max'),
                                                     max_volume = ('volume', 'max'),
                                                     min_sec_in_bucket = ('seconds_in_bucket', 'min'), 
                                                     min_price = ('price', 'min'),
                                                     min_size = ('size', 'min'),
                                                     min_order = ('order_count', 'min'),
                                                     min_size_all = ('size_all', 'min'),
                                                     min_volume = ('volume', 'min'),
                                                     median_sec_in_bucket = ('seconds_in_bucket', 'median'), 
                                                     median_price = ('price', 'median'),
                                                     median_size = ('size', 'median'),
                                                     median_order = ('order_count', 'median'),
                                                     median_size_all = ('size_all', 'median'),
                                                     median_volume = ('volume', 'median'),
                                                     sum_size = ('size', 'sum'),
                                                     sum_order = ('order_count', 'sum'),
                                                     sum_size_all = ('size_all', 'sum'),
                                                     sum_volume = ('volume', 'sum')
                                                    ).reset_index()
    return agg_df

In [None]:
trade_book_training = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/*')

# Collect one aggregated frame per stock and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration.
trade_stat_parts = []
for trade_path in trade_book_training:
    # The stock id is the integer that follows "=" in the partition path.
    temp_stock = int(trade_path.split("=")[1])
    trade = pd.read_parquet(trade_path)
    trade_val = get_agg_info(trade)
    trade_val["stock_id"] = temp_stock
    trade_stat_parts.append(trade_val)
# An empty glob result yields an empty frame, as the original accumulator did.
trade_stat = pd.concat(trade_stat_parts) if trade_stat_parts else pd.DataFrame()

# Left merge: every order-book row is kept. Rows whose (stock_id, time_id)
# has no trade aggregate get NaN in the trade columns.
stats = order_ft.merge(trade_stat, on=["stock_id", "time_id"], how="left")
stats.head()

In [None]:
# Absolute gap between a per-bucket trade statistic and the matching mean
# order-book statistic. Each entry is (new column, trade column, book column);
# columns are added to `stats` in the order listed.
spread_definitions = [
    ('trade_bid_price_spread1', 'mean_price', 'bid_price1_mean'),
    ('trade_bid_price_spread2', 'mean_price', 'bid_price2_mean'),
    ('trade_ask_price_spread1', 'mean_price', 'ask_price1_mean'),
    ('trade_ask_price_spread2', 'mean_price', 'ask_price2_mean'),
    ('trade_bid_size_spread1', 'mean_size_all', 'bid_size1_mean'),
    ('trade_bid_size_spread2', 'mean_size_all', 'bid_size2_mean'),
    ('trade_ask_size_spread1', 'mean_size_all', 'ask_size1_mean'),
    ('trade_ask_size_spread2', 'mean_size_all', 'ask_size2_mean'),
    ('trade_bid_volume_spread1', 'mean_volume', 'bid_volume1_mean'),
    ('trade_bid_volume_spread2', 'mean_volume', 'bid_volume2_mean'),
    ('trade_ask_volume_spread1', 'mean_volume', 'ask_volume1_mean'),
    ('trade_ask_volume_spread2', 'mean_volume', 'ask_volume2_mean'),
    ('trade_bid_volume_spread', 'mean_volume', 'bid_volume_mean'),
    ('trade_ask_volume_spread', 'mean_volume', 'ask_volume_mean'),
]
for spread_col, trade_col, book_col in spread_definitions:
    stats[spread_col] = (stats[trade_col] - stats[book_col]).abs()
stats.columns

In [None]:
# Attach the engineered features to the training rows (left merge on the
# stock_id/time_id keys), then drop every row that has at least one missing
# value - e.g. rows with no matching feature row in `stats`.
joined = train.merge(stats, on = ["stock_id","time_id"], how = "left").dropna()

In [None]:
pd.set_option('display.max_rows', 100)
# Correlation of every feature with the target, strongest positive first.
# The key columns are dropped by name rather than by position, and the
# target's own entry (always 1.0) is removed by label. The previous `[:-1]`
# slice removed the last feature in the frame instead of the target row.
(
    joined.drop(columns=['stock_id', 'time_id'])
    .corr()['target']
    .drop('target')
    .sort_values(ascending=False)
)

In [None]:
# Number of columns in the merged feature table (the stock_id/time_id keys included).
len(stats.columns)

In [None]:
# Split the joined frame into predictors and target. X still carries the
# stock_id/time_id key columns here; the training cells below rebuild X and y
# for each stock.
X = joined.drop("target", axis=1)
y = joined["target"]

In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

# 10-fold splitter shared by the training cells below. Shuffling with a fixed
# random_state keeps the fold assignment reproducible between runs.
kf = KFold(n_splits=10, random_state=133, shuffle=True)

In [None]:
# Let pandas show up to 100 columns when displaying wide frames.
pd.set_option('display.max_columns', 100)

In [None]:
# Inspect the predictor frame.
X

In [None]:
# Summary statistics for every predictor column.
X.describe()

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
# Pairwise correlations between all columns of the joined frame.
corr = joined.corr()

# Draw the matrix as a heatmap on an explicitly created 20x20 figure,
# labelling both axes with the column names.
fig, ax = plt.subplots(figsize=(20, 20))
tick_labels = corr.columns
sns.heatmap(corr, xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)

In [None]:

# Pairwise correlations restricted to the trade-vs-book spread features.
corr = joined.loc[:, [
    'trade_bid_price_spread1',
    'trade_bid_price_spread2',
    'trade_ask_price_spread1',
    'trade_ask_price_spread2',
    'trade_bid_size_spread1',
    'trade_bid_size_spread2',
    'trade_ask_size_spread1',
    'trade_ask_size_spread2',
    'trade_bid_volume_spread1',
    'trade_bid_volume_spread2',
    'trade_ask_volume_spread1',
    'trade_ask_volume_spread2',
    'trade_bid_volume_spread',
    'trade_ask_volume_spread',
]].corr()

# Draw the matrix as a heatmap on an explicitly created 20x20 figure,
# labelling both axes with the column names.
fig, ax = plt.subplots(figsize=(20, 20))
tick_labels = corr.columns
sns.heatmap(corr, xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)

In [None]:

# Correlation matrix for three of the trade-vs-book spread features.
# No figure is created here: nothing is plotted in this cell, so the former
# plt.figure call only produced an empty 20x20 canvas in the output.
corr = joined[['trade_bid_price_spread1',
       'trade_ask_price_spread1',
       'trade_bid_size_spread1']].corr()
print(corr)

In [None]:
# Print the most recently computed correlation matrix as plain text.
print(corr)

In [None]:
# Show the trade-vs-book spread columns side by side.
joined.loc[:, [
    'trade_bid_price_spread1',
    'trade_bid_price_spread2',
    'trade_ask_price_spread1',
    'trade_ask_price_spread2',
    'trade_bid_size_spread1',
    'trade_bid_size_spread2',
    'trade_ask_size_spread1',
    'trade_ask_size_spread2',
    'trade_bid_volume_spread1',
    'trade_bid_volume_spread2',
    'trade_ask_volume_spread1',
    'trade_ask_volume_spread2',
    'trade_bid_volume_spread',
    'trade_ask_volume_spread',
]]

In [None]:
# Stock-by-stock training on the book features, the trade features and the
# trade-vs-book price spreads (the size and volume spreads are left out of
# this run).
feature_cols = ['bid_price1_mean', 'bid_size1_mean', 'ask_price1_mean',
       'ask_size1_mean', 'bid_price2_mean', 'bid_size2_mean',
       'ask_price2_mean', 'ask_size2_mean', 'bid_volume1_mean',
       'bid_volume2_mean', 'ask_volume1_mean', 'ask_volume2_mean',
       'bid_volume_mean', 'ask_volume_mean', 'wap1_calc_vol', 'wap2_calc_vol',
       'wap_total1_calc_vol', 'wap_total2_calc_vol', 'wap_balance_mean',
       'price_spread_mean', 'bid_spread_mean', 'ask_spread_mean',
       'total_size_mean', 'total_volume_mean', 'size_imbalance_mean',
       'size_imbalance_spread_mean', 'volume_imbalance_mean',
       'volume_imbalance_spread_mean', 'mean_sec_in_bucket', 'mean_price',
       'mean_size', 'mean_order', 'mean_size_all', 'mean_volume',
       'max_sec_in_bucket', 'max_price', 'max_size', 'max_order',
       'max_size_all', 'max_volume', 'min_sec_in_bucket', 'min_price',
       'min_size', 'min_order', 'min_size_all', 'min_volume',
       'median_sec_in_bucket', 'median_price', 'median_size', 'median_order',
       'median_size_all', 'median_volume', 'sum_size', 'sum_order',
       'sum_size_all', 'sum_volume', 'trade_bid_price_spread1',
       'trade_bid_price_spread2', 'trade_ask_price_spread1',
       'trade_ask_price_spread2']
# Take the fold count from the splitter instead of hard-coding it, so the
# scoring below cannot drift from the KFold configuration.
n_folds = kf.get_n_splits()

# Out-of-fold predictions are collected in a list and concatenated once.
# DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
fold_frames = []
for stock in joined["stock_id"].unique():
    stock_joined = joined[joined["stock_id"] == stock]
    X = stock_joined[feature_cols]
    y = stock_joined["target"]
    for fold, (trn_idx, val_idx) in enumerate(kf.split(X)):
        # create dataset
        X_train, X_valid = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_valid = y.iloc[trn_idx], y.iloc[val_idx]

        # Weighting each sample by 1/y^2 turns the squared-error objective
        # into a squared *percentage* error, which is what rmspe measures.
        model = LinearRegression()
        weights = 1/np.square(y_train)
        model.fit(X_train, y_train, sample_weight=weights)
        # validation
        y_pred = model.predict(X_valid)
        fold_frames.append(pd.DataFrame({
            "y_valid": y_valid,
            "y_pred": y_pred,
            "stock": stock,
            "fold": fold,
        }))
all_df = pd.concat(fold_frames)

# One RMSPE per fold, pooled over all stocks, then the mean across folds.
scores = []
for fold in range(n_folds):
    fold_score = all_df[all_df["fold"] == fold]
    RMSPE = round(float(rmspe(y_true = fold_score["y_valid"], y_pred = fold_score["y_pred"])), 3)
    scores.append(RMSPE)
print(scores)
print(sum(scores)/n_folds)

[0.494, 0.246, 0.244, 0.332, 0.265, 0.252, 0.251, 0.243, 0.252, 0.249]
0.2828


In [None]:
# Stock-by-stock training using all features, including the trade-vs-book
# size and volume spreads.
feature_cols = ['bid_price1_mean', 'bid_size1_mean', 'ask_price1_mean',
       'ask_size1_mean', 'bid_price2_mean', 'bid_size2_mean',
       'ask_price2_mean', 'ask_size2_mean', 'bid_volume1_mean',
       'bid_volume2_mean', 'ask_volume1_mean', 'ask_volume2_mean',
       'bid_volume_mean', 'ask_volume_mean', 'wap1_calc_vol', 'wap2_calc_vol',
       'wap_total1_calc_vol', 'wap_total2_calc_vol', 'wap_balance_mean',
       'price_spread_mean', 'bid_spread_mean', 'ask_spread_mean',
       'total_size_mean', 'total_volume_mean', 'size_imbalance_mean',
       'size_imbalance_spread_mean', 'volume_imbalance_mean',
       'volume_imbalance_spread_mean', 'mean_sec_in_bucket', 'mean_price',
       'mean_size', 'mean_order', 'mean_size_all', 'mean_volume',
       'max_sec_in_bucket', 'max_price', 'max_size', 'max_order',
       'max_size_all', 'max_volume', 'min_sec_in_bucket', 'min_price',
       'min_size', 'min_order', 'min_size_all', 'min_volume',
       'median_sec_in_bucket', 'median_price', 'median_size', 'median_order',
       'median_size_all', 'median_volume', 'sum_size', 'sum_order',
       'sum_size_all', 'sum_volume', 'trade_bid_price_spread1',
       'trade_bid_price_spread2', 'trade_ask_price_spread1',
       'trade_ask_price_spread2', 'trade_bid_size_spread1',
       'trade_bid_size_spread2', 'trade_ask_size_spread1',
       'trade_ask_size_spread2', 'trade_bid_volume_spread1',
       'trade_bid_volume_spread2', 'trade_ask_volume_spread1',
       'trade_ask_volume_spread2', 'trade_bid_volume_spread',
       'trade_ask_volume_spread', ]
# Take the fold count from the splitter instead of hard-coding it, so the
# scoring below cannot drift from the KFold configuration.
n_folds = kf.get_n_splits()

# Out-of-fold predictions are collected in a list and concatenated once.
# DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
fold_frames = []
for stock in joined["stock_id"].unique():
    stock_joined = joined[joined["stock_id"] == stock]
    X = stock_joined[feature_cols]
    y = stock_joined["target"]
    for fold, (trn_idx, val_idx) in enumerate(kf.split(X)):
        # create dataset
        X_train, X_valid = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_valid = y.iloc[trn_idx], y.iloc[val_idx]

        # Weighting each sample by 1/y^2 turns the squared-error objective
        # into a squared *percentage* error, which is what rmspe measures.
        model = LinearRegression()
        weights = 1/np.square(y_train)
        model.fit(X_train, y_train, sample_weight=weights)
        # validation
        y_pred = model.predict(X_valid)
        fold_frames.append(pd.DataFrame({
            "y_valid": y_valid,
            "y_pred": y_pred,
            "stock": stock,
            "fold": fold,
        }))
all_df = pd.concat(fold_frames)

# One RMSPE per fold, pooled over all stocks, then the mean across folds.
scores = []
for fold in range(n_folds):
    fold_score = all_df[all_df["fold"] == fold]
    RMSPE = round(float(rmspe(y_true = fold_score["y_valid"], y_pred = fold_score["y_pred"])), 3)
    scores.append(RMSPE)
print(scores)
print(sum(scores)/n_folds)

In [None]:
import sys
# Raise NumPy's summarization threshold to the largest possible value so that
# long arrays are printed in full rather than abbreviated.
np.set_printoptions(threshold=sys.maxsize)
# Column labels of X as left by the most recently executed training cell.
X.columns

In [None]:
# Stock-by-stock training on one feature at a time: each feature gets its own
# cross-validated RMSPE, so the features can be ranked individually.
candidate_features = ['bid_price1_mean', 'bid_size1_mean', 'ask_price1_mean',
       'ask_size1_mean', 'bid_price2_mean', 'bid_size2_mean',
       'ask_price2_mean', 'ask_size2_mean', 'bid_volume1_mean',
       'bid_volume2_mean', 'ask_volume1_mean', 'ask_volume2_mean',
       'bid_volume_mean', 'ask_volume_mean', 'wap1_calc_vol', 'wap2_calc_vol',
       'wap_total1_calc_vol', 'wap_total2_calc_vol', 'wap_balance_mean',
       'price_spread_mean', 'bid_spread_mean', 'ask_spread_mean',
       'total_size_mean', 'total_volume_mean', 'size_imbalance_mean',
       'size_imbalance_spread_mean', 'volume_imbalance_mean',
       'volume_imbalance_spread_mean', 'mean_sec_in_bucket', 'mean_price',
       'mean_size', 'mean_order', 'mean_size_all', 'mean_volume',
       'max_sec_in_bucket', 'max_price', 'max_size', 'max_order',
       'max_size_all', 'max_volume', 'min_sec_in_bucket', 'min_price',
       'min_size', 'min_order', 'min_size_all', 'min_volume',
       'median_sec_in_bucket', 'median_price', 'median_size', 'median_order',
       'median_size_all', 'median_volume', 'sum_size', 'sum_order',
       'sum_size_all', 'sum_volume', 'trade_bid_price_spread1',
       'trade_bid_price_spread2', 'trade_ask_price_spread1',
       'trade_ask_price_spread2', 'trade_bid_size_spread1',
       'trade_bid_size_spread2', 'trade_ask_size_spread1',
       'trade_ask_size_spread2', 'trade_bid_volume_spread1',
       'trade_bid_volume_spread2', 'trade_ask_volume_spread1',
       'trade_ask_volume_spread2', 'trade_bid_volume_spread',
       'trade_ask_volume_spread', ]

# Score every fold the splitter produces. The previous version looped over
# range(0, 5) and divided by 5 although kf yields 10 folds, so half of the
# out-of-fold predictions were silently ignored.
n_folds = kf.get_n_splits()

# The per-stock row subsets do not depend on the feature, so filter once.
stock_frames = {
    stock: joined[joined["stock_id"] == stock]
    for stock in joined["stock_id"].unique()
}

# Results are collected as records and turned into a frame once at the end.
# DataFrame.append was removed in pandas 2.0.
feature_scores = []
for feature in candidate_features:
    fold_frames = []
    for stock, stock_joined in stock_frames.items():
        X = stock_joined[[feature]]
        y = stock_joined["target"]
        for fold, (trn_idx, val_idx) in enumerate(kf.split(X)):
            # create dataset
            X_train, X_valid = X.iloc[trn_idx], X.iloc[val_idx]
            y_train, y_valid = y.iloc[trn_idx], y.iloc[val_idx]

            # Weighting each sample by 1/y^2 turns the squared-error objective
            # into a squared *percentage* error, which is what rmspe measures.
            model = LinearRegression()
            weights = 1/np.square(y_train)
            model.fit(X_train, y_train, sample_weight=weights)
            # validation
            y_pred = model.predict(X_valid)
            fold_frames.append(pd.DataFrame({
                "y_valid": y_valid,
                "y_pred": y_pred,
                "stock": stock,
                "fold": fold,
            }))
    all_df = pd.concat(fold_frames)

    # One RMSPE per fold, pooled over all stocks, then the mean across folds.
    scores = []
    for fold in range(n_folds):
        fold_score = all_df[all_df["fold"] == fold]
        RMSPE = round(float(rmspe(y_true = fold_score["y_valid"], y_pred = fold_score["y_pred"])), 3)
        scores.append(RMSPE)
    mean_score = sum(scores)/n_folds
    print(feature, mean_score)
    feature_scores.append({"feature": feature, "score": mean_score})
feature_df = pd.DataFrame(feature_scores)

In [None]:
# Rank the features by their single-feature cross-validated RMSPE, lowest (best) first.
feature_df.sort_values("score", ascending=True)

In [None]:
# Peek at the raw trade data of a single partition (stock_id=0).
pd.read_parquet("/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0")