Training Process <br>
Keras model: Bi-GRU + Bi-LSTM


In [None]:

import cupy as cp
import cudf
import cuml
import glob
import xgboost as xgb
from tqdm import tqdm

# Root directory of the competition input data; the CSV and parquet lookups
# in this notebook are all built from this prefix.
PATH = "/kaggle/input/optiver-realized-volatility-prediction"


def load_data(mode, path="/kaggle/input/optiver-realized-volatility-prediction"):
    """Read ``<path>/<mode>.csv`` into a cuDF DataFrame.

    mode: file stem of the CSV to load, e.g. "train" or "test".
    path: directory that contains the CSV file.
    """
    csv_path = '{}/{}.csv'.format(path, mode)
    return cudf.read_csv(csv_path)

# Load train.csv; the bare expressions below are notebook-style displays.
dev_df = load_data("train", path=PATH)
dev_df.head()
# The target is multiplied by SCALE for training; the final predictions are
# divided by SCALE again before they are written to the submission.
SCALE = 100
dev_df["target"] *= SCALE

# Distinct stock ids present in the training CSV.
stock_ids = dev_df["stock_id"].unique()
len(stock_ids)
# Order-book parquet files: every entry two levels below the dataset directory.
order_book_training = glob.glob(f'{PATH}/book_train.parquet/*/*')
order_book_test = glob.glob(f'{PATH}/book_test.parquet/*/*')

len(order_book_training), len(order_book_test)
# Trade parquet files, discovered with the same two-level pattern.
trades_training = glob.glob(f'{PATH}/trade_train.parquet/*/*')
trades_test = glob.glob(f'{PATH}/trade_test.parquet/*/*')

len(trades_training), len(trades_test)
%cd /kaggle/input/rapids-kaggle-utils/

import cu_utils.transform as cutran



def log_diff(df, in_col, null_val):
    """Return ``log(df[in_col])`` minus its shifted value within each time_id.

    Side effects on ``df`` (callers rely on them):
      * "logx"          - natural log of ``df[in_col]``
      * "logx_shifted"  - output of the grouped shift transform
      * "keep_row"      - False where "logx_shifted" equals ``null_val``

    The shift itself is delegated to ``cutran.get_cu_shift_transform``;
    presumably it shifts by one row inside each group and fills the gap with
    ``null_val`` - confirm against cu_utils.
    """
    df["logx"] = df[in_col].log()

    shift_kernel = cutran.get_cu_shift_transform(shift_by=1, null_val=null_val)
    per_time_id = df[["time_id", "logx"]].groupby("time_id")
    shifted = per_time_id.apply_grouped(
        shift_kernel,
        incols={"logx": 'x'},
        outcols=dict(y_out=cp.float32),
        tpb=32,
    )
    df["logx_shifted"] = shifted["y_out"]

    # Rows carrying the sentinel have no valid predecessor value.
    df["keep_row"] = df["logx_shifted"] != null_val
    return df["logx"] - df["logx_shifted"]



def extract_raw_book_features(df, null_val=-9999):
    """Add per-row order-book features to ``df`` and drop sentinel rows.

    For book levels 1 and 2 this adds "wap{n}", "log_return{n}" and
    "realized_vol{n}" (the squared log return), followed by spread, volume
    and imbalance columns and a constant column "c" equal to 1.

    ``df`` is modified in place by the column assignments; the returned frame
    contains only the rows whose "keep_row" flag (set by ``log_diff``) is True.
    """
    for level in (1, 2):
        bid_px, ask_px = df[f"bid_price{level}"], df[f"ask_price{level}"]
        bid_sz, ask_sz = df[f"bid_size{level}"], df[f"ask_size{level}"]

        # Each price is weighted by the size on the opposite side of the book.
        df[f"wap{level}"] = (bid_px*ask_sz + ask_px*bid_sz) / (bid_sz + ask_sz)
        log_ret = log_diff(df, in_col=f"wap{level}", null_val=null_val)
        df[f"log_return{level}"] = log_ret
        df[f"realized_vol{level}"] = df[f"log_return{level}"]**2

    ask_depth = df['ask_size1'] + df['ask_size2']
    bid_depth = df['bid_size1'] + df['bid_size2']
    mid_price = (df['ask_price1'] + df['bid_price1']) / 2

    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / mid_price
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df['total_volume'] = ask_depth + bid_depth
    df['volume_imbalance'] = abs(ask_depth - bid_depth)
    df["c"] = 1  # summed later to give a per-group row count

    return df[df["keep_row"]]


def extract_raw_trade_features(df, null_val=-9999):
    """Add "realized_vol_trade" (squared log return of "price") to ``df``.

    Returns only the rows whose "keep_row" flag, set by ``log_diff``, is True.
    """
    price_log_return = log_diff(df, in_col="price", null_val=null_val)
    df["realized_vol_trade"] = price_log_return**2
    return df[df["keep_row"]]


def agg(df, feature_dict):
    """Aggregate ``df`` per "time_id" and flatten the resulting column names.

    feature_dict maps a column name to a list of aggregation names. The
    two-level column labels produced by the aggregation are joined as
    "<column>_<stat>"; labels whose second level is empty (the group key after
    ``reset_index``) keep just the first level.
    """
    grouped = df.groupby("time_id").agg(feature_dict).reset_index()
    grouped.columns = [
        label[0] if label[1] == "" else label[0] + "_" + label[1]
        for label in grouped.columns
    ]
    return grouped


def extract_book_stats(df):
    """Aggregate the order-book features of ``df`` per time_id via ``agg``.

    Most features get sum/mean/std; "c" and the realized-volatility columns
    are only summed.
    """
    summary_stats = ["sum", "mean", "std"]
    summarized_columns = (
        'wap1',
        'wap2',
        'log_return1',
        'log_return2',
        'wap_balance',
        'price_spread',
        'bid_spread',
        'ask_spread',
        'total_volume',
        'volume_imbalance',
    )
    sum_only_columns = ('c', 'realized_vol1', 'realized_vol2')

    # Insertion order determines the column order of the aggregated frame.
    feature_dict = {column: summary_stats for column in summarized_columns}
    for column in sum_only_columns:
        feature_dict[column] = ["sum"]

    return agg(df, feature_dict)
    

    
    
def extract_trade_stats(df):
    """Aggregate the trade features of ``df`` per time_id via ``agg``."""
    stat_per_column = (
        ('realized_vol_trade', "sum"),
        ('seconds_in_bucket', "count"),
        ('size', "sum"),
        ('order_count', "mean"),
    )
    return agg(df, {column: [stat] for column, stat in stat_per_column})


def time_constraint_fe(df, stats_df, last_sec, fe_function, cols):
    """Left-merge stats computed on a late window of ``df`` onto ``stats_df``.

    df:          row-level frame with a "seconds_in_bucket" column.
    stats_df:    existing per-time_id stats to extend.
    last_sec:    rows with ``seconds_in_bucket >= 600 - last_sec`` are kept
                 (600 is presumably the bucket length in seconds - confirm).
    fe_function: callable turning the filtered rows into per-time_id stats.
    cols:        column names for the empty placeholder frame used when the
                 filter leaves no rows.

    Columns that clash with ``stats_df`` get the suffix ``_<last_sec>``.
    """
    threshold = 600 - last_sec
    window_df = df[df["seconds_in_bucket"] >= threshold].reset_index(drop=True)

    if window_df.shape[0] == 0:
        window_stats = cudf.DataFrame(columns=cols)
    else:
        window_stats = fe_function(window_df)

    return stats_df.merge(
        window_stats,
        on="time_id",
        how="left",
        suffixes=('', f'_{last_sec}'),
    )
    

def feature_engineering(book_path, trade_path):
    """Build the per-time_id feature frame for one book/trade parquet pair.

    Full-bucket book and trade stats are computed first, then the same stats
    restricted to the last 150, 300 and 450 seconds are merged on with a
    matching suffix. Trade stats are finally left-merged onto the book stats
    on "time_id".
    """
    book_df = extract_raw_book_features(cudf.read_parquet(book_path))
    book_stats = extract_book_stats(book_df)
    book_cols = book_stats.columns

    trade_df = extract_raw_trade_features(cudf.read_parquet(trade_path))
    trade_stats = extract_trade_stats(trade_df)
    trade_cols = trade_stats.columns

    for window in (150, 300, 450):
        book_stats = time_constraint_fe(
            book_df, book_stats, window, extract_book_stats, book_cols)
        trade_stats = time_constraint_fe(
            trade_df, trade_stats, window, extract_trade_stats, trade_cols)

    return book_stats.merge(trade_stats, on="time_id", how="left")


def _stock_id_from_path(path):
    """Parse the integer stock id out of a ``.../stock_id=<N>/<file>`` path."""
    return int(path.split("=")[1].split("/")[0])


def _group_paths_by_stock(paths):
    """Map stock id -> sorted list of the paths belonging to that stock."""
    grouped = {}
    for path in sorted(paths):
        grouped.setdefault(_stock_id_from_path(path), []).append(path)
    return grouped


def process_data(order_book_paths, trade_paths, stock_ids):
    """Run ``feature_engineering`` for every stock and concatenate the results.

    Book and trade files are paired by the stock id parsed from their paths
    rather than by list position: the two lists come from independent
    ``glob.glob`` calls, whose ordering is not guaranteed, so positional
    pairing could silently combine the book of one stock with the trades of
    another. Within a stock, files are paired in sorted path order. Book files
    without a matching trade file are skipped (positional ``zip`` likewise
    dropped unpaired entries).

    Args:
        order_book_paths: order-book parquet paths containing ``stock_id=<N>``.
        trade_paths: trade parquet paths containing ``stock_id=<N>``.
        stock_ids: unused; kept so existing callers keep working.

    Returns:
        One cuDF DataFrame with a "stock_id" column added, ordered by stock id.
    """
    books_by_stock = _group_paths_by_stock(order_book_paths)
    trades_by_stock = _group_paths_by_stock(trade_paths)

    jobs = [
        (stock_id, book_path, trade_path)
        for stock_id in sorted(books_by_stock)
        for book_path, trade_path in zip(books_by_stock[stock_id],
                                         trades_by_stock.get(stock_id, []))
    ]

    stock_dfs = []
    for stock_id, book_path, trade_path in tqdm(jobs):
        df = feature_engineering(book_path, trade_path)
        df["stock_id"] = stock_id
        stock_dfs.append(df)
    return cudf.concat(stock_dfs)

# Build the engineered per-(stock_id, time_id) feature frames for the
# training and test parquet files.
past_volatility = process_data(order_book_training, trades_training, stock_ids)
past_test_volatility = process_data(order_book_test, trades_test, stock_ids)

# Notebook-style display of the resulting shapes.
past_volatility.shape, past_test_volatility.shape

def stock_time_fe(df):
    """Append cross-sectional aggregates of the realized-volatility sums.

    For each grouping key ("stock_id", then "time_id") and each statistic
    (mean, max, std, min, var) the twelve realized-volatility sum columns are
    aggregated per group and left-merged back onto ``df`` under the name
    "<key>_<stat>_<column>". Returns the widened frame.
    """
    base_columns = ('realized_vol1_sum', 'realized_vol2_sum', 'realized_vol_trade_sum')
    window_suffixes = ('', '_150', '_300', '_450')
    cols = [base + suffix for suffix in window_suffixes for base in base_columns]

    group_keys = ("stock_id", "time_id")
    statistics = ("mean", "max", "std", "min", "var")
    plan = [(key, stat) for key in group_keys for stat in statistics]

    for key, stat in plan:
        group_stats = df.groupby(key)[cols].agg(stat)
        group_stats.columns = [f"{key}_{stat}_{col}" for col in group_stats.columns]
        df = df.merge(group_stats.reset_index(), on=key, how="left")

    return df

# Tag the rows so train and test can be separated again after the joint
# aggregation below.
past_volatility["is_test"] = False
past_test_volatility["is_test"] = True
# NOTE(review): DataFrame.append is deprecated in recent pandas; confirm the
# installed cuDF version still provides it.
all_df = past_volatility.append(past_test_volatility).reset_index(drop=True)

# The stock-level and time-level aggregates are computed over train and test
# rows together.
all_df = stock_time_fe(all_df)

past_volatility = all_df[~all_df["is_test"]]
past_test_volatility = all_df[all_df["is_test"]]

# Attach the engineered features to the rows of train.csv.
dev_df = dev_df.merge(past_volatility, on=["stock_id", "time_id"], how="left")

# Model inputs: every column except the identifiers, the label and the
# train/test flag.
features = [col for col in list(dev_df.columns)
            if col not in {"stock_id", "time_id", "target", "is_test"}]
len(features)



def rmspe(y_true, y_pred):
    """Root mean squared percentage error, computed with CuPy.

    The error of each element is taken relative to ``y_true``.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = cp.mean(cp.square(relative_error))
    return cp.sqrt(mean_squared)


def rmspe_xgb(pred, dtrain):
    """Return ``('rmspe', score)`` for predictions against ``dtrain``'s labels.

    Labels are read with ``dtrain.get_label()``; both arrays are converted to
    CuPy arrays before scoring with ``rmspe``.
    """
    labels = cp.array(dtrain.get_label())
    score = rmspe(labels, cp.array(pred))
    return 'rmspe', score


# Number of cross-validation folds; rows are assigned to folds by
# time_id modulo NUM_FOLDS.
NUM_FOLDS = 5
# XGBoost-style parameter dictionary. It is not referenced anywhere else in
# the visible part of this notebook, which trains a Keras model instead.
param = {'objective': 'reg:squarederror',
         'learning_rate': 0.1,
         'max_depth': 3,
         "min_child_weight": 200,
         "reg_alpha": 10.0,
         "tree_method": 'gpu_hist', "gpu_id": 0,
         'disable_default_eval_metric': 1
    }

# Name of the label column in dev_df.
target = "target"




In [None]:
%cd /kaggle/working

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout, Reshape, Flatten
from keras.layers import Bidirectional, LSTM, Conv1D,GlobalAveragePooling1D,GlobalMaxPooling1D,Dropout,MaxPooling1D, GRU
from keras.callbacks import ModelCheckpoint
from keras import backend as K
from keras.models import load_model
import numpy as np
import tensorflow as tf
import gc
def root_mean_squared_error(y_true, y_pred):
    """Keras loss: square root of the mean squared error over the last axis."""
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error, axis=-1))

def lstm_model(train, validation, test, model_num, nfolds=5):
    """Train a Bi-GRU + Bi-LSTM regressor on one fold and predict ``test``.

    Array shapes are derived from the inputs instead of being hard-coded, so
    the function works for any fold size and any number of features.

    Args:
        train: ``(features, targets)``; features is a 2-D array
            ``(rows, n_features)``, targets has one value per row.
        validation: ``(features, targets)`` with the same layout; used as
            Keras ``validation_data`` and for checkpoint selection.
        test: 2-D feature array ``(rows, n_features)`` to predict.
        model_num: suffix of the checkpoint file ``./best_model<model_num>.h5``.
        nfolds: number of folds the caller sums over; the predictions are
            divided by it so that the sum over folds is an average.

    Returns:
        ``regressor.predict(test) / nfolds`` for the checkpointed model.
    """
    def to_sequences(features):
        # One timestep per sample: (rows, n_features) -> (rows, 1, n_features),
        # copied to host memory for Keras.
        features = cp.asarray(features)
        rows, n_features = features.shape[0], features.shape[1]
        return cp.asnumpy(cp.reshape(features, (rows, 1, n_features)))

    def to_column(values):
        # Targets as a host-side (rows, 1) column.
        return cp.asnumpy(cp.reshape(cp.asarray(values), (-1, 1)))

    gc.collect()
    X_train, y_train = to_sequences(train[0]), to_column(train[1])
    gc.collect()
    X_val, y_val = to_sequences(validation[0]), to_column(validation[1])
    gc.collect()
    test = to_sequences(test)
    gc.collect()

    # Initialising the RNN
    regressor = Sequential()

    # Adding the recurrent layers and the single-unit output layer
    regressor.add(Bidirectional(GRU(units=150, activation='relu', return_sequences=True)))
    regressor.add(Bidirectional(LSTM(units=30, activation='relu')))
    regressor.add(Dense(units=1, kernel_initializer='normal'))

    # Compiling the RNN
    regressor.compile(loss=root_mean_squared_error)

    checkpoint_path = './best_model' + str(model_num) + '.h5'
    mc = ModelCheckpoint(checkpoint_path, monitor='val_loss', mode='min',
                         verbose=1, save_best_only=True)

    # Fitting the RNN to the training set
    regressor.fit(X_train, y_train, batch_size=100, epochs=1, verbose=1,
                  validation_data=(X_val, y_val), callbacks=[mc])

    # Predict with the checkpointed weights rather than the in-memory model.
    regressor = load_model(
        checkpoint_path,
        custom_objects={'root_mean_squared_error': root_mean_squared_error})
    return regressor.predict(test) / nfolds

In [None]:
import gc
oof_preds = cp.zeros(dev_df.shape[0])
test_preds = np.zeros(past_test_volatility.shape[0])
seed = 42
test_pred = []

gc.collect()
# Missing test features are replaced by 0.0 once, before the fold loop.
past_test_volatility = past_test_volatility.fillna(0.0)

for fold in range(NUM_FOLDS):
    print("Fold", fold)
    gc.collect()

    # Folds are defined by time_id modulo NUM_FOLDS.
    fold_of_row = dev_df["time_id"].values % NUM_FOLDS
    train_ind = cp.where(fold_of_row != fold)[0]
    val_ind = cp.where(fold_of_row == fold)[0]
    gc.collect()

    train_df = dev_df.iloc[train_ind].fillna(0.0)
    val_df = dev_df.iloc[val_ind].fillna(0.0)
    print(past_test_volatility[features].shape)
    print(train_df[features].shape)

    test_result = lstm_model(
        [train_df[features].values, train_df[target].values],
        [val_df[features].values, val_df[target].values],
        past_test_volatility[features].values,
        fold,
        nfolds=NUM_FOLDS,
    )
    # The model output has a trailing axis of length 1; flatten it so it can
    # be accumulated into the 1-D test_preds buffer.
    test_preds += np.asarray(test_result).reshape(-1)

# dev_df["pred"] = oof_preds
# print(f'The RMSPE score of XGB is {rmspe(dev_df["target"], dev_df["pred"])}')
past_test_volatility["row_id"] = (past_test_volatility["stock_id"].astype(str)
                                  + "-"
                                  + past_test_volatility["time_id"].astype(str))
# Undo the SCALE factor that was applied to the training target.
past_test_volatility["target"] = test_preds.clip(0.0, 100.0)/SCALE
%cd /kaggle/working
# Join the predictions onto test.csv by row_id; rows without a prediction
# get a target of 0.0 via fillna.
sub_df = load_data("test", path=PATH).merge(past_test_volatility[["row_id", "target"]], 
                                            on="row_id", how="left").fillna(0.0)

# Only row_id and target are written to the submission file.
sub_df.to_csv("submission.csv", index=False, columns=["row_id", "target"])