# Changelog
* v.6
    * No feature engineering. `book_seconds_lapse=5` & `trade_seconds_lapse=5` (data: `prep-opt-v6`)
    * Simple NN
* v.7
    * Add new features: `stock_id`, multiple `book` and `trade` features. `book_seconds_lapse=10` & `trade_seconds_lapse=15` (data: `prep-opt-v5`)
    * Add parallel processing
    * Debugged prediction
* v.8
    * v.6 but NN w/ embedding layers (data: `prep-opt-v6`)
* v.9
    * v.7 but `book_seconds_lapse=5` (try to fit in the memory) (data: `prep-opt-v7`)
* v.10
    * v.7 but MinMaxScaler instead of StandardScaler (data: prep-opt-v5)
* v.11
    * v.9 but `trade_seconds_lapse=20` (data: `prep-opt-v8`), since v.9 can't fit in memory when training
* v.12
    * v.10 but 
    * Start trying out GB models: XGBoost, LGBM, CatBoost
* v.13
    * v.10 but
    * Ensemble LGBM + NN
    * Added progress bar for parallel processing
* v.14
    * v.13 (MinMaxScaler, `book_seconds=10`, `trade_seconds=15`)
    * Checked the situation of stock 31
    * Added a bunch of features: `is_stock_31`, `trading_halted`, multiple book and trade features (data: `prep-opt-v9`)
* v.15
    * v.14, removed `is_stock_31` & `trading_halted`, useless features (data: `prep-opt-v10`)
    * NN `prep-opt-v5` + LGBM `prep-opt-v10`
    * Predict in batches
    
# (Relative) comparison of datasets:
* Memory: `prep-opt-v7` > `prep-opt-v8` > `prep-opt-v9` > `prep-opt-v10` > `prep-opt-v5` > `prep-opt-v6`
* Feature creativeness: `prep-opt-v9` = `prep-opt-v10` > `prep-opt-v7` > `prep-opt-v8` >  `prep-opt-v5` > `prep-opt-v6`

In [None]:
import os
import glob
import gc
import time
import warnings
import math

import pandas as pd
import numpy as np
import matplotlib as plt
from math import floor
from joblib import Parallel, delayed
from tqdm.auto import tqdm

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import (
    Input, layers, activations, initializers, optimizers, models, regularizers, callbacks
)
import lightgbm as lgb

# Util functions & Global variables

In [None]:
def reduce_memory_usage(df):
    """Downcast numeric columns of ``df`` in place to smaller dtypes.

    Columns whose dtype is int16/int32/int64 are cast to the narrowest of
    int8/int16/int32 whose range contains both the column minimum and the
    column maximum (bounds inclusive). float64 columns are cast to float32.
    Every other column is left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        The same DataFrame object.
    """
    integers = ('int16', 'int32', 'int64')
    for col in df.columns:
        col_type = df[col].dtype
        if col_type in integers:
            c_min = df[col].min()
            c_max = df[col].max()
            # Try the narrowest type first. The bounds are inclusive so that a
            # column touching a limit (e.g. 127 or -128) still fits the
            # narrower type instead of being pushed one size up.
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type == 'float64':
            df[col] = df[col].astype(np.float32)

    return df

In [None]:
def convert_long_to_wide_format(df):
    """Pivot a long book/trade frame into one row per ``time_id``.

    Every value column is spread over the distinct ``seconds_in_bucket``
    values and the resulting two-level column labels are flattened to
    ``<column>_<second>``. ``time_id`` comes back as an ordinary column.
    The caller's frame is not modified.
    """
    wide = df.pivot(index='time_id', columns='seconds_in_bucket')
    value_names = wide.columns.levels[0]
    second_names = wide.columns.levels[1].astype('str')
    flat_labels = pd.MultiIndex.from_product([value_names, second_names])
    wide.columns = flat_labels.map('_'.join)
    return wide.reset_index()

In [None]:
# Root directory of the competition data; the book/trade parquet paths and the
# train.csv path are all built from it.
ROOT_PATH = '/kaggle/input/optiver-realized-volatility-prediction/'
# Training targets; filtered by stock_id and joined on (stock_id, time_id) when
# the 'train' partition of a stock is assembled.
train_target = pd.read_csv(ROOT_PATH + 'train.csv')
# Smallest training target; predictions are clipped from below at half of it.
MIN_TARGET = train_target['target'].min()

# Light GBM

In [None]:
def add_book_features(df):
    """Add per-row order-book features to ``df`` in place and return it.

    Added columns:
        wap1, wap2: level-1 / level-2 price where each side's price is
            weighted by the opposite side's size.
        log_ret1, log_ret2: difference of log(wap) between consecutive rows
            of the same ``time_id``; 0 for the first row of each ``time_id``.
        bid_ask_spread, price_spread, bid_spread, ask_spread, wap_spread:
            relative price gaps.
        bid_ask_depth_ratio, bid_ask_depth_ratio1, bid_ask_depth_ratio2,
        total_depth: size ratios and the total size over both levels.

    Args:
        df: book rows with ``time_id``, ``bid_price{1,2}``, ``ask_price{1,2}``,
            ``bid_size{1,2}`` and ``ask_size{1,2}`` columns.

    Returns:
        The same DataFrame object with the new columns.
    """
    df['wap1'] = ((df['bid_price1'] * df['ask_size1'] +
                   df['ask_price1'] * df['bid_size1']) /
                  (df['bid_size1'] + df['ask_size1']))
    df['wap2'] = ((df['bid_price2'] * df['ask_size2'] +
                   df['ask_price2'] * df['bid_size2']) /
                  (df['bid_size2'] + df['ask_size2']))

    # A grouped ``diff`` keeps the frame's own index, so the assignment aligns
    # row by row on every pandas version. ``groupby.apply`` with a
    # Series-returning function prepends the group key to the index on
    # pandas >= 2.0, which breaks the column assignment.
    time_ids = df['time_id']
    df['log_ret1'] = np.log(df['wap1']).groupby(time_ids).diff().fillna(0)
    df['log_ret2'] = np.log(df['wap2']).groupby(time_ids).diff().fillna(0)

    df['bid_ask_spread'] = df['ask_price1'] / df['bid_price1'] - 1
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['bid_spread'] = df['bid_price1'] / df['bid_price2'] - 1
    df['ask_spread'] = df['ask_price2'] / df['ask_price1'] - 1
    df['wap_spread'] = df['wap2'] / df['wap1'] - 1
    df['bid_ask_depth_ratio'] = (df['bid_size1'] + df['bid_size2']) / (df['ask_size1'] + df['ask_size2'])
    df['bid_ask_depth_ratio1'] = df['bid_size1'] / df['ask_size1']
    df['bid_ask_depth_ratio2'] = df['bid_size2'] / df['ask_size2']
    df['total_depth'] = df['bid_size1'] + df['bid_size2'] + df['ask_size1'] + df['ask_size2']

    return df

def add_agg_book_features(df):
    """Summarise the per-row book features of each ``time_id``.

    Returns a new DataFrame indexed by ``time_id`` holding, in this order:
    ``sigma1``/``sigma2`` (square root of the summed squared log returns),
    ``book_update_times`` (row count), ``<col>_update_times`` (number of
    distinct values) for the eight raw price/size columns, mean/std/min/max
    of every raw and derived column, and mean/min/max of the two log returns.
    The reducers are NumPy's ``mean``/``std``/``min``/``max`` applied per
    group, so ``std`` is called with ``np.std``'s own defaults.
    """
    def realized_volatility(log_returns):
        return np.sqrt(np.sum(log_returns**2))

    raw_cols = ['bid_size1', 'ask_size1', 'bid_size2', 'ask_size2',
                'bid_price1', 'ask_price1', 'bid_price2', 'ask_price2']
    derived_cols = ['wap1', 'wap2', 'bid_ask_spread', 'price_spread',
                    'bid_spread', 'ask_spread', 'wap_spread', 'bid_ask_depth_ratio',
                    'bid_ask_depth_ratio1', 'bid_ask_depth_ratio2', 'total_depth']

    # Group once and reuse the grouping for every feature.
    by_time = df.groupby('time_id')

    df_agg = pd.DataFrame()
    df_agg['sigma1'] = by_time['log_ret1'].apply(realized_volatility)
    df_agg['sigma2'] = by_time['log_ret2'].apply(realized_volatility)
    df_agg['book_update_times'] = by_time['seconds_in_bucket'].count()

    for col in raw_cols:
        distinct_values = by_time[col].unique()
        df_agg[col + '_update_times'] = distinct_values.apply(lambda x: x.shape[0])

    for col in raw_cols + derived_cols:
        grouped_col = by_time[col]
        df_agg['mean_' + col] = grouped_col.apply(np.mean)
        df_agg['std_' + col] = grouped_col.apply(np.std)
        df_agg['min_' + col] = grouped_col.apply(np.min)
        df_agg['max_' + col] = grouped_col.apply(np.max)

    for col in ('log_ret1', 'log_ret2'):
        grouped_col = by_time[col]
        df_agg['mean_' + col] = grouped_col.apply(np.mean)
        df_agg['min_' + col] = grouped_col.apply(np.min)
        df_agg['max_' + col] = grouped_col.apply(np.max)

    return df_agg

def fetch_book_one_stock(stock_id, partition='train', seconds_lapse=10):
    """Fetch book data of one stock into a wide DataFrame (one row per time_id).

    Reads the stock's book parquet partition, adds row-level and aggregated
    features, samples the book every ``seconds_lapse`` seconds over seconds
    0..599 (carrying the last known row forward), pivots to wide format and
    attaches the per-``time_id`` aggregates.

    Args:
        stock_id: id used in the ``stock_id=<id>`` parquet sub-directory name.
        partition: ``'train'`` or ``'test'``; selects ``book_<partition>.parquet``.
        seconds_lapse: sampling step, in seconds, of the wide book columns.

    Returns:
        DataFrame with ``time_id``, the wide sampled columns, the aggregated
        features and a ``stock_id`` column.

    Raises:
        AssertionError: if any value is still missing after filling.
    """
    path = f'{ROOT_PATH}book_{partition}.parquet/stock_id={str(stock_id)}'
    df = pd.read_parquet(path)
    df = add_book_features(df)
    df_agg = add_agg_book_features(df)

    # add missing seconds to book data
    df = df.set_index(['time_id', 'seconds_in_bucket'])
    multi_index = pd.MultiIndex.from_product([df.index.levels[0],
                                              np.arange(0, 600, seconds_lapse)],
                                             names=['time_id', 'seconds_in_bucket'])

    # forward fill book on missing seconds (missing = order book unchanged);
    # bfill afterwards in case second 0 is missing. ``.bfill()`` replaces the
    # deprecated ``fillna(method='bfill')`` spelling with identical results.
    df = df.reindex(multi_index, method='ffill').bfill()
    df.reset_index(inplace=True)

    df = convert_long_to_wide_format(df)
    df = df.merge(df_agg, on='time_id', how='left')
    df['stock_id'] = stock_id  # add stock_id as primary key for later merging
    assert not df.isna().any().any()

    return df

In [None]:
def add_trade_features(df):
    """Add per-row trade features to ``df`` in place and return it.

    Added columns:
        real_log_ret: difference of log(price) between consecutive rows of
            the same ``time_id``; 0 for the first row of each ``time_id``.
        volume: ``price * size``.
        volume_per_order: ``volume / order_count``, with missing results
            (e.g. 0 / 0 when nothing traded) replaced by 0.
        volume_last_seconds, whale_last_seconds: ``volume`` and
            ``volume_per_order`` scaled by ``seconds_in_bucket / 600``, so
            later rows weigh more.

    Args:
        df: trade rows with ``time_id``, ``seconds_in_bucket``, ``price``,
            ``size`` and ``order_count`` columns.

    Returns:
        The same DataFrame object with the new columns.
    """
    # A grouped ``diff`` keeps the frame's own index, so the assignment aligns
    # row by row on every pandas version (``groupby.apply`` with a
    # Series-returning function changes the index on pandas >= 2.0).
    df['real_log_ret'] = np.log(df['price']).groupby(df['time_id']).diff().fillna(0)
    df['volume'] = df['price'] * df['size']
    df['volume_per_order'] = (df['volume'] / df['order_count']).fillna(0)
    df['volume_last_seconds'] = df['volume'] * df['seconds_in_bucket'] / 600
    df['whale_last_seconds'] = df['volume_per_order'] * df['seconds_in_bucket'] / 600

    return df

def add_agg_trade_features(df, df_agg):
    """Append per-``time_id`` trade summaries to ``df_agg`` and return it.

    Adds ``real_sigma`` (square root of the summed squared ``real_log_ret``)
    followed by mean/std/min/max of each trade column. ``df_agg`` is modified
    in place; new values are aligned on its ``time_id`` index.
    """
    trade_cols = ['price', 'size', 'order_count', 'volume',
                  'volume_per_order', 'volume_last_seconds', 'whale_last_seconds']

    # Group once and reuse the grouping for every feature.
    by_time = df.groupby('time_id')
    df_agg['real_sigma'] = by_time['real_log_ret'].apply(lambda x: np.sqrt(np.sum(x**2)))

    for col in trade_cols:
        grouped_col = by_time[col]
        df_agg['mean_' + col] = grouped_col.apply(np.mean)
        df_agg['std_' + col] = grouped_col.apply(np.std)
        df_agg['min_' + col] = grouped_col.apply(np.min)
        df_agg['max_' + col] = grouped_col.apply(np.max)

    return df_agg
    
def fetch_trade_one_stock(stock_id, partition='train', seconds_lapse=15):
    """Fetch trade data of one stock into a wide DataFrame (one row per time_id).

    Reads the stock's trade parquet partition, expands every ``time_id`` to
    seconds 0..599, re-buckets the seconds into windows of ``seconds_lapse``
    (size-weighted average price, summed size and order count per window),
    adds row-level and aggregated features, and pivots to wide format.

    Args:
        stock_id: id used in the ``stock_id=<id>`` parquet sub-directory name.
        partition: ``'train'`` or ``'test'``; selects ``trade_<partition>.parquet``.
        seconds_lapse: width, in seconds, of each re-bucketed window.

    Returns:
        DataFrame with ``time_id``, the wide per-window columns, the
        aggregated features and a ``stock_id`` column.

    Raises:
        AssertionError: if any value is still missing after filling.
    """
    path = f'{ROOT_PATH}trade_{partition}.parquet/stock_id={str(stock_id)}'
    df = pd.read_parquet(path)
    df_agg = pd.DataFrame()
    df_agg['trade_times'] = df.groupby('time_id')['seconds_in_bucket'].unique().apply(lambda x: x.shape[0])

    # add ALL missing seconds (0, 1, 2, ..., 599) to trade data
    df = df.set_index(['time_id', 'seconds_in_bucket'])
    multi_index = pd.MultiIndex.from_product([df.index.levels[0],
                                              np.arange(0, 600)],
                                             names=['time_id', 'seconds_in_bucket'])
    df = df.reindex(multi_index)
    df.reset_index(inplace=True)
    # forward fill price in missing seconds (missing = no trade happens);
    # bfill in case of missing second 0.
    # NOTE(review): this fill (and the one after re-bucketing) runs over the
    # whole column, so leading gaps of a time_id take the previous time_id's
    # last price. Left as is to keep features identical to the precomputed
    # training data -- confirm whether a per-time_id fill is wanted.
    df['price'] = df['price'].ffill().bfill()
    # fill size and order_count with 0
    df['size'] = df['size'].fillna(value=0).astype(int)
    df['order_count'] = df['order_count'].fillna(value=0).astype(int)

    # build another bucket of seconds (1 bucket = seconds_lapse)
    # e.g. seconds_lapse = 5; then seconds 0, 1, ..., 4 fall into bucket 5; seconds 5 - 9 fall into bucket 10; etc.
    # Vectorised floor division gives the same labels as a per-row floor().
    df['seconds_in_bucket'] = (df['seconds_in_bucket'] // seconds_lapse + 1) * seconds_lapse
    # now compute the average price traded in each bucket of seconds again
    # & total number shares traded + order_count
    df['price'] = df['price'] * df['size']
    df = df.groupby(['time_id', 'seconds_in_bucket']).sum()
    df['price'] = df['price'] / df['size']
    # fill missing price due to no trades happened in an entire bucket (0 / 0)
    df['price'] = df['price'].ffill().bfill()
    df.reset_index(inplace=True)

    df = add_trade_features(df)
    df_agg = add_agg_trade_features(df, df_agg)

    df = convert_long_to_wide_format(df)
    df = df.merge(df_agg, on='time_id', how='left')
    df['stock_id'] = stock_id  # add stock_id as primary key for later merging

    assert not df.isna().any().any()

    return df

In [None]:
def fetch_data_one_stock(stock_id, partition='train', book_seconds_lapse=10, trade_seconds_lapse=15):
    """Build the full feature table of one stock.

    Book and trade features are joined on (stock_id, time_id) with the book
    rows as the left side; time_ids without trade data get 0 in every trade
    column.

    Args:
        stock_id: stock to load.
        partition: ``'train'`` or ``'test'``.
        book_seconds_lapse: sampling step passed to ``fetch_book_one_stock``.
        trade_seconds_lapse: window width passed to ``fetch_trade_one_stock``.

    Returns:
        For ``'train'``: the stock's rows of ``train_target`` (target cast to
        float32) inner-joined with the features. For ``'test'``: the features
        with ``stock_id`` moved to the first column. Both are passed through
        ``reduce_memory_usage``.

    Raises:
        ValueError: if ``partition`` is neither ``'train'`` nor ``'test'``.
    """
    # Fail before any file is read instead of silently returning None.
    if partition not in ('train', 'test'):
        raise ValueError(f"partition must be 'train' or 'test', got {partition!r}")

    pd.options.mode.chained_assignment = None
    book_df = fetch_book_one_stock(stock_id, partition, book_seconds_lapse)
    trade_df = fetch_trade_one_stock(stock_id, partition, trade_seconds_lapse)
    # merge order books and trade histories into 1 DataFrame
    features_df = book_df.merge(trade_df, on=['stock_id', 'time_id'], how='left')
    del book_df, trade_df
    gc.collect()
    features_df.fillna(0, inplace=True)

    if partition == 'train':
        # Copy the slice so the cast below never writes into ``train_target``.
        target_df = train_target[train_target['stock_id'] == stock_id].copy()
        target_df['target'] = target_df['target'].astype('float32')
        full_df = target_df.merge(features_df, on=['stock_id', 'time_id'], how='inner')
        del features_df, target_df
        gc.collect()
        return reduce_memory_usage(full_df)

    # partition == 'test': put stock_id first, keep every other column in order
    cols = list(features_df.columns)
    cols.insert(0, cols.pop(cols.index('stock_id')))
    features_df = features_df.loc[:, cols]
    return reduce_memory_usage(features_df)

In [None]:
class ProgressParallel(Parallel):
    """``Parallel`` subclass that mirrors task progress onto a tqdm bar.

    Args:
        use_tqdm: when False the bar is created disabled.
        total: expected number of tasks; when None the bar's total follows
            the number of dispatched tasks.
        *args, **kwargs: forwarded unchanged to ``Parallel``.
    """

    def __init__(self, use_tqdm=True, total=None, *args, **kwargs):
        self._use_tqdm = use_tqdm
        self._total = total
        super().__init__(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        # The bar lives exactly as long as the parallel run.
        progress_bar = tqdm(disable=not self._use_tqdm, total=self._total)
        with progress_bar as self._pbar:
            return super().__call__(*args, **kwargs)

    def print_progress(self):
        # Presumably invoked by joblib as tasks complete (it overrides the
        # parent's method of the same name) -- TODO confirm.
        bar = self._pbar
        if self._total is None:
            bar.total = self.n_dispatched_tasks
        bar.n = self.n_completed_tasks
        bar.refresh()

        
def fetch_data_all_stocks(stock_ids, partition="train", book_seconds_lapse=10, trade_seconds_lapse=15):
    """Build the feature tables of several stocks in parallel and stack them.

    One ``fetch_data_one_stock`` job is dispatched per stock id on all cores
    (``n_jobs=-1``) with a progress bar. The per-stock frames are concatenated
    with a fresh index, checked for missing values and downcast.

    Raises:
        AssertionError: if the concatenated table contains a missing value.
    """
    runner = ProgressParallel(n_jobs=-1, verbose=10, total=len(stock_ids))
    jobs = (
        delayed(fetch_data_one_stock)(stock_id, partition, book_seconds_lapse, trade_seconds_lapse)
        for stock_id in stock_ids
    )
    per_stock_frames = runner(jobs)
    combined = pd.concat(per_stock_frames, ignore_index=True)
    # Drop the per-stock frames before collecting, as the original rebinding did.
    del per_stock_frames
    gc.collect()

    assert not combined.isna().any().any()
    return reduce_memory_usage(combined)

In [None]:
def predict_one_stock(stock_id, model, scaler, model_type):
    """Predict the targets of one test stock.

    Args:
        stock_id: stock whose test partition is loaded.
        model: fitted model; called as ``model.predict(X)`` for ``'lgbm'`` and
            ``model.predict((X_cat, X_num))`` for ``'nn'``.
        scaler: fitted scaler applied to the numeric features.
        model_type: ``'lgbm'`` or ``'nn'``; also the suffix of the output column.

    Returns:
        DataFrame with ``row_id`` (``"<stock_id>-<time_id>"``) and
        ``target_<model_type>``, clipped from below at ``MIN_TARGET / 2``.

    Raises:
        ValueError: if ``model_type`` is not one of the supported values.
    """
    # Fail before any data is loaded rather than hitting an unbound name later.
    if model_type not in ('lgbm', 'nn'):
        raise ValueError(f"model_type must be 'lgbm' or 'nn', got {model_type!r}")

    X_num_test_df = fetch_data_one_stock(stock_id, partition='test')
    submission = pd.DataFrame()
    submission['row_id'] = X_num_test_df['stock_id'].astype(str) + '-' + X_num_test_df['time_id'].astype(str)

    X_cat_test = X_num_test_df[CAT_FEATURES].to_numpy()
    X_num_test_df.drop((CAT_FEATURES + ['time_id']), axis=1, inplace=True)
    X_num_test = X_num_test_df.to_numpy(dtype='float32')
    del X_num_test_df
    gc.collect()
    # Use the returned array: in-place scaling only happens when the scaler was
    # built with copy=False, so discarding the result is not safe in general.
    X_num_test = scaler.transform(X_num_test)

    if model_type == 'lgbm':
        X_test = np.concatenate((X_cat_test, X_num_test), axis=1)
        del X_num_test, X_cat_test
        gc.collect()
        raw_pred = model.predict(X_test)
        del X_test
    else:  # 'nn'
        raw_pred = model.predict((X_cat_test, X_num_test))
        del X_cat_test, X_num_test
    gc.collect()

    # Flatten so a column-shaped (n, 1) prediction becomes a plain 1-D column.
    Y_test = np.clip(np.ravel(raw_pred), MIN_TARGET/2, None)
    del raw_pred
    submission['target_' + model_type] = Y_test
    del Y_test
    gc.collect()

    return submission
    
def predict_multiple_stocks(stock_ids, model, scaler, model_type, parallel=False):
    """Predict every stock in ``stock_ids`` and stack the per-stock results.

    With ``parallel=True`` one ``predict_one_stock`` job per stock is run on
    all cores with a progress bar; otherwise the stocks are handled one after
    another in this process. The frames are concatenated with a fresh index.
    """
    if parallel:
        runner = ProgressParallel(n_jobs=-1, verbose=10, total=len(stock_ids))
        per_stock = runner(
            delayed(predict_one_stock)(stock_id, model, scaler, model_type)
            for stock_id in stock_ids
        )
    else:
        per_stock = []
        for stock_id in stock_ids:
            per_stock.append(predict_one_stock(stock_id, model, scaler, model_type))

    submission = pd.concat(per_stock, ignore_index=True)
    del per_stock
    gc.collect()
    return submission

In [None]:
%%time
# Load the precomputed prep-opt-v10 training table and report its memory footprint.
X_num = pd.read_parquet('/kaggle/input/prep-opt-v10/prep_opt_v10/full_dataset.parquet')  # book=10, trade=15, 1704 features
X_num.info(memory_usage='deep')

In [None]:
# Feature names handed to LightGBM: the first column plus everything from the
# fourth column on. Presumably columns 1 and 2 are time_id and target, which
# are dropped from the features further down -- verify the column order.
FEATURE_NAMES = [X_num.columns[0]] + list(X_num.columns[3:])
FEATURE_NAMES[:10]

In [None]:
# (rows, columns) of the loaded table, before any column is dropped.
X_num.shape

In [None]:
# Columns treated as categorical; also read by predict_one_stock at inference.
CAT_FEATURES = ['stock_id']
# Split the table into categorical inputs, target vector and numeric features.
X_cat = X_num[CAT_FEATURES].to_numpy()
Y = X_num['target'].to_numpy('float32')
# Dropped in place so no second copy of the wide table is created.
X_num.drop((CAT_FEATURES + ['time_id', 'target']), axis=1, inplace=True)
gc.collect()
print('X_cat shape: ', X_cat.shape)
print('X_num shape:', X_num.shape)
print('Y shape:', Y.shape)

In [None]:
# Single seed shared by NumPy, TensorFlow, the train/dev split, LightGBM and
# the Keras initializer/dropout layers.
RANDOM_STATE = 86

def reset_seed():
    """Re-seed NumPy's and TensorFlow's global random generators with RANDOM_STATE."""
    np.random.seed(RANDOM_STATE)
    tf.random.set_seed(RANDOM_STATE)
    
reset_seed()

In [None]:
# Hold out 10000 rows as the dev set; the three arrays are split with the same
# shuffling so rows stay aligned.
X_cat_train, X_cat_dev, X_num_train_df, X_num_dev_df, Y_train, Y_dev = train_test_split(
    X_cat, X_num, Y, test_size=10000, random_state=RANDOM_STATE
)
# Release the unsplit data before materialising float32 arrays to keep peak
# memory down.
del X_num, Y
gc.collect()

X_num_train = X_num_train_df.to_numpy(dtype='float32')
del X_num_train_df
gc.collect()

X_num_dev = X_num_dev_df.to_numpy(dtype='float32')
del X_num_dev_df
gc.collect()

print('X_cat_train shape: ', X_cat_train.shape)
print('X_num_train shape:', X_num_train.shape)
print('X_cat_dev shape:', X_cat_dev.shape)
print('X_num_dev shape:', X_num_dev.shape)

In [None]:
# copy=False asks the scaler to work in place where it can, avoiding a second
# copy of the large feature matrices.
scaler = MinMaxScaler(copy=False)

# Rebind the returned arrays instead of relying on in-place mutation, so the
# data is scaled even if the scaler has to return a new array.
X_num_train = scaler.fit_transform(X_num_train)
X_num_dev = scaler.transform(X_num_dev)

In [None]:
# LightGBM takes one matrix: the categorical column(s) first, then the scaled
# numeric features (the same order as FEATURE_NAMES).
X_train = np.concatenate((X_cat_train, X_num_train), axis=1)
del X_num_train, X_cat_train
gc.collect()
X_dev = np.concatenate((X_cat_dev, X_num_dev), axis=1)
del X_num_dev, X_cat_dev
gc.collect()

print('X_train shape: ', X_train.shape)
print('X_dev shape:', X_dev.shape)

In [None]:
# Wrap the matrices as LightGBM datasets. free_raw_data=True lets LightGBM
# drop the raw matrix once its internal dataset has been built; the notebook's
# own names are deleted right away for the same reason.
d_train = lgb.Dataset(X_train, label=Y_train, feature_name=FEATURE_NAMES, 
                      categorical_feature=CAT_FEATURES, free_raw_data=True)
del X_train, Y_train
gc.collect()
d_dev = lgb.Dataset(X_dev, label=Y_dev, feature_name=FEATURE_NAMES, 
                    categorical_feature=CAT_FEATURES, free_raw_data=True)
del X_dev, Y_dev
gc.collect()

In [None]:
def rmspe_np(y_true, y_pred):
    """Return the root mean squared percentage error of ``y_pred`` against ``y_true``."""
    relative_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(relative_error)))


def fobj_rmpse(y_pred, lgb_train):
    """Custom LightGBM objective: squared percentage error.

    For the per-row loss ``((y_true - y_pred) / y_true) ** 2`` this returns
    its first and second derivatives with respect to ``y_pred``.

    Args:
        y_pred: current predictions.
        lgb_train: object whose ``get_label()`` returns the true targets.

    Returns:
        ``(grad, hess)`` arrays.
    """
    labels = lgb_train.get_label()
    squared_labels = labels ** 2
    grad = -2 * (labels - y_pred) / squared_labels
    hess = 2 / squared_labels
    return grad, hess
 

def feval_rmspe(y_pred, lgb_train):
    """Custom LightGBM metric: RMSPE rounded to 4 decimals.

    Returns:
        ``(name, value, is_higher_better)``. The name ``'RMSPE: '`` is also
        the key under which the recorded evaluation history is read back.
    """
    score = rmspe_np(lgb_train.get_label(), y_pred)
    return 'RMSPE: ', round(score, 4), False

In [None]:
# Filled by the callback with the metric history of every validation set;
# read back below to find the best round.
eval_result = {}
record_eval_cb = lgb.record_evaluation(eval_result)

# Booster hyper-parameters; training is configured for a single GPU.
params = {
    'boosting_type': 'gbdt',
    'max_depth': -1,
    'num_leaves': 40,
    'max_bin': 255,
    'min_data_in_leaf': 750,
    'learning_rate' : 0.05,
    'subsample': 0.72,
    'subsample_freq': 3,
    'feature_fraction': 0.5,
    'seed': RANDOM_STATE,
    'n_jobs': -1,
    'verbose': -1,
    'device': 'gpu',
    'num_gpu': 1,
    'gpu_platform_id': -1,
    'gpu_device_id': -1,
    'gpu_use_dp': False,
}

# Train with the custom percentage-error objective and RMSPE metric, logging
# every 10 rounds and stopping after 50 rounds without improvement.
# NOTE(review): `fobj`, `early_stopping_rounds` and `verbose_eval` are keyword
# arguments of older LightGBM releases -- confirm the installed version still
# accepts them.
model = lgb.train(params, d_train, num_boost_round=1000, 
                  valid_sets=[d_train, d_dev], valid_names=['Train', 'Dev'], 
                  early_stopping_rounds=50, fobj=fobj_rmpse, feval=feval_rmspe, 
                  verbose_eval = 10, categorical_feature=CAT_FEATURES,
                  callbacks=[record_eval_cb])

In [None]:
# Plot the 20 most important features of the trained booster; the trailing
# semicolon keeps the notebook from echoing the returned object.
lgb.plot_importance(model, max_num_features=20);

In [None]:
# Dev-set RMSPE per boosting round, as recorded under the metric name returned
# by feval_rmspe. min_val_loss_lgb is reused later for the ensemble weights.
val_loss_lgb = np.array(eval_result["Dev"]["RMSPE: "])
min_val_loss_lgb = np.min(val_loss_lgb)
best_epoch_lgb = np.argmin(val_loss_lgb) + 1  # rounds are reported 1-based
print("LGBM's minimum val_loss: ", min_val_loss_lgb, ", achieved at Round", best_epoch_lgb)

In [None]:
# Each entry under book_test.parquet is named `stock_id=<id>`; the id after
# '=' gives the list of stocks to predict.
list_order_book_test = glob.glob(ROOT_PATH + 'book_test.parquet/*')
test_ids = [int(path.split('=')[1]) for path in list_order_book_test]

# LightGBM predictions for every test stock, one parallel job per stock.
submission = predict_multiple_stocks(test_ids, model=model, scaler=scaler, model_type='lgbm', parallel=True)
submission.head()

In [None]:
# Free the LightGBM objects before the NN section reuses the names `model`
# and `scaler`.
del model, eval_result, record_eval_cb, d_train, d_dev, scaler
gc.collect()

# NN

In [None]:
def add_book_features(df):
    """Add per-row order-book features to ``df`` in place and return it.

    NN-section variant: it redefines the earlier function of the same name
    and omits ``price_spread``.

    Added columns:
        wap1, wap2: level-1 / level-2 price where each side's price is
            weighted by the opposite side's size.
        log_ret1, log_ret2: difference of log(wap) between consecutive rows
            of the same ``time_id``; 0 for the first row of each ``time_id``.
        bid_ask_spread, bid_spread, ask_spread, wap_spread: relative price gaps.
        bid_ask_depth_ratio, bid_ask_depth_ratio1, bid_ask_depth_ratio2,
        total_depth: size ratios and the total size over both levels.

    Returns:
        The same DataFrame object with the new columns.
    """
    df['wap1'] = ((df['bid_price1'] * df['ask_size1'] +
                   df['ask_price1'] * df['bid_size1']) /
                  (df['bid_size1'] + df['ask_size1']))
    df['wap2'] = ((df['bid_price2'] * df['ask_size2'] +
                   df['ask_price2'] * df['bid_size2']) /
                  (df['bid_size2'] + df['ask_size2']))

    # A grouped ``diff`` keeps the frame's own index, so the assignment aligns
    # row by row on every pandas version. ``groupby.apply`` with a
    # Series-returning function prepends the group key to the index on
    # pandas >= 2.0, which breaks the column assignment.
    time_ids = df['time_id']
    df['log_ret1'] = np.log(df['wap1']).groupby(time_ids).diff().fillna(0)
    df['log_ret2'] = np.log(df['wap2']).groupby(time_ids).diff().fillna(0)

    df['bid_ask_spread'] = df['ask_price1'] / df['bid_price1'] - 1
    df['bid_spread'] = df['bid_price1'] / df['bid_price2'] - 1
    df['ask_spread'] = df['ask_price2'] / df['ask_price1'] - 1
    df['wap_spread'] = df['wap2'] / df['wap1'] - 1
    df['bid_ask_depth_ratio'] = (df['bid_size1'] + df['bid_size2']) / (df['ask_size1'] + df['ask_size2'])
    df['bid_ask_depth_ratio1'] = df['bid_size1'] / df['ask_size1']
    df['bid_ask_depth_ratio2'] = df['bid_size2'] / df['ask_size2']
    df['total_depth'] = df['bid_size1'] + df['bid_size2'] + df['ask_size1'] + df['ask_size2']

    return df

def add_agg_book_features(df):
    """Summarise the book rows of each ``time_id`` (NN-section variant).

    Returns a new DataFrame indexed by ``time_id`` with ``sigma1``/``sigma2``
    (square root of the summed squared log returns), ``book_update_times``
    (row count) and ``<col>_update_times`` (number of distinct values) for
    the eight raw price/size columns. Redefines the earlier function of the
    same name without the mean/std/min/max columns.
    """
    def realized_volatility(log_returns):
        return np.sqrt(np.sum(log_returns**2))

    raw_cols = ['bid_size1', 'ask_size1', 'bid_size2', 'ask_size2',
                'bid_price1', 'ask_price1', 'bid_price2', 'ask_price2']

    # Group once and reuse the grouping for every feature.
    by_time = df.groupby('time_id')

    df_agg = pd.DataFrame()
    df_agg['sigma1'] = by_time['log_ret1'].apply(realized_volatility)
    df_agg['sigma2'] = by_time['log_ret2'].apply(realized_volatility)
    df_agg['book_update_times'] = by_time['seconds_in_bucket'].count()

    for col in raw_cols:
        distinct_values = by_time[col].unique()
        df_agg[col + '_update_times'] = distinct_values.apply(lambda x: x.shape[0])

    return df_agg

def fetch_book_one_stock(stock_id, partition='train', seconds_lapse=5):
    """Fetch book data of one stock into a wide DataFrame (one row per time_id).

    NN-section variant; it redefines the earlier function of the same name
    with a default ``seconds_lapse`` of 5. Reads the stock's book parquet
    partition, adds row-level and aggregated features, samples the book every
    ``seconds_lapse`` seconds over seconds 0..599 (carrying the last known row
    forward), pivots to wide format and attaches the per-``time_id`` aggregates.

    Args:
        stock_id: id used in the ``stock_id=<id>`` parquet sub-directory name.
        partition: ``'train'`` or ``'test'``; selects ``book_<partition>.parquet``.
        seconds_lapse: sampling step, in seconds, of the wide book columns.

    Returns:
        DataFrame with ``time_id``, the wide sampled columns, the aggregated
        features and a ``stock_id`` column.

    Raises:
        AssertionError: if any value is still missing after filling.
    """
    path = f'{ROOT_PATH}book_{partition}.parquet/stock_id={str(stock_id)}'
    df = pd.read_parquet(path)
    df = add_book_features(df)
    df_agg = add_agg_book_features(df)

    # add missing seconds to book data
    df = df.set_index(['time_id', 'seconds_in_bucket'])
    multi_index = pd.MultiIndex.from_product([df.index.levels[0],
                                              np.arange(0, 600, seconds_lapse)],
                                             names=['time_id', 'seconds_in_bucket'])

    # forward fill book on missing seconds (missing = order book unchanged);
    # bfill afterwards in case second 0 is missing. ``.bfill()`` replaces the
    # deprecated ``fillna(method='bfill')`` spelling with identical results.
    df = df.reindex(multi_index, method='ffill').bfill()
    df.reset_index(inplace=True)

    df = convert_long_to_wide_format(df)
    df = df.merge(df_agg, on='time_id', how='left')
    del df_agg
    gc.collect()
    df['stock_id'] = stock_id  # add stock_id as primary key for later merging
    assert not df.isna().any().any()

    return df

In [None]:
def add_trade_features(df):
    """Add per-row trade features to ``df`` in place and return it.

    Added columns:
        real_log_ret: difference of log(price) between consecutive rows of
            the same ``time_id``; 0 for the first row of each ``time_id``.
        volume: ``price * size``.
        volume_per_order: ``volume / order_count``, with missing results
            (e.g. 0 / 0 when nothing traded) replaced by 0.
        volume_last_seconds, whale_last_seconds: ``volume`` and
            ``volume_per_order`` scaled by ``seconds_in_bucket / 600``, so
            later rows weigh more.

    Returns:
        The same DataFrame object with the new columns.
    """
    # A grouped ``diff`` keeps the frame's own index, so the assignment aligns
    # row by row on every pandas version (``groupby.apply`` with a
    # Series-returning function changes the index on pandas >= 2.0).
    df['real_log_ret'] = np.log(df['price']).groupby(df['time_id']).diff().fillna(0)
    df['volume'] = df['price'] * df['size']
    df['volume_per_order'] = (df['volume'] / df['order_count']).fillna(0)
    df['volume_last_seconds'] = df['volume'] * df['seconds_in_bucket'] / 600
    df['whale_last_seconds'] = df['volume_per_order'] * df['seconds_in_bucket'] / 600

    return df

def add_agg_trade_features(df, df_agg):
    """Append per-``time_id`` trade summaries to ``df_agg`` and return it.

    NN-section variant: adds ``real_sigma`` (square root of the summed
    squared ``real_log_ret``) and only the ``std_`` of each trade column.
    ``df_agg`` is modified in place.
    """
    trade_cols = ['price', 'size', 'order_count', 'volume',
                  'volume_per_order', 'volume_last_seconds', 'whale_last_seconds']

    # Group once and reuse the grouping for every feature.
    by_time = df.groupby('time_id')
    df_agg['real_sigma'] = by_time['real_log_ret'].apply(lambda x: np.sqrt(np.sum(x**2)))
    for col in trade_cols:
        df_agg['std_' + col] = by_time[col].apply(np.std)

    return df_agg
    
def fetch_trade_one_stock(stock_id, partition='train', seconds_lapse=5):
    """Fetch trade data of one stock into a wide DataFrame (one row per time_id).

    NN-section variant; it redefines the earlier function of the same name
    with a default ``seconds_lapse`` of 5. Reads the stock's trade parquet
    partition, expands every ``time_id`` to seconds 0..599, re-buckets the
    seconds into windows of ``seconds_lapse`` (size-weighted average price,
    summed size and order count per window), adds row-level and aggregated
    features, and pivots to wide format.

    Args:
        stock_id: id used in the ``stock_id=<id>`` parquet sub-directory name.
        partition: ``'train'`` or ``'test'``; selects ``trade_<partition>.parquet``.
        seconds_lapse: width, in seconds, of each re-bucketed window.

    Returns:
        DataFrame with ``time_id``, the wide per-window columns, the
        aggregated features and a ``stock_id`` column.

    Raises:
        AssertionError: if any value is still missing after filling.
    """
    path = f'{ROOT_PATH}trade_{partition}.parquet/stock_id={str(stock_id)}'
    df = pd.read_parquet(path)
    df_agg = pd.DataFrame()
    df_agg['trade_times'] = df.groupby('time_id')['seconds_in_bucket'].unique().apply(lambda x: x.shape[0])

    # add ALL missing seconds (0, 1, 2, ..., 599) to trade data
    df = df.set_index(['time_id', 'seconds_in_bucket'])
    multi_index = pd.MultiIndex.from_product([df.index.levels[0],
                                              np.arange(0, 600)],
                                             names=['time_id', 'seconds_in_bucket'])
    df = df.reindex(multi_index)
    df.reset_index(inplace=True)
    # forward fill price in missing seconds (missing = no trade happens);
    # bfill in case of missing second 0.
    # NOTE(review): this fill (and the one after re-bucketing) runs over the
    # whole column, so leading gaps of a time_id take the previous time_id's
    # last price. Left as is to keep features identical to the precomputed
    # training data -- confirm whether a per-time_id fill is wanted.
    df['price'] = df['price'].ffill().bfill()
    # fill size and order_count with 0
    df['size'] = df['size'].fillna(value=0).astype(int)
    df['order_count'] = df['order_count'].fillna(value=0).astype(int)

    # build another bucket of seconds (1 bucket = seconds_lapse)
    # e.g. seconds_lapse = 5; then seconds 0, 1, ..., 4 fall into bucket 5; seconds 5 - 9 fall into bucket 10; etc.
    # Vectorised floor division gives the same labels as a per-row floor().
    df['seconds_in_bucket'] = (df['seconds_in_bucket'] // seconds_lapse + 1) * seconds_lapse
    # now compute the average price traded in each bucket of seconds again
    # & total number shares traded + order_count
    df['price'] = df['price'] * df['size']
    df = df.groupby(['time_id', 'seconds_in_bucket']).sum()
    df['price'] = df['price'] / df['size']
    # fill missing price due to no trades happened in an entire bucket (0 / 0)
    df['price'] = df['price'].ffill().bfill()
    df.reset_index(inplace=True)

    df = add_trade_features(df)
    df_agg = add_agg_trade_features(df, df_agg)

    df = convert_long_to_wide_format(df)
    df = df.merge(df_agg, on='time_id', how='left')
    del df_agg
    gc.collect()
    df['stock_id'] = stock_id  # add stock_id as primary key for later merging

    assert not df.isna().any().any()

    return df

In [None]:
def fetch_data_one_stock(stock_id, partition='train', book_seconds_lapse=10, trade_seconds_lapse=15):
    """Build the full feature table of one stock (NN-section redefinition).

    Book and trade features are joined on (stock_id, time_id) with the book
    rows as the left side; time_ids without trade data get 0 in every trade
    column.

    Args:
        stock_id: stock to load.
        partition: ``'train'`` or ``'test'``.
        book_seconds_lapse: sampling step passed to ``fetch_book_one_stock``.
        trade_seconds_lapse: window width passed to ``fetch_trade_one_stock``.

    Returns:
        For ``'train'``: the stock's rows of ``train_target`` (target cast to
        float32) inner-joined with the features. For ``'test'``: the features
        with ``stock_id`` moved to the first column. Both are passed through
        ``reduce_memory_usage``.

    Raises:
        ValueError: if ``partition`` is neither ``'train'`` nor ``'test'``.
    """
    # Fail before any file is read instead of silently returning None.
    if partition not in ('train', 'test'):
        raise ValueError(f"partition must be 'train' or 'test', got {partition!r}")

    pd.options.mode.chained_assignment = None
    book_df = fetch_book_one_stock(stock_id, partition, book_seconds_lapse)
    trade_df = fetch_trade_one_stock(stock_id, partition, trade_seconds_lapse)
    # merge order books and trade histories into 1 DataFrame
    features_df = book_df.merge(trade_df, on=['stock_id', 'time_id'], how='left')
    del book_df, trade_df
    gc.collect()
    features_df.fillna(0, inplace=True)

    if partition == 'train':
        # Copy the slice so the cast below never writes into ``train_target``.
        target_df = train_target[train_target['stock_id'] == stock_id].copy()
        target_df['target'] = target_df['target'].astype('float32')
        full_df = target_df.merge(features_df, on=['stock_id', 'time_id'], how='inner')
        del features_df, target_df
        gc.collect()
        return reduce_memory_usage(full_df)

    # partition == 'test': put stock_id first, keep every other column in order
    cols = list(features_df.columns)
    cols.insert(0, cols.pop(cols.index('stock_id')))
    features_df = features_df.loc[:, cols]
    return reduce_memory_usage(features_df)

In [None]:
class ProgressParallel(Parallel):
    """``Parallel`` subclass that mirrors task progress onto a tqdm bar.

    Redefinition for the NN section, identical in behaviour to the earlier
    class of the same name.

    Args:
        use_tqdm: when False the bar is created disabled.
        total: expected number of tasks; when None the bar's total follows
            the number of dispatched tasks.
        *args, **kwargs: forwarded unchanged to ``Parallel``.
    """

    def __init__(self, use_tqdm=True, total=None, *args, **kwargs):
        self._use_tqdm = use_tqdm
        self._total = total
        super().__init__(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        # The bar lives exactly as long as the parallel run.
        progress_bar = tqdm(disable=not self._use_tqdm, total=self._total)
        with progress_bar as self._pbar:
            return super().__call__(*args, **kwargs)

    def print_progress(self):
        # Presumably invoked by joblib as tasks complete (it overrides the
        # parent's method of the same name) -- TODO confirm.
        bar = self._pbar
        if self._total is None:
            bar.total = self.n_dispatched_tasks
        bar.n = self.n_completed_tasks
        bar.refresh()

        
def fetch_data_all_stocks(stock_ids, partition="train", book_seconds_lapse=10, trade_seconds_lapse=15):
    """Build the feature tables of several stocks in parallel and stack them.

    NN-section redefinition. One ``fetch_data_one_stock`` job is dispatched
    per stock id on all cores (``n_jobs=-1``) with a progress bar. The
    per-stock frames are concatenated with a fresh index, checked for missing
    values and downcast.

    Raises:
        AssertionError: if the concatenated table contains a missing value.
    """
    runner = ProgressParallel(n_jobs=-1, verbose=10, total=len(stock_ids))
    jobs = (
        delayed(fetch_data_one_stock)(stock_id, partition, book_seconds_lapse, trade_seconds_lapse)
        for stock_id in stock_ids
    )
    per_stock_frames = runner(jobs)
    combined = pd.concat(per_stock_frames, ignore_index=True)
    # Drop the per-stock frames before collecting, as the original rebinding did.
    del per_stock_frames
    gc.collect()

    assert not combined.isna().any().any()
    return reduce_memory_usage(combined)

In [None]:
def predict_one_stock(stock_id, model, scaler, model_type):
    """Predict the targets of one test stock (NN-section redefinition).

    Args:
        stock_id: stock whose test partition is loaded.
        model: fitted model; called as ``model.predict(X)`` for ``'lgbm'`` and
            ``model.predict((X_cat, X_num))`` for ``'nn'``.
        scaler: fitted scaler applied to the numeric features.
        model_type: ``'lgbm'`` or ``'nn'``; also the suffix of the output column.

    Returns:
        DataFrame with ``row_id`` (``"<stock_id>-<time_id>"``) and
        ``target_<model_type>``, clipped from below at ``MIN_TARGET / 2``.

    Raises:
        ValueError: if ``model_type`` is not one of the supported values.
    """
    # Fail before any data is loaded rather than hitting an unbound name later.
    if model_type not in ('lgbm', 'nn'):
        raise ValueError(f"model_type must be 'lgbm' or 'nn', got {model_type!r}")

    X_num_test_df = fetch_data_one_stock(stock_id, partition='test')
    submission = pd.DataFrame()
    submission['row_id'] = X_num_test_df['stock_id'].astype(str) + '-' + X_num_test_df['time_id'].astype(str)

    X_cat_test = X_num_test_df[CAT_FEATURES].to_numpy()
    X_num_test_df.drop((CAT_FEATURES + ['time_id']), axis=1, inplace=True)
    X_num_test = X_num_test_df.to_numpy(dtype='float32')
    del X_num_test_df
    gc.collect()
    # Use the returned array: in-place scaling only happens when the scaler was
    # built with copy=False, so discarding the result is not safe in general.
    X_num_test = scaler.transform(X_num_test)

    if model_type == 'lgbm':
        X_test = np.concatenate((X_cat_test, X_num_test), axis=1)
        del X_num_test, X_cat_test
        gc.collect()
        raw_pred = model.predict(X_test)
        del X_test
    else:  # 'nn'
        raw_pred = model.predict((X_cat_test, X_num_test))
        del X_cat_test, X_num_test
    gc.collect()

    # Flatten so a column-shaped (n, 1) prediction becomes a plain 1-D column.
    Y_test = np.clip(np.ravel(raw_pred), MIN_TARGET/2, None)
    del raw_pred
    submission['target_' + model_type] = Y_test
    del Y_test
    gc.collect()

    return submission
    
def predict_multiple_stocks(stock_ids, model, scaler, model_type, parallel=False):
    """Predict every stock in ``stock_ids`` and stack the per-stock results.

    NN-section redefinition. With ``parallel=True`` one ``predict_one_stock``
    job per stock is run on all cores with a progress bar; otherwise the
    stocks are handled one after another in this process. The frames are
    concatenated with a fresh index.
    """
    if parallel:
        runner = ProgressParallel(n_jobs=-1, verbose=10, total=len(stock_ids))
        per_stock = runner(
            delayed(predict_one_stock)(stock_id, model, scaler, model_type)
            for stock_id in stock_ids
        )
    else:
        per_stock = []
        for stock_id in stock_ids:
            per_stock.append(predict_one_stock(stock_id, model, scaler, model_type))

    submission = pd.concat(per_stock, ignore_index=True)
    del per_stock
    gc.collect()
    return submission

In [None]:
%%time
# Load the precomputed prep-opt-v5 training table used by the NN and report
# its memory footprint.
X_num = pd.read_parquet('/kaggle/input/prep-opt-v5/prep_opt/full_dataset.parquet')  # book=10, trade=15, 1541 features
X_num.info(memory_usage='deep')

In [None]:
# Split the table into categorical inputs, target vector and numeric features.
X_cat = X_num[CAT_FEATURES].to_numpy()
# Input dimension of the stock_id embedding: largest id + 1. A vectorised
# array max replaces the builtin max(), which walks the 2-D array row by row
# in Python; int() yields a plain Python integer.
MAX_STOCK_ID = int(X_cat.max()) + 1
Y = X_num['target'].to_numpy('float32')
# Dropped in place so no second copy of the wide table is created.
X_num.drop((CAT_FEATURES + ['time_id', 'target']), axis=1, inplace=True)
gc.collect()
print('X_cat shape: ', X_cat.shape)
print('X_num shape:', X_num.shape)
print('Y shape:', Y.shape)

In [None]:
# Re-seed before splitting so the NN section starts from the same random state.
reset_seed()

In [None]:
# Hold out 10000 rows as the dev set; the three arrays are split with the same
# shuffling so rows stay aligned.
X_cat_train, X_cat_dev, X_num_train_df, X_num_dev_df, Y_train, Y_dev = train_test_split(
    X_cat, X_num, Y, test_size=10000, random_state=RANDOM_STATE
)
# Release the unsplit data before materialising float32 arrays to keep peak
# memory down.
del X_num, Y
gc.collect()

X_num_train = X_num_train_df.to_numpy(dtype='float32')
del X_num_train_df
gc.collect()

X_num_dev = X_num_dev_df.to_numpy(dtype='float32')
del X_num_dev_df
gc.collect()

print('X_cat_train shape: ', X_cat_train.shape)
print('X_num_train shape:', X_num_train.shape)
print('X_cat_dev shape:', X_cat_dev.shape)
print('X_num_dev shape:', X_num_dev.shape)

In [None]:
# copy=False asks the scaler to work in place where it can, avoiding a second
# copy of the large feature matrices.
scaler = MinMaxScaler(copy=False)

# Rebind the returned arrays instead of relying on in-place mutation, so the
# data is scaled even if the scaler has to return a new array.
X_num_train = scaler.fit_transform(X_num_train)
X_num_dev = scaler.transform(X_num_dev)

In [None]:
# Reset Keras' global state and re-seed before the network is built.
keras.backend.clear_session()
reset_seed()

def rmspe(y_true, y_pred):
    """TensorFlow loss: root mean squared percentage error over the batch."""
    pct_error = (y_true - y_pred) / y_true
    return tf.sqrt(tf.reduce_mean(tf.square(pct_error)))

def optiver_fnn(tpu=True):
    """Build and compile the feed-forward network with a stock_id embedding.

    The model takes two inputs, ``stock_id`` (shape ``(1,)``) and ``num_data``
    (one value per column of ``X_num_train``). The stock id is embedded,
    flattened and concatenated with the numeric features, then passed through
    a stack of Dense -> [BatchNormalization] -> Activation -> Dropout blocks
    and a single-unit output layer with its own activation. The model is
    compiled with the ``rmspe`` loss.

    Args:
        tpu: unused; kept so existing calls keep working.

    Returns:
        The compiled Keras model.

    Raises:
        ValueError: if the drop-out schedule does not cover every hidden
            layer (``num_layers`` not a multiple of 4) or the optimizer is
            neither Adam nor Nadam.
    """
    # params
    num_units = 64
    num_layers = 8
    embedding_size = 96
    drop_out = 0
    activation = activations.swish
    activation_out = activations.swish
    kernel_initializer = initializers.LecunUniform(seed=RANDOM_STATE)
    batch_norm = False
    optimizer = optimizers.Adam
    learning_rate = 0.0005
    regularizer = None

    # build models
    hidden_units = [num_units] * num_layers
    # Drop-out decays by quarter of the stack: full, half, quarter, none.
    quarter_num_layers = int(num_layers / 4)
    drop_out_rates = ([drop_out] * quarter_num_layers +
                      [drop_out / 2] * quarter_num_layers +
                      [drop_out / 4] * quarter_num_layers +
                      [0] * quarter_num_layers)

    if len(hidden_units) != len(drop_out_rates):
        raise ValueError("Length of hidden units must be equal to length of drop-out rates.")

    inp_cat = Input(shape=(1,), name='stock_id')
    # Shape is passed as a tuple, the form every Keras version accepts.
    inp_num = Input(shape=(X_num_train.shape[1],), name='num_data')

    stock_embedded = layers.Embedding(MAX_STOCK_ID, embedding_size,
                                      input_length=1, name='stock_id_embedding')(inp_cat)
    stock_flattened = layers.Flatten()(stock_embedded)
    x = layers.Concatenate()([stock_flattened, inp_num])

    # One loop builds every hidden block; batch normalisation is the only
    # optional step, so the layers are created in the same order as before.
    for units, rate in zip(hidden_units, drop_out_rates):
        x = layers.Dense(units, kernel_initializer=kernel_initializer,
                         kernel_regularizer=regularizer)(x)
        if batch_norm:
            x = layers.BatchNormalization()(x)
        x = layers.Activation(activation)(x)
        x = layers.Dropout(rate, seed=RANDOM_STATE)(x)

    x = layers.Dense(1)(x)
    out = layers.Activation(activation_out)(x)

    model = models.Model(inputs=[inp_cat, inp_num], outputs=out)

    def lr_normalizer(lr, optimizer):
        """Return the learning rate to use for the optimizer class (doubled for Nadam)."""
        if optimizer == optimizers.Adam:
            return lr
        elif optimizer == optimizers.Nadam:
            return lr * 2
        else:
            raise ValueError(str(optimizer) + ' is neither Adam or Nadam.')

    model.compile(loss=rmspe, optimizer=optimizer(lr_normalizer(learning_rate, optimizer)))
    return model


# Build the network and train it for up to 400 epochs; training stops after 20
# epochs without improvement and the best weights are restored.
model = optiver_fnn()
earlystop_cb = callbacks.EarlyStopping(patience=20, restore_best_weights=True)
history = model.fit((X_cat_train, X_num_train), Y_train, epochs=400, 
                    validation_data=((X_cat_dev, X_num_dev), Y_dev), batch_size=2048, callbacks=[earlystop_cb]);

In [None]:
# Dev-set loss (RMSPE) per epoch. min_val_loss_nn is reused later for the
# ensemble weights.
val_loss_nn = history.history["val_loss"]
min_val_loss_nn = np.min(val_loss_nn)
best_epoch_nn = np.argmin(val_loss_nn) + 1  # epochs are reported 1-based
print("NN's minimum val_loss: ", min_val_loss_nn, ", achieved at Epoch", best_epoch_nn)

In [None]:
# Training data is no longer needed; free it before predicting on the test set.
del X_cat_train, X_num_train, Y_train, X_cat_dev, X_num_dev, Y_dev, earlystop_cb, history
gc.collect()

In [None]:
# NN predictions for the same test stocks; `parallel` is left at its default
# (False), so the stocks are predicted one after another in this process.
submission_nn = predict_multiple_stocks(test_ids, model=model, scaler=scaler, model_type='nn')
submission_nn.head()

In [None]:
# The NN model and its scaler are not used after this point.
del model, scaler
gc.collect()

# Ensemble

In [None]:
# Attach the NN predictions to the LightGBM ones by row_id. A left join keeps
# every LightGBM row; a row without an NN prediction would get NaN.
submission = submission.merge(submission_nn, how='left', on='row_id')
del submission_nn
gc.collect()

In [None]:
# NOTE(review): 0.21 looks like a baseline RMSPE that both validation losses
# are measured against -- confirm where it comes from.
base_rmspe = 0.21
# Each model is weighted by the other model's excess error over the baseline:
# lgb_weight = nn_excess / (nn_excess + lgb_excess). The division fails when
# the LGBM loss equals the baseline, and a loss below the baseline makes the
# ratio negative.
error_ratio = (min_val_loss_nn - base_rmspe) / (min_val_loss_lgb - base_rmspe)
lgb_weight = error_ratio / (error_ratio + 1)
nn_weight = 1 - lgb_weight

print('LGBM weight: ', lgb_weight)
print('NN weight: ', nn_weight)
# Final prediction is the weighted average; the per-model columns are dropped.
submission['target'] = (submission['target_nn'] * nn_weight + 
                        submission['target_lgbm'] * lgb_weight)
submission.drop(['target_nn', 'target_lgbm'], axis=1, inplace=True)

In [None]:
# Write the final row_id/target table without the DataFrame index.
submission.to_csv('submission.csv', index=False)
submission.head()