## Models:
### - 501: MLP / 1dCNN / TabNet / UNet / WaveNet
### - 601: LGB / Catboost / DeepForest(CFR) / MLP / 1dCNN / TabNet

In [None]:
# install DeepForest
# Offline install: the package files shipped in the input dataset are staged in
# a local folder and pip is pointed at it with --no-index / --find-links.

!mkdir -p /tmp/pip/cache/

import os
from shutil import copyfile
from tqdm.notebook import tqdm

src = '../input/deep-forest-files/'
dst = '/tmp/pip/cache/'
for filename in tqdm(os.listdir(src)):
    if '.xyz' in filename:
        # Files carrying a '.xyz' marker are copied under a '.tar.gz' name.
        # NOTE(review): presumably the source archives were renamed to '.xyz'
        # before upload so the dataset host would not unpack them — confirm.
        f = filename.split('.xyz')[0]
        copyfile(src + filename, dst + f + '.tar.gz')
    else:
        # Everything else is copied unchanged.
        copyfile(src + filename, dst + filename)

!pip install --no-index --find-links /tmp/pip/cache/ deep-forest

from deepforest import CascadeForestRegressor as CFR

# install TabNet
# Installed straight from the wheel file bundled in the input dataset.
!pip -q install ../input/pytorchtabnet/pytorch_tabnet-3.1.1-py3-none-any.whl

In [None]:
import numpy as np 
import numpy.matlib
import datatable as dt
import pandas as pd
import glob
import gc
import pickle
from collections import defaultdict
from tqdm.auto import tqdm
from numba import njit
from joblib import Parallel, delayed, dump, load

from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer, LabelEncoder


from numba_functions import *

# TF
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model

# Torch and TabNet
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

import torch
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts

In [None]:
# Number of sub-windows each time_id is split into for the "_min" features.
N_MINS = 5
# Length of one sub-window in seconds; add_feature_min derives the sub-window
# index as seconds_in_bucket // MIN_SIZE.
MIN_SIZE = 600 // N_MINS
# Truthy -> the notebook reads the *train* files instead of the *test* files.
DEBUG = 0
# Not referenced in the visible part of the notebook.
DEBUG_STOCK_ID = 31

# CONSTANT
# MEAN / STD standardise the log of the target in transform_target and are
# undone in inverse_target / mspe_loss; EPS is added before the log.
# NOTE(review): presumably the mean and std of log(target) on the training
# set — confirm against the training notebook.
MEAN = -5.762330803300896
STD = 0.6339307835941186
EPS = 1e-9

# path
# Folder holding the fitted preprocessing objects (e.g. pipe.pkl).
preprocessor_path = '../input/optiver-final-preprocessors'

In [None]:
# df_result collects one prediction column per model, keyed by stock_id / time_id.
if not DEBUG:
    df_result = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
else:
    # train.csv has no row_id column, so build '<stock_id>-<time_id>' here.
    df_result = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
    df_result['row_id'] = [
        f'{stock}-{time}' for stock, time in df_result[['stock_id', 'time_id']].values
    ]
df_result.head()

In [None]:
# The stock ids are parsed from the 'stock_id=<n>' partition folder names.
if DEBUG:
    list_train_book_data = glob.glob('../input/optiver-realized-volatility-prediction/book_train.parquet/*')
    list_stock_id = sorted(int(path.split('=')[1]) for path in list_train_book_data)
#     list_stock_id = [0, 31]
else:
    list_test_book_data = glob.glob('../input/optiver-realized-volatility-prediction/book_test.parquet/*')
    list_stock_id = sorted(int(path.split('=')[1]) for path in list_test_book_data)

# Functions

In [None]:
def calc_log_return(prices):
    """Return the first difference of the natural log of ``prices``.

    ``prices`` must yield an object with ``.diff()`` once logged (e.g. a pandas
    Series); the first element of the result is NaN.
    """
    log_prices = np.log(prices)
    return log_prices.diff()


def transform_target(target):
    """Map a raw target to model space: log(target + EPS), standardised with MEAN / STD."""
    log_target = np.log(target + EPS)
    return (log_target - MEAN) / STD


def inverse_target(target):
    """Undo ``transform_target``: de-standardise, exponentiate and remove EPS."""
    log_target = MEAN + STD * target
    return np.exp(log_target) - EPS


def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``."""
    pct_err = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(pct_err)))


def mspe_loss(y_true, y_pred):
    """Keras loss evaluated on the original target scale.

    Both tensors are mapped back from the standardised log space (same formula
    as ``inverse_target``) before the percentage error is taken. Despite the
    name, the square root is applied, so the value is the *root* mean squared
    percentage error.
    """
    actual = K.exp(MEAN + STD * y_true) - EPS
    predicted = K.exp(MEAN + STD * y_pred) - EPS
    pct_err = (actual - predicted) / actual
    return K.sqrt(K.mean(K.square(pct_err)))


def feval_rmspe(y_pred, lgb_train):
    """Custom evaluation callback returning ``('RMSPE', value, False)``.

    The labels are read from ``lgb_train.get_label()``.
    NOTE(review): the trailing ``False`` is presumably LightGBM's
    "is higher better" flag — confirm against the training code.
    """
    score = rmspe(lgb_train.get_label(), y_pred)
    return 'RMSPE', score, False

In [None]:
def load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    Unpickling executes code embedded in the file, so only use this on trusted
    files.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def gen_row_id(df):
    """Add a ``row_id`` column of the form ``'<stock_id>-<time_id>'`` to ``df`` (in place) and return it."""
    id_pairs = df[['stock_id', 'time_id']].values
    df['row_id'] = [f'{stock}-{time}' for stock, time in id_pairs]
    return df


def get_path_by_id(data_type, stock_id):
    """Return the parquet partition folder of ``stock_id`` for ``data_type``.

    ``data_type`` is the file prefix (the notebook passes 'book' or 'trade').
    The test folders are used unless ``DEBUG`` is truthy.
    """
    if not DEBUG:
        return f'../input/optiver-realized-volatility-prediction/{data_type}_test.parquet/stock_id={stock_id}'
    return f'../input/optiver-realized-volatility-prediction/{data_type}_train.parquet/stock_id={stock_id}'

# 501

## Feature Generation

In [None]:
def unstack_agg(df, agg_col):
    """Pivot the second index level into columns named ``'<agg_col>_<key>'``.

    Returns the wide frame with the remaining index level moved back to a
    regular column.
    """
    wide = df.unstack(level=1)
    wide.columns = [f'{agg_col}_{key}' for key in wide.columns]
    return wide.reset_index()

def init_feature_df(df_book, stock_id):
    """Start a feature frame: one row per distinct ``time_id`` of ``df_book``.

    Columns are ``['stock_id', 'time_id']``; the time_ids keep their order of
    first appearance.
    """
    df_feature = pd.DataFrame({'time_id': df_book['time_id'].unique()})
    df_feature.insert(0, 'stock_id', stock_id)
    return df_feature

def add_stats(df, cols, data_name, suffix='', axis=0):
    """Append summary statistics of ``df[cols]`` to ``df`` (in place) and return it.

    New columns are named ``'<data_name>_<suffix>_<stat>'`` and are added in the
    order mean, std, skew, min, q1, q2, q3, max. When ``cols`` holds a single
    name the reduced array is collapsed to a scalar with ``.item()`` before the
    assignment.
    """
    single_col = len(cols) == 1
    reducers = (
        ('mean', lambda frame: frame.mean(axis=axis)),
        ('std', lambda frame: frame.std(axis=axis)),
        ('skew', lambda frame: frame.skew(axis=axis)),
        ('min', lambda frame: frame.min(axis=axis)),
        ('q1', lambda frame: frame.quantile(q=0.25, axis=axis)),
        ('q2', lambda frame: frame.quantile(q=0.50, axis=axis)),
        ('q3', lambda frame: frame.quantile(q=0.75, axis=axis)),
        ('max', lambda frame: frame.max(axis=axis)),
    )
    for stat, reduce in reducers:
        values = reduce(df[cols]).values
        df[f'{data_name}_{suffix}_{stat}'] = values.item() if single_col else values
    return df

def add_feature_min(df_feature, df, configs):
    """Add per-sub-window aggregates and their row-wise statistics to ``df_feature``.

    ``df`` gets a ``min_id`` column (``seconds_in_bucket // MIN_SIZE``) added in
    place. For every ``(data_col, agg_func, agg_col)`` in ``configs`` the column
    is aggregated per (time_id, min_id), pivoted to ``'<agg_col>_<min_id>'``
    columns, merged on ``time_id``, and summarised across the ``N_MINS``
    sub-window columns by ``add_stats``. Remaining NaNs are replaced by 0.
    """
    df['min_id'] = df['seconds_in_bucket'] // MIN_SIZE
    grouped = df.groupby(['time_id', 'min_id'])
    for data_col, agg_func, agg_col in configs:
        # agg by min
        per_min = unstack_agg(grouped[data_col].agg(agg_func, engine='numba'), agg_col)
        df_feature = df_feature.merge(per_min, on=['time_id'], how='left')
        # sub-windows without any row never become a column: create them as 0
        window_cols = [f'{agg_col}_{k}' for k in range(N_MINS)]
        absent = [col for col in window_cols if col not in df_feature]
        for col in absent:
            df_feature[col] = 0
        # gen stats by min and by time
        df_feature = add_stats(df_feature, cols=window_cols, data_name=agg_col, suffix='min', axis=1)
    return df_feature.fillna(0.0)

def add_feature_time(df_feature, df, configs):
    """Add whole-time_id aggregates named ``'<agg_col>_time'`` to ``df_feature``.

    Each ``(data_col, agg_func, agg_col)`` in ``configs`` is aggregated per
    ``time_id`` and left-merged onto ``df_feature``; NaNs are replaced by 0.
    """
    per_time = df.groupby(['time_id'])
    for data_col, agg_func, agg_col in configs:
        # agg by time
        aggregated = per_time[data_col].agg(agg_func, engine='numba').rename(f'{agg_col}_time')
        df_feature = df_feature.merge(aggregated, on=['time_id'], how='left')
    return df_feature.fillna(0.0)

def ffill_book(df_book, n_seconds=600):
    """Expand the book to one row per second of every time_id and fill the gaps.

    Parameters
    ----------
    df_book : DataFrame with ``time_id`` and ``seconds_in_bucket`` columns plus
        the book value columns.
    n_seconds : number of seconds in a bucket; the grid covers
        ``0 .. n_seconds - 1`` (default 600, the value used so far).

    Returns
    -------
    DataFrame with ``len(unique time_id) * n_seconds`` rows. Missing seconds are
    forward-filled within their time_id; whatever is still missing afterwards
    (the seconds before the first snapshot of a time_id) is back-filled. The
    back-fill is not grouped, which is sufficient because every time_id in the
    grid has at least one real row following its leading gap.
    """
    time_ids = df_book['time_id'].unique()
    # Plain numpy replaces the deprecated numpy.matlib helpers used before.
    grid = pd.DataFrame({
        'time_id': np.repeat(time_ids, n_seconds),
        'seconds_in_bucket': np.tile(np.arange(n_seconds), len(time_ids)),
    })
    filled = grid.merge(df_book, on=['time_id', 'seconds_in_bucket'], how='left')
    filled = filled.set_index('time_id').groupby(level='time_id').ffill()
    return filled.bfill().reset_index()

## Preprocessing - LGB

In [None]:
# Each entry is (data_col, agg_func, agg_col): the source column, the
# aggregation passed to groupby(...).agg(..., engine='numba'), and the prefix
# of the generated feature columns (see add_feature_min / add_feature_time).
# NOTE(review): the *_numba aggregators are not defined in this notebook;
# presumably they come from `from numba_functions import *` — confirm.

# Aggregations on the raw (not forward-filled) book rows.
book_configs = [
    ('log_return1', rv_numba, 'B_RV1'),
    ('log_return2', rv_numba, 'B_RV2'),
    ('seconds_in_bucket', count_numba, 'B_NROW'),
    ('bid_vol1', sum_numba, 'B_BVOL1'),
    ('bid_vol2', sum_numba, 'B_BVOL2'),
    ('ask_vol1', sum_numba, 'B_AVOL1'),
    ('ask_vol2', sum_numba, 'B_AVOL2'),
]

# Aggregations on the book after ffill_book has expanded it to one row per second.
book_configs_ffill = [
    ('bid_price1', mean_numba, 'B_BP1'),
    ('bid_price2', mean_numba, 'B_BP2'),
    ('ask_price1', mean_numba, 'B_AP1'),
    ('ask_price2', mean_numba, 'B_AP2'),
    ('bid_size1', mean_numba, 'B_BS1'),
    ('bid_size2', mean_numba, 'B_BS2'),
    ('ask_size1', mean_numba, 'B_AS1'),
    ('ask_size2', mean_numba, 'B_AS2'),
    # new features
    ('price1_diff', mean_numba, 'Z_P1-DIFF'),
    ('price2_diff', mean_numba, 'Z_P2-DIFF'),
    ('price1_dabs', mean_numba, 'Z_P1-DABS'),
    ('price2_dabs', mean_numba, 'Z_P2-DABS'),
    ('price_spread1', mean_numba, 'Z_SPREAD1'),
]

# Aggregations on the trade rows ('vol' is price * size, added in gen_df_feature).
trade_configs = [
    ('vol', sum_numba, 'T_VOL'),
    ('order_count', sum_numba, 'T_OC'),
    ('size', sum_numba, 'T_SIZE'),
    ('seconds_in_bucket', count_numba, 'T_NROW'),
]

In [None]:
def gen_df_feature(stock_id):
    """Build the per-time_id feature frame of the 501 pipeline for one stock.

    Reads the stock's book and trade parquet folders (train or test, depending
    on ``DEBUG`` through ``get_path_by_id``), derives WAP, log-return and
    price * size columns, aggregates them per sub-window and per time_id using
    ``book_configs``, ``book_configs_ffill`` and ``trade_configs``, and finally
    adds the combined ``Z_RATIO`` feature.

    Returns a DataFrame with ``stock_id``, ``time_id`` and the feature columns;
    aggregates that could not be computed are 0.
    """
    # -----------------------------------------------------------------
    # Book data (no ffill)
    book_parquet_path = get_path_by_id('book', stock_id)
    df_book = pd.read_parquet(book_parquet_path)
    # Cast every value column (all but time_id / seconds_in_bucket) to float64.
    # astype creates new columns, so the cast takes effect no matter whether the
    # installed pandas writes ``df.iloc[:, 2:] = ...`` in place or not.
    df_book = df_book.astype({col: 'float64' for col in df_book.columns[2:]})
    df_book_ff = df_book.copy()
    df_feature = init_feature_df(df_book, stock_id)
    # add wap and log_return
    df_book['wap1'] = calc_wap_njit(
        df_book.bid_price1.values,
        df_book.ask_price1.values,
        df_book.bid_size1.values,
        df_book.ask_size1.values
    )
    df_book['wap2'] = calc_wap_njit(
        df_book.bid_price2.values,
        df_book.ask_price2.values,
        df_book.bid_size1.values + df_book.bid_size2.values,
        df_book.ask_size1.values + df_book.ask_size2.values
    )
    # transform() always returns a result indexed like df_book, whereas
    # groupby.apply may prepend the group key depending on the pandas version.
    book_by_time = df_book.groupby(['time_id'])
    df_book['log_return1'] = book_by_time['wap1'].transform(calc_log_return).fillna(0)
    df_book['log_return2'] = book_by_time['wap2'].transform(calc_log_return).fillna(0)
    # add vols
    df_book['bid_vol1'] = prod_njit(df_book['bid_price1'].values, df_book['bid_size1'].values)
    df_book['bid_vol2'] = prod_njit(df_book['bid_price2'].values, df_book['bid_size2'].values)
    df_book['ask_vol1'] = prod_njit(df_book['ask_price1'].values, df_book['ask_size1'].values)
    df_book['ask_vol2'] = prod_njit(df_book['ask_price2'].values, df_book['ask_size2'].values)
    # generate book features
    df_feature = add_feature_min(df_feature, df_book, book_configs)
    df_feature = add_feature_time(df_feature, df_book, book_configs)

    # -----------------------------------------------------------------
    # Book data (ffill)
    df_book_ff = ffill_book(df_book_ff)
    # new features
    df_book_ff['price1_diff'] = df_book_ff['ask_price1'] - df_book_ff['bid_price1']
    df_book_ff['price2_diff'] = df_book_ff['ask_price2'] - df_book_ff['bid_price2']
    df_book_ff['price1_dabs'] = df_book_ff['price1_diff'].abs()
    df_book_ff['price2_dabs'] = df_book_ff['price2_diff'].abs()
    df_book_ff['price_spread1'] = (df_book_ff['ask_price1'] - df_book_ff['bid_price1']) / (df_book_ff['ask_price1'] + df_book_ff['bid_price1'])
    # generate book features
    df_feature = add_feature_min(df_feature, df_book_ff, book_configs_ffill)
    df_feature = add_feature_time(df_feature, df_book_ff, book_configs_ffill)

    # -----------------------------------------------------------------
    # Trade data
    trade_parquet_path = get_path_by_id('trade', stock_id)
    df_trade = pd.read_parquet(trade_parquet_path)
    df_trade = df_trade.astype({col: 'float64' for col in df_trade.columns[2:]})
    # add vol
    df_trade['vol'] = prod_njit(df_trade['price'].values, df_trade['size'].values)
    # generate trade features
    df_feature = add_feature_min(df_feature, df_trade, trade_configs)
    df_feature = add_feature_time(df_feature, df_trade, trade_configs)

    # -----------------------------------------------------------------
    # Combined feature
    # Z_RATIO = sum of squared log_return1 over the seconds that also carry a
    # trade, divided by the same sum over every book row of the time_id.
    def sum_of_squares(x):
        return np.sum(np.square(x))

    traded = df_trade.merge(df_book, on=['time_id', 'seconds_in_bucket'], how='left')
    traded_sq = traded.groupby('time_id')['log_return1'].agg(sum_of_squares)
    total_sq = df_book.groupby('time_id')['log_return1'].agg(sum_of_squares)
    ratio = traded_sq / total_sq
    # Look the ratio up by time_id rather than assigning ``.values``
    # positionally: the ratio is sorted by time_id and may have a different
    # length than df_feature (e.g. a time_id without trades).
    df_feature['Z_RATIO'] = df_feature['time_id'].map(ratio).fillna(0.0)
    return df_feature

In [None]:
# One feature frame per stock, built in parallel and stacked in (stock_id, time_id) order.
list_dfs = Parallel(n_jobs=-1)(delayed(gen_df_feature)(stock_id) for stock_id in tqdm(list_stock_id))
df_train = (
    pd.concat(list_dfs)
    .reset_index(drop=True)
    .sort_values(['stock_id', 'time_id'])
    .reset_index(drop=True)
)
df_train_nn = df_train.copy() # prepare for NN

# Cross-sectional features: mean over all stocks of the same time_id
# (the per-sub-window "min_" statistics are left out).
fea_cols = [c for c in df_train.columns if c.startswith(('B_', 'T_', 'Z_'))]
fea_cols_TA = [f for f in fea_cols if 'min_' not in f]
df_time_mean = (
    df_train.groupby('time_id')[fea_cols_TA]
    .mean()
    .add_suffix('_TA_mean')
    .reset_index()
)
df_train = df_train.merge(df_time_mean, on='time_id', how='left')

# Save data for LGB
# dt.Frame(df_train).to_csv('test_501_LGB.csv')

## Preprocessing - NN

In [None]:
def add_time_stats(df_train):
    """Attach per-stock statistics of every ``*_time`` column.

    For each column ending in ``'_time'`` the mean, std, skew, min, max and the
    three quartiles over all rows of the same ``stock_id`` are merged back as
    ``'<col>_<stat>'`` (stat in mean, std, skew, min, max, q1, q2, q3, in that
    order). NaNs in the result are replaced by 0.
    """
    time_cols = [col for col in df_train.columns if col.endswith('_time')]
    # The statistics are always taken from the incoming frame, not from the
    # progressively merged one.
    per_stock = df_train.groupby('stock_id')
    reducers = [
        ('mean', lambda grouped: grouped.mean()),
        ('std', lambda grouped: grouped.std()),
        ('skew', lambda grouped: grouped.skew()),
        ('min', lambda grouped: grouped.min()),
        ('max', lambda grouped: grouped.max()),
        ('q1', lambda grouped: grouped.quantile(0.25)),
        ('q2', lambda grouped: grouped.quantile(0.50)),
        ('q3', lambda grouped: grouped.quantile(0.75)),
    ]
    merged = df_train
    for stat, reduce in reducers:
        stats = reduce(per_stock[time_cols]).reset_index()
        stats.columns = ['stock_id'] + [f'{col}_{stat}' for col in time_cols]
        merged = merged.merge(stats, on=['stock_id'], how='left')
    return merged.fillna(0.0)

In [None]:
# Power transformation
# Work on the untouched copy kept for the NN models (no time_id means merged yet).
df_train = df_train_nn
# preprocessor_path already starts with '../input/'; prefixing it again only
# resolved by accident ('../input/../input/...').
pipe = load_pickle(f'{preprocessor_path}/pipe.pkl')
df_train[fea_cols] = pipe.transform(df_train[fea_cols].values)
# Bound the transformed features to [-10, 10].
df_train[fea_cols] = df_train[fea_cols].clip(-10, 10, axis=1)

# Add groupby time_id features
fea_cols_TA = [f for f in fea_cols if 'min_' not in f]
df_time_mean = df_train.groupby('time_id')[fea_cols_TA].mean()
df_time_mean.columns = [f'{c}_TA_mean' for c in df_time_mean.columns]
df_time_mean = df_time_mean.reset_index()
df_train = df_train.merge(df_time_mean, on='time_id', how='left')

# Add groupby stock_id features
df_train = add_time_stats(df_train)

# Save data for NN
# Every 501 prediction cell re-reads this file.
dt.Frame(df_train).to_csv('test_501_NN.csv')

In [None]:
# Drop the feature frames (they were written to test_501_NN.csv) and reclaim memory.
del df_time_mean, list_dfs, df_train, df_train_nn
# Binding the result to a name keeps the collected-object count out of the cell output.
x = gc.collect()

## Prediction - LGB

In [None]:
# df_test = dt.fread('test_501_LGB.csv').to_pandas()
# fea_cols = ['stock_id'] + [f for f in df_test.columns if f.startswith('B_') or f.startswith('T_') or f.startswith('Z_')]
# X_test = df_test[fea_cols].values
# for i in range(N_FOLD):
#     lgb_model_path = f'{model_path}/lgb_501_{i}.pkl'
#     lgb_model = load_pickle(lgb_model_path)
#     df_test[f'lgb_{i}'] = lgb_model.predict(X_test)
# df_test['pred_501-lgb'] = df_test[[f'lgb_{i}' for i in range(N_FOLD)]].mean(axis=1)
# df_result = df_result.merge(df_test[['stock_id', 'time_id', 'pred_501-lgb']], on=['stock_id', 'time_id'])
# df_result.head()

In [None]:
# del df_test, X_test
# x = gc.collect()

## Prediction - 501 - MLP

In [None]:
N_SEED = 5
N_FOLD = 10

model_folder = '../input/optiver-final-501-mlp'
df_test = dt.fread('test_501_NN.csv').to_pandas()
fea_cols = [f for f in df_test.columns if f.startswith(('B_', 'T_', 'Z_'))]
X_test = df_test[fea_cols].values

# One prediction column per (seed, fold) model.
pred_cols = []
for i_seed in range(N_SEED):
    for i_fold in range(N_FOLD):
        model_path = f'{model_folder}/model_{i_seed}_{i_fold}.hdf5'
        model = tf.keras.models.load_model(model_path, custom_objects={'mspe_loss': mspe_loss})
        pred_col = f'pred_{i_seed}_{i_fold}'
        df_test[pred_col] = model.predict(X_test, batch_size=1024)
        pred_cols.append(pred_col)

# Average in the transformed space, then map back to the target scale.
df_test['pred_501_mlp'] = inverse_target(df_test[pred_cols].mean(axis=1))
df_result = df_result.merge(df_test[['stock_id', 'time_id', 'pred_501_mlp']], on=['stock_id', 'time_id'])
df_result.head()

## Prediction - 501 - 1dCNN

In [None]:
N_SEED = 5
N_FOLD = 10

model_folder = '../input/optiver-final-501-1dcnn'
df_test = dt.fread('test_501_NN.csv').to_pandas()
fea_cols = [f for f in df_test.columns if f.startswith(('B_', 'T_', 'Z_'))]
X_test = df_test[fea_cols].values

# One prediction column per (seed, fold) model.
pred_cols = []
for i_seed in range(N_SEED):
    for i_fold in range(N_FOLD):
        model_path = f'{model_folder}/model_{i_seed}_{i_fold}.hdf5'
        model = tf.keras.models.load_model(model_path, custom_objects={'mspe_loss': mspe_loss})
        pred_col = f'pred_{i_seed}_{i_fold}'
        df_test[pred_col] = model.predict(X_test, batch_size=1024)
        pred_cols.append(pred_col)

# Average in the transformed space, then map back to the target scale.
df_test['pred_501_1dcnn'] = inverse_target(df_test[pred_cols].mean(axis=1))
df_result = df_result.merge(df_test[['stock_id', 'time_id', 'pred_501_1dcnn']], on=['stock_id', 'time_id'])
df_result.head()

## Prediction - 501 - TabNet

In [None]:
N_SEED = 3
N_FOLD = 5

model_folder = '../input/optiver-final-501-tabnet'
df_test = dt.fread('test_501_NN.csv').to_pandas()
fea_cols = [f for f in df_test.columns if f.startswith(('B_', 'T_', 'Z_'))]
X_test = df_test[fea_cols].values

# One prediction column per (seed, fold) model.
pred_cols = []
for i_seed in range(N_SEED):
    for i_fold in range(N_FOLD):
        model_path = f'{model_folder}/model_{i_seed}_{i_fold}.xyz'
        model = TabNetRegressor()
        model.load_model(model_path)
        pred_col = f'pred_{i_seed}_{i_fold}'
        df_test[pred_col] = model.predict(X_test)
        pred_cols.append(pred_col)

# Average in the transformed space, then map back to the target scale.
df_test['pred_501_tabnet'] = inverse_target(df_test[pred_cols].mean(axis=1))
df_result = df_result.merge(df_test[['stock_id', 'time_id', 'pred_501_tabnet']], on=['stock_id', 'time_id'])
df_result.head()

## Prediction - 501 - UNet

In [None]:
N_SEED = 3
N_FOLD = 5

model_folder = '../input/optiver-final-501-unet'
df_test = dt.fread('test_501_NN.csv').to_pandas()
fea_cols = [f for f in df_test.columns if f.startswith(('B_', 'T_', 'Z_'))]
X_test = df_test[fea_cols].values

# Unlike the MLP / 1dCNN / TabNet cells, each model's output is mapped back to
# the target scale first and the average is taken afterwards.
pred_cols = []
for i_seed in range(N_SEED):
    for i_fold in range(N_FOLD):
        model_path = f'{model_folder}/model_{i_seed}_{i_fold}.hdf5'
        model = tf.keras.models.load_model(model_path, custom_objects={'mspe_loss': mspe_loss})
        pred_col = f'pred_{i_seed}_{i_fold}'
        df_test[pred_col] = inverse_target(model.predict(X_test, batch_size=1024))
        pred_cols.append(pred_col)

df_test['pred_501_unet'] = df_test[pred_cols].mean(axis=1)
df_result = df_result.merge(df_test[['stock_id', 'time_id', 'pred_501_unet']], on=['stock_id', 'time_id'])
df_result.head()

## Prediction - 501 - WaveNet

In [None]:
N_SEED = 3
N_FOLD = 5

model_folder = '../input/optiver-final-501-wavenet'
df_test = dt.fread('test_501_NN.csv').to_pandas()
fea_cols = [f for f in df_test.columns if f.startswith(('B_', 'T_', 'Z_'))]
X_test = df_test[fea_cols].values

# Unlike the MLP / 1dCNN / TabNet cells, each model's output is mapped back to
# the target scale first and the average is taken afterwards.
pred_cols = []
for i_seed in range(N_SEED):
    for i_fold in range(N_FOLD):
        model_path = f'{model_folder}/model_{i_seed}_{i_fold}.hdf5'
        model = tf.keras.models.load_model(model_path, custom_objects={'mspe_loss': mspe_loss})
        pred_col = f'pred_{i_seed}_{i_fold}'
        df_test[pred_col] = inverse_target(model.predict(X_test, batch_size=1024))
        pred_cols.append(pred_col)

df_test['pred_501_wavenet'] = df_test[pred_cols].mean(axis=1)
df_result = df_result.merge(df_test[['stock_id', 'time_id', 'pred_501_wavenet']], on=['stock_id', 'time_id'])
df_result.head()

In [None]:
# The 501 predictions now live in df_result; release the feature matrix before the 601 stage.
del df_test, X_test
# Binding the result to a name keeps the collected-object count out of the cell output.
x = gc.collect()

# 601

## Feature Generation

In [None]:
# for pandas aggregation
@njit
def rv_fast(x):
    """Square root of the NaN-ignoring sum of squares of ``x``."""
    return np.sqrt(np.nansum(x**2))


def fast_rv(x):
    """Adapter for pandas ``agg``: hands the underlying array to ``rv_fast``."""
    return rv_fast(x.values)


# The name becomes the suffix of the aggregated column.
fast_rv.__name__ = 'realized_volatility'

@njit
def sum_fast(x):
    """Sum of the elements of ``x``."""
    return np.sum(x)


def fast_sum(x):
    """Adapter for pandas ``agg``: hands the underlying array to ``sum_fast``."""
    return sum_fast(x.values)


# The name becomes the suffix of the aggregated column.
fast_sum.__name__ = 'sum'

@njit
def mean_fast(x):
    """Arithmetic mean of ``x``."""
    return np.mean(x)


def fast_mean(x):
    """Adapter for pandas ``agg``: hands the underlying array to ``mean_fast``."""
    return mean_fast(x.values)


# The name becomes the suffix of the aggregated column.
fast_mean.__name__ = 'mean'

# @njit
# NOTE(review): left un-jitted, presumably because numba's np.std does not take ddof — confirm.
def std_fast(x):
    """Sample standard deviation (``ddof=1``) of ``x``."""
    return np.std(x, ddof=1)


def fast_std(x):
    """Adapter for pandas ``agg``: hands the underlying array to ``std_fast``."""
    return std_fast(x.values)


# The name becomes the suffix of the aggregated column.
fast_std.__name__ = 'std'

@njit
def min_fast(x):
    """Smallest element of ``x``."""
    return np.min(x)


def fast_min(x):
    """Adapter for pandas ``agg``: hands the underlying array to ``min_fast``."""
    return min_fast(x.values)


# The name becomes the suffix of the aggregated column.
fast_min.__name__ = 'min'

@njit
def max_fast(x):
    """Largest element of ``x``."""
    return np.max(x)


def fast_max(x):
    """Adapter for pandas ``agg``: hands the underlying array to ``max_fast``."""
    return max_fast(x.values)


# The name becomes the suffix of the aggregated column.
fast_max.__name__ = 'max'

@njit
def count_fast(x):
    """Number of distinct values in ``x``."""
    return len(np.unique(x))


def fast_count(x):
    """Adapter for pandas ``agg``: hands the underlying array to ``count_fast``."""
    return count_fast(x.values)


# The name becomes the suffix of the aggregated column.
fast_count.__name__ = 'count_unique'

In [None]:
# Root folder of the competition data; read_train_test joins it with '/train.csv' and '/test.csv'.
data_dir = '../input/optiver-realized-volatility-prediction/'

# Function to calculate first WAP
def calc_wap1(df):
    """Level-1 weighted average price: each side's price weighted by the opposite side's size."""
    cross_weighted = df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']
    depth = df['bid_size1'] + df['ask_size1']
    return cross_weighted / depth

# Function to calculate second WAP
def calc_wap2(df):
    """Level-2 weighted average price: each side's price weighted by the opposite side's size."""
    cross_weighted = df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']
    depth = df['bid_size2'] + df['ask_size2']
    return cross_weighted / depth

def calc_wap3(df):
    """Level-1 average price with each side's price weighted by its *own* size."""
    own_weighted = df['bid_price1'] * df['bid_size1'] + df['ask_price1'] * df['ask_size1']
    depth = df['bid_size1'] + df['ask_size1']
    return own_weighted / depth

def calc_wap4(df):
    """Level-2 average price with each side's price weighted by its *own* size."""
    own_weighted = df['bid_price2'] * df['bid_size2'] + df['ask_price2'] * df['ask_size2']
    depth = df['bid_size2'] + df['ask_size2']
    return own_weighted / depth

# Function to calculate the log of the return
# log(x / y) = log(x) - log(y), so the log return is the difference of consecutive logs
def log_return(series):
    """Return the first difference of the natural log of ``series`` (first element is NaN)."""
    logged = np.log(series)
    return logged.diff()

# Function to read our base train and test set
def read_train_test():
    """Load train.csv and test.csv from ``data_dir`` and key both with ``row_id``.

    ``row_id`` is ``'<stock_id>-<time_id>'``, the key used to merge with the
    book and trade features. Returns ``(train, test)``.
    """
    train = pd.read_csv(f'{data_dir}/train.csv')
    test = pd.read_csv(f'{data_dir}/test.csv')
    for frame in (train, test):
        frame['row_id'] = frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str)
    print(f'Our training set has {train.shape[0]} rows')
    return train, test

# Function to preprocess book data (for each stock id)
def book_preprocessor(file_path):
    """Build the 601 book features (one row per time_id) from one book parquet file.

    ``file_path`` must contain ``'stock_id=<id>/'``; the id is parsed from it to
    build ``row_id`` (``'<stock_id>-<time_id>'``). Aggregates are computed over
    the whole bucket and, for the log-return volatilities, over the rows with
    ``seconds_in_bucket`` >= 500, 400, 300, 200 and 100 (columns suffixed with
    the threshold).
    """
    df = pd.read_parquet(file_path)
    # Cast every value column (all but time_id / seconds_in_bucket) to float64.
    # astype creates new columns, so the cast takes effect no matter whether the
    # installed pandas writes ``df.iloc[:, 2:] = ...`` in place or not.
    df = df.astype({col: 'float64' for col in df.columns[2:]})
    # Calculate Wap
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    df['wap3'] = calc_wap3(df)
    df['wap4'] = calc_wap4(df)
    # Calculate log returns
    # transform() always returns a result indexed like df, whereas
    # groupby.apply may prepend the group key depending on the pandas version.
    by_time = df.groupby(['time_id'])
    df['log_return1'] = by_time['wap1'].transform(log_return)
    df['log_return2'] = by_time['wap2'].transform(log_return)
    df['log_return3'] = by_time['wap3'].transform(log_return)
    df['log_return4'] = by_time['wap4'].transform(log_return)
    # Calculate wap balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    # Calculate spread
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df["bid_ask_spread"] = abs(df['bid_spread'] - df['ask_spread'])
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Dict for aggregations (whole bucket)
    create_feature_dict = {
        'wap1': [fast_sum, fast_std],
        'wap2': [fast_sum, fast_std],
        'wap3': [fast_sum, fast_std],
        'wap4': [fast_sum, fast_std],
        'log_return1': [fast_rv],
        'log_return2': [fast_rv],
        'log_return3': [fast_rv],
        'log_return4': [fast_rv],
        'wap_balance': [fast_sum, fast_max],
        'price_spread':[fast_sum, fast_max],
        'price_spread2':[fast_sum, fast_max],
        'bid_spread':[fast_sum, fast_max],
        'ask_spread':[fast_sum, fast_max],
        'total_volume':[fast_sum, fast_max],
        'volume_imbalance':[fast_sum, fast_max],
        "bid_ask_spread":[fast_sum, fast_max],
    }
    # Dict for aggregations (tail windows)
    create_feature_dict_time = {
        'log_return1': [fast_rv],
        'log_return2': [fast_rv],
        'log_return3': [fast_rv],
        'log_return4': [fast_rv],
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(fe_dict,seconds_in_bucket, add_suffix = False):
        # Group by the window
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(fe_dict).reset_index()
        # Rename columns joining suffix ('time_id' becomes 'time_id_')
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Get the stats for different windows
    df_feature = get_stats_window(create_feature_dict,seconds_in_bucket = 0, add_suffix = False)
    df_feature_500 = get_stats_window(create_feature_dict_time,seconds_in_bucket = 500, add_suffix = True)
    df_feature_400 = get_stats_window(create_feature_dict_time,seconds_in_bucket = 400, add_suffix = True)
    df_feature_300 = get_stats_window(create_feature_dict_time,seconds_in_bucket = 300, add_suffix = True)
    df_feature_200 = get_stats_window(create_feature_dict_time,seconds_in_bucket = 200, add_suffix = True)
    df_feature_100 = get_stats_window(create_feature_dict_time,seconds_in_bucket = 100, add_suffix = True)

    # Merge all
    df_feature = df_feature.merge(df_feature_500, how = 'left', left_on = 'time_id_', right_on = 'time_id__500')
    df_feature = df_feature.merge(df_feature_400, how = 'left', left_on = 'time_id_', right_on = 'time_id__400')
    df_feature = df_feature.merge(df_feature_300, how = 'left', left_on = 'time_id_', right_on = 'time_id__300')
    df_feature = df_feature.merge(df_feature_200, how = 'left', left_on = 'time_id_', right_on = 'time_id__200')
    df_feature = df_feature.merge(df_feature_100, how = 'left', left_on = 'time_id_', right_on = 'time_id__100')
    # Drop unnecessary time_ids
    df_feature = df_feature.drop(['time_id__500','time_id__400', 'time_id__300', 'time_id__200','time_id__100'], axis = 1)

    # Create row_id so we can merge
    stock_id = file_path.split('=')[1].split('/')[0]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.drop(['time_id_'], axis = 1)
    return df_feature

# Function to preprocess trade data (for each stock id)
def trade_preprocessor(file_path):
    """Build the 601 trade features (one row per time_id) from one trade parquet file.

    ``file_path`` must contain ``'stock_id=<id>/'``; the id is parsed from it to
    build ``row_id`` (``'<stock_id>-<time_id>'``). Every feature column is
    prefixed with ``'trade_'``. Besides the aggregates over the whole bucket and
    over the rows with ``seconds_in_bucket`` >= 500, 400, 300, 200 and 100, a
    set of per-time_id price / size shape statistics is added.
    """
    df = pd.read_parquet(file_path)
    # Cast every value column (all but time_id / seconds_in_bucket) to float64.
    # astype creates new columns, so the cast takes effect no matter whether the
    # installed pandas writes ``df.iloc[:, 2:] = ...`` in place or not.
    df = df.astype({col: 'float64' for col in df.columns[2:]})
    # transform() always returns a result indexed like df, whereas
    # groupby.apply may prepend the group key depending on the pandas version.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)
    df['amount']=df['price']*df['size']
    # Dict for aggregations (whole bucket)
    create_feature_dict = {
        'log_return':[fast_rv],
        'seconds_in_bucket':[fast_count],
        'size':[fast_sum, fast_max, fast_min],
        'order_count':[fast_sum,fast_max],
        'amount':[fast_sum,fast_max,fast_min],
    }
    # Dict for aggregations (tail windows)
    create_feature_dict_time = {
        'log_return':[fast_rv],
        'seconds_in_bucket':[fast_count],
        'size':[fast_sum],
        'order_count':[fast_sum],
    }
    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(fe_dict,seconds_in_bucket, add_suffix = False):
        # Group by the window
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(fe_dict).reset_index()
        # Rename columns joining suffix ('time_id' becomes 'time_id_')
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Get the stats for different windows
    df_feature = get_stats_window(create_feature_dict,seconds_in_bucket = 0, add_suffix = False)
    df_feature_500 = get_stats_window(create_feature_dict_time,seconds_in_bucket = 500, add_suffix = True)
    df_feature_400 = get_stats_window(create_feature_dict_time,seconds_in_bucket = 400, add_suffix = True)
    df_feature_300 = get_stats_window(create_feature_dict_time,seconds_in_bucket = 300, add_suffix = True)
    df_feature_200 = get_stats_window(create_feature_dict_time,seconds_in_bucket = 200, add_suffix = True)
    df_feature_100 = get_stats_window(create_feature_dict_time,seconds_in_bucket = 100, add_suffix = True)

    def tendency(price, vol):
        # Size-weighted sum of the percentage price changes between consecutive trades
        df_diff = np.diff(price)
        val = (df_diff/price[1:])*100
        power = np.sum(val*vol[1:])
        return(power)

    lis = []
    # A single groupby pass replaces re-filtering the whole frame for every
    # time_id; sort=False keeps the order of first appearance, as unique() did.
    for n_time_id, df_id in df.groupby('time_id', sort=False):
        price = df_id['price'].values
        size = df_id['size'].values
        price_mean = np.mean(price)
        price_steps = np.diff(price)

        tendencyV = tendency(price, size)
        # number of trades above / below the mean price
        f_max = np.sum(price > price_mean)
        f_min = np.sum(price < price_mean)
        # number of up-ticks / down-ticks
        df_max =  np.sum(price_steps > 0)
        df_min =  np.sum(price_steps < 0)
        # new
        abs_diff = np.median(np.abs(price - price_mean))
        energy = np.mean(price**2)
        iqr_p = np.percentile(price,75) - np.percentile(price,25)

        # vol vars
        abs_diff_v = np.median(np.abs(size - np.mean(size)))
        energy_v = np.sum(size**2)
        iqr_p_v = np.percentile(size,75) - np.percentile(size,25)

        lis.append({'time_id':n_time_id,'tendency':tendencyV,'f_max':f_max,'f_min':f_min,'df_max':df_max,'df_min':df_min,
                   'abs_diff':abs_diff,'energy':energy,'iqr_p':iqr_p,'abs_diff_v':abs_diff_v,'energy_v':energy_v,'iqr_p_v':iqr_p_v})

    df_lr = pd.DataFrame(lis)

    df_feature = df_feature.merge(df_lr, how = 'left', left_on = 'time_id_', right_on = 'time_id')

    # Merge all
    df_feature = df_feature.merge(df_feature_500, how = 'left', left_on = 'time_id_', right_on = 'time_id__500')
    df_feature = df_feature.merge(df_feature_400, how = 'left', left_on = 'time_id_', right_on = 'time_id__400')
    df_feature = df_feature.merge(df_feature_300, how = 'left', left_on = 'time_id_', right_on = 'time_id__300')
    df_feature = df_feature.merge(df_feature_200, how = 'left', left_on = 'time_id_', right_on = 'time_id__200')
    df_feature = df_feature.merge(df_feature_100, how = 'left', left_on = 'time_id_', right_on = 'time_id__100')
    # Drop unnecessary time_ids
    df_feature = df_feature.drop(['time_id__500','time_id__400', 'time_id__300', 'time_id__200','time_id','time_id__100'], axis = 1)

    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1].split('/')[0]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature = df_feature.drop(['trade_time_id_'], axis = 1)
    return df_feature

# Function to get group stats for the stock_id and time_id
def get_time_stock(df):
    """Attach per-stock and per-time_id summaries of the volatility columns.

    For every realized-volatility column the mean, std, max and min are
    computed once per ``stock_id`` (suffix ``_stock``) and once per
    ``time_id`` (suffix ``_time``) and left-joined back onto ``df``.

    Args:
        df: Frame holding ``stock_id``, ``time_id`` and the volatility columns
            listed in ``vol_cols``.

    Returns:
        A new frame: the input columns followed by the stock-level and then
        the time-level aggregate columns.
    """
    vol_cols = [
        'log_return1_realized_volatility', 'log_return2_realized_volatility',
        'log_return1_realized_volatility_400', 'log_return2_realized_volatility_400',
        'log_return1_realized_volatility_300', 'log_return2_realized_volatility_300',
        'log_return1_realized_volatility_200', 'log_return2_realized_volatility_200',
        'trade_log_return_realized_volatility', 'trade_log_return_realized_volatility_400',
        'trade_log_return_realized_volatility_300', 'trade_log_return_realized_volatility_200',
    ]

    def _grouped_stats(key, tag):
        # Aggregate, flatten the (column, stat) MultiIndex, then tag every column
        # (the group key itself becomes '<key>__<tag>').
        stats = df.groupby([key])[vol_cols].agg(['mean', 'std', 'max', 'min', ]).reset_index()
        stats.columns = ['_'.join(col) for col in stats.columns]
        return stats.add_suffix('_' + tag)

    df_stock_id = _grouped_stats('stock_id', 'stock')
    df_time_id = _grouped_stats('time_id', 'time')

    # Join stock-level stats first, then time-level stats, and drop the duplicated keys
    merged = df.merge(df_stock_id, how = 'left', left_on = ['stock_id'], right_on = ['stock_id__stock'])
    merged = merged.merge(df_time_id, how = 'left', left_on = ['time_id'], right_on = ['time_id__time'])
    return merged.drop(['stock_id__stock', 'time_id__time'], axis = 1)
    

# Function that runs the per-stock preprocessing in parallel (one task per stock id)
def preprocessor(list_stock_ids, is_train = True):
    """Build book + trade features for every stock id, one parallel task per stock.

    Args:
        list_stock_ids: Iterable of stock ids to process.
        is_train: Read the ``*_train.parquet`` partitions when True, the
            ``*_test.parquet`` partitions otherwise.

    Returns:
        One frame with the per-stock results concatenated (index reset).
    """

    def for_joblib(stock_id):
        # Each stock partition is a folder; the first file found in it is used.
        if not is_train:
            file_path_book = glob.glob(f'../input/optiver-realized-volatility-prediction/book_test.parquet/stock_id={stock_id}/*')[0]
            file_path_trade = glob.glob(f'../input/optiver-realized-volatility-prediction/trade_test.parquet/stock_id={stock_id}/*')[0]
        else:
            file_path_book = glob.glob(f'../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id={stock_id}/*')[0]
            file_path_trade = glob.glob(f'../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id={stock_id}/*')[0]

        # Book features are the left side: rows without trades keep NaN trade features
        book_features = book_preprocessor(file_path_book)
        trade_features = trade_preprocessor(file_path_trade)
        return pd.merge(book_features, trade_features, on = 'row_id', how = 'left')

    # Fan out over all cores; tqdm reports how many tasks have been dispatched
    tasks = (delayed(for_joblib)(stock_id) for stock_id in tqdm(list_stock_ids))
    per_stock_frames = Parallel(n_jobs = -1, verbose = 1)(tasks)
    return pd.concat(per_stock_frames, ignore_index = True)

In [None]:
# Read the base train and test tables
train, test = read_train_test()

# Train-side preprocessing is commented out in this cell (kept for reference):
# train_stock_ids = train['stock_id'].unique()
# Preprocess them using Parallel and our single stock id functions
# train_ = preprocessor(train_stock_ids, is_train = True)
# train = train.merge(train_, on = ['row_id'], how = 'left')

# Get unique stock ids present in test
test_stock_ids = test['stock_id'].unique()
# Build per-stock book/trade features in parallel and left-join them onto test by row_id
test_ = preprocessor(test_stock_ids, is_train = False)
test = test.merge(test_, on = ['row_id'], how = 'left')

# Add per-stock_id and per-time_id aggregates of the volatility columns (test only)
# train = get_time_stock(train)
test = get_time_stock(test)

In [None]:
# replace by order sum (tau)
# size_tau<suffix> = sqrt(1 / number of distinct seconds that had a trade), for the
# full bucket ('') and for each window suffix. Only `test` is computed here; the
# equivalent train assignments were commented out in the original cell.
for window_suffix in ['', '_400', '_300', '_200']:
    test['size_tau' + window_suffix] = np.sqrt(1 / test['trade_seconds_in_bucket_count_unique' + window_suffix])

In [None]:
# size_tau2<suffix> = sqrt(scale / total order count of the whole bucket).
# Unlike size_tau, the windowed variants reuse the full-bucket order count and only
# change the constant. NOTE(review): 0.33 / 0.5 / 0.66 presumably approximate the
# share of the 600 s bucket tied to each window -- confirm against training code.
# Only `test` is computed here; the train assignments were commented out originally.
tau2_scales = {'': 1, '_400': 0.33, '_300': 0.5, '_200': 0.66}
for window_suffix, scale in tau2_scales.items():
    test['size_tau2' + window_suffix] = np.sqrt(scale / test['trade_order_count_sum'])

# delta tau
test['size_tau2_d'] = test['size_tau2_400'] - test['size_tau2']

In [None]:
# Replace `train` with the table stored in public_train_601.csv
# (read with datatable, then converted to a pandas DataFrame)
train = dt.fread('../input/optiver-501-train/public_train_601.csv').to_pandas()
train.head()

In [None]:
# Preview the first rows of the test feature frame built above
test.head()

In [None]:
# Feature columns: every train column except the identifiers and the target
non_feature_cols = {"stock_id", "time_id", "target", "row_id"}
colNames = [name for name in train.columns if name not in non_feature_cols]
len(colNames)

## KMeans cluster features for LGB

In [None]:
from sklearn.cluster import KMeans
# making agg features

# Cluster stocks by how their targets co-move: pivot targets to a time_id x stock_id
# matrix, correlate the stock columns, and run KMeans on the correlation rows.
N_CLUSTERS = 7
train_p = pd.read_csv(f'{data_dir}/train.csv')
train_p = train_p.pivot(index='time_id', columns='stock_id', values='target')

corr = train_p.corr()

ids = corr.index

kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0).fit(corr.values)
print(kmeans.labels_)

# Stock ids in each cluster, in cluster-label order. Boolean-mask indexing replaces
# the former (id + 1) * mask / filter > 0 / subtract 1 trick, which existed only so
# that stock id 0 survived the filter.
l = [ids[kmeans.labels_ == label].tolist() for label in range(N_CLUSTERS)]


mat = []
matTest = []

for n, ind in enumerate(l):
    print(ind)
    # Mean of every numeric feature over the cluster's stocks, per time_id
    newDf = train.loc[train['stock_id'].isin(ind) ]
    newDf = newDf.groupby(['time_id']).agg(np.nanmean)
    # Replace the (averaged, numeric) stock_id column with the cluster tag; assigning
    # the whole column avoids writing a string into a float column through .loc
    newDf['stock_id'] = str(n)+'c1'
    mat.append ( newDf )

    newDf = test.loc[test['stock_id'].isin(ind) ]
    newDf = newDf.groupby(['time_id']).agg(np.nanmean)
    newDf['stock_id'] = str(n)+'c1'
    matTest.append ( newDf )

# Long format: one row per (time_id, cluster)
mat1 = pd.concat(mat).reset_index()
mat1.drop(columns=['target'],inplace=True)

mat2 = pd.concat(matTest).reset_index()

In [None]:
def _pivot_by_cluster(long_df):
    # Long (time_id, cluster) rows -> one row per time_id with '<feature>_<cluster>' columns
    wide = long_df.pivot(index='time_id', columns='stock_id')
    wide.columns = ["_".join(pair) for pair in wide.columns.ravel()]
    return wide.reset_index()


# Append the train rows of time_id 5 to the test aggregates before pivoting.
# NOTE(review): presumably so that every cluster column exists even when the test
# set does not cover all clusters -- confirm.
mat2 = pd.concat([mat2,mat1.loc[mat1.time_id==5]])
mat1 = _pivot_by_cluster(mat1)
mat2 = _pivot_by_cluster(mat2)

In [None]:
# Cluster-level columns to join back, named '<feature>_<cluster>c1'.
# The original hand-written list covered clusters 0, 1, 3, 4 and 6 only (2 and 5
# were omitted); that selection and the column order are preserved here.
agg_feature_names = [
    'log_return1_realized_volatility',
    'total_volume_sum',
    'trade_size_sum',
    'trade_order_count_sum',
    'price_spread_sum',
    'bid_spread_sum',
    'ask_spread_sum',
    'volume_imbalance_sum',
    'bid_ask_spread_sum',
    'size_tau2',
]
kept_clusters = [0, 1, 3, 4, 6]
nnn = ['time_id'] + [f'{feature}_{cluster}c1' for feature in agg_feature_names for cluster in kept_clusters]

# Every row of a given time_id receives the same cluster-level values
train = pd.merge(train,mat1[nnn],how='left',on='time_id')
test = pd.merge(test,mat2[nnn],how='left',on='time_id')

In [None]:
# Persist the LGB feature frame; DEBUG runs export `train` under the same file name
frame_to_export = train if DEBUG else test
dt.Frame(frame_to_export).to_csv(f'test_601_LGB.csv')

# KMeans + quantile transformation for NN

In [None]:
# Turn infinities into NaN so they are handled as missing values from here on
train.replace([np.inf, -np.inf], np.nan,inplace=True)
test.replace([np.inf, -np.inf], np.nan,inplace=True)
# Pre-fitted transformers, looked up by the feature's position in colNames.
# NOTE(review): presumably one fitted sklearn QuantileTransformer per feature,
# saved at training time in the same column order -- confirm.
qt_train = load_pickle(f'{preprocessor_path}/qt_train.pkl')
train_nn=train[colNames].copy()
test_nn=test[colNames].copy()
for i, col in enumerate(colNames):
    #print(col)
    qt = qt_train[i]
    # The stored transformer is applied (not re-fitted) to both train and test
    train_nn[col] = qt.transform(train_nn[[col]])
    test_nn[col] = qt.transform(test_nn[[col]])

In [None]:
# Copy the identifier columns (and the target, for train) back from the
# untransformed frames; they were excluded from colNames above
train_nn[['stock_id','time_id','target']]=train[['stock_id','time_id','target']]
test_nn[['stock_id','time_id']]=test[['stock_id','time_id']]

In [None]:
# KMeans: making agg features on the quantile-transformed frames
from sklearn.cluster import KMeans

# Cluster stocks by how their targets co-move: pivot targets to a time_id x stock_id
# matrix, correlate the stock columns, and run KMeans on the correlation rows.
N_CLUSTERS = 7
train_p = pd.read_csv(f'{data_dir}/train.csv')
train_p = train_p.pivot(index='time_id', columns='stock_id', values='target')

corr = train_p.corr()

ids = corr.index

kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0).fit(corr.values)
print(kmeans.labels_)

# Stock ids in each cluster, in cluster-label order. Boolean-mask indexing replaces
# the former (id + 1) * mask / filter > 0 / subtract 1 trick, which existed only so
# that stock id 0 survived the filter.
l = [ids[kmeans.labels_ == label].tolist() for label in range(N_CLUSTERS)]


mat = []
matTest = []

for n, ind in enumerate(l):
    print(ind)
    # Mean of every numeric feature over the cluster's stocks, per time_id
    newDf = train_nn.loc[train_nn['stock_id'].isin(ind) ]
    newDf = newDf.groupby(['time_id']).agg(np.nanmean)
    # Replace the (averaged, numeric) stock_id column with the cluster tag; assigning
    # the whole column avoids writing a string into a float column through .loc
    newDf['stock_id'] = str(n)+'c1'
    mat.append ( newDf )

    newDf = test_nn.loc[test_nn['stock_id'].isin(ind) ]
    newDf = newDf.groupby(['time_id']).agg(np.nanmean)
    newDf['stock_id'] = str(n)+'c1'
    matTest.append ( newDf )

# Long format: one row per (time_id, cluster)
mat1 = pd.concat(mat).reset_index()
mat1.drop(columns=['target'],inplace=True)

mat2 = pd.concat(matTest).reset_index()
# Append the train rows of time_id 5 to the test aggregates.
# NOTE(review): presumably so that every cluster column exists after the later
# pivot even when the test set does not cover all clusters -- confirm.
mat2 = pd.concat([mat2,mat1.loc[mat1.time_id==5]])

In [None]:
# Cluster-level columns to join back, named '<feature>_<cluster>c1'.
# The original hand-written list covered clusters 0, 1, 3, 4 and 6 only (2 and 5
# were omitted); that selection and the column order are preserved here.
agg_feature_names = [
    'log_return1_realized_volatility',
    'total_volume_sum',
    'trade_size_sum',
    'trade_order_count_sum',
    'price_spread_sum',
    'bid_spread_sum',
    'ask_spread_sum',
    'volume_imbalance_sum',
    'bid_ask_spread_sum',
    'size_tau2',
]
kept_clusters = [0, 1, 3, 4, 6]
nnn = ['time_id'] + [f'{feature}_{cluster}c1' for feature in agg_feature_names for cluster in kept_clusters]

In [None]:
def _pivot_by_cluster(long_df):
    # Long (time_id, cluster) rows -> one row per time_id with '<feature>_<cluster>' columns
    wide = long_df.pivot(index='time_id', columns='stock_id')
    wide.columns = ["_".join(pair) for pair in wide.columns.ravel()]
    return wide.reset_index()


mat1 = _pivot_by_cluster(mat1)
mat2 = _pivot_by_cluster(mat2)

# Join the selected cluster-level columns back; every row of a time_id gets the same values
train_nn = pd.merge(train_nn,mat1[nnn],how='left',on='time_id')
test_nn = pd.merge(test_nn,mat2[nnn],how='left',on='time_id')

In [None]:
# fill mean: impute missing values with the training-set column means
features_to_consider = list(train_nn)

features_to_consider.remove('time_id')
features_to_consider.remove('target')
# 'pred_NN' is optional; check for it explicitly instead of hiding every
# possible error behind a bare `except`
if 'pred_NN' in features_to_consider:
    features_to_consider.remove('pred_NN')

# Train is filled with its own means; test is filled with the train means as well
train_nn[features_to_consider] = train_nn[features_to_consider].fillna(train_nn[features_to_consider].mean())
test_nn[features_to_consider] = test_nn[features_to_consider].fillna(train_nn[features_to_consider].mean())

# Persist the NN feature frame; DEBUG runs export `train_nn` under the same file name
if DEBUG:
    dt.Frame(train_nn).to_csv(f'test_601_NN.csv')    
else:
    dt.Frame(test_nn).to_csv(f'test_601_NN.csv')

In [None]:
# Drop the large feature frames (they were written to CSV above) and force a
# garbage-collection pass; the result is bound to `x` so the cell shows no output
del mat1, mat2, train, test, train_nn, test_nn
# del train,test
x = gc.collect()

## Prediction - 601 - LGB

In [None]:
N_SEED = 5
N_FOLD = 10
model_folder = '../input/optiver-final-601-lgb'

df_test = dt.fread('test_601_LGB.csv').to_pandas()
# stock_id goes first, followed by every remaining non-identifier column
non_feature_cols = ['time_id', 'target', 'stock_id', 'row_id']
fea_cols = ['stock_id'] + [f for f in df_test if f not in non_feature_cols]
X_test = df_test[fea_cols].values

# One prediction column per (seed, fold) model
pred_cols = []
for i_seed in range(N_SEED):
    for i_fold in range(N_FOLD):
        model_path = f'{model_folder}/LGB_{i_seed}_{i_fold}.pkl'
        model = load_pickle(model_path)
        pred_col = f'pred_{i_seed}_{i_fold}'
        df_test[pred_col] = model.predict(X_test)
        pred_cols.append(pred_col)

# Ensemble = plain average over all seeds and folds
df_test['pred_601_lgb'] = df_test[pred_cols].mean(axis=1)
merge_keys = ['stock_id', 'time_id']
# Note: re-running this cell merges the prediction column into df_result again
df_result = df_result.merge(df_test[merge_keys + ['pred_601_lgb']], on=merge_keys)
df_result.head()

## Prediction - 601 - Catboost

In [None]:
N_SEED = 5
N_FOLD = 10
model_folder = '../input/optiver-final-601-cat'

df_test = dt.fread('test_601_LGB.csv').to_pandas()
# stock_id goes first, followed by every remaining non-identifier column
non_feature_cols = ['time_id', 'target', 'stock_id', 'row_id']
fea_cols = ['stock_id'] + [f for f in df_test if f not in non_feature_cols]
# X_test stays a DataFrame view of the features; new prediction columns are
# added to df_test only
X_test = df_test[fea_cols]

# One prediction column per (seed, fold) model
pred_cols = []
for i_seed in range(N_SEED):
    for i_fold in range(N_FOLD):
        model_path = f'{model_folder}/CAT_{i_seed}_{i_fold}.pkl'
        model = load_pickle(model_path)
        pred_col = f'pred_{i_seed}_{i_fold}'
        df_test[pred_col] = model.predict(X_test)
        pred_cols.append(pred_col)

# Ensemble = plain average over all seeds and folds
df_test['pred_601_cat'] = df_test[pred_cols].mean(axis=1)
merge_keys = ['stock_id', 'time_id']
# Note: re-running this cell merges the prediction column into df_result again
df_result = df_result.merge(df_test[merge_keys + ['pred_601_cat']], on=merge_keys)
df_result.head()

## Prediction - 601 - DeepForest (CFR)

In [None]:
N_FOLD = 5
model_folder = '../input/optiver-601-cfr'

df_test = dt.fread('test_601_LGB.csv').to_pandas()
# stock_id goes first, followed by every remaining non-identifier column
non_feature_cols = ['time_id', 'target', 'stock_id', 'row_id']
fea_cols = ['stock_id'] + [f for f in df_test if f not in non_feature_cols]
X_test = df_test[fea_cols].values

# One prediction column per fold model
pred_cols = []
for i_fold in tqdm(range(N_FOLD)):
    model_path = f'{model_folder}/cfr_601_{i_fold}.pkl'
    model = load_pickle(model_path)
    pred_col = f'pred_{i_fold}'
    df_test[pred_col] = model.predict(X_test)
    pred_cols.append(pred_col)

# Ensemble = plain average over the folds
df_test['pred_601_cfr'] = df_test[pred_cols].mean(axis=1)
merge_keys = ['stock_id', 'time_id']
# Note: re-running this cell merges the prediction column into df_result again
df_result = df_result.merge(df_test[merge_keys + ['pred_601_cfr']], on=merge_keys)
df_result.head()

## Prediction - 601 - MLP

In [None]:
N_SEED = 5
N_FOLD = 10
model_folder = '../input/optiver-final-601-mlp'

df_test = dt.fread('test_601_NN.csv').to_pandas()
non_feature_cols = ['time_id', 'target', 'pred_NN', 'stock_id', 'row_id']
fea_cols = [f for f in df_test if f not in non_feature_cols]
# The network takes the stock id and the scaled feature matrix as two separate inputs
S_test = df_test['stock_id'].values
X_test = df_test[fea_cols].values

# One prediction column per (seed, fold) model, each with its own fitted scaler
pred_cols = []
for i_seed in range(N_SEED):
    for i_fold in range(N_FOLD):
        scaler = load_pickle(f'{model_folder}/minmax_scaler_{i_seed}_{i_fold}.pkl')
        X_test_scaled = scaler.transform(X_test)
        model_path = f'{model_folder}/model_{i_seed}_{i_fold}.hdf5'
        model = tf.keras.models.load_model(model_path, custom_objects={'mspe_loss': mspe_loss})
        pred_col = f'pred_{i_seed}_{i_fold}'
        df_test[pred_col] = model.predict([S_test, X_test_scaled], batch_size=1024)
        pred_cols.append(pred_col)

# Average in the model's output space first, then map back with inverse_target
df_test['pred_601_mlp'] = df_test[pred_cols].mean(axis=1)
df_test['pred_601_mlp'] = inverse_target(df_test['pred_601_mlp'])
merge_keys = ['stock_id', 'time_id']
# Note: re-running this cell merges the prediction column into df_result again
df_result = df_result.merge(df_test[merge_keys + ['pred_601_mlp']], on=merge_keys)
df_result.head()

## Prediction - 601 - 1dCNN

In [None]:
N_SEED = 5
N_FOLD = 10
model_folder = '../input/optiver-final-601-1dcnn'

df_test = dt.fread('test_601_NN.csv').to_pandas()
non_feature_cols = ['time_id', 'target', 'pred_NN', 'stock_id', 'row_id']
fea_cols = [f for f in df_test if f not in non_feature_cols]
# The network takes the stock id and the scaled feature matrix as two separate inputs
S_test = df_test['stock_id'].values
X_test = df_test[fea_cols].values

# One prediction column per (seed, fold) model, each with its own fitted scaler
pred_cols = []
for i_seed in range(N_SEED):
    for i_fold in range(N_FOLD):
        scaler = load_pickle(f'{model_folder}/minmax_scaler_{i_seed}_{i_fold}.pkl')
        X_test_scaled = scaler.transform(X_test)
        model_path = f'{model_folder}/model_{i_seed}_{i_fold}.hdf5'
        model = tf.keras.models.load_model(model_path, custom_objects={'mspe_loss': mspe_loss})
        pred_col = f'pred_{i_seed}_{i_fold}'
        df_test[pred_col] = model.predict([S_test, X_test_scaled], batch_size=1024)
        pred_cols.append(pred_col)

# Average in the model's output space first, then map back with inverse_target
df_test['pred_601_1dcnn'] = df_test[pred_cols].mean(axis=1)
df_test['pred_601_1dcnn'] = inverse_target(df_test['pred_601_1dcnn'])
merge_keys = ['stock_id', 'time_id']
# Note: re-running this cell merges the prediction column into df_result again
df_result = df_result.merge(df_test[merge_keys + ['pred_601_1dcnn']], on=merge_keys)
df_result.head()

## Prediction - 601 - TabNet

In [None]:
N_SEED = 3
N_FOLD = 5
model_folder = '../input/optiver-final-601-tabnet'

df_test = dt.fread('test_601_NN.csv').to_pandas()
df_test['stock_id'] = df_test['stock_id'].astype('int64')
non_feature_cols = ['time_id', 'target', 'pred_NN', 'stock_id', 'row_id']
fea_cols = [f for f in df_test if f not in non_feature_cols]
S_test = df_test['stock_id'].values
# Column 0 is the raw stock_id; the remaining columns are the features
X_test = df_test[['stock_id']+fea_cols].values

# One prediction column per (seed, fold) model, each with its own fitted scaler
pred_cols = []
for i_seed in range(N_SEED):
    for i_fold in range(N_FOLD):
        scaler = load_pickle(f'{model_folder}/minmax_scaler_{i_seed}_{i_fold}.pkl')
        # Scale everything except the stock_id column, which is passed through untouched
        X_test_scaled = X_test.copy()
        X_test_scaled[:, 1:] = scaler.transform(X_test[:, 1:])
        model_path = f'{model_folder}/model_{i_seed}_{i_fold}.xyz'
        model = TabNetRegressor()
        model.load_model(model_path)
        pred_col = f'pred_{i_seed}_{i_fold}'
        df_test[pred_col] = model.predict(X_test_scaled)
        pred_cols.append(pred_col)

# Average in the model's output space first, then map back with inverse_target
df_test['pred_601_tabnet'] = df_test[pred_cols].mean(axis=1)
df_test['pred_601_tabnet'] = inverse_target(df_test['pred_601_tabnet'])
merge_keys = ['stock_id', 'time_id']
# Note: re-running this cell merges the prediction column into df_result again
df_result = df_result.merge(df_test[merge_keys + ['pred_601_tabnet']], on=merge_keys)
df_result.head()

In [None]:
# Release the last prediction frame and feature matrix, then force a
# garbage-collection pass (result bound to `x` so the cell shows no output)
del df_test, X_test
x = gc.collect()

# Public - KFold

In [None]:
def read_train_test():
    """Load the competition train/test CSVs and add the ``row_id`` merge key.

    ``row_id`` is ``"<stock_id>-<time_id>"``, the key used to join the book and
    trade features. Row counts and missing-value counts are printed.

    Returns:
        Tuple ``(train, test)`` of DataFrames.
    """

    def _load_with_row_id(csv_path):
        frame = pd.read_csv(csv_path)
        # Create a key to merge with book and trade data
        frame['row_id'] = frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str)
        return frame

    train = _load_with_row_id('../input/optiver-realized-volatility-prediction/train.csv')
    test = _load_with_row_id('../input/optiver-realized-volatility-prediction/test.csv')

    # Quick sanity report: sizes first, then total missing cells
    print(f'Our training set has {train.shape[0]} rows')
    print(f'Our test set has {test.shape[0]} rows')
    print(f'Our training set has {train.isna().sum().sum()} missing values')
    print(f'Our test set has {test.isna().sum().sum()} missing values')

    return train, test

In [None]:
# Reload the base train/test tables (with row_id) for the public K-fold pipeline
train, test = read_train_test()

In [None]:
# Root directory of the competition data; the trailing slash matters because
# preprocessor() below builds paths by plain string concatenation
data_dir = '../input/optiver-realized-volatility-prediction/'

def calc_wap1(df):
    """Weighted average price from the first book level.

    Each side's price is weighted by the opposite side's size:
    (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1).
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    depth = bid_sz + ask_sz
    return cross_weighted / depth

def calc_wap2(df):
    """Weighted average price from the second book level.

    Each side's price is weighted by the opposite side's size:
    (bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2).
    """
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    depth = bid_sz + ask_sz
    return cross_weighted / depth

def log_return(series):
    """Return the first difference of the natural log of ``series``.

    The first element of the result is NaN because it has no predecessor.
    """
    log_values = np.log(series)
    return log_values.diff()

def realized_volatility(series):
    """Square root of the sum of squared values (realized volatility of returns)."""
    squared = series**2
    sum_of_squares = np.sum(squared)
    return np.sqrt(sum_of_squares)

def count_unique(series):
    """Number of distinct values in ``series``."""
    distinct_values = np.unique(series)
    return len(distinct_values)

def book_preprocessor(file_path):
    """Build per-time_id order-book features for one stock.

    Reads the book parquet at ``file_path``, derives WAP, log-return, spread and
    volume columns, and aggregates them per ``time_id`` over the full bucket and
    over the windows starting at 400, 300 and 200 seconds.

    Args:
        file_path: Path containing ``stock_id=<id>``; everything after the
            first ``=`` is used as the stock id inside ``row_id``.

    Returns:
        DataFrame with one row per time_id, the aggregated feature columns
        (windowed features carry a ``_400`` / ``_300`` / ``_200`` suffix) and a
        ``row_id`` column formatted ``<stock_id>-<time_id>``.
    """
    df = pd.read_parquet(file_path)

    # Calculate Wap
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)

    # Calculate log returns within each time_id.
    # transform() always returns a result aligned with df's index; apply() prepends
    # the group key to the index on pandas >= 2.0, which makes the column
    # assignment fail.
    df['log_return1'] = df.groupby(['time_id'])['wap1'].transform(log_return)
    df['log_return2'] = df.groupby(['time_id'])['wap2'].transform(log_return)

    # Calculate wap balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])

    # Calculate spread
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df["bid_ask_spread"] = abs(df['bid_spread'] - df['ask_spread'])
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Dict for aggregations (insertion order defines the output column order)
    create_feature_dict = {
        'wap1': [np.sum, np.mean, np.std],
        'wap2': [np.sum, np.mean, np.std],
        'log_return1': [np.sum, realized_volatility, np.mean, np.std],
        'log_return2': [np.sum, realized_volatility, np.mean, np.std],
        'wap_balance': [np.sum, np.mean, np.std],
        'price_spread':[np.sum, np.mean, np.std],
        'price_spread2':[np.sum, np.mean, np.std],
        'bid_spread':[np.sum, np.mean, np.std],
        'ask_spread':[np.sum, np.mean, np.std],
        'total_volume':[np.sum, np.mean, np.std],
        'volume_imbalance':[np.sum, np.mean, np.std],
        "bid_ask_spread":[np.sum, np.mean, np.std],
    }

    def get_stats_window(seconds_in_bucket, add_suffix = False):
        """Aggregate rows with seconds_in_bucket >= the threshold, per time_id."""
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(create_feature_dict).reset_index()

        # Flatten (column, stat) pairs; the key column becomes 'time_id_'
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]

        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Full-bucket stats, then left-join each late window in the order 400, 300, 200
    windows = (400, 300, 200)
    df_feature = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
    for window in windows:
        df_window = get_stats_window(seconds_in_bucket = window, add_suffix = True)
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = f'time_id__{window}')

    # Drop the duplicated window keys
    df_feature = df_feature.drop([f'time_id__{window}' for window in windows], axis = 1)

    # Create row_id so we can merge
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.drop(['time_id_'], axis = 1)

    return df_feature


def trade_preprocessor(file_path):
    """Build per-time_id trade features for one stock.

    Reads the trade parquet at ``file_path`` and produces, per ``time_id``:
    aggregated trade statistics for the full bucket and for the windows starting
    at 400, 300 and 200 seconds, a set of price/size shape statistics, and the
    ``size_tau*`` features derived from trade counts.

    Args:
        file_path: Path containing ``stock_id=<id>``; everything after the
            first ``=`` is used as the stock id inside ``row_id``.

    Returns:
        DataFrame with one row per time_id. Aggregated columns are prefixed with
        ``trade_``; ``row_id`` (``<stock_id>-<time_id>``) and the ``size_tau*``
        columns are not prefixed.
    """
    df = pd.read_parquet(file_path)
    # transform() always returns a result aligned with df's index; apply() prepends
    # the group key to the index on pandas >= 2.0, which makes the column
    # assignment fail.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)

    # Dict for aggregations
    create_feature_dict = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum, realized_volatility, np.mean, np.std, np.max, np.min],
        'order_count':[np.mean,np.sum,np.max],
    }

    def get_stats_window(seconds_in_bucket, add_suffix = False):
        """Aggregate rows with seconds_in_bucket >= the threshold, per time_id."""
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(create_feature_dict).reset_index()

        # Flatten (column, stat) pairs; the key column becomes 'time_id_'
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]

        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Get the stats for different windows
    df_feature = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
    df_feature_400 = get_stats_window(seconds_in_bucket = 400, add_suffix = True)
    df_feature_300 = get_stats_window(seconds_in_bucket = 300, add_suffix = True)
    df_feature_200 = get_stats_window(seconds_in_bucket = 200, add_suffix = True)

    def tendency(price, vol):
        """Sum of percentage price changes, each weighted by the traded size."""
        df_diff = np.diff(price)
        val = (df_diff/price[1:])*100
        power = np.sum(val*vol[1:])
        return(power)

    # Price/size shape statistics per time_id. A single groupby pass replaces the
    # former full-frame boolean scan per time_id; sort=False keeps first-appearance
    # order, and each group holds the same rows in the same order as the mask did.
    lis = []
    for n_time_id, df_id in df.groupby('time_id', sort = False):
        price = df_id['price'].values
        size = df_id['size'].values
        price_mean = np.mean(price)
        price_steps = np.diff(price)

        tendencyV = tendency(price, size)
        f_max = np.sum(price > price_mean)   # trades priced above the bucket mean
        f_min = np.sum(price < price_mean)   # trades priced below the bucket mean
        df_max = np.sum(price_steps > 0)     # upticks
        df_min = np.sum(price_steps < 0)     # downticks
        abs_diff = np.median(np.abs(price - price_mean))
        energy = np.mean(price**2)
        iqr_p = np.percentile(price,75) - np.percentile(price,25)
        abs_diff_v = np.median(np.abs(size - np.mean(size)))
        energy_v = np.sum(size**2)
        iqr_p_v = np.percentile(size,75) - np.percentile(size,25)

        lis.append({'time_id':n_time_id,'tendency':tendencyV,'f_max':f_max,'f_min':f_min,'df_max':df_max,'df_min':df_min,
                   'abs_diff':abs_diff,'energy':energy,'iqr_p':iqr_p,'abs_diff_v':abs_diff_v,'energy_v':energy_v,'iqr_p_v':iqr_p_v})

    df_lr = pd.DataFrame(lis)

    df_feature = df_feature.merge(df_lr, how = 'left', left_on = 'time_id_', right_on = 'time_id')

    # Merge all
    df_feature = df_feature.merge(df_feature_400, how = 'left', left_on = 'time_id_', right_on = 'time_id__400')
    df_feature = df_feature.merge(df_feature_300, how = 'left', left_on = 'time_id_', right_on = 'time_id__300')
    df_feature = df_feature.merge(df_feature_200, how = 'left', left_on = 'time_id_', right_on = 'time_id__200')

    # Drop the duplicated keys
    df_feature = df_feature.drop(['time_id__400', 'time_id__300', 'time_id__200','time_id'], axis = 1)
    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature = df_feature.drop(['trade_time_id_'], axis = 1)

    # size_tau<sec>  = sqrt(1 / distinct trade seconds in the window)
    # size_tau2<sec> = sqrt(1 / total order count in the window)
    # (The former helper carried an `if sec == '400_'` branch that could never
    # match any suffix; the delta below is the only place size_tau2_d was ever set.)
    for sec in ['','_200','_300','_400']:
        df_feature['size_tau' + sec] = np.sqrt(1/df_feature['trade_seconds_in_bucket_count_unique' + sec])
        df_feature['size_tau2' + sec] = np.sqrt(1/df_feature['trade_order_count_sum' + sec])

    df_feature['size_tau2_d'] = df_feature['size_tau2_400'] - df_feature['size_tau2']

    return df_feature


def get_time_stock(df):
    """Attach per-stock and per-time_id summaries of the volatility columns.

    For every realized-volatility column the mean, std, max and min are
    computed once per ``stock_id`` (suffix ``_stock``) and once per
    ``time_id`` (suffix ``_time``) and left-joined back onto ``df``.

    Args:
        df: Frame holding ``stock_id``, ``time_id`` and the volatility columns
            listed in ``vol_cols``.

    Returns:
        A new frame: the input columns followed by the stock-level and then
        the time-level aggregate columns.
    """
    # Realized volatility columns (book level 1/2 and trades, all windows)
    vol_cols = [
        'log_return1_realized_volatility', 'log_return2_realized_volatility',
        'log_return1_realized_volatility_400', 'log_return2_realized_volatility_400',
        'log_return1_realized_volatility_300', 'log_return2_realized_volatility_300',
        'log_return1_realized_volatility_200', 'log_return2_realized_volatility_200',
        'trade_log_return_realized_volatility', 'trade_log_return_realized_volatility_400',
        'trade_log_return_realized_volatility_300', 'trade_log_return_realized_volatility_200',
    ]

    def _grouped_stats(key, tag):
        # Aggregate, flatten the (column, stat) MultiIndex, then tag every column
        # (the group key itself becomes '<key>__<tag>').
        stats = df.groupby([key])[vol_cols].agg(['mean', 'std', 'max', 'min', ]).reset_index()
        stats.columns = ['_'.join(col) for col in stats.columns]
        return stats.add_suffix('_' + tag)

    df_stock_id = _grouped_stats('stock_id', 'stock')
    df_time_id = _grouped_stats('time_id', 'time')

    # Join stock-level stats first, then time-level stats, and drop the duplicated keys
    merged = df.merge(df_stock_id, how = 'left', left_on = ['stock_id'], right_on = ['stock_id__stock'])
    merged = merged.merge(df_time_id, how = 'left', left_on = ['time_id'], right_on = ['time_id__time'])
    return merged.drop(['stock_id__stock', 'time_id__time'], axis = 1)

def create_agg_features(train, test):
    """Add cluster-level aggregate features to train and test.

    Stocks are grouped into 7 KMeans clusters based on the correlation of their
    targets across time_ids. Within each cluster every numeric feature is
    averaged per time_id, the result is pivoted to one row per time_id, and a
    selected subset of the ``<feature>_<cluster>c1`` columns is left-joined back
    onto every row of that time_id. Remaining NaNs are filled with train means.

    Args:
        train: Training feature frame (must contain ``stock_id``, ``time_id``
            and ``target``).
        test: Test feature frame (must contain ``stock_id`` and ``time_id``).

    Returns:
        Tuple ``(train_m, test_m)`` of the enriched frames.
    """
    n_clusters = 7

    # Cluster stocks by how their targets co-move across time_ids
    train_p = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
    train_p = train_p.pivot(index='time_id', columns='stock_id', values='target')
    corr = train_p.corr()
    ids = corr.index
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(corr.values)
    # Boolean-mask indexing replaces the former (id + 1) * mask / filter / -1 trick,
    # which existed only so that stock id 0 survived the `> 0` filter
    clusters = [ids[kmeans.labels_ == label].tolist() for label in range(n_clusters)]

    def _cluster_means(frame, members, label):
        # Per-time_id mean of every numeric column over the cluster's stocks
        out = frame.loc[frame['stock_id'].isin(members)].groupby(['time_id']).agg(np.nanmean)
        # Replace the averaged stock_id with the cluster tag; assigning the whole
        # column avoids writing a string into a float column through .loc
        out['stock_id'] = str(label) + 'c1'
        return out

    mat = [_cluster_means(train, members, label) for label, members in enumerate(clusters)]
    matTest = [_cluster_means(test, members, label) for label, members in enumerate(clusters)]

    mat1 = pd.concat(mat).reset_index().drop(columns=['target'])
    mat2 = pd.concat(matTest).reset_index()

    # Append the train rows of time_id 5 to the test aggregates.
    # NOTE(review): presumably so every cluster column exists after the pivot even
    # when the test set does not cover all clusters -- confirm.
    mat2 = pd.concat([mat2,mat1.loc[mat1.time_id==5]])

    def _to_wide(long_df):
        # (time_id, cluster) rows -> one row per time_id, columns '<feature>_<cluster>'
        wide = long_df.pivot(index='time_id', columns='stock_id')
        wide.columns = ["_".join(pair) for pair in wide.columns]
        return wide.reset_index()

    mat1 = _to_wide(mat1)
    mat2 = _to_wide(mat2)

    # Keep only the chosen features for clusters 0, 1, 3, 4 and 6
    prefix = ['log_return1_realized_volatility', 'total_volume_mean', 'trade_size_mean', 'trade_order_count_mean','price_spread_mean','bid_spread_mean','ask_spread_mean',
              'volume_imbalance_mean', 'bid_ask_spread_mean','size_tau2']
    selected_cols=mat1.filter(regex='|'.join(f'^{x}.(0|1|3|4|6)c1' for x in prefix)).columns.tolist()
    selected_cols.append('time_id')

    train_m = pd.merge(train,mat1[selected_cols],how='left',on='time_id')
    test_m = pd.merge(test,mat2[selected_cols],how='left',on='time_id')

    # filling missing values with train means
    features = [col for col in train_m.columns.tolist() if col not in ['time_id','target','row_id']]
    train_m[features] = train_m[features].fillna(train_m[features].mean())
    test_m[features] = test_m[features].fillna(train_m[features].mean())

    return train_m, test_m
    
    
def preprocessor(list_stock_ids, is_train = True):
    """Build the book + trade feature frame for every stock in `list_stock_ids`.

    One joblib job is scheduled per stock id. Each job runs
    `book_preprocessor` and `trade_preprocessor` on that stock's parquet
    partition under `data_dir` and left-joins the trade features onto the
    book features on `row_id`.

    Parameters
    ----------
    list_stock_ids : iterable
        Stock ids to process.
    is_train : bool, default True
        Read the `*_train.parquet` partitions when true, the
        `*_test.parquet` partitions otherwise.

    Returns
    -------
    pandas.DataFrame
        The per-stock frames concatenated with a fresh integer index.
    """
    # Choose the partition prefixes once rather than branching in every job
    if is_train:
        book_prefix = "book_train.parquet/stock_id="
        trade_prefix = "trade_train.parquet/stock_id="
    else:
        book_prefix = "book_test.parquet/stock_id="
        trade_prefix = "trade_test.parquet/stock_id="

    def _features_for_stock(stock_id):
        # Book and trade features of a single stock, merged on row_id
        book_features = book_preprocessor(data_dir + book_prefix + str(stock_id))
        trade_features = trade_preprocessor(data_dir + trade_prefix + str(stock_id))
        return pd.merge(book_features, trade_features, on = 'row_id', how = 'left')

    # Run the per-stock jobs through joblib (n_jobs = -1: use every core)
    jobs = (delayed(_features_for_stock)(stock_id) for stock_id in list_stock_ids)
    per_stock_frames = Parallel(n_jobs = -1, verbose = 1)(jobs)

    # Stack the per-stock results into one frame
    return pd.concat(per_stock_frames, ignore_index = True)

In [None]:
# Stock ids present in the training set
train_stock_ids = train['stock_id'].unique()

# Train book/trade features are read from a CSV here instead of being
# recomputed with preprocessor(train_stock_ids, is_train = True)
train_ = pd.read_csv('../input/optiver-public-kfold-train/kfold_train_.csv')
train = pd.merge(train, train_, on = ['row_id'], how = 'left')

# Test book/trade features are computed in this notebook, one job per stock id
test_stock_ids = test['stock_id'].unique()
test_ = preprocessor(test_stock_ids, is_train = False)
test = pd.merge(test, test_, on = ['row_id'], how = 'left')

# Group statistics over time_id and stock_id
train = get_time_stock(train)
test = get_time_stock(test)

# Turn +/-inf into NaN, in place on both frames
for frame in (train, test):
    frame.replace([np.inf, -np.inf], np.nan, inplace=True)

# Cluster-level aggregate features
train, test = create_agg_features(train, test)

# Keep only the test rows whose row_id is expected in df_result
wanted_rows = test['row_id'].isin(df_result['row_id'])
test = test.loc[wanted_rows]

In [None]:
# Training matrix: every column except the identifiers and the label
X = train.drop(['row_id', 'target', 'time_id'], axis = 1)
y = train['target']

# Test matrix: drop the identifiers, then force exactly the columns of X in
# the order of X. The fitted model is later fed `X_test.values`, so features
# are matched by position only; a reordered or extra test column would
# otherwise be scored as the wrong feature without any error.
X_test = test.drop(['time_id', 'row_id'], axis = 1)
missing_cols = [col for col in X.columns if col not in X_test.columns]
if missing_cols:
    raise KeyError(f'test features are missing columns present in train: {missing_cols}')
# .copy() gives an independent frame so the per-column assignments done during
# encoding/scaling do not trigger chained-assignment warnings
X_test = X_test[X.columns].copy()

In [None]:
nunique = X.nunique()
types = X.dtypes

categorical_columns = []
categorical_dims = {}
scalers = {}

for col in X.columns:
    if col != 'stock_id':
        # Numeric feature: standardise with statistics fitted on train only,
        # and keep the fitted scaler so it can be saved below
        scaler = StandardScaler()
        X[col] = scaler.fit_transform(X[col].values.reshape(-1, 1))
        X_test[col] = scaler.transform(X_test[col].values.reshape(-1, 1))
        scalers[col] = scaler
        continue

    # stock_id is the only categorical feature: label-encode it with an
    # encoder fitted on train, record its cardinality and save the encoder
    l_enc = LabelEncoder()
    X[col] = l_enc.fit_transform(X[col].values)
    X_test[col] = l_enc.transform(X_test[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)
    dump(l_enc, 'l_enc')

dump(scalers, 'scalers')

# Positions and cardinalities of the categorical columns, in column order
feature_names = X.columns.tolist()
cat_idxs = [idx for idx, name in enumerate(feature_names) if name in categorical_columns]
cat_dims = [categorical_dims[name] for name in feature_names if name in categorical_columns]

## Prediction - KFold - TabNet

In [None]:
N_FOLD = 5
model_folder = '../input/optiver-public-kfold-tabnet'

# Accumulator for the average of the fold predictions
test_predictions = np.zeros(len(X_test))

for i_fold in range(N_FOLD):
    # Restore the saved TabNet regressor of this fold
    model_path = f'{model_folder}/fold{i_fold}.xyz'
    model = TabNetRegressor()
    model.load_model(model_path)

    # Each fold contributes 1/N_FOLD of the final prediction
    fold_pred = model.predict(X_test.values).flatten()
    test_predictions += fold_pred / N_FOLD

df_result['pred_kfold_tabnet'] = test_predictions
df_result.head()

# Make submission

In [None]:
def _blend_predictions(df, pred_cols, coef_, out_col):
    """Write into `df[out_col]` the weighted blend of the columns `pred_cols`.

    The predictions are passed through `transform_target`, combined linearly
    with the weights `coef_` (one weight per column), and the result is passed
    through `inverse_target`.

    The source columns are read but never overwritten, so this cell can be
    re-run without applying `transform_target` to the same columns twice.

    Raises
    ------
    ValueError
        If the number of weights differs from the number of columns.
    """
    if len(pred_cols) != len(coef_):
        raise ValueError(f'{len(pred_cols)} prediction columns but {len(coef_)} weights')

    # Work on a detached array so df[pred_cols] keeps its original values
    transformed = np.asarray(transform_target(df[pred_cols]))
    df[out_col] = np.sum(np.asarray(coef_) * transformed, axis=1)
    df[out_col] = inverse_target(df[out_col])


# Blend of the 501 models
_blend_predictions(
    df_result,
    ['pred_501_mlp', 'pred_501_wavenet', 'pred_501_1dcnn', 'pred_501_tabnet', 'pred_501_unet'],
    [ 0.4, 0, 0.4, 0.1, 0.1 ],
    'fpred_501',
)

# Blend of the 601 models
_blend_predictions(
    df_result,
    ['pred_601_tabnet', 'pred_601_mlp', 'pred_601_lgb', 'pred_601_cat', 'pred_601_1dcnn', 'pred_601_cfr'],
    [ 0.2, 0.4, 0.15, 0, 0.25, 0 ],
    'fpred_601',
)

# Final prediction: blend of the two ensembles and the k-fold TabNet
_blend_predictions(
    df_result,
    ['fpred_501', 'fpred_601', 'pred_kfold_tabnet'],
    [ 0.6, 0.3, 0.1 ],
    'target',
)

df_result.head()

In [None]:
# Select the id and prediction columns and write them to submission.csv
submission_columns = ['row_id', 'target']
df_submission = df_result.loc[:, submission_columns]
df_submission.to_csv('submission.csv', index=False)

# Quick look at what was written
print(df_submission.shape)
df_submission.head()