# Ensembling Model with XGB and Simple Neural Network:
---
In this notebook, I apply an ensemble of an XGBoost model and a simple neural network. Using an ensemble can theoretically reduce overfitting. In this competition, inference time is of utmost importance, so the following measures are used to reduce it:
1. Import the data as 'float32' instead of 'float64'
2. Use the treelite module for the XGB model
3. Use a small neural network instead of a deep one

Also, competitors were told that 'resp_1', 'resp_2', 'resp_3' and 'resp_4' are provided for regularization purposes. We use all 5 labels as targets.

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os, gc
# import cudf
import pandas as pd
import numpy as np
# import cupy as cp
import janestreet
import xgboost as xgb
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from joblib import dump, load

import tensorflow as tf
tf.random.set_seed(42)
import tensorflow.keras.backend as K
import tensorflow.keras.layers as layers
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping 

# treelite
import treelite
import treelite_runtime

In [None]:
# Fix TensorFlow's global random seed so runs are repeatable.
tf.random.set_seed(42)

# Read all 130 feature columns and the five response columns as float32
# rather than letting pandas pick its default float width.
features_columns = [f"feature_{i}" for i in range(130)]
columns_dtypes = dict.fromkeys(
    features_columns + ["resp_1", "resp_2", "resp_3", "resp_4", "resp"],
    "float32",
)
train = pd.read_csv(
    '/kaggle/input/jane-street-market-prediction/train.csv',
    dtype=columns_dtypes,
)

# Every column whose name contains "feature" is used as a model input.
features = [name for name in train.columns if "feature" in name]

In [None]:
# NOTE: despite its name, f_mean holds the per-feature *median*; it is taken
# over the full training frame, before any rows are filtered out.
f_mean = train[features].median()

# Keep only rows after day 85 that carry a positive weight, then impute
# missing feature values with the medians computed above.
train = train.query('date > 85').query('weight > 0').reset_index(drop=True)
train[features] = train[features].fillna(f_mean)

# We use all the 5 labels as the target: 'action' is 1 only when every
# response column is strictly positive.
resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']
train['action'] = (train[resp_cols] > 0).all(axis=1).astype('int')

# Persist the imputation values so the submission loop can reuse them.
np.save('f_mean.npy', f_mean)

In [None]:
# Feature matrix: one row per training sample, one column per feature.
X = train[features].values
# Multitarget label matrix: one 0/1 column per entry of resp_cols, set to 1
# where that response is strictly positive.
y = (train[resp_cols].values > 0).astype('int')

### XGB

In [None]:
# Hyperparameters for XGB, including the training objective and the metric
# reported during training.
params = {
    'colsample_bytree': 0.4,
    'learning_rate': 0.001,
    'max_depth': 12,
    'subsample': 0.8,
    'seed': 42,
    'tree_method': 'gpu_hist',  # Let's use GPU for a faster experiment
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',   # target
}

In [None]:
# Fit XGB on the single combined 'action' label, then save the booster.
xgb_path = 'xgb_best.model'
y_tr = train['action'].values
dtrain = xgb.DMatrix(X, label=y_tr)

# The training set itself is the only watchlist entry, so the reported
# metric is the training metric, not a validation score.
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=[(dtrain, 'train')],
)
bst.save_model(xgb_path)

### NN Loop

In [None]:
def create_mlp(num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate):
    """Build and compile a feed-forward multi-label classifier.

    The network is: input -> BatchNorm -> Dropout, followed by one
    Dense -> BatchNorm -> swish -> Dropout stage per entry of
    ``hidden_units``, and a final Dense layer with a sigmoid activation.

    Args:
        num_columns: Number of input features.
        num_labels: Number of sigmoid outputs.
        hidden_units: Width of each hidden Dense layer, in order.
        dropout_rates: Dropout rate for the input stage (index 0) followed
            by one rate per hidden stage; entries beyond
            ``len(hidden_units) + 1`` are not read.
        label_smoothing: Label smoothing passed to the binary cross-entropy
            loss.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        The compiled ``tf.keras`` model, tracking an AUC metric named 'AUC'.
    """
    keras_layers = tf.keras.layers

    inputs = keras_layers.Input(shape=(num_columns,))
    net = keras_layers.BatchNormalization()(inputs)
    net = keras_layers.Dropout(dropout_rates[0])(net)

    # dropout_rates[0] belongs to the input stage, so hidden stage k uses
    # dropout_rates[k] with k starting at 1.
    for stage, units in enumerate(hidden_units, start=1):
        net = keras_layers.Dense(units)(net)
        net = keras_layers.BatchNormalization()(net)
        net = keras_layers.Activation(tf.keras.activations.swish)(net)
        net = keras_layers.Dropout(dropout_rates[stage])(net)

    logits = keras_layers.Dense(num_labels)(net)
    out = keras_layers.Activation('sigmoid')(logits)

    model = tf.keras.models.Model(inputs=inputs, outputs=out)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        metrics=tf.keras.metrics.AUC(name='AUC'),
    )
    return model

In [None]:
# Training configuration for the neural network.
batch_size = 5000
label_smoothing = 1e-2
learning_rate = 1e-3

# Three hidden layers of equal width; one dropout rate for the input stage
# plus one per hidden layer.
hidden_units = [160] * 3
dropout_rates = [0.25] * (len(hidden_units) + 1)

# NOTE(review): not read anywhere in the visible notebook; confirm it is still needed.
num_models = 2

In [None]:
# Train the network on the full filtered training set (no validation split
# is passed to fit) and store its weights.
ckp_path = 'best_val_AUC.hdf5'
model = create_mlp(
    num_columns=X.shape[1],
    num_labels=y.shape[1],
    hidden_units=hidden_units,
    dropout_rates=dropout_rates,
    label_smoothing=label_smoothing,
    learning_rate=learning_rate,
)
model.fit(X, y, epochs=200, batch_size=batch_size, verbose=0)
model.save_weights(ckp_path)

## Retrieve the Models

In [None]:
# XGB: retrieve the model.
# The in-memory booster `bst` is compiled into a shared library with
# treelite and served through the treelite runtime. (The saved
# 'xgb_best.model' file could be loaded into an xgb.Booster instead.)
xgb_model = treelite.Model.from_xgboost(bst)
xgb_model.export_lib(
    toolchain='gcc',
    libpath='./xgb.so',
    params={'parallel_comp': 32},
    verbose=True,
)
print('finish')

predictor = treelite_runtime.Predictor('./xgb.so', verbose=True)
xgb_models = [predictor]

In [None]:
# NN: retrieve the model. The network trained above is reused directly
# from memory as the only neural-network member of the ensemble.
models = [model]

### Submission

In [None]:
# Create the competition environment object from the janestreet module.
env = janestreet.make_env()
# Iterator over the test set; the submission loop unpacks each item as a
# (test_df, pred_df) pair.
env_iter = env.iter_test()

In [None]:
# Submission loop: for each test row, average the ensemble members'
# probabilities and trade (action = 1) when the average reaches opt_th.
opt_th = 0.5
f = np.median  # collapses the NN's per-label outputs into one score
f_mean = np.load('./f_mean.npy')  # per-feature imputation values saved during training
for (test_df, pred_df) in tqdm(env_iter):
    if test_df['weight'].item() > 0:
        x_tt = test_df.loc[:, features].values
        # Impute missing features: nan_to_num zeroes the NaNs, then the saved
        # per-feature value is added back only at the NaN positions.
        if np.isnan(x_tt[:, :].sum()):
            x_tt[:, :] = np.nan_to_num(x_tt[:, :]) + np.isnan(x_tt[:, :]) * f_mean
        pred = []
        for xgb_ in xgb_models:
            batch = treelite_runtime.Batch.from_npy2d(x_tt, rbegin=0, rend=1)
            # Store a plain float. Mixing a (1, 1) array with the NN scalars
            # made `pred` ragged, which np.mean cannot turn into a numeric
            # array on recent NumPy releases. .item() still enforces that
            # exactly one value was predicted.
            pred.append(float(np.asarray(xgb_.predict(batch)).item()))
        for clf in models:
            # Average over rows, then reduce the label outputs with `f`.
            predict = np.mean(clf(x_tt, training=False).numpy(), axis=0)
            pred.append(float(f(predict)))
        # A plain Python int instead of a nested array for the action cell.
        pred_df.action = int(np.mean(pred) >= opt_th)
    else:
        # Rows whose weight is not positive are never traded.
        pred_df.action = 0
    env.predict(pred_df)