## Blending ResNet, MLP, & XGBoost

In this notebook, first we use Pytorch to implement a ResNet model. If run alone, we find this ResNet model could get a score 5k+ on the public test set. 

Then we use TensorFlow to implement an MLP model. If run alone, the model could get a score of 9k+ on the public test set. 

Then we add the XGBoost model. If run alone, the model could get a score 7k+ on the public test set. 

When we blend them together to form an ensemble model, we find we can get a score of 10k+ on the public test set. We believe this ensemble model will be less prone to overfitting on the private dataset. 

Note that, in this notebook, we load the pre-trained model from other notebooks, and we do not do the training part in this notebook. 

Also note that, for ResNet and MLP, we fill missing values with the mean value of each column. 

### ResNet train code:
https://www.kaggle.com/a763337092/neural-network-starter-pytorch-version<br/>
https://www.kaggle.com/a763337092/pytorch-resnet-starter-training<br/>

### MLP training code:
https://www.kaggle.com/code1110/jane-street-with-keras-nn-overfit<br/>

### XGBoost training code: 
https://www.kaggle.com/dongwenjian/key-notebook-xgboost/edit<br/>

The ResNet part and the MLP part are based on the amazing notebook of Lindada焱焱焱 :

https://www.kaggle.com/a763337092/blending-tensorflow-and-pytorch

We add our XGBoost model to better address the overfitting problem.

## ResNet part

In [None]:
import os
import time
import pickle
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import log_loss, roc_auc_score

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss, MSELoss
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F

# Let pandas print up to 100 columns and 100 rows before truncating output.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# NOTE(review): presumably the competition input directory; it is not
# referenced by the code visible in this notebook.
DATA_PATH = '../input/jane-street-market-prediction/'

# Number of ResNet fold checkpoints loaded below, and the divisor used when
# averaging their predictions.
NFOLDS = 5

# NOTE(review): not referenced by the visible code; the notebook only loads
# pre-trained weights.
TRAIN = False
# Directory with the cached feature means (f_mean_online.npy) and the ResNet
# fold weights (online_model{fold}.pth).
CACHE_PATH = '../input/mlp012003weights'
# Directory with the XGBoost classifiers and the feature-median CSV.
XGBOOST_PATH = '../input/xgboost'

def save_pickle(dic, save_path):
    """Pickle ``dic`` into the file at ``save_path``.

    The file is opened in binary write mode, so an existing file is
    overwritten. Returns ``None``.
    """
    with open(save_path, 'wb') as sink:
        pickle.Pickler(sink).dump(dic)

def load_pickle(load_path):
    """Return the object unpickled from the file at ``load_path``.

    The file is opened in binary read mode and closed before returning.
    """
    with open(load_path, 'rb') as source:
        return pickle.Unpickler(source).load()

# Names of the 130 raw feature columns: feature_0 ... feature_129.
feat_cols = ['feature_' + str(idx) for idx in range(130)]

# One entry per output of the ResNet's final layer.
target_cols = ['action', 'action_1', 'action_2', 'action_3', 'action_4']

# Fill values substituted for missing features in the online inference loop.
f_mean = np.load(f'{CACHE_PATH}/f_mean_online.npy')


##### Making features
# ResNet input columns: the raw features followed by two engineered crosses.
all_feat_cols = [*feat_cols, 'cross_41_42_43', 'cross_1_2']

##### Model&Data fnc
class Model(nn.Module):
    """Fully connected network whose stages are linked by concatenation.

    The input is batch-normalised and passed through dropout. Four hidden
    stages follow, each ``Linear -> BatchNorm1d -> LeakyReLU -> Dropout``.
    Stage 1 sees the normalised input; every later stage sees the
    concatenation of the two activations that precede it. A final linear
    layer maps the concatenation of the last two stages to
    ``len(target_cols)`` raw outputs (no sigmoid is applied here).

    Input width is ``len(all_feat_cols)``; both names are read from module
    scope when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        n_inputs = len(all_feat_cols)
        n_outputs = len(target_cols)
        hidden_size = 256
        dropout_rate = 0.2

        # Input normalisation.
        self.batch_norm0 = nn.BatchNorm1d(n_inputs)
        self.dropout0 = nn.Dropout(0.2)

        # Stage 1: normalised input -> hidden.
        self.dense1 = nn.Linear(n_inputs, hidden_size)
        self.batch_norm1 = nn.BatchNorm1d(hidden_size)
        self.dropout1 = nn.Dropout(dropout_rate)

        # Stage 2: [normalised input, stage 1] -> hidden.
        self.dense2 = nn.Linear(hidden_size + n_inputs, hidden_size)
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(dropout_rate)

        # Stage 3: [stage 1, stage 2] -> hidden.
        self.dense3 = nn.Linear(2 * hidden_size, hidden_size)
        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(dropout_rate)

        # Stage 4: [stage 2, stage 3] -> hidden.
        self.dense4 = nn.Linear(2 * hidden_size, hidden_size)
        self.batch_norm4 = nn.BatchNorm1d(hidden_size)
        self.dropout4 = nn.Dropout(dropout_rate)

        # Output head: [stage 3, stage 4] -> one value per target column.
        self.dense5 = nn.Linear(2 * hidden_size, n_outputs)

        # Only LeakyReLU is used in forward(). The other activations are kept
        # under their original attribute names so the module's state_dict
        # keys are unchanged (PReLU owns a learnable parameter).
        self.Relu = nn.ReLU(inplace=True)
        self.PReLU = nn.PReLU()
        self.LeakyReLU = nn.LeakyReLU(negative_slope=0.01, inplace=True)
        self.RReLU = nn.RReLU()

    def _stage(self, inputs, dense, norm, drop):
        """Apply one ``Linear -> BatchNorm -> LeakyReLU -> Dropout`` stage."""
        return drop(self.LeakyReLU(norm(dense(inputs))))

    def forward(self, x):
        """Return the raw outputs of the final layer for the batch ``x``."""
        x0 = self.dropout0(self.batch_norm0(x))

        x1 = self._stage(x0, self.dense1, self.batch_norm1, self.dropout1)
        x2 = self._stage(
            torch.cat([x0, x1], 1), self.dense2, self.batch_norm2, self.dropout2
        )
        x3 = self._stage(
            torch.cat([x1, x2], 1), self.dense3, self.batch_norm3, self.dropout3
        )
        x4 = self._stage(
            torch.cat([x2, x3], 1), self.dense4, self.batch_norm4, self.dropout4
        )

        return self.dense5(torch.cat([x3, x4], 1))

# Load the pre-trained ResNet fold models for inference.
#
# Use the first GPU when one is available and fall back to the CPU otherwise,
# so the notebook still runs on a machine without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model_list = []
# NOTE(review): `tmp` is not used by the code visible in this notebook.
tmp = np.zeros(len(feat_cols))
for _fold in range(NFOLDS):
    torch.cuda.empty_cache()
    model = Model()
    model.to(device)
    model_weights = f"{CACHE_PATH}/online_model{_fold}.pth"
    # map_location remaps tensors saved from a GPU onto `device`; without it
    # a GPU-saved checkpoint cannot be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(model_weights, map_location=device))
    # Inference only: switch BatchNorm/Dropout to evaluation behaviour.
    model.eval()
    model_list.append(model)



## MLP part

In [None]:
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Normalization
import tensorflow as tf
import tensorflow_addons as tfa

import numpy as np
import pandas as pd
from tqdm import tqdm
from random import choices


# Random seed applied to NumPy here and to TensorFlow before the MLP is built.
SEED = 1111

np.random.seed(SEED)

# fit
def create_mlp(
    num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate
):
    """Build and compile the Keras MLP.

    Architecture: ``BatchNormalization -> Dropout`` on the input, then for
    each entry of ``hidden_units`` a ``Dense -> BatchNormalization -> swish
    -> Dropout`` group, then a ``Dense(num_labels)`` layer with a sigmoid.

    Args:
        num_columns: Width of the input vector.
        num_labels: Number of sigmoid outputs.
        hidden_units: Width of each hidden Dense layer, in order.
        dropout_rates: ``dropout_rates[0]`` is applied to the input and
            ``dropout_rates[k]`` after the k-th hidden layer, so it needs
            ``len(hidden_units) + 1`` entries.
        label_smoothing: Passed to ``BinaryCrossentropy``.
        learning_rate: Passed to the ``RectifiedAdam`` optimizer.

    Returns:
        The compiled ``tf.keras`` model.
    """
    layers = tf.keras.layers

    inputs = layers.Input(shape=(num_columns,))
    hidden = layers.BatchNormalization()(inputs)
    hidden = layers.Dropout(dropout_rates[0])(hidden)
    for depth, units in enumerate(hidden_units, start=1):
        hidden = layers.Dense(units)(hidden)
        hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Activation(tf.keras.activations.swish)(hidden)
        hidden = layers.Dropout(dropout_rates[depth])(hidden)

    logits = layers.Dense(num_labels)(hidden)
    outputs = layers.Activation("sigmoid")(logits)

    network = tf.keras.models.Model(inputs=inputs, outputs=outputs)
    network.compile(
        optimizer=tfa.optimizers.RectifiedAdam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        metrics=tf.keras.metrics.AUC(name="AUC"),
    )
    return network

# Hyper-parameters of the pre-trained MLP. `epochs` and `batch_size` are not
# used by the code visible in this notebook, which only loads weights.
epochs = 200
batch_size = 4096
hidden_units = [160] * 3
dropout_rates = [0.2] * 4
label_smoothing = 1e-2
learning_rate = 1e-3

# Reset Keras state and seed TensorFlow before building the network.
tf.keras.backend.clear_session()
tf.random.set_seed(SEED)

# Rebuild the architecture, then load the pre-trained weights into it.
clf = create_mlp(
    num_columns=len(feat_cols),
    num_labels=5,
    hidden_units=hidden_units,
    dropout_rates=dropout_rates,
    label_smoothing=label_smoothing,
    learning_rate=learning_rate,
)
clf.load_weights('../input/jane-street-with-keras-nn-overfit/model.h5')

# List form so the inference code can average over several TF models.
tf_models = [clf]

## XGBoost Part

In [None]:
import joblib
from xgboost import XGBClassifier
import xgboost as xgb

In [None]:
# Directory with the XGBoost artifacts (same value as the XGBOOST_PATH
# assigned in the ResNet part).
XGBOOST_PATH = '../input/xgboost'

# Rebuild the feature-median Series from its CSV dump: the steps below use the
# CSV's first column as the Series labels and its second column as the values.
# NOTE(review): presumably the file is a two-column (feature name, median)
# table written from a pandas Series -- confirm against the training notebook.
median_df = pd.read_csv(XGBOOST_PATH+'/median_pd_130_features.csv', index_col=False, header=0);
# Replace the header names with positional labels 0..n-1.
median_df.columns = range(median_df.shape[1])
# Transpose so that each original CSV column becomes a row.
median_df = median_df.transpose()
# Promote the first row (the original first column) to column labels ...
median_df.columns = median_df.iloc[0]
# ... and remove it from the data.
median_df.drop(median_df.index[0], inplace=True)
#median_df.reset_index(drop=True)
# The first remaining row (the original second column) is the median Series.
# NOTE(review): if the first CSV column holds strings, the transpose likely
# leaves this Series with object dtype rather than float -- verify.
median = median_df.iloc[0]

In [None]:
# Load the five pre-trained XGBoost classifiers, stored as
# xgb0<suffix> ... xgb4<suffix> under XGBOOST_PATH.
xgb_file_suffix = "-n-500-d-8-sub-0.9-lr-0.05.joblib"
xgb_clfs = [
    joblib.load(f"{XGBOOST_PATH}/xgb{clf_idx}{xgb_file_suffix}")
    for clf_idx in range(5)
]

## Local accuracy test

In [None]:
from sklearn.model_selection import train_test_split
import datatable

# True: score the blend on a local hold-out split of train.csv.
# False: run the online inference loop against the competition API instead.
LOCAL_TEST = True

if LOCAL_TEST:
    # Read the training CSV with datatable and convert it to pandas.
    df_raw = datatable.fread('../input/jane-street-market-prediction/train.csv').to_pandas()

    # Keep rows with date > 85 and a non-zero weight.
    df_raw = df_raw.query('date > 85').reset_index(drop=True)
    df_raw = df_raw.loc[df_raw['weight'] != 0]

    # Binary label: 1 when resp is positive, else 0.
    df_raw['action'] = (df_raw['resp'].values > 0).astype(int)

    # Random 80/20 split; only the 20% part is used below.
    df_train, df_test = train_test_split(df_raw, test_size=0.2, shuffle=True, random_state=150)

    features = [name for name in df_train.columns if "feature" in name]
    # ResNet input columns: the raw features followed by the two crosses.
    all_feat_cols = features + ['cross_41_42_43', 'cross_1_2']

    del df_raw, df_train

    # NOTE(review): every model in the local test sees median-filled inputs,
    # whereas the online loop fills the neural-network inputs with f_mean --
    # confirm this difference is intended.
    neutral_values = median
    df_test = df_test.fillna(neutral_values)

    # Raw feature matrix (used by the MLP and XGBoost).
    X_test = df_test.loc[:, features]

    # Same features plus the two engineered crosses (used by the ResNet).
    X_test_extended = X_test.assign(
        cross_41_42_43=X_test['feature_41'] + X_test['feature_42'] + X_test['feature_43'],
        cross_1_2=X_test['feature_1'] / (X_test['feature_2'] + 1e-5),
    )

    y_action_test = df_test['action'].to_numpy()

    del df_test
    


    

In [None]:
from torch.utils.data import Dataset, DataLoader

if LOCAL_TEST:
    class TestDataset(Dataset):
        """Dataset yielding one feature row per item, without labels.

        The columns listed in the module-level ``all_feat_cols`` are taken
        from ``df`` once at construction and held as a NumPy array.
        """

        def __init__(self, df):
            self.features = df.loc[:, all_feat_cols].to_numpy()

        def __len__(self):
            return self.features.shape[0]

        def __getitem__(self, idx):
            # Each item is a dict so batches are accessed as batch['features'].
            row = torch.tensor(self.features[idx], dtype=torch.float)
            return {'features': row}

In [None]:
from sklearn import metrics

if LOCAL_TEST:
    # Score the three-model blend on the local hold-out split and print its
    # accuracy against the binary `action` labels.

    # ResNet with Pytorch
    BATCH_SIZE = 8192
    test_set = TestDataset(X_test_extended)
    test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

    for model in model_list:
        model.eval()

    torch_preds = []
    # Inference only: no_grad avoids building autograd graphs for each batch.
    with torch.no_grad():
        for data in test_loader:
            # The batch is already a float tensor; move it to the device once
            # and reuse it for every fold model.
            feature_data = data['features'].to(device)
            # One column per model output (target), not per fold model.
            multiple_preds = np.zeros((len(feature_data), len(target_cols)))
            for model in model_list:
                multiple_preds += model(feature_data).sigmoid().cpu().numpy() / NFOLDS

            # Collapse the fold-averaged target probabilities to one score per row.
            torch_pred = np.median(multiple_preds, axis=1)
            torch_preds.append(torch_pred)

    resnet_preds = np.concatenate(torch_preds)


    # MLP with Tensorflow
    mlp_preds = np.median(tf_models[0](X_test.values, training = False).numpy(), axis=1)  # currently we only have one tf model


    # XGBoost: column 1 of predict_proba is the probability of class 1.
    five_preds = np.array([xgb_clf.predict_proba(X_test)[:, 1] for xgb_clf in xgb_clfs]).T
    xgboost_preds = np.median(five_preds, axis=1)


    # Blend the three models with an unweighted mean and threshold at `th`.
    th = 0.5

    preds = np.mean(np.vstack([resnet_preds, mlp_preds, xgboost_preds]).T, axis=1)
    actions_predicted = np.where(preds >= th, 1, 0).astype(int)

    print(preds.shape)
    print(actions_predicted.shape)

    print(metrics.accuracy_score(y_action_test, actions_predicted))

## Inference with online testing API

In [None]:
if not LOCAL_TEST:
    # Online inference: for every row served by the competition API, blend the
    # ResNet, MLP and XGBoost scores and submit a 0/1 action.
    import janestreet
    env = janestreet.make_env()
    env_iter = env.iter_test()

    th = 0.5
    # Convert the median fill values to a float array once, outside the loop,
    # so the XGBoost input stays a numeric array whatever dtype `median` has.
    xgb_fill = np.asarray(median.values, dtype=float)

    for (test_df, pred_df) in tqdm(env_iter):
        if test_df['weight'].item() > 0:
            # Read the raw features once; both fill strategies start from it.
            x_raw = test_df.loc[:, feat_cols].values
            has_nan = np.isnan(x_raw.sum())

            # Neural networks: missing values are replaced by f_mean.
            x_tt = x_raw
            if has_nan:
                x_tt = np.nan_to_num(x_raw) + np.isnan(x_raw) * f_mean

            # Engineered crosses expected by the ResNet, appended as columns.
            cross_41_42_43 = x_tt[:, 41] + x_tt[:, 42] + x_tt[:, 43]
            cross_1_2 = x_tt[:, 1] / (x_tt[:, 2] + 1e-5)
            feature_inp = np.concatenate((
                x_tt,
                cross_41_42_43.reshape(-1, 1),
                cross_1_2.reshape(-1, 1),
            ), axis=1)

            # torch_pred: fold-averaged sigmoid outputs, then median over targets.
            torch_pred = np.zeros((1, len(target_cols)))
            with torch.no_grad():
                # Build the input tensor once and reuse it for every fold model.
                torch_inp = torch.tensor(feature_inp, dtype=torch.float).to(device)
                for model in model_list:
                    torch_pred += model(torch_inp).sigmoid().cpu().numpy() / NFOLDS
            torch_pred = np.median(torch_pred)

            # tf_pred: mean over TF models, then median over targets.
            tf_pred = np.median(np.mean([model(x_tt, training = False).numpy() for model in tf_models],axis=0))

            # xgboost: missing values are replaced by the feature medians.
            x_xgb = x_raw
            if has_nan:
                x_xgb = np.nan_to_num(x_raw) + np.isnan(x_raw) * xgb_fill
            # Column 1 of predict_proba is the probability of class 1.
            five_preds = np.array([xgb_clf.predict_proba(x_xgb)[:, 1] for xgb_clf in xgb_clfs]).T
            xgb_pred = np.median(five_preds, axis=1)

            # avg: unweighted mean of the three model scores.
            pred = (torch_pred + tf_pred + xgb_pred) / 3.0

            pred_df.action = np.where(pred >= th, 1, 0).astype(int)
        else:
            # Rows with non-positive weight are answered with action 0
            # without running the models.
            pred_df.action = 0
        env.predict(pred_df)