In [None]:
from IPython.display import Image
# Display TF.png from the ../input directory as this cell's output.
Image("../input/tf-model-garden-official-models/TF.png")

This notebook is based on [Jane Street with Keras NN overfit](https://www.kaggle.com/code1110/jane-street-with-keras-nn-overfit), which itself is based on [OWN Jane Street with Keras NN](https://www.kaggle.com/tarlannazarov/own-jane-street-with-keras-nn).  

The changes I have made in this version are:
* Add stratified K-Fold data splitting
* Add learning curve plotting
* Add early stopping etc.
* Improve inference speed as described in [this notebook](https://www.kaggle.com/tocha4/20210204-speed-up-your-prediction) and [this notebook](https://www.kaggle.com/gogo827jz/optimise-speed-of-filling-nan-function)
* Tidy up code somewhat  

The primary motivation is to make the model more general (so less likely to top the public overfit leaderboard in this competition). The model itself has intentionally been kept unchanged to see how the PB score compares with the original, but there are plenty of improvements that could be made to the model itself of course. 

# Import training data
The data has lots of NaNs, which are imputed here with column mean values, as in the original notebook. Mean values are very likely to change over time, so a bias is introduced here which might impact inference on future data.

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers.experimental.preprocessing import Normalization
import tensorflow_addons as tfa
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
NFOLDS = 5  # number of stratified cross-validation folds

train_all = pd.read_csv('../input/jane-street-market-prediction/train.csv')
# Keep only rows with date > 85 and a non-zero weight. The index is reset after
# *both* filters so that it is a contiguous 0..n-1 range; previously the weight
# filter left gaps, so frames derived from train_all did not share an index with
# frames rebuilt from raw arrays.
train_all = train_all[(train_all['date'] > 85) & (train_all['weight'] != 0)].reset_index(drop=True)
# Impute the remaining NaNs with the per-column mean of the filtered data.
train_all = train_all.fillna(train_all.mean())

# Stratification
When splitting data into K-fold cross-validation sets, it is important that each split contains an equal percentage of the desired features. The NN model here does not use any time series information whatsoever, so we can split and shuffle the data as we like. Let's take a look at the date distribution, for example:

In [None]:
sns.set_style('whitegrid')
# Histogram of the `date` column of the filtered training data; titled and
# labelled so the figure can be read on its own.
ax = train_all['date'].plot(kind='hist', title='Date distribution (filtered training data)')
ax.set_xlabel('date');

Trading frequency increases with time, and we want each fold to have an equal distribution of trades from different times, so we bin the dates into 4 quantile bins. We would also like to have equal amounts of sell/buy data, so we multiply the date bins by feature_0 to get our stratification variable.

In [None]:
# Stratification key: the date quantile bin (1..4) multiplied by feature_0.
# NOTE(review): the notebook text treats feature_0 as the buy/sell side - confirm.
date_quartile = pd.qcut(train_all['date'], q=4, labels=False) + 1
train_all['date_bin'] = date_quartile * train_all['feature_0'] # stratify column

# Every column whose name contains "feature" is a model input.
features = [col for col in train_all.columns if "feature" in col]
# Per-feature means (feature_0 excluded), reused at inference time to fill NaNs.
f_mean = train_all[features[1:]].values.mean(axis=0)

resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']
# Inputs keep `date_bin` for now; it is dropped again right before fitting.
X_train = train_all.filter(regex='feature|date_bin')
# Binary targets: 1 where the response is strictly positive, else 0.
y_train = (train_all[resp_cols] > 0).astype('int').reset_index(drop=True)

Let's check the distribution after splitting:

In [None]:
# Take only the first stratified split to eyeball the resulting date distributions.
skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)
result = next(skf.split(X_train, X_train.date_bin), None)
# result holds (train positions, validation positions) into train_all.
train, valid = (train_all.iloc[positions].reset_index(drop=True) for positions in result)

In [None]:
# Titled so this figure can be told apart from the validation histogram.
ax = train['date'].plot(kind='hist', title='Date distribution (training split)')
ax.set_xlabel('date');

In [None]:
# Titled so this figure can be told apart from the training histogram.
ax = valid['date'].plot(kind='hist', title='Date distribution (validation split)')
ax.set_xlabel('date');

The date distributions in train/validation sets are identical as expected.

In [None]:
# Drop the large intermediates; X_train, y_train, features and f_mean are not
# deleted here because the training and inference cells still use them.
del train, valid, train_all, result # save precious memory

# Model definition

In [None]:
MNAME = 'model'  # filename prefix of the per-fold checkpoints


def get_callbacks(idx):
    """Build the list of Keras callbacks used when fitting fold ``idx``.

    The list contains, in order: a checkpoint writer that keeps only the best
    model in ``<MNAME>-<idx>.h5``, a learning-rate reducer driven by
    ``val_loss``, and early stopping driven by ``val_loss``.
    """
    checkpoint_path = MNAME + "-{}.h5".format(idx)
    checkpoint = ModelCheckpoint(checkpoint_path, save_best_only=True)
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=5,
        min_lr=0.00001,
    )
    # Weights are not restored on stop; the best model is the one on disk.
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=False,
    )
    return [checkpoint, reduce_lr, early_stop]

def create_dnn(num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate):
    """Build (but do not compile) the feed-forward multi-label classifier.

    Architecture: BatchNorm -> Dropout on the input, then for every entry of
    ``hidden_units`` a Dense -> BatchNorm -> swish -> Dropout stack, and finally
    a Dense layer with ``num_labels`` sigmoid outputs.

    Parameters
    ----------
    num_columns : int
        Number of input features.
    num_labels : int
        Number of output units.
    hidden_units : sequence of int
        Width of each hidden Dense layer.
    dropout_rates : sequence of float
        ``dropout_rates[0]`` applies to the input; ``dropout_rates[k + 1]``
        applies after hidden layer ``k``.
    label_smoothing, learning_rate
        Accepted for interface compatibility; not used inside this function.

    Returns
    -------
    The uncompiled Keras model.
    """
    inputs = Input(shape=(num_columns,))

    hidden = BatchNormalization()(inputs)
    hidden = Dropout(dropout_rates[0])(hidden)

    for layer_no, width in enumerate(hidden_units):
        hidden = Dense(width)(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = Activation(tf.keras.activations.swish)(hidden)
        hidden = Dropout(dropout_rates[layer_no + 1])(hidden)

    logits = Dense(num_labels)(hidden)
    probabilities = Activation("sigmoid")(logits)

    return Model(inputs=inputs, outputs=probabilities)

# Training

In [None]:
skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)
history = []  # one Keras History object per fold

# Hyper-parameters are identical for every fold, so they are set once up front.
batch_size = 8192
hidden_units = [160, 160, 160]
dropout_rates = [0.2, 0.2, 0.2, 0.2]
label_smoothing = 1e-2
learning_rate = 1e-3
epochs = 200

# Iterate the split generator ONCE so that each fold gets a different
# train/validation partition. Calling next(skf.split(...)) inside the loop
# would build a new generator every time and always return the first split.
for i, (train_idx, valid_idx) in enumerate(skf.split(X_train, X_train.date_bin)):
    print('fold {}'.format(i))
    # date_bin is only the stratification key; it must not be fed to the model.
    X_tr = X_train.iloc[train_idx].drop(columns='date_bin').reset_index(drop=True)
    y_tr = y_train.iloc[train_idx].reset_index(drop=True)
    X_val = X_train.iloc[valid_idx].drop(columns='date_bin').reset_index(drop=True)
    y_val = y_train.iloc[valid_idx].reset_index(drop=True)

    # Per-fold seeding, with the Keras session cleared between folds.
    np.random.seed(42*i)
    tf.keras.backend.clear_session()
    tf.random.set_seed(42*i)

    # fit
    clf = create_dnn(len(features), y_train.shape[1], hidden_units, dropout_rates, label_smoothing, learning_rate)
    clf.compile(
        optimizer=tfa.optimizers.RectifiedAdam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        metrics=[tf.keras.metrics.AUC(name="AUC")],
    )
    callbacks = get_callbacks(i)

    history.append(clf.fit(X_tr, y_tr, epochs=epochs, batch_size=batch_size, validation_data=(X_val,y_val), callbacks=callbacks, verbose=0))

    del clf, X_tr, y_tr, X_val, y_val, train_idx, valid_idx # save precious memory

# Learning curves
Plot train and validation loss/AUC to check our training.

In [None]:
def plot_lrc(hist_list):
    """Plot train/validation loss and AUC curves for a list of fit histories.

    Parameters
    ----------
    hist_list : list
        Objects exposing a ``history`` dict with the keys ``'loss'``,
        ``'val_loss'``, ``'AUC'`` and ``'val_AUC'``. One pair of curves is
        drawn per element; only the first pair carries a legend label so the
        legend shows a single train and a single validation entry.

    The curves are read from ``hist_list`` itself and the epoch axis is derived
    from each curve's length, so the function does not depend on any
    notebook-level variable.
    """
    fig, ax = plt.subplots(2, 1, figsize=(16, 16))
    # (axis, history key, y-axis label, name used in the legend)
    panels = ((ax[0], 'loss', 'Loss', 'loss'), (ax[1], 'AUC', 'AUC', 'AUC'))
    for axis, key, ylabel, legend_name in panels:
        for i, hist in enumerate(hist_list):
            train_curve = hist.history[key]
            valid_curve = hist.history['val_' + key]
            first = i == 0
            axis.plot(np.arange(1, len(train_curve) + 1), train_curve,
                      color='tab:blue',
                      label='Train ' + legend_name if first else None)
            axis.plot(np.arange(1, len(valid_curve) + 1), valid_curve,
                      color='tab:orange',
                      label='Validation ' + legend_name if first else None)
        axis.set_xlabel('Epoch', fontsize=10)
        axis.set_ylabel(ylabel, fontsize=10)
        axis.legend()
    plt.suptitle('Training curves', fontsize=20);

In [None]:
# Draw the loss and AUC curves for the fit histories collected during training.
plot_lrc(history)

The curves for the different folds are very close, as expected.

# Inference

In [None]:
import janestreet
from numba import njit

# Environment object whose iter_test() / predict() drive the inference loop below.
env = janestreet.make_env()

In [None]:
@njit
def fillna_npwhere_njit(array, values):
    """Return ``array`` with NaN entries replaced by the matching entries of ``values``.

    The sum of the array is used as a cheap NaN check, so an input without
    NaNs is returned as-is without building a mask.
    """
    # A NaN anywhere in the array makes the sum NaN.
    if np.isnan(array.sum()):
        nan_mask = np.isnan(array)
        array = np.where(nan_mask, values, array)
    return array

In [None]:
# Decision threshold: act only when the aggregated prediction reaches this value.
th = 0.501

# Load the per-fold checkpoints; folds 1 and 3 are left out of the ensemble.
clf0 = tf.keras.models.load_model("model-0.h5")
#clf1 = tf.keras.models.load_model("model-1.h5")
clf2 = tf.keras.models.load_model("model-2.h5")
#clf3 = tf.keras.models.load_model("model-3.h5")
clf4 = tf.keras.models.load_model("model-4.h5")
models = [clf0, clf2, clf4]

# Assumed column order of each test frame, and the positions of the model
# features within it (used for fast positional indexing of the raw values).
test_df_columns = ['weight'] + [f'feature_{i}' for i in range(130)] + ['date']
index_features = [pos for pos, name in enumerate(test_df_columns) if name in features]

for (test_df, pred_df) in tqdm(env.iter_test()):
    # Default to "no action"; rows with a non-positive weight are never scored.
    action = 0
    if test_df['weight'].values[0] > 0:
        x_tt = test_df.values[0][index_features].reshape(1,-1)
        # Fill NaNs in every feature except the first with the training means.
        x_tt[:, 1:] = fillna_npwhere_njit(x_tt[:, 1:][0], f_mean)
        # Average the ensemble outputs, then take the median over the outputs.
        fold_preds = [model(x_tt, training = False).numpy() for model in models]
        pred = np.median(np.mean(fold_preds, axis=0))
        action = int(pred >= th)
    pred_df.action = action
    env.predict(pred_df)