# Jane Street: Neural Network Starter

This notebook implements a simple TensorFlow Keras neural network. Training was run in Version 17.

**Caution:** The GroupCV method applied in this notebook may cause a time-leakage problem. Please use [Purged Time-Series CV][1] instead.

[1]: https://www.kaggle.com/marketneutral/purged-time-series-cv-xgboost-optuna

In [1]:
#import sys
#!cp ../input/rapids/rapids.0.17.0 /opt/conda/envs/rapids.tar.gz
#!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
#sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
#sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
#sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
#!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [2]:
# Master switch: the data-loading, preprocessing and model-fitting cells below
# only run when this is True.
TRAINING = True
# Fine-tuning switch; it is not referenced by any cell visible in this part of
# the notebook -- TODO confirm where (or whether) it is consumed.
USE_FINETUNE = False

In [3]:
import warnings
warnings.filterwarnings('ignore')

#if TRAINING:
#    import cudf
#    import cupy as cp

import os, gc
import pandas as pd
import numpy as np
import xgboost as xgb
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope

from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange
from joblib import dump, load

import tensorflow as tf
tf.random.set_seed(42)
import tensorflow.keras.backend as K
import tensorflow.keras.layers as layers
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers.experimental.preprocessing import Normalization


import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args


import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args
import kerastuner as kt

# Preprocessing

In [4]:
# Name of the target column.
TARGET = 'action'
# Feature column names: 'feature_0' through 'feature_129'.
FEATS = [f'feature_{idx}' for idx in range(130)]
#FIX_FEAT = ['feature_53', 'feature_45', 'feature_69', 'feature_101', 'feature_84', 'feature_121', 'feature_48', 'feature_89', 'feature_93', 'feature_106', 'feature_108', 'feature_99', 'feature_100', 'feature_87', 'feature_120', 'feature_98', 'feature_46', 'feature_128', 'feature_50', 'feature_116', 'feature_71', 'feature_111', 'feature_55', 'feature_49', 'feature_56', 'feature_1', 'feature_51', 'feature_57', 'feature_107', 'feature_41', 'feature_109', 'feature_113', 'feature_123', 'feature_103', 'feature_90', 'feature_91', 'feature_114', 'feature_42', 'feature_54', 'feature_47', 'feature_95', 'feature_96']

#FIX_FEAT = ['feature_1','feature_2','feature_41','feature_42','feature_44','feature_45','feature_46','feature_47','feature_48','feature_49','feature_50','feature_51','feature_53','feature_54','feature_55','feature_56','feature_57','feature_58','feature_59','feature_69','feature_70','feature_71','feature_84','feature_85','feature_86','feature_87','feature_88','feature_89','feature_90','feature_91','feature_92','feature_93','feature_94','feature_95','feature_96','feature_97','feature_98','feature_99','feature_100','feature_101','feature_102','feature_103','feature_104','feature_105','feature_106','feature_107','feature_108','feature_109','feature_110','feature_111','feature_112','feature_113','feature_114','feature_115','feature_116','feature_117','feature_118','feature_119','feature_120','feature_121','feature_122','feature_123','feature_124','feature_125','feature_126','feature_127','feature_128','feature_129']

In [5]:
if TRAINING:
    # TARGET and FEATS are defined once in the preprocessing configuration
    # cell above; they are intentionally not re-declared here.
    print('Loading...')
    train = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')

    print('Filling...')
    # Keep only rows with a strictly positive weight and renumber them from 0
    # so that positional and label-based indexing agree afterwards.
    train = train.query('weight > 0').reset_index(drop = True)

    # Multi-target labels: one 0/1 column per response horizon, 1 when the
    # response is positive. Resulting shape is (n_rows, len(resp_cols)).
    resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']
    y = np.stack([(train[c] > 0.00000).astype('int') for c in resp_cols]).T #Multitarget

    # Feature matrix (converted once; it still contains NaNs at this point).
    X = train[FEATS].to_numpy()

    # wr is weight * resp per row; Y keeps wr only where the 'resp' label
    # (column 3 of y, matching resp_cols[3]) is 1 and is 0 elsewhere.
    wr = train.weight*train['resp'].to_numpy()
    Y = wr*y[:,3]
    print('Finish.')

Loading...
Filling...
Finish.


In [6]:
if TRAINING:
    # Boolean Series over FEATS: True for every feature column that has at
    # least one missing value in the training frame.
    nan_feat = train[FEATS].isnull().sum() > 0
    NAN_FEAT = nan_feat.index[nan_feat.values]
    nan_feat_bool = nan_feat.values

    # Per-feature fill values loaded from the companion dataset.
    with open('../input/js-cv-split2/f_mean.npy', 'rb') as f:
        f_mean = np.load(f)

    # Locate the NaNs once, before X is overwritten, and reuse the result for
    # both the missing-value indicator columns and the imputation itself.
    nan_positions = np.isnan(X)
    mask2 = nan_positions[:, nan_feat_bool].astype(int)

    # NaN entries become 0 + 1 * f_mean; all other entries get 0 * f_mean added.
    X = np.nan_to_num(X) + nan_positions.astype(int) * f_mean

    # The raw frame is no longer needed; release it before training.
    del train
    _ = gc.collect()

    # Persist which features had NaNs so the same mask layout can be rebuilt later.
    pd.to_pickle(nan_feat_bool, 'nfb.pkl')

# Training

In [7]:
import tensorflow as tf
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Activation, Dense, Dropout, LSTM, Masking, Embedding, Concatenate, Input, Reshape,Flatten, AveragePooling1D
from tensorflow.keras.layers import concatenate
from tensorflow.keras.regularizers import l1, l2, l1_l2
from tensorflow.keras.metrics import AUC
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Lambda
#from tensorflow.keras.layers import merge
from tensorflow.keras.layers import multiply, Reshape
import pandas as pd
import numpy as np
import gc
from sklearn.metrics import roc_auc_score
from collections import defaultdict
from tqdm import tqdm
from tqdm import trange
from tensorflow.keras.utils import Sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [8]:
def create_autoencoder(input_dim,output_dim,noise=0.1):
    """Build the encoder network whose pre-trained weights are loaded later.

    Architecture: BatchNormalization -> GaussianNoise(noise) -> Dense(96, elu)
    -> Dense(64, linear).

    Parameters
    ----------
    input_dim : int
        Number of input features; sets the width of the input layer.
    output_dim : int
        Accepted for signature compatibility only; this function builds just
        the encoder, which never uses it.
    noise : float, default 0.1
        Standard deviation passed to the GaussianNoise layer.

    Returns
    -------
    tensorflow.keras.Model
        Model mapping an ``input_dim``-wide input to a 64-wide encoding.
    """
    # The input width follows the argument instead of a hard-coded 130, and the
    # formerly created (never connected) mask input has been dropped.
    i = Input(shape=(input_dim,))
    encoded = BatchNormalization()(i)
    encoded = GaussianNoise(noise)(encoded)

    encoded = Dense(96, activation = 'elu')(encoded)
    encoded = Dense(64,activation='linear')(encoded)
    encoder = Model(inputs=i,outputs=encoded)

    return encoder

In [9]:
# Rebuild the encoder architecture, restore its saved weights, and freeze it
# so that it is used purely as a fixed feature extractor.
encoder = create_autoencoder(input_dim=130, output_dim=5, noise=0.1)
encoder.load_weights('../input/js-cv-split2/encoder.hdf5')
encoder.trainable = False

In [10]:
# TPU initialisation, deliberately disabled: the `if False:` guard means none
# of the statements below ever run. Change the condition to enable TPU training.
if False:
    # Detect the TPU, connect to its cluster and initialise the TPU system.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)

    # Instantiate the distribution strategy bound to that TPU.
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

In [11]:
def custom_loss(y_true, y_pred):
    """Return the Keras mean squared error of the predictions, scaled by 100."""
    mse = tf.keras.losses.MSE(y_true, y_pred)
    return 100 * mse

def metrics2(y_true, y_pred):
    """Return the sum of all predicted values; ``y_true`` is ignored."""
    total = K.sum(y_pred)
    return total

def metrics(y_true, y_pred):
    """Compute a clipped, scaled score from values binned by an integer key.

    ``y_pred`` is summed per distinct value of ``y_true`` with ``np.bincount``
    giving ``Pi``. Then ``t = sum(Pi) / sqrt(sum(Pi**2)) * sqrt(250 / len(Pi))``
    is clipped to ``[0, 6]`` and multiplied by ``sum(Pi)``.
    NOTE(review): this matches the shape of the Jane Street utility score with
    ``y_true`` acting as the group (date) index -- confirm intended usage.

    Parameters
    ----------
    y_true : 1-D array of non-negative ints
        Bin index for each element (requirement of ``np.bincount``).
    y_pred : 1-D array of numbers
        Weights accumulated into the bins; same length as ``y_true``.

    Returns
    -------
    float
        The score ``u``. It is 0.0 when there are no bins or when every bin
        sums to zero (previously these cases raised ZeroDivisionError or
        produced NaN respectively).

    Side effects
    ------------
    Prints the score rounded to 5 decimals, preceded by a newline.
    """
    Pi = np.bincount(y_true, y_pred)
    total = np.sum(Pi)
    norm = np.sqrt(np.sum(Pi ** 2))
    if len(Pi) == 0 or norm == 0:
        # Nothing to normalise by: define the score as zero instead of NaN/crash.
        u = 0.0
    else:
        t = total / norm * np.sqrt(250 / len(Pi))
        u = min(max(t, 0), 6) * total
    print('\n', round(u,5))
    return u

In [12]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, BatchNormalization, Input, Multiply, Add, Concatenate
from tensorflow.keras.activations import sigmoid, relu
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.metrics import AUC
from tensorflow.keras.losses import binary_crossentropy
import tensorflow_addons as tfa
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.activations import softmax


def _glu_layer(x, transform_dim, B_v, m_d):
    """Dense(2*transform_dim) -> Dropout(0.1) -> BatchNorm -> gated linear unit of width transform_dim."""
    h = Dense(transform_dim * 2, use_bias=False)(x)
    h = Dropout(0.1)(h)
    h = BatchNormalization(virtual_batch_size=B_v, momentum=m_d)(h)
    # First half carries the values, second half (through a sigmoid) gates them.
    return Multiply()([h[:, :transform_dim], sigmoid(h[:, transform_dim:])])


def _feature_transformer(x, transform_dim, B_v, m_d):
    """Stack four GLU layers; layers 2-4 add a residual connection scaled by sqrt(0.5)."""
    out = _glu_layer(x, transform_dim, B_v, m_d)
    for _ in range(3):
        gated = _glu_layer(out, transform_dim, B_v, m_d)
        out = Add()([gated, out])
        out *= tf.math.sqrt(0.5)
    return out


def build_tabnet_model(input_dim, transform_dim, N_a, N_d, num_decision_step, gamma, output_dim, B_v, m_d, Lambda, multiplier):
    """Build and compile a TabNet-style model with a label head and a return head.

    The input is batch-normalised and passed through a four-layer GLU feature
    transformer. Each decision step then (1) derives a softmax feature mask
    from the last ``N_a`` transformer columns, (2) multiplies the raw inputs by
    that mask, (3) runs a fresh feature transformer on the masked inputs, and
    (4) adds ``relu`` of the first ``N_d`` columns to the decision output.

    Parameters
    ----------
    input_dim : int
        Width of the feature input (also the width of each mask).
    transform_dim : int
        Output width of every GLU layer.
    N_a : int
        Number of trailing transformer columns fed to the attentive (mask) layer.
    N_d : int
        Number of leading transformer columns accumulated into the decision output.
    num_decision_step : int
        Number of decision steps; must be at least 1.
    gamma : float
        After each step the prior ``P`` is multiplied by ``gamma - mask``.
    output_dim : int
        Number of sigmoid units in the ``label_out`` head.
    B_v : int or None
        ``virtual_batch_size`` for every BatchNormalization layer.
    m_d : float
        ``momentum`` for every BatchNormalization layer.
    Lambda : float
        Weight of the mask-entropy term added via ``model.add_loss``. (The name
        shadows the Keras ``Lambda`` layer inside this function only.)
    multiplier : float
        Factor applied to the mask logits before the softmax.

    Returns
    -------
    tensorflow.keras.Model
        Compiled model with inputs ``[features, wr]`` and outputs
        ``[label_out, return_out]``. ``return_out`` is ``wr`` multiplied by a
        0/1 decision obtained by thresholding the mean of ``label_out`` at 0.5.

    Raises
    ------
    ValueError
        If ``num_decision_step`` is smaller than 1 (no decision output could be built).
    """
    if num_decision_step < 1:
        raise ValueError(
            'num_decision_step must be at least 1, got {}'.format(num_decision_step)
        )

    inputs = Input(shape=(input_dim,))
    entropy = 0

    # Pre-encoding: normalise the raw inputs and run the initial transformer,
    # whose last N_a columns seed the first attentive step.
    x = BatchNormalization(virtual_batch_size=B_v, momentum=m_d)(inputs)
    features = _feature_transformer(x, transform_dim, B_v, m_d)

    for step in range(num_decision_step):
        # Attentive transform: produce a per-feature mask from the last N_a columns.
        mask_value = Dense(input_dim, use_bias=False)(features[:, -N_a:])
        mask_value = BatchNormalization(virtual_batch_size=B_v, momentum=m_d)(mask_value)
        if step == 0:
            mask_value = softmax(multiplier * mask_value)
            P = (gamma - mask_value)
        else:
            # Scale the logits by the prior accumulated over earlier steps.
            mask_value *= P
            mask_value = softmax(multiplier * mask_value)
            P *= (gamma - mask_value)

        masked_feature = Multiply()([mask_value, inputs])

        # Entropy is used to penalize the amount of sparsity in feature
        # selection; the 0.01 offset keeps the logarithm finite.
        entropy += tf.reduce_mean(tf.reduce_sum(-mask_value * tf.math.log(mask_value + 0.01), axis=1)) / (num_decision_step)

        # Feature transform of the masked inputs for this step.
        features = _feature_transformer(masked_feature, transform_dim, B_v, m_d)

        # Accumulate the decision output from the first N_d columns.
        if step == 0:
            d_out = relu(features[:, :N_d])
        else:
            d_out += relu(features[:, :N_d])

    y1 = Dense(output_dim, activation='sigmoid', name='label_out')(d_out)

    # Hard 0/1 decision from the mean label probability, multiplied by the wr input.
    # NOTE(review): the hard threshold likely passes no gradient, so the
    # return_out loss probably serves as monitoring only -- confirm.
    y2 = tf.math.reduce_mean(y1, axis=-1)
    y2 = tf.where(y2 > 0.5, 1, 0)
    y2 = tf.cast(y2, tf.float32)
    wr = Input(shape=(1,))
    y2 = Multiply(name='return_out')([y2, wr])

    lr_schedule = ExponentialDecay(0.01, decay_steps=2000, decay_rate=0.95, staircase=False)
    model = Model(inputs=[inputs, wr], outputs=[y1, y2])
    model.add_loss(Lambda * entropy)
    model.compile(
        optimizer=Adam(learning_rate=lr_schedule),
        loss={'label_out': 'binary_crossentropy', 'return_out': custom_loss},
        metrics={'label_out': AUC(name='auc'), 'return_out': metrics2},
    )
    return model

In [13]:
#with tpu_strategy.scope():
#    model = build_tabnet_model(130, 78, 16, 8, 8, 3, 1.2, 5, None, 0.9, 0.0001, encoder)
#    model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = AUC(name = 'auc'))

In [14]:
FOLDS = 6
SEED = 42
f = 5  # index of the single cross-validation fold trained in this run
#tf.random.set_seed(SEED)
multiplier = [3,4,5,6,7]  # softmax multipliers to train one model each for
if TRAINING:
    # NOTE(review): unpickling can execute arbitrary code; only load split
    # files that come from a trusted dataset.
    splits = pd.read_pickle('../input/js-cv-split2/cross_validation.pkl')
    #gkf = PurgedGroupTimeSeriesSplit(n_splits = FOLDS, group_gap=20)
    #splits = list(gkf.split(y, groups=date))

    #for fold, (train_indices, test_indices) in enumerate(splits):
    for fold in range(FOLDS):
        if fold != f:
            continue

        # Look the index arrays up once instead of re-indexing splits per array.
        train_idx, test_idx = splits[fold][0], splits[fold][1]

        y_train, y_test = y[train_idx], y[test_idx]
        X_train, X_test = X[train_idx], X[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]
        wr_train, wr_test = wr[train_idx], wr[test_idx]
        mask_train, mask_test = mask2[train_idx], mask2[test_idx]

        # Frozen-encoder representation of the imputed features.
        encoded_X_train = encoder(X_train).numpy()
        encoded_X_test = encoder(X_test).numpy()

        # Model input = imputed features + encoder output + NaN-indicator columns.
        X_train = np.concatenate((X_train, encoded_X_train, mask_train), axis = -1)
        X_test = np.concatenate((X_test, encoded_X_test, mask_test), axis = -1)

        del(encoded_X_train, encoded_X_test, mask_train, mask_test)
        _= gc.collect()

        # Take the input width from the assembled matrix rather than the
        # hard-coded 130 + 64 + 88, which breaks if the NaN-feature count differs.
        input_dim = X_train.shape[1]

        for m in multiplier:
            model = build_tabnet_model(input_dim, 32, 16, 16, 3, 1.2, 5, None, 0.8, 0, m)
            model.fit(
                [X_train, wr_train],
                [y_train, Y_train],
                validation_data=([X_test, wr_test], [y_test, Y_test]),
                epochs=300,
                batch_size=4000,
                callbacks=[
                    # Stop when validation AUC of the label head stops improving.
                    EarlyStopping('val_label_out_auc', mode='max', patience=10, restore_best_weights=True)
                ],
            )
            model.save_weights(f'./tabnet_model_{SEED}_{fold}_multiplier_{m}.hdf5')
            del(model)
            _=gc.collect()
    

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch