In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)
        
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Encoder + MLP
The idea of using an encoder is to denoise the data. After many attempts at using an unsupervised autoencoder, the choice landed on a bottleneck encoder, as this will preserve the intra-feature relations. 

In [None]:
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam, Adagrad
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers.experimental.preprocessing import Normalization
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold

from tqdm import tqdm
from random import choices
import random


import kerastuner as kt

In [None]:
def set_all_seeds(seed):
    """Seed NumPy, Python's `random` module and TensorFlow with the same value.

    Parameters
    ----------
    seed : int
        Value passed, in this order, to `np.random.seed`, `random.seed`
        and `tf.random.set_seed`.
    """
    for seed_fn in (np.random.seed, random.seed, tf.random.set_seed):
        seed_fn(seed)

# BlockingTimeSeriesSplitWithGap

In [None]:
from more_itertools import windowed
import numpy as np
import warnings

class BlockingTimeSeriesSplitWithGap():
    """Blocked time-series cross-validator over ordered groups, with a train/test gap.

    Adapted from https://github.com/wajihullahbaig/IEEE-CIS-Fraud-Detection/blob/master/BlockingTimeSeriesSplit.py
    with gap and allowed overlap.

    The sorted unique group labels are cut into `n_splits` evenly spaced
    windows of `group_fold_size` consecutive groups (windows may overlap).
    Inside each window the leading groups are used for training, the next
    `gap` groups are skipped, and all remaining groups are used for testing.

    Parameters
    ----------
    n_splits : int
        Number of windows (folds) to generate.
    gap : int
        Number of groups skipped between the training and the test part.
    group_fold_size : int or None
        Size of each fold, in groups. Must satisfy
        group_fold_size * n_splits >= len(unique(groups)); when None (or too
        small) len(unique(groups)) // n_splits is used instead.
    """
    def __init__(self,n_splits=5,gap=0,group_fold_size=None):
        self.n_splits = n_splits
        self.gap = gap
        self.group_fold_size = group_fold_size

    def get_n_splits(self,X=None,y=None,groups=None):
        """Return the configured number of splits (arguments are ignored)."""
        return self.n_splits

    def split(self,X,groups,y=None,test_size=0.2):
        """Yield `(train_indices, test_indices)` row-index arrays, one pair per fold.

        Parameters
        ----------
        X : ignored
            Present for API compatibility only.
        groups : array-like
            Group label of every sample (e.g. the trading date). Folds are
            built over the sorted unique labels.
        y : ignored
        test_size : float
            Proportion of fold to use for testing. This will be affected by gap:
            the gap groups are taken out of the test part.

        Raises
        ------
        AssertionError
            If `group_fold_size` exceeds the number of unique groups.
        """
        # Accept lists / Series as well as arrays; `groups == label` must broadcast.
        groups = np.asarray(groups)
        unique_groups = np.unique(groups)
        n_groups = len(unique_groups)
        group_indices = {g: np.flatnonzero(groups == g) for g in unique_groups}

        if self.group_fold_size is None:
            group_fold_size = n_groups//self.n_splits
        elif self.group_fold_size > n_groups:
            raise AssertionError(f'group_fold_size={self.group_fold_size} is greater than the number of groups ({n_groups}).')
        elif self.group_fold_size * self.n_splits < n_groups:
            warnings.warn(f'group_fold_size={self.group_fold_size} is too small for a group of {n_groups} items. Setting group_fold_size={n_groups//self.n_splits}.')
            group_fold_size = n_groups//self.n_splits
        else:
            group_fold_size = self.group_fold_size

        # Pick n_splits evenly spaced start positions among all sliding windows
        # of length group_fold_size, instead of materialising every window.
        n_windows = n_groups - group_fold_size + 1
        starts = np.round(np.linspace(0, n_windows-1, self.n_splits)).astype(int)

        for start in starts:
            # Plain slicing keeps every label, including falsy ones such as group 0.
            window = unique_groups[start: start + group_fold_size]
            mid = int((1-test_size) * len(window))
            # The test part runs to the end of the window (no group is dropped).
            group_idx_tr, group_idx_te = window[:mid], window[mid + self.gap:]

            if len(group_idx_tr) < 1 or len(group_idx_te) < 1:
                # All windows have the same length, so later ones would be empty too.
                warnings.warn('Fold has an empty train or test part; no further folds are generated.')
                break

            tr = np.concatenate([group_indices[gr_i] for gr_i in group_idx_tr])
            te = np.concatenate([group_indices[gr_i] for gr_i in group_idx_te])
            yield tr,te

In [None]:
class CVTuner(kt.engine.tuner.Tuner):
    """Tuner that scores every trial by cross-validation over pre-computed splits."""

    def run_trial(self, trial, X, y, splits, batch_size=32, epochs=1,callbacks=None):
        """Fit one freshly built model per fold and report fold-averaged metrics.

        Parameters
        ----------
        trial : trial object whose `hyperparameters` are used to build the model.
        X, y : sequences of arrays
            Model inputs / targets. Every array is indexed with the fold's row
            indices; a one-element sequence is unwrapped to the bare array
            before being passed to `fit`.
        splits : iterable of (train_indices, test_indices)
            Must be re-iterable (e.g. a list) because it is consumed once per trial.
        batch_size, epochs, callbacks : forwarded to `model.fit`.

        Raises
        ------
        ValueError
            If `splits` yields no fold (for instance an already exhausted generator).
        """
        fold_metrics = []
        model = None
        for train_indices, test_indices in splits:
            X_train, X_test = [x[train_indices] for x in X], [x[test_indices] for x in X]
            y_train, y_test = [a[train_indices] for a in y], [a[test_indices] for a in y]
            if len(X_train) < 2:
                X_train = X_train[0]
                X_test = X_test[0]
            if len(y_train) < 2:
                y_train = y_train[0]
                y_test = y_test[0]

            model = self.hypermodel.build(trial.hyperparameters)
            hist = model.fit(X_train,y_train,
                      validation_data=(X_test,y_test),
                      epochs=epochs,
                        batch_size=batch_size,
                      callbacks=callbacks)

            # Keep the value of every tracked metric at the last epoch of this fold.
            fold_metrics.append({name: values[-1] for name, values in hist.history.items()})

        if not fold_metrics:
            raise ValueError('`splits` produced no folds; pass a non-empty, re-iterable collection (e.g. a list).')

        # Report the mean over folds of each metric; the model saved for the
        # trial is the one fitted on the last fold.
        metric_names = list(fold_metrics[-1].keys())
        self.oracle.update_trial(trial.trial_id, {k:np.mean([m[k] for m in fold_metrics]) for k in metric_names})
        self.save_model(trial.trial_id, model)

### Loading the training data

In [None]:
# Run configuration.
TRAINING = False
USE_FINETUNE = True
FOLDS = 5
SEED = 42

train = pd.read_csv('../input/jane-street-market-prediction/train.csv')
#train = train.query('date > 85').reset_index(drop = True) 
# Downcast every float64 column to float32 to limit memory use.
train = train.astype(dict.fromkeys(train.select_dtypes(include='float64').columns, np.float32))
# Replace missing values with the mean of their column.
train.fillna(train.mean(),inplace=True)
#train = train.query('weight > 0').reset_index(drop = True)

# Model inputs are all columns whose name contains 'feature'.
features = [col for col in train.columns if 'feature' in col]
resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']
# Per-response threshold above which a response counts as a positive label.
EPSILON = dict.fromkeys(resp_cols, 0.0)

X = train[features].values
# Multitarget labels: one 0/1 column per entry of resp_cols.
y = np.stack([(train[col] > EPSILON[col]).astype('int') for col in resp_cols]).T

# Range of feature_64 over the training data, used later by time_bias.
f64_max = train['feature_64'].values.max()
f64_min = train['feature_64'].values.min()

# Reduce VIF
Dropping co-linear features. 

In [None]:
def calculate_vif(X,thresh=5.0,sample_size=-1,features=None):
    """Iteratively drop the column with the largest variance inflation factor (VIF).

    The VIFs are read off the diagonal of the inverse correlation matrix. While
    the largest VIF exceeds `thresh`, the corresponding column is removed and
    the VIFs of the remaining columns are recomputed. The correlation matrix of
    a column subset is the matching sub-matrix of the full correlation matrix,
    so it is computed from the data only once.

    Parameters
    ----------
    X : 2-D ndarray of shape (n_samples, n_features)
    thresh : float
        Columns are dropped until no VIF is greater than this value.
    sample_size : int
        If negative, all rows are used to estimate correlations. Otherwise that
        many rows are drawn with `np.random.choice` (default settings, i.e.
        with replacement).
    features : sequence or None
        Column names. Ignored (column indices are used instead) when None or
        when its length does not match the number of columns. Never mutated.

    Returns
    -------
    (X_reduced, kept_features)
        `X` restricted to the surviving columns (the very same object when
        nothing was dropped) and the list of surviving names / indices.
    """
    n_cols = X.shape[-1]
    if sample_size<0:
        sample = X
    else:
        sample = X[np.random.choice(len(X), size=sample_size)]

    if features is not None and len(features) == n_cols:
        features_copy = list(features)
    else:
        features_copy = list(range(n_cols))

    # atleast_2d: corrcoef collapses to a scalar for a single column.
    cc = np.atleast_2d(np.corrcoef(sample, rowvar=False))
    kept_idx = list(range(n_cols))

    # With a single column left there is nothing to be collinear with.
    while len(kept_idx) > 1:
        VIF = np.linalg.inv(cc).diagonal()
        maxloc = int(np.argmax(VIF))
        if not VIF[maxloc] > thresh:
            break
        print(f'Dropped {features_copy[maxloc]} with vif={VIF[maxloc]}')
        features_copy.pop(maxloc)
        kept_idx.pop(maxloc)
        # Remove the dropped variable's row and column from the correlation matrix.
        cc = np.delete(np.delete(cc, maxloc, axis=0), maxloc, axis=1)

    if len(kept_idx) < n_cols:
        # Single column selection instead of one full-array copy per dropped column.
        X = X[:, kept_idx]

    return X, features_copy

In [None]:
# Collinearity pruning only runs in training mode: X and features are replaced
# by the outputs of calculate_vif called with thresh=100.
if TRAINING:
    X, features = calculate_vif(X,thresh=100,features=features)

In [None]:
import gc
# Force a garbage-collection pass (presumably to release memory left over from the cells above).
gc.collect()

In [None]:
# In training mode persist the current feature list; otherwise reuse the list
# stored in the attached dataset.
if not TRAINING:
    features = pd.read_pickle('../input/jsautoencoder/features.pkl')
else:
    pd.to_pickle(features,'features.pkl')

# Column means of every selected feature except the first one.
f_mean = train[features[1:]].values.mean(axis=0)

### Creating the autoencoder. 
The autoencoder should aid in denoising the data. Based on [this](https://www.semanticscholar.org/paper/Deep-Bottleneck-Classifiers-in-Supervised-Dimension-Parviainen/fb86483f7573f6430fe4597432b0cd3e34b16e43) paper. 

In [None]:
from tensorflow.keras.optimizers import Adam
def optimizer(x):
    """Return a new `Adam` optimizer built with `x` as its only (positional) argument."""
    return Adam(x)

In [None]:

def create_encoder(hp,input_dim,output_dim):
    """Build and compile the supervised autoencoder from tuner hyperparameters.

    The network normalises and perturbs the input with Gaussian noise, passes
    it through a stack of Dense/Dropout layers and a mirrored stack (same
    units and dropout rates in reverse order), and then produces two outputs:

    * 'decoded'      - linear layer of width `input_dim`, trained with MSE;
    * 'label_output' - sigmoid layer of width `output_dim`, reached through
                       additional swish-activated layers and trained with
                       binary cross-entropy.

    Parameters
    ----------
    hp : hyperparameter container providing `Int` / `Float`.
    input_dim : int, number of input features.
    output_dim : int, number of binary targets.

    Returns
    -------
    The compiled two-output Keras model.
    """
    inputs = Input(input_dim)
    h = BatchNormalization()(inputs)
    h = GaussianNoise(hp.Float('noise',0.1,0.5,default=0.1))(h)

    # Encoding stack; each (units, dropout) pair is remembered for mirroring.
    layer_specs = []
    n_encoding = hp.Int('num_encoding_layers',1,3)
    for layer_no in range(n_encoding):
        units = hp.Int(f'encoding_units_{layer_no}',64,512)
        h = BatchNormalization()(h)
        h = Dense(units,activation='relu')(h)
        rate = hp.Float(f'encoding_dp_{layer_no}',0.1,0.5)
        h = Dropout(rate)(h)
        layer_specs.append((units,rate))

    # Mirrored stack: same widths and dropout rates, last layer first.
    for units, rate in reversed(layer_specs):
        h = BatchNormalization()(h)
        h = Dense(units,activation='relu')(h)
        h = Dropout(rate)(h)

    decoded = Dense(input_dim,activation='linear',name='decoded')(h)

    # Classification head, branching from the same hidden tensor as 'decoded'.
    n_bottleneck = hp.Int('num_bottleneck_layers',1,3)
    for layer_no in range(n_bottleneck):
        h = BatchNormalization()(h)
        h = Dense(hp.Int(f'bottleneck_units_{layer_no}',16,64))(h)
        h = Lambda(tf.keras.activations.swish)(h)
        h = Dropout(hp.Float(f'bottleneck_dp_{layer_no}',0.1,0.5))(h)

    label_output = Dense(output_dim,activation='sigmoid',name='label_output')(h)

    autoencoder = Model(inputs=inputs,outputs=[decoded,label_output])

    losses = {'decoded':'mse','label_output':'binary_crossentropy'}
    autoencoder.compile(optimizer=optimizer(1e-3),loss=losses)
    return autoencoder

### Creating the MLP. 

In [None]:
def create_model(hp,input_dim,output_dim,encoder):
    """Build and compile the MLP classifier that sits on top of `encoder`.

    The first output of `encoder` (the 'decoded' tensor for models returned by
    create_encoder) is concatenated with the raw input, followed by a tunable
    number of swish-activated Dense blocks and a sigmoid output layer.

    Parameters
    ----------
    hp : hyperparameter container providing `Int` / `Float`.
    input_dim : int, number of input features.
    output_dim : int, number of binary targets.
    encoder : callable Keras model applied to the input tensor.

    Returns
    -------
    The Keras model, compiled with label-smoothed binary cross-entropy and an
    AUC metric named 'auc'.
    """
    raw_inputs = Input(input_dim)

    denoised = encoder(raw_inputs)[0]
    h = Concatenate()([denoised,raw_inputs]) #use both raw and de-noised features
    h = BatchNormalization()(h)
    h = Dropout(hp.Float('init_dropout',0.1,0.5))(h)

    n_hidden = hp.Int('num_layers',3,6)
    for layer_no in range(n_hidden):
        h = BatchNormalization()(h)
        h = Dense(hp.Int(f'num_units_{layer_no}',16,64))(h)
        h = Lambda(tf.keras.activations.swish)(h)
        h = Dropout(hp.Float(f'dropout_{layer_no}',0.1,0.5))(h)
    predictions = Dense(output_dim,activation='sigmoid')(h)

    model = Model(inputs=raw_inputs,outputs=predictions)
    opt = optimizer(1e-3)
    loss_fn = BinaryCrossentropy(label_smoothing=hp.Float('label_smoothing',0.0,0.1))
    auc_metric = tf.keras.metrics.AUC(name = 'auc')
    model.compile(optimizer=opt,loss=loss_fn,metrics=[auc_metric])
    return model

### Running CV
Following [this notebook](https://www.kaggle.com/gogo827jz/jane-street-ffill-xgboost-purgedtimeseriescv), which uses a 5-fold PurgedGroupTimeSeriesSplit on the dates in the training data. 

We add the locked encoder as the first layer of the MLP. This seems to help in speeding up the submission rather than first predicting using the encoder then using the MLP. 

We use a Bayesian optimizer to find the optimal HPs for our model. 20 trials take about 2 hours on GPU. 

In [None]:
def utility_score_bincount(date, weight, resp, action):
    """Compute the utility score u = clip(t, 0, 6) * sum(p_i).

    p_i is the sum of weight * resp * action over all rows of date i, and
    t = sum(p_i) / sqrt(sum(p_i ** 2)) * sqrt(250 / number_of_distinct_dates).

    Parameters
    ----------
    date : array of non-negative ints (used as `np.bincount` bins).
    weight, resp, action : arrays of the same length as `date`.

    Returns
    -------
    float
        The utility score; 0.0 when every daily total is zero (e.g. no trade
        is taken), where t would otherwise be the undefined ratio 0 / 0.
    """
    count_i = len(np.unique(date))
    Pi = np.bincount(date, weight * resp * action)
    total = np.sum(Pi)
    norm = np.sqrt(np.sum(Pi ** 2))
    if norm == 0:
        # No profit or loss on any day: the score is zero rather than NaN.
        return 0.0
    t = total / norm * np.sqrt(250 / count_i)
    u = np.clip(t, 0, 6) * total
    return u

Added encoder tuning in v129. 

In [None]:
%%time
SEEDS = [123]

if TRAINING:
    for j,SEED in enumerate(SEEDS):
        set_all_seeds(SEED)
        gkf = BlockingTimeSeriesSplitWithGap(n_splits = FOLDS, gap=20, group_fold_size=150) 
        splits = list(gkf.split(y, groups=train['date'].values, test_size=0.3)) #105 days training, 20 day gap, 25 days testing
        
        model_fn = lambda hp : create_encoder(hp,len(features),y.shape[-1])
        
        tuner = CVTuner(
            hypermodel=model_fn,
            oracle=kt.oracles.BayesianOptimization(
            objective= kt.Objective('val_loss', direction='min'),
            num_initial_points=4,
            max_trials=10,
            seed=SEED),
            project_name=f'jane_street_encoder_{SEED}'
            )
        
        tuner.search((X,),(X,y),splits=splits,batch_size=4096,epochs=100,callbacks=[EarlyStopping('val_loss',patience=5),
                                                                                   ReduceLROnPlateau('val_loss',patience=3)])
        
        encoder_hp  = tuner.get_best_hyperparameters(1)[0]
        pd.to_pickle(encoder_hp,f'./best_hp_encoder_{SEED}.pkl')
        best_autoencoder = tuner.get_best_models(1)[0]
        
        model_fn = lambda hp: create_model(hp,len(features),y.shape[-1],best_autoencoder)

        tuner = CVTuner(
            hypermodel=model_fn,
            oracle=kt.oracles.BayesianOptimization(
            objective= kt.Objective('val_loss', direction='min'),
            num_initial_points=4,
            max_trials=10,
            seed=SEED),
            project_name=f'jane_street_{SEED}'
            )

        tuner.search((X,),(y,),splits=splits,batch_size=4096,epochs=100,callbacks=[EarlyStopping('val_loss',patience=5),
                                                                                   ReduceLROnPlateau('val_loss',patience=3)])
        hp  = tuner.get_best_hyperparameters(1)[0]
        pd.to_pickle(hp,f'./best_hp_{SEED}.pkl')
            
        oof = np.zeros(y.shape)
        for fold, (train_indices, test_indices) in enumerate(splits):
            X_train, X_test = X[train_indices], X[test_indices]
            y_train, y_test = y[train_indices], y[test_indices]
            
            autoencoder = create_encoder(encoder_hp,X.shape[-1],y.shape[-1])
            autoencoder.fit(X_train,(X_train,y_train),validation_data=(X_test,(X_test,y_test)),epochs=100,batch_size=4096,
                      callbacks=[EarlyStopping('val_loss',patience=10,restore_best_weights=True),
                                 ReduceLROnPlateau('val_loss',patience=5)])
            
            autoencoder.trainable=False
            
            autoencoder.save_weights(f'./autoencoder_{SEED}_{fold}.tf')
        
            
            model = create_model(hp,X.shape[-1],y.shape[-1],autoencoder)
            model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=100,batch_size=4096,
                      callbacks=[EarlyStopping('val_loss',patience=10,restore_best_weights=True),
                                 ReduceLROnPlateau('val_loss',patience=5)])
            
            model.save_weights(f'./model_{SEED}_{fold}.tf')
            model.compile(optimizer(1e-3/100),
                          loss=BinaryCrossentropy(label_smoothing=hp.get('label_smoothing')))
            model.fit(X_test,y_test,epochs=5,batch_size=8192)
            model.save_weights(f'./model_{SEED}_{fold}_finetune.tf')
else:
    models = []
    for SEED in SEEDS:
        hp = pd.read_pickle(f'../input/jsautoencoder/best_hp_encoder_{SEED}.pkl')
        autoencoder = create_encoder(hp,len(features),y.shape[-1])
        autoencoder.trainable=False
        hp = pd.read_pickle(f'../input/jsautoencoder/best_hp_{SEED}.pkl')
        for f in range(FOLDS):
            model = create_model(hp,len(features),y.shape[-1],autoencoder)
            if USE_FINETUNE:
                model.load_weights(f'../input/jsautoencoder/model_{SEED}_{f}_finetune.tf')
            else:
                model.load_weights(f'../input/jsautoencoder/model_{SEED}_{f}.tf')
            model.call = tf.function(model.call, experimental_relax_shapes=True)
            models.append(model)
    

In [None]:
#See https://www.kaggle.com/snippsy/no-gpu-time-sell-at-10-a-m-play-golf/
bias = 0.1
def time_bias(f0,f64):
    """Return a multiplicative correction that depends on feature_0's sign and feature_64.

    feature_64 is rescaled with the training range (f64_min, f64_max). For
    f0 < 0 the factor goes from 1 + bias/2 (at f64_min) down to 1 - bias/2
    (at f64_max); otherwise it goes the opposite way.
    """
    # Share of `bias` already "used up" at this feature_64 value.
    drift = bias * (f64-f64_min)/(f64_max-f64_min)
    if f0 < 0:
        return (1+bias/2) - drift
    return (1-bias/2) + drift

## Submission

In [None]:
if not TRAINING:
    import janestreet
    janestreet.competition.make_env.__called__ = False
    env = janestreet.make_env()
    th = 0.5
    #w = np.asarray([0.1,0.1,0.1,0.5,0.2])
    for (test_df, pred_df) in tqdm(env.iter_test()):
        # Rows whose weight is not positive get action 0 without running the models.
        if not test_df['weight'].item() > 0:
            pred_df.action = 0
            env.predict(pred_df)
            continue

        f0 = test_df['feature_0'].item()
        f64 = test_df['feature_64'].item()

        x_tt = test_df.loc[:, features].values
        # Replace NaNs in every column but the first with the training means.
        if np.isnan(x_tt[:, 1:].sum()):
            x_tt[:, 1:] = np.nan_to_num(x_tt[:, 1:]) + np.isnan(x_tt[:, 1:]) * f_mean

        # Average over the loaded models first, then over the targets.
        fold_preds = [model(x_tt,training=False).numpy() for model in models]
        pred = np.mean(fold_preds,axis=0).squeeze()
        tb = time_bias(f0,f64)
        pred = np.mean(pred) * tb
        pred_df.action = np.where(pred > th, 1, 0).astype(int)
        env.predict(pred_df)