In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold
import gc
from joblib import dump,load
from tqdm.auto import tqdm
pd.set_option('display.max_columns', None)

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.utils import Sequence

In [None]:
class Params:
    """Empty attribute bag: hyperparameters are attached to an instance as the notebook runs."""


# the single instance that every later cell writes its settings onto
params = Params()

In [None]:
# # loading original train.csv
# !pip install datatable > /dev/null
# import datatable as dt
# train_dt = dt.fread('../input/jane-street-market-prediction/train.csv')
# full_df = train_dt.to_pandas()

# # converting float64 columns to float32
# float64_cols=[col for col in full_df.columns if full_df[col].dtype=='float64']
# full_df[float64_cols]=full_df[float64_cols].astype('float32')

# # saving in feather format for future use
# full_df.to_feather('train.feather')

In [None]:
# read the training table from a feather file and print a summary of its columns/dtypes
# (presumably the file produced by the commented-out conversion cell above - confirm the path)
train_df=pd.read_feather('../input/js-files/train.feather')
train_df.info()

# Target Engineering

Like many others, I have noticed that treating this task as multi-label classification leads to better results compared to trying to predict just one label `resp`. I have made a couple of adjustments compared to most public notebooks:
* I do not use `resp_4` - my CV always goes down when I try to add it. This could be to some extent explained by the fact (conjecture?) that the time horizon of `resp_4` is longer than that of `resp`.
* Instead, I add the mean value of `resp`, `resp_1`, `resp_2` and `resp_3` as a separate target (stored as `resp_sum`), which does improve the CV score. This target can be thought of as a proxy for the general direction of returns over the whole `resp` time horizon.

In [None]:
# NOTE: despite the `_sum` suffix this column holds the arithmetic MEAN of resp, resp_1, resp_2
# and resp_3 (their sum divided by 4); it is added to the list of targets in the next cell.
train_df['resp_sum']=(train_df.resp+train_df.resp_1+train_df.resp_2+train_df.resp_3)/4

In [None]:
# columns the classifier is trained to predict
targets=['resp','resp_1','resp_2','resp_3','resp_sum']
# keep the raw `resp` values under another name: the utility score needs the real returns
train_df['targ']=train_df.resp
# binarise every target: True where the value is strictly positive
train_df[targets]=train_df[targets].gt(0)

# Feature Engineering

In [None]:
# original features: every column whose name contains the substring 'feature'
features = [name for name in train_df.columns if 'feature' in name]

It has been hypothesized that feature_64 represents some sort of clock:
* It's always increasing throughout the day 
* Every day it follows a very similar pattern
* There is a gap in the middle that can be interpreted as a lunch break (a feature of several Asian markets)

In [None]:
# scatter feature_64 against the row index for the rows with date == 0; the trailing ';' hides the Axes repr
train_df[train_df.date==0].feature_64.plot(style='.',xlabel='timestamp',ylabel='feature_64',title='Day 0');

This gave me an idea for a few new features:
* Binary feature representing the part of the trading day (before/after lunch)
* Number of trades suggested by the JS algorithm earlier today (for the first part of the day) or after lunch (for the second part of the day) - the intuition here is that, together with the 'clock', this feature could represent a market condition (e.g. more trade opportunities = more volatility)
* 'Gradient' of feature_64 with respect to timestamp - similar intuition to the previous point

In [None]:
# Session flag: 1.0 where feature_64 > 1, otherwise 0.0 (read by the notebook as after/before the midday gap).
train_df['part_of_day']=(train_df.feature_64>1).astype('float32')
# First ts_id of every (date, part_of_day) group. The string alias 'min' asks pandas for its own
# group-wise minimum instead of handing it Python's builtin `min`, which newer pandas versions warn about.
train_df['min_ts']=train_df.groupby(['date','part_of_day'])['ts_id'].transform('min')
# Offset of each row's ts_id from the first ts_id of its group.
train_df['trades']=train_df.ts_id-train_df.min_ts

params.grad_64_lag=50
# Row-to-row change of feature_64 ...
train_df['grad_64']=train_df.feature_64.diff(1)
# ... forced to 0 on the first row of each group, where the difference would span two groups.
train_df.loc[train_df.min_ts==train_df.ts_id,'grad_64']=0
# Sum of the last `grad_64_lag` changes; rows without a full window get 0.
# The window itself is not reset at group boundaries.
train_df['lag_64']=train_df.grad_64.rolling(params.grad_64_lag).sum().fillna(0)
train_df['lag_64']=train_df['lag_64'].astype('float32')

# helper columns are not used as features
train_df.drop(columns=['min_ts','grad_64'],inplace=True)

In [None]:
# register the three clock-derived columns as model inputs
features = [*features, 'part_of_day', 'trades', 'lag_64']

Another set of features is based on the behaviour of the binary feature_0. There is no obvious pattern there but it's also definitely not completely random. Again, my guess is that it could be indicative of some market condition - this could be used to construct lag features.

In [None]:
# running total of feature_0 over the rows with date < 100, plotted against the row index
train_df[train_df.date<100].feature_0.cumsum().plot(xlabel='timestamp',ylabel='feature_0',title='Days 0-99')

In [None]:
params.lag_features=['feature_0']
params.n_lags=20
params.lag=200
# window sizes are lag, 2*lag, ..., (n_lags-1)*lag, i.e. n_lags-1 columns in total
lag_cols=[f'lag_{step*params.lag}' for step in range(1,params.n_lags)]
for step,name in enumerate(lag_cols,start=1):
    window=step*params.lag
    # rolling sum over the last `window` rows; rows without a full window get 0
    rolled=train_df[params.lag_features].rolling(window).sum()
    train_df[name]=rolled.fillna(0).astype('float32')

Given the anonymous nature of the dataset, this whole FE logic could very easily be misguided. Still, adding these features resulted in noticeable performance gains across various models and training regimes that I tried.

In [None]:
# register the rolling-sum columns as model inputs
features = [*features, *lag_cols]

# Final Data Preprocessing

Rows with zero weight do not contribute to the competition metric calculation - I chose to discard them.

In [None]:
# keep only rows with strictly positive weight and renumber the index from 0
train_df=train_df.loc[train_df['weight'].gt(0)].reset_index(drop=True)

Finally, we need to deal with missing values:
* Replace NaNs with mean values. I tried other methods (median, `ffill`) but did not see any performance gains.
* Add new columns indicating if the replacement took place. Missing values follow a pattern - so we only need new columns for groups of features not for each individual feature with missing values.

In [None]:
# grouping features based on NaN patterns: features that share a NaN count land in the same group
tmp=train_df[features].isnull().sum()   # NaN count per feature
tmp=tmp[tmp>0].reset_index()            # keep only features that have missing values
tmp.columns=['feat','cnt']
tmp=tmp.sort_values('cnt')
# {NaN count: [feature names with exactly that many NaNs]}
feat_groups=dict(tmp.groupby('cnt')['feat'].agg(list))
feat_groups

In [None]:
# One representative feature per NaN group (any member works, the first is taken); kept on `params`
# so the same indicator columns can be rebuilt at inference. Groups with 2000 or fewer NaNs are skipped.
params.nan_cols=[members[0] for n_missing,members in feat_groups.items() if n_missing>2000]
params.nan_cols

In [None]:
# one indicator column per representative feature: 1.0 where the value is missing, else 0.0
nan_names=[f'nan_{idx}' for idx,_ in enumerate(params.nan_cols)]
train_df[nan_names]=train_df[params.nan_cols].isna().astype('float32')

In [None]:
# register the missing-value indicator columns as model inputs
features = [*features, *nan_names]

In [None]:
len(features)

In [None]:
# per-feature mean and standard deviation, taken before the NaNs are filled in the next cell
means=train_df[features].mean().astype('float32')
stds=train_df[features].std().astype('float32')

In [None]:
# impute missing values with the column mean, then standardise every feature to (x - mean) / std
# NOTE: a column whose std is 0 would come out of this division as non-finite values
train_df[features]=train_df[features].fillna(means)
train_df[features]=(train_df[features]-means)/stds

In [None]:
# persist the statistics with joblib (presumably so the same imputation/scaling can be replayed at inference)
dump(means,'means.joblib')
dump(stds,'stds.joblib');

# Model & Training

In [None]:
def create_model(n_in, n_out, layers, dropout_rate, optimizer, metrics):
    """Build and compile a fully connected multi-label classifier.

    Every hidden stage is BatchNormalization -> Dropout -> Dense -> ReLU. The
    first stage (applied to the raw inputs) uses a fixed dropout of 0.01; all
    later stages use `dropout_rate`. The head is Dense(n_out) followed by a
    sigmoid, and the model is compiled with binary cross-entropy.

    Args:
        n_in: number of input features.
        n_out: number of output units (one per target).
        layers: iterable with the width of each hidden Dense layer, in order.
        dropout_rate: dropout rate for every hidden stage except the first.
        optimizer: optimizer handed to `compile`.
        metrics: metrics handed to `compile`.

    Returns:
        The compiled Keras model.
    """
    keras_layers = tf.keras.layers

    inputs = keras_layers.Input(shape=(n_in,))
    hidden = inputs
    for depth, units in enumerate(layers):
        hidden = keras_layers.BatchNormalization()(hidden)
        # only a very light dropout is applied directly on the input features
        rate = .01 if depth == 0 else dropout_rate
        hidden = keras_layers.Dropout(rate)(hidden)
        hidden = keras_layers.Dense(units)(hidden)
        hidden = keras_layers.Activation('relu')(hidden)

    logits = keras_layers.Dense(n_out)(hidden)
    outputs = keras_layers.Activation('sigmoid')(logits)

    model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=metrics,
    )
    return model

In [None]:
def utility_score_bincount(date, weight, resp, action):
    """Compute the (t, utility) pair from per-row trading decisions.

    For each date i, p_i = sum(weight * resp * action) over the rows of that date.
    Then t = sum(p) / sqrt(sum(p**2)) * sqrt(250 / number_of_distinct_dates) and
    u = clip(t, 0, 6) * sum(p).

    Args:
        date: array of non-negative integer day ids (used as `np.bincount` bins).
        weight, resp, action: arrays with the same length as `date`.

    Returns:
        Tuple `(t, u)`. When there are no rows, or every daily p_i is 0 (for
        example when no trade is taken), `(0.0, 0.0)` is returned instead of
        NaN from a 0/0 division.
    """
    date = np.asarray(date)
    if date.size == 0:
        return 0.0, 0.0

    count_i = len(np.unique(date))
    Pi = np.bincount(date, weight * resp * action)
    total = np.sum(Pi)
    norm = np.sqrt(np.sum(Pi ** 2))
    if norm == 0:
        # no daily profit or loss at all: t is undefined, report a neutral score
        return 0.0, 0.0

    t = total / norm * np.sqrt(250 / count_i)
    u = np.clip(t, 0, 6) * total
    return t, u

Lesson learned in the MoA competition - for a multilabel classification task it might be useful to track the validation metric for each target separately because they might exhibit different overfitting behaviour. It's not possible to notice overfitting of specific targets just by looking at the combined metric. 
I implemented it via callback because I don't need this calculated for the training data, just for validation.

In [None]:
from sklearn.metrics import roc_auc_score
class ValScore(tf.keras.callbacks.Callback):
    """Print per-target ROC AUC and the utility score on the validation set after each epoch.

    Args:
        validation_data: tuple `(X_val, y_val)` of validation inputs and binary targets.
        dates: day id of every validation row.
        weights: weight of every validation row.
        targs: raw (non-binarised) response of every validation row.
    """

    def __init__(self, validation_data,dates,weights,targs):
        super().__init__()
        self.X_val, self.y_val = validation_data
        self.dates, self.weights, self.targs = dates, weights, targs

    def on_epoch_end(self, epoch, logs=None):
        # `logs` is unused; None replaces the shared mutable `{}` default
        # the whole validation set goes through the model in a single call
        y_pred=self.model(self.X_val,training = False).numpy()
        # average=None: one ROC AUC value per target column
        auc = roc_auc_score(self.y_val, y_pred,average=None)
        # trade when the mean predicted probability across all targets exceeds 0.5
        action=(y_pred.mean(1)>0.5).astype('int8')
        t,u=utility_score_bincount(self.dates,self.weights,self.targs,action)
        print(f"AUC scores: {auc}, t: {t:.2f}, Utility score: {u:.0f}")

In [None]:
### model parameters
# widths of the hidden Dense layers, in order
params.layers = [500,350,200]
# dropout used for every hidden stage except the first (see create_model)
params.dropout_rate = 0.35

###training parameters
# training batch size
params.bs = 8192
# learning rate passed to the LAMB optimizer
params.lr = 0.002
# maximum number of epochs per fold (early stopping may end training sooner)
params.epochs = 30 
# passed to LAMB as weight_decay_rate
params.wd = 0.02

### adding overall AuC as a metric
### for early stopping I only look at resp and resp_sum because they start overfitting earlier
# label_weights follows the order of `targets`: only the first (resp) and last (resp_sum) labels count
metrics = [tf.keras.metrics.AUC(label_weights=[1,0,0,0,1],name='auc_10001'),
           tf.keras.metrics.AUC(name='auc')
          ] 

In [None]:
#training loop
!mkdir models
params.n_folds=5
params.days_in_group=[50]*3 
params.n_runs=len(params.days_in_group)
pred_cols=[f'pred{i}_{j}' for i in range(params.n_runs) for j in range(len(targets))]
train_df[pred_cols]=0
train_df[pred_cols]=train_df[pred_cols].astype('float32')

for i,days_in_group in enumerate(params.days_in_group):
    gkf=GroupKFold(params.n_folds)
    params.n_groups=500//days_in_group
    train_df['group']=train_df['date']//days_in_group
    for j,splits in enumerate(gkf.split(train_df[features],train_df[targets],
                          train_df['group'])):
        
        print(f'Run:{i}, Fold:{j}')
        
        X_train, X_val = train_df.loc[splits[0], features].values, train_df.loc[splits[1], features].values
        y_train, y_val = train_df.loc[splits[0], targets].values, train_df.loc[splits[1], targets].values
      
        dates=train_df.loc[splits[1],'date'].values
        weights=train_df.loc[splits[1],'weight'].values
        targs=train_df.loc[splits[1],'targ'].values
        cbs=[ValScore((X_val, y_val),dates,weights,targs),
                 tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.3,
                              patience=3, verbose=1),
                 tf.keras.callbacks.EarlyStopping(
                                monitor='val_auc_10001', patience=5, verbose=1,
                                mode='max', restore_best_weights=True
                            )
            ]
        
        model=create_model(len(features), len(targets), params.layers, params.dropout_rate, 
                           optimizer=tfa.optimizers.Lookahead(
                               tfa.optimizers.LAMB(learning_rate=params.lr,weight_decay_rate=params.wd)
                           ),
                           metrics=metrics)
        model.fit(X_train, y_train, validation_data = (X_val, y_val), epochs = params.epochs, 
                    batch_size = params.bs, validation_batch_size=500_000,
                    callbacks = [cbs], verbose = 2)           
        model.save_weights(f'models/saved_model_{i}_{j}.hdf5')
        
        preds=model(X_val,training = False).numpy()
        train_df.loc[splits[1],[f'pred{i}_{j}' for j in range(len(targets))]]=preds
        gc.collect()
        tf.keras.backend.clear_session()      

In [None]:
# persist every hyperparameter collected on `params` during the notebook
dump(params,'params.joblib');   

In [None]:
# persist the full training frame, including the out-of-fold prediction columns filled by the training loop
dump(train_df,'train_df_oof.joblib');   