# Resilience to noise

Spawned from discussion here: https://www.kaggle.com/c/jane-street-market-prediction/discussion/203312
Using @yirun zhang's NN model as a benchmark

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os, gc

import pandas as pd
import numpy as np

import janestreet
import xgboost as xgb
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope
from scipy.optimize import curve_fit
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from joblib import dump, load

import tensorflow as tf
tf.random.set_seed(41)
import tensorflow.keras.backend as K

import tensorflow.keras.layers as layers
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from typing import List

In [None]:
def create_mlp(num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate):
    """Build and compile a fully connected sigmoid classifier.

    Architecture: Input -> BatchNorm -> Dropout(dropout_rates[0]), then for
    every entry of ``hidden_units`` a Dense -> BatchNorm -> swish -> Dropout
    stack, and finally Dense(num_labels) followed by a sigmoid activation.

    Args:
        num_columns: Number of input features.
        num_labels: Number of sigmoid outputs.
        hidden_units: Width of each hidden Dense layer, in order.
        dropout_rates: Dropout rate for the input followed by one rate per
            hidden layer (``len(hidden_units) + 1`` entries are read).
        label_smoothing: Forwarded to ``BinaryCrossentropy``.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        The compiled model; it tracks an AUC metric named ``'AUC'``.
    """
    kl = tf.keras.layers

    inputs = kl.Input(shape=(num_columns,))
    hidden = kl.BatchNormalization()(inputs)
    hidden = kl.Dropout(dropout_rates[0])(hidden)

    # dropout_rates[0] belongs to the input, so hidden layer k uses rate k.
    for rate_idx, width in enumerate(hidden_units, start=1):
        hidden = kl.Dense(width)(hidden)
        hidden = kl.BatchNormalization()(hidden)
        hidden = kl.Activation(tf.keras.activations.swish)(hidden)
        hidden = kl.Dropout(dropout_rates[rate_idx])(hidden)

    pre_activation = kl.Dense(num_labels)(hidden)
    outputs = kl.Activation('sigmoid')(pre_activation)

    model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        metrics=tf.keras.metrics.AUC(name='AUC'),
    )
    return model

In [None]:
def create_mlp_wrapper(train, params, filename, features, n_splits=5):
    """Train one MLP per date-grouped fold and score it under feature noise.

    For every fold of a ``GroupKFold`` split (grouped by ``train['date']``)
    a model from ``create_mlp`` is fitted on ``train['action']``. Then, one
    feature at a time, that validation column is randomly permuted and the
    model is re-scored, giving a per-fold, per-feature ROC AUC and utility.

    Args:
        train: DataFrame holding the ``features`` columns plus ``resp``,
            ``date``, ``weight`` and ``action``.
        params: Dict with ``batch_size``, ``hidden_unit_1..4``,
            ``dropout_rate_1..5``, ``label_smoothing`` and ``learning_rate``.
        filename: Tag embedded in the checkpoint path
            ``JSModel_{filename}_{fold}.hdf5``.
        features: Names of the input columns, in model-input order.
        n_splits: Number of folds (default 5, the original behaviour).

    Returns:
        Tuple ``(oof, scores_noisy, utility_noisy)``: ``oof`` is a list with
        one array of validation predictions per fold; the two DataFrames are
        indexed by fold with one column per feature.
    """
    batch_size = params['batch_size']
    hidden_units = [
        params['hidden_unit_1'], params['hidden_unit_2'],
        params['hidden_unit_3'], params['hidden_unit_4'],
    ]
    dropout_rates = [
        params['dropout_rate_1'], params['dropout_rate_2'], params['dropout_rate_3'],
        params['dropout_rate_4'], params['dropout_rate_5'],
    ]
    label_smoothing = params['label_smoothing']
    learning_rate = params['learning_rate']

    oof = []
    gkf = GroupKFold(n_splits=n_splits)
    # Float dtype so that later .mean()/.corr() work on numbers rather than
    # on generic Python objects.
    utility_noisy = pd.DataFrame(columns=features, index=range(n_splits), dtype=float)
    scores_noisy = pd.DataFrame(columns=features, index=range(n_splits), dtype=float)
    # Unseeded generator: the permutations differ from run to run.
    rng = np.random.default_rng()

    resp_values = train['resp'].values
    for fold, (tr, te) in enumerate(gkf.split(resp_values, resp_values, train['date'].values)):
        # Slice the frame once per fold; these lookups do not depend on the
        # feature being permuted below.
        tr_rows, val_rows = train.iloc[tr], train.iloc[te]
        X_tr, X_val = tr_rows[features].values, val_rows[features].values
        y_tr, y_val = tr_rows['action'].values, val_rows['action'].values
        val_date = val_rows['date'].values
        val_weight = val_rows['weight'].values
        val_resp = val_rows['resp'].values

        ckp_path = f'JSModel_{filename}_{fold}.hdf5'
        model = create_mlp(X_tr.shape[1], 1, hidden_units, dropout_rates, label_smoothing, learning_rate)
        rlr = ReduceLROnPlateau(monitor='val_AUC', factor=0.1, patience=3, verbose=0,
                                min_delta=1e-4, mode='max')
        ckp = ModelCheckpoint(ckp_path, monitor='val_AUC', verbose=0,
                              save_best_only=True, save_weights_only=True, mode='max')
        es = EarlyStopping(monitor='val_AUC', min_delta=1e-4, patience=7, mode='max',
                           baseline=None, restore_best_weights=True, verbose=0)
        model.fit(X_tr, y_tr, validation_data=(X_val, y_val), epochs=1000,
                  batch_size=batch_size, callbacks=[rlr, ckp, es], verbose=0)

        for i, feature in enumerate(features):
            gc.collect()
            X_val_noisy = X_val.copy()
            X_val_noisy[:, i] = rng.permutation(X_val_noisy[:, i])
            pred = model.predict(X_val_noisy, batch_size=batch_size * 4).ravel()
            noisy_action = (pred > .5).astype(int)
            # NOTE(review): this AUC is computed on thresholded 0/1 actions,
            # unlike the per-fold AUC printed below, which uses the raw
            # predicted probabilities.
            scores_noisy.loc[fold, feature] = roc_auc_score(y_val, noisy_action)
            utility_noisy.loc[fold, feature] = utility_score(val_date, val_weight, val_resp, noisy_action)

        oof.append(model.predict(X_val, batch_size=batch_size * 4).ravel())
        score = roc_auc_score(y_val, oof[fold])
        print(f'Fold {fold} ROC AUC:\t', round(score, 4))

        # Release the Keras graph and the model before the next fold.
        K.clear_session()
        del model
        gc.collect()

    return oof, scores_noisy, utility_noisy


In [None]:
def utility_score(date, weight, resp, action):
    """Compute the utility of a set of trade decisions.

    Per-date profit is ``Pi = sum(weight * resp * action)`` over the rows of
    that date. With ``t = sum(Pi) / sqrt(sum(Pi**2)) * sqrt(250 / n_dates)``
    the utility is ``clip(t, 0, 6) * sum(Pi)``.

    Args:
        date: Non-negative integer date id per row (used as bincount bins).
        weight: Per-row weight.
        resp: Per-row response.
        action: Per-row decision (0 or 1).

    Returns:
        The utility as a float. ``0.0`` is returned when there are no rows
        or when every per-date profit is zero (e.g. no trade was taken),
        instead of the NaN / ZeroDivisionError the bare formula produces.
    """
    date = np.asarray(date)
    if date.size == 0:
        return 0.0

    row_pnl = np.asarray(weight) * np.asarray(resp) * np.asarray(action)
    Pi = np.bincount(date, row_pnl)
    total = np.sum(Pi)
    norm = np.sqrt(np.sum(Pi ** 2))
    if norm == 0:
        # All daily profits are zero: t would be 0/0, utility is zero.
        return 0.0

    count_i = len(np.unique(date))
    t = total / norm * np.sqrt(250 / count_i)
    return np.clip(t, 0, 6) * total

In [None]:
# Load the competition training set and collect the model input columns.
train = pd.read_csv('../input/jane-street-market-prediction/train.csv')
features = [col for col in train.columns if 'feature' in col]

# Column means are taken over the whole file (before the weight filter
# below); the first feature column is left out of the imputation.
f_mean = train[features[1:]].mean()

# Keep only rows with positive weight, then fill NaNs with those means.
train = train.loc[train['weight'] > 0].reset_index(drop=True)
train[features[1:]] = train[features[1:]].fillna(f_mean)

# Binary target: 1 when resp is positive, else 0.
train['action'] = (train['resp'] > 0).astype('int')

# Hyper-parameters consumed by create_mlp_wrapper.
params = {
    'batch_size': 4096,
    'dropout_rate_1': 0.08949964830076425,
    'dropout_rate_2': 0.09214922878769727,
    'dropout_rate_3': 0.10007316489415027,
    'dropout_rate_4': 0.2416444299467454,
    'dropout_rate_5': 0.1338760637313039,
    'hidden_unit_1': 245,
    'hidden_unit_2': 678,
    'hidden_unit_3': 759,
    'hidden_unit_4': 471,
    'label_smoothing': 0.0007423928500131863,
    'learning_rate': 0.0003772663214527269,
}

In [None]:
# Free memory before and after the (long) cross-validated training run.
gc.collect()
oof_pred, scores_noisy, utility_noisy = create_mlp_wrapper(
    train=train,
    params=params,
    filename='filename',
    features=features,
)
gc.collect()

In [None]:
def mutate(x, p, rng=None):
    """Flip a random fraction ``p`` of the binary entries of ``x`` in place.

    ``int(len(x) * p)`` distinct positions are drawn without replacement;
    at each of them a 1 becomes 0 and any other value becomes 1.

    Args:
        x: 1-D NumPy array of 0/1 actions. It is modified in place.
        p: Fraction of entries to flip, within [0, 1].
        rng: Optional ``numpy.random.Generator`` (or ``RandomState``) used
            for sampling, to make the result reproducible. When omitted the
            global ``np.random`` state is used, as before.

    Returns:
        The same array object ``x``, after mutation.

    Raises:
        ValueError: If ``p`` is outside [0, 1].
    """
    if not 0 <= p <= 1:
        raise ValueError(f'p must be within [0, 1], got {p}')

    sampler = np.random if rng is None else rng
    n_rows = x.shape[0]
    mutate_idx = sampler.choice(n_rows, int(n_rows * p), replace=False)
    x[mutate_idx] = np.where(x[mutate_idx] == 1, 0, 1)
    return x

In [None]:
# Rebuild the same date-grouped folds used for training so that
# oof_pred[fold] lines up with the validation rows of that fold.
gkf = GroupKFold(n_splits = 5)
utilities = []
# Mutation probabilities swept for every fold; identical for all folds, so
# built once. `ps` is also read by the curve-fitting/plotting code.
ps = np.linspace(0.0, 0.7, num=100)
for fold, (tr, te) in enumerate(gkf.split(train['resp'].values, train['resp'].values, train['date'].values)):
    # Per-fold lookups are hoisted out of the 100-step probability sweep.
    fold_rows = train.iloc[te]
    fold_date = fold_rows['date'].values
    fold_weight = fold_rows['weight'].values
    fold_resp = fold_rows['resp']
    oof_pred_, oof_real = oof_pred[fold], fold_rows['action'].values
    fold_action = (oof_pred_ > 0.5).astype(int)

    fold_utilities = []
    for p in ps:
        # mutate() flips entries in place, so give it a fresh copy each time.
        mutated_pred = mutate(fold_action.copy(), p)
        fold_utilities.append(utility_score(fold_date, fold_weight, fold_resp, mutated_pred))
    utilities.append(fold_utilities)

In [None]:
def sigmoid(x, L ,x0, k, b):
    """Evaluate a descending logistic curve at ``x``.

    Computes ``b - L / (1 + exp(-k * (x - x0)))``: the curve starts near
    ``b`` and drops by ``L`` around the midpoint ``x0`` with steepness ``k``
    (for positive ``k``).
    """
    decay = np.exp(-k * (x - x0))
    return b - L / (1 + decay)

In [None]:
# One figure per fold: utility versus mutation probability, with a fitted
# descending sigmoid drawn on top as a visual guide.
for i, fold_utilities in enumerate(utilities):
#     https://stackoverflow.com/questions/55725139/fit-sigmoid-function-s-shape-curve-to-data-using-python
    # Initial guess for (L, x0, k, b): drop and offset equal to the maximum
    # utility, midpoint at the median probability, steepness 20.
    p0 = [max(fold_utilities), np.median(ps),20,max(fold_utilities)]

    fig, ax = plt.subplots()
    ax.scatter(ps, fold_utilities)
    try:
        popt, pcov = curve_fit(sigmoid, ps, fold_utilities, p0, method='dogbox')
    except RuntimeError:
        # The curve is only a visual aid; a fit that does not converge must
        # not stop the raw points (or the remaining folds) from being shown.
        print(f'Fold {i+1}: sigmoid fit did not converge, plotting raw points only')
    else:
        ax.plot(ps, sigmoid(ps, *popt), 'g--')
    ax.set_xlabel('probability of mutation')
    ax.set_ylabel('utility')
    ax.set_title(f'Fold {i+1} resilience to noise')


## Methods

We randomly mutate the predicted best actions, flipping some percentage of them (1 becomes 0, and 0 becomes 1). We then plot the resulting utility against the mutation probability. An S-curve is fit for visual clarity.

## Results

We see most utility fully decay at upwards of a 40% chance of mutation. Before then, utility declines steadily and linearly as the probability of mutation increases.

## Discussion

Small perturbations in the output of our actions, <1%, have a noticeable effect on our utility score. Radically large perturbations can completely decay our utility.

### Future work

- investigate the relationship between weight and utility decay from noise
- investigate feature dependence for neural net models. 

In [None]:
# Mean ROC AUC per permuted feature, relative to the highest feature mean
# and sorted ascending, so the features whose permutation hurts most come first.
auc_drop_sorted = (scores_noisy.mean() - scores_noisy.mean().max()).sort_values()

# Full ranking, then the five largest and the five smallest decreases.
for auc_view in (auc_drop_sorted, auc_drop_sorted[:5], auc_drop_sorted[-5:]):
    auc_view.plot.bar()
    plt.ylabel('ROC AUC decrease from noise')
    plt.show()

In [None]:
# Mean utility per permuted feature, relative to the highest feature mean
# and sorted ascending, so the features whose permutation hurts most come first.
utility_drop_sorted = (utility_noisy.mean() - utility_noisy.mean().max()).sort_values()

# Full ranking, then the five largest and the five smallest decreases.
for utility_view in (utility_drop_sorted, utility_drop_sorted[:5], utility_drop_sorted[-5:]):
    utility_view.plot.bar()
    plt.ylabel('utility decrease from noise')
    plt.show()

In [None]:
# Correlation, across features, between the utility decrease and the
# ROC AUC decrease caused by permuting each feature.
utility_drop_for_corr = (utility_noisy.mean() - utility_noisy.mean().max()).sort_values()
auc_drop_for_corr = (scores_noisy.mean() - scores_noisy.mean().max()).sort_values()
print(utility_drop_for_corr.corr(auc_drop_for_corr))

In [None]:
# One point per feature: utility decrease (x) against ROC AUC decrease (y).
utility_delta = utility_noisy.mean() - utility_noisy.mean().max()
auc_delta = scores_noisy.mean() - scores_noisy.mean().max()
plt.scatter(utility_delta, auc_delta)
plt.xlabel('utility decrease from noise')
plt.ylabel('ROC AUC decrease from noise')

## Methods

We randomly permute all the values in one feature column at a time and feed the result to a trained prediction model. We then graph the relative decrease in utility and in ROC AUC for each feature. We also scatter-plot the decreases in utility against the decreases in ROC AUC.

## Results

We see the random permutations of features 39, 0, 44, 3, and 41 as the most detrimental to our ROC AUC score, and features 46, 50, 80, 86, and 30 as the least detrimental.
We see the random permutations of features 3, 83, 39, 44, and 8 as the most detrimental to our utility score, and features 72, 21, 70, 17, and 35 as the least detrimental.

## Discussion

The neural net model appears to rely heavily on the inputs of 3, 39, and 44, as permuting these features resulted in large decreases in both ROC AUC and utility.
ROC AUC and utility are loosely tied.