<h1><center>Gaussian Noise Augmentation Experiment</center></h1>

Originally based on this excellent work by Stanley Zheng:
https://www.kaggle.com/stanleyjzheng/baseline-nn-with-k-folds


This experiment builds on a previously shared public notebook (public LB 0.01901) and adds Gaussian noise to each non-categorical input feature, with a noise standard deviation proportional to that feature's standard deviation, using the Keras GaussianNoise layer. Noise at 0% (control), 10%, 20%, 30%, 40%, 50%, 60% and 70% of the original feature standard deviation is implemented and evaluated using 2 repeats of 5-fold CV. To assess the viability of this noise augmentation, the OOF log loss is compared across intensities.

Initial experiments indicated small improvements on CV OOF prediction loss but mixed results on public LB.

### Import Libraries

In [None]:

import numpy as np 
import pandas as pd 
import os
import math
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
 
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow.keras.models as M

### Read Data

In [None]:
# Load the competition inputs: train/test features, the scored training
# targets, and the sample submission that serves as the prediction template.
train_df = pd.read_csv("../input/lish-moa/train_features.csv")
test_df = pd.read_csv("../input/lish-moa/test_features.csv")
train_target_df = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
sub = pd.read_csv("../input/lish-moa/sample_submission.csv")

# Every column after the first one in the targets table is treated as a
# target label (the first column is presumably the `sig_id` row identifier).
target_cols = train_target_df.columns[1:]

In [None]:
# Preview the first rows of the training features.
train_df.head()

In [None]:
# Preview the first rows of the sample submission.
sub.head()

### Basic Setup and Helpers

In [None]:
# Seed handed to seed_everything() before the experiment loop starts.
SEED = 9876
# Number of training epochs for each fold model.
EPOCHS = 30
# Mini-batch size used by model.fit().
BATCH_SIZE = 128
# Number of cross-validation folds per repeat.
FOLDS = 5
# How many times the full K-fold CV is repeated for each noise setting.
REPEATS = 2
# Learning rate of the Adam optimizer wrapped by Lookahead.
LR = 0.0005
# Number of target columns; sets the width of the model's output layer.
N_TARGETS = len(target_cols)

In [None]:
def seed_everything(seed):
    """Apply one seed to PYTHONHASHSEED, TensorFlow and NumPy.

    Args:
        seed: Integer seed used for every random source handled here.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)

In [None]:
def multi_log_loss(y_true, y_pred):
    """Return the log loss averaged over all target columns.

    Each column is scored independently as a binary problem and the
    per-column losses are averaged with equal weight.

    Args:
        y_true: DataFrame of 0/1 labels, one column per target.
        y_pred: DataFrame of predicted probabilities containing (at least)
            the same columns and rows as ``y_true``.

    Returns:
        The mean of the per-column log losses.
    """
    # `labels` is passed explicitly so a column whose true values are all 0
    # (or all 1) is still scored instead of making log_loss raise because it
    # cannot infer both classes from y_true.
    column_losses = [
        log_loss(y_true.loc[:, col], y_pred.loc[:, col], labels=[0, 1])
        for col in y_true.columns
    ]
    return np.mean(column_losses)

### Encode Categoricals to Binary

In [None]:
def preprocess_df(data):
    """Binary-encode the `cp_type` and `cp_dose` columns.

    `cp_type` becomes 1 where it equals 'trt_cp' and 0 otherwise;
    `cp_dose` becomes 1 where it equals 'D2' and 0 otherwise. All other
    columns are passed through unchanged.

    Args:
        data: DataFrame containing `cp_type` and `cp_dose` columns.

    Returns:
        A new DataFrame with the two columns encoded as integers.
    """
    # Work on a copy: encoding in place would silently turn every value into
    # 0 if the same frame were ever preprocessed a second time (e.g. when a
    # notebook cell is re-run), because the string comparison no longer matches.
    encoded = data.copy()
    encoded['cp_type'] = (encoded['cp_type'] == 'trt_cp').astype(int)
    encoded['cp_dose'] = (encoded['cp_dose'] == 'D2').astype(int)
    return encoded

In [None]:
# Remove the row identifier, then binary-encode the categorical columns.
x_train = preprocess_df(train_df.drop("sig_id", axis=1))
x_test = preprocess_df(test_df.drop("sig_id", axis=1))
y_train = train_target_df.drop("sig_id", axis=1)
# Number of input features; sets the width of the model's input layer.
N_FEATURES = len(x_train.columns)

### Define Model Architecture

In [None]:
def create_model():
    """Build and compile the dense network trained on every fold.

    The architecture is: input -> per-feature Gaussian noise -> batch norm ->
    two weight-normalized 3072-unit ReLU layers (each followed by batch norm
    and 40% dropout) -> weight-normalized sigmoid output with one unit per
    target.

    Reads the module-level ``N_FEATURES``, ``N_TARGETS``, ``LR`` and
    ``NOISE_STD_VECTOR`` at call time, so ``NOISE_STD_VECTOR`` must be
    assigned (one standard deviation per input feature) before calling.

    Returns:
        A compiled ``tf.keras.Sequential`` model using binary cross-entropy
        loss and a Lookahead-wrapped Adam optimizer.
    """
    model = tf.keras.Sequential([
        # Explicit 1-tuple shape instead of a bare int.
        tf.keras.layers.Input(shape=(N_FEATURES,)),
        # Per-feature noise std; per the Keras docs this layer is only
        # active during training, so validation/prediction inputs are clean.
        tf.keras.layers.GaussianNoise(NOISE_STD_VECTOR),
        tf.keras.layers.BatchNormalization(),
        tfa.layers.WeightNormalization(tf.keras.layers.Dense(3072, activation="relu")),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.4),
        tfa.layers.WeightNormalization(tf.keras.layers.Dense(3072, activation="relu")),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.4),
        tfa.layers.WeightNormalization(tf.keras.layers.Dense(N_TARGETS, activation="sigmoid")),
    ])
    # `learning_rate` replaces the deprecated `lr` alias.
    optimizer = tfa.optimizers.Lookahead(
        tf.optimizers.Adam(learning_rate=LR), sync_period=10
    )
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=["accuracy"])
    return model

### Main CV and Model Training Function

In [None]:
def build_train(resume_models = None, repeat_number = 0, folds = 5, skip_folds = 0):
    """Train one model per CV fold and collect out-of-fold predictions.

    Uses the module-level ``x_train`` / ``y_train`` frames and
    ``create_model()``. For each fold the weights with the lowest validation
    loss are checkpointed to disk and reloaded before predicting.

    Args:
        resume_models: Currently unused; kept for interface compatibility.
        repeat_number: Index of the current repeat; only used to name the
            checkpoint files.
        folds: Number of K-fold splits.
        skip_folds: Currently unused; kept for interface compatibility.

    Returns:
        A tuple ``(models, oof_preds)``: the list of trained models (one per
        fold, best weights loaded) and a DataFrame shaped like ``y_train``
        in which each row holds the prediction of the model that did not
        train on that row.
    """
    models = []
    # Start from a float copy so float predictions can be written into it
    # without dtype coercion in case the label columns are integer-typed.
    oof_preds = y_train.astype('float64')

    # Materialize the arrays once instead of on every fold.
    features = x_train.values
    targets = y_train.values

    # No random_state is given, so the shuffle presumably draws from NumPy's
    # global RNG: repeats get different splits while staying reproducible
    # under a single global seed.
    kfold = KFold(folds, shuffle = True)
    for fold, (train_ind, val_ind) in enumerate(kfold.split(x_train)):
        print('-'*50)
        print(f'Training fold {fold + 1}')

        cb_lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(monitor = 'val_loss', factor = 0.4, patience = 3, verbose = 0, min_delta = 0.0001, mode = 'auto')
        checkpoint_path = f'repeat:{repeat_number}_Fold:{fold}.hdf5'
        cb_checkpt = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, monitor = 'val_loss', verbose = 0, save_best_only = True, save_weights_only = True, mode = 'min')

        model = create_model()
        model.fit(features[train_ind],
              targets[train_ind],
              validation_data=(features[val_ind], targets[val_ind]),
              callbacks = [cb_lr_schedule, cb_checkpt],
              epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=0
             )
        # Restore the best-validation-loss weights saved by the checkpoint.
        model.load_weights(checkpoint_path)
        # KFold yields positional indices, so write with .iloc; .loc would
        # only be correct while the frame has a default RangeIndex.
        oof_preds.iloc[val_ind, :] = model.predict(features[val_ind])
        models.append(model)

    return models, oof_preds

## Define our Experiment Values

In [None]:
# Noise intensities to compare, each a multiplier applied to the per-feature
# standard deviation; 0 is the no-noise control.
noise_factors = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]

In [None]:
# get the standard deviation of the numeric features
# (the first 3 columns are excluded; they receive no noise below)
feat_stds = x_train[x_train.columns[3:]].std(axis=0).values
# Per-setting results, appended in the same order as `noise_factors`.
losses_ind = []       # list of per-repeat OOF losses for each setting
losses_mean = []      # OOF loss of the repeat-averaged predictions
losses_mean_adj = []  # same, after zeroing predictions for control rows

test_pred_all = []    # averaged test predictions for each setting

# seed everything
seed_everything(SEED)

for j, fac in enumerate(noise_factors):
    print('\n')
    print('#'*50)
    print(f"Training with Setting {j+1}, Noise Factor: {fac}")
    # Do not add noise in the first 3 columns
    NOISE_STD_VECTOR = [0, 0, 0]
    # noise std is a function of feature std
    # (create_model() reads this module-level name when it builds the model)
    NOISE_STD_VECTOR.extend(feat_stds * fac)

    oof_preds = []
    models = []

    for i in range(REPEATS):
        print(f"Starting Repeat {i+1}")
        repeat_models, oof = build_train(repeat_number = i, folds=FOLDS)
        models.extend(repeat_models)
        oof_preds.append(oof)
        # Do not keep a second reference to this repeat's models around.
        del repeat_models

    print("Finished training.")

    # Float accumulator for the repeat-averaged OOF predictions.
    mean_oof_preds = y_train.astype('float64')
    mean_oof_preds.loc[:, target_cols] = 0.0
    rep_losses = []
    print(f"OOF Results for Setting {j+1}, Noise Factor: {fac}")
    for i, p in enumerate(oof_preds):
        loss = multi_log_loss(y_train, p)
        print(f"Repeat {i + 1} OOF Log Loss: {loss}")
        rep_losses.append(loss)
        mean_oof_preds.loc[:, target_cols] += p[target_cols]
    losses_ind.append(rep_losses)

    mean_oof_preds.loc[:, target_cols] /= len(oof_preds)
    loss_mean = multi_log_loss(y_train, mean_oof_preds)
    # Reuse the value computed above rather than scoring every column twice.
    print(f"Mean OOF Log Loss: {loss_mean}")
    losses_mean.append(loss_mean)
    # "ctl adjusted": force predictions to 0 on rows where cp_type == 0.
    mean_oof_preds.loc[x_train['cp_type'] == 0, target_cols] = 0
    loss_mean_adj = multi_log_loss(y_train, mean_oof_preds)
    print(f"Mean OOF Log Loss (ctl adjusted): {loss_mean_adj}\n")
    losses_mean_adj.append(loss_mean_adj)

    print(f"Making test predicitons:")
    # Average the test predictions of every model trained for this setting.
    test_preds = sub.copy()
    test_preds[target_cols] = 0.0
    for model in models:
        test_preds.loc[:,target_cols] += model.predict(x_test)
    test_preds.loc[:,target_cols] /= len(models)
    test_preds.loc[x_test['cp_type'] == 0, target_cols] = 0
    test_pred_all.append(test_preds)

    print("Freeing Memory.")
    # Empty the list so the models become collectable; `del` on a loop
    # variable only unbinds that name and leaves the list's references intact.
    models.clear()
    K.clear_session()

    print('#'*50)


## Results Table and Graph

In [None]:
# One row per noise factor: per-repeat losses, the loss of the
# repeat-averaged predictions, and its control-adjusted counterpart.
pd.DataFrame({
    'Noise Factor': noise_factors,
    'Loss (each repeat)': losses_ind,
    'Losses (mean pred.)': losses_mean,
    'Losses (mean pred. adj.)': losses_mean_adj,
})

In [None]:
# Bar chart of the control-adjusted mean OOF loss for each noise factor.
plt.figure(figsize=(12, 8))
low = min(losses_mean_adj)
high = max(losses_mean_adj)
# Pad the y-axis around the observed range (2x the spread below, 0.5x above)
# so the differences between bars are visible.
spread = high - low
plt.ylim(round(low - 2 * spread, 5), round(high + 0.5 * spread, 5))
factor_labels = [f'{x:.2f}' for x in noise_factors]
plt.bar(factor_labels, losses_mean_adj)
plt.xlabel('Noise Factor')
plt.ylabel(f'Log Loss (adjusted {FOLDS} by {REPEATS} mean)')
plt.title('Gaussian Noise Factor vs Mean OOF Prediction Log Loss')

OOF prediction loss results indicate the best performance at a noise level of approximately 40% of the feature standard deviation. This approach definitely has some potential value. Future tests will need to confirm that the OOF prediction improvements translate to the LB.

### Make Submission Using All Models from All Noise Settings

In [None]:
# Blend: average the per-setting test predictions, force rows with
# cp_type == 0 to zero, and write the submission file.
test_preds = sub.copy()
test_preds[target_cols] = 0
n_settings = len(test_pred_all)
for setting_preds in test_pred_all:
    test_preds.loc[:, target_cols] += setting_preds[target_cols]
test_preds.loc[:, target_cols] /= n_settings
is_control = x_test['cp_type'] == 0
test_preds.loc[is_control, target_cols] = 0
test_preds.to_csv('submission.csv', index=False)