# Mechanisms of Action - ElasticNet

In this notebook we train a deep neural network with elastic net (combined L1 and L2) regularization.

In [None]:
# Walk the Kaggle input directory and print the full path of every file in it.
import os
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import VarianceThreshold

from keras import Sequential
from keras.backend import clear_session
from keras.layers import Dense, Dropout, BatchNormalization, Input
from keras.optimizers import Adam
from keras.losses import BinaryCrossentropy

from tensorflow_addons.layers import WeightNormalization
from tensorflow.keras import callbacks

In [None]:
#load data
# Read the three training tables from the Kaggle input directory: the input
# features, the scored targets and the non-scored targets.
train_features = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
train_targets = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
train_non_scored = pd.read_csv('/kaggle/input/lish-moa/train_targets_nonscored.csv')

# Preview the first rows of the scored targets.
train_targets.head()

In [None]:
# Report the (rows, columns) shape of each training table.
shape_report = [
    ('Features shape:', train_features),
    ('Scored targets shape:', train_targets),
    ('Non-scored targets shape:', train_non_scored),
]
for label, frame in shape_report:
    print(label, frame.shape)

In [None]:
# Index all three tables by `sig_id` so the identifier is no longer a data column.
train_features, train_targets, train_non_scored = (
    frame.set_index('sig_id')
    for frame in (train_features, train_targets, train_non_scored)
)

## Data preparation
Preprocessing used:
* Statistical features
* Variance threshold
* Scaling
* PCA
* Autoencoder features


In [None]:
#features are in categories
# Group the columns by name prefix: 'g-' columns, 'c-' columns, and the rest.
g_features = [col for col in train_features.columns if col.startswith('g-')]
c_features = [col for col in train_features.columns if col.startswith('c-')]
prefixed_features = set(g_features) | set(c_features)
other_features = [col for col in train_features.columns if col not in prefixed_features]

In [None]:
#encode binary features
# Replace the two-valued string columns with 0/1 integer codes.
cp_type_codes = {'trt_cp' : 0, 'ctl_vehicle' : 1}
cp_dose_codes = {'D1' : 0, 'D2' : 1}

train_features['cp_type'] = train_features['cp_type'].map(cp_type_codes)
train_features['cp_dose'] = train_features['cp_dose'].map(cp_dose_codes)

In [None]:
# Working aliases for the design matrix and the two target tables.
X = train_features
y = train_targets
y_non_scored = train_non_scored

# One-hot encode `cp_time`. The dtype is pinned explicitly because the default
# changed from uint8 to bool in pandas 2.0; bool columns mixed with float
# features make `X.to_numpy()` return an object array, which Keras cannot
# train on.
X = pd.get_dummies(X, columns = ['cp_time'], dtype = 'uint8')

### Statistical features

In [None]:
# Row-wise summary statistics computed across the 'g-' columns.
g_block = X[g_features]
X['g_std'] = g_block.std(axis = 1)
X['g_var'] = g_block.var(axis = 1)
X['g_skew'] = g_block.skew(axis = 1)
X['g_kurt'] = g_block.kurt(axis = 1)

In [None]:
# Row-wise summary statistics computed across the 'c-' columns.
c_block = X[c_features]
X['c_std'] = c_block.std(axis = 1)
X['c_var'] = c_block.var(axis = 1)
X['c_skew'] = c_block.skew(axis = 1)
X['c_kurt'] = c_block.kurt(axis = 1)

In [None]:
# Names of the eight row-statistic columns added to X in the two cells above.
stat_features = ['g_std', 'g_var', 'g_skew', 'g_kurt', 'c_std', 'c_var', 'c_skew', 'c_kurt']

### Variance Threshold

In [None]:
# Fit a variance filter (threshold 0.85) over the g-/c- features and the row
# statistics; the resulting support mask marks which columns are kept.
selector_cols = g_features + c_features + stat_features
selector = VarianceThreshold(threshold = 0.85)
selector.fit(X[selector_cols])

In [None]:
# Collect the columns rejected by the variance filter and remove them from X.
# `drop` is reused later to remove the same columns from the test set.
rejected_positions = np.flatnonzero(~selector.get_support())
drop = [selector_cols[pos] for pos in rejected_positions]
X = X.drop(columns = drop)

In [None]:
#update features
# Keep only the feature names that survived the variance threshold.
# The lists are built by filtering the original ordered lists instead of via a
# set intersection: set iteration order for strings depends on the hash seed,
# so the column order fed to the scaler, PCA and autoencoder would otherwise
# change from one kernel run to the next.
surviving_cols = set(X.columns)
g_features_train = [col for col in g_features if col in surviving_cols]
c_features_train = [col for col in c_features if col in surviving_cols]
stat_features_train = [col for col in stat_features if col in surviving_cols]
X.shape

### Scaling

In [None]:
#rescaling
# Standardise the surviving g-/c-/statistic columns. The scaler is fitted on
# the training frame only; the fitted `scaler` is reused for the test set.
scale_cols = g_features_train + c_features_train + stat_features_train
scaler = StandardScaler().fit(X[scale_cols])
X[scale_cols] = scaler.transform(X[scale_cols])

In [None]:
#rescaling
# The features were already standardised, and `scaler` fitted, in the previous
# cell. This cell used to fit a second StandardScaler on the already-scaled
# data, which replaced `scaler` with a near-identity transform (mean ~0,
# scale ~1). The test set is later transformed with `scaler`, so it would have
# been left effectively unscaled while the training set was scaled.
# The re-fit is therefore removed; only the column list and shape check remain.
scale_cols = g_features_train + c_features_train + stat_features_train

X.shape

### PCA

In [None]:
#extract PCA features
# Project the scaled g-/c- columns onto 100 principal components and append
# the projections to X as pca_0 ... pca_99. The fitted `pca` is reused for the
# test set.
n_components = 100
pca_cols = [f'pca_{idx}' for idx in range(n_components)]

pca_input_cols = g_features_train + c_features_train
pca = PCA(n_components = n_components)
pca_features = pca.fit_transform(X[pca_input_cols])

X[pca_cols] = pca_features

### Autoencoder

In [None]:
#train an autoencoder then use the output of its encoder to extract features
# The autoencoder is fed the surviving g- and c- columns (not the statistic,
# PCA or one-hot columns).
auto_input_cols = g_features_train + c_features_train
# Display the number of autoencoder input features.
len(auto_input_cols)

In [None]:
# Autoencoder: inputs -> 400 -> 100 (bottleneck) -> 400 -> inputs.
# The final layer is linear and the model is trained on reconstruction MSE.
n_auto_inputs = len(auto_input_cols)

auto = tf.keras.Sequential()
auto.add(tf.keras.layers.Dense(400, activation = 'relu', input_shape = (n_auto_inputs,)))
auto.add(tf.keras.layers.Dense(100, activation = 'relu'))
auto.add(tf.keras.layers.Dense(400, activation = 'relu'))
auto.add(tf.keras.layers.Dense(n_auto_inputs))

auto.compile(loss = tf.keras.losses.MeanSquaredError(),
             optimizer = tf.keras.optimizers.Adam())

auto.summary()

In [None]:
# Train the autoencoder to reconstruct its own input: the same frame is passed
# as both the features and the target.
auto.fit(X[auto_input_cols], X[auto_input_cols],
        epochs = 80,
        batch_size = 128)

In [None]:
#get the auto features from the encoder output
# Expose the 100-unit bottleneck (auto.layers[1]) through an explicit sub-model.
# This replaces the legacy `tf.keras.backend.function(...)([inputs, 1])` call:
# the trailing `1` is a TF1-style learning-phase flag, not a model input, and
# passing it as an extra input is not part of the TF2 API.
encoder = tf.keras.Model(inputs = auto.input, outputs = auto.layers[1].output)
auto_features = encoder.predict(X[auto_input_cols].to_numpy(), verbose = 0)

In [None]:
#add the auto features
# Append one column per bottleneck unit (auto_0, auto_1, ...) to X.
n_auto_features = auto_features.shape[1]
auto_cols = [f'auto_{idx}' for idx in range(n_auto_features)]
X[auto_cols] = auto_features
X.shape

### Prepare test set

In [None]:
#prepare the submission input data
# Apply the same preprocessing steps as for the training frame, reusing the
# transformers fitted on the training data (`drop`, `scaler`, `pca`, `auto`).
# Nothing is fitted in this cell.
test = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')
test = test.set_index('sig_id')

test['cp_type'] = test['cp_type'].map({
    'trt_cp' : 0,
    'ctl_vehicle' : 1})
test['cp_dose'] = test['cp_dose'].map({
    'D1' : 0,
    'D2' : 1})
# dtype pinned: pandas >= 2.0 defaults to bool dummies, which would turn the
# numeric matrix into an object array when converted to numpy.
X_test = pd.get_dummies(test, columns = ['cp_time'], dtype = 'uint8')

#statistical features
X_test['g_std'] = X_test[g_features].std(axis = 1)
X_test['g_var'] = X_test[g_features].var(axis = 1)
X_test['g_skew'] = X_test[g_features].skew(axis = 1)
X_test['g_kurt'] = X_test[g_features].kurtosis(axis = 1)
X_test['c_std'] = X_test[c_features].std(axis = 1)
X_test['c_var'] = X_test[c_features].var(axis = 1)
X_test['c_skew'] = X_test[c_features].skew(axis = 1)
X_test['c_kurt'] = X_test[c_features].kurtosis(axis = 1)

#variance threshold
X_test = X_test.drop(columns = drop)

#scaling
X_test[scale_cols] = scaler.transform(X_test[scale_cols])

#pca
pca_features = pca.transform(X_test[g_features_train + c_features_train])
X_test[pca_cols] = pca_features

#auto encoder
# Use an explicit encoder sub-model instead of the legacy
# `tf.keras.backend.function(...)([inputs, 1])` call, whose trailing `1` is a
# TF1-style learning-phase flag rather than a model input.
test_encoder = tf.keras.Model(inputs = auto.input, outputs = auto.layers[1].output)
auto_features = test_encoder.predict(X_test[auto_input_cols].to_numpy(), verbose = 0)
X_test[auto_cols] = auto_features

# Align with the training frame: same columns in the same order. A one-hot
# `cp_time` column that is absent from the test file is added as all zeros
# instead of causing a shape mismatch at prediction time.
X_test = X_test.reindex(columns = X.columns, fill_value = 0)

X_test.shape

## Modelling

Define the architecture of the neural network. The architecture and hyperparameters were chosen through repeated rounds of Bayesian optimisation in previous notebooks.

In [None]:
def build_model(input_shape, output_shape, l1 = 1e-7, l2 = 1e-7, r = 1):
    '''Create the feed-forward network with elastic net kernel regularization.

    Args:
        input_shape: number of input features (int) or a shape tuple.
        output_shape: number of units in the sigmoid output layer.
        l1: L1 penalty factor applied to every Dense kernel.
        l2: L2 penalty factor applied to every Dense kernel.
        r: width multiplier; each hidden layer has int(base_width * r) units.

    Returns:
        The uncompiled Sequential model.
    '''
    # Accept a bare feature count as well as a shape tuple.
    if isinstance(input_shape, (int, np.integer)):
        input_shape = (int(input_shape),)

    # One regularizer instance shared by all Dense kernels.
    reg = tf.keras.regularizers.L1L2(l1 = l1, l2 = l2)

    # (base width, dropout rate applied after the block), in network order.
    hidden_blocks = [(2048, 0.4), (1024, 0.5), (512, 0.5), (395, 0.2)]

    model = Sequential()

    model.add(Input(input_shape))

    # Normalise and lightly drop the raw inputs before the first hidden layer.
    model.add(BatchNormalization())
    model.add(Dropout(0.2))

    # Each hidden block: weight-normalised SELU Dense -> BatchNorm -> Dropout.
    for base_width, dropout_rate in hidden_blocks:
        model.add(WeightNormalization(Dense(int(base_width*r), activation = 'selu',
                                            kernel_initializer = 'lecun_normal',
                                            kernel_regularizer = reg)))
        model.add(BatchNormalization())
        model.add(Dropout(dropout_rate))

    # Output head: an extra BatchNorm followed by the sigmoid layer.
    model.add(BatchNormalization())
    model.add(WeightNormalization(Dense(output_shape, activation = 'sigmoid',
                                        kernel_regularizer = reg)))

    return model

In [None]:
def custom_metric(y_true, y_pred):
    '''Competition metric: mean element-wise binary log loss.

    Predictions are clipped to [1e-7, 1 - 1e-7] before taking logarithms so
    that log(0) never occurs.
    '''
    eps = tf.constant(1e-7, dtype = tf.float32)

    # clip from above first, then from below
    clipped = tf.math.maximum(tf.math.minimum(y_pred, 1-eps), eps)

    positive_term = y_true * tf.math.log(clipped)
    negative_term = (1 - y_true) * tf.math.log(1 - clipped)

    return tf.reduce_mean(-positive_term - negative_term)

In [None]:
#learning-rate schedule shared by every training run
# Multiply the learning rate by 0.1 when the validation metric has not improved
# by at least 1e-4 for 5 consecutive epochs. `min_delta` is the current name of
# this threshold; the previously used `epsilon` keyword is its deprecated alias.
reduce_lr = callbacks.ReduceLROnPlateau(monitor = 'val_custom_metric', mode = 'min', patience = 5, factor = 0.1, min_delta = 1e-4)

### Bayesian Optimisation
Optimize the strengths of regularization and the size of the network with gp_minimize.

In [None]:
from skopt import gp_minimize
from skopt.space.space import Real, Integer
from skopt.utils import use_named_args

In [None]:
# Search space for the Bayesian optimisation: the L1 and L2 penalty factors and
# the width multiplier `r`. Each dimension's `name` matches a keyword argument
# of the objective function defined below.
dim_l1 = Real(low = 1e-7, high = 3e-7, name = 'l1')
dim_l2 = Real(low = 9e-9, high = 3e-8, name = 'l2')
dim_r = Real(low = 0.85, high = 1.15, name = 'r')
dimensions = [dim_l1, dim_l2, dim_r]

In [None]:
# Shuffled 80/20 hold-out split used by the optimisation objective.
# No random_state is passed, so the split differs on every execution.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, shuffle = True)

In [None]:
@use_named_args(dimensions = dimensions)
def obj_fun(l1, l2, r):
    '''Objective for the hyperparameter search.

    Builds a network with the proposed regularization strengths (l1, l2) and
    width multiplier (r), trains it on the hold-out split and returns the
    validation value of the competition metric after the final epoch.
    '''
    # start every evaluation from a clean Keras session
    clear_session()

    # regularized model sized from the current train split
    n_features = X_train.shape[1]
    n_targets = y_train.shape[1]
    model = build_model(n_features, n_targets, l1 = l1, l2 = l2, r = r)

    model.compile(loss = BinaryCrossentropy(label_smoothing = 0.001),
                  optimizer = Adam(2e-3),
                  metrics = [custom_metric])

    history = model.fit(X_train, y_train,
                        validation_data = (X_val, y_val),
                        epochs = 45,
                        batch_size = 128,
                        callbacks = [reduce_lr],
                        verbose = 0)

    # value to minimise: validation metric of the last epoch
    val_curve = history.history['val_custom_metric']
    return val_curve[-1]

In [None]:
#opt_result = gp_minimize(obj_fun, dimensions = dimensions, n_calls = 50, x0 = [2e-7, 1.48e-8, 1.0],verbose = True)

In [None]:
#optimal results
# Hard-coded hyperparameters used for the final training runs below.
# NOTE(review): presumably copied from an earlier gp_minimize run (the search
# call in the cell above is commented out) - confirm the provenance.
l1 = 1.3e-07
l2 = 2.27e-08
r = 0.946078
## Prediction

Below is the main loop containing training and inference. It repeats K-fold cross-validation over several runs, each with a different shuffle of the data, and predicts the test set after fitting on each fold.

In [None]:
# fit the model to each fold and generate predictions
y_preds = []          # one test-set prediction array per (run, fold)
val_log_losses = []   # last-epoch validation metric per (run, fold)

verbose = 0
batch_size = 128
runs = 5
splits = 7
epochs = 45

# Convert the frames to arrays once; the conversions do not depend on the fold.
X_values = X.to_numpy()
y_values = y.to_numpy()
X_test_values = X_test.to_numpy()

for run in range(runs):

    # A different but reproducible shuffle for every run. This fixes only the
    # fold assignment; weight initialisation and dropout remain unseeded.
    kf = KFold(n_splits = splits, shuffle = True, random_state = run)

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):

        print('Run {} Fold {}'.format(run, fold))

        #split data
        X_train, X_val = X_values[train_idx], X_values[val_idx]
        y_train, y_val = y_values[train_idx], y_values[val_idx]

        # release the previous fold's model before building a new one
        clear_session()

        #regularized model with the tuned l1 / l2 / r
        model = build_model(X_train.shape[1], y_train.shape[1], l1 = l1, l2 = l2, r = r)

        model.compile(optimizer = Adam(2e-3),
                      loss = BinaryCrossentropy(label_smoothing = 0.001),
                      metrics = [custom_metric])

        history = model.fit(X_train, y_train,
                      batch_size = batch_size,
                      epochs = epochs,
                      validation_data = (X_val, y_val),
                      callbacks = [reduce_lr],
                      verbose = verbose)

        val_log_losses.append(history.history['val_custom_metric'][-1])

        y_preds.append(model.predict(X_test_values))

In [None]:
# Mean of the last-epoch validation metric over every run and fold.
np.mean(val_log_losses)

In [None]:
# Stack the per-fold test predictions and average them element-wise.
predictions = np.stack(y_preds).mean(axis = 0)

## Submission

Submit the predictions.

In [None]:
# Assemble the submission: `sig_id` first, then one column per scored target.
target_cols = list(train_targets.columns)
submission = pd.DataFrame(predictions, columns = target_cols)
submission.insert(0, 'sig_id', test.index)

#set ctl vehicle predictions to 0
# cp_type == 1 is the code given to 'ctl_vehicle' when the test set was encoded
is_control = (test.cp_type == 1).to_numpy()
submission.loc[is_control, target_cols] = 0

print(submission.shape)
submission.head()

In [None]:
# Write the submission file; the DataFrame index is omitted because `sig_id`
# is already a regular column.
submission.to_csv('submission.csv', index = False)