# Demonstration of Label smoothing

### The competition metric punishes highly confident incorrect answers. In this notebook, I show that a simple modification, in which the labels are smoothed slightly and the predictions are clipped to keep probabilities away from 0 and 1, boosts performance significantly.

### Inspired by this post https://www.kaggle.com/c/lish-moa/discussion/185593


### For more ideas, check out my other notebooks:
* [Pretrained Model Inference and Blending Starter](https://www.kaggle.com/rahulsd91/moa-starter-inference-blending-pretrained-models)
* [Multi-input ResNet Architecture](https://www.kaggle.com/rahulsd91/moa-multi-input-resnet-model)
* [Autoencoder based approach](https://www.kaggle.com/rahulsd91/moa-autoencoder-features-only-lb-0-01884)

### Best Version by LB Score (if not current) : V10

#### Version 15: No scaling, drop cp_dose/cp_time

#### Version 14: StandardScaler, remove WeightNormalization, no clipping for validation monitor metric

#### Version 12: Add WeightNormalization layer to bias initialization

#### Version 11: Test of bias initialization ([idea from here](https://www.kaggle.com/tolgadincer/moa-tensorflow-fast-convergence))

#### Version 10: Increased NN units

#### Version 8: Using Keras's native label smoothing option

#### Version 7: Save Models for Blending

#### Version 6: Add Seed Averaging

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import VarianceThreshold
from tensorflow.keras import layers,regularizers,Sequential,backend,callbacks,optimizers,metrics,losses
import tensorflow as tf
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
# Load the training features and note which rows are not 'ctl_vehicle' controls

train_features = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
is_treated = train_features['cp_type'] != 'ctl_vehicle'
non_ctl_idx = train_features.loc[is_treated].index.to_list()

# Columns that are not fed to the model
unused_columns = ['sig_id', 'cp_type', 'cp_dose', 'cp_time']
train_features = train_features.drop(columns=unused_columns)

# Load the scored targets, without the identifier column

train_targets_scored = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv').drop(columns='sig_id')

# Keep only the non-control rows in both the feature and the label matrix.
# non_ctl_idx holds index labels; they double as row positions because
# read_csv was called without index_col and so produced a default 0..n-1 index.

train_features = train_features.iloc[non_ctl_idx]
data_train = train_features.values
labels_train = train_targets_scored.values[non_ctl_idx]

# Load the test features. cp_type stays in the DataFrame (the submission step
# reads it) but is excluded from the matrix passed to the model.

test_features = pd.read_csv('/kaggle/input/lish-moa/test_features.csv').drop(columns=['sig_id', 'cp_dose', 'cp_time'])
data_test = test_features.drop(columns='cp_type').values

In [None]:
# Problem dimensions, read off the prepared arrays
n_train, n_features = data_train.shape
n_test = data_test.shape[0]
n_labels = labels_train.shape[1]


# Prediction clipping thresholds: probabilities are kept inside [p_min, p_max]

p_min, p_max = 0.001, 0.999

# Evaluation Metric with clipping and no label smoothing

def logloss(y_true, y_pred):
    """Return the mean binary cross-entropy of clipped predictions.

    y_pred is first clipped to [p_min, p_max], so neither logarithm can
    receive 0. The labels are used as given (no smoothing), and the mean is
    taken over every element of the inputs.
    """
    clipped = tf.clip_by_value(y_pred, p_min, p_max)
    positive_term = y_true * backend.log(clipped)
    negative_term = (1 - y_true) * backend.log(1 - clipped)
    return -backend.mean(positive_term + negative_term)


# Generate Seeds

n_seeds = 1

## The NumPy RNG is seeded with a fixed value and then used to draw the random
## seeds handed to the MSKF, so the fold splits are reproducible.
## Keep the same seed for different models that you plan to ensemble/blend so
## that their OOF performance is comparable.
## Verify that the array "seeds" contains unique integers: the draw is made
## with replacement, e.g. seeding np.random with 0 and using n_seeds = 6
## results in only 5 unique seeds.
np.random.seed(1)
seeds = np.random.randint(low=0, high=100, size=n_seeds)

# Training Loop
#
# For every seed the training data is split into n_folds multilabel-stratified
# folds. One model is trained per fold and saved, and three results are
# accumulated over all runs:
#   * oof    - clipped log loss on the held-out fold, averaged over all runs
#   * y_pred - test-set predictions, averaged over all runs
#   * hists  - the Keras History object of every run (used for plotting later)

n_folds = 5
n_runs = n_folds*n_seeds
y_pred = np.zeros((n_test,n_labels))
oof = tf.constant(0.0)
hists = []

# Start the sigmoid output layer at the empirical label frequencies.
# sigmoid(b) = p  <=>  b = log(p/(1-p)), so the bias must be the logit of the
# frequency, not its plain logarithm. The frequencies are clipped away from
# 0 and 1 so a label that is never (or always) active cannot give an infinite bias.
label_freq = np.clip(labels_train.mean(axis=0), 1E-7, 1 - 1E-7)
bias = tf.keras.initializers.Constant(np.log(label_freq/(1 - label_freq)))

for seed in seeds:
    mskf = MultilabelStratifiedKFold(n_splits=n_folds,shuffle=True,random_state=seed)
    for fold, (train, test) in enumerate(mskf.split(data_train,labels_train)):
        X_train = data_train[train]
        X_test = data_train[test]
        y_train = labels_train[train]
        y_test = labels_train[test]

        # Define NN Model

        model = Sequential()
        model.add(layers.Dropout(0.4))
        model.add(layers.Dense(1024))
        model.add(layers.Activation('swish'))
        model.add(layers.BatchNormalization())
        model.add(layers.Dropout(0.1))
        model.add(layers.Dense(1024))
        model.add(layers.Activation('swish'))
        model.add(layers.BatchNormalization())
        model.add(layers.Dropout(0.1))
        model.add(layers.Dense(n_labels, activation='sigmoid', bias_initializer=bias))

        # Train on the label-smoothed loss, but monitor the plain (unsmoothed)
        # binary cross-entropy. `metrics` is passed as a list, the form the
        # Keras API documents (a bare metric object is rejected by Keras 3).
        model.compile(optimizer=optimizers.Adam(learning_rate=5*1E-5),
                      loss=losses.BinaryCrossentropy(label_smoothing=0.001),
                      metrics=[metrics.BinaryCrossentropy()])
        reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_binary_crossentropy', factor=0.5, patience=5, mode='min', min_lr=1E-5)
        early_stopping = callbacks.EarlyStopping(monitor='val_binary_crossentropy', min_delta=1E-5, patience=15, mode='min',restore_best_weights=True)

        hist = model.fit(X_train,y_train, batch_size=128, epochs=192,verbose=0,validation_data = (X_test,y_test),callbacks=[reduce_lr, early_stopping])
        hists.append(hist)

        # Save Model
        model.save(f'LabelSmoothed_seed_{seed}_fold_{fold}')

        # OOF Score: this fold's share of the average clipped log loss
        y_val = model.predict(X_test)
        oof += logloss(tf.constant(y_test,dtype=tf.float32),tf.constant(y_val,dtype=tf.float32))/n_runs

        # Run prediction: this fold's share of the averaged test prediction
        y_pred += model.predict(data_test)/n_runs


In [None]:
# Analysis of Training

tf.print('OOF score is ',oof)


def _mean_curve(histories, key):
    """Average history[key] over all runs, epoch by epoch.

    The runs may differ in length (training uses early stopping), so every
    curve is cut to the length of the shortest run before averaging.
    """
    curves = [h.history[key] for h in histories]
    n_epochs = min(len(curve) for curve in curves)
    return np.mean([curve[:n_epochs] for curve in curves], axis=0)


plt.figure(figsize=(12,8))

# Mean training and validation curves over every recorded run
hist_train = _mean_curve(hists, 'binary_crossentropy')
hist_val = _mean_curve(hists, 'val_binary_crossentropy')

plt.plot(hist_train)
plt.plot(hist_val)

plt.yscale('log')
plt.xlabel('Epochs')
plt.ylabel('Average Logloss')
plt.legend(['Training','Validation'])

In [None]:
# Generate submission file, Clip Predictions

sub = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')
# The first column is left as read (presumably the sample identifier - confirm
# against the sample file); every other column receives the clipped prediction.
sub.iloc[:,1:] = np.clip(y_pred,p_min,p_max)

# Set ctl_vehicle to 0
# .iloc is purely positional and only documents boolean *arrays* as masks
# (a boolean Series is rejected when reading through .iloc), so the mask is
# converted to a plain NumPy array first.
is_ctl = (test_features['cp_type'] == 'ctl_vehicle').values
sub.iloc[is_ctl,1:] = 0

# Save Submission
sub.to_csv('submission.csv', index=False)