# Experiment: Denoising Autoencoder for Feature Engineering (FE)


## Best LB so far (using only autoencoder features to generate predictions): V8 (LB 0.01879)

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, mean_squared_error
import tensorflow as tf
from tensorflow.keras import layers,regularizers,Sequential,backend,callbacks,optimizers,metrics, Model,losses
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.decomposition import PCA

In [None]:
# Load the training features and note which rows are real treatments
# (cp_type other than 'ctl_vehicle') before the helper columns are dropped.

train_features = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
is_treated = train_features['cp_type'] != 'ctl_vehicle'
non_ctl_idx = train_features.index[is_treated].to_list()
train_features = train_features.drop(columns=['sig_id', 'cp_type'])

# Scored targets: drop the identifier column and keep the values as an array.
train_targets_scored = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv').drop(columns='sig_id')
labels_train = train_targets_scored.values

# Keep only the non-control rows, in features and labels alike.

train_features = train_features.iloc[non_ctl_idx]
labels_train = labels_train[non_ctl_idx]

# Test features: only sig_id is removed here, cp_type stays in the frame.

test_features = pd.read_csv('/kaggle/input/lish-moa/test_features.csv').drop(columns=['sig_id'])

In [None]:
# Integer-encode the categorical cp_dose column.
# The encoder is fitted on the training rows only; the same fitted mapping
# is then applied to the training and the test column.

cat = 'cp_dose'
le = preprocessing.LabelEncoder().fit(train_features[cat])

train_features[cat] = le.transform(train_features[cat])
test_features[cat] = le.transform(test_features[cat])

In [None]:
# Feature scaling. Both scalers are fitted on the training rows and then
# applied to the test rows.

# Test frame without cp_type, so its columns line up with train_features.
test_inputs = test_features.drop('cp_type', axis=1)

# Quantile transform (normal output) for every column from position 2 on
# (gene and cell features).
scaler = preprocessing.QuantileTransformer(output_distribution='normal')
quantile_train = scaler.fit_transform(train_features.iloc[:, 2:])
quantile_test = scaler.transform(test_inputs.iloc[:, 2:])

# Standard scaling for the first two columns (Dose/Time).
scaler = preprocessing.StandardScaler()
standard_train = scaler.fit_transform(train_features.iloc[:, :2])
standard_test = scaler.transform(test_inputs.iloc[:, :2])

# Re-assemble in the original column order: standard-scaled pair first.
data_train = np.hstack((standard_train, quantile_train))
data_test = np.hstack((standard_test, quantile_test))

In [None]:
# Dataset dimensions: rows/columns of the scaled matrices and label count.
n_train, n_features = data_train.shape
n_test = data_test.shape[0]
n_labels = labels_train.shape[1]

In [None]:
# Autoencoder to create compressed features

# Cells: boolean mask over the feature columns whose name contains 'c-',
# applied to the scaled train and test matrices.
cs = train_features.columns.str.contains('c-')
cells_train, cells_test = data_train[:, cs], data_test[:, cs]

# Hyper-parameters; activation and the two dropout rates are read as
# module-level globals by the Autoencoder class below.
encoding_dim = 256
activation = 'swish'
dropout_noise, dropout = 0.15, 0.1
class Autoencoder(Model):
    """Dense encoder/decoder pair whose output has the width of its input.

    The encoder starts with batch normalisation followed by dropout at rate
    ``dropout_noise``, then widens through ``encoding_dim // 2`` and
    ``3 * encoding_dim // 4`` units up to ``encoding_dim``. The decoder
    mirrors those widths and ends in a linear layer of ``n_inputs`` units.

    ``activation``, ``dropout_noise`` and ``dropout`` are read from
    module-level variables at construction time.

    Args:
        n_inputs: number of input features, also the width of the output.
        encoding_dim: width of the encoded representation.
    """

    def __init__(self, n_inputs, encoding_dim):
        # Zero-argument super() does not depend on the global name
        # `Autoencoder`, which this file defines more than once.
        super().__init__()

        # Dense expects integer unit counts: coerce NumPy integers and
        # derive the intermediate widths with integer arithmetic instead
        # of float products.
        n_inputs = int(n_inputs)
        encoding_dim = int(encoding_dim)
        half_width = encoding_dim // 2
        three_quarter_width = (3 * encoding_dim) // 4

        self.encoder = Sequential([
            layers.BatchNormalization(),
            layers.Dropout(dropout_noise),
            layers.Dense(half_width, activation=activation),
            layers.BatchNormalization(),
            layers.Dropout(dropout),
            layers.Dense(three_quarter_width, activation=activation),
            layers.BatchNormalization(),
            layers.Dropout(dropout),
            layers.Dense(encoding_dim, activation=activation),
        ])
        self.decoder = Sequential([
            layers.Dense(three_quarter_width, activation=activation),
            layers.BatchNormalization(),
            layers.Dropout(dropout),
            layers.Dense(half_width, activation=activation),
            layers.BatchNormalization(),
            layers.Dropout(dropout),
            layers.Dense(n_inputs),
        ])

    def call(self, x):
        """Encode ``x`` and return its reconstruction from the decoder."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded
    
# Build the cells autoencoder; cs.sum() is the number of selected columns.
cells_autoencoder = Autoencoder(cs.sum(), encoding_dim)
cells_autoencoder.compile(
    loss='mse',
    optimizer=optimizers.Adam(learning_rate=1E-5),
)

# Callbacks driven by the validation loss: halve the learning rate after 5
# epochs without improvement, and stop after 16 while restoring the best
# weights seen.
ae_reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1E-5,
)
ae_early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=1E-5,
    patience=16,
    restore_best_weights=True,
)
def scheduler(epoch,lr):
    """Return the next learning rate given the epoch index and current rate.

    The rate moves by a fixed step of (0.3*1E-2)/16 per epoch: upwards
    while ``epoch % 32`` is below 17 (17 epochs), downwards for the other
    15 epochs of every 32-epoch window.
    """
    step = (0.3*1E-2)/16
    rising = epoch%32 < 17
    return lr + step if rising else lr - step

# Stage 1: reconstruct the cell features under the epoch-wise schedule from
# `scheduler`. The test-set cell features serve as validation data.
lr_scheduler = callbacks.LearningRateScheduler(scheduler)
hist = cells_autoencoder.fit(
    cells_train, cells_train,
    validation_data=(cells_test, cells_test),
    batch_size=128,
    epochs=256,
    shuffle=True,
    verbose=0,
    callbacks=[ae_early_stopping, lr_scheduler],
)

# Stage 2: re-compile with a new Adam optimizer and keep training, this
# time with reduce-on-plateau in place of the schedule.
cells_autoencoder.compile(
    loss='mse',
    optimizer=optimizers.Adam(learning_rate=5*1E-5),
)
cells_autoencoder.fit(
    cells_train, cells_train,
    validation_data=(cells_test, cells_test),
    batch_size=128,
    epochs=256,
    shuffle=True,
    verbose=0,
    callbacks=[ae_early_stopping, ae_reduce_lr],
)

cells_autoencoder.save('CellsAE')

In [None]:
# Training and validation loss of the fit stored in `hist`, log-scaled.
plt.figure(figsize=(12,8))
plt.plot(hist.history['loss'], label='Training')
plt.plot(hist.history['val_loss'], label='Validation')
plt.yscale('log')
plt.xlabel('Epochs')
# The autoencoders are compiled with loss='mse', so the curves are
# mean squared reconstruction errors rather than a log loss.
plt.ylabel('Reconstruction MSE')
plt.legend()

In [None]:
# Genes: boolean mask over the feature columns whose name contains 'g-',
# applied to the scaled train and test matrices.
gs = train_features.columns.str.contains('g-')
genes_train, genes_test = data_train[:, gs], data_test[:, gs]

# Hyper-parameters; activation and the two dropout rates are read as
# module-level globals by the Autoencoder class below.
encoding_dim = 1600
activation = 'swish'
dropout_noise, dropout = 0.15, 0.1
class Autoencoder(Model):
    """Dense encoder/decoder pair whose output has the width of its input.

    The encoder starts with batch normalisation followed by dropout at rate
    ``dropout_noise``, then widens through ``encoding_dim // 2`` and
    ``3 * encoding_dim // 4`` units up to ``encoding_dim``. The decoder
    mirrors those widths and ends in a linear layer of ``n_inputs`` units.

    ``activation``, ``dropout_noise`` and ``dropout`` are read from
    module-level variables at construction time.

    Args:
        n_inputs: number of input features, also the width of the output.
        encoding_dim: width of the encoded representation.
    """

    def __init__(self, n_inputs, encoding_dim):
        # Zero-argument super() does not depend on the global name
        # `Autoencoder`, which this file defines more than once.
        super().__init__()

        # Dense expects integer unit counts: coerce NumPy integers and
        # derive the intermediate widths with integer arithmetic instead
        # of float products.
        n_inputs = int(n_inputs)
        encoding_dim = int(encoding_dim)
        half_width = encoding_dim // 2
        three_quarter_width = (3 * encoding_dim) // 4

        self.encoder = Sequential([
            layers.BatchNormalization(),
            layers.Dropout(dropout_noise),
            layers.Dense(half_width, activation=activation),
            layers.BatchNormalization(),
            layers.Dropout(dropout),
            layers.Dense(three_quarter_width, activation=activation),
            layers.BatchNormalization(),
            layers.Dropout(dropout),
            layers.Dense(encoding_dim, activation=activation),
        ])
        self.decoder = Sequential([
            layers.Dense(three_quarter_width, activation=activation),
            layers.BatchNormalization(),
            layers.Dropout(dropout),
            layers.Dense(half_width, activation=activation),
            layers.BatchNormalization(),
            layers.Dropout(dropout),
            layers.Dense(n_inputs),
        ])

    def call(self, x):
        """Encode ``x`` and return its reconstruction from the decoder."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded
    
# Build the genes autoencoder; gs.sum() is the number of selected columns.
genes_autoencoder = Autoencoder(gs.sum(), encoding_dim)
genes_autoencoder.compile(
    loss='mse',
    optimizer=optimizers.Adam(learning_rate=1E-5),
)

# NOTE(review): ae_reduce_lr is created here but passed to neither fit call
# below, whereas the cells autoencoder uses it in its second fit. Confirm
# whether the omission is intentional.
ae_reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1E-5,
)
ae_early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=1E-5,
    patience=16,
    restore_best_weights=True,
)

# Stage 1: reuses the `lr_scheduler` callback already defined in this file.
# The test-set gene features serve as validation data.
hist = genes_autoencoder.fit(
    genes_train, genes_train,
    validation_data=(genes_test, genes_test),
    batch_size=128,
    epochs=256,
    shuffle=True,
    verbose=0,
    callbacks=[ae_early_stopping, lr_scheduler],
)

# Stage 2: re-compile with a new Adam optimizer and keep training with
# early stopping only.
genes_autoencoder.compile(
    loss='mse',
    optimizer=optimizers.Adam(learning_rate=5*1E-5),
)
genes_autoencoder.fit(
    genes_train, genes_train,
    validation_data=(genes_test, genes_test),
    batch_size=128,
    epochs=256,
    shuffle=True,
    verbose=0,
    callbacks=[ae_early_stopping],
)

genes_autoencoder.save('GenesAE')

In [None]:
# Training and validation loss of the fit stored in `hist`, log-scaled.
plt.figure(figsize=(12,8))
plt.plot(hist.history['loss'], label='Training')
plt.plot(hist.history['val_loss'], label='Validation')
plt.yscale('log')
plt.xlabel('Epochs')
# The autoencoders are compiled with loss='mse', so the curves are
# mean squared reconstruction errors rather than a log loss.
plt.ylabel('Reconstruction MSE')
plt.legend()

In [None]:
# Encode both splits with the trained cells encoder, then score how well
# the decoder rebuilds the test features from their codes.
cells_encoder = cells_autoencoder.encoder
cells_decoder = cells_autoencoder.decoder

ae_cells_train = cells_encoder(cells_train).numpy()
ae_cells_test = cells_encoder(cells_test).numpy()

cells_reconstruction = cells_decoder(ae_cells_test).numpy()
autoencoder_error = mean_squared_error(cells_test, cells_reconstruction)
print(f"Cells reconstruction error is {autoencoder_error}")

In [None]:
# Comparison with PCA

# Baseline: 64-component PCA fitted on the training cells, scored on the
# round trip (transform, then inverse transform) of the test cells.
pca = PCA(n_components=64).fit(cells_train)
cells_pca_roundtrip = pca.inverse_transform(pca.transform(cells_test))
pca_error = mean_squared_error(cells_test, cells_pca_roundtrip)
print(f'PCA Reconstruction Error for Cells is {pca_error}')

In [None]:
# Encode both splits with the trained genes encoder, then score how well
# the decoder rebuilds the test features from their codes.
genes_encoder = genes_autoencoder.encoder
genes_decoder = genes_autoencoder.decoder

ae_genes_train = genes_encoder(genes_train).numpy()
ae_genes_test = genes_encoder(genes_test).numpy()

genes_reconstruction = genes_decoder(ae_genes_test).numpy()
autoencoder_error = mean_squared_error(genes_test, genes_reconstruction)
print(f"Genes reconstruction error is {autoencoder_error}")

In [None]:
# Comparison with PCA

# Baseline: 512-component PCA fitted on the training genes, scored on the
# round trip (transform, then inverse transform) of the test genes.
pca = PCA(n_components=512).fit(genes_train)
genes_pca_roundtrip = pca.inverse_transform(pca.transform(genes_test))
pca_error = mean_squared_error(genes_test, genes_pca_roundtrip)
print(f'PCA Reconstruction Error for Genes is {pca_error}')

In [None]:
# Replace data with encoded data

# Columns selected by neither the cells nor the genes mask are carried over
# unchanged, followed by the gene codes and then the cell codes.
passthrough = ~(cs | gs)
data_train = np.hstack((data_train[:, passthrough], ae_genes_train, ae_cells_train))
data_test = np.hstack((data_test[:, passthrough], ae_genes_test, ae_cells_test))

In [None]:
# Train

# Dimensions of the current feature matrices and the number of target columns.
n_train, n_features = data_train.shape
n_test = data_test.shape[0]
n_labels = train_targets_scored.shape[1]


# Prediction Clipping Thresholds

p_min, p_max = 0.0005, 0.9995

# Evaluation Metric with clipping and no label smoothing

def logloss(y_true, y_pred):
    """Mean binary cross-entropy with predictions clipped to [p_min, p_max]."""
    clipped = tf.clip_by_value(y_pred, p_min, p_max)
    positive_term = y_true * backend.log(clipped)
    negative_term = (1 - y_true) * backend.log(1 - clipped)
    return -backend.mean(positive_term + negative_term)

# Generate Seeds

seeds = [34,9,18]
n_seeds = len(seeds)  # derived, so it cannot drift from the list above

# Training Loop

n_folds = 5
n_runs = n_folds*n_seeds  # every fold of every seed contributes one model
y_pred = np.zeros((n_test,n_labels))
oof = tf.constant(0.0)
hists = []

# Initial output bias: the logit of each label's training frequency, so that
# sigmoid(bias) starts at that frequency. Frequencies are clipped away from
# 0 and 1 to keep the logit finite for labels without positives.
label_prior = np.clip(labels_train.mean(axis=0), 1E-7, 1 - 1E-7)
bias = tf.keras.initializers.Constant(np.log(label_prior/(1 - label_prior)))


def _build_classifier():
    """Return a new, uncompiled multi-label classifier."""
    return Sequential([
        layers.Dropout(0.3),
        layers.Dense(1536),
        layers.Activation('elu'),
        layers.BatchNormalization(),
        layers.Dropout(0.7),
        layers.Dense(1024),
        layers.Activation('elu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(512),
        layers.Activation('elu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(n_labels,activation='sigmoid',bias_initializer=bias),
    ])


def _compile_classifier(model):
    """Compile ``model`` with a new Adam optimizer, smoothed BCE loss and the logloss metric."""
    model.compile(
        optimizer=optimizers.Adam(learning_rate=1E-5),
        loss=losses.BinaryCrossentropy(label_smoothing=0.001),
        metrics=['binary_crossentropy',logloss],
    )


def scheduler(epoch,lr):
    """Return the next learning rate given the epoch index and current rate.

    Within each 16-epoch window the rate rises for the first 9 epochs and
    falls for the remaining 7; the step shrinks by a factor of e from one
    window to the next.
    """
    step = np.exp(-int(epoch/16))*(0.5*1E-2)/16
    if epoch%16<9:
        return lr + step
    return lr - step


for seed in seeds:
    mskf = MultilabelStratifiedKFold(n_splits=n_folds,shuffle=True,random_state=seed)
    for fold, (train, test) in enumerate(mskf.split(data_train,labels_train)):
        X_train, y_train = data_train[train], labels_train[train]
        X_test, y_test = data_train[test], labels_train[test]

        # Define NN Model

        model = _build_classifier()
        early_stopping = callbacks.EarlyStopping(monitor='val_logloss', min_delta=1E-5, patience=24, mode='min',restore_best_weights=True)
        lr_scheduler = callbacks.LearningRateScheduler(scheduler)

        # Stage 1: scheduled learning rate. Only this history is kept.
        _compile_classifier(model)
        hist = model.fit(X_train,y_train, batch_size=128, epochs=192,verbose=0,validation_data = (X_test,y_test),callbacks=[lr_scheduler, early_stopping])
        hists.append(hist)

        # Stage 2: new optimizer at a constant rate, early stopping only.
        _compile_classifier(model)
        model.fit(X_train,y_train, batch_size=128, epochs=192,verbose=0,validation_data = (X_test,y_test),callbacks=[early_stopping])

        # Save Model
        model.save('AutoEncoded_seed_'+str(seed)+'_fold_'+str(fold))

        # OOF Score
        y_val = model.predict(X_test)
        oof += logloss(tf.constant(y_test,dtype=tf.float32),tf.constant(y_val,dtype=tf.float32))/n_runs

        # Run prediction
        y_pred += model.predict(data_test)/n_runs

In [None]:
# Analysis of Training

tf.print('OOF score is ',oof)


def _mean_history(key):
    """Average the per-epoch values of ``key`` over every entry of ``hists``.

    zip() stops at the shortest history, so only epochs reached by all
    recorded fits are averaged. Returns an empty list if ``hists`` is empty.
    """
    curves = [h.history[key] for h in hists]
    return [np.mean(epoch_values) for epoch_values in zip(*curves)]


plt.figure(figsize=(12,8))

hist_train = _mean_history('logloss')
plt.plot(hist_train, label='Training')

hist_val = _mean_history('val_logloss')
plt.plot(hist_val, label='Validation')

plt.yscale('log')
plt.xlabel('Epochs')
plt.ylabel('Average Logloss')
plt.legend()


In [None]:
# Prediction Clipping Thresholds

p_min = 0.0005
p_max = 0.9995

# Generate submission file, Clip Predictions

sub = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')
sub.iloc[:,1:] = np.clip(y_pred,p_min,p_max)

# Set ctl_vehicle to 0
# .iloc selects by position, so pass a plain boolean array rather than an
# index-labelled Series.
is_control = (test_features['cp_type'] == 'ctl_vehicle').to_numpy()
sub.iloc[is_control,1:] = 0

# Save Submission
sub.to_csv('submission.csv', index=False)