In [None]:
#Implementation of Swap Noise to train robust denoising autoencoders (DAEs).  This yields one trained embedder per fold per repeat (seed).
#The embedders are then used downstream to extract embeddings from the data.

In [None]:
#HYPERPARAMETERS
SWAP_PERC = 0.15    # swap_perc passed to the SwapNoise generators in the training loop
SPLITS = 5          # n_splits for MultilabelStratifiedKFold
EPOCHS = 70         # epochs for each autoencoder fit
BATCH_SIZE = 128    # rows per generator batch
N_STARTS = 7        # repeated CV runs; each run index is also the fold-split random_state

In [None]:
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import pickle
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
import tensorflow_addons as tfa
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from tqdm.notebook import tqdm
import os
import gc

from scipy.special import erfinv as sp_erfinv
from sklearn.preprocessing import QuantileTransformer #from https://www.kaggle.com/kushal1506/moa-pytorch-0-01859-rankgauss-pca-nn/comments#PCA-features-+-Existing-features

In [None]:
# Read the three modelling tables in one pass (same files, same order as before).
train_features, train_targets, test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/lish-moa/train_features.csv',
        '../input/lish-moa/train_targets_scored.csv',
        '../input/lish-moa/test_features.csv',
    )
)

# Sample submission file; not referenced again in the cells visible here.
ss = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [None]:
def preprocess(df):
    """Return an encoded copy of a feature table with `sig_id` removed.

    `cp_type`, `cp_dose` and `cp_time` are replaced by small integer codes
    (see `encodings` below); all other columns are passed through untouched.
    Values missing from a mapping become NaN, which is how `Series.map` treats
    unmapped keys.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `sig_id`, `cp_type`, `cp_dose` and `cp_time`.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same column order as `df` minus `sig_id`.

    Raises
    ------
    KeyError
        If any of the required columns is absent.
    """
    encodings = {
        'cp_type': {'trt_cp': 0, 'ctl_vehicle': 1},
        'cp_dose': {'D1': 0, 'D2': 1},
        'cp_time': {24: -1, 48: 0, 72: 1},
    }
    # drop() hands back a new frame, so the caller's table is left alone.
    out = df.drop(columns=['sig_id'])
    for column, mapping in encodings.items():
        # Assign the whole column instead of `out.loc[:, column] = ...`: on
        # pandas >= 2.0 the .loc form writes into the existing (object-dtype)
        # column in place, leaving the codes as Python objects and turning
        # `.values` into an object array that Keras cannot consume.
        out[column] = out[column].map(mapping)
    return out

# Encode both feature tables.
train = preprocess(train_features)
test = preprocess(test_features)

# The targets only need their id column removed.
train_targets = train_targets.drop(columns=['sig_id'])

# Drop all control samples (cp_type != 0).  Per the original author these have
# no MoA, so only treated rows are kept.  The train mask is computed once,
# before `train` itself is filtered, and applied to features and targets alike.
treated_train = train['cp_type'] == 0
treated_test = test['cp_type'] == 0
train_targets = train_targets.loc[treated_train].reset_index(drop=True)
train = train.loc[treated_train].reset_index(drop=True)
test = test.loc[treated_test].reset_index(drop=True)

print(train.shape, test.shape)
train

In [None]:
from time import time
def rankGauss(train, test):
    """Quantile-map every column from position 3 onward to a normal distribution.

    For each such column a QuantileTransformer (1000 quantiles, normal output,
    random_state=0) is fitted on the train column only and then applied to the
    train and the test column.  The first three columns are copied through
    unchanged (presumably cp_type / cp_time / cp_dose - depends on the CSV
    column order, which is not visible here).

    Returns a `(train_rg, test_rg)` pair of new frames; the inputs are not modified.
    """
    train_rg, test_rg = train.copy(), test.copy()
    transformer = QuantileTransformer(n_quantiles=1000,random_state=0, output_distribution="normal")

    for col in train.columns[3:]:
        # scikit-learn expects 2-D input, hence the single-column reshape.
        train_column = train_rg[col].to_numpy().reshape(-1, 1)
        test_column = test_rg[col].to_numpy().reshape(-1, 1)

        # Re-fit per column; the fit never sees test data.
        transformer.fit(train_column)

        train_rg[col] = transformer.transform(train_column).ravel()
        test_rg[col] = transformer.transform(test_column).ravel()
    return train_rg, test_rg

# Apply rankGauss to both tables and report the wall-clock seconds it took.
start = time()
train_rg, test_rg = rankGauss(train, test)
elapsed = time() - start
print(f'{elapsed :.2f}')

In [None]:
###############################################################################
#The heart of the kernel.  This generator, SwapNoise, replaces roughly
# swap_perc of the data with noise drawn from each column's distribution.
# The noise for all columns but the first two is Gaussian(0,1);
# the 1st column is drawn from [-1, 0, 1] (cp_time);
# the 2nd column is drawn from [0, 1] (cp_dose).
###############################################################################
class SwapNoise(tf.keras.utils.Sequence):
    """Keras Sequence yielding `(noisy_batch, clean_batch)` pairs for DAE training.

    Column layout assumed by the noise model:
      * column 0 - values redrawn from [-1, 0, 1] (cp_time at this notebook's call sites)
      * column 1 - values redrawn from [0, 1]     (cp_dose at this notebook's call sites)
      * columns 2.. - values redrawn from a standard normal

    Parameters
    ----------
    data : 2-D array-like
        Rows to serve.  A private copy is taken, so the caller's array is never
        shuffled or otherwise modified.
    swap_perc : float
        Binomial probability used to decide how many entries get replaced.
    batch_size : int
        Rows per batch.
    shuffle : bool
        Whether to reshuffle the rows at construction and after every epoch.
    """
    def __init__(self, data, swap_perc = .15, batch_size = 128, shuffle=True):
        """Store the settings, copy `data`, and do the initial shuffle."""
        # Newer Keras versions expect the base initialiser to run; on older
        # versions this resolves to object.__init__ and is a no-op.
        super().__init__()
        self.batch_size = batch_size
        # Copy: on_epoch_end() shuffles in place, and without the copy that
        # would reorder whatever array the caller passed in.
        self.data = np.array(data)
        self.swap_perc = swap_perc

        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        return self.data.shape[0] // self.batch_size

    def __getitem__(self, index):
        """Return `(noisy, clean)` for batch number `index`."""
        rows = np.arange(index * self.batch_size, (index + 1) * self.batch_size)
        # Integer-array indexing returns fresh copies, so corrupting `noisy`
        # leaves both `clean` and self.data untouched.
        clean = self.data[rows]
        noisy = self.swap_noise(self.data[rows])

        return noisy, clean

    def on_epoch_end(self):
        """Reshuffle the rows of the private copy when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.data)

    def get_data(self):
        """Return the internal (possibly shuffled) data array."""
        return self.data

    def _swap_count(self, n_trials):
        """Draw how many entries to replace out of `n_trials`; always at least one."""
        return max(1, int(np.random.binomial(n_trials, self.swap_perc)))

    def _swap_column(self, temp, column, choices):
        """Overwrite randomly chosen rows of `column` with draws from `choices`."""
        n_rows = temp.shape[0]
        num_swaps = self._swap_count(n_rows)
        swap_rvs = np.random.choice(a=np.array(choices), size=num_swaps, replace=True)
        positions_in_col = np.random.choice(a=n_rows, size=num_swaps, replace=False)
        temp[positions_in_col, column] = swap_rvs

    def swap_noise(self, temp):
        """Corrupt `temp` in place and return it.

        `temp` is a 2-D batch (rows x columns).  An empty batch is returned
        unchanged; batches with fewer than three columns only get the
        column-wise noise that applies to the columns they have.
        """
        n_rows, n_cols = temp.shape
        if n_rows == 0:
            return temp

        # Row-wise noise for the feature columns (index 2 onward).  The original
        # author notes these are 'normal' due to rankGauss, hence N(0, 1) draws.
        feature_positions = np.arange(2, n_cols)
        if feature_positions.size > 0:
            for row in range(n_rows):
                # Draw the count over the columns that can actually be swapped;
                # drawing it over all n_cols could exceed the population of the
                # without-replacement sample below and raise ValueError.
                num_swaps = self._swap_count(feature_positions.size)
                swap_rvs = np.random.normal(size=num_swaps)
                positions_in_row = np.random.choice(a=feature_positions, size=num_swaps, replace=False)
                temp[row, positions_in_row] = swap_rvs

        # Column-wise noise for cp_time (column 0): values come from [-1, 0, 1].
        if n_cols > 0:
            self._swap_column(temp, 0, [-1, 0, 1])

        # Column-wise noise for cp_dose (column 1): values come from [0, 1].
        if n_cols > 1:
            self._swap_column(temp, 1, [0, 1])

        return temp

In [None]:
def Batch_Drop_Dense(x, layer_size, drop_rate, activation = 'relu'):
    """Apply BatchNormalization -> Dropout -> weight-normalised Dense to `x`.

    `layer_size` and `activation` configure the Dense layer, `drop_rate` the
    Dropout layer.  Returns the output tensor of the wrapped Dense layer.
    """
    layers = tf.keras.layers
    normalised = layers.BatchNormalization()(x)
    dropped = layers.Dropout(drop_rate)(normalised)
    dense = tfa.layers.WeightNormalization(layers.Dense(layer_size, activation=activation))
    return dense(dropped)
    
###################################################################
#Residual Block that keeps the inputs as the same size as the 
# incoming layer.
###################################################################
def Residual_Block_same(prev_nonActivations, size, drop_rate, lv):
    """Residual block whose branch output is added straight onto its input.

    The branch runs BatchNorm -> ReLU -> Dropout(drop_rate) -> Dense(size)
    twice.  The Add layer is named `Residuals_same{lv}` so it can be looked up
    by name later.  Because the input is added without resizing, `size` has to
    match the width of `prev_nonActivations`.
    """
    branch = prev_nonActivations
    for _ in range(2):
        branch = tf.keras.layers.BatchNormalization()(branch)
        branch = tf.keras.activations.relu(branch)
        branch = tf.keras.layers.Dropout(drop_rate)(branch)
        branch = tf.keras.layers.Dense(size)(branch)

    skip_sum = tf.keras.layers.Add(name=f'Residuals_same{lv}')
    return skip_sum([branch, prev_nonActivations])

###################################################################
#Residual Block where the size of the network changes.
###################################################################
def Residual_Block_diff(prev_nonActivations, size, drop_rate, lv):
    """Residual block that changes the layer width to `size`.

    The skip path is a plain Dense(size) projection of the input.  The branch
    runs BatchNorm -> ReLU -> Dropout(drop_rate) -> Dense(size) twice.  Both
    are summed by an Add layer named `Residuals_diff{lv}`.
    """
    # The projection is created first, matching the original layer creation order.
    resized_prev = tf.keras.layers.Dense(size)(prev_nonActivations)

    branch = prev_nonActivations
    for _ in range(2):
        branch = tf.keras.layers.BatchNormalization()(branch)
        branch = tf.keras.activations.relu(branch)
        branch = tf.keras.layers.Dropout(drop_rate)(branch)
        branch = tf.keras.layers.Dense(size)(branch)

    skip_sum = tf.keras.layers.Add(name=f'Residuals_diff{lv}')
    return skip_sum([branch, resized_prev])

def create_DAE(num_columns, middle=1024):
    """Build and compile the denoising autoencoder.

    Architecture: Input(num_columns) -> Dense(middle) -> BatchNorm -> ReLU ->
    Residual_Block_same (Add layer named 'Residuals_same1') ->
    Residual_Block_diff back down to `num_columns` outputs.

    Parameters
    ----------
    num_columns : int
        Number of input features; the reconstruction has the same width.
    middle : int
        Width of the hidden representation.

    Returns
    -------
    tf.keras.Model
        Compiled with MSE loss and Adam wrapped in Lookahead (sync_period=10).
    """
    # Pass the shape as a tuple, as the Keras API documents, rather than relying
    # on a bare int being coerced.
    inp = tf.keras.layers.Input(shape=(num_columns,))

    # No input Dropout here: the corruption comes from the SwapNoise generator instead.
    x = tf.keras.layers.Dense(middle)(inp)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.activations.relu(x)

    # Both dropout rates are .2; lv=1 / lv=2 only feed the Add layers' names.
    x = Residual_Block_same(x, middle, .2, 1)
    x = Residual_Block_diff(x, num_columns, .2, 2)

    model = tf.keras.Model(inputs=inp, outputs=x)
    model.compile(optimizer=tfa.optimizers.Lookahead(tf.optimizers.Adam(), sync_period=10),
                  loss='mse',
                  )
    return model

In [None]:
#Minor check to make sure the architecture works.  Run a toy DAE for one epoch.
# BATCH_SIZE and SWAP_PERC come from the hyperparameter cell; re-assigning
# BATCH_SIZE here would silently override that configuration for the real run.
# The rankGauss-transformed frames are used because SwapNoise injects N(0, 1)
# noise into the feature columns.  The first column (presumably cp_type, which
# is constant after the control rows were removed) is dropped.
toy_train = train_rg.values[:, 1:]
toy_val = test_rg.values[:, 1:]
train_gen = SwapNoise(toy_train, swap_perc = SWAP_PERC, batch_size = BATCH_SIZE, shuffle=True)
val_gen = SwapNoise(toy_val, swap_perc = SWAP_PERC, batch_size = BATCH_SIZE, shuffle=True)

model = create_DAE(toy_train.shape[1], 1092)
model.fit(train_gen, validation_data = val_gen, epochs =1)

In [None]:
#TRAINING THE AUTOENCODERS.  ONE PER REPEAT (SEED) PER FOLD
tf.random.set_seed(42)
# SwapNoise shuffles and draws its noise through NumPy's global RNG, so seed that too.
np.random.seed(42)

# Build the feature matrices once instead of re-materialising `.values` on every fold.
# The rankGauss-transformed frames are used because SwapNoise replaces feature
# values with N(0, 1) draws, which only matches rank-gaussed columns.  The first
# column (presumably cp_type, constant after the control rows were removed) is dropped.
# NOTE(review): whatever feeds these embedders downstream must apply the same transform.
train_matrix = train_rg.values[:, 1:]
test_matrix = test_rg.values[:, 1:]

for seed in range(N_STARTS):
    splitter = MultilabelStratifiedKFold(n_splits=SPLITS, random_state=seed, shuffle=True)
    for fold, (tr, te) in enumerate(splitter.split(train_targets, train_targets)):
        for EMBEDDING_DIMS in [1024]:

            tf.keras.backend.clear_session()
            model_AE = create_DAE(train_matrix.shape[1], EMBEDDING_DIMS)
            # NOTE(review): ':' is not a valid filename character on Windows.  The name
            # is left unchanged because other notebooks may load these files by name.
            checkpoint_path = f'FEATURE_AE_repeat_{seed}_Fold:{fold}_dim{EMBEDDING_DIMS}.hdf5'
            # `min_delta` is the current name of the deprecated `epsilon` argument.
            reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, min_delta=1e-4, mode='min')
            cb_checkpt = ModelCheckpoint(checkpoint_path, monitor = 'val_loss', verbose = 0, save_best_only = True,
                                         save_weights_only = True, mode = 'min')

            # The autoencoder is unsupervised, so the test features join the training rows of the fold.
            train_gen = SwapNoise(np.concatenate((train_matrix[tr], test_matrix), axis=0),
                                  swap_perc = SWAP_PERC, batch_size = BATCH_SIZE, shuffle=True)
            val_gen = SwapNoise(train_matrix[te],
                                  swap_perc = SWAP_PERC, batch_size = BATCH_SIZE, shuffle=True)
            # No batch_size argument: the Sequence already defines the batches.
            model_AE.fit(train_gen, validation_data=val_gen,
                      epochs=EPOCHS,
                      callbacks=[reduce_lr_loss, cb_checkpt], verbose=2
                     )
            # Restore the best-val_loss weights, then cut the network at the first residual sum.
            model_AE.load_weights(checkpoint_path)
            embedder = tf.keras.Model(inputs = model_AE.input, outputs = model_AE.get_layer(name='Residuals_same1').output)

            # Replace the weights-only checkpoint with the full embedder model under the same name.
            os.remove(checkpoint_path)
            tf.keras.models.save_model(embedder, checkpoint_path)
            del train_gen, val_gen; gc.collect()