# MoA: Anomaly Detection

* We have a lot of data in this competition which has no MoAs
* The control data (cp_type = ctl_vehicle) has been unused so far - training the model on this data makes the scores worse.
* However, this data could be useful in identifying which observations are "outliers", i.e., likely to have some MoA.

## Anomaly Detection

* The basic idea behind anomaly detection is to build a model that produces the expected output for regular examples, and an output that deviates strongly from it when fed anomalous data.

* One useful metric to monitor is the reconstruction error of an autoencoder: if an autoencoder trained on anomaly-free data reconstructs that data with low error but shows a large reconstruction error for a new data point, that point is likely an anomaly.

* Similarly, PCA reconstruction error can also be used as a metric

## Application to MoA:

* Consider data with an MoA as the anomaly. Then, we want to use a model trained on the data with no MoA to generate features.
* Since we are not actually performing anomaly detection, the decoding/reconstruction error computation step is not strictly required.
* Instead, we feed the encoded features as the input to the model (the per-sample reconstruction errors are appended as additional features).
* Ideally, the encoded features will be better separated to begin with than the raw features, resulting in improved model performance.


### References:
* https://www.kaggle.com/c/lish-moa/discussion/185126
* https://www.kaggle.com/konradb/anomaly-detection

## Updates in V16:

* Change CV to [Grouped Multilabel Stratified KFold](https://www.kaggle.com/c/lish-moa/discussion/195195) using train_drug.csv
* Use [Denoising AutoEncoder with swap noise](https://www.kaggle.com/c/lish-moa/discussion/195642)
* Separate AutoEncoders for Cell and Gene features

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, mean_squared_error
import tensorflow as tf
from tensorflow.keras import layers,regularizers,Sequential,backend,callbacks,optimizers,metrics, Model,losses
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# CV

def mlgskf(FOLDS = 5,SEED = 42):
    """Assign a multilabel-stratified CV fold to every row of the scored targets.

    Reads train_targets_scored.csv and train_drug.csv from the Kaggle input
    directory and joins them on sig_id. Drugs that occur at most 18 times are
    assigned to a fold as a whole (stratified on the drug's mean target
    vector); rows of drugs that occur more than 18 times are assigned
    individually by sig_id.

    Parameters
    ----------
    FOLDS : number of splits handed to MultilabelStratifiedKFold.
    SEED : random_state handed to MultilabelStratifiedKFold.

    Returns
    -------
    The merged targets/drug frame with an extra int8 'fold' column.
    """
    # Load the targets and attach the drug id of every sample
    scored = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
    drug = pd.read_csv('/kaggle/input/lish-moa/train_drug.csv')
    targets = scored.columns[1:]
    scored = scored.merge(drug, on='sig_id', how='left')

    # Split the drugs by how often they occur
    counts = scored.drug_id.value_counts()
    rare_drugs = counts.loc[counts<=18].index.sort_values()
    common_drugs = counts.loc[counts>18].index.sort_values()

    # Rare drugs: one row per drug, so a drug never straddles two folds
    drug_to_fold = {}
    splitter = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True,
                                         random_state=SEED)
    per_drug = scored.groupby('drug_id')[targets].mean().loc[rare_drugs]
    for fold, (_, val_pos) in enumerate(splitter.split(per_drug, per_drug[targets])):
        for drug_id in per_drug.index[val_pos].values:
            drug_to_fold[drug_id] = fold

    # Common drugs: stratify the individual samples instead
    sig_to_fold = {}
    splitter = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True,
                                         random_state=SEED)
    per_row = scored.loc[scored.drug_id.isin(common_drugs)].reset_index(drop=True)
    for fold, (_, val_pos) in enumerate(splitter.split(per_row, per_row[targets])):
        for sig_id in per_row.sig_id[val_pos].values:
            sig_to_fold[sig_id] = fold

    # Drug-level folds first; rows still unassigned get their sample-level fold
    scored['fold'] = scored.drug_id.map(drug_to_fold)
    unassigned = scored.fold.isna()
    scored.loc[unassigned,'fold'] = scored.loc[unassigned,'sig_id'].map(sig_to_fold)
    scored.fold = scored.fold.astype('int8')
    return scored


In [None]:
# Load the training features. cp_type and the row labels of the non-control
# samples are captured first, because sig_id and cp_type are dropped right after.

train_features = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
cp_type_train = train_features['cp_type']
non_ctl_idx = train_features.loc[cp_type_train != 'ctl_vehicle'].index.to_list()
train_features = train_features.drop(columns=['sig_id','cp_type'])

# Scored targets as a plain array, without the sig_id column

train_targets_scored = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv').drop(columns='sig_id')
labels_train = train_targets_scored.values

# Test features (cp_type is kept here; only sig_id is dropped)

test_features = pd.read_csv('/kaggle/input/lish-moa/test_features.csv').drop(columns=['sig_id'])

# Boolean column masks selecting the gene ('g-') and cell ('c-') features

gs = train_features.columns.str.contains('g-')
cs = train_features.columns.str.contains('c-')

# Label-encode the categorical cp_dose column: the encoder is fitted on the
# training values and then applied to both the test and the train frame

cat = 'cp_dose'
le = preprocessing.LabelEncoder()
le.fit(train_features[cat])
test_features[cat] = le.transform(test_features[cat])
train_features[cat] = le.transform(train_features[cat])

In [None]:
# Data with no MoA, including controls from public test.
# pd.concat is used instead of DataFrame.append / Series.append, which were
# deprecated in pandas 1.4 and removed in pandas 2.0.

no_label_rows = labels_train.sum(axis=1)==0             # training rows without any positive target
test_is_ctl = test_features['cp_type']=='ctl_vehicle'   # control rows of the test set

non_moa = pd.concat([train_features.iloc[no_label_rows],
                     test_features.loc[test_is_ctl].drop('cp_type',axis=1)])

# Non MoA Stratifier: the cp_type of every row in non_moa, in the same order

non_moa_stratifier = pd.concat([cp_type_train[no_label_rows],
                                test_features.loc[test_is_ctl,'cp_type']])

In [None]:
# Keep only the non-control rows (features and labels together) for the
# supervised model

train_features, labels_train = train_features.iloc[non_ctl_idx], labels_train[non_ctl_idx]

In [None]:
# Quantile-transform every feature column to a normal output distribution.
# The transformer is fitted on train_features and then reused as-is for the
# no-MoA rows and for the test rows.

scaler = preprocessing.QuantileTransformer(output_distribution='normal')
data_train = scaler.fit_transform(train_features)

# No-MoA data through the fitted transformer

non_moa = scaler.transform(non_moa)

# Test data through the fitted transformer (cp_type is not a feature)

data_test = scaler.transform(test_features.drop(columns='cp_type'))

In [None]:
# Dataset dimensions used further down
n_train, n_features = data_train.shape
n_test = data_test.shape[0]
n_labels = labels_train.shape[1]

In [None]:
# Generate swap noise on AutoEncoder inputs

def swap_noise(data_in,noise_fraction = 0.15, n_splits = 100,seed=42):
    """Return a swap-noised copy of a 2-D array.

    The rows are cut into `n_splits` consecutive chunks. Inside every chunk a
    random subset of columns is selected and the rows of those columns are
    shuffled (the same row permutation for all selected columns), so each
    noised value is a real value of the same column taken from another row
    of the chunk.

    Parameters
    ----------
    data_in : 2-D NumPy array; it is NOT modified.
    noise_fraction : fraction of the columns that gets shuffled per chunk
        (int(noise_fraction * n_columns) columns).
    n_splits : number of row chunks, passed to np.array_split.
    seed : value passed to np.random.seed. The global NumPy RNG is seeded on
        purpose, exactly as before, so code running afterwards sees the same
        RNG state.

    Returns
    -------
    A new array with the same shape as `data_in`.
    """
    np.random.seed(seed)
    data_out = []
    for chunk in np.array_split(data_in,n_splits):
        # np.array_split hands out views of data_in; writing into them would
        # silently corrupt the caller's array, so work on a private copy.
        noisy = chunk.copy()
        n_samples, n_features = noisy.shape

        swap_cols = np.random.choice(np.arange(n_features),int(noise_fraction*n_features),replace=False)
        swap_idx = np.random.permutation(np.arange(n_samples))
        noisy[:,swap_cols] = chunk[:,swap_cols][swap_idx,:]
        data_out.append(noisy)

    return np.concatenate(data_out)

def swap_data(data_in):
    """Return a copy of `data_in` with swap noise applied per feature group.

    The cell columns (mask `cs`) are noised with seed 0 and the gene columns
    (mask `gs`) with seed 1; all remaining columns are left untouched.
    """
    noisy = data_in.copy()
    for seed, column_mask in enumerate((cs, gs)):
        noisy[:,column_mask] = swap_noise(noisy[:,column_mask],seed=seed)
    return noisy

# Hold out 15% of the no-MoA rows (stratified on cp_type), then build a
# swap-noised copy of each part
non_moa_train, non_moa_test = train_test_split(
    non_moa,
    test_size=0.15,
    stratify=non_moa_stratifier.values,
)
non_moa_train_swap, non_moa_test_swap = swap_data(non_moa_train), swap_data(non_moa_test)

In [None]:
# Create AutoEncoders

# Activation and input-dropout rate shared by every Autoencoder instance
activation = 'elu'
dropout = 0.0

class Autoencoder(Model):
    """Dense autoencoder with separately callable `encoder` and `decoder`.

    The encoder stacks three Dense layers of 0.5x, 0.75x and 1x `encoding_dim`
    units (batch-normalised, preceded by an input Dropout); the decoder
    mirrors it with 0.75x and 0.5x `encoding_dim` units and ends in a linear
    Dense layer of `n_features` units.

    Parameters
    ----------
    n_features : width of the input (and of the reconstruction).
    encoding_dim : width of the encoder output.
    """
    def __init__(self, n_features, encoding_dim):
        super(Autoencoder, self).__init__()
        # Dense documents `units` as a positive integer. The fractional
        # widths are floats and callers pass NumPy integers (e.g. cs.sum()),
        # which newer Keras versions reject, so cast everything to int here.
        half_dim = int(0.5*encoding_dim)
        three_quarter_dim = int(0.75*encoding_dim)
        encoding_dim = int(encoding_dim)
        n_features = int(n_features)

        self.encoder = Sequential([
            layers.Dropout(dropout),
            layers.BatchNormalization(),
            layers.Dense(half_dim,activation=activation),
            layers.BatchNormalization(),
            layers.Dense(three_quarter_dim,activation=activation),
            layers.BatchNormalization(),
            layers.Dense(encoding_dim,activation=activation),
            ])

        self.decoder = Sequential([
            layers.Dense(three_quarter_dim,activation=activation),
            layers.BatchNormalization(),
            layers.Dense(half_dim,activation=activation),
            layers.BatchNormalization(),
            layers.Dense(n_features)
            ])

    def call(self, x):
        """Encode `x` and return its reconstruction."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


# Cells

# Callbacks for the autoencoder fits: cut the learning rate when val_loss
# plateaus and stop early, restoring the best weights seen.
ae_reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=5, min_lr=1E-5)
ae_early_stopping = callbacks.EarlyStopping(monitor='val_loss', min_delta=1E-5, patience=32, restore_best_weights=True)

encoding_dim = 512

# Denoising setup: swap-noised inputs, clean targets. Validation uses the
# clean held-out rows as both input and target.
autoencoder_cells = Autoencoder(cs.sum(),encoding_dim)
autoencoder_cells.compile(optimizer=optimizers.Adam(learning_rate=1E-4), loss='mse')
hist = autoencoder_cells.fit(non_moa_train_swap[:,cs],non_moa_train[:,cs],batch_size=128, verbose=0,
                             validation_data = (non_moa_test[:,cs],non_moa_test[:,cs]), epochs=1024,
                             shuffle=True, callbacks=[ae_reduce_lr, ae_early_stopping])

plt.figure(figsize=(12,8))
plt.plot(hist.history['loss'])
plt.plot(hist.history['val_loss'])
plt.yscale('log')
plt.xlabel('Epochs')
# The model is compiled with loss='mse', so the curves are MSE, not logloss
plt.ylabel('Mean Squared Error')
plt.legend(['Training','Validation'])

In [None]:
# Genes

encoding_dim = 1536

# Denoising setup: swap-noised inputs, clean targets. Validation uses the
# clean held-out rows as both input and target.
autoencoder_genes = Autoencoder(gs.sum(),encoding_dim)
autoencoder_genes.compile(optimizer=optimizers.Adam(learning_rate=1E-4), loss='mse')
hist = autoencoder_genes.fit(non_moa_train_swap[:,gs],non_moa_train[:,gs],batch_size=128, verbose=0,
                             validation_data = (non_moa_test[:,gs],non_moa_test[:,gs]),
                             epochs=1024, shuffle=True, callbacks=[ae_reduce_lr, ae_early_stopping])

plt.figure(figsize=(12,8))
plt.plot(hist.history['loss'])
plt.plot(hist.history['val_loss'])
plt.yscale('log')
plt.xlabel('Epochs')
# The model is compiled with loss='mse', so the curves are MSE, not logloss
plt.ylabel('Mean Squared Error')
plt.legend(['Training','Validation'])

In [None]:
# Encode the cell columns of every dataset with the trained cell autoencoder

autoencoder_cells_train = autoencoder_cells.encoder(data_train[:,cs]).numpy()
autoencoder_cells_test = autoencoder_cells.encoder(data_test[:,cs]).numpy()
autoencoder_cells_non_moa_test = autoencoder_cells.encoder(non_moa_test[:,cs]).numpy()

# Overall reconstruction error on the held-out no-MoA rows ...
cells_rebuilt_non_moa = autoencoder_cells.decoder(autoencoder_cells_non_moa_test).numpy()
autoencoder_cells_error = mean_squared_error(non_moa_test[:,cs],cells_rebuilt_non_moa)
print("Autoencoder Cells reconstruction error for Non MoA is " + str(autoencoder_cells_error))

# ... and on the test set (autoencoder_cells_error ends up holding this value)
cells_rebuilt_test = autoencoder_cells.decoder(autoencoder_cells_test).numpy()
autoencoder_cells_error = mean_squared_error(data_test[:,cs],cells_rebuilt_test)
print("Autoencoder Cells reconstruction error for test dataset is " + str(autoencoder_cells_error))

In [None]:
# Encode the gene columns of every dataset with the trained gene autoencoder

autoencoder_genes_train = autoencoder_genes.encoder(data_train[:,gs]).numpy()
autoencoder_genes_test = autoencoder_genes.encoder(data_test[:,gs]).numpy()
autoencoder_genes_non_moa_test = autoencoder_genes.encoder(non_moa_test[:,gs]).numpy()

# Overall reconstruction error on the held-out no-MoA rows ...
genes_rebuilt_non_moa = autoencoder_genes.decoder(autoencoder_genes_non_moa_test).numpy()
autoencoder_genes_error = mean_squared_error(non_moa_test[:,gs],genes_rebuilt_non_moa)
print("Autoencoder Genes reconstruction error for Non MoA is " + str(autoencoder_genes_error))

# ... and on the test set (autoencoder_genes_error ends up holding this value)
genes_rebuilt_test = autoencoder_genes.decoder(autoencoder_genes_test).numpy()
autoencoder_genes_error = mean_squared_error(data_test[:,gs],genes_rebuilt_test)
print("Autoencoder Genes reconstruction error for test dataset is " + str(autoencoder_genes_error))

In [None]:
# PCA on the cell columns, fitted on the no-MoA training split only

pca_cells = PCA(n_components=64)
pca_cells.fit(non_moa_train[:,cs])

# Projections used as model features
pca_cells_train = pca_cells.transform(data_train[:,cs])
pca_cells_test = pca_cells.transform(data_test[:,cs])

# Overall reconstruction error on the held-out no-MoA rows ...
pca_cells_rebuilt_non_moa = pca_cells.inverse_transform(pca_cells.transform(non_moa_test[:,cs]))
pca_cells_error = mean_squared_error(non_moa_test[:,cs],pca_cells_rebuilt_non_moa)
print('PCA Cells Reconstruction Error for Non MoA is ' + str(pca_cells_error))

# ... and on the test set (pca_cells_error ends up holding this value)
pca_cells_rebuilt_test = pca_cells.inverse_transform(pca_cells.transform(data_test[:,cs]))
pca_cells_error = mean_squared_error(data_test[:,cs],pca_cells_rebuilt_test)
print('PCA Cells Reconstruction Error for test dataset is ' + str(pca_cells_error))

In [None]:
# PCA on the gene columns, fitted on the no-MoA training split only

pca_genes = PCA(n_components=256)
pca_genes.fit(non_moa_train[:,gs])

# Projections used as model features
pca_genes_train = pca_genes.transform(data_train[:,gs])
pca_genes_test = pca_genes.transform(data_test[:,gs])

# Overall reconstruction error on the held-out no-MoA rows ...
pca_genes_rebuilt_non_moa = pca_genes.inverse_transform(pca_genes.transform(non_moa_test[:,gs]))
pca_genes_error = mean_squared_error(non_moa_test[:,gs],pca_genes_rebuilt_non_moa)
print('PCA Genes Reconstruction Error for Non MoA is ' + str(pca_genes_error))

# ... and on the test set (pca_genes_error ends up holding this value)
pca_genes_rebuilt_test = pca_genes.inverse_transform(pca_genes.transform(data_test[:,gs]))
pca_genes_error = mean_squared_error(data_test[:,gs],pca_genes_rebuilt_test)
print('PCA Genes Reconstruction Error for test dataset is ' + str(pca_genes_error))

In [None]:
# Per-sample reconstruction errors, used as additional model features

def _per_sample_mse(original, rebuilt):
    """One MSE value per row: the arrays are transposed so that every sample
    becomes an 'output' and multioutput='raw_values' keeps them separate."""
    return mean_squared_error(original.T, rebuilt.T, multioutput='raw_values')

def _autoencoder_roundtrip(autoencoder, block):
    """Encode `block` and decode it again with the given autoencoder."""
    return autoencoder.decoder(autoencoder.encoder(block).numpy()).numpy()

def _pca_roundtrip(pca, block):
    """Project `block` onto the PCA components and map it back."""
    return pca.inverse_transform(pca.transform(block))

# Cell features
autoencoder_cells_error_train = _per_sample_mse(data_train[:,cs], _autoencoder_roundtrip(autoencoder_cells, data_train[:,cs]))
autoencoder_cells_error_test = _per_sample_mse(data_test[:,cs], _autoencoder_roundtrip(autoencoder_cells, data_test[:,cs]))
pca_cells_error_train = _per_sample_mse(data_train[:,cs], _pca_roundtrip(pca_cells, data_train[:,cs]))
pca_cells_error_test = _per_sample_mse(data_test[:,cs], _pca_roundtrip(pca_cells, data_test[:,cs]))

# Gene features
autoencoder_genes_error_train = _per_sample_mse(data_train[:,gs], _autoencoder_roundtrip(autoencoder_genes, data_train[:,gs]))
autoencoder_genes_error_test = _per_sample_mse(data_test[:,gs], _autoencoder_roundtrip(autoencoder_genes, data_test[:,gs]))
pca_genes_error_train = _per_sample_mse(data_train[:,gs], _pca_roundtrip(pca_genes, data_train[:,gs]))
pca_genes_error_test = _per_sample_mse(data_test[:,gs], _pca_roundtrip(pca_genes, data_test[:,gs]))

# Visualization of Anomaly Detection

### The scatter plots below check whether the autoencoder and PCA reconstruction errors separate the data points with an MoA from those without one. Some points are very well separated, while others still overlap heavily. PCA currently separates the data better than the autoencoder, but there is room for further improvement.

In [None]:
# Training rows that carry at least one MoA label
moa_train = data_train[labels_train.sum(axis=1)!=0,:]

# Reconstructions of the cell columns for both groups, via PCA and via the autoencoder
cells_pca_non_moa = pca_cells.inverse_transform(pca_cells.transform(non_moa_test[:,cs]))
cells_pca_moa = pca_cells.inverse_transform(pca_cells.transform(moa_train[:,cs]))
cells_ae_non_moa = autoencoder_cells.decoder(autoencoder_cells_non_moa_test).numpy()
cells_ae_moa = autoencoder_cells.decoder(autoencoder_cells.encoder(moa_train[:,cs]).numpy()).numpy()

# One error per sample (transpose + raw_values keeps the samples separate)
pca_cells_error_non_moa = mean_squared_error(non_moa_test[:,cs].T,cells_pca_non_moa.T, multioutput='raw_values')
pca_cells_error_moa = mean_squared_error(moa_train[:,cs].T,cells_pca_moa.T, multioutput='raw_values')
autoencoder_cells_error_non_moa = mean_squared_error(non_moa_test[:,cs].T,cells_ae_non_moa.T, multioutput='raw_values')
autoencoder_cells_error_moa = mean_squared_error(moa_train[:,cs].T,cells_ae_moa.T, multioutput='raw_values')

# PCA error against autoencoder error, MoA in red and no-MoA in blue
plt.figure(figsize=(16, 8))
plt.scatter(pca_cells_error_moa,autoencoder_cells_error_moa,c='red',marker='o')
plt.scatter(pca_cells_error_non_moa,autoencoder_cells_error_non_moa,c='blue',marker='s')
plt.legend(['MoA','No MoA'])
plt.xlabel('PCA Cells Reconstruction Error')
plt.ylabel('Autoencoder Cells Reconstruction Error');

In [None]:
# Reconstructions of the gene columns for both groups, via PCA and via the autoencoder
genes_pca_non_moa = pca_genes.inverse_transform(pca_genes.transform(non_moa_test[:,gs]))
genes_pca_moa = pca_genes.inverse_transform(pca_genes.transform(moa_train[:,gs]))
genes_ae_non_moa = autoencoder_genes.decoder(autoencoder_genes_non_moa_test).numpy()
genes_ae_moa = autoencoder_genes.decoder(autoencoder_genes.encoder(moa_train[:,gs]).numpy()).numpy()

# One error per sample (transpose + raw_values keeps the samples separate)
pca_genes_error_non_moa = mean_squared_error(non_moa_test[:,gs].T,genes_pca_non_moa.T, multioutput='raw_values')
pca_genes_error_moa = mean_squared_error(moa_train[:,gs].T,genes_pca_moa.T, multioutput='raw_values')
autoencoder_genes_error_non_moa = mean_squared_error(non_moa_test[:,gs].T,genes_ae_non_moa.T, multioutput='raw_values')
autoencoder_genes_error_moa = mean_squared_error(moa_train[:,gs].T,genes_ae_moa.T, multioutput='raw_values')

# PCA error against autoencoder error, MoA in red and no-MoA in blue
plt.figure(figsize=(16, 8))
plt.scatter(pca_genes_error_moa,autoencoder_genes_error_moa,c='red',marker='o')
plt.scatter(pca_genes_error_non_moa,autoencoder_genes_error_non_moa,c='blue',marker='s')
plt.legend(['MoA','No MoA'])
plt.xlabel('PCA Genes Reconstruction Error')
plt.ylabel('Autoencoder Genes Reconstruction Error');
# Fixed axis limits for this plot
plt.xlim((0,2))
plt.ylim((0,3))

# Training/Inference

In [None]:
# Final model inputs: the first two scaled columns, then the PCA and
# autoencoder encodings, then the four per-sample reconstruction errors
# (each reshaped into a single column).

train_parts = (
    data_train[:,:2],
    pca_cells_train, autoencoder_cells_train,
    pca_genes_train, autoencoder_genes_train,
    pca_cells_error_train.reshape(-1,1), autoencoder_cells_error_train.reshape(-1,1),
    pca_genes_error_train.reshape(-1,1), autoencoder_genes_error_train.reshape(-1,1),
)
data_train = np.concatenate(train_parts,axis=1)

test_parts = (
    data_test[:,:2],
    pca_cells_test, autoencoder_cells_test,
    pca_genes_test, autoencoder_genes_test,
    pca_cells_error_test.reshape(-1,1), autoencoder_cells_error_test.reshape(-1,1),
    pca_genes_error_test.reshape(-1,1), autoencoder_genes_error_test.reshape(-1,1),
)
data_test = np.concatenate(test_parts,axis=1)

In [None]:
# Dimensions of the assembled model inputs
n_train, n_features = data_train.shape
n_test = data_test.shape[0]
n_labels = labels_train.shape[1]


# Prediction Clipping Thresholds (symmetric around 0.5)

p_min = 5E-4
p_max = 1-p_min

# Evaluation Metric with clipping and no label smoothing

def logloss(y_true, y_pred):
    """Mean binary log loss with predictions clipped to [p_min, p_max]."""
    clipped = tf.clip_by_value(y_pred,p_min,p_max)
    positive_term = y_true*backend.log(clipped)
    negative_term = (1-y_true)*backend.log(1-clipped)
    return -backend.mean(positive_term + negative_term)


# Generate Seeds

seeds = [148,161,41]
n_seeds = len(seeds)

# Training Loop
#
# For every seed the CV folds are rebuilt with that seed, and each fold is
# used once as the validation split. Test predictions and the OOF score are
# averaged over all n_folds*n_seeds models.

n_folds = 5
y_pred = np.zeros((n_test,n_labels))
oof = tf.constant(0.0)
hists = []
# Output bias starts at the log of each label's mean frequency.
# NOTE(review): for a sigmoid output the exact prior would be the logit
# log(p/(1-p)); log(p) is close for rare labels - confirm this is intended.
bias = tf.keras.initializers.Constant(np.log(labels_train.mean(axis=0)))


def scheduler(epoch,lr):
    """Cyclic learning rate: step up by 1E-3/8 or down by 1E-3/8 per epoch.

    NOTE(review): within each 16-epoch cycle there are 9 up-steps
    (epoch%16 in 0..8) and 7 down-steps, so the rate drifts upward from
    cycle to cycle - confirm `<9` (rather than `<8`) is intended.
    """
    if epoch%16<9:
        lr += (1E-3)/8
    else:
        lr -= (1E-3)/8
    return lr


for seed in seeds:
    # Pass the fold count and the seed through, so the split matches n_folds
    # and actually differs between seeds.
    labels_grouped = mlgskf(FOLDS=n_folds,SEED=seed)
    fold_of_row = labels_grouped.iloc[non_ctl_idx]['fold'].values
    for fold in range(n_folds):
        # Validate on the current fold. The masks used to be hard-coded to
        # fold 0, which trained and validated every model on the same split.
        test = fold_of_row==fold
        train = fold_of_row!=fold
        X_train = data_train[train]
        X_test = data_train[test]
        y_train = labels_train[train]
        y_test = labels_train[test]

        # Define NN Model

        model = Sequential()
        model.add(layers.BatchNormalization())
        model.add(layers.Dropout(0.7))
        model.add(layers.Dense(1536))
        model.add(layers.Activation('elu'))
        model.add(layers.Dropout(0.5))
        model.add(layers.Dense(1024))
        model.add(layers.Activation('elu'))
        model.add(layers.Dropout(0.5))
        model.add(layers.Dense(512))
        model.add(layers.Activation('elu'))
        model.add(layers.Dropout(0.5))
        model.add(layers.Dense(n_labels,activation='sigmoid',bias_initializer=bias))
        # Train with a lightly label-smoothed loss; monitor the unsmoothed,
        # clipped logloss (metrics is passed in its documented list form).
        model.compile(optimizer=optimizers.Adam(learning_rate=1E-5), loss=losses.BinaryCrossentropy(label_smoothing=0.0005), metrics=[logloss])
        reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_logloss', factor=0.1, patience=2, mode='min', min_lr=1E-5)
        early_stopping = callbacks.EarlyStopping(monitor='val_logloss', min_delta=1E-5, patience=15, mode='min',restore_best_weights=True)
        lr_scheduler = callbacks.LearningRateScheduler(scheduler)

        # Phase 1: cyclic learning rate. Phase 2: continue with
        # reduce-on-plateau. Only the phase-1 history is kept for plotting.
        hist = model.fit(X_train,y_train, batch_size=128, epochs=192,verbose=0,validation_data = (X_test,y_test),callbacks=[lr_scheduler, early_stopping])
        model.fit(X_train,y_train, batch_size=128, epochs=192,verbose=0,validation_data = (X_test,y_test),callbacks=[reduce_lr, early_stopping])
        hists.append(hist)

        # Save Model
        model.save('Anomaly_seed_'+str(seed)+'_fold_'+str(fold))

        # OOF Score
        y_val = model.predict(X_test)
        oof += logloss(tf.constant(y_test,dtype=tf.float32),tf.constant(y_val,dtype=tf.float32))/(n_folds*n_seeds)

        # Run prediction
        y_pred += model.predict(data_test)/(n_folds*n_seeds)

In [None]:
# Analysis of Training: OOF score plus the training/validation logloss curves
# averaged over all runs (truncated to the shortest run)

tf.print('OOF score is ',oof)

plt.figure(figsize=(12,8))

# Training curves
hist_trains = [hists[run].history['logloss'] for run in range(n_folds*n_seeds)]
hist_lens = [len(curve) for curve in hist_trains]
hist_train = [np.mean([hist_trains[run][epoch] for run in range(n_folds*n_seeds)])
              for epoch in range(min(hist_lens))]

plt.plot(hist_train)

# Validation curves
hist_vals = [hists[run].history['val_logloss'] for run in range(n_folds*n_seeds)]
hist_lens = [len(curve) for curve in hist_vals]
hist_val = [np.mean([hist_vals[run][epoch] for run in range(n_folds*n_seeds)])
            for epoch in range(min(hist_lens))]

plt.plot(hist_val)

plt.yscale('log')
plt.xlabel('Epochs')
plt.ylabel('Average Logloss')
plt.legend(['Training','Validation'])

In [None]:
# Generate submission file, Clip Predictions

sub = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')
sub.iloc[:,1:] = np.clip(y_pred,p_min,p_max)

# Set ctl_vehicle to 0
# iloc is positional and documented for boolean arrays, not boolean Series,
# so the mask is converted to a NumPy array first.
# Assumes sub and test_features list the samples in the same row order.
ctl_rows = (test_features['cp_type'] == 'ctl_vehicle').values
sub.iloc[ctl_rows,1:] = 0

# Save Submission
sub.to_csv('submission.csv', index=False)