In [None]:
# Execution-environment switch. When True, the offline wheels under /kaggle/input/moa-env
# are pip-installed and with_input_path() resolves against the absolute /kaggle/input tree;
# when False, input paths are resolved relative to the working directory (./kaggle/input).
IS_KAGGLE_KERNEL = True

In [None]:
import datetime
import os

import tensorflow as tf
from tensorflow import keras as keras
from tensorflow.keras import layers, optimizers, activations, losses, backend
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
import tensorflow_addons as tfa
import numpy as np

# from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import resample, shuffle
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import log_loss

if IS_KAGGLE_KERNEL:
    ! pip install "/kaggle/input/moa-env/joblib-0.17.0-py3-none-any.whl"
    ! pip install "/kaggle/input/moa-env/iterative_stratification-0.1.6-py3-none-any.whl"
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import pandas as pd

import matplotlib.pyplot as plt


In [None]:
def with_input_path(s):
    """Return the path of competition file `s` inside the lish-moa input directory.

    On a Kaggle kernel the data lives under the absolute /kaggle/input tree;
    otherwise the same tree is expected relative to the working directory.
    """
    if IS_KAGGLE_KERNEL:
        input_root = "/kaggle/input/lish-moa/"
    else:
        input_root = "./kaggle/input/lish-moa/"
    return os.path.join(input_root, s)

## Construct folds stratified by drug label

The drugs are highly imbalanced (as visualized above). For K-fold validation, we need to stratify (approximately preserve class balance) but also make sure that a trained model is able to _classify drugs not seen during training_. To mimic this effect, we will spread out the very frequently occurring drugs (say, drugs that appear more than 20 times in the dataset) among all folds, but concentrate all of the instances of each infrequent drug into a single fold. Then, when a fold is used for testing, it will consist mostly of drugs that also appear in the training folds, plus a _few_ drugs (a couple hundred rows) that haven't been seen during training.

In [None]:
# Decided against specialized stratification; really messed up validation scores
# def mskf_cv(n_splits, seed, pkg):
#     """Multilabel Stratified K-fold cross validation
    
#     This algorithm is designed to do stratification in two ways:
#         1. For frequent drugs (occurring at least 20 times in the training set), construct
#             stratified folds per normal k-fold CV
#         2. For less frequent drugs, allocate all of the drug to a single fold.
#             This replicates the situation where the test set will contain
#             drugs that did not occur in the train set
#     """
    
#     df_id, df_targets = pkg

#     # Get drug_id's and count them
#     drug_counts = df_id.drug_id.value_counts()
    
#     # Construct a composite df that includes drug_id alongside sig_id and all of the (multi)-labels
#     targets = df_targets.columns[1:]
#     composite = df_id.merge(right=df_targets, on='sig_id', how='left')
    
#     # Find the indices of all of the `sig_id` corresponding to frequent drugs counts
#     frequent_drugs = drug_counts.loc[drug_counts >= 20].index.values
#     infrequent_drugs = drug_counts.loc[drug_counts < 20].index.values

#     freq_idx = composite.index[composite.drug_id.isin(frequent_drugs)]
#     infreq_idx = composite.index[composite.drug_id.isin(infrequent_drugs)]

#     test_folds = []
#     train_folds = []

#     # First split is straightforward: vanilla Multilabel Stratified KF on the frequent drugs
#     cv = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
#     y_freq = df_targets.loc[freq_idx][targets]
#     for i, (train_locs, test_locs) in enumerate(cv.split(freq_idx, y_freq)):
#         test_folds.append(freq_idx[test_locs])
#         train_folds.append(freq_idx[train_locs])


#     # Second split takes two steps: first, split on _drug labels_ (not rows), and then
#     # recover row indices corresponding to each of these label(sets)
#     y_dummy = range(len(infrequent_drugs))
#     cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
#     for i, (train_locs, test_locs) in enumerate(cv.split(infrequent_drugs, np.ones_like(infrequent_drugs))):
#         # Single out a set of drugs to put into this fold
#         infreq_tst_fold = infrequent_drugs[test_locs]
#         infreq_tst_fold_idx = composite.loc[composite.drug_id.isin(infreq_tst_fold)].index
#         test_folds[i] = np.concatenate((test_folds[i], infreq_tst_fold_idx.values))

#         infreq_tr_fold = infrequent_drugs[train_locs]
#         infreq_tr_fold_idx = composite.loc[composite.drug_id.isin(infreq_tr_fold)].index

#         train_folds[i] = np.concatenate((train_folds[i], infreq_tr_fold_idx.values))    
        
#     return zip(train_folds, test_folds)

## Basic data preprocessing

Load the data, drop out the ID columns and control columns, replace dosages with numerics, and normalize.

In [None]:
# Read the raw train and test feature tables from the competition input directory
df_train, df_test = (
    pd.read_csv(with_input_path(csv_name))
    for csv_name in ("train_features.csv", "test_features.csv")
)

In [None]:
# sig_id is a meaningless identifier, so it is not used as a feature
df_train = df_train.drop("sig_id", axis=1)
df_test = df_test.drop("sig_id", axis=1)

# cp_type indicates control (just vehicle) vs. drug. For now, we'll set all control experiments
# to have zero MoA's before submission. We will therefore ignore this feature in training.
# `train` keeps the unfiltered frame (including cp_type) so that later cells can apply the
# same control-row mask to the target tables.
train = df_train.copy()
df_train = df_train[train["cp_type"] != 'ctl_vehicle'].reset_index(drop=True)
df_train = df_train.drop("cp_type", axis=1)

# Save these to set control exp MoA's to zero after training
test_control_locs = df_test.loc[df_test["cp_type"] == 'ctl_vehicle'].index
df_test = df_test.drop("cp_type", axis=1)

# Dosages are strings right now. I don't exactly know the dosages used but we can pretend it was
# either a single dose or a double dose.
# The column is reassigned explicitly rather than mutated through a chained
# `df[col].replace(..., inplace=True)`: that pattern operates on a temporary Series, which
# triggers a chained-assignment warning on recent pandas and does nothing under Copy-on-Write.
dose_codes = {'D1': 1, 'D2': 2}
df_train['cp_dose'] = df_train['cp_dose'].map(dose_codes)
df_test['cp_dose'] = df_test['cp_dose'].map(dose_codes)

# Any dose label other than D1/D2 would have been mapped to NaN; fail loudly instead of
# feeding NaNs into the scaler and the network.
assert df_train['cp_dose'].notna().all(), "unexpected cp_dose value in train features"
assert df_test['cp_dose'].notna().all(), "unexpected cp_dose value in test features"

In [None]:
# Fit a single min-max scaler on the stacked train + test rows so that both sets are
# mapped into the same [-1, 1] range, then transform each set separately
X_total = np.vstack((df_train, df_test))
scaler = MinMaxScaler(feature_range=(-1, 1)).fit(X_total)
X_train, X_test = (scaler.transform(frame) for frame in (df_train, df_test))

In [None]:
# The last 100 columns (cell viabilities) are highly correlated, so they are replaced by
# their principal components, keeping enough components for 97% cumulative explained variance.
# NOTE(review): the PCA is fitted on X_total, which holds the *unscaled* stacked features,
# while the remaining columns of X_train / X_test are min-max scaled -- confirm this is intended.
n = len(X_train)
pca = PCA(n_components=0.97)
cell_v_pca = pca.fit_transform(X_total[:, -100:])

# Rows [0, n) of the stacked matrix are train rows, the rest are test rows
X_train = np.concatenate([X_train[:, :-100], cell_v_pca[:n, :]], axis=1)
X_test = np.concatenate([X_test[:, :-100], cell_v_pca[n:, :]], axis=1)

In [None]:
# Load the scored targets and drop the control-vehicle rows with the same mask that was
# applied to the training features, so labels stay row-aligned with X_train
df_targets = (
    pd.read_csv(with_input_path("train_targets_scored.csv"))
    .loc[train["cp_type"] != 'ctl_vehicle']
    .reset_index(drop=True)
)
y_train = df_targets.drop(columns="sig_id").to_numpy()

In [None]:
# Dataset dimensions used to size the network and the prediction buffers
input_dim = X_train.shape[1]
n, num_labels = y_train.shape
n_test = X_test.shape[0]

## A first dry-run model

Neural networks support multilabel classification natively: give the output layer one sigmoid unit per label and threshold each probability independently, instead of taking the argmax over the outputs. This lets us test some preprocessing and smoothing techniques without having to convert the multilabel problem into a multiclass one.



Important observations:
 - use the log loss metric because that's the Kaggle scoring metric
 - use "label smoothing": more on that below
 - Binary Crossentropy loss at train time to get multilabel predictions out
 - clip the predictions, because very confident predictions that turn out to be wrong are penalized heavily by the log loss
 

In [None]:
# Bounds that predicted probabilities are clipped to before scoring
p_min = 0.001
p_max = 0.999


def logloss(y_true, y_pred):
    """Mean binary log loss over all entries, computed on clipped predictions.

    `y_pred` is first clipped to [p_min, p_max] so that the logarithms stay finite.
    """
    clipped = tf.clip_by_value(y_pred, p_min, p_max)
    positive_term = y_true * backend.log(clipped)
    negative_term = (1 - y_true) * backend.log(1 - clipped)
    return -backend.mean(positive_term + negative_term)


def make_model(input_dim):
    """Build and compile the feed-forward multilabel classifier.

    Architecture: batch normalization and dropout (0.2) on the inputs, followed by
    weight-normalized ReLU dense layers of 2048, 1024 and 512 units. The first two hidden
    layers are each followed by batch normalization and dropout (0.4). The output layer
    has one sigmoid unit per target label.

    Args:
        input_dim: number of input features per sample.

    Returns:
        A compiled `keras.Sequential` model that is trained with binary cross-entropy
        (label smoothing 0.001) using Adam, and reports the clipped `logloss` metric
        (exposed to callbacks as `logloss` / `val_logloss`).

    Note:
        The output width is read from the module-level `num_labels`. No callbacks are
        created here; pass any callbacks (LR scheduling, early stopping) to `model.fit`.
    """
    # (units, layer name, whether the layer is followed by batch-norm + dropout)
    hidden_spec = [
        (2048, "layer1", True),
        (1024, "layer2", True),
        (512, "layer3", False),
    ]

    model = keras.Sequential()
    model.add(layers.Input(input_dim))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(0.2))

    for units, layer_name, regularize in hidden_spec:
        model.add(tfa.layers.WeightNormalization(
            layers.Dense(units, activation="relu", name=layer_name)))
        if regularize:
            model.add(layers.BatchNormalization())
            model.add(layers.Dropout(.4))

    model.add(layers.Dense(num_labels, activation="sigmoid", name="output"))

    # `metrics` is documented as a list, so pass the custom metric inside one
    model.compile(
        optimizer=optimizers.Adam(),
        loss=losses.BinaryCrossentropy(label_smoothing=0.001),
        metrics=[logloss],
    )

    return model

In [None]:
epochs = 25
batch_size = 128

# Tensorboard callbacks; doesn't work with WeightNormalization layer....
# log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# To reduce variance, average the performance of many models trained with different
# CV splits (different seeds)
n_splits = 7
seeds = [394, 388, 2772, 105]
n_seeds = len(seeds)
n_models = n_splits * n_seeds

# Running averages for validation scores and test predictions: every fold model
# contributes 1 / n_models of its score and of its test predictions
avg_score = 0.0
test_preds = np.zeros((n_test, num_labels))

histories = []

# Drug ids and targets, filtered with the same control-row mask as the features.
# These are only consumed by the (currently disabled) drug-aware splitter mskf_cv.
df_targets = pd.read_csv(with_input_path("train_targets_scored.csv"))
df_targets = df_targets[train["cp_type"] != 'ctl_vehicle'].reset_index(drop=True)
df_id = pd.read_csv(with_input_path("train_drug.csv"))
df_id = df_id[train["cp_type"] != 'ctl_vehicle'].reset_index(drop=True)
pkg = (df_id, df_targets)

for i, seed in enumerate(seeds):
    # Alternative splitter (disabled): mskf_cv(n_splits=n_splits, seed=seed, pkg=pkg)
    cv = MultilabelStratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
    for j, (train_locs, val_locs) in enumerate(cv.split(X_train, y_train)):
        # Release the global Keras state left behind by previous fold models; without
        # this, memory use grows with every one of the n_models models that is built
        backend.clear_session()
        model = make_model(input_dim=input_dim)

        X_train_bal = X_train[train_locs]
        y_train_bal = y_train[train_locs]
        Xval = X_train[val_locs]
        yval = y_train[val_locs]

        # Shrink the learning rate 10x when the validation log loss stops improving
        reduce_lr_loss = ReduceLROnPlateau(
            monitor='val_logloss', factor=0.1, patience=5, verbose=1, min_delta=1e-4, mode='min')

        history = model.fit(x=X_train_bal,
                            y=y_train_bal,
                            epochs=epochs,
                            batch_size=batch_size,
                            validation_data=(Xval, yval),
                            callbacks=[reduce_lr_loss])
        histories.append(history)

        # Validation score of this fold; converted to a plain float so that the printout
        # and the running average are numbers rather than tensors
        y_preds = model.predict(Xval)
        fold_score = float(logloss(yval, y_preds))
        print("\t seed {}, fold {} validation score: {}".format(i, j, fold_score))
        avg_score += fold_score / n_models

        # Accumulate this model's share of the averaged test predictions
        test_preds += model.predict(X_test) / n_models

print("average validation score over {} models: {}".format(n_models, avg_score))
    

In [None]:
# Fill the sample submission with the averaged predictions, clipped to [p_min, p_max]
sub = pd.read_csv(with_input_path("sample_submission.csv"))
sub.iloc[:, 1:] = test_preds.clip(p_min, p_max)

# Control-vehicle rows have no MoA, so all of their targets are set to zero
sub.iloc[test_control_locs, 1:] = 0

sub.to_csv("submission.csv", index=False)