<center><h2>MoA | Keras Multilabel Classifier NN | New Starter </h2></center><hr>

We now have drug IDs for the training data, which we can take advantage of for cross-validation. Here I use a great validation strategy ([Drug and MultiLabel Stratification Code](https://www.kaggle.com/c/lish-moa/discussion/195195)) proposed by @cdeotte.

This kernel can be a good starter using the drug ID.

# Libraries

In [None]:
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
sys.path.append('../input/rank-gauss')
from gauss_rank_scaler import GaussRankScaler

In [None]:
import numpy as np
import pandas as pd

import os, sys
import gc
import math
import random
from tqdm import tqdm
from typing import List, NoReturn, Union, Tuple, Optional, Text, Generic, Callable, Dict
from sklearn.preprocessing import MinMaxScaler, StandardScaler, QuantileTransformer
from sklearn.decomposition import PCA
from sklearn import linear_model
import umap

# keras
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import callbacks
from tensorflow.keras import optimizers
from tensorflow.keras import models
from tensorflow.keras import losses
from tensorflow.keras import utils
from tensorflow.keras import layers
import tensorflow_addons as tfa

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import log_loss
import lightgbm as lgb
from tqdm import tqdm

# visualize
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
from matplotlib_venn import venn2
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
sns.set_context("talk")
style.use('fivethirtyeight')
pd.options.display.max_columns = None

import warnings
warnings.filterwarnings('ignore')

# Config

In [None]:
# Global hyper-parameters and switches read by the cells below.
N_STARTS = 12 # number of seeds to average over
N_SPLITS = 12 # number of cross-validation folds
SEED = 42 # base random seed
VAR_THRESHOLD = 0.6 # numeric features with variance below this value are dropped
N_COMPONENTS = [360, 40] # number of PCA components for [g-feats, c-feats]
DROPOUT = 0.24 # dropout rate applied after each dense block
POSTPROCESS = False # if True, clip predictions and zero the control rows before saving
VERBOSE = 0 # verbosity passed to Keras fit and callbacks
BATCH_SIZE = 128
EPOCHS = 160 # upper bound on epochs; early stopping may end training sooner
LR = 0.001 # initial learning rate of the Adam optimizer
NUM_NEURON = 1024 # the number of neurons in the first layer
DECAY_FACTOR = 2 # hidden layers get NUM_NEURON // DECAY_FACTOR neurons
NN_NORM = 'batch' # 'batch' or 'layer'; any other value disables normalization in create_model
NUM_HIDDEN_LAYER = 1 # the number of hidden layers built after the first layer
AF = 'mish' # name of the activation function used by the dense layers

# Debug mode: a single seed, verbose Keras output, and fit_nfold skips every fold after fold 0.
DEBUG = False
if DEBUG:
    N_STARTS = 1
    VERBOSE = 2
    print('DEBUG TRUE!!!')

# Load data

In [None]:
%%time

print('loading train, test, targets, drugs')
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
test_features = pd.read_csv('../input/lish-moa/test_features.csv')
drug = pd.read_csv('../input/lish-moa/train_drug.csv')

print('merging drug ID')
# NOTE(review): assumes train_drug.csv holds one row per sig_id, so the left
# merge neither duplicates nor reorders the training rows - confirm.
train_features = pd.merge(train_features, drug, how='left', on='sig_id')

print('no ctl')
# Boolean row masks computed on the *unfiltered* frames. `control_g` and
# `test_g` are reused later against the sample submission.
train_g = train_features['cp_type'] != 'ctl_vehicle'
control_g = test_features['cp_type'] == 'ctl_vehicle'
test_g = test_features['cp_type'] != 'ctl_vehicle'

test_features = test_features.loc[test_g, :].reset_index(drop=True)
train_features = train_features.loc[train_g, :].reset_index(drop=True)
# NOTE(review): applying `train_g` to the target frames assumes they are
# row-aligned with train_features - confirm.
train_targets = train_targets.loc[train_g, :].reset_index(drop=True)
targets = [f for f in train_targets.columns.values.tolist() if 'sig_id' not in f]

print('loading non-targets')
train_targets_non = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')
train_targets_non = train_targets_non.loc[train_g, :].reset_index(drop=True)
# numeric_only=True keeps the string `sig_id` column out of the variance
# computation; without it recent pandas versions raise a TypeError.
vari = train_targets_non.var(numeric_only=True).reset_index()
# Keep only the non-scored targets that actually vary (this also drops sig_id).
informative = set(vari.loc[vari[0] > 0, 'index'])
train_targets_non = train_targets_non[[f for f in train_targets_non.columns.values.tolist() if f in informative]]
non_targets = [f for f in train_targets_non.columns.values.tolist() if 'sig_id' not in f]

ss = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [None]:
# Stack train and test rows, then keep only the numeric columns whose
# variance reaches VAR_THRESHOLD. The identifier / cp_* columns are kept as is.
meta_cols = ['sig_id', 'drug_id', 'cp_type', 'cp_time', 'cp_dose']
data_all = pd.concat([train_features, test_features], ignore_index=True)
cols_numeric = [col for col in data_all.columns if col not in meta_cols]
numeric_part = data_all[cols_numeric]
mask = (numeric_part.var() >= VAR_THRESHOLD).values
tmp = numeric_part.loc[:, mask]
data_all = pd.concat([data_all[meta_cols], tmp], axis=1)

In [None]:
def preprocess(df):
    """Encode the categorical ``cp_*`` columns of a feature frame.

    ``cp_type`` is mapped to 0/1 in place; ``cp_dose`` and ``cp_time`` are
    mapped to small integer codes and then one-hot encoded, producing the
    columns ``cp_time_0..2`` and ``cp_dose_0..1``.

    Args:
        df: DataFrame containing ``cp_type``, ``cp_time`` and ``cp_dose``.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    df = df.copy()
    df['cp_type'] = df['cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1})
    df['cp_dose'] = df['cp_dose'].map({'D1': 0, 'D2': 1})
    df['cp_time'] = df['cp_time'].map({24: 0, 48: 1, 72: 2})
    # Pin the dummy dtype: newer pandas defaults to bool, which turns the
    # feature matrix into an object array once mixed with float columns.
    df = pd.get_dummies(df, columns=['cp_time', 'cp_dose'], dtype='uint8')
    return df

# Encode the cp_* columns of the combined (train + test) table.
data_all = preprocess(data_all)

In [None]:
# Sanity check: shape and first rows of the combined (train + test) feature table.
print(data_all.shape)
data_all.head()

In [None]:
# Shape and first rows of the scored targets after the control rows were removed.
print(train_targets.shape)
train_targets.head()

In [None]:
# Number of training rows per drug_id.
train_features['drug_id'].value_counts()

# Assign Folds
This is based on [Drug and MultiLabel Stratification Code](https://www.kaggle.com/c/lish-moa/discussion/195195) proposed by @cdeotte. Thanks a lot for this great implementation.

In [None]:
def assign_folds(train, train_targets, targets, seed=SEED, n_splits=N_SPLITS):
    """Assign every training row to a cross-validation fold.

    Drugs that occur 18 times or fewer are stratified as whole units (on the
    mean of their target rows), so all rows of such a drug share one fold.
    Rows of drugs that occur more than 18 times are stratified individually.

    Args:
        train: DataFrame with ``sig_id`` and ``drug_id`` columns.
        train_targets: DataFrame with ``sig_id`` and the target columns.
        targets: list of target column names used for stratification.
        seed: random state handed to both stratifiers.
        n_splits: number of folds; defaults to the module-level ``N_SPLITS``.

    Returns:
        int8 numpy array of fold indices, one per row of the merged target
        frame (same order as ``train_targets`` when ``sig_id`` is unique in
        ``train``).
    """
    # LOCATE DRUGS
    scored = train_targets.copy()
    scored = pd.merge(scored, train[['sig_id', 'drug_id']], how='left', on='sig_id')
    vc = scored['drug_id'].value_counts()
    vc1 = vc.loc[vc <= 18].index.sort_values()
    vc2 = vc.loc[vc > 18].index.sort_values()

    # STRATIFY DRUGS 18X OR LESS (one fold per drug)
    dct1 = {}
    dct2 = {}
    skf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    tmp = scored.groupby('drug_id')[targets].mean().loc[vc1]
    for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp[targets])):
        dct1.update({k: fold for k in tmp.index[idxV].values})

    # STRATIFY DRUGS MORE THAN 18X (one fold per row)
    skf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    tmp = scored.loc[scored.drug_id.isin(vc2)].reset_index(drop=True)
    for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp[targets])):
        dct2.update({k: fold for k in tmp.sig_id[idxV].values})

    # ASSIGN FOLDS: drug-level mapping first, row-level mapping for the rest
    scored['fold'] = scored['drug_id'].map(dct1)
    scored.loc[scored.fold.isna(), 'fold'] =\
        scored.loc[scored.fold.isna(), 'sig_id'].map(dct2)
    scored.fold = scored.fold.astype('int8')

    return scored['fold'].values

# example: build the folds once with the base seed and show how many rows land in each fold
folds = assign_folds(train_features, train_targets, train_targets.columns.values[1:].tolist(), SEED)
pd.DataFrame(folds).value_counts()

In [None]:
# fold examples: row positions of the training / validation parts for fold n
n = 0
tr = np.where(folds != n)[0]
te = np.where(folds == n)[0]
print(tr)

In [None]:
# Row positions held out for validation in the example fold.
print(te)

# Group targets
Just manually for now...

In [None]:
def target_category_maker(targets):
    """Bucket target names into coarse groups by substring.

    Each name lands in the first group whose keyword list matches it, in the
    order agonist -> antagonist -> agent; names matching nothing go to
    ``'others'``.

    Args:
        targets: iterable of target column names.

    Returns:
        dict with keys ``'agonist'``, ``'antagonist'``, ``'agent'`` and
        ``'others'``, each holding the matching names in input order.
    """
    keyword_groups = (
        ('agonist', ('_agonist', '_activator', '_stimulant', '_secretagogue', '_sensitizer')),
        ('antagonist', ('_antagonist', '_inhibitor', '_blocker')),
        ('agent', ('_agent', '_medium')),
    )
    targets_category = {'agonist': [], 'antagonist': [], 'agent': [], 'others': []}
    for name in targets:
        bucket = next(
            (label for label, keywords in keyword_groups
             if any(kw in name for kw in keywords)),
            'others',
        )
        targets_category[bucket].append(name)
    return targets_category

# Group the scored and the non-scored target names separately.
targets_category = target_category_maker(targets)
non_targets_category = target_category_maker(non_targets)

In [None]:
# positive label ratio (scored): per group, list every target with its
# positive count and the share of training rows that are positive
rule = '----------------------------'
n_rows = train_targets.shape[0]
for category, members in targets_category.items():
    print(rule)
    print('{} ({:,} features)'.format(category, len(members)))
    print(rule)
    for target in members:
        positives = train_targets[target].sum()
        print('{}: {:,} ({:.3f} %) positive.'.format(target, positives, 100 * positives / n_rows))

In [None]:
# positive label ratio (non-scored): same report as for the scored targets
rule = '----------------------------'
n_rows = train_targets_non.shape[0]
for category, members in non_targets_category.items():
    print(rule)
    print('{} ({:,} features)'.format(category, len(members)))
    print(rule)
    for target in members:
        positives = train_targets_non[target].sum()
        print('{}: {:,} ({:.3f} %) positive.'.format(target, positives, 100 * positives / n_rows))

# Feature engineering (agg, pca)

In [None]:
# categorize feats by column-name prefix
all_columns = data_all.columns.values.tolist()
g_feats = [col for col in all_columns if col.startswith('g-')]
c_feats = [col for col in all_columns if col.startswith('c-')]
cp_feats = [col for col in all_columns if col.startswith('cp_')]
print(len(g_feats), len(c_feats), len(cp_feats))

In [None]:
%%time

# agg features
def add_stats_feats(df, feat_list, n):
    """Append row-wise summary statistics of ``feat_list`` to ``df``.

    Adds the columns ``{n}stats-mean``, ``-std``, ``-skew``, ``-kurt`` and
    ``-mad`` (mean absolute deviation around the row mean).

    Args:
        df: DataFrame holding the columns in ``feat_list``; modified in place.
        feat_list: column names the statistics are computed over.
        n: prefix for the new column names.

    Returns:
        The same ``df`` object with the five columns added.
    """
    # by row; select the feature block once (a copy, so it is unaffected by
    # the columns added below)
    block = df[feat_list]
    row_mean = block.mean(axis=1)
    df[f'{n}stats-mean'] = row_mean
    df[f'{n}stats-std'] = block.std(axis=1)
    df[f'{n}stats-skew'] = block.skew(axis=1)
    df[f'{n}stats-kurt'] = block.kurt(axis=1)
    # DataFrame.mad was removed in pandas 2.0; compute it explicitly.
    df[f'{n}stats-mad'] = block.sub(row_mean, axis=0).abs().mean(axis=1)

    return df
    
# Row-wise statistics over the g- features, the c- features, and both together.
data_all = add_stats_feats(data_all, g_feats, 'g')
data_all = add_stats_feats(data_all, c_feats, 'c')
data_all = add_stats_feats(data_all, g_feats+c_feats, 'gc')

In [None]:
%%time

# PCA features: standardise the g- and c- columns first
gc_cols = g_feats + c_feats
scaler = StandardScaler()
data_all[gc_cols] = scaler.fit_transform(data_all[gc_cols])

# dimensionality reduction
def dim_reducer(data_all, feats, n_components=100):
    """Fit a PCA on ``data_all[feats]`` and return the projected rows.

    Args:
        data_all: DataFrame holding the feature columns.
        feats: column names fed to the PCA.
        n_components: number of principal components to keep.

    Returns:
        Array with one row per row of ``data_all`` and ``n_components`` columns.
    """
    reducer = PCA(n_components=n_components, random_state=SEED)
    return reducer.fit_transform(data_all[feats].values)

# Project the g- and c- features separately and append every component as a
# new column (numbered from 1).
train_g = dim_reducer(data_all, g_feats, n_components=N_COMPONENTS[0])
train_c = dim_reducer(data_all, c_feats, n_components=N_COMPONENTS[1])

for idx, component in enumerate(train_g.T, start=1):
    data_all[f'g-pca{idx}'] = component
for idx, component in enumerate(train_c.T, start=1):
    data_all[f'c-pca{idx}'] = component

# Scaling features for NN

In [None]:
# Every column except the identifiers and cp_type becomes a model input.
drops = ['sig_id', 'cp_type', 'drug_id']
feats = [col for col in data_all.columns.values.tolist() if col not in drops]
print('{:,} features'.format(len(feats)))
print(feats)

In [None]:
%%time

# combine: every feature whose name does not contain 'cp_' gets transformed
t_feats = [name for name in feats if 'cp_' not in name]

# rank gauss transform
pt = GaussRankScaler()
ranked = pt.fit_transform(data_all[t_feats])
data_all[t_feats] = ranked

# final scaling
scaler = StandardScaler()
standardised = scaler.fit_transform(data_all[t_feats])
data_all[t_feats] = standardised

In [None]:
# The first len(train_features) rows of the combined table are the training
# rows; the remainder are the test rows.
n_train = len(train_features)
train = data_all.iloc[:n_train]
test = data_all.iloc[n_train:]

del data_all
gc.collect()

In [None]:
# Shape and first rows of the processed training features.
print(train.shape)
train.head(3)

In [None]:
# Shape and first rows of the processed test features.
print(test.shape)
test.head(3)

# NN

In [None]:
def seed_everything(seed: int) -> None:
    """Seed Python's ``random``, NumPy, ``PYTHONHASHSEED`` and TensorFlow.

    Args:
        seed: value used for every random number generator touched here.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
seed_everything(SEED)

In [None]:
from tensorflow.keras.layers import Activation
from tensorflow.keras.utils import get_custom_objects

# mish
class Mish(Activation):
    '''
    Keras ``Activation`` layer wrapper used for the Mish function.
    .. math::
        mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + e^{x}))
    The activation callable is supplied by the caller; this class only
    forwards it to ``Activation`` and labels the instance as 'Mish'.
    Shape:
        - Input: Arbitrary. Use the keyword argument `input_shape`
        (tuple of integers, does not include the samples axis)
        when using this layer as the first layer in a model.
        - Output: Same shape as the input.
    '''

    def __init__(self, activation, **kwargs):
        super().__init__(activation, **kwargs)
        # presumably lets Keras treat the instance like a named function - TODO confirm
        self.__name__ = 'Mish'

def mish(inputs):
    """Return ``inputs * tanh(softplus(inputs))``."""
    gate = tf.math.tanh(tf.math.softplus(inputs))
    return inputs * gate

# Register under the key 'mish' so layers can request it via activation='mish'.
get_custom_objects()['mish'] = Mish(mish)

In [None]:
# Clipping bounds for predicted probabilities (also used by the post-processing step).
p_min = 0.001
p_max = 0.999

def logloss(y_true, y_pred):
    """Mean binary log loss with predictions clipped to ``[p_min, p_max]``.

    Args:
        y_true: tensor of 0/1 labels.
        y_pred: tensor of predicted probabilities, same shape as ``y_true``.

    Returns:
        Scalar tensor: the negative mean log-likelihood over all elements.
    """
    # The casts must be assigned; the original discarded their results, so
    # integer labels were never actually converted to float32.
    y_true = tf.dtypes.cast(y_true, tf.float32)
    y_pred = tf.dtypes.cast(y_pred, tf.float32)
    y_pred = tf.clip_by_value(y_pred, p_min, p_max)
    return -K.mean(y_true * K.log(y_pred) + (1 - y_true) * K.log(1 - y_pred))

def _dense_block(x, units):
    """Dense(units, AF) -> optional normalization (NN_NORM) -> Dropout(DROPOUT)."""
    x = layers.Dense(units, activation=AF)(x)
    if NN_NORM == 'layer':
        x = layers.LayerNormalization()(x)
    elif NN_NORM == 'batch':
        x = layers.BatchNormalization()(x)
    return layers.Dropout(DROPOUT)(x)


def create_model(num_feats, num_targets=206):
    """Build and compile a simple MLP multilabel classifier.

    The network is one dense block of ``NUM_NEURON`` units followed by
    ``NUM_HIDDEN_LAYER`` dense blocks of ``NUM_NEURON // DECAY_FACTOR`` units
    and a sigmoid output layer.

    Args:
        num_feats: number of input features.
        num_targets: number of output labels (defaults to 206, the value
            previously hard-coded).

    Returns:
        A compiled ``tf.keras`` model using label-smoothed binary
        cross-entropy as loss and ``logloss`` as metric.
    """
    # input, first layer
    inp = layers.Input(shape=(num_feats,), name="inp")
    x = _dense_block(inp, NUM_NEURON)

    # second or later layers: chain on `x`. The original fed `inp` here,
    # which disconnected the first block from the output and made every
    # hidden block overwrite the previous one.
    for _ in range(NUM_HIDDEN_LAYER):
        x = _dense_block(x, NUM_NEURON // DECAY_FACTOR)
    preds = layers.Dense(num_targets, activation='sigmoid')(x)

    model = models.Model(inp, preds)

    # `learning_rate` replaces the deprecated `lr` keyword.
    opt = optimizers.Adam(learning_rate=LR)
    loss = tf.keras.losses.BinaryCrossentropy(label_smoothing=0.001)
    model.compile(loss=loss, optimizer=opt, metrics=[logloss])
    return model

In [None]:
# Build one model just to inspect its architecture.
model = create_model(len(feats))
model.summary()

In [None]:
# Take plot_model from tensorflow.keras, the same package the model is built
# with, instead of the standalone `keras` package.
from tensorflow.keras.utils import plot_model
plot_model(model)

# Fit

In [None]:
def metric(y_true, y_pred, targets):
    """Mean of the per-column log losses over ``targets``.

    Args:
        y_true: DataFrame of 0/1 labels, one column per target.
        y_pred: DataFrame of predicted probabilities with the same columns.
        targets: column names to score.

    Returns:
        The average of the column-wise ``log_loss`` values.
    """
    per_target = [
        log_loss(y_true.loc[:, col], y_pred.loc[:, col].astype(float), labels=[0,1])
        for col in targets
    ]
    return np.mean(per_target)

In [None]:
def fit_nfold(train, test, ss, train_targets, feats, targets, seed=SEED, n_splits=N_SPLITS):
    """Train one model per fold and collect out-of-fold and test predictions.

    Args:
        train: processed training features (must contain ``sig_id``,
            ``drug_id`` and the columns in ``feats``).
        test: processed test features containing the columns in ``feats``.
        ss: submission frame; its target columns are zeroed and then filled
            in place with the fold-averaged test predictions.
        train_targets: frame with ``sig_id`` and the target columns.
        feats: names of the model input columns.
        targets: names of the target columns.
        seed: seed handed to ``assign_folds``; also part of the checkpoint name.
        n_splits: number of folds to iterate over.

    Returns:
        Tuple ``(ss, res, historys)``: the filled submission frame, the
        out-of-fold predictions, and a dict of Keras histories keyed
        ``'history_<fold>'``.

    NOTE(review): ``assign_folds`` is called without a fold count, so the
    folds come from the module-level ``N_SPLITS``; passing a different
    ``n_splits`` here would leave folds unvisited or empty.
    NOTE(review): reads the module-level ``test_g`` mask and ``DEBUG`` flag.
    """
    res = train_targets.copy()
    ss.loc[:, targets] = 0
    res.loc[:, targets] = 0

    folds = assign_folds(train, train_targets, targets, seed=seed)
    historys = dict()

    for n in range(n_splits):
        # train test split
        tr = np.where(folds != n)[0]
        te = np.where(folds == n)[0]
        
        print(f"======{train_targets.values[tr].shape}========{train_targets.values[te].shape}=====")
        print(f'Seed: {seed} => Fold: {n}')

        # In DEBUG mode only fold 0 is trained; skipped folds leave their
        # OOF rows at 0 and contribute nothing to the test average.
        if DEBUG:
            if n > 0:
                print(f'Skip fold{n}')
                continue

        # NN model
        model = create_model(len(feats))

        # callbacks: LR decay on plateau, best-weights checkpoint, early stopping
        checkpoint_path = f'repeat{seed}_fold{n}.hdf5'
        reduce_lr_loss = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, min_lr=1e-5, patience=4, verbose=VERBOSE, mode='min')
        cb_checkpt = callbacks.ModelCheckpoint(checkpoint_path, monitor = 'val_loss', verbose = 2, save_best_only = True,
                                     save_weights_only = True, mode = 'min')
        early = callbacks.EarlyStopping(monitor="val_loss", mode="min", restore_best_weights=True, patience=8, verbose = VERBOSE)
        nn_callbacks = [reduce_lr_loss, cb_checkpt, early]
        
        # nn datasets
        x_train = train[feats].values[tr]
        x_val = train[feats].values[te]
        x_test = test[feats].values
        y_train = train_targets[targets].values[tr]
        y_val = train_targets[targets].values[te]
        
        # fit
        history = model.fit(x_train, y_train, 
                  validation_data=(x_val, y_val),
                  epochs=EPOCHS, batch_size=BATCH_SIZE,
                  callbacks=nn_callbacks, verbose=VERBOSE
                 )
        historys[f'history_{n}'] = history

        # predict with the weights saved by the checkpoint callback
        model.load_weights(checkpoint_path)
        test_predict = model.predict(x_test)
        val_predict = model.predict(x_val)
        
        # assign: OOF rows get this fold's predictions; test predictions are
        # averaged over folds and written only to the rows selected by test_g
        res.loc[te, targets] = val_predict
        ss.loc[test_g, targets] += test_predict / n_splits

        print(f'OOF Metric For SEED {seed} => FOLD {n} : {metric(train_targets.loc[te, targets], pd.DataFrame(val_predict, columns=targets), targets)}')
        print('+-' * 10)

    # average predictions
    print(f'OOF Metric: {metric(train_targets[targets], res[targets], targets)}')
    return ss, res, historys

In [None]:
# Accumulators for the seed-averaged out-of-fold and test predictions.
res = train_targets.copy()
res.loc[:, targets] = 0
ss.loc[:, targets] = 0

# seed average
for s in range(N_STARTS):
    # kfold on a scratch copy of the submission frame, with a per-run seed
    run_seed = SEED + s**2
    ss_tmp = ss.copy()
    ss_, res_, historys = fit_nfold(
        train, test, ss_tmp, train_targets, feats, targets,
        seed=run_seed, n_splits=N_SPLITS,
    )

    # add this run's share to the running averages
    res.loc[:, targets] += res_[targets].values / N_STARTS
    ss.loc[:, targets] += ss_[targets].values / N_STARTS

# Predict

In [None]:
# Plot training & validation loss values
def plot_history(history):
    """Draw the 'loss' and 'val_loss' curves stored in a Keras history object."""
    for key in ('loss', 'val_loss'):
        plt.plot(history.history[key])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Val'], loc='upper right', frameon=False)
    plt.show()

# Fold 0 of the most recent seed run.
plot_history(historys['history_0'])

In [None]:
# Score the seed-averaged out-of-fold predictions.
print(f'OOF Metric: {metric(train_targets, res, targets)}')

if POSTPROCESS:
    print('post-process...')

    # clip every prediction column (all columns after sig_id) on the numeric
    # block, so the result stays float instead of passing through an object array
    ss.iloc[:, 1:] = ss.iloc[:, 1:].clip(lower=p_min, upper=p_max).values

    # Set ctl_vehicle to 0. iloc rejects a boolean Series as a mask, so the
    # underlying boolean array is passed instead.
    ss.iloc[control_g.values, 1:] = 0

ss.to_csv('submission.csv', index=False)

print(ss.shape)
ss.head()