In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Let us first understand what MoA (Mechanism of Action) is.  
The term mechanism of action means the biochemical interactions through which a drug generates its pharmacological effect.  

In this notebook we are going to train a model that classifies a drug based on its biological activity.  
The dataset consists of different features of gene expression data, cell viability data as well as multiple targets of mechanism of action (MoA).  
This is a multilabel classification problem which means we have multiple targets (not multiple classes).  
  
We will first perform EDA, and then train a model using deep neural networks with Keras followed by Model Evaluation in the end.

# Import Dependencies

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
from time import time
import datetime
import gc

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

# `sns.set()` resets every theme parameter (its default style is 'darkgrid'), so the
# style must be passed in the same call; a separate, earlier `sns.set_style('white')`
# would be silently overridden.
sns.set(style='white', font_scale=1.2)

In [None]:
import tensorflow as tf
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from tensorflow.keras import layers
#from tensorflow.keras.backend as K

from sklearn.metrics import log_loss
from tensorflow_addons.layers import WeightNormalization

In [None]:
np.random.seed(42)
tf.random.set_seed(42)

# EDA
* `g-` is gene expression data
* `c-` is cell viability data
* `cp_type` indicates whether the sample was treated with a compound (`trt_cp`) or with a control perturbation (`ctl_vehicle`); control perturbations have no MoAs
* `cp_time` and `cp_dose` indicate time duration and dose

In [None]:
def _load_and_preview(path, size_template):
    """Read one CSV, print its shape through `size_template`, and display its first 3 rows."""
    frame = pd.read_csv(path)
    print(size_template.format(frame.shape))
    display(frame.head(3))
    return frame


df_train = _load_and_preview('/kaggle/input/lish-moa/train_features.csv',
                             'Train data size: {}')

df_target_ns = _load_and_preview('/kaggle/input/lish-moa/train_targets_nonscored.csv',
                                 'Train target nonscored size: {}')

df_target_s = _load_and_preview('/kaggle/input/lish-moa/train_targets_scored.csv',
                                'Train target scored size: {}')

df_test = _load_and_preview('/kaggle/input/lish-moa/test_features.csv',
                            'Test data size: {}')

df_sample = _load_and_preview('/kaggle/input/lish-moa/sample_submission.csv',
                              'Sample submission size: {}')

## Missing Values

In [None]:
print(df_train.isnull().sum().any())

In [None]:
df_train.info()

This shows that there are 872 floats, 1 integer, and 3 objects. Let's see them.

In [None]:
display(df_train.select_dtypes('int64').head(3))
display(df_train.select_dtypes('object').head(3))

Let's visualize these features.

## Gene Expression Features

In [None]:
g_features = [cols for cols in df_train.columns if cols.startswith('g-')]

In [None]:
# Palette shared by the gene-expression and cell-viability density plots.
color = ['dimgray', 'navy', 'purple', 'orangered', 'red', 'green', 'mediumorchid', 'khaki', 'salmon', 'blue', 'cornflowerblue', 'mediumseagreen']

n_row = 6
n_col = 3

plt.rcParams['legend.loc'] = 'upper right'
fig = plt.figure(figsize=(8,14))
plt.subplots_adjust(left=-0.3, right=1.3,bottom=-0.3,top=1.3)

# Density of the first six gene-expression features, one palette colour each.
for n_sub, (feature, feature_color) in enumerate(zip(g_features[:6], color), start=1):
    values = df_train[feature]
    plt.subplot(n_row, n_col, n_sub)
    # The label must be a plain string: a list would be rendered in the legend
    # as its repr, brackets and quotes included.
    sns.kdeplot(values, color=feature_color, shade=True,
                label='mean:{:.2f}  std: {:.2f}'.format(values.mean(), values.std()))

    plt.xlabel(feature)
    plt.legend()
plt.show()

# Index of the next unused palette colour, kept for cells that continue the palette.
color_ind = 6

## Cell Viability Features

In [None]:
c_features = [cols for cols in df_train.columns if cols.startswith('c-')]

In [None]:
n_row = 6
n_col = 3
fig = plt.figure(figsize=(8,14))
plt.subplots_adjust(left=-0.3, right=1.3,bottom=-0.3,top=1.3)
plt.rcParams["legend.loc"] = 'upper left'

# Density of the first six cell-viability features. The colours are taken from
# fixed palette positions 6-11 instead of a counter carried over from an earlier
# cell, so re-running this cell can neither shift the colours nor run past the
# end of the palette.
for n_sub, (feature, feature_color) in enumerate(zip(c_features[:6], color[6:12]), start=1):
    values = df_train[feature]
    plt.subplot(n_row, n_col, n_sub)
    # The label must be a plain string: a list would be rendered in the legend
    # as its repr, brackets and quotes included.
    sns.kdeplot(values, color=feature_color, shade=True,
                label='mean:{:.2f}  std: {:.2f}'.format(values.mean(), values.std()))

    plt.xlabel(feature)
    plt.legend()
plt.show()

### cp_time and cp_dose

`cp_time` and `cp_dose` indicate treatment duration (24, 48, 72 hours) and dose (high or low which are D1 and D2)

In [None]:
fig = plt.figure(figsize=(10,4))
plt.subplots_adjust(right=1.3)

# One count plot per treatment attribute, side by side.
# The column is passed through the `x=` keyword: recent seaborn releases treat the
# first positional argument as `data`, so a positional Series no longer means `x`.
for position, column in enumerate(('cp_time', 'cp_dose'), start=1):
    plt.subplot(1, 2, position)
    sns.countplot(x=df_train[column], palette='nipy_spectral')
plt.show()

## Relationship between features and target
Let's now see the relationship of feature and target with respect to dosage and time.  
Here we will do this for one label `target_71` and two random features `g-3` and `c-3`.

In [None]:
# Plot-only copy of the training features, extended with the label taken from
# positional column 72 of the scored-target table.
train_copy = df_train.assign(target_71=df_target_s.iloc[:, 72])

fig = plt.figure(figsize=(16, 8))
plt.subplots_adjust(right=1.1, top=1.1)

ax1, ax2 = fig.add_subplot(121), fig.add_subplot(122)

# Same feature on the y axis; treatment time on the left, dose on the right.
for axis, x_column in ((ax1, 'cp_time'), (ax2, 'cp_dose')):
    sns.stripplot(data=train_copy, x=x_column, y='g-3', color='red', hue='target_71', ax=axis)

plt.show()

In [None]:
fig = plt.figure(figsize=(16, 8))
plt.subplots_adjust(right=1.1, top=1.1)

ax1, ax2 = fig.add_subplot(121), fig.add_subplot(122)

# Cell-viability feature `c-3` against treatment time (left) and dose (right).
for axis, x_column in ((ax1, 'cp_time'), (ax2, 'cp_dose')):
    sns.stripplot(data=train_copy, x=x_column, y='c-3', color='yellow', hue='target_71', ax=axis)

plt.show()

Now let's do the same for mean of `g` and `c` features. i.e., plotting the mean of g and c features with respect to target, dosage and time.

In [None]:
# Row-wise mean over every gene-expression column.
train_copy['g_mean'] = train_copy[g_features].mean(axis=1)

fig = plt.figure(figsize=(16, 10))
plt.subplots_adjust(right=1.1, top=1.1)

ax1, ax2 = fig.add_subplot(121), fig.add_subplot(122)

# Mean gene expression against treatment time (left) and dose (right).
for axis, x_column in ((ax1, 'cp_time'), (ax2, 'cp_dose')):
    sns.stripplot(data=train_copy, x=x_column, y='g_mean', color='red', hue='target_71', ax=axis)

plt.show()

In [None]:
# Row-wise mean over every cell-viability column.
train_copy['c_mean'] = train_copy[c_features].mean(axis=1)

fig = plt.figure(figsize=(16, 10))
plt.subplots_adjust(right=1.1, top=1.1)

ax1, ax2 = fig.add_subplot(121), fig.add_subplot(122)

# Mean cell viability against treatment time (left) and dose (right).
for axis, x_column in ((ax1, 'cp_time'), (ax2, 'cp_dose')):
    sns.stripplot(data=train_copy, x=x_column, y='c_mean', color='yellow', hue='target_71', ax=axis)

plt.show()

## Targets
### Scored Targets

In [None]:
# Scored targets without the identifier column (a new frame; df_target_s is untouched).
target_s_copy = df_target_s.drop('sig_id', axis=1)

n_row = 20
n_col = 4

fig = plt.figure(figsize=(20,50))
plt.subplots_adjust(left=-0.3, right=1.3,bottom=-0.3,top=1.3)

# Draw `n_row` target columns at random (with replacement) and show each one's
# value counts in consecutive cells of the subplot grid.
sampled_positions = np.random.choice(np.arange(0,target_s_copy.shape[1],1),n_row)
for n_sub, column_pos in enumerate(sampled_positions, start=1):
    plt.subplot(n_row, n_col, n_sub)
    sns.countplot(y=target_s_copy.iloc[:, column_pos],palette='nipy_spectral',orient='h')

    plt.legend()

plt.show()

Let's take a look at the 20 scored targets with the largest number of positive labels.

In [None]:
plt.figure(figsize=(10,10))

# Column sums are the positive-label counts; keep the 20 largest, in ascending order.
positives_per_target = target_s_copy.sum().sort_values()
positives_per_target.tail(20).plot(kind='barh', color='mediumseagreen')
plt.show()

### Non-scored targets

In [None]:
# Non-scored targets without the identifier column (a new frame; df_target_ns is untouched).
target_ns_copy = df_target_ns.drop('sig_id', axis=1)
n_row = 20
n_col = 4
fig = plt.figure(figsize=(20,50))
plt.subplots_adjust(left=-0.3, right=1.3,bottom=-0.3,top=1.3)

# Draw `n_row` target columns at random (with replacement) and show each one's
# value counts in consecutive cells of the subplot grid.
sampled_positions = np.random.choice(np.arange(0,target_ns_copy.shape[1],1),n_row)
for n_sub, column_pos in enumerate(sampled_positions, start=1):
    plt.subplot(n_row, n_col, n_sub)
    sns.countplot(y=target_ns_copy.iloc[:, column_pos],palette='magma',orient='h')

    plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(10,10))

# Column sums are the positive-label counts; keep the 20 largest, in ascending order.
positives_per_target = target_ns_copy.sum().sort_values()
positives_per_target.tail(20).plot(kind='barh', color='purple')
plt.show()

# Preprocessing and Feature Engineering
* The control group (`ctl_vehicle`) does not produce the desired effect (it has no MoAs), so all of its target labels are zero. We will therefore drop those rows from the training data, and we also keep track of the control-group indexes.

In [None]:
ind_tr = df_train[df_train['cp_type']=='ctl_vehicle'].index

ind_te = df_test[df_test['cp_type']=='ctl_vehicle'].index

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer

transformer = QuantileTransformer(n_quantiles=100, random_state=42, output_distribution='normal')

def preprocess(df):
    """Encode the treatment columns and quantile-transform every `g-`/`c-` column.

    `cp_time` is mapped 24/48/72 -> 1/2/3 and `cp_dose` D1/D2 -> 0/1. Each column
    whose name starts with `g-` or `c-` is then passed through the module-level
    `transformer`, which is re-fitted on that single column before transforming it.

    The frame is modified in place and the same object is returned.

    NOTE(review): because the mappings are applied in place, calling this twice on
    the same frame maps the already-encoded values to NaN. The transformer is also
    fitted separately for each frame passed in -- confirm both are intended.
    """
    time_codes = {24:1, 48:2, 72:3}
    dose_codes = {'D1':0, 'D2':1}
    df['cp_time'] = df['cp_time'].map(time_codes)
    df['cp_dose'] = df['cp_dose'].map(dose_codes)

    gene_cols = [name for name in df.columns if name.startswith('g-')]
    cell_cols = [name for name in df.columns if name.startswith('c-')]

    for name in gene_cols + cell_cols:
        # scikit-learn expects a 2-D (n_samples, 1) array; flatten again afterwards.
        column_2d = df[name].values.reshape(-1, 1)
        df[name] = transformer.fit(column_2d).transform(column_2d).ravel()

    return df

In [None]:
X = preprocess(df_train)
X_test = preprocess(df_test)

display(X.head(5))
print('Train data size', X.shape)

display(X_test.head(5))
print('Test data size', X_test.shape)

y = df_target_s.drop('sig_id', axis=1)
display(y.head(3))
print('Target size', y.shape)

y0 = df_target_ns.drop('sig_id', axis=1)

This part of code is taken from kernel: [Rankgauss scaler and PCA](https://www.kaggle.com/kushal1506/moa-pytorch-0-01859-rankgauss-pca-nn?scriptVersionId=44558776)

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Fraction of variance each PCA has to retain.
n_comp = 0.95


def _pca_features(train, test, columns, prefix, n_components=n_comp, seed=42):
    """Fit one PCA on the stacked train+test `columns` and return both projections.

    Returns a `(train_pca, test_pca)` pair of frames with a fresh default index and
    columns named `<prefix>0`, `<prefix>1`, ...
    """
    stacked = pd.concat([train[columns], test[columns]])
    projected = PCA(n_components, random_state=seed).fit_transform(stacked)
    names = [f'{prefix}{i}' for i in range(projected.shape[1])]
    n_train = train.shape[0]
    # Split by the train length (not a negative slice) so an empty test frame
    # does not end up receiving every row.
    train_pca = pd.DataFrame(projected[:n_train], columns=names)
    test_pca = pd.DataFrame(projected[n_train:], columns=names)
    return train_pca, test_pca


# Append the principal components of the gene-expression columns.
g_features = [cols for cols in X.columns if cols.startswith('g-')]
train_pca, test_pca = _pca_features(X, X_test, g_features, 'pca_g-')
X = pd.concat((X, train_pca), axis=1)
X_test = pd.concat((X_test, test_pca), axis=1)

# Append the principal components of the cell-viability columns.
c_features = [cols for cols in X.columns if cols.startswith('c-')]
train_pca, test_pca = _pca_features(X, X_test, c_features, 'pca_c-')
X = pd.concat((X, train_pca), axis=1)
X_test = pd.concat((X_test, test_pca), axis=1)

# Drop low-variance columns, judged on train and test together. The first four
# columns (sig_id, cp_type, cp_time, cp_dose) are excluded from the filter.
# `pd.concat` replaces `DataFrame.append`, which pandas 2.0 removed.
var_thresh = VarianceThreshold(0.8)
data = pd.concat([X, X_test])
data_transformed = var_thresh.fit_transform(data.iloc[:, 4:])

train_features_transformed = data_transformed[ : X.shape[0]]
test_features_transformed = data_transformed[X.shape[0] : ]

# Rebuild both frames as the four leading columns plus the surviving features.
# The surviving features come back as a bare array, so their columns are named
# by integer position from here on.
id_cols = ['sig_id','cp_type','cp_time','cp_dose']

X = pd.DataFrame(X[id_cols].values.reshape(-1, 4), columns=id_cols)
X = pd.concat([X, pd.DataFrame(train_features_transformed)], axis=1)

X_test = pd.DataFrame(X_test[id_cols].values.reshape(-1, 4), columns=id_cols)
X_test = pd.concat([X_test, pd.DataFrame(test_features_transformed)], axis=1)

display(X.head(2))
print(X.shape)

display(X_test.head(2))
print(X_test.shape)

In [None]:
from sklearn.cluster import KMeans

def fe_cluster(train, test, n_clusters_g=35, n_clusters_c=5, SEED=239):
    """Add one-hot K-means cluster memberships to `train` and `test`.

    Two clusterings are fitted, each on train and test stacked together: one over
    the columns at positions 4..775 (kind 'g') and one over positions 776..875
    (kind 'c'). Each contributes indicator columns named `clusters_<kind>_<id>`.

    Parameters
    ----------
    train, test : DataFrame
        Frames with identical column layout. They are not modified.
    n_clusters_g, n_clusters_c : int
        Number of clusters for the 'g' and the 'c' clustering.
    SEED : int
        `random_state` handed to `KMeans`.

    Returns
    -------
    (train, test) : new frames with the indicator columns appended. Both always
    receive one column per cluster id, even when a cluster has no member in one
    of the two frames, so their column sets stay aligned.

    NOTE(review): the two groups are selected purely by column position --
    confirm that positions 4..775 / 776..875 still hold the intended features
    at the point where this is called.
    """
    features_g = list(train.columns[4:776])
    features_c = list(train.columns[776:876])

    def create_cluster(train, test, features, kind='g', n_clusters=n_clusters_g):
        """Cluster `features` over train+test and one-hot encode the labels."""
        stacked = pd.concat([train[features], test[features]], axis=0)
        kmeans = KMeans(n_clusters=n_clusters, random_state=SEED).fit(stacked)

        column = f'clusters_{kind}'
        n_train = train.shape[0]
        cluster_ids = list(range(n_clusters))

        # A categorical with the full id range makes get_dummies emit every
        # cluster column for both frames; `assign` keeps the caller's frames intact.
        train = train.assign(
            **{column: pd.Categorical(kmeans.labels_[:n_train], categories=cluster_ids)})
        test = test.assign(
            **{column: pd.Categorical(kmeans.labels_[n_train:], categories=cluster_ids)})

        train = pd.get_dummies(train, columns=[column])
        test = pd.get_dummies(test, columns=[column])

        return train, test

    train, test = create_cluster(train, test, features_g, kind='g', n_clusters=n_clusters_g)
    train, test = create_cluster(train, test, features_c, kind='c', n_clusters=n_clusters_c)

    return train, test

In [None]:
X, X_test = fe_cluster(X, X_test)

display(X.head(2))
print(X.shape)

display(X_test.head(2))
print(X_test.shape)

In [None]:
def fe_stats(train, test):
    """Append row-wise summary statistics to both `train` and `test`.

    Three column groups are summarised: the columns at positions 4..775 of
    `train` (prefix `g`), positions 776..875 (prefix `c`), and both together
    (prefix `gc`). For each group the sum, mean, standard deviation, kurtosis
    and skewness of every row are added as `<prefix>_sum`, `<prefix>_mean`,
    `<prefix>_std`, `<prefix>_kurt` and `<prefix>_skew`.

    Both frames are modified in place and returned as `(train, test)`.

    NOTE(review): the groups are selected purely by column position in `train`
    -- confirm those positions hold the intended features at the call site.
    """
    features_g = list(train.columns)[4:776]
    features_c = list(train.columns)[776:876]

    groups = (
        ('g', features_g),
        ('c', features_c),
        ('gc', features_g + features_c),
    )

    for df in (train, test):
        for prefix, cols in groups:
            values = df[cols]
            df[f'{prefix}_sum'] = values.sum(axis=1)
            df[f'{prefix}_mean'] = values.mean(axis=1)
            df[f'{prefix}_std'] = values.std(axis=1)
            df[f'{prefix}_kurt'] = values.kurtosis(axis=1)
            df[f'{prefix}_skew'] = values.skew(axis=1)

    # Return only after BOTH frames are processed; returning from inside the
    # loop would leave `test` without any of the statistic columns.
    return train, test

In [None]:
X, X_test = fe_stats(X, X_test)

display(X.head(2))
print(X.shape)

display(X_test.head(2))
print(X_test.shape)

In [None]:
# A single treated-sample mask, taken before X itself is filtered, drives all
# three selections so the targets stay row-aligned with the features.
treated = X['cp_type'] == 'trt_cp'

y0 = y0[treated].reset_index(drop = True)
y = y[treated].reset_index(drop = True)
X = X[treated].reset_index(drop = True)

# The identifier and treatment-type columns are not model inputs.
# Note that X_test only loses these columns; its rows are not filtered here.
for frame in (X, X_test):
    frame.drop(['cp_type','sig_id'], axis=1, inplace=True)

print('New data shape', X.shape)

# Model Training

In [None]:
p_min = 0.001
p_max = 0.999

from tensorflow.keras import regularizers


def logloss(y_true, y_pred):
    """Mean binary cross-entropy with predictions clipped to [p_min, p_max].

    Parameters
    ----------
    y_true : tensor-like of 0/1 labels.
    y_pred : tensor-like of predicted probabilities, same shape as `y_true`.

    Returns
    -------
    Scalar tensor: the negative mean over all elements of
    `y_true*log(p) + (1-y_true)*log(1-p)`, where `p` is the clipped prediction.

    Written with `tf` ops only: the Keras backend alias `K` is never imported in
    this notebook, so using it here raised a NameError.
    """
    # Clipping keeps both logarithms finite.
    y_pred = tf.clip_by_value(y_pred, p_min, p_max)
    # Integer labels would otherwise fail to multiply with float predictions.
    y_true = tf.cast(y_true, y_pred.dtype)

    return -tf.reduce_mean(y_true * tf.math.log(y_pred)
                           + (1 - y_true) * tf.math.log(1 - y_pred))



def create_model(num_cols, hid_layers, activations, dropout_rate, lr, num_cols_y):
    """Build and compile a fully connected multi-label classifier.

    Layout: input -> batch norm -> one (dense, dropout, batch norm) stage per
    entry of `hid_layers` -> sigmoid dense layer with `num_cols_y` outputs.

    Parameters
    ----------
    num_cols : number of input features.
    hid_layers : unit count of each hidden dense layer.
    activations, dropout_rate : indexable, read at the same position as `hid_layers`.
    lr : learning rate for the Adam optimizer.
    num_cols_y : number of output units.

    Returns the compiled model. The loss is binary cross-entropy with label
    smoothing 0.001; `logloss` is attached as the metric.
    """
    keras_layers = tf.keras.layers

    inp1 = keras_layers.Input(shape=(num_cols, ))
    hidden = keras_layers.BatchNormalization()(inp1)

    for position, width in enumerate(hid_layers):
        hidden = keras_layers.Dense(width, activation=activations[position])(hidden)
        hidden = keras_layers.Dropout(dropout_rate[position])(hidden)
        hidden = keras_layers.BatchNormalization()(hidden)

    outputs = keras_layers.Dense(num_cols_y, activation='sigmoid')(hidden)
    model = tf.keras.models.Model(inputs=inp1, outputs=outputs)

    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    loss = tf.keras.losses.BinaryCrossentropy(label_smoothing=0.001)
    model.compile(optimizer=optimizer, loss=loss, metrics=logloss)

    return model

In [None]:
# Four parallel tables with ten entries each. Presumably entry k of every table
# describes one model configuration -- confirm against the training loop.
# Hidden-layer widths: three dense layers per configuration.
# NOTE(review): 1048 in the last entry is the only width that is not a multiple
# of 128 -- possibly a typo, verify.
hid_layers = [[2048, 768, 2048],[128, 1152, 1152],[128,1280,896],[1152, 2048, 1152],
              [128, 1408, 1024], [2048, 512, 1280], [128,1536,1024],[128,2048,1408],
              [128, 1408, 896],[1048,2048,1792]]

# Dropout rates, one per hidden layer.
dropout_rate = [[0.55,0.55,0.55],[0.55,0.45,0.55],[0.55,0.45,0.45],[0.55,0.55,0.55],
                [0.55,0.55,0.55],[0.55,0.4,0.55],[0.55,0.45,0.5],[0.55,0.55,0.5],
               [0.55, 0.45, 0.5],[0.45, 0.5, 0.55]]

# Activation names, one per hidden layer.
activations = [['selu', 'swish', 'swish'], ['selu','relu','swish'], ['selu','relu','swish'],
              ['selu','relu','swish'],['selu','relu','swish'],['selu','relu','swish'],
               ['selu','relu','elu'],['selu','relu','swish'],['selu','relu','swish'],
               ['elu','relu','swish']]

# Learning rates, one per configuration.
lr = [0.00035388197445653164,0.0003,0.0003,0.0003,0.0003,0.0003,0.0003,
      0.0003, 0.0010958464491213106, 0.0003]

# Positional indices of every feature column of X; inp_size equals their count.
feats = np.arange(0,X.shape[1],1)
inp_size = int(np.ceil(1* len(feats)))
# `res` has the shape and columns of y with every value set to zero.
res = y.copy()
# Zero the target columns of the sample submission as well.
df_sample.loc[:, y.columns] = 0
res.loc[:, y.columns] = 0

n_round = 2

In [None]:
def callbacks():
    """Create the three training callbacks, all monitoring `val_logloss` (lower is better).

    Returns
    -------
    (rlr, ckp, es):
        rlr -- ReduceLROnPlateau: multiplies the learning rate by 0.2 after 3
               epochs without improvement, never going below 1e-6.
        ckp -- ModelCheckpoint: writes the best model so far to "model.h5".
        es  -- EarlyStopping: stops after 10 epochs without improvement and
               restores the best weights.
    """
    watched = 'val_logloss'

    rlr = ReduceLROnPlateau(
        monitor=watched, factor=0.2, patience=3, verbose=0,
        min_delta=1e-4, min_lr=1e-6, mode='min',
    )
    ckp = ModelCheckpoint(
        "model.h5", monitor=watched, verbose=0,
        save_best_only=True, mode='min',
    )
    es = EarlyStopping(
        monitor=watched, min_delta=1e-5, patience=10, mode='min',
        baseline=None, restore_best_weights=True, verbose=0,
    )
    return (rlr, ckp, es)

In [None]:
def log_loss_metric(y_true, y_pred):
    """Average the per-target log loss over every target column.

    The target names come from the module-level frame `y`, so both `y_true` and
    `y_pred` must be frames containing all of `y.columns`. Predictions are cast
    to float, and `labels=[0,1]` is passed to `log_loss` for every column.
    """
    per_target = [
        log_loss(y_true.loc[:, name], y_pred.loc[:, name].astype(float), labels=[0,1])
        for name in y.columns
    ]
    return np.mean(per_target)