In [None]:
#the basics
import pandas as pd, numpy as np
import math, re, gc, random, os, sys
from matplotlib import pyplot as plt

#for maximum aesthetics
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

#tensorflow deep learning basics
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K

#for model evaluation
from sklearn.model_selection import train_test_split

#no warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    seed: integer used for Python's `random`, NumPy and TensorFlow; it is also
          written (as a string) to the PYTHONHASHSEED environment variable.
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


#single seed shared by the whole notebook
SEED = 34
seed_everything(SEED)

# I. Intro

**What is a Mechanism of Action? From Wikipedia:**

> In pharmacology, the term mechanism of action (MOA) refers to the specific biochemical interaction through which a drug substance produces its pharmacological effect. A mechanism of action usually includes mention of the specific molecular targets to which the drug binds, such as an enzyme or receptor. Receptor sites have specific affinities for drugs based on the chemical structure of the drug, as well as the specific action that occurs there.

**In this [competition](https://www.kaggle.com/c/lish-moa), we are asked to predict multiple target Mechanism of Action (MoA) responses of different samples (`sig_id`) given inputs like gene expression data and cell viability data. Note that the training data has an additional (optional) set of MoA labels that are *not* included in the test data and are not used in scoring.**

### Files

**The following is taken from the data tab of the competition description page, found [here](https://www.kaggle.com/c/lish-moa/data):**

* `train_features.csv` - Features for the training set. Features g- signify gene expression data, and c- signify cell viability data. cp_type indicates samples treated with a compound (cp_vehicle) or with a control perturbation (ctrl_vehicle); control perturbations have no MoAs; cp_time and cp_dose indicate treatment duration (24, 48, 72 hours) and dose (high or low).
* `train_targets_scored.csv` - The binary MoA targets that are scored.
* `train_targets_nonscored.csv` - Additional (optional) binary MoA responses for the training data. These are not predicted nor scored.
* `test_features.csv` - Features for the test data. You must predict the probability of each scored MoA for each row in the test data.
* `sample_submission.csv` - A submission file in the correct format.

In [None]:
#load files into memory as Pandas DataFrames
#all paths are relative to the notebook's working directory (../input/lish-moa)
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')
test_features = pd.read_csv('../input/lish-moa/test_features.csv')
#submission template; its target columns are overwritten with predictions later on
sample_sub = pd.read_csv('../input/lish-moa/sample_submission.csv')

# II. EDA (Plotly)

### 1. Basic

In [None]:
#sneak peek at training features
print(train_features.shape)
#use boolean `not`: `~` is a bitwise NOT and turns a plain Python bool into -1/-2 (both truthy)
if not train_features.isnull().values.any(): print('No missing values')
train_features.head()

In [None]:
#sneak peek at train targets
print(train_targets_scored.shape)
#use boolean `not`: `~` is a bitwise NOT and turns a plain Python bool into -1/-2 (both truthy)
if not train_targets_scored.isnull().values.any(): print('No missing values')
train_targets_scored.head()

In [None]:
#sneak peek at non scored train targets
print(train_targets_nonscored.shape)
#use boolean `not`: `~` is a bitwise NOT and turns a plain Python bool into -1/-2 (both truthy)
if not train_targets_nonscored.isnull().values.any(): print('No missing values')
train_targets_nonscored.head()

In [None]:
#sneak peek at test features
print(test_features.shape)
#use boolean `not`: `~` is a bitwise NOT and turns a plain Python bool into -1/-2 (both truthy)
if not test_features.isnull().values.any(): print('No missing values')
test_features.head()

**Let's merge `train_targets_scored` and `train_targets_nonscored` into a single `train` DataFrame now for convenience:**

In [None]:
#one wide frame: features plus scored and non-scored targets, joined on the sample id
train = (
    train_features
    .merge(train_targets_scored, on='sig_id', how='left')
    .merge(train_targets_nonscored, on='sig_id', how='left')
)

### 2. Train Features

**We are told that features with `g-` signify gene expression data and `c-` signifies cell viability data. `cp_type` indicates samples that were treated with a compound (`trt_cp`) or with a control perturbation (`ctl_vehicle`). Control perturbations have no MoAs. Furthermore, `cp_time` and `cp_dose` indicate treatment duration (either 24, 48, or 72 hours) and dose (either high or low):**

In [None]:
#number of samples per perturbation type; a title lets the figure stand on its own
fig = px.histogram(train, x='cp_type', histfunc='count',
                   title='Sample Count by Perturbation Type (cp_type)',
                   height=500, width=500)
fig.show()

**Let's see if the description we were given is accurate: do control group samples have any targets?**

In [None]:
#ids of the control-perturbation samples
control_ids = train.loc[train['cp_type'] == 'ctl_vehicle', 'sig_id']
control_targets = train_targets_scored.loc[train_targets_scored['sig_id'].isin(control_ids)]

#total number of positive scored labels among controls (0 confirms the description);
#only numeric columns are summed, so the string `sig_id` column is never concatenated
#and nothing depends on it being the first column
control_targets.select_dtypes(include='number').to_numpy().sum()

In [None]:
#samples per treatment duration; naming the index and the counts explicitly gives the
#same two columns regardless of how the installed pandas labels value_counts() output
cp_time_count = (
    train['cp_time']
    .value_counts()
    .rename_axis('cp_time')
    .reset_index(name='count')
)

fig = px.bar(cp_time_count, x='cp_time', y='count',
             title='Sample Count by Treatment Duration (cp_time)',
             height=500, width=600)
fig.show()

In [None]:
#number of samples per dose level; a title lets the figure stand on its own
fig = px.histogram(train, x='cp_dose',
                   title='Sample Count by Dose (cp_dose)',
                   height=500, width=600)
fig.show()

In [None]:
#histograms of the first 15 gene expression features (g-0 ... g-14), one per subplot row
num_plots = 15
fig = make_subplots(rows=num_plots, cols=1)

for i in range(num_plots):
    #feature names are 0-based while plotly subplot rows are 1-based; the previous
    #range(1, 15) skipped g-0 and left the last of the 15 rows empty
    fig.add_trace(
        go.Histogram(x=train[f'g-{i}'], name=f'g-{i}'),
        row=i + 1, col=1)


fig.update_layout(height=1200, width=800, title_text="Gene Expression Features")
fig.show()

In [None]:
#histograms of the first 15 cell viability features (c-0 ... c-14), one per subplot row
num_plots = 15
fig = make_subplots(rows=num_plots, cols=1)

for i in range(num_plots):
    #feature names are 0-based while plotly subplot rows are 1-based; the previous
    #range(1, 15) skipped c-0 and left the last of the 15 rows empty
    fig.add_trace(
        go.Histogram(x=train[f'c-{i}'], name=f'c-{i}'),
        row=i + 1, col=1)


fig.update_layout(height=1200, width=800, title_text="Cell Viability Features")
fig.show()

### 3. Target Features

**As mentioned above, the target features are categorized into two groups, scored and unscored. The features in both groups are binary. This is a multi-label classification problem: one sample can be assigned multiple targets or none at all. Let's see how often this happens:**

In [None]:
#target column names without the `sig_id` key; note that `frame[1:].columns` slices
#ROWS, so it would still contain `sig_id` and feed a string column into the row sums
scored_cols = train_targets_scored.columns.drop('sig_id', errors='ignore').tolist()
nonscored_cols = train_targets_nonscored.columns.drop('sig_id', errors='ignore').tolist()

fig = make_subplots(rows=1, cols=2)

#number of positive scored labels per sample
fig.add_trace(
        go.Histogram(x=train[scored_cols].sum(axis=1), name='Training Unique Scored Targets per Sample'),
        row=1, col=1)

#number of positive non-scored labels per sample
fig.add_trace(
        go.Histogram(x=train[nonscored_cols].sum(axis=1), name='Training Unique Non-Scored Targets per Sample'),
        row=1, col=2)

fig.update_layout(height=400, width=1000, title_text="Unique Labels per Sample")
fig.show()

**We see that most of the time, samples are either assigned to no label or one label, but there is a large difference in the 0 and 1 labels for the unscored targets.**

In [None]:
#positives per scored target, most frequent first (computed once, reused for x, y and color)
scored_label_names = train_targets_scored.columns[1:].tolist()
scored_label_counts = train[scored_label_names].sum(axis=0).sort_values(ascending=False)

fig = px.bar(
    x=scored_label_counts.values,
    y=scored_label_counts.index,
    color=scored_label_counts.values,
    height=800,
    width=800,
)
fig.update_layout(
    barmode='stack',
    yaxis={'categoryorder':'total ascending'},
    title='Training Scored Target Classification Counts',
)

fig.show()

**From the above count plot, we conclude that the most frequently classified scored labels are `nfkb_inhibitor`, `proteasome_inhibitor`, `cyclooxygenase_inhibitor`, `dopamine_receptor_antagonist`, and `serotonin_receptor_antagonist`. Now let's do the same for the nonscored labels:**

In [None]:
#positives per non-scored target, most frequent first (computed once, reused for x, y and color)
nonscored_label_names = train_targets_nonscored.columns[1:].tolist()
nonscored_label_counts = train[nonscored_label_names].sum(axis=0).sort_values(ascending=False)

fig = px.bar(
    x=nonscored_label_counts.values,
    y=nonscored_label_counts.index,
    color=nonscored_label_counts.values,
    height=800,
    width=800,
)
fig.update_layout(
    barmode='stack',
    yaxis={'categoryorder':'total ascending'},
    title='Training NonScored Target Classification Counts',
)

fig.show()

### 4. Correlations

**Correlated features will not always worsen a model, but they do not always improve one either. In general, there are 3 reasons to remove correlated features:**

1. Model trains faster
2. Remove harmful bias
3. Improve model interpretability

**That being said, we only want to remove correlated features that are weakly correlated with our target. Suppose we have 3 features `col1`, `col2`, and `col3`. Also suppose that `col1` and `col2` are highly correlated to the target, but all 3 features are correlated to each other. If we leave all the features and randomly select one of them, we have a `2/3` chance of getting a feature correlated with our target (which is what we want). If we decide to remove one of these 3 correlated features, and we happen to remove, say, `col2`, then this probability of getting a 'good' feature drops to `1/2`.**

**So, we want to locate features that are highly correlated with each other but only drop those that are weakly correlated with the target. Let's begin by creating some heatmaps:**

In [None]:
#numeric training features only: `frame[1:].columns` slices rows (so `sig_id` stayed in),
#and the categorical `cp_type`/`cp_dose` strings have no Pearson correlation either
feature_cols = train_features.columns.drop('sig_id', errors='ignore')
numeric_features = train[feature_cols].select_dtypes(include='number')

fig = px.imshow(numeric_features.corr(method='pearson'),
                title='Correlations Among Training Features',
                height=800, width=800)
fig.show()

**It would seem we have a lot of correlation going on in the bottom right corner of the above graph. This is the `c-` region.**

In [None]:
#zoom in on the cell viability block only; the title now says so instead of repeating
#the title of the all-features heatmap
cell_viability_cols = [col for col in train_features.columns if 'c-' in col]

fig = px.imshow(train[cell_viability_cols].corr(method='pearson'),
                title='Correlations Among Cell Viability (c-) Features',
                height=800, width=800)
fig.show()

# III. Processing

**Indeed, we have a lot of correlation between `c-` variables. The least correlated pair seems to have a correlation of no less than `.6`. Let's try to find the ones that aren't highly correlated with the target. We can do so, rather sloppily I admit, like this:**

In [None]:
#cell viability and gene expression feature names
c_cols = [col for col in train_features.columns if col.startswith('c-')]
g_cols = [col for col in train_features.columns if col.startswith('g-')]

#scored target names; iterating the DataFrame itself would also yield the string
#`sig_id` column, which cannot take part in a Pearson correlation
target_cols = train_targets_scored.columns.drop('sig_id', errors='ignore').tolist()

#square matrix laid out as [c- columns..., target columns...] on both axes
c_corrs = train[[*c_cols, *target_cols]].corr(method='pearson')

**Now we come up with a list of `c-` features that are highly correlated with each other and a list of `c-` features that are highly correlated to the target columns:**

In [None]:
threshold_bad = .85
bad_c_cols = []

#c- vs c- quadrant of the correlation matrix (the c- columns come first on both axes)
c_vs_c = c_corrs.iloc[:len(c_cols), :len(c_cols)]

for col in c_vs_c.columns:
    #Series.items() replaces iteritems(), which was removed in pandas 2.0
    for other, corr in c_vs_c[col].items():
        #compare names by value (`!=`), not identity, to skip the self-correlation of 1.0
        if abs(corr) > threshold_bad and other != col and other not in bad_c_cols:
            bad_c_cols.append(other)

print(f"{len(bad_c_cols)} c- columns with correlation to other c- columns above {threshold_bad}")
print('')
print(bad_c_cols)

In [None]:
threshold_good = .65
good_c_cols = []

#c- (rows) vs target (columns) quadrant of the correlation matrix
c_vs_target = c_corrs.iloc[:len(c_cols), len(c_cols):]

for target in c_vs_target.columns:
    #Series.items() replaces iteritems(), which was removed in pandas 2.0
    for c_col, corr in c_vs_target[target].items():
        #NaN correlations compare False here and are therefore never selected
        if abs(corr) > threshold_good and c_col != target and c_col not in good_c_cols:
            good_c_cols.append(c_col)

print(f"{len(good_c_cols)} c- columns with correlation to target above {threshold_good}")
print('')
print(good_c_cols)

**Now we create a combined list of features: `c-` columns that are highly correlated with each other, but that are also not too correlated with the target columns.**

In [None]:
#c- columns that are redundant with other c- columns and show no strong target correlation
c_cols_to_drop = list(filter(lambda name: name not in good_c_cols, bad_c_cols))
print(len(c_cols_to_drop))
print(c_cols_to_drop)

**Great! That seemed to do the trick. So using the above procedure, we can drop only the highly correlated features that are uncorrelated to the target. We can also perform feature selection offline with permutation importance, as is done in this notebook [here](https://www.kaggle.com/stanleyjzheng/multilabel-neural-network-improved).**

In [None]:
#Positional (0-based) column indices used by the training cells via
#`train_features.iloc[:, great_cols]` / `test_features.iloc[:, great_cols]`, i.e. they
#index the feature frames AFTER `preprocess` has removed `sig_id`.
#NOTE(review): presumably the output of the offline permutation-importance selection
#mentioned in the markdown above; the list is hard-coded and not recomputed here.
great_cols = [  0,   1,   2,   3,   5,   6,   8,   9,  10,  11,  12,  14,  15,
        16,  18,  19,  20,  21,  23,  24,  25,  27,  28,  29,  30,  31,
        32,  33,  34,  35,  36,  37,  39,  40,  41,  42,  44,  45,  46,
        48,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,
        63,  64,  65,  66,  68,  69,  70,  71,  72,  73,  74,  75,  76,
        78,  79,  80,  81,  82,  83,  84,  86,  87,  88,  89,  90,  92,
        93,  94,  95,  96,  97,  99, 100, 101, 103, 104, 105, 106, 107,
       108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
       121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 132, 133, 134,
       135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147,
       149, 150, 151, 152, 153, 154, 155, 157, 159, 160, 161, 163, 164,
       165, 166, 167, 168, 169, 170, 172, 173, 175, 176, 177, 178, 180,
       181, 182, 183, 184, 186, 187, 188, 189, 190, 191, 192, 193, 195,
       197, 198, 199, 202, 203, 205, 206, 208, 209, 210, 211, 212, 213,
       214, 215, 218, 219, 220, 221, 222, 224, 225, 227, 228, 229, 230,
       231, 232, 233, 234, 236, 238, 239, 240, 241, 242, 243, 244, 245,
       246, 248, 249, 250, 251, 253, 254, 255, 256, 257, 258, 259, 260,
       261, 263, 265, 266, 268, 270, 271, 272, 273, 275, 276, 277, 279,
       282, 283, 286, 287, 288, 289, 290, 294, 295, 296, 297, 299, 300,
       301, 302, 303, 304, 305, 306, 308, 309, 310, 311, 312, 313, 315,
       316, 317, 320, 321, 322, 324, 325, 326, 327, 328, 329, 330, 331,
       332, 333, 334, 335, 338, 339, 340, 341, 343, 344, 345, 346, 347,
       349, 350, 351, 352, 353, 355, 356, 357, 358, 359, 360, 361, 362,
       363, 364, 365, 366, 368, 369, 370, 371, 372, 374, 375, 376, 377,
       378, 379, 380, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391,
       392, 393, 394, 395, 397, 398, 399, 400, 401, 403, 405, 406, 407,
       408, 410, 411, 412, 413, 414, 415, 417, 418, 419, 420, 421, 422,
       423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435,
       436, 437, 438, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450,
       452, 453, 454, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465,
       466, 468, 469, 471, 472, 473, 474, 475, 476, 477, 478, 479, 482,
       483, 485, 486, 487, 488, 489, 491, 492, 494, 495, 496, 500, 501,
       502, 503, 505, 506, 507, 509, 510, 511, 512, 513, 514, 516, 517,
       518, 519, 521, 523, 525, 526, 527, 528, 529, 530, 531, 532, 533,
       534, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547,
       549, 550, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563,
       564, 565, 566, 567, 569, 570, 571, 572, 573, 574, 575, 577, 580,
       581, 582, 583, 586, 587, 590, 591, 592, 593, 595, 596, 597, 598,
       599, 600, 601, 602, 603, 605, 607, 608, 609, 611, 612, 613, 614,
       615, 616, 617, 619, 622, 623, 625, 627, 630, 631, 632, 633, 634,
       635, 637, 638, 639, 642, 643, 644, 645, 646, 647, 649, 650, 651,
       652, 654, 655, 658, 659, 660, 661, 662, 663, 664, 666, 667, 668,
       669, 670, 672, 674, 675, 676, 677, 678, 680, 681, 682, 684, 685,
       686, 687, 688, 689, 691, 692, 694, 695, 696, 697, 699, 700, 701,
       702, 703, 704, 705, 707, 708, 709, 711, 712, 713, 714, 715, 716,
       717, 723, 725, 727, 728, 729, 730, 731, 732, 734, 736, 737, 738,
       739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751,
       752, 753, 754, 755, 756, 758, 759, 760, 761, 762, 763, 764, 765,
       766, 767, 769, 770, 771, 772, 774, 775, 780, 781, 782, 783, 784,
       785, 787, 788, 790, 793, 795, 797, 799, 800, 801, 805, 808, 809,
       811, 812, 813, 816, 819, 820, 821, 822, 823, 825, 826, 827, 829,
       831, 832, 833, 834, 835, 837, 838, 839, 840, 841, 842, 844, 845,
       846, 847, 848, 850, 851, 852, 854, 855, 856, 858, 860, 861, 862,
       864, 867, 868, 870, 871, 873, 874]

In [None]:
def preprocess(df):
    """Encode the categorical treatment columns as integers and drop the sample id.

    df: feature DataFrame with a `cp_type` column ('trt_cp' / 'ctl_vehicle') and a
        `cp_dose` column ('D1' / 'D2'); a `sig_id` column is removed when present.

    Returns a NEW DataFrame in which `cp_type` is 0 (compound) or 1 (control) and
    `cp_dose` is 2 ('D1') or 3 ('D2'). The input frame is left untouched, so the
    caller's raw data is never modified behind its back.
    """
    #assign() creates fresh columns carrying the mapped integer dtype; writing through
    #`df.loc[:, col] = ...` can keep the original object dtype on recent pandas
    #versions, which later breaks the conversion of `.values` to a tensor
    encoded = df.assign(
        cp_type=df['cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1}),
        cp_dose=df['cp_dose'].map({'D1': 2, 'D2': 3}),
    )
    #errors='ignore' keeps the function usable on frames that carry no `sig_id`
    return encoded.drop(columns='sig_id', errors='ignore')

In [None]:
#encode the categorical columns and drop `sig_id` in both feature frames (train first)
train_features, test_features = [
    preprocess(frame) for frame in (train_features, test_features)
]

In [None]:
#keep only the label columns; the id is not a prediction target
train_targets_scored = train_targets_scored.drop(columns=['sig_id'])

**Drop control group from training since all MoA's are 0:**

In [None]:
#rows treated with a compound (cp_type encoded as 0); control rows are discarded
is_treated = train_features['cp_type'] == 0

#targets and features are filtered with the same mask so their rows stay aligned
train_targets_scored = train_targets_scored.loc[is_treated].reset_index(drop=True)
train_features = train_features.loc[is_treated].reset_index(drop=True)

**Initialize sample submission with all 0s:**

In [None]:
#zero every scored-target column of the submission template (`sig_id` is not among
#train_targets_scored.columns at this point, so it is left as-is)
sample_sub.loc[:, train_targets_scored.columns] = 0

# IV. Neural Net Ensemble

**Neural networks are random in nature because the weights of the nodes in each layer are randomly initialized at the beginning of training. From an experimental perspective, this means we need to run many experiments with the same parameters and average their results to compare different model architectures and processing techniques. From a prediction perspective, this means we can train the same model many different times and use each of these trained models to predict, taking an average (weighted or not) for our final predictions:**

In [None]:
#master switch for the random-split ensemble; its training/evaluation cells are skipped when False
USE_NN_ENSEMBLE = False
#when True, ensembles are additionally trained on the `great_cols` feature subset
USE_PROCESSED = True

In [None]:
#basic training configuration
#number of independently trained networks whose test predictions are averaged
NUM_NETS = 1
#epochs per network (also used by the stratified k-fold section further down)
EPOCHS = 30
BATCH_SIZE = 64
#verbosity passed to model.fit and to the ReduceLROnPlateau callback
VERBOSE = 0

### 1. Model

**With lookahead optimizer inspired from [here](https://www.kaggle.com/simakov/keras-multilabel-neural-network-v1-2). For more on lookahead, I recommend this [video](https://www.youtube.com/watch?v=ypqf7UUird4)**

![Graph of different activation functions](https://raw.githubusercontent.com/krutikabapat/krutikabapat.github.io/master/assets/activation.png)

**I also wanted the freedom to experiment with different activation layers and batch normalization, so I included that in the `build_model` function. Here are some relevant papers:**

**Batch Normalization**
* **Batch normalization paper [here](https://arxiv.org/abs/1502.03167)**
* **TensorFlow documentation [here](https://www.tensorflow.org/api_docs/python/tf/keras/layers/BatchNormalization)**

**Swish Activation Function**
* **Original paper [here](https://arxiv.org/pdf/1710.05941.pdf)**
* **TensorFlow documentation [here](https://www.tensorflow.org/api_docs/python/tf/keras/activations/swish)**

**Mish Activation Function**
* **Original paper [here](https://arxiv.org/abs/1908.08681)**
* **TensorFlow documentation [here](https://www.tensorflow.org/addons/api_docs/python/tfa/activations/mish)**

**Relu Activation Function**
* **Original paper [here](https://arxiv.org/pdf/1803.08375.pdf)**
* **TensorFlow documentation [here](https://www.tensorflow.org/api_docs/python/tf/keras/activations/relu)**

**Selu Activation Function**
* **Original paper [here](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf)**
* **TensorFlow documentation [here](https://www.tensorflow.org/api_docs/python/tf/keras/activations/selu)**

In [None]:
def _add_hidden_block(model, num_columns, num_nodes, activation, batch_norm, dropout):
    """Append an input stage plus two weight-normalised dense layers to `model`."""
    #input stage: optional batch normalisation followed by a fixed 20% dropout
    if batch_norm:
        model.add(tf.keras.layers.BatchNormalization(input_shape=(num_columns,)))
        model.add(tf.keras.layers.Dropout(.2))
    else:
        model.add(tf.keras.layers.Dropout(.2, input_shape=(num_columns,)))

    #two identical hidden layers; batch normalisation now honours `batch_norm` in BOTH
    #of them (the second layer used to add it unconditionally)
    for _ in range(2):
        model.add(tfa.layers.WeightNormalization(
            tf.keras.layers.Dense(num_nodes, activation=activation)))
        if batch_norm:
            model.add(tf.keras.layers.BatchNormalization())
        model.add(tf.keras.layers.Dropout(dropout))


def build_model(num_columns, num_nodes = 1024, use_swish = False, use_mish = False,
                use_relu = False, use_selu = False, batch_norm = True, dropout = .4,
                num_targets = 206):
    """Build and compile a dense multi-label classifier.

    num_columns: number of input features.
    num_nodes:   width of each hidden dense layer.
    use_swish, use_mish, use_relu, use_selu:
                 select the hidden activation. Every enabled flag appends its own
                 two-layer hidden block (in this order); with no flag enabled the
                 model consists of the sigmoid output layer only.
    batch_norm:  insert BatchNormalization before the first dropout and after each
                 hidden dense layer.
    dropout:     dropout rate applied after each hidden dense layer.
    num_targets: number of sigmoid output units (defaults to the 206 scored targets).

    Returns a compiled `tf.keras.Sequential` using Lookahead(Adam) and binary
    cross-entropy, tracking AUC under the fixed metric name 'auc'.
    """
    model = tf.keras.Sequential()

    #(flag, activation) pairs in the order the hidden blocks are appended
    activation_choices = [
        (use_swish, 'swish'),
        (use_mish, tfa.activations.mish),
        (use_relu, 'relu'),
        (use_selu, 'selu'),
    ]
    for enabled, activation in activation_choices:
        if enabled:
            _add_hidden_block(model, num_columns, num_nodes, activation, batch_norm, dropout)

    #output layer
    model.add(tf.keras.layers.Dense(num_targets, activation='sigmoid'))

    #compiler; the metric is named explicitly so the training history always exposes
    #'auc' / 'val_auc', even when many models are built in the same session
    model.compile(optimizer = tfa.optimizers.Lookahead(tf.optimizers.Adam(), sync_period = 10),
                  loss = 'binary_crossentropy',
                  metrics = [tf.keras.metrics.AUC(name = 'auc')])

    return model

### 2. Random Ensemble Training

**To get as diverse a set of NNs as possible, we place `train_test_split` within the training loop so that each model is trained/evaluated on a different subset of the data. This approach might generalize better than stratified fold training if a large enough number of NNs are trained.**

In [None]:
#running average of the test predictions of all nets (stays all-zero when the ensemble is off)
preds = np.zeros((test_features.shape[0], 206))
histories = []

if USE_NN_ENSEMBLE:
    for j in range(NUM_NETS):

        #create a fresh random 90/10 train/validation split for every net
        train_ds, val_ds, train_targets, val_targets = train_test_split(
            train_features.values, train_targets_scored.values, test_size = 0.1)

        #some callbacks we can use: keep the best weights and decay the learning rate
        sv = tf.keras.callbacks.ModelCheckpoint(f'net-{j}.h5', monitor = 'val_loss', verbose = 0,
                                                save_best_only = True, save_weights_only = True, mode = 'min')
        #`min_delta` is the current name of the deprecated `epsilon` argument
        reduce_lr_loss = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3,
                                                              verbose=VERBOSE, min_delta=1e-4, mode='min')

        #build and train the network
        model = build_model(train_features.shape[1], use_swish = True)
        history = model.fit(train_ds, train_targets,
                            validation_data = (val_ds, val_targets),
                            epochs = EPOCHS, batch_size = BATCH_SIZE,
                            callbacks = [reduce_lr_loss, sv], verbose = VERBOSE)
        histories.append(history)

        #report training results
        print(f"Neural Net {j + 1}: Epochs={EPOCHS}, Train AUC={round(max(history.history['auc']), 5)}, Train loss={round(min(history.history['loss']), 5)}, Validation AUC={round(max(history.history['val_auc']), 5)}, Validation loss={round(min(history.history['val_loss']), 5)}")
        print('')

        #predict the test set with the best checkpoint and add it to the running average
        model.load_weights(f'net-{j}.h5')
        pred = model.predict(test_features)
        preds += pred / NUM_NETS

**Now we train with our processed data to compare performance:**

In [None]:
#running average of the test predictions of all nets trained on the selected columns
preds_proc = np.zeros((test_features.shape[0], 206))
histories_proc = []

#`and` is the boolean operator; `&` is bitwise and only happens to work on plain bools
if USE_NN_ENSEMBLE and USE_PROCESSED:

    #restrict train AND test to the same selected columns: the networks below are built
    #for exactly this many inputs, so predicting on the full test frame cannot work
    #train_dataset = train_features.drop(columns=c_cols_to_drop)
    train_dataset = train_features.iloc[:, great_cols]
    test_dataset = test_features.iloc[:, great_cols]

    for j in range(NUM_NETS):

        #create a fresh random 90/10 train/validation split for every net
        train_ds, val_ds, train_targets, val_targets = train_test_split(
            train_dataset.values, train_targets_scored.values, test_size = 0.1)

        #some callbacks we can use: keep the best weights and decay the learning rate
        sv = tf.keras.callbacks.ModelCheckpoint(f'net-{j}.h5', monitor = 'val_loss', verbose = 0,
                                                save_best_only = True, save_weights_only = True, mode = 'min')
        #`min_delta` is the current name of the deprecated `epsilon` argument
        reduce_lr_loss = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3,
                                                              verbose=VERBOSE, min_delta=1e-4, mode='min')

        #build and train the network
        model = build_model(train_dataset.shape[1], use_swish = True)
        history = model.fit(train_ds, train_targets,
                            validation_data = (val_ds, val_targets),
                            epochs = EPOCHS, batch_size = BATCH_SIZE,
                            callbacks = [reduce_lr_loss, sv], verbose = VERBOSE)
        histories_proc.append(history)

        #report training results
        print(f"Neural Net {j + 1}: Epochs={EPOCHS}, Train AUC={round(max(history.history['auc']), 5)}, Train loss={round(min(history.history['loss']), 5)}, Validation AUC={round(max(history.history['val_auc']), 5)}, Validation loss={round(min(history.history['val_loss']), 5)}")
        print('')

        #predict the test set with the best checkpoint; accumulate into `preds_proc`
        #(not `preds`, which belongs to the unprocessed ensemble)
        model.load_weights(f'net-{j}.h5')
        pred = model.predict(test_dataset)
        preds_proc += pred / NUM_NETS

### 3. Evaluation

In [None]:
#best (minimum) validation loss and best (maximum) validation AUC per net, averaged
if USE_NN_ENSEMBLE:
    print(f"Average validation loss: {np.average([min(h.history['val_loss']) for h in histories])}")
    print(f"Average validation AUC: {np.average([max(h.history['val_auc']) for h in histories])}")

#the processed run must report its own histories, not those of the raw-feature run
if USE_NN_ENSEMBLE and USE_PROCESSED:
    print(f"Average validation loss: {np.average([min(h.history['val_loss']) for h in histories_proc])}")
    print(f"Average validation AUC: {np.average([max(h.history['val_auc']) for h in histories_proc])}")

In [None]:
#define function to visualize learning curves
def plot_learning_curves(histories, num):
    """Overlay the training and validation loss curves of up to `num` training runs.

    histories: sequence of Keras History objects (as returned by `model.fit`).
    num:       maximum number of runs to draw, taken from the start of `histories`.

    Training loss is drawn in color 'C0' and validation loss in 'C1' on one shared axis.
    """
    fig, ax = plt.subplots(figsize = (20, 10))

    #plot losses; slicing instead of indexing range(num) tolerates num > len(histories)
    for run in histories[:num]:
        ax.plot(run.history['loss'], color = 'C0')
        ax.plot(run.history['val_loss'], color = 'C1')

    ax.set_xlabel('epoch')
    ax.set_ylabel('loss')

    #set master title
    fig.suptitle("Model Performance", fontsize=14)

if USE_NN_ENSEMBLE:
    plot_learning_curves(histories, NUM_NETS)

#`histories_proc` is only filled when the processed run is enabled as well
if USE_NN_ENSEMBLE and USE_PROCESSED:
    plot_learning_curves(histories_proc, NUM_NETS)

# V. RepeatedMultilabelStratifiedKFold

**Now we can try stratified fold training. Since we have a multi-label target, we cannot use scikit-learn's `StratifiedKFold`, but we can use iterative stratification based on [this paper](https://link.springer.com/chapter/10.1007/978-3-642-23808-6_10) from [this GitHub repository](https://github.com/trent-b/iterative-stratification). Of course, internet is not allowed for this competition, so we will use [this Kaggle dataset](https://www.kaggle.com/mudittiwari255/iterativestrat):**

In [None]:
#master switch for the repeated stratified k-fold ensemble below
USE_SKF_ENSEMBLE = True
#re-declared here (first set in the neural net ensemble section): also train on `great_cols`
USE_PROCESSED = True

In [None]:
#basic training configuration
#FOLDS * REPEATS models are trained in total and their test predictions averaged
FOLDS = 5
REPEATS = 2
BATCH_SIZE = 64
#verbosity passed to model.fit and to the ReduceLROnPlateau callback
VERBOSE = 0

In [None]:
#make the iterative-stratification package from the attached dataset importable
#(the markdown above explains that it cannot be installed from the internet here)
sys.path.append('../input/iterativestrat/iterative-stratification-master')
from iterstrat.ml_stratifiers import RepeatedMultilabelStratifiedKFold

### 1. StratifiedKFold Training

In [None]:
#running average of the test predictions over all folds and repeats
skf_preds = np.zeros((test_features.shape[0], 206))
skf_histories = []
skf = RepeatedMultilabelStratifiedKFold(n_splits=FOLDS, n_repeats=REPEATS, random_state=SEED)

if USE_SKF_ENSEMBLE:
    for f, (train_index, val_index) in enumerate(skf.split(train_features.values, train_targets_scored.values)):

        #get datasets
        train_ds = train_features.values[train_index]
        train_targets = train_targets_scored.values[train_index]
        val_ds = train_features.values[val_index]
        val_targets = train_targets_scored.values[val_index]

        #some callbacks we can use: keep the best weights and decay the learning rate
        sv = tf.keras.callbacks.ModelCheckpoint(f'fold-{f}.h5', monitor = 'val_loss', verbose = 0,
                                                save_best_only = True, save_weights_only = True, mode = 'min')
        #`min_delta` is the current name of the deprecated `epsilon` argument
        reduce_lr_loss = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3,
                                                              verbose=VERBOSE, min_delta=1e-4, mode='min')

        #build and train the network for this fold
        model = build_model(train_features.shape[1], use_swish = True)
        history = model.fit(train_ds, train_targets,
                            validation_data = (val_ds, val_targets),
                            epochs = EPOCHS, batch_size = BATCH_SIZE,
                            callbacks = [reduce_lr_loss, sv], verbose = VERBOSE)
        print('')
        skf_histories.append(history)

        #report training results
        print(f"Fold {f + 1}: Epochs={EPOCHS}, Train AUC={round(max(history.history['auc']), 5)}, Train loss={round(min(history.history['loss']), 5)}, Validation AUC={round(max(history.history['val_auc']), 5)}, Validation loss={round(min(history.history['val_loss']), 5)}")
        print('')

        #predict the test set with the best checkpoint and add it to the running average
        model.load_weights(f'fold-{f}.h5')
        pred = model.predict(test_features)
        skf_preds += pred / FOLDS / REPEATS

In [None]:
#running average of the test predictions over all folds and repeats (selected columns only)
skf_preds_proc = np.zeros((test_features.shape[0], 206))
skf_histories_proc = []
rmskf = RepeatedMultilabelStratifiedKFold(n_splits=FOLDS, n_repeats=REPEATS, random_state=SEED)

#`and` is the boolean operator; `&` is bitwise and only happens to work on plain bools
if USE_SKF_ENSEMBLE and USE_PROCESSED:

    #the column selection does not depend on the fold, so it is done once up front
    #train_dataset = train_features.drop(columns=c_cols_to_drop)
    train_dataset = train_features.iloc[:, great_cols]
    test_dataset = test_features.iloc[:, great_cols]

    for f, (train_index, val_index) in enumerate(rmskf.split(train_features.values, train_targets_scored.values)):

        #get datasets
        train_ds = train_dataset.values[train_index]
        train_targets = train_targets_scored.values[train_index]
        val_ds = train_dataset.values[val_index]
        val_targets = train_targets_scored.values[val_index]

        #some callbacks we can use: keep the best weights and decay the learning rate
        sv = tf.keras.callbacks.ModelCheckpoint(f'fold-{f}.h5', monitor = 'val_loss', verbose = 0,
                                                save_best_only = True, save_weights_only = True, mode = 'min')
        #`min_delta` is the current name of the deprecated `epsilon` argument
        reduce_lr_loss = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3,
                                                              verbose=VERBOSE, min_delta=1e-4, mode='min')

        #build and train the network for this fold
        model = build_model(train_dataset.shape[1],
                            use_swish = True)

        history = model.fit(train_ds, train_targets, validation_data = (val_ds, val_targets),
                            epochs = EPOCHS, batch_size = BATCH_SIZE,
                            callbacks = [reduce_lr_loss, sv], verbose = VERBOSE)
        print('')
        skf_histories_proc.append(history)

        #report training results
        print(f"Fold {f + 1}: Epochs={EPOCHS}, Train AUC={round(max(history.history['auc']), 5)}, Train loss={round(min(history.history['loss']), 5)}, Validation AUC={round(max(history.history['val_auc']), 5)}, Validation loss={round(min(history.history['val_loss']), 5)}")
        print('')

        #predict the test set with the best checkpoint and add it to the running average
        model.load_weights(f'fold-{f}.h5')
        pred = model.predict(test_dataset)
        skf_preds_proc += pred / FOLDS / REPEATS

### 2. Evaluation

In [None]:
#best (minimum) validation loss and best (maximum) validation AUC per fold, averaged
if USE_SKF_ENSEMBLE:
    print('#'*25)
    print('SKF Ensemble Results')
    print('#'*25); print('')
    print(f"Average validation loss: {np.average([min(h.history['val_loss']) for h in skf_histories])}")
    print(f"Average validation AUC: {np.average([max(h.history['val_auc']) for h in skf_histories])}")
    print('')

#iterate the processed histories directly instead of indexing them by len(skf_histories)
if USE_SKF_ENSEMBLE and USE_PROCESSED:
    print('#'*25)
    print('SKF Ensemble Results - Processed')
    print('#'*25); print('')
    print(f"Average validation loss: {np.average([min(h.history['val_loss']) for h in skf_histories_proc])}")
    print(f"Average validation AUC: {np.average([max(h.history['val_auc']) for h in skf_histories_proc])}")

# VI. Submission

**We have already predicted during the training loops, so now we just need to assemble a submission DataFrame. We can just steal the sample submission given to us and with some minor tweaks, we are done:**

In [None]:
#inspect the submission template before filling in the predictions
print(sample_sub.shape)
sample_sub.head()

In [None]:
#write the averaged predictions into the submission template. The `*_proc` arrays are
#only filled when USE_PROCESSED is on; otherwise fall back to the raw-feature
#predictions instead of submitting all zeros
if USE_NN_ENSEMBLE:
    sample_sub.loc[:, train_targets_scored.columns] = preds_proc if USE_PROCESSED else preds

#when both ensembles are enabled, the stratified k-fold predictions take precedence
if USE_SKF_ENSEMBLE:
    sample_sub.loc[:, train_targets_scored.columns] = skf_preds_proc if USE_PROCESSED else skf_preds

sample_sub.head()

**Recall that `cp_type` indicates samples that were treated with a compound (`trt_cp`) or with a control perturbation (`ctl_vehicle`) and that control perturbations have no MoAs. So, we should set the control samples in our test dataset to 0.**

In [None]:
#sanity check: submission rows of control samples (cp_type was encoded as 1 by preprocess)
#NOTE(review): assumes sample_sub and test_features share the same row index — confirm
sample_sub.loc[test_features['cp_type'] == 1].head()

In [None]:
#control perturbations have no MoAs, so force all their target predictions to 0
sample_sub.loc[test_features['cp_type'] == 1, train_targets_scored.columns] = 0

In [None]:
#last sanity check: first rows of the final submission frame
sample_sub.head()

In [None]:
#write the submission to the working directory without the DataFrame index column
sample_sub.to_csv('submission.csv', index = False)
print('Submission saved')