# Summary
- `cp_dose` is the only used categorical variable.
- <i> <b>Control</b> patients have not been used </i> since they don't have MoA.
- Numerous Neural Networks have been used and aggregated to make predictions
- Neural Network with different entries will run in parallel
- Some of the structures used are Linear, Residual Connections, CNN, inception-style feature extraction and mix of all.
- Also the high performing Neural Networks have been saved in <a href="https://www.kaggle.com/damoonshahhosseini/nnmoa">NN-MoA</a> dataset for further improvement

In [None]:
# Importing the needed utilities for NN
import tensorflow as tf
from tensorflow.keras import Model, Input, models
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ReduceLROnPlateau
from tensorflow.keras.layers import BatchNormalization, MaxPooling2D, ReLU, Dropout, Flatten, Dense, InputLayer, Concatenate, Add, SeparableConv2D, Layer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy, BinaryCrossentropy
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.initializers import TruncatedNormal, he_uniform, he_normal

# Data Preprocessing utilities
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold

# Importing the plotting utilities
import pylab as plot
from matplotlib import pyplot as plt

# Matplotlib overrides applied globally: legend sizing, tick-label size on
# both axes, and the axes-title size.
params = {
    'legend.fontsize': 20,
    'legend.handlelength': 5,
    'xtick.labelsize': 30,
    'ytick.labelsize': 30,
    'axes.titlesize': 50,
}

# Push the overrides into the active rcParams so later figures pick them up
plot.rcParams.update(params)

# Base Libraries
import numpy as np
import pandas as pd 

# Importing and preprocessing
- Excluding the rows with cp_type equal to ctl_vehicle since control perturbations have no MoAs
- Normalizing the cp_time column
- Mapping cp_dose values to 1 and -1
- Setting the type of all data to float64 in order to get them ready for Neural Network layers.
- Run min_max scaler on the combination of X_train and X_test then split

In [None]:
# Feature tables for the training and test samples
train_features = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
test_features = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')

# Target tables: the scored targets and the additional non-scored ones
train_targets = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('/kaggle/input/lish-moa/train_targets_nonscored.csv')

# Sample submission file, used later as the template for the output
submission = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')

# Getting the indices with MoA reactions
# indices = train_features[train_features['cp_type'] == 'trt_cp'].index.to_list()

In [None]:
# Dropping the control rows since they have no MoA (currently disabled)
# train_targets = train_targets.iloc[indices, :].reset_index(drop=True)
# train_targets_nonscored = train_targets_nonscored.iloc[indices, :].reset_index(drop=True)
# train_features = train_features.iloc[indices, :].reset_index(drop=True)

# Lookup tables that encode the two categorical columns as +1 / -1
cp_dose = {'D1': 1, 'D2': -1}
cp_ctrl = {'trt_cp': 1, 'ctl_vehicle': -1}

# The train and test frames receive exactly the same treatment
for frame in (train_features, test_features):
    # Mapping dictionaries to columns
    frame['cp_dose'] = frame['cp_dose'].map(cp_dose)
    frame['cp_type'] = frame['cp_type'].map(cp_ctrl)

    # Deleting unnecessary columns of data; cp_type is kept as an encoded
    # feature (its removal is disabled):
    # del frame['cp_type']
    del frame['sig_id']

# Scaling data
- Using MinMaxScaler to scale the data between 0 to 1 relative to their columns
- This will increase the performance of the Neural Network for convergence

In [None]:
# Min_max scaler
# Train and test rows are stacked so that PCA and the scaler are fitted on
# both; train_len is the row position used to split them apart again.
train_len = train_features.shape[0]
comb = pd.concat([train_features, test_features])

# Breakdown of the data.
# Columns are picked by name instead of by position: the previous slices
# (2:774 and 774:) were written for two leading metadata columns, but cp_type
# is no longer deleted, so every feature sits one position further right and
# the gene/cell blocks were each off by one column.
# Assumes gene columns are prefixed 'g-' and cell columns 'c-' (lish-moa
# naming) -- TODO confirm against train_features.columns.
genes_comb = comb.loc[:, comb.columns.str.startswith('g-')]
cells_comb = comb.loc[:, comb.columns.str.startswith('c-')]

# PCA Versions
# A fixed random_state keeps the projections identical between runs when
# scikit-learn selects its randomized SVD solver.
PCA_SEED = 0
pca_genes = PCA(n_components=20, random_state=PCA_SEED).fit_transform(genes_comb)
pca_cells = PCA(n_components=10, random_state=PCA_SEED).fit_transform(cells_comb)
pca_all = PCA(n_components=100, random_state=PCA_SEED).fit_transform(comb)

# Scale every column of the stacked frame to [0, 1]
comb_scaled = MinMaxScaler().fit_transform(comb)

# Split the scaled matrix back into its train and test parts
scaled_train_features = comb_scaled[:train_len]
scaled_test_features = comb_scaled[train_len:]

In [None]:
# Cast every table to float64 so it can be fed to the network layers.
# astype() already returns a new object, so no extra copy is requested.

# Training
X = train_features.astype('float64')
Xs = pd.DataFrame(scaled_train_features).astype('float64')

pca_genes_train = pca_genes[:train_len]
pca_cells_train = pca_cells[:train_len]
pca_all_train = pca_all[:train_len]


# Testing
X_test = test_features.astype('float64')
Xs_test = pd.DataFrame(scaled_test_features).astype('float64')

pca_genes_test = pca_genes[train_len:]
pca_cells_test = pca_cells[train_len:]
pca_all_test = pca_all[train_len:]


# Targets (the first column of each target table is skipped)
y = train_targets.iloc[:, 1:].astype('float64')
y_non = train_targets_nonscored.iloc[:, 1:].astype('float64')  # The non_scored drugs

# Scored targets followed by the non-scored ones, side by side
y_all = pd.concat([y, y_non], axis=1)

## Input Pipelines
Different input structures will be fed to the models:
- all_data -> All the columns in a row
- Cells(_test) -> Only the features with the name cell in them
- Genes(_test) -> Only the features with the name gene in them 
- Cell_image(_test) -> cells data converted to 10x10 image so it can be treated as an image

In [None]:
# Positions of the gene and cell columns, resolved by name on X. Positions
# (not labels) are needed because Xs / Xs_test were rebuilt from NumPy arrays
# and therefore carry plain integer column labels.
# The previous hard-coded boundary (773) did not match the column layout once
# cp_type is kept, which moved gene columns into the cell block.
# Assumes gene columns are prefixed 'g-' and cell columns 'c-' (lish-moa
# naming), and that X_test has the same column order as X -- TODO confirm.
gene_idx = np.flatnonzero(X.columns.str.startswith('g-'))
cell_idx = np.flatnonzero(X.columns.str.startswith('c-'))

# Training
Cells = X.iloc[:, cell_idx].copy()
Cells_scaled = Xs.iloc[:, cell_idx].copy()
Genes = X.iloc[:, gene_idx].copy()
Genes_scaled = Xs.iloc[:, gene_idx].copy()
# Cells_image = Cells.values.reshape(Cells.shape[0], 10, 10, 1)
# Cells_image_scaled = Cells_scaled.values.reshape(Cells.shape[0], 10, 10, 1)

# Testing
Cells_test = X_test.iloc[:, cell_idx].copy()
Cells_test_scaled = Xs_test.iloc[:, cell_idx].copy()
Genes_test = X_test.iloc[:, gene_idx].copy()
Genes_test_scaled = Xs_test.iloc[:, gene_idx].copy()
# Cells_image_test = Cells_test.values.reshape(Cells_test.shape[0], 10, 10, 1)
# Cells_image_test_scaled = Cells_test_scaled.values.reshape(Cells_test.shape[0], 10, 10, 1)

# Input pipelines
# shape is given as a real 1-tuple: `(n)` is just the integer n, which not
# every Keras version accepts as a shape.
all_input = Input(shape=(X.shape[1],), name='All')
cells_input = Input(shape=(Cells.shape[1],), name='Cells')
genes_input = Input(shape=(Genes.shape[1],), name='Genes')
# cell_image_input = Input(shape=(10,10,1), name='Cells_image')
pca_cell_input = Input(shape=(pca_cells.shape[1],), name='PCA_cells')
pca_gene_input = Input(shape=(pca_genes.shape[1],), name='PCA_genes')
pca_all_input = Input(shape=(pca_all.shape[1],), name='PCA_all')

# Models
### Models will be named based on their input and structure.
- all_data: have an AD in their name
- cells: C, Genes: G, Cells_image: CI
- L: Linear, M: mix, I: inception, R: residual
- Pattern: [input_type]_[NN_type]_[ext]

### `Notes`:
1. The weights that are used in sigmoid layers should be relatively low (>=1e-3) for higher accuracy and better convergence.
2. The goal is to write minimalist Neural Networks (less than 1e6 params) to focus on particular entries and techniques, then aggregate all of them in one network.
3. Multi-input NN (uni-output):
    - Make a prediction in sub-NNs then aggregate the results to get a final result
    - Use inter-connected NNs
    - Use structures to find patterns within the data.

In [None]:
def residual_module(x, add, std=1, Unit=32, seed=11):
    """Dense(relu) -> BatchNorm -> Dense(sigmoid) block summed with a skip tensor.

    Args:
        x: Tensor fed into the hidden Dense layer.
        add: Tensor added element-wise to the block output. It presumably has
            to match the sigmoid layer's width (y_all.shape[1]) -- this is how
            the callers in this notebook use it.
        std: Standard deviation of the TruncatedNormal kernel initializer used
            by both Dense layers.
        Unit: Number of units in the hidden ReLU layer.
        seed: Seed handed to both kernel initializers.

    Returns:
        The output tensor of the Add layer.
    """
    hidden = Dense(
        Unit,
        activation='relu',
        kernel_initializer=TruncatedNormal(mean=0, stddev=std, seed=seed),
    )(x)
    normalized = BatchNormalization()(hidden)

    # One sigmoid unit per column of y_all (scored + non-scored targets)
    head = Dense(
        y_all.shape[1],
        activation='sigmoid',
        kernel_initializer=TruncatedNormal(mean=0, stddev=std, seed=seed),
    )(normalized)

    return Add()([head, add])


def linear_NN(x, std, seed=1, unit=32):
    """Stack of four Dense layers that widen from `unit` to `unit * 8`.

    Only the first layer has an activation (selu); the remaining three are
    linear, with BatchNormalization after the second and third layers.

    Args:
        x: Input tensor.
        std: Standard deviation of the TruncatedNormal kernel initializers.
        seed: Seed shared by every kernel initializer in the stack.
        unit: Width of the first layer; later layers use 2x, 4x and 8x of it.

    Returns:
        The output tensor of the last (unit * 8 wide) Dense layer.
    """
    def make_init():
        # A new initializer object per layer, all with identical settings
        return TruncatedNormal(mean=0, stddev=std, seed=seed)

    hidden = Dense(unit, activation='selu', kernel_initializer=make_init())(x)

    hidden = Dense(unit * 2, kernel_initializer=make_init())(hidden)
    hidden = BatchNormalization()(hidden)

    hidden = Dense(unit * 4, kernel_initializer=make_init())(hidden)
    hidden = BatchNormalization()(hidden)

    return Dense(unit * 8, kernel_initializer=make_init())(hidden)

In [None]:
def _prediction_head(features, stddev, seed):
    """BatchNorm followed by a sigmoid Dense layer with one unit per y_all column."""
    normalized = BatchNormalization()(features)
    return Dense(y_all.shape[1], activation='sigmoid',
                 kernel_initializer=TruncatedNormal(0, stddev, seed))(normalized)


# One linear_NN feature extractor per input, each with its own sigmoid
# prediction head (p_*). The heads are later injected as skip connections.
all_output = linear_NN(all_input, std=1e-1, seed=3, unit=64)
p_all = _prediction_head(all_output, stddev=3, seed=1)

pca_cell_output = linear_NN(pca_cell_input, std=1e-1, seed=4, unit=64)
p_pca_cell = _prediction_head(pca_cell_output, stddev=5, seed=123)

# The PCA-genes head is built from the PCA-genes features. It was previously
# applied to p_all by mistake, which discarded this branch's normalized
# features and made the head a function of the all-features head.
pca_gene_output = linear_NN(pca_gene_input, std=1e-1, seed=5, unit=64)
p_pca_gene = _prediction_head(pca_gene_output, stddev=4, seed=123)

pca_all_output = linear_NN(pca_all_input, std=1e-1, seed=6, unit=64)
p_pca_all = _prediction_head(pca_all_output, stddev=3, seed=123)


# Merge the raw features of all four branches into one 512-wide representation
comb = Concatenate()([all_output, pca_cell_output, pca_gene_output, pca_all_output])
comb = Dense(512, kernel_initializer=TruncatedNormal(0, 5, 13))(comb)
comb = BatchNormalization()(comb)

# Chain of residual modules; each adds one branch's prediction head
out = residual_module(comb, p_all, std=5e-2, Unit=2048, seed=5)

out = residual_module(out, p_pca_cell, std=5e-2, Unit=512, seed=7)

out = residual_module(out, p_pca_gene, std=5e-2, Unit=256, seed=9)

out = residual_module(out, p_pca_all, std=5e-2, Unit=128, seed=11)

out = Dense(y_all.shape[1], activation='sigmoid', kernel_initializer=TruncatedNormal(0, 2, 11))(out)

# Sum the aggregated prediction with every branch head, then normalize
out = Add()([out, p_all, p_pca_all, p_pca_gene, p_pca_cell])
out = BatchNormalization()(out)

# Final sigmoid layer: one probability per column of y_all
out = Dense(y_all.shape[1], activation='sigmoid', kernel_initializer=TruncatedNormal(0, 2.5, 11))(out)

model = Model(inputs=[all_input, pca_cell_input, pca_gene_input, pca_all_input], outputs=[out])

In [None]:
# Model size, expressed in millions of parameters
model.count_params() / 1e6

In [None]:
# Draw the layer graph with tensor shapes shown and layer names hidden
plot_model(model, show_shapes=True, show_layer_names=False)

In [None]:
def scheduler(epoch, lr):
    """Return the learning rate for `epoch`, given the current rate `lr`.

    The rate is multiplied by 0.96 once for every 50 completed epochs and is
    never allowed to fall below 1e-25.
    """
    completed_blocks = epoch // 50
    decayed = lr * 0.96 ** completed_blocks
    return max(1e-25, decayed)

BATCH_SIZE = 128
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, random_state=1, shuffle=True)
history = {}
# Running average of the test predictions over all folds, restricted to the
# first y.shape[1] outputs (the scored targets).
prediction = np.zeros((test_features.shape[0], y.shape[1]))

# The graph built earlier serves only as an architecture template. Wrapping
# the same tensors in a new Model() each fold (as was done before) reuses the
# same layers, so weights carried over between folds and every fold after the
# first started from a network already trained on its validation rows.
template_model = model

for index, (train_indices, val_indices) in enumerate(kf.split(X, y)):

    print(f'{index + 1}th fold, Validation Indices: ', val_indices[:5])
    # Scaled features, targets and PCA projections split into train/validation
    X_train, X_val = Xs.iloc[train_indices], Xs.iloc[val_indices]
#     train_cells, val_cells = Cells_scaled.iloc[train_indices], Cells_scaled.iloc[val_indices]
#     train_genes, val_genes = Genes_scaled.iloc[train_indices], Genes_scaled.iloc[val_indices]
    y_train, y_val = y_all.iloc[train_indices], y_all.iloc[val_indices]

    pca_gene_train, pca_gene_val = pca_genes_train[train_indices], pca_genes_train[val_indices]
    pca_cell_train, pca_cell_val = pca_cells_train[train_indices], pca_cells_train[val_indices]
    # Capital "A" keeps the module-level pca_all_train array from being overwritten
    pca_All_train, pca_all_val = pca_all_train[train_indices], pca_all_train[val_indices]

    # Instantiating the model: clone_model creates new layers (and therefore
    # freshly initialized weights), so each fold trains independently.
    model = models.clone_model(template_model)
    model.compile(optimizer=Adam(0.004), loss=BinaryCrossentropy())

    # Fitting
    # NOTE(review): both callbacks monitor the training loss rather than
    # val_loss -- confirm this is intended.
    history[index] = model.fit(
        x=[X_train, pca_cell_train, pca_gene_train, pca_All_train],
        y=y_train, epochs=600, batch_size=BATCH_SIZE, verbose=0,
        validation_data=([X_val, pca_cell_val, pca_gene_val, pca_all_val], y_val),
        callbacks=[
            ReduceLROnPlateau(factor=0.95, patience=20, verbose=True, monitor='loss', min_lr=1e-45, min_delta=1e-4),
            EarlyStopping(monitor='loss', patience=200, restore_best_weights=True, min_delta=1e-3, verbose=True)
        ]
    )

    # The network predicts every y_all column; keep only the first
    # y.shape[1] of them for the submission.
    model_prediction = model.predict(
        [Xs_test, pca_cells_test, pca_genes_test, pca_all_test],
        batch_size=BATCH_SIZE, verbose=False)[:, :y.shape[1]]

    # Each fold contributes an equal share to the final prediction
    prediction += model_prediction / N_FOLDS

    print('#----------------#----------------#----------------#----------------#')

# Plotting and Visualization
- Plotting the loss and val_loss in each fold
- Trying to make assumptions about the state of the model and improve it
- Difference between val_loss and loss, and the oscillations within the graph are things to look for

In [None]:
# One row of panels per fold; each row splits the epochs into num_cols
# consecutive segments so late-training detail is not flattened by the
# early, much larger losses.
num_cols = 3
# squeeze=False keeps `axes` two-dimensional even when only one fold was
# recorded; otherwise axes[i][j] fails on the 1-D array matplotlib returns.
fig, axes = plt.subplots(len(history), num_cols, figsize=(40,60), squeeze=False)
# NOTE(review): called before anything is drawn, with colour names as labels
# -- presumably a placeholder; confirm it is still wanted.
fig.legend(["blue", "orange"], prop={"size":10})

for i, row_axes in enumerate(axes):
    d = pd.DataFrame(history[i].history)
    d['Epoch'] = range(0,d.shape[0])

#     d.iloc[:,:].plot(x="Epoch", y=["loss","val_loss"], ax=axes[i][0])
    n_epochs = d.shape[0]
    segment = n_epochs // num_cols
    for j, ax in enumerate(row_axes):
        start = segment * j
        # The last panel also takes the remainder, so the final epochs are
        # no longer dropped when n_epochs is not a multiple of num_cols.
        stop = n_epochs if j == num_cols - 1 else segment * (j + 1)
        d.iloc[start:stop, :].plot(
            x="Epoch", y=["loss","val_loss"], ax=ax, title=f'{i+1}th fold')

## Prediction
- Replacing the sample submission values with my predictions
- Writing the submission into the working directory

In [None]:
# Disabled alternative: single-model prediction followed by hard thresholding
# prediction = model.predict([Cells_test_scaled, Genes_test_scaled, Xs_test, pca_cells_test, pca_genes_test, pca_all_test])[:,:y.shape[1]]
# prediction[prediction > 0.5] = 1 
# prediction[prediction < 1e-1] = 0
# Overwrite every column after the first (presumably the sig_id column) of the
# sample submission with the `prediction` array
submission.iloc[:, 1:] = prediction
# Write the result without the DataFrame index
submission.to_csv('/kaggle/working/submission.csv', index=False)

In [None]:
# Largest predicted value of every numeric column in the written submission
pd.read_csv('/kaggle/working/submission.csv').describe().loc['max'].values

Checking mean, std, and quantiles of each drug: 
- Max should be in order of e-1.
- Min should be close to zero.
- Std should be relatively small (<1e-2)

In [None]:
# Summary statistics for every numeric column of the written submission
pd.read_csv('/kaggle/working/submission.csv').describe()

# Saving Model
- To put into the <a href='https://www.kaggle.com/damoonshahhosseini/nnmoa'>dataset</a> and aggregate their results.
- The naming convention used should follow the one discussed above

In [None]:
# model.save('./NN01')