# Mechanism of Action (MoA) participation kernel

In [None]:
from time import time
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Deep learning libraries
import tensorflow as tf
from keras import Model, models
from keras.models import Sequential, load_model
from keras.layers import Dense, BatchNormalization, Dropout

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import tensorflow_addons as tfa
from tensorflow.keras import layers,regularizers,Sequential,backend,callbacks,optimizers,metrics,losses

# Utility functions
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils import shuffle

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale, StandardScaler

from kerastuner.tuners import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Seed NumPy's global random generator.
np.random.seed(7)
%matplotlib inline

In [None]:
def _read_moa_csv(file_name):
    """Load one CSV file from the ``../input/lish-moa`` directory as a DataFrame."""
    return pd.read_csv('../input/lish-moa/' + file_name)


# Feature tables for the training and the test set.
train_feat_df = _read_moa_csv('train_features.csv')
test_feat_df = _read_moa_csv('test_features.csv')

# Training targets: the scored and the non-scored label tables.
scored_train_targets_df = _read_moa_csv('train_targets_scored.csv')
nscored_train_targets_df = _read_moa_csv('train_targets_nonscored.csv')

# Sample submission file.
submission_sample_df = _read_moa_csv('sample_submission.csv')

In [None]:
# Columns from position 4 up to (not including) the last 100 are treated as the gene features.
gene_cols = train_feat_df.columns[4:-100]
gene_data = train_feat_df.loc[:, gene_cols]

In [None]:
# The last 100 columns are treated as the cell features.
cell_via_cols = train_feat_df.columns[-100:]
cell_via_data = train_feat_df.loc[:, cell_via_cols]

## PCA gene features

In [None]:
# Standardize the training gene columns with sklearn's `scale` before PCA.
scaled_gene_data = scale(gene_data)

In [None]:
# PCA over the gene features; a fractional n_components keeps as many
# components as are needed to explain 95% of the variance.
pca1 = PCA(n_components=0.95)
pca1.fit(scaled_gene_data)

In [None]:
# Project the standardized gene data onto the fitted principal components.
pca_gene = pca1.transform(scaled_gene_data)
# Percentage of variance explained by each component, rounded to one decimal.
per_var = np.round(100 * pca1.explained_variance_ratio_, decimals=1)
# One label per component: PC1, PC2, ...
labels = [f'PC{component_no}' for component_no in range(1, per_var.shape[0] + 1)]
transformed_gene_feats = pd.DataFrame(data=pca_gene, columns=labels)

## PCA cell features

In [None]:
# Standardize the training cell columns with sklearn's `scale` before PCA.
scaled_cell_data = scale(cell_via_data)

In [None]:
# PCA over the cell features; a fractional n_components keeps as many
# components as are needed to explain 95% of the variance.
pca2 = PCA(n_components=0.95)
pca2.fit(scaled_cell_data)

In [None]:
# Project the standardized cell data onto the fitted principal components.
pca_cell = pca2.transform(scaled_cell_data)
# Percentage of variance explained by each component, rounded to one decimal.
per_var = np.round(100 * pca2.explained_variance_ratio_, decimals=1)
# One label per component: PC1, PC2, ...
labels = [f'PC{component_no}' for component_no in range(1, per_var.shape[0] + 1)]
transformed_cell_feats = pd.DataFrame(data=pca_cell, columns=labels)

## NN Model

In [None]:
# Drop the sample id together with every raw gene / cell column; the PCA
# components are merged in as their replacement in a later cell.
_dropped_cols = ['sig_id', *gene_cols, *cell_via_cols]
Train_data = train_feat_df.drop(columns=_dropped_cols)

In [None]:
# Attach the gene and then the cell PCA components by index (left join keeps every training row).
for pca_feats in (transformed_gene_feats, transformed_cell_feats):
    Train_data = Train_data.merge(pca_feats, how='left', left_index=True, right_index=True)

# One-hot encode the two categorical columns, then remove the originals.
Train_data[['ctl_vehicle', 'trt_cp']] = pd.get_dummies(Train_data['cp_type'])
Train_data[['D1', 'D2']] = pd.get_dummies(Train_data['cp_dose'])
Train_data = Train_data.drop(columns=['cp_type', 'cp_dose'])

In [None]:
# Notebook display of the assembled training feature frame.
Train_data

In [None]:
# Fit a StandardScaler on the assembled training features; the same fitted
# object is reused later to transform the test features.
scaler = StandardScaler()
scaler.fit(Train_data)

In [None]:
# Standardized training features, wrapped back into a DataFrame
# (column labels become the default integer range).
Train_data_scaled = pd.DataFrame(scaler.transform(Train_data))
Train_data_scaled

In [None]:
# Scored targets without the sample id column.
Train_targets = scored_train_targets_df.drop(columns='sig_id')
Train_targets

In [None]:
# Non-scored targets without the sample id column.
ns_Train_targets = nscored_train_targets_df.drop(columns='sig_id')
ns_Train_targets

In [None]:
# Randomly pick a tenth of the row positions, without repetition, as the
# hold-out set used during hyperparameter tuning.
_n_rows = Train_data_scaled.shape[0]
val_idx = np.random.choice(range(_n_rows), _n_rows // 10, replace=False)

In [None]:
# Array views of the scaled features and the non-scored targets.
_features_arr = np.array(Train_data_scaled)
_ns_targets_arr = np.array(ns_Train_targets)

# Training part: every row whose position is NOT in val_idx.
Tuner_train_data = np.delete(_features_arr, val_idx, axis=0)
Tuner_train_target = np.delete(_ns_targets_arr, val_idx, axis=0)

# Validation part: exactly the rows selected by val_idx.
Tuner_val_data = _features_arr[val_idx, :]
Tuner_val_target = _ns_targets_arr[val_idx, :]

In [None]:
# Bounds used to clip predicted probabilities before taking logarithms.
p_min = 0.001
p_max = 0.999


def logloss(y_true, y_pred):
    """Return the mean binary log loss of ``y_pred`` against ``y_true``.

    Predictions are clipped to ``[p_min, p_max]`` first, so neither
    logarithm is evaluated at 0.
    """
    clipped = tf.clip_by_value(y_pred, p_min, p_max)
    positive_term = y_true * backend.log(clipped)
    negative_term = (1 - y_true) * backend.log(1 - clipped)
    return -backend.mean(positive_term + negative_term)

In [None]:
def build_model(hp):
    """Build and compile the feed-forward network explored by the tuner.

    Args:
        hp: Hyperparameter container; its ``Choice``, ``Float`` and ``Int``
            methods are called to pick layer widths, dropout rates,
            activations and the number of hidden stages.

    Returns:
        A compiled ``Sequential`` model ending in a 402-unit sigmoid layer.
    """
    n_features = Train_data_scaled.shape[1]
    net = Sequential()

    # Input stage: dense -> dropout -> batch normalization.
    input_units = hp.Choice('input_units', values=[512, 1024, 2048])
    net.add(Dense(input_units, activation='relu', kernel_initializer='he_normal',
                  input_shape=(n_features,)))
    input_drop = hp.Float('input_drop', min_value=0.3, max_value=0.9, step=0.1)
    net.add(Dropout(input_drop))
    net.add(BatchNormalization())

    # Hidden stages; hyperparameters are requested in the order
    # units -> activation -> dropout for every stage.
    depth = hp.Int('nbr_lay', min_value=5, max_value=10, step=1)
    for stage in range(depth):
        units = hp.Choice(f'dense_{stage}_units', values=[256, 512, 1024])
        act = hp.Choice(f'dense_{stage}_act', values=['relu', 'elu', 'swish'])
        net.add(Dense(units, activation=act, kernel_initializer='he_normal'))
        drop_rate = hp.Float(f'lay_{stage}_drop', min_value=0.2, max_value=0.9, step=0.1)
        net.add(Dropout(drop_rate))
        net.add(BatchNormalization())

    # Output stage: one sigmoid unit per target column.
    net.add(Dense(402, activation='sigmoid', kernel_initializer='he_normal'))

    net.compile(optimizer='adam',
                loss=losses.BinaryCrossentropy(label_smoothing=0.001),
                metrics=logloss)

    return net

In [None]:
# Random hyperparameter search over `build_model`, ranked by validation loss.
# Results are written below './' (the trial-reading cells look in
# './untitled_project').
Tuner = RandomSearch(build_model,
                    objective='val_loss',
                    max_trials=77,
                    executions_per_trial=1,
                    seed=77,
                    directory='./')

In [None]:
# Run the search: each trial is trained for 10 epochs on the tuner training
# split and validated on the held-out tuner rows.
Tuner.search(Tuner_train_data, Tuner_train_target,
             epochs=10, verbose=0,
             validation_data=(Tuner_val_data, Tuner_val_target))

In [None]:
# Fetch the single best model found by the tuner (returned as a one-element
# list) and print its architecture.
Model_tuned = Tuner.get_best_models(num_models=1)
Model_tuned[0].summary()

In [None]:
# Names of the entries in the tuner project directory that contain 'trial_'.
trials_dir = [entry for entry in os.listdir('./untitled_project') if 'trial_' in entry]

In [None]:
def _load_trial(trial_dir):
    """Return ``(hyperparameter values, score)`` read from one trial's ``trial.json``."""
    with open(f'./untitled_project/{trial_dir}/trial.json', 'r') as handle:
        parsed = json.load(handle)
    return parsed['hyperparameters']['values'], parsed['score']


# One (values, score) tuple per trial directory, in the order of trials_dir.
Trials = [_load_trial(trial_dir) for trial_dir in trials_dir]

In [None]:
# Select the trial with the lowest score (the tuner objective is a loss, so
# lower is better).
#
# The previous implementation started from a threshold of 1, which left
# `best_trial` undefined whenever every trial scored >= 1, and it raised a
# TypeError when a trial had no score (None). Trials without a score are now
# skipped and the minimum is taken over the rest; ties still resolve to the
# first such trial in `Trials`.
scored_trials = [trial for trial in Trials if trial[1] is not None]
if not scored_trials:
    raise ValueError('No tuner trial with a recorded score was found.')

best_trial = min(scored_trials, key=lambda trial: trial[1])
# Kept under its historical name: this holds the best (lowest) score.
max_score = best_trial[1]

In [None]:
# Notebook display of the selected (hyperparameter values, score) tuple.
best_trial

In [None]:
# Number of cross-validation folds (also the number of models that get trained).
splits_nbr = 10
# Plain K-fold without shuffling: the splits are contiguous and deterministic.
kf = KFold(n_splits = splits_nbr)
# NOTE(review): `skf` is created here but not used in the visible code.
skf = StratifiedKFold(n_splits = splits_nbr, random_state = 10, shuffle = True)

In [None]:
def get_model_name(k):
    """Return the checkpoint file name for fold ``k``, e.g. ``Model_3.h5``."""
    return f'Model_{k}.h5'

In [None]:
# K-fold training of the tuned architecture on the scored targets.
#
# For every fold a fresh network is built from the hyperparameters of
# `best_trial`, trained for up to 50 epochs, and the weights with the lowest
# validation loss are checkpointed to `Model_<fold>.h5`. The checkpoint is
# then reloaded and evaluated on the fold's validation rows; the resulting
# loss (and accuracy, when reported) are collected in VAL_LOSS / VAL_ACCURACY.
VAL_ACCURACY = []
VAL_LOSS = []

fold_var = 1

# Hyperparameter values of the selected tuner trial, looked up once.
best_params = best_trial[0]

for train_index, val_index in kf.split(Train_data_scaled, Train_targets):

    train_data = Train_data_scaled.iloc[train_index]
    train_target = Train_targets.iloc[train_index]

    val_data = Train_data_scaled.iloc[val_index]
    val_target = Train_targets.iloc[val_index]

    # Same layout as the tuner's model, but with 206 outputs (scored targets).
    Model = Sequential(name='FFN')
    Model.add(Dense(best_params['input_units'], activation='relu',
                    kernel_initializer='he_normal',
                    input_shape=(Train_data_scaled.shape[1],)))
    Model.add(Dropout(best_params['input_drop']))
    Model.add(BatchNormalization())
    for layer_no in range(best_params['nbr_lay']):
        Model.add(Dense(best_params[f'dense_{layer_no}_units'],
                        activation=best_params[f'dense_{layer_no}_act'],
                        kernel_initializer='he_normal'))
        Model.add(Dropout(best_params[f'lay_{layer_no}_drop']))
        Model.add(BatchNormalization())

    Model.add(Dense(206, activation='sigmoid', kernel_initializer='he_normal'))

    # `metrics` is documented as a list, so the single metric is wrapped.
    Model.compile(optimizer='adam',
                  loss=losses.BinaryCrossentropy(label_smoothing=0.001),
                  metrics=['accuracy'])

    # Keep only the weights of the epoch with the lowest validation loss.
    Checkpoint = tf.keras.callbacks.ModelCheckpoint(get_model_name(fold_var),
                                                    monitor='val_loss', verbose=1,
                                                    save_best_only=True, mode='min')

    # `min_delta` is the current name of the deprecated `epsilon` argument.
    LR_OnPlat = ReduceLROnPlateau(monitor='val_loss',
                                  patience=2,
                                  cooldown=1,
                                  verbose=1,
                                  factor=0.8,
                                  min_delta=1e-4,
                                  min_lr=0.000001)

    # The History returned by fit() is not needed; the best epoch is on disk.
    Model.fit(x=train_data,
              y=train_target,
              batch_size=64,
              validation_data=(val_data, val_target),
              epochs=50,
              verbose=0,
              callbacks=[LR_OnPlat, Checkpoint])

    # Restore the best checkpoint before scoring the fold.
    Model.load_weights("./" + get_model_name(fold_var))

    results = Model.evaluate(val_data, val_target, batch_size=128)
    results = dict(zip(Model.metrics_names, results))

    VAL_LOSS.append(results['loss'])
    # VAL_ACCURACY used to be declared but never filled.
    if 'accuracy' in results:
        VAL_ACCURACY.append(results['accuracy'])

    # Release the graph/state of this fold's model before building the next one.
    tf.keras.backend.clear_session()

    fold_var += 1

In [None]:
# Notebook display of the per-fold validation losses collected during training.
VAL_LOSS

In [None]:
# Reload the ten per-fold model files (Model_1.h5 ... Model_10.h5), in fold order.
(Model_1, Model_2, Model_3, Model_4, Model_5,
 Model_6, Model_7, Model_8, Model_9, Model_10) = (
    load_model(f'./Model_{fold_no}.h5') for fold_no in range(1, 11)
)

In [None]:
def logloss_np(y_true, y_pred):
    """Return the mean binary log loss over all entries, computed with NumPy.

    Predictions are clipped to ``[p_min, p_max]`` before the logarithms are
    taken. The element-wise result is converted to an array first so the
    mean is taken over every entry at once.
    """
    clipped = np.clip(y_pred, p_min, p_max)
    positive_term = y_true * np.log(clipped)
    negative_term = (1 - y_true) * np.log(1 - clipped)
    per_entry = np.array(positive_term + negative_term)
    return -np.mean(per_entry)


In [None]:
# Out-of-fold validation log loss of the ten fold models.
#
# Each model is scored ONLY on the rows that were held out while it was
# trained. The previous version scored every model on the complete training
# set, i.e. mostly on rows it had already seen, so the printed number was not
# a validation loss. `kf` is an unshuffled KFold, so calling `split` again
# reproduces the folds used during training, in the same order in which the
# Model_<k>.h5 files were written.
models = [Model_1, Model_2, Model_3, Model_4, Model_5,
          Model_6, Model_7, Model_8, Model_9, Model_10]
val_losses = []
for fold_model, (_, holdout_index) in zip(models, kf.split(Train_data_scaled)):
    holdout_data = Train_data_scaled.iloc[holdout_index]
    holdout_target = Train_targets.iloc[holdout_index]
    val_losses.append(logloss_np(holdout_target, fold_model.predict(holdout_data)))

AVG_Val_Loss = np.mean(val_losses)
print(f'The average validation log loss: {AVG_Val_Loss}')

## Submission

In [None]:
# Drop the sample id together with every raw gene / cell column from the test features.
_dropped_test_cols = ['sig_id', *gene_cols, *cell_via_cols]
Test_data = test_feat_df.drop(columns=_dropped_test_cols)

In [None]:
# Raw gene / cell columns of the test set.
test_gene_data = test_feat_df[gene_cols]
test_cell_via_data = test_feat_df[cell_via_cols]

# Standardize the test columns with the mean / standard deviation of the
# TRAINING columns. pca1 / pca2 were fitted on train-standardized data, so the
# test data must go through the same transformation; the previous code
# standardized the test set with its own statistics, which shifts and rescales
# it relative to the space the PCA (and the network) was fitted in.
scaled_test_gene_data = StandardScaler().fit(gene_data).transform(test_gene_data)
scaled_test_cell_via_data = StandardScaler().fit(cell_via_data).transform(test_cell_via_data)

In [None]:
# Project the standardized test features with the PCAs fitted on the training data.
pca_test_gene = pca1.transform(scaled_test_gene_data)
pca_test_cell = pca2.transform(scaled_test_cell_via_data)

# One label per retained component: PC1, PC2, ...
labels_gene_pca = [f'PC{component_no}' for component_no in range(1, pca_test_gene.shape[1] + 1)]
labels_cell_pca = [f'PC{component_no}' for component_no in range(1, pca_test_cell.shape[1] + 1)]

transformed_test_gene_feats = pd.DataFrame(data=pca_test_gene, columns=labels_gene_pca)
transformed_test_cell_feats = pd.DataFrame(data=pca_test_cell, columns=labels_cell_pca)

In [None]:
# Attach the gene and then the cell PCA components by index (left join keeps every test row).
for pca_feats in (transformed_test_gene_feats, transformed_test_cell_feats):
    Test_data = Test_data.merge(pca_feats, how='left', left_index=True, right_index=True)

# One-hot encode the two categorical columns, then remove the originals.
Test_data[['ctl_vehicle', 'trt_cp']] = pd.get_dummies(Test_data['cp_type'])
Test_data[['D1', 'D2']] = pd.get_dummies(Test_data['cp_dose'])
Test_data = Test_data.drop(columns=['cp_type', 'cp_dose'])

In [None]:
# NOTE(review): `scaler2` is fitted on the test features here, but the visible
# code transforms the test data with `scaler` (fitted on the training
# features), so this object is not used afterwards in this file.
scaler2 = StandardScaler()
scaler2.fit(Test_data)

In [None]:
# Standardize the test features with the scaler fitted on the training
# features, so both sets go through the same transformation.
Test_data_scaled = pd.DataFrame(scaler.transform(Test_data))
Test_data_scaled

In [None]:
# Sum the test predictions of the fold models (206 target columns), then
# divide by the number of folds to get the ensemble average.
models_pred = np.zeros((Test_data_scaled.shape[0], 206))
for fold_no in range(splits_nbr):
    models_pred += models[fold_no].predict(Test_data_scaled)
AVG_test_pred = models_pred / splits_nbr

In [None]:
# Averaged predictions, labelled with the scored target column names.
Prediction = pd.DataFrame(data=AVG_test_pred, columns=Train_targets.columns)
# Single-column frame holding the test sample ids.
sig_id_df = test_feat_df.sig_id.to_frame()
# Put the ids in front of the predictions, matching rows by index.
Prediction = sig_id_df.merge(Prediction, how='left', left_index=True, right_index=True)

In [None]:
# Write the submission file without the DataFrame index column.
Prediction.to_csv('submission.csv', index=False)