In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Read train and test datasets

In [None]:
# Load the four competition CSVs in one pass: training features, scored
# training targets, test features and the sample submission (in that order).
train_features, train_targets, test_features, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/lish-moa/train_features.csv',
        '/kaggle/input/lish-moa/train_targets_scored.csv',
        '/kaggle/input/lish-moa/test_features.csv',
        '/kaggle/input/lish-moa/sample_submission.csv',
    )
)

In [None]:
# Report the row/column counts of the training frame and the row count of the
# test frame.
for template, count in (
    ('Number of training samples: {}', train_features.shape[0]),
    ('Number of features: {}', train_features.shape[1]),
    ('Number of test samples: {}', test_features.shape[0]),
):
    print(template.format(count))

# Let us look at the types of features in the dataset

In [None]:
def feature_distribution_visualize(data):
    """Group the columns of ``data`` into feature families by name prefix.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column names are inspected; the values are not read.

    Returns
    -------
    tuple of (list, list, list)
        ``(c_feats, g_feats, others)``: columns whose name starts with
        ``'c-'``, columns whose name starts with ``'g-'``, and every remaining
        column. Each list keeps the original column order.
    """
    c_feats, g_feats, others = [], [], []
    for feature in data.columns:
        # Prefix match, not substring match: a name such as 'misc-1' or 'log-2'
        # merely *contains* 'c-' / 'g-' and must not be counted as a c-/g-
        # feature. str() keeps non-string column labels from raising.
        name = str(feature)
        if name.startswith('c-'):
            c_feats.append(feature)
        elif name.startswith('g-'):
            g_feats.append(feature)
        else:
            others.append(feature)
    return c_feats, g_feats, others

c_feats, g_feats, others = feature_distribution_visualize(train_features)

# Bar chart showing how many columns fall into each feature family.
group_labels = ['cell viability', 'gene expression', 'others']
group_sizes = [len(group) for group in (c_feats, g_feats, others)]

plt.figure(figsize=(8, 4))
plt.bar(group_labels, group_sizes, color=['gray', 'pink', 'magenta'])
plt.xticks(group_labels, rotation=0)
plt.title('Distribution of type of features')
plt.show()

**Plot the distribution of the features that describe the treatment (the columns that are neither `c-` nor `g-` features)**

In [None]:
# One count plot per non c-/g- column, side by side. others[0] is skipped, as
# in the original loop (presumably the sig_id identifier column - confirm).
plt.figure(figsize=(16, 4))
o_count = len(others)
for position, feature in enumerate(others[1:], start=1):
    ax = plt.subplot(1, o_count - 1, position)
    # Pass the column as x=...: seaborn >= 0.12 accepts only `data` as a
    # positional argument, so a positional Series is no longer read as x.
    sns.countplot(x=train_features[feature], palette='pink', ax=ax)
plt.suptitle('Distribution of features related to treatment')
plt.show()

**Investigate the statistics of cell viability features**

In [None]:

summary_stat_c = train_features[c_feats].describe()
plt.figure(figsize=(8, 8))

# Each panel shows how one per-feature summary statistic is distributed across
# the cell viability features.
# Tuple layout: (row position in the describe() table, title, extra kwargs).
# Row positions used: 1 = mean, 2 = std, 3 = min, 7 = max.
c_panels = [
    (1, 'Mean Cell Viability', {}),
    (2, 'Standard Deviation of Cell Viability', {}),
    (3, 'Minimum Cell Viability', {'kde_kws': {'bw': 0.1}}),
    (7, 'Maximum Cell Viability', {}),
]
for panel_no, (row, panel_title, extra) in enumerate(c_panels, start=1):
    plt.subplot(2, 2, panel_no)
    sns.distplot(summary_stat_c.values[row], color='gray', **extra)
    plt.title(panel_title)

plt.show()

**Investigate the statistics of gene expression features**

In [None]:
# Investigate the statistics of the gene expression features
summary_stat_g = train_features[g_feats].describe()
plt.figure(figsize=(8, 8))

# Each panel shows how one per-feature summary statistic is distributed across
# the gene expression features. Rows are looked up by their describe() label
# instead of by position, and the titles no longer carry the "Viability"
# wording copied over from the cell viability cell.
# Tuple layout: (describe() row label, title, extra distplot kwargs).
g_panels = [
    ('mean', 'Mean Gene Expression', {}),
    ('std', 'Standard Deviation of Gene Expression', {}),
    ('min', 'Minimum Gene Expression', {'kde_kws': {'bw': 0.1}}),
    ('max', 'Maximum Gene Expression', {}),
]
for panel_no, (stat, panel_title, extra) in enumerate(g_panels, start=1):
    plt.subplot(2, 2, panel_no)
    sns.distplot(summary_stat_g.loc[stat].values, color='pink', **extra)
    plt.title(panel_title)

plt.show()

**Look at Correlations across Cell Viability and Gene Expressions**

In [None]:
# corr_Matrix_cell = train_features[c_feats].corr()
# plt.figure(figsize= (10,10))
# sns.heatmap(corr_Matrix_cell, cmap = 'bwr')
# plt.title('Correlation across c-type features')
# plt.show()

In [None]:
# corr_Matrix_gene = train_features[g_feats].corr()
# plt.figure(figsize= (10,10))
# sns.heatmap(corr_Matrix_gene, cmap = 'bwr')
# plt.title('Correlation across g-type features')
# plt.show()

# Let us consider the target variables

In [None]:
# Preview the first rows of the target table loaded from
# train_targets_scored.csv.
train_targets.head()

#Look at the range of values of the targets
# plt.figure(figsize=(8,8))
# plt.pcolor(train_targets.iloc[1:100,1:100])
# plt.title('Heatmap of values of the target')
# plt.show()

# EDA on target variables

# Prepare dataset for model generation

In [None]:
#check if all ids match in training features and training targets
def check_ids(features, targets):
    """Check that ``features`` and ``targets`` list the same ``sig_id`` row by row.

    The comparison uses the frames passed in (not module-level globals) and is
    positional, so it does not depend on either frame's index labels.

    Parameters
    ----------
    features, targets : pandas.DataFrame
        Frames that each contain a ``sig_id`` column.

    Returns
    -------
    bool
        True when both frames have the same number of rows and identical
        ``sig_id`` values at every position, otherwise False.

    Notes
    -----
    Prints 'Mismatch detected!' once per mismatching row (once in total when
    the row counts differ) and always prints 'Done!' at the end.
    """
    feature_ids = features['sig_id'].to_numpy()
    target_ids = targets['sig_id'].to_numpy()

    if len(feature_ids) != len(target_ids):
        # Different lengths cannot be compared element-wise.
        print('Mismatch detected!')
        ids_match = False
    else:
        mismatched = int((feature_ids != target_ids).sum())
        for _ in range(mismatched):
            print('Mismatch detected!')
        ids_match = mismatched == 0

    print('Done!')
    return ids_match
    
def preprocess_categorical(data):
    """Return a copy of ``data`` with ``cp_type`` and ``cp_dose`` integer-encoded.

    Encoding: ``cp_type`` maps 'trt_cp' -> 0 and 'ctl_vehicle' -> 1;
    ``cp_dose`` maps 'D1' -> 0 and 'D2' -> 1. Values outside these mappings
    become NaN (the behaviour of ``Series.map``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``cp_type`` and ``cp_dose`` columns. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the two columns replaced by their numeric codes.
    """
    # Work on a copy so the caller's frame keeps its original categories.
    encoded = data.copy()
    # Replace the columns outright instead of writing through .loc[:, col]:
    # that form sets values in place under pandas 2.x, which would leave the
    # columns with object dtype even though they hold integers.
    encoded['cp_type'] = encoded['cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1})
    encoded['cp_dose'] = encoded['cp_dose'].map({'D1': 0, 'D2': 1})
    return encoded

check_ids(train_features, train_targets)

# Stack the train rows on top of the test rows for each feature family; these
# combined frames are what the PCA step is fitted on.
aug_data_g, aug_data_c = (
    pd.concat([train_features[cols], test_features[cols]])
    for cols in (g_feats, c_feats)
)

# Model inputs/targets: drop the sig_id column and encode the categorical
# treatment columns numerically.
X = preprocess_categorical(train_features.drop(columns='sig_id'))
y = train_targets.drop(columns='sig_id')
X_test = preprocess_categorical(test_features.drop(columns='sig_id'))

from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold

def feature_augment(aug_data, feature_set, n_comp, n_train=None):
    """Project ``aug_data[feature_set]`` onto ``n_comp`` PCA components and split it.

    Parameters
    ----------
    aug_data : pandas.DataFrame
        Train rows followed by test rows.
    feature_set : list
        Columns of ``aug_data`` to feed into the PCA.
    n_comp : int
        Number of principal components to keep.
    n_train : int, optional
        Number of leading rows that belong to the training set. Defaults to
        ``train_features.shape[0]`` (the module-level training frame), which is
        what the function used before this parameter existed.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(train_part, test_part)``: the first ``n_train`` transformed rows and
        all remaining rows.
    """
    if n_train is None:
        n_train = train_features.shape[0]
    data_pca = PCA(n_components=n_comp, random_state=42).fit_transform(aug_data[feature_set])
    # Split at the train/test boundary. Slicing from n_train onwards (instead
    # of taking the last n_test rows) is also correct when there are zero test
    # rows, where a [-0:] slice would return every row.
    return data_pca[:n_train], data_pca[n_train:]

# PCA components: 600 for the gene expression features, 40 for cell viability.
train_g_add, test_g_add = feature_augment(aug_data_g, g_feats, 600)
train_c_add, test_c_add = feature_augment(aug_data_c, c_feats, 40)

# Column names for the component frames, shared by the train and test parts.
pca_g_names = [f'pca_G-{i}' for i in range(600)]
pca_c_names = [f'pca_C-{i}' for i in range(40)]

train_g_add = pd.DataFrame(train_g_add, columns=pca_g_names)
train_c_add = pd.DataFrame(train_c_add, columns=pca_c_names)
test_g_add = pd.DataFrame(test_g_add, columns=pca_g_names)
test_c_add = pd.DataFrame(test_c_add, columns=pca_c_names)

# Append the components as extra columns next to the existing features.
X = pd.concat([X, train_g_add, train_c_add], axis=1)
X_test = pd.concat([X_test, test_g_add, test_c_add], axis=1)

# Drop low-variance columns, fitting the selector on train and test rows
# together.
var_thresh = VarianceThreshold(0.8)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way and works on every pandas version.
data = pd.concat([X, X_test])
# The first three columns are left out of the selection (presumably cp_type,
# cp_time and cp_dose, which are re-attached below - confirm the column order).
data_transformed = var_thresh.fit_transform(data.iloc[:, 3:])

# Split the selected features back into their train and test parts.
train_transform = data_transformed[: train_features.shape[0]]
test_transform = data_transformed[-test_features.shape[0]:]

treatment_cols = ['cp_type', 'cp_time', 'cp_dose']

# Rebuild X / X_test as: treatment columns + surviving features (the latter
# end up with integer column names 0..k-1).
X_train_features = pd.DataFrame(X[treatment_cols].values, columns=treatment_cols)
X = pd.concat([X_train_features, pd.DataFrame(train_transform)], axis=1)

X_test_features = pd.DataFrame(X_test[treatment_cols].values, columns=treatment_cols)
X_test = pd.concat([X_test_features, pd.DataFrame(test_transform)], axis=1)

print(X.head())
print('Number of features in modified training set: ', X.shape[1])
print('Number of features in modified test set: ', X_test.shape[1])

# Begin building ML Model to make predictions

In [None]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import log_loss

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.callbacks import ReduceLROnPlateau

In [None]:
# Optimisation settings.
EPOCHS, BATCH_SIZE = 30, 32
LR = 1e-3
NUM_FOLD = 5  # number of splits used by the k-fold branch of train_model

# Network shape: input/output widths follow the prepared feature and target
# frames; HIDDEN lists the units of each hidden Dense layer in order.
INPUT_DIM, OUTPUT_DIM = X.shape[1], y.shape[1]
HIDDEN = [1024, 1024] + [512, 512, 512, 512]

def create_model(input_dim = INPUT_DIM, hidden_layers = HIDDEN, output_dim = OUTPUT_DIM):
    """Build the (uncompiled) fully connected multi-output network.

    Architecture: input -> BatchNormalization -> for each entry of
    ``hidden_layers`` a ReLU Dense layer followed by BatchNormalization ->
    sigmoid Dense layer with ``output_dim`` units. Every Dense layer uses L2
    (1e-4) regularisation on both kernel and bias.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    hidden_layers : sequence of int
        Units of each hidden Dense layer, in order.
    output_dim : int
        Number of sigmoid outputs.

    Returns
    -------
    A Keras ``Model`` named 'multioutput_model'.
    """
    def _regularized_dense(units, activation):
        # Fresh regulariser objects for every layer, as in the inline version.
        return layers.Dense(units, activation=activation,
                            kernel_regularizer=tf.keras.regularizers.l2(1e-4),
                            bias_regularizer=tf.keras.regularizers.l2(1e-4))

    inp = layers.Input(shape=(input_dim,))
    x = layers.BatchNormalization()(inp)

    for units in hidden_layers:
        x = _regularized_dense(units, 'relu')(x)
        x = layers.BatchNormalization()(x)

    outp = _regularized_dense(output_dim, 'sigmoid')(x)
    # Call .summary() on the returned model to inspect the layer stack.
    return models.Model(inputs=inp, outputs=outp, name='multioutput_model')

def train_model(X, y, K_Fold =False, epochs = EPOCHS,
                learning_rate = LR, batch_size = BATCH_SIZE):
    """Train the multi-label network with hold-out or k-fold validation.

    Parameters
    ----------
    X, y : pandas.DataFrame
        Feature and target tables with matching rows. The k-fold branch reads
        ``.values`` from both.
    K_Fold : bool
        False trains once on a 75/25 hold-out split and plots the loss curves;
        True trains one model per fold of a ``NUM_FOLD``-way shuffled KFold.
    epochs, batch_size : int
        Forwarded to ``model.fit``.
    learning_rate : float
        Initial learning rate of the Adam optimizer.

    Returns
    -------
    The last model trained: the single hold-out model, or the model of the
    final fold when ``K_Fold`` is True.
    """
    def _new_run(metrics=None):
        # Every training run gets its own model, optimizer and LR scheduler.
        # Sharing one optimizer across models carries its state (including a
        # learning rate already lowered by ReduceLROnPlateau) into the next
        # model. The optimizer uses the `learning_rate` argument, which was
        # previously ignored in favour of the global LR. Model dimensions come
        # from the data actually passed in.
        net = create_model(input_dim=X.shape[1], output_dim=y.shape[1])
        net.compile(loss='binary_crossentropy',
                    optimizer=optimizers.Adam(learning_rate=learning_rate),
                    metrics=metrics)
        scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3)
        return net, scheduler

    if not K_Fold:
        print('-----------------------------------------------------------')
        print('Performing hold-out validation')
        print('-----------------------------------------------------------')
        # random_state=1 is the seed the previous `random_state=True` resolved
        # to (True == 1), written as a plain integer.
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25,
                                                          random_state=1, shuffle=True)
        model, reduce_lr = _new_run(metrics=['binary_crossentropy'])
        history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                            epochs=epochs, batch_size=batch_size,
                            callbacks=[reduce_lr])
        plot_train_result(history)
        return model

    print('-----------------------------------------------------------')
    print('Performing k-fold cross validation')
    print('-----------------------------------------------------------')
    # Seeded so the fold assignment is the same on every run.
    kfold = KFold(n_splits=NUM_FOLD, shuffle=True, random_state=1)
    loss_per_fold = []

    for fold_num, (train, val) in enumerate(kfold.split(X, y), start=1):
        model, reduce_lr = _new_run()

        print('-----------------------------------------------------------')
        print('Training on Fold: {}'.format(fold_num))
        print('-----------------------------------------------------------')

        X_train, X_val = X.values[train], X.values[val]
        y_train, y_val = y.values[train], y.values[val]
        history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                            epochs=epochs, batch_size=batch_size,
                            callbacks=[reduce_lr])
        # Record the validation loss reached after the final epoch of the fold.
        loss_per_fold.append(history.history['val_loss'][-1])

    print('Best Validation Loss:', min(loss_per_fold))
    return model

def plot_train_result(history):
    """Plot the training and validation loss curves of one training run.

    ``history`` must expose a ``history`` mapping with 'loss' and 'val_loss'
    sequences (the object returned by ``model.fit`` is passed in by
    ``train_model``).
    """
    curves = history.history
    plt.figure()
    for key in ('loss', 'val_loss'):
        plt.plot(curves[key])
    plt.legend(['Training Loss', 'Validation Loss'])
    plt.show()

In [None]:
# Train with the hold-out branch (K_Fold=False); train_model returns the
# model it trained, which the prediction cell below uses.
model = train_model(X, y, K_Fold = False)

In [None]:
# Test-set predictions, laid out like the sample submission. The target
# columns are replaced outright with float64 predictions, so their dtype does
# not depend on what the sample file happened to contain.
submission_predict = sample_sub.copy()
submission_predict[y.columns] = np.asarray(model.predict(X_test.values), dtype=np.float64)

# Predictions on the training rows. Build a float frame directly instead of
# adding floats into a copy of the integer-typed `y`, which forces a dtype
# upcast that newer pandas versions deprecate.
training_predict = pd.DataFrame(
    np.asarray(model.predict(X.values), dtype=np.float64),
    columns=y.columns,
    index=y.index,
)

# Mean per-target log loss. labels=[0, 1] keeps log_loss well-defined even for
# a target column that contains a single class.
metrics = []
for _target in y.columns:
    metrics.append(log_loss(y[_target], training_predict[_target], labels=[0, 1]))

# The model saw (most of) these rows during training, so this is an in-sample
# score; it was previously printed under the misleading label "OOF Metric".
print(f'Training-set log loss (in-sample, not OOF): {np.mean(metrics)}')

# Force every target to 0 for test rows whose cp_type is encoded as 1
# ('ctl_vehicle' in preprocess_categorical).
submission_predict.loc[X_test['cp_type'] == 1, y.columns] = 0

submission_predict.to_csv('submission.csv', index = False)

