In [None]:
# !python3 -m pip install iterative-stratification
# !python3 -m pip install tensorflow
# !python3 -m pip install keras
# !python3 -m pip install seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import tensorflow as tf
import keras
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from tensorflow.keras import Sequential,Model,backend
from tensorflow.keras import layers,regularizers
from tensorflow.keras import callbacks,optimizers,metrics,losses
from sklearn.preprocessing import StandardScaler 
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

In [None]:
# Load the train features, train targets, test features and sample submission.
# The dataset directory is named once so it can be changed in a single place
# (the original repeated it in four placeholder-less f-strings).
DATA_DIR = '../input/lish-moa'
train_features = pd.read_csv(f'{DATA_DIR}/train_features.csv')
train_target = pd.read_csv(f'{DATA_DIR}/train_targets_scored.csv')
test_features = pd.read_csv(f'{DATA_DIR}/test_features.csv')
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Statistical analysis of data ---------------------------------------------------------------------------------------

In [None]:
# Preview: heading line, then the first ten rows of the training features.
print("Training set of features sample:")
train_features.head(n=10)

In [None]:
# Preview: heading line, then the first ten rows of the training targets.
print("Training set of target sample:")
train_target.head(n=10)

In [None]:
# Preview the first ten rows of the *test* features. The original cell printed
# the test heading but displayed `train_features` again.
print("Test set of features sample:")
test_features.head(10)

In [None]:
# Summary statistics of the training feature table, as computed by pandas `describe()`.
train_features.describe()

In [None]:
# Mean and standard deviation of every numeric feature per (dose, type, time) group.
# The value columns are selected explicitly: the frame still contains string
# columns such as `sig_id`, and recent pandas versions raise when asked to
# average those instead of silently dropping them.
group_keys = ['cp_dose', 'cp_type', 'cp_time']
value_columns = [
    column
    for column in train_features.select_dtypes(include='number').columns
    if column not in group_keys
]
train_features.groupby(group_keys)[value_columns].agg(['mean', 'std'])

In [None]:
# Data Visualization -------------------------------------------------------------------------------------------------

In [None]:
# Pie chart showing the distribution of cp_type values in the training data.
colors = ["green", "orange"]

# Labels come from the value_counts index so each wedge is always paired with
# its own category, whatever order the counts happen to be sorted in.
cp_type_counts = train_features["cp_type"].value_counts()
plt.pie(cp_type_counts, labels=cp_type_counts.index, autopct="%.2f%%", colors=colors)
plt.title("Distribution of CPtypes of Training data")
plt.show()

# Pie chart showing the distribution of cp_dose values in the training data.
cp_dose_counts = train_features["cp_dose"].value_counts()
plt.pie(cp_dose_counts, labels=cp_dose_counts.index, autopct="%.2f%%", colors=colors)
plt.title("Ratio of CPdose")
plt.show()

In [None]:
# Labels per sample: how many target columns are set for each row.
plt.figure(figsize=(14,6))
# Every column after the first one (assumes the first column is the `sig_id` key — confirm).
features = train_target.columns.values[1:]
# Pass the per-row counts as `x=` explicitly: newer seaborn releases treat the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=train_target[features].sum(axis=1))
plt.xlabel('Number of Targets per Sample')
plt.title('Target Score Counts')
plt.show()

In [None]:
# Cell-viability feature columns (names prefixed with 'c-').
c_columns = train_features.columns[train_features.columns.str.startswith('c-')]

# Histograms of the first twelve of them, laid out on a 4x4 subplot grid.
plt.figure(figsize=(12, 12))
cols = c_columns[:12]
for position, name in enumerate(cols, start=1):
    plt.subplot(4, 4, position)
    plt.hist(train_features[name], bins=100, alpha=1, color='grey')
    plt.title(name)

In [None]:
# Feature columns whose names are prefixed with 'g-'
# (presumably the gene-expression features — confirm against the dataset description).
g_columns = train_features.columns[train_features.columns.str.startswith('g-')]

# Histograms of the first twelve 'g-' columns, laid out on a 4x4 subplot grid.
plt.figure(figsize=(12, 12))
cols = g_columns[:12]
for position, name in enumerate(cols, start=1):
    plt.subplot(4, 4, position)
    plt.hist(train_features[name], bins=100, alpha=1, color='grey')
    plt.title(name)

In [None]:
# Plotting the correlation matrix.
# Correlation is only defined for numeric columns; the frame still holds string
# columns here (e.g. `cp_type`), which recent pandas versions refuse to coerce,
# so restrict the computation to the numeric columns explicitly.
corr = train_features.select_dtypes(include='number').corr()

# Keep only coefficients >= 0.9; every other entry becomes NaN and is left
# blank in the heatmap.
kot = corr[corr>=.9]
plt.figure(figsize=(12,8))
plt.title("Features with correlation > 0.9")
sns.heatmap(kot, cmap="coolwarm")
plt.show()

In [None]:
# Model Training -----------------------------------------------------------------------------------------------------

In [None]:
# Checking for empty/blank values in our data set

def get_percentage_missing(series):
    """Return the percentage (0-100) of null entries in `series`, rounded to 2 decimals.

    The original returned the *fraction* rounded to two decimals, so it did not
    match its name or the "Percentage Missing" heading it is printed under, and
    any column with fewer than 0.5 % missing values was reported as 0.0.

    Args:
        series: a pandas Series (anything supporting `len()` and `.isnull().sum()`).

    Returns:
        The percentage of null entries as a float; 0.0 for an empty series.
    """
    total = len(series)
    if total == 0:
        # Nothing can be missing from an empty series; also avoids dividing by zero.
        return 0.0
    missing = series.isnull().sum()
    return round(100.0 * missing / total, 2)

# One line per column: its name followed by the value from get_percentage_missing.
print("Column Name", "Percentage Missing")
for column_name, column in train_features.items():
    print(column_name, ":\t", get_percentage_missing(column))

In [None]:
# Drop the identifier and treatment-metadata columns from the feature tables.
# errors='ignore' keeps the cell re-runnable: on a second execution the columns
# are already gone and a plain drop would raise KeyError.
# NOTE: `cp_type` stays in `test_features`; the training cell is responsible
# for excluding it when it builds the model inputs.
train_features = train_features.drop(['sig_id','cp_type','cp_dose','cp_time'],axis=1,errors='ignore')
test_features = test_features.drop(['sig_id','cp_dose','cp_time'],axis=1,errors='ignore')

# Training labels are the *scored* targets. The original read
# `train_targets_nonscored.csv` into this variable, which trained the model on
# a different label set than the one in the submission file. The scored
# targets are already loaded as `train_target`, so derive the labels from that
# frame (without mutating it) instead of reading the file again.
train_targets_scored = train_target.drop(columns=['sig_id'])
labels_train = train_targets_scored.values

# Names of the non-object (numeric) feature columns that remain after the drop.
numerical_features = train_features.columns[train_features.dtypes!="object"]

In [None]:
# Boolean masks over the training columns selecting the 'c-' and 'g-' feature
# groups. `startswith` is used (as in the plotting cells) instead of
# `str.contains`, which is an unanchored regex search and would also match the
# prefix anywhere inside a column name.
cfeatures = train_features.columns.str.startswith('c-')
gfeatures = train_features.columns.str.startswith('g-')

def get_features(X_train, X_test, n_gfeatures=20, n_cfeatures=100, random_state=0):
    """Augment and standardise a train/test pair of feature matrices.

    For each matrix the original columns are kept and the following are appended,
    in this order: PCA projection of the 'g-' columns, PCA projection of the
    'c-' columns, row-wise mean of the 'c-' columns, row-wise mean of the 'g-'
    columns. Both PCAs and the final `StandardScaler` are fitted on `X_train`
    only and then applied to `X_test`.

    The column groups are selected with the module-level boolean masks
    `gfeatures` and `cfeatures`, so both inputs must have their columns in the
    same order as the frame those masks were built from.

    Args:
        X_train: 2-D numpy array of training rows.
        X_test: 2-D numpy array of rows to transform with the fitted objects.
        n_gfeatures: number of PCA components for the 'g-' group (default 20,
            the previously hard-coded value).
        n_cfeatures: number of PCA components for the 'c-' group (default 100,
            the previously hard-coded value).
        random_state: seed passed to both PCA objects. PCA may pick a randomized
            solver; without a fixed seed two fits on the same rows are not
            guaranteed to give the same components, which matters because the
            training loop fits this function more than once per fold.

    Returns:
        Tuple `(X_train, X_test)` of the augmented, standardised arrays.
    """
    # NOTE(review): the 'g-' group gets fewer components than the 'c-' group;
    # if the 'c-' group has only ~100 columns the second PCA is just a rotation.
    # The defaults are kept as they were — confirm they are not swapped.
    g_train, g_test = X_train[:, gfeatures], X_test[:, gfeatures]
    c_train, c_test = X_train[:, cfeatures], X_test[:, cfeatures]

    pca_gfeatures = PCA(n_components=n_gfeatures, random_state=random_state)
    pca_cfeatures = PCA(n_components=n_cfeatures, random_state=random_state)
    X_train_pca_gfeatures = pca_gfeatures.fit_transform(g_train)
    X_train_pca_cfeatures = pca_cfeatures.fit_transform(c_train)
    X_test_pca_gfeatures = pca_gfeatures.transform(g_test)
    X_test_pca_cfeatures = pca_cfeatures.transform(c_test)

    # keepdims=True yields (n_rows, 1) columns that can be concatenated directly.
    X_train = np.concatenate((X_train,
                              X_train_pca_gfeatures,
                              X_train_pca_cfeatures,
                              c_train.mean(axis=1, keepdims=True),
                              g_train.mean(axis=1, keepdims=True)), axis=1)
    X_test = np.concatenate((X_test,
                             X_test_pca_gfeatures,
                             X_test_pca_cfeatures,
                             c_test.mean(axis=1, keepdims=True),
                             g_test.mean(axis=1, keepdims=True)), axis=1)

    # Standardise with statistics taken from the training rows only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test

In [None]:
# Problem dimensions: number of label columns, and number of train / test rows.
n_labels = len(train_targets_scored.columns)
n_train = len(train_features)
n_test = len(test_features)
# Lower / upper bounds that custom_logloss clips predictions to.
p_min = 4e-4
p_max = 0.9

def custom_logloss(y_true, y_pred):
    """Mean binary log loss computed on predictions clipped to [p_min, p_max].

    `p_min` and `p_max` are module-level constants. The result is the negated
    mean over all elements of `y*log(p) + (1-y)*log(1-p)`.
    """
    clipped = tf.clip_by_value(y_pred, p_min, p_max)
    positive_term = y_true * backend.log(clipped)
    negative_term = (1 - y_true) * backend.log(1 - clipped)
    return -backend.mean(positive_term + negative_term)

In [None]:
# Prediction & Logloss -----------------------------------------------------------------------------------------------

In [None]:
# Model definition for prediction
def Define_model_prediction(n_features, n_features_2, n_labels, label_smoothing = 0.0005):
    """Build and compile the two-input network used for prediction.

    Layout (all sub-networks are `Sequential` stacks of BatchNormalization /
    Dropout / Dense layers):

    * `part1` maps `Input1` to a 256-wide representation.
    * `part2` consumes the concatenation of `Input2` and the `part1` output.
    * `part3` and `part4` each consume the element-wise average of the two
      preceding stage outputs.
    * `part5` consumes the average of the `part3` and `part4` outputs and ends
      in a sigmoid layer of width `n_labels`.

    Args:
        n_features: width of the first input (`Input1`).
        n_features_2: width of the second input (`Input2`).
        n_labels: number of output units.
        label_smoothing: value forwarded to `BinaryCrossentropy`.

    Returns:
        A compiled `Model` with inputs `[Input1, Input2]`, Adam optimizer,
        label-smoothed binary cross-entropy loss and `custom_logloss` as metric.
    """
    def _regularized_dense(units, activation, l1=1e-7, l2=1e-6):
        # Dense layer with Glorot-normal init and an L1/L2 kernel penalty.
        return layers.Dense(units, activation=activation, kernel_initializer='glorot_normal',
                            kernel_regularizer=regularizers.l1_l2(l1=l1, l2=l2))

    def _stage(name, head_units, tail_units):
        # Shared shape of part2/part3/part4: BN -> Dropout(0.3) -> regularised
        # relu Dense, followed by three (BN -> Dense) pairs whose activations
        # alternate elu / relu / elu.
        stack = [
            layers.BatchNormalization(),
            layers.Dropout(0.3),
            _regularized_dense(head_units, "relu"),
        ]
        for units, activation in zip(tail_units, ("elu", "relu", "elu")):
            stack.append(layers.BatchNormalization())
            stack.append(layers.Dense(units, activation))
        return Sequential(stack, name=name)

    input_1 = layers.Input(shape = (n_features,), name = 'Input1')
    input_2 = layers.Input(shape = (n_features_2,), name = 'Input2')

    part_1 = Sequential([
        layers.BatchNormalization(),
        layers.Dropout(0.2),
        _regularized_dense(512, "elu"),
        layers.BatchNormalization(),
        layers.Dense(256, activation = "relu")
        ],name='part1')
    stage_1 = part_1(input_1)

    # Stage 2 sees the second input alongside the stage-1 representation.
    part_2 = _stage('part2', 512, (512, 256, 256))
    stage_2 = part_2(layers.Concatenate()([input_2, stage_1]))

    # Later stages take the average of the two preceding stage outputs.
    part_3 = _stage('part3', 1024, (512, 512, 256))
    stage_3 = part_3(layers.Average()([stage_1, stage_2]))

    part_4 = _stage('part4', 512, (512, 256, 256))
    stage_4 = part_4(layers.Average()([stage_2, stage_3]))

    part_5 = Sequential([
        layers.BatchNormalization(),
        _regularized_dense(256, 'relu', l1=1e-6, l2=1e-5),
        layers.BatchNormalization(),
        layers.Dense(n_labels, kernel_initializer='glorot_normal', activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(n_labels, activation="sigmoid")
        ],name='part5')
    output = part_5(layers.Average()([stage_3, stage_4]))

    model = Model(inputs = [input_1, input_2], outputs = output)
    # `metrics` is documented as a list; a bare callable is not accepted by
    # every Keras version. The metric is still reported as 'custom_logloss'.
    model.compile(optimizer='adam', loss=losses.BinaryCrossentropy(label_smoothing=label_smoothing),
                  metrics=[custom_logloss])

    return model



In [None]:
# Train one model per (seed, fold) pair and average their predictions on the
# held-out test rows into `y_pred`.
n_seeds = 5
np.random.seed(4545)
n_folds = 5
seeds = np.random.randint(0,100,size=n_seeds)
y_pred = np.zeros((n_test,n_labels))

# Loop-invariant inputs for the held-out test rows, computed once.
# Selecting by the training column names keeps the column order aligned with
# the training matrix and leaves out anything (such as `cp_type`) that is not
# a training feature.
unknown_raw = test_features[train_features.columns].values
unknown_2 = test_features[numerical_features].values

for seed in seeds:
    kf = KFold(n_splits=n_folds,shuffle=True,random_state=seed)
    for fold, (train, test) in enumerate(kf.split(train_features)):
        train_raw = train_features.iloc[train].values
        valid_raw = train_features.iloc[test].values

        # Fit the feature pipeline ONCE per fold and push the validation rows
        # and the held-out test rows through the same fitted objects. The
        # original called get_features twice, i.e. refitted the PCAs, so the
        # test rows could be projected with different components than the ones
        # the model was trained on. Every transform is row-wise, so stacking
        # the two row sets and splitting afterwards is equivalent.
        stacked = np.concatenate((valid_raw, unknown_raw), axis=0)
        X_train, X_eval = get_features(train_raw, stacked)
        X_test, unknown = X_eval[:len(test)], X_eval[len(test):]

        allcols_train = train_features.iloc[train][numerical_features].values
        allcols_test = train_features.iloc[test][numerical_features].values

        y_train = labels_train[train]
        y_test = labels_train[test]
        n_features = X_train.shape[1]
        n_features_2 = allcols_train.shape[1]

        model = Define_model_prediction(n_features, n_features_2, n_labels)
        reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_custom_logloss', mode='min', min_lr=1E-6, factor=0.1, patience=2)
        early_stopping = callbacks.EarlyStopping(monitor='val_custom_logloss', min_delta=1E-6, patience=10, mode='min',
                                                 restore_best_weights=True)
        hist = model.fit([X_train,allcols_train], y_train, batch_size=128, epochs=12, verbose=1,
                         validation_data=([X_test,allcols_test], y_test),
                         callbacks=[reduce_lr, early_stopping])

        # Each model contributes an equal share of the final average.
        y_pred += model.predict([unknown,unknown_2])/(n_folds*n_seeds)

        # Training curves. Both quantities are losses (lower is better), so the
        # plots are labelled as such rather than as "Accuracy".
        for key, label in (('custom_logloss', 'Clipped log loss'), ('loss', 'Training loss')):
            plt.plot(hist.history[key], color='red')
            plt.plot(hist.history['val_' + key], color='black')
            plt.title(f'{label} - seed {seed}, fold {fold + 1}')
            plt.ylabel(label)
            plt.xlabel('Epoch')
            plt.legend(['Train', 'Validation'], loc='upper right')
            plt.show()

In [None]:
# Copy the averaged predictions into the submission frame. Without this step
# `sample_sub` still holds the placeholder values read from the sample file
# and `y_pred` is never used.
target_columns = [column for column in sample_sub.columns if column != 'sig_id']
# Label the prediction columns with the training label names so they are
# matched to the submission columns by name, not by position. The constructor
# raises if the number of rows or columns does not fit.
# Assumes the rows of `sample_sub` are in the same order as the rows of
# `test_features` (the ids were dropped from `test_features`, so this cannot be
# checked here) — TODO confirm.
predictions = pd.DataFrame(y_pred, columns=train_targets_scored.columns, index=sample_sub.index)
missing = [column for column in target_columns if column not in predictions.columns]
if missing:
    raise ValueError(
        f"y_pred lacks {len(missing)} submission target column(s), e.g. {missing[:3]}; "
        "the model must be trained on the same targets the submission file contains"
    )
sample_sub[target_columns] = predictions[target_columns].to_numpy()
sample_sub

In [None]:
# Write the `sample_sub` frame to submission.csv, omitting the index column.
sample_sub.to_csv("submission.csv",index=False)

In [None]:
# References:
# data-flair.training/blogs/keras-models/
# alanpryorjr.com/visualizations/seaborn/heatmap/heatmap/
# keras.io/api/models/sequential/
# keras.io/api/optimizers/
# arxiv.org/abs/1412.6980
# ruder.io/optimizing-gradient-descent/
# tensorflow.org/api_docs/python/tf/keras/callbacks/ReduceLROnPlateau
# ENPM809N - Lecture 4 by Deogratias Kibira: Exploratory Data Analysis; Principal Component Analysis

# Referred submissions & discussions on kaggle to understand the approach for the problem:
# kaggle.com/roshankumarg/extensive-guide-to-exploratory-data-analysis
# kaggle.com/arjunsehajpal/mechanism-of-action-exploratory-data-analysis
# kaggle.com/pankajdubey87/ensemble-nn-and-xgboost
# kaggle.com/fchollet/moa-keras-kerastuner-best-practices
# kaggle.com/ravy101/drug-moa-tf-keras-starter
# kaggle.com/c/lish-moa/discussion/201051
# kaggle.com/c/lish-moa/discussion/200992