In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file found beneath the Kaggle input directory and
# print its full path, one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/lish-moa/test_features.csv
/kaggle/input/lish-moa/train_drug.csv
/kaggle/input/lish-moa/train_features.csv
/kaggle/input/lish-moa/train_targets_scored.csv
/kaggle/input/lish-moa/train_targets_nonscored.csv
/kaggle/input/lish-moa/sample_submission.csv


In [2]:
import matplotlib.pyplot as plt 
import seaborn as sns

# Load Datasets

In [3]:
# Training inputs and their scored targets.
train_features = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
train_targets = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')

# Test inputs and the submission template.
test_features = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')
sample_sub = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')

# Report the size of each frame. The column counts still include the
# `sig_id` identifier column, which is dropped later before modelling.
for message, frame, axis in (
    ('Number of training examples: {}', train_features, 0),
    ('Number of features:{}', train_features, 1),
    ('Number of target outputs: {}', train_targets, 1),
    ('Number of test examples: {}', test_features, 0),
):
    print(message.format(frame.shape[axis]))

print('The submission file should be of dimension: ', test_features.shape[0],'x', train_targets.shape[1])


Number of training examples: 23814
Number of features:876
Number of target outputs: 207
Number of test examples: 3982
The submission file should be of dimension:  3982 x 207


**Extract the column names from the dataframe**

In [4]:
def feature_columns(data):
    """Split the column names of ``data`` into groups by name prefix.

    A column belongs to a group only when its name *starts with* the group
    prefix; a name that merely contains ``'c-'`` or ``'g-'`` somewhere in the
    middle (for example ``'misc-flag'``) is placed in ``others``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``columns`` are inspected. Its values are not read.

    Returns
    -------
    tuple of (list, list, list)
        ``(c_feats, g_feats, others)``: labels starting with ``'c-'``,
        labels starting with ``'g-'``, and every remaining label. Each list
        keeps the original column order.
    """
    c_feats, g_feats, others = [], [], []
    for feature in data.columns:
        # str() lets non-string labels fall through to `others` instead of
        # raising on the prefix test.
        name = str(feature)
        if name.startswith('c-'):
            c_feats.append(feature)
        elif name.startswith('g-'):
            g_feats.append(feature)
        else:
            others.append(feature)
    return c_feats, g_feats, others

# Group the training columns by prefix and report the size of each group.
c_feats, g_feats, others = feature_columns(train_features)

for template, group in (
    ('Number of genetic expression features: {}', g_feats),
    ('Number of cell viability features: {}', c_feats),
    ('Other features: {}', others),
):
    print(template.format(len(group)))

Number of genetic expression features: 772
Number of cell viability features: 100
Other features: 4


# Prepare dataset for model generation

**Import packages for feature engineering and model generation**

In [5]:
from sklearn.decomposition import PCA, FastICA
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import log_loss

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping 
import tensorflow_addons as tfa

**Preprocess features: variance filtering, min-max scaling and ICA (the PCA augmentation step is currently commented out)**

**TODO:** Include citation.

In [6]:
def check_ids(features, targets):
    """Check that ``features`` and ``targets`` carry the same ``sig_id`` row by row.

    The comparison is positional: row ``k`` of ``features`` is compared with
    row ``k`` of ``targets``. ``'Mismatch detected!'`` is printed once for
    every position whose ids differ (or once if the two frames have a
    different number of rows), followed by ``'Done!'``.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame with a ``sig_id`` column.
    targets : pandas.DataFrame
        Frame with a ``sig_id`` column.

    Returns
    -------
    bool
        ``True`` when both frames have the same length and identical ids at
        every position, ``False`` otherwise.
    """
    feature_ids = features['sig_id'].to_numpy()
    target_ids = targets['sig_id'].to_numpy()

    if len(feature_ids) != len(target_ids):
        # Rows cannot be paired up at all; report a single mismatch.
        n_mismatches = 1
    else:
        n_mismatches = int((feature_ids != target_ids).sum())

    for _ in range(n_mismatches):
        print('Mismatch detected!')
    print('Done!')
    return n_mismatches == 0
    
# Confirm the feature and target rows line up on sig_id before X and y are built from them.
check_ids(train_features, train_targets)
    
def preprocess_categorical(data):
    """Encode the ``cp_type`` and ``cp_dose`` columns of ``data`` as numbers.

    ``cp_type`` maps ``'trt_cp' -> 0`` and ``'ctl_vehicle' -> 1``;
    ``cp_dose`` maps ``'D1' -> 0`` and ``'D2' -> 1``. Any other value becomes
    NaN, as produced by ``Series.map``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``cp_type`` and ``cp_dose`` columns. It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with both columns replaced by their codes.
    """
    encodings = {
        'cp_type': {'trt_cp': 0, 'ctl_vehicle': 1},
        'cp_dose': {'D1': 0, 'D2': 1},
    }
    for column, mapping in encodings.items():
        # Replace the whole column so it takes the numeric dtype of the mapped
        # values. Assigning through `.loc[:, column]` writes into the existing
        # string column on recent pandas and leaves it as object dtype.
        data[column] = data[column].map(mapping)
    return data

def feature_augment(aug_data, feature_set, n_comp):
    """Project ``aug_data[feature_set]`` onto ``n_comp`` principal components.

    A PCA with ``random_state = 7`` is fitted on every row passed in. The
    projection is then cut into two pieces whose sizes come from the
    module-level ``train_features`` and ``test_features`` frames (not from
    the arguments): the leading ``train_features.shape[0]`` rows and the
    trailing ``test_features.shape[0]`` rows.

    Returns
    -------
    tuple
        ``(train_pca_feat, test_pca_feat)`` slices of the projected array.
    """
    projector = PCA(n_components = n_comp, random_state = 7)
    components = projector.fit_transform(aug_data[feature_set])
    n_train_rows = train_features.shape[0]
    n_test_rows = test_features.shape[0]
    return components[:n_train_rows], components[-n_test_rows:]

# Build the model inputs: drop the identifier column, then encode the
# categorical treatment columns as numbers.
X = preprocess_categorical(train_features.drop(columns = 'sig_id'))

X_test = preprocess_categorical(test_features.drop(columns = 'sig_id'))

#begin data augmentation
# aug_data_g = pd.concat([pd.DataFrame(train_features[g_feats]), 
#                        pd.DataFrame(test_features[g_feats])])
# aug_data_c = pd.concat([pd.DataFrame(train_features[c_feats]), 
#                        pd.DataFrame(test_features[c_feats])])

# g_num, c_num = 500, 50

# train_g_add, test_g_add = feature_augment(aug_data_g, g_feats, g_num)
# train_c_add, test_c_add = feature_augment(aug_data_c, c_feats, c_num)

# train_g_add = pd.DataFrame(train_g_add, columns=[f'pca_G-{i}' for i in range(g_num)])
# test_g_add = pd.DataFrame(test_g_add, columns=[f'pca_G-{i}' for i in range(g_num)])

# train_c_add = pd.DataFrame(train_c_add, columns=[f'pca_C-{i}' for i in range(c_num)])
# test_c_add = pd.DataFrame(test_c_add, columns=[f'pca_C-{i}' for i in range(c_num)])

# X = pd.concat((X, train_g_add, train_c_add), axis=1)
# X_test = pd.concat((X_test, test_g_add, test_c_add), axis=1)

#Remove features with low variance
# Train and test rows are stacked so that every transform below is fitted on
# both sets at once. The first three columns are skipped by the transforms
# (assumed to be cp_type, cp_time and cp_dose - confirm against the CSV layout).
var_thresh = VarianceThreshold(0.95)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way and keeps the original index labels.
data = pd.concat([X, X_test])
data_transformed = var_thresh.fit_transform(data.iloc[:, 3:])

#Rescale the retained features with min-max scaling
#(the quantile transformation below is kept for reference but is disabled)
# QUA = QuantileTransformer(n_quantiles=100, output_distribution='normal')
# data_transformed = QUA.fit_transform(data_transformed)
data_transformed = MinMaxScaler().fit_transform(data_transformed)

#Perform ICA to reduce dimensionality of data
# A fixed seed keeps the extracted components, and therefore the model inputs,
# reproducible across kernel restarts (same seed as the PCA helper).
ica = FastICA(n_components=500, max_iter=500, random_state=7)
data_transformed_ica=ica.fit_transform(data_transformed)

# Undo the stacking: training rows come first, test rows last.
train_transform = data_transformed_ica[ : train_features.shape[0]]
test_transform = data_transformed_ica[-test_features.shape[0] : ]

# Put the three untransformed treatment columns back in front of the ICA
# components. Going through `.values` discards the original index so the two
# pieces line up on a fresh 0..n-1 index.
X_train_features = pd.DataFrame(
    X[['cp_type','cp_time','cp_dose']].values,
    columns=['cp_type','cp_time','cp_dose'],
)
X = pd.concat([X_train_features, pd.DataFrame(train_transform)], axis='columns')

# Same assembly for the test rows.
X_test_features = pd.DataFrame(
    X_test[['cp_type','cp_time','cp_dose']].values,
    columns=['cp_type','cp_time','cp_dose'],
)
X_test = pd.concat([X_test_features, pd.DataFrame(test_transform)], axis='columns')

#end augmentation

print(X.shape)

# Targets: every column of the scored-targets frame except the identifier.
y = train_targets.drop(columns = 'sig_id')

print(y.shape)


Done!
(23814, 503)
(23814, 206)


**Build the deep neural network model (one sigmoid output per target column)**

In [7]:
# Training-loop settings shared by every model.
EPOCHS, BATCH_SIZE = 25, 128

# Network input/output widths follow the prepared feature and target frames.
INPUT_DIM, OUTPUT_DIM = X.shape[1], y.shape[1]

# Hidden-layer widths of the three architectures that get blended.
HIDDEN_1 = [1024, 512, 256]
HIDDEN_2 = [512, 512, 256]
HIDDEN_3 = [1024, 512, 512]

def create_model(hidden_layers, input_dim = INPUT_DIM, output_dim = OUTPUT_DIM):
    """Build an (uncompiled) fully connected Keras model.

    The input is batch-normalised, then passed through one stage per entry of
    ``hidden_layers``: weight-normalised Dense -> LeakyReLU ->
    BatchNormalization -> Dropout(0.3). The head is a weight-normalised Dense
    layer with ``output_dim`` sigmoid units.

    Parameters
    ----------
    hidden_layers : iterable of int
        Number of units of each hidden Dense layer, in order.
    input_dim : int
        Width of the input vector.
    output_dim : int
        Number of sigmoid outputs.

    Returns
    -------
    tensorflow.keras.Model
        Model named ``'multioutput_model'``.
    """
    inp = layers.Input(shape = (input_dim, ))
    hidden = layers.BatchNormalization()(inp)

    for n_units in hidden_layers:
        dense = tfa.layers.WeightNormalization(layers.Dense(n_units))
        hidden = dense(hidden)
        hidden = layers.LeakyReLU()(hidden)
        hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Dropout(0.3)(hidden)

    head = tfa.layers.WeightNormalization(layers.Dense(output_dim, activation = 'sigmoid'))
    outp = head(hidden)

    return models.Model(inputs = inp, outputs = outp, name = 'multioutput_model')

In [8]:
N_FOLDS = 5

def train_model_k_fold(X, y, hidden_layers, epochs = EPOCHS, batch_size = BATCH_SIZE):
    """Train one network per K-fold split and average their test predictions.

    For each of the ``N_FOLDS`` shuffled splits (``random_state = 1``) a new
    model is built with ``create_model``, compiled with binary cross-entropy
    and fitted with early stopping on the validation loss. Each fold's
    prediction for the module-level ``X_test`` frame is added, divided by
    ``N_FOLDS``, to a copy of the module-level ``sample_sub`` frame.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features.
    y : pandas.DataFrame
        Training targets; its column names select the prediction columns.
    hidden_layers : iterable of int
        Hidden-layer widths forwarded to ``create_model``.
    epochs : int
        Maximum number of epochs per fold.
    batch_size : int
        Mini-batch size.

    Returns
    -------
    tuple
        ``(model, model_predict)``: the model trained on the last fold and
        the frame holding the fold-averaged test predictions.
    """
    kf = KFold(n_splits = N_FOLDS, random_state = 1, shuffle = True)

    model_predict = sample_sub.copy()
    model_predict.loc[:, y.columns] = 0

    # Loop-invariant array views of the inputs.
    X_values, y_values = X.values, y.values
    test_values = X_test.values

    for fold, (train, val) in enumerate(kf.split(X, y), start = 1):
        # A fresh optimizer and callback per fold: an optimizer instance keeps
        # state (step count, moment estimates) tied to the model it was first
        # used with, so it must not be shared between independently
        # initialised models.
        opt = tfa.optimizers.AdamW(learning_rate=1e-3, weight_decay=1e-5, clipvalue=700)
        early_stop = EarlyStopping(monitor='val_loss', patience=5)

        # Size the network from the data actually passed in.
        model = create_model(hidden_layers, input_dim = X.shape[1], output_dim = y.shape[1])
        model.compile(loss = 'binary_crossentropy', optimizer = opt)

        print('-----------------------------------------------------------')
        print('Training on Fold: {}'.format(fold))
        print('-----------------------------------------------------------')

        model.fit(X_values[train], y_values[train],
                  validation_data = (X_values[val], y_values[val]),
                  epochs = epochs, batch_size = batch_size,
                  callbacks = [early_stop])

        # Each fold contributes an equal share to the averaged prediction.
        prediction = model.predict(test_values)
        model_predict.loc[:, y.columns] += prediction/N_FOLDS
    return model, model_predict

# Start from the sample-submission layout with every target column zeroed.
submission_predict = sample_sub.copy()
submission_predict.loc[:, y.columns] = 0

# Train the three architectures; each call runs its own K-fold loop.
print('Training Model 1............................................')
model1, model1_predict = train_model_k_fold(X, y, hidden_layers = HIDDEN_1)

print('Training Model 2............................................')
model2, model2_predict = train_model_k_fold(X, y, hidden_layers = HIDDEN_2)

print('Training Model 3............................................')
model3, model3_predict = train_model_k_fold(X, y, hidden_layers = HIDDEN_3)

# Blend the per-model predictions with fixed weights.
weight = [0.25, 0.5, 0.25]
model_predict = [model1_predict, model2_predict, model3_predict]

for i, curr_model in enumerate(model_predict):
    submission_predict.loc[:, y.columns] += weight[i]*curr_model.loc[:, y.columns]
print(submission_predict.head(5))

print('Converting to csv file.....................')
submission_predict.to_csv('submission.csv', index = False)


Training Model 1............................................
-----------------------------------------------------------
Training on Fold: 1
-----------------------------------------------------------
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
-----------------------------------------------------------
Training on Fold: 2
-----------------------------------------------------------
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
-----------------------------------------------------------
Training on Fold: 3
-----------------------------------------------------------
Epoch 1/2

In [9]:
# def train_model(X, y, hidden_layers, epochs = EPOCHS, batch_size = BATCH_SIZE):
    
#     opt = tfa.optimizers.AdamW(lr=1e-3, weight_decay=1e-5, clipvalue=700)
#     reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2, patience = 10)
#     early_stop = EarlyStopping(monitor='val_loss', patience=20)
    
#     print('-----------------------------------------------------------')
#     print('Performing hold-out validation')
#     print('-----------------------------------------------------------')
#     X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.25, 
#                                               random_state = True, shuffle = True)
#     model = create_model(hidden_layers)
#     model.compile(loss = 'binary_crossentropy', optimizer = opt)
#     history = model.fit(X_train, y_train, validation_data = (X_val, y_val), epochs = epochs, 
#                    batch_size = batch_size, callbacks = [reduce_lr, early_stop])
#     plot_train_result(history)
#     return model

# def plot_train_result(history):
#     plt.figure()
#     plt.plot(history.history['loss'])
#     plt.plot(history.history['val_loss'])
#     plt.legend(['Training Loss', 'Validation Loss'])
#     plt.show()

# model1 = train_model(X, y, hidden_layers = HIDDEN_1)
# model2 = train_model(X, y, hidden_layers = HIDDEN_2)
# model3 = train_model(X, y, hidden_layers = HIDDEN_3)

# submission_predict = sample_sub.copy()
# submission_predict.loc[:, y.columns] = 0

# weight = [0.15, 0.7, 0.15]
# model = [model1, model2, model3]

# for i in range(3):
#     submission_predict.loc[:, y.columns] += weight[i]*model[i].predict(X_test.values[:])

# print(submission_predict[:5])

# print('Converting to csv file.....................')
# submission_predict.to_csv('submission.csv', index = False)