In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Directory holding the competition CSV files. Kept in one place so the
# notebook can be pointed at another copy of the data by editing a single line.
INPUT_DIR = "/kaggle/input/lish-moa"

# Feature tables for the training and test samples.
train_features = pd.read_csv(f"{INPUT_DIR}/train_features.csv")
test_features = pd.read_csv(f"{INPUT_DIR}/test_features.csv")

train_drug = pd.read_csv(f"{INPUT_DIR}/train_drug.csv")

# Target tables for the training samples (non-scored and scored).
train_targets_nonscored = pd.read_csv(f"{INPUT_DIR}/train_targets_nonscored.csv")
train_targets_scored = pd.read_csv(f"{INPUT_DIR}/train_targets_scored.csv")

# Template used further down to name the columns of the submission file.
sample_submission = pd.read_csv(f"{INPUT_DIR}/sample_submission.csv")

In [None]:
# (rows, columns) of the training and test feature tables.
train_features.shape, test_features.shape

In [None]:
# Display the training feature table.
train_features

## test_features

In [None]:
# Display the test feature table.
test_features

In [None]:
# Number of distinct sig_id values in the test feature table.
test_features["sig_id"].nunique()

In [None]:
# Row count per cp_type value in the test feature table.
test_features["cp_type"].value_counts()

In [None]:
# Row count per cp_time value in the test feature table.
test_features["cp_time"].value_counts()

In [None]:
# Row count per cp_dose value in the test feature table.
test_features["cp_dose"].value_counts()

In [None]:
# Group the column names by their first two characters and count each group.
test_features.columns.str[:2].value_counts()

## train_targets_scored

In [None]:
# Display the scored training targets table.
train_targets_scored

In [None]:
# Display the sample submission table.
sample_submission

In [None]:
# Check that the feature and scored-target tables contain the same sig_ids.
# The sorted ids are re-indexed 0..n-1 before comparing: `==` between two Series
# requires identically-labelled indexes, so comparing the raw sort_values()
# results raises ValueError whenever the two files are in different row order.
# `.eq` also tolerates a length mismatch (unmatched rows count as False).
features_ids = train_features["sig_id"].sort_values().reset_index(drop=True)
targets_ids = train_targets_scored["sig_id"].sort_values().reset_index(drop=True)
features_ids.eq(targets_ids).value_counts()

In [None]:
# Sum of every scored target value over the rows whose cp_type is "ctl_vehicle".
is_control = train_features["cp_type"] == "ctl_vehicle"
ctl_sig_ids = train_features.loc[is_control, "sig_id"]
train_targets_scored.set_index('sig_id').loc[ctl_sig_ids].sum().sum()

# Data Preparation

In [None]:
# Build the model inputs and targets. Every table is sorted by sig_id before the
# id column is dropped, because later cells pair rows purely by position.
# `columns=` replaces the positional axis argument (`drop(cols, 1)`), which was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
train_dataset = train_features.sort_values('sig_id').drop(columns=['sig_id', 'cp_type'])
test_dataset = test_features.sort_values('sig_id').drop(columns=['sig_id', 'cp_type'])

train_targets = train_targets_scored.sort_values('sig_id').drop(columns=['sig_id'])

# Make cp_dose numeric: drop the first character of the string and cast the
# remainder to float32 ('f'). Bracket assignment is used so the column is
# unambiguously replaced (attribute assignment can silently create an attribute).
train_dataset['cp_dose'] = train_dataset['cp_dose'].str[1:].astype('float32')
test_dataset['cp_dose'] = test_dataset['cp_dose'].str[1:].astype('float32')

# Shapes of the training inputs and targets; the row counts should match.
[frame.shape for frame in (train_dataset, train_targets)]

## Train Test Splitting

In [None]:
import tensorflow as tf
import tensorflow.keras as keras

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

In [None]:
# Seed NumPy's global RNG; both splits below are called without random_state.
np.random.seed(1291)

# First hold out 20% of all rows as the local test set ...
X_train, X_test, y_train, y_test = train_test_split(
    train_dataset.values,
    train_targets.values,
    test_size=0.2,
)
# ... then hold out 20% of the remaining rows as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
)

[part.shape for part in (X_train, X_test, y_train, y_test, X_val, y_val)]

# Normalization

In [None]:
from sklearn.preprocessing import QuantileTransformer

# NumPy copy of the submission features; it is transformed in place below.
test_dataset_X = test_dataset.values.copy()

# Quantile-normalize every feature column independently. Each transformer is
# fitted on the training column only, with two sentinel values (-10 and 10)
# appended before fitting, and is then applied to the train, validation,
# hold-out and submission matrices. All four arrays are overwritten in place,
# so re-running this cell alone would transform already-transformed data.
sentinels = np.array([[-10.0], [10.0]])
for col in range(X_train.shape[1]):
    one_col = slice(col, col + 1)  # a slice keeps the selected column 2-D

    transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")
    fit_column = np.concatenate([X_train[:, one_col], sentinels])
    transformer.fit(fit_column)

    # Drop the two sentinel rows again when writing the training column back.
    X_train[:, one_col] = transformer.transform(fit_column[:-2])
    X_val[:, one_col] = transformer.transform(X_val[:, one_col])
    X_test[:, one_col] = transformer.transform(X_test[:, one_col])

    test_dataset_X[:, one_col] = transformer.transform(test_dataset_X[:, one_col])
    


# Model Definition

In [None]:
def get_model():
    """Build a fresh, uncompiled 1-D convolutional multi-label classifier.

    The Keras backend session is cleared first. Input and output sizes are read
    from the notebook-level ``X_train`` and ``y_train`` arrays: each sample is
    fed as a sequence of ``X_train.shape[1]`` steps with one channel, and the
    final layer has one sigmoid unit per column of ``y_train``.

    Returns
    -------
    tf.keras.Sequential
        The uncompiled model.
    """
    tf.keras.backend.clear_session()

    n_features = X_train.shape[1]
    n_targets = y_train.shape[1]
    layers = tf.keras.layers

    return tf.keras.Sequential([
        layers.Input(shape=(n_features, 1)),

        # Single strided convolution with a linear activation, then pooling.
        layers.Conv1D(filters=128, kernel_size=3, strides=2, activation='linear'),
        layers.MaxPool1D(pool_size=2),
        layers.BatchNormalization(),

        layers.Flatten(),

        # Dense head with dropout in front of each dense layer.
        layers.Dropout(rate=0.5),
        layers.Dense(units=450, activation='sigmoid'),

        layers.Dropout(rate=0.3),
        layers.Dense(units=n_targets, activation='sigmoid'),
    ])

# Build the network and print its layer summary. The model is left uncompiled
# here; the compile call below is kept only as a commented-out reference.
model = get_model()
# model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['mae', tf.keras.metrics.AUC()])
model.summary()


In [None]:
# Stop after 25 epochs without a val_loss improvement and restore the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=25,
    mode="min",
    verbose=1,
    restore_best_weights=True,
)

# Write the model to best_model.hdf5, keeping only the best one seen so far.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="best_model.hdf5",
    verbose=1,
    save_best_only=True,
)

# Scale the learning rate by 0.7 after 3 epochs in which val_loss has not
# improved by at least 1e-4, never going below 1e-5.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.7,
    patience=3,
    verbose=1,
    mode='min',
    min_delta=0.0001,
    cooldown=0,
    min_lr=0.00001,
)

callbacks = [early_stopping, checkpoint, reduce_lr]

In [None]:
# Shape of the training matrix once a trailing channel axis has been added.
np.expand_dims(X_train, axis=2).shape

In [None]:
# Seed NumPy and TensorFlow BEFORE the model is built, so that weight
# initialisation is covered by the seeds as well as the shuffling and dropout
# during fit. (Previously the seeds were set after get_model() had already
# created the weights.)
np.random.seed(1291)
tf.random.set_seed(1291)

model = get_model()

# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
adam = tf.keras.optimizers.Adam(learning_rate=0.1)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['mae', tf.keras.metrics.AUC()])

# Inputs get a trailing channel axis to match the (n_features, 1) input layer.
# Training length is governed by the callbacks; epochs=1500 is an upper bound.
history = model.fit(np.expand_dims(X_train, 2), y_train, epochs=1500, batch_size=120,
                    validation_data=(np.expand_dims(X_val, 2), y_val), callbacks=callbacks)

In [None]:
# Replace the in-memory model with the one stored in best_model.hdf5
# (the file path given to the ModelCheckpoint callback).
model = tf.keras.models.load_model('best_model.hdf5')

In [None]:
def plot_learning_curve(history):
    """Plot training and validation loss per epoch on a log-scaled y axis.

    Parameters
    ----------
    history
        Object exposing ``epoch`` (the x values) and ``history`` (a mapping that
        contains the ``"loss"`` and ``"val_loss"`` series), such as the value
        returned by ``model.fit``.
    """
    epochs = history.epoch
    # One dotted line per series, labelled with the series name.
    for series in ("loss", "val_loss"):
        plt.plot(epochs, history.history[series], ".:", label=series)
    plt.legend()
    plt.yscale('log')

# Draw the loss curves for the training run.
plot_learning_curve(history)
# Validation AUC recorded for the LAST epoch that was run.
# NOTE(review): with early stopping restoring the best weights, this is not
# necessarily the AUC of the model that is actually kept — confirm intent.
val_auc = history.history["val_auc"][-1]

In [None]:
# Loss and metrics on the local hold-out split (channel axis added to match the model input).
model.evaluate(np.expand_dims(X_test, axis=2), y_test)

In [None]:
# multi label ROC, rank labels to further focus on

In [None]:
# Predict on the normalized submission features (channel axis added to match the model input).
test_predictions = model.predict(np.expand_dims(test_dataset_X, axis=2))

In [None]:
# First prediction row, rounded to two decimals (kept 2-D by slicing).
test_predictions[:1, :].round(2)

# Post-processing

In [None]:
# Put the test rows in sig_id order so that row positions can be used to index
# the prediction matrix.
test_features = test_features.sort_values('sig_id')

# Positions of every row whose cp_type is anything other than "trt_cp".
is_not_treated = (test_features["cp_type"] != "trt_cp").to_numpy()
ctl_index = np.flatnonzero(is_not_treated)

# Zero out all targets for those rows on a copy, leaving test_predictions intact.
test_predictions_mod = test_predictions.copy()
test_predictions_mod[ctl_index] = 0

# Keep two decimals only.
test_predictions_mod = test_predictions_mod.round(2)
test_predictions_mod

In [None]:
# Assemble the submission frame. The prediction rows are attached to the
# sig_id-sorted ids of the sample submission purely by position.
# The ids are inserted as a NumPy array on purpose: assigning a pandas Series
# to a column aligns on index labels, which would put the ids back in the
# ORIGINAL (unsorted) order of sample_submission and could pair predictions
# with the wrong sig_id. insert() also raises if the row counts differ.
submission_sorted = sample_submission.sort_values('sig_id')

# Target column names are every sample-submission column after the first one.
test_sub = pd.DataFrame(test_predictions_mod, columns=submission_sorted.columns[1:])

# Put sig_id in front (previously it was appended as the last column).
test_sub.insert(0, "sig_id", submission_sorted["sig_id"].to_numpy())
test_sub

In [None]:
# Write the submission to submission.csv in the working directory, without the row index.
test_sub.to_csv("submission.csv", index=False)

In [None]:
# Show the first lines of the file that was just written.
!head submission.csv