# Mechanisms of Action (MoA) Prediction

## Importing necessary libraries

In [None]:
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')

In [None]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
import sklearn.model_selection as ms
import sklearn.preprocessing as prep
import seaborn as sns
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import sklearn.metrics as metric

## Helper Classes and Functions

- CrossValidation: Splitting data into training and validation sets
- EncodeCategories: Encode categorical variables
- TFSimpleDataset: Create a `tf.data.Dataset` input pipeline from the data
- write_submission: Write submission dataframe

In [None]:
class CrossValidation:
    """Split a dataframe into training and validation parts.

    Offers a single hold-out split and k-fold splitting (plain, stratified
    on one column, or multilabel-stratified on several columns).
    """

    def __init__(self, df, target_cols, shuffle, random_state=0):
        """Store the data and optionally shuffle it once.

        Args:
            df: dataframe holding both the input and the target columns.
            target_cols: names of the target columns inside ``df``.
            shuffle: rows are shuffled (and the index reset) only when this
                is exactly ``True``.
            random_state: seed used for the shuffle and for the stratified
                hold-out split.
        """
        self.df = df
        self.target_cols = target_cols
        # Keep the caller's seed; it used to be silently replaced by 0.
        self.random_state = random_state
        if shuffle is True:
            # Seed the shuffle too, so that folds are reproducible.
            self.df = df.sample(
                frac=1, random_state=random_state
            ).reset_index(drop=True)

    def hold_out_split(self, percent, stratify=True):
        """Return ``(train, val)`` with ``percent`` % of the rows in ``val``.

        With ``stratify`` set, the split is delegated to
        ``train_test_split`` and stratified on the target columns; otherwise
        the last ``percent`` % of the rows (in stored order) become ``val``.
        """
        if stratify:
            y = self.df[self.target_cols]
            train, val = ms.train_test_split(
                self.df, test_size=percent / 100,
                stratify=y, random_state=self.random_state)
            return train, val
        size = len(self.df) - int(len(self.df) * (percent / 100))
        train = self.df.iloc[:size, :]
        val = self.df.iloc[size:, :]
        return train, val

    def kfold_split(self, splits, stratify=None):
        """Yield ``(train, val)`` dataframes for each of ``splits`` folds.

        Args:
            splits: number of folds.
            stratify: ``None`` for a plain ``KFold``; a single column name
                (``str``) for ``StratifiedKFold`` on that column; any other
                value (e.g. a list of column names) for
                ``MultilabelStratifiedKFold`` on those columns.
        """
        if stratify is None:
            fold_indices = ms.KFold(n_splits=splits).split(X=self.df)
        else:
            # isinstance() replaces the old `type('stratify')=="str"` test,
            # which compared a class with a string and was therefore never
            # true, making the single-column branch unreachable.
            if isinstance(stratify, str):
                splitter = ms.StratifiedKFold(n_splits=splits)
            else:
                splitter = MultilabelStratifiedKFold(n_splits=splits)
            fold_indices = splitter.split(
                X=self.df, y=self.df[stratify].values)
        for train_idx, val_idx in fold_indices:
            yield self.df.iloc[train_idx, :], self.df.iloc[val_idx, :]

In [None]:
class EncodeCategories:
    """Encode categorical dataframe columns.

    Supported ``encoding_type`` values: ``"label"``, ``"binary"``,
    ``"onehot"`` and ``"onehot_sparse"``. The encoders are fitted on the
    frame given at construction and can then be applied to other frames
    through ``transform``. Frames passed in are never modified in place.
    """

    def __init__(self, df, encode_cols, encoding_type,
        handle_na=False, na_placeholder="NaN"):
        """
        Args:
            df: dataframe the encoders are fitted on.
            encode_cols: names of the columns to encode.
            encoding_type: one of the encoding names listed on the class.
            handle_na: when true, missing values in ``encode_cols`` are
                replaced by ``na_placeholder`` and the columns are cast to
                ``str`` before encoding.
            na_placeholder: replacement for missing categories.
        """
        self.df = df
        self.encode_cols = encode_cols
        self.encoding_type = encoding_type
        self.handle_na = handle_na

        self.label_encoders = {}
        self.binary_encoders = {}
        self.one_hot_encoder = None
        self.na_placeholder = na_placeholder

        if self.handle_na:
            self.df = self.__handle_missing_category(self.df,
                placeholder=na_placeholder)

    def __handle_missing_category(self, df, placeholder="NaN"):
        """Return a copy of ``df`` with missing categories filled in."""
        # Work on a copy so the caller's dataframe is left untouched.
        df = df.copy(deep=True)
        for cat in self.encode_cols:
            # Fill first, cast second: casting to str first would turn the
            # missing values into ordinary strings that fillna cannot see.
            df[cat] = df[cat].fillna(placeholder).astype(str)
        return df

    def __label_encoder_fit(self, df, cat):
        le = prep.LabelEncoder()
        le.fit(df[cat].values)
        self.label_encoders[cat] = le

    def __label_encoder_transform(self, df, cat):
        return self.label_encoders[cat].transform(df[cat].values)

    def __binary_encoder_fit(self, df, cat):
        lbl = prep.LabelBinarizer()
        lbl.fit(df[cat].values)
        self.binary_encoders[cat] = lbl

    def __binary_encoder_transform(self, df, cat):
        return self.binary_encoders[cat].transform(df[cat].values)

    def __one_hot_fit(self, df, sparse=False):
        try:
            ohe = prep.OneHotEncoder(sparse_output=sparse)
        except TypeError:
            # Older scikit-learn releases only know the ``sparse`` keyword.
            ohe = prep.OneHotEncoder(sparse=sparse)
        ohe.fit(df[self.encode_cols].values)
        self.one_hot_encoder = ohe

    def __one_hot_transform(self, df, cat):
        return self.one_hot_encoder.transform(df[cat].values)

    def __label_encoder(self, df, fit=True):
        for cat in self.encode_cols:
            if fit:
                self.__label_encoder_fit(df, cat)
            # Plain column assignment replaces the column (and its dtype)
            # instead of writing the integer codes into the old column.
            df[cat] = self.__label_encoder_transform(df, cat)
        return df

    def __binary_encoder(self, df, fit=True):
        for cat in self.encode_cols:
            if fit:
                self.__binary_encoder_fit(df, cat)
            val = self.__binary_encoder_transform(df, cat)
            df = df.drop(cat, axis=1)
            for i in range(val.shape[1]):
                df[f"{cat}_bin_{i}"] = val[:, i]
        return df

    def __one_hot_encoder(self, df, sparse=False, fit=True):
        if fit:
            self.__one_hot_fit(df, sparse)
        val = self.__one_hot_transform(df, self.encode_cols)
        # One output column per (source column, category) pair. The encoded
        # matrix used to be written out once per source column, which
        # duplicated every indicator and labeled most of them with the
        # wrong source column.
        names = [
            f"{cat}_ohe_{i}"
            for cat, levels in zip(self.encode_cols,
                                   self.one_hot_encoder.categories_)
            for i in range(len(levels))
        ]
        df = df.drop(columns=list(self.encode_cols))
        if hasattr(val, "toarray"):
            # Sparse encoder output: keep it sparse inside pandas.
            encoded = pd.DataFrame.sparse.from_spmatrix(
                val, index=df.index, columns=names)
        else:
            encoded = pd.DataFrame(val, index=df.index, columns=names)
        return pd.concat([df, encoded], axis=1)

    def __encode(self, df, fit):
        """Dispatch to the encoder selected by ``encoding_type``."""
        if self.encoding_type == "label":
            return self.__label_encoder(df, fit=fit)
        if self.encoding_type == "binary":
            return self.__binary_encoder(df, fit=fit)
        if self.encoding_type == "onehot":
            return self.__one_hot_encoder(df, sparse=False, fit=fit)
        if self.encoding_type == "onehot_sparse":
            return self.__one_hot_encoder(df, sparse=True, fit=fit)
        raise ValueError("specified encoding type not defined")

    def fit(self):
        """Fit the selected encoder(s) on the stored dataframe.

        Raises:
            ValueError: if ``encoding_type`` is not a supported name.
        """
        if self.encoding_type == "label":
            for cat in self.encode_cols:
                self.__label_encoder_fit(self.df, cat)
        elif self.encoding_type == "binary":
            for cat in self.encode_cols:
                self.__binary_encoder_fit(self.df, cat)
        elif self.encoding_type == "onehot":
            self.__one_hot_fit(self.df, False)
        elif self.encoding_type == "onehot_sparse":
            self.__one_hot_fit(self.df, True)
        else:
            raise ValueError("specified encoding type not defined")

    def fit_transform(self):
        """Fit on the stored dataframe and return an encoded copy of it.

        Raises:
            ValueError: if ``encoding_type`` is not a supported name.
        """
        return self.__encode(self.df.copy(deep=True), fit=True)

    def transform(self, dataframe):
        """Return an encoded copy of ``dataframe`` using the fitted encoders.

        Raises:
            ValueError: if ``encoding_type`` is not a supported name.
        """
        if self.handle_na:
            # The helper already returns a fresh copy.
            df = self.__handle_missing_category(dataframe,
                placeholder=self.na_placeholder)
        else:
            df = dataframe.copy(deep=True)
        return self.__encode(df, fit=False)

In [None]:
class TFSimpleDataset:
    """Factory for batched, prefetched ``tf.data.Dataset`` pipelines."""

    def __init__(self,batch_size, repeat,
        drop_remainder_in_batch=False, 
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
        buffer_size=tf.data.experimental.AUTOTUNE):
        """Remember the pipeline settings.

        Args:
            batch_size: number of elements per batch.
            repeat: when truthy the batched dataset repeats indefinitely.
            drop_remainder_in_batch: forwarded to ``Dataset.batch`` as
                ``drop_remainder``.
            num_parallel_calls: stored on the instance; ``create_dataset``
                does not read it.
            buffer_size: forwarded to ``Dataset.prefetch``.
        """
        self.repeat = repeat
        self.buffer_size = buffer_size
        self.num_parallel_calls = num_parallel_calls
        self.drop_remainder = drop_remainder_in_batch
        self.batch_size = batch_size

    def create_dataset(self, X, Y=None):
        """Build the pipeline: slice -> (zip with labels) -> batch -> repeat -> prefetch.

        Args:
            X: input data sliced along its first axis.
            Y: optional targets; when given, elements are ``(x, y)`` pairs.

        Returns:
            The resulting ``tf.data.Dataset``.
        """
        features = tf.data.Dataset.from_tensor_slices(X)
        if Y is None:
            pipeline = features
        else:
            labels = tf.data.Dataset.from_tensor_slices(Y)
            pipeline = tf.data.Dataset.zip((features, labels))
        pipeline = pipeline.batch(
            self.batch_size, drop_remainder=self.drop_remainder)
        if self.repeat:
            pipeline = pipeline.repeat()
        return pipeline.prefetch(buffer_size=self.buffer_size)

In [None]:
def write_submission(preds):
    """Copy predictions into the sample submission frame.

    Args:
        preds: predictions whose transpose is indexed by target position,
            i.e. one row per sample and one column per entry of the
            module-level ``target_cols`` (assumed to be a 2-D array —
            confirm against the caller).

    Returns:
        The sample submission dataframe with every target column
        overwritten by the matching predictions.
    """
    submission = pd.read_csv("../input/lish-moa/sample_submission.csv")
    per_target = preds.transpose()
    for position, name in enumerate(target_cols):
        submission[name] = per_target[position]
    return submission

## Loading, visualizing and setting up data

In [None]:
# Load the competition CSV files: training inputs, test inputs and the
# scored training targets.
train_features = pd.read_csv("../input/lish-moa/train_features.csv")
test_features = pd.read_csv("../input/lish-moa/test_features.csv")
train_targets = pd.read_csv("../input/lish-moa/train_targets_scored.csv")

In [None]:
# Preview the first rows of the training features.
train_features.head()

In [None]:
# Distribution of `cp_type` in the training features. The column is passed
# as `x=` because newer seaborn releases accept only `data` positionally.
sns.countplot(x='cp_type', data=train_features)

In [None]:
# Distribution of `cp_dose` in the training features. The column is passed
# as `x=` because newer seaborn releases accept only `data` positionally.
sns.countplot(x='cp_dose', data=train_features)

In [None]:
# Distribution of `cp_time` in the training features. The column is passed
# as `x=` because newer seaborn releases accept only `data` positionally.
sns.countplot(x='cp_time', data=train_features)

In [None]:
# Preview the first rows of the test features.
test_features.head()

In [None]:
# Distribution of `cp_type` in the test features. The column is passed
# as `x=` because newer seaborn releases accept only `data` positionally.
sns.countplot(x='cp_type', data=test_features)

In [None]:
# Distribution of `cp_dose` in the test features. The column is passed
# as `x=` because newer seaborn releases accept only `data` positionally.
sns.countplot(x='cp_dose', data=test_features)

In [None]:
# Distribution of `cp_time` in the test features. The column is passed
# as `x=` because newer seaborn releases accept only `data` positionally.
sns.countplot(x='cp_time', data=test_features)

In [None]:
# Preview the first rows of the training targets.
train_targets.head()

In [None]:
# Number of columns in the targets file (this count includes `sig_id`).
len(train_targets.columns)

One-hot encode the categorical variables

In [None]:
# Fit a one-hot encoder on the three categorical columns of the training
# features and replace `train_features` with its encoded copy.
enc = EncodeCategories(train_features, encode_cols=["cp_type", "cp_dose", "cp_time"], 
                       encoding_type="onehot")
train_features = enc.fit_transform()

Encode the categorical variables of the test data with the same fitted encoder

In [None]:
# Reuse the encoder fitted on the training features; no refit happens here.
test_df = enc.transform(test_features)

Merge the target dataframe with the training features on their shared `sig_id`

In [None]:
# Join features and targets row-wise on `sig_id`; the feature columns come
# first and the target columns after them.
train_df = train_features.merge(train_targets, on="sig_id")
train_df.head()

In [None]:
# Total number of columns after the merge (`sig_id` is still present).
len(train_df.columns)

In [None]:
# Remove the `sig_id` column from the merged training frame.
train_df = train_df.drop("sig_id", axis=1)

In [None]:
# Preview the merged training frame without `sig_id`.
train_df.head()

In [None]:
# Remove the `sig_id` column from the encoded test frame as well.
test_df = test_df.drop("sig_id", axis=1)

In [None]:
# Preview the encoded test frame without `sig_id`.
test_df.head()

list of target columns in dataframe

In [None]:
# Every column of the targets file except the `sig_id` key is a target.
target_cols = [name for name in train_targets.columns if name != "sig_id"]
print(len(target_cols))

list of input columns in dataframe

In [None]:
# Every column of the encoded training features except the `sig_id` key
# is a model input.
input_features = [name for name in train_features.columns if name != "sig_id"]
print(len(input_features))

Splitting data into folds

In [None]:
# Build the splitter over the merged training frame; rows are shuffled
# once here and the folds are generated later by `cv.kfold_split`.
cv = CrossValidation(train_df, target_cols, shuffle=True, random_state=11)

## Defining model, callbacks and metrics

In [None]:
# Metrics handed to `model.compile` inside `get_model`.
metrices = ["AUC"]

In [None]:
def get_model(hidden_layers_units):
    """Build and compile a fully connected multi-label classifier.

    The network takes ``len(input_features)`` inputs, starts with a dense
    layer of that same width, adds one Dense -> Dropout(0.2) -> LeakyReLU
    group per entry of ``hidden_layers_units`` and ends in a sigmoid layer
    with one unit per entry of ``target_cols``.

    Args:
        hidden_layers_units: iterable with the width of each hidden layer.

    Returns:
        The compiled ``tf.keras.Sequential`` model (binary cross-entropy
        loss, Adam optimizer, metrics taken from the module-level
        ``metrices`` list).
    """
    n_inputs = len(input_features)
    model = tf.keras.Sequential([
        # The input shape is given as a tuple, as the Keras API documents.
        tf.keras.layers.Input(shape=(n_inputs,)),
        tf.keras.layers.Dense(n_inputs),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.LeakyReLU(),
    ])
    for units in hidden_layers_units:
        model.add(tf.keras.layers.Dense(units))
        model.add(tf.keras.layers.Dropout(0.2))
        model.add(tf.keras.layers.LeakyReLU())
    model.add(tf.keras.layers.Dense(len(target_cols), activation="sigmoid"))
    # A metric requested by the bare string "AUC" receives an auto-generated
    # name, which Keras makes unique ("auc", "auc_1", ...) for every new
    # model in the same session. Callbacks and history lookups keyed on
    # "val_auc"/"auc" would then only match the first model, so each model
    # gets its own explicitly named AUC instance instead.
    compiled_metrics = [
        tf.keras.metrics.AUC(name="auc")
        if isinstance(entry, str) and entry == "AUC" else entry
        for entry in metrices
    ]
    model.compile(loss="binary_crossentropy", metrics=compiled_metrics,
                  optimizer="adam")
    return model

In [None]:
# Learning-rate schedule settings: linear ramp-up from LR_START to LR_MAX,
# an optional plateau at LR_MAX, then exponential decay towards LR_MIN.
LR_START = 1e-5
LR_MAX = 5e-5
LR_MIN = 1e-5
LR_RAMPUP_EPOCHS = 5
LR_SUSTAIN_EPOCHS = 0
LR_EXP_DECAY = 0.8

def change_lr(epoch):
    """Return the learning rate to use for the given (0-based) epoch."""
    # Ramp-up phase: straight line from LR_START towards LR_MAX.
    if epoch < LR_RAMPUP_EPOCHS:
        return (LR_MAX - LR_START) / LR_RAMPUP_EPOCHS * epoch + LR_START
    # Plateau phase (empty while LR_SUSTAIN_EPOCHS is 0).
    if epoch < LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS:
        return LR_MAX
    # Decay phase: the gap above LR_MIN shrinks by LR_EXP_DECAY per epoch.
    return (LR_MAX - LR_MIN) * LR_EXP_DECAY**(epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS) + LR_MIN

In [None]:
def get_callbacks(fold):
    """Create the training callbacks for one fold.

    Args:
        fold: fold identifier, embedded in the checkpoint file name
            ``best_model{fold}.h5``.

    Returns:
        A list ``[checkpoint, early_stopping, lr_scheduler]``. Checkpointing
        and early stopping both track ``val_auc`` and treat larger values as
        better; the scheduler applies ``change_lr``.
    """
    watched = "val_auc"
    return [
        # Keep only the best epoch (by the watched metric) on disk.
        tf.keras.callbacks.ModelCheckpoint(
            f"best_model{fold}.h5",
            monitor=watched,
            verbose=1,
            save_best_only=True,
            mode="max",
        ),
        # Stop after 5 epochs without improvement of the watched metric.
        tf.keras.callbacks.EarlyStopping(
            monitor=watched,
            min_delta=0,
            patience=5,
            verbose=1,
            mode="max",
        ),
        tf.keras.callbacks.LearningRateScheduler(change_lr, verbose=True),
    ]

Parameters to tune

In [None]:
# Tunable settings used by the cells below.
batch_size=128  # batch size for every tf.data pipeline
epochs=100  # upper bound on epochs per fold (early stopping may end sooner)
hidden_layers=[1200]  # width of each hidden layer passed to get_model
num_folds=5  # number of cross-validation folds

## Training the model

In [None]:
# Shared dataset factory; `repeat=False` means each dataset is finite.
tf_dataset_obj = TFSimpleDataset(batch_size=batch_size,
                                repeat=False)

In [None]:
# Collects one trained model per fold, in fold order.
k_fold_models=[]

Training model with 5 folds

In [None]:
# Train one model per fold and record the best per-fold AUC values.
f=1  # 1-based fold counter, also used in the checkpoint file name
val_auc = []
train_auc = []
for train, val in cv.kfold_split(splits=num_folds, stratify= target_cols):
    print("Fold: ",f)
    # Columns are split by position: the first len(input_features) columns
    # are inputs and the remaining ones are targets. This relies on the
    # column order produced by the merge of features and targets.
    trainX = train.iloc[:,:len(input_features)].values
    trainY = train.iloc[:,len(input_features):].values
    valX = val.iloc[:,:len(input_features)].values
    valY = val.iloc[:,len(input_features):].values
    train_dataset = tf_dataset_obj.create_dataset(X=trainX, Y=trainY)
    val_dataset = tf_dataset_obj.create_dataset(X=valX,Y=valY)
    model = get_model(hidden_layers)
    k_fold_models.append(model)
    callbacks = get_callbacks(f)
    history = model.fit(train_dataset, epochs=epochs,
              validation_data = val_dataset,
              callbacks = callbacks)
    # NOTE(review): the 'val_auc' / 'auc' keys assume the compiled metric is
    # named "auc" for every fold's model — confirm.
    val_auc.append(np.max(history.history['val_auc']))
    train_auc.append(np.max(history.history['auc']))
    f+=1

Checking each model's maximum validation AUC

In [None]:
# Best validation AUC reached in each fold (the values are AUCs despite
# the loop variable's name).
for val_acc in val_auc:
    print(val_acc)

In [None]:
# Best training AUC reached in each fold (the values are AUCs despite
# the loop variable's name).
for train_acc in train_auc:
    print(train_acc)

Average AUC

In [None]:
# Mean over folds: validation first, then training.
print(np.mean(val_auc))
print(np.mean(train_auc))

Loading the best weights for inference

In [None]:
# Restore each fold model from its checkpoint; checkpoint files are
# numbered from 1 while the list index starts at 0, hence `i+1`.
for i, model in enumerate(k_fold_models):
    weights = f"best_model{i+1}.h5"
    model.load_weights(weights)

## Ensemble Methods

- mean_ensemble: Ensemble models prediction by averaging predictions from different models
- max_ensemble: Ensemble models prediction by taking max prediction from different models
- ensemble_accuracy: Check AUC score of ensembled predictions

In [None]:
def mean_ensemble(models, dataset):
    """Average the predictions of several models element-wise.

    Args:
        models: iterable of objects exposing ``predict(dataset, verbose=1)``.
        dataset: input passed unchanged to every model.

    Returns:
        The mean of the stacked predictions taken across models (axis 0).
    """
    per_model = [member.predict(dataset, verbose=1) for member in models]
    return np.mean(per_model, axis=0)

In [None]:
def max_ensemble(models, dataset):
    """Take the element-wise maximum over the predictions of several models.

    Args:
        models: iterable of objects exposing ``predict(dataset, verbose=1)``.
        dataset: input passed unchanged to every model.

    Returns:
        The maximum of the stacked predictions taken across models (axis 0).
    """
    per_model = [member.predict(dataset, verbose=1) for member in models]
    return np.max(per_model, axis=0)

In [None]:
def ensemble_accuracy(models, ensemble_fn, num_folds=5):
    """Compute the ensemble's AUC on the train and validation part of each fold.

    The folds come from the module-level ``cv`` splitter, stratified on
    ``target_cols``; inputs and targets are separated by column position
    using ``len(input_features)``, and datasets are built with
    ``tf_dataset_obj``.

    Args:
        models: models handed to ``ensemble_fn``.
        ensemble_fn: callable ``(models, dataset) -> predictions``.
        num_folds: number of folds to iterate over.

    Returns:
        ``(train_fold_auc, val_fold_auc)``: two lists with one AUC value
        per fold.
    """
    n_inputs = len(input_features)

    def _auc_on(frame):
        # Inputs are the leading columns, targets the trailing ones.
        features = frame.iloc[:, :n_inputs].values
        labels = frame.iloc[:, n_inputs:].values
        predictions = ensemble_fn(
            models, tf_dataset_obj.create_dataset(X=features))
        scorer = tf.keras.metrics.AUC()
        scorer.update_state(labels, predictions)
        return scorer.result().numpy()

    train_fold_auc, val_fold_auc = [], []
    folds = cv.kfold_split(splits=num_folds, stratify=target_cols)
    for fold_no, (train, val) in enumerate(folds, start=1):
        print("Fold: ", fold_no)
        train_fold_auc.append(_auc_on(train))
        val_fold_auc.append(_auc_on(val))
    return train_fold_auc, val_fold_auc

## Checking ensembled AUC with respect to best model

In [None]:
# Per-fold AUC of the mean ensemble over all fold models. This rebinds
# `train_auc` and `val_auc`.
train_auc, val_auc= ensemble_accuracy(k_fold_models,mean_ensemble, num_folds=5)
print("\n======AUC=========")
print("Train AUC", train_auc)
print("Validation AUC: ",val_auc)

In [None]:
# Per-fold AUC of the max ensemble over all fold models. This rebinds
# `train_auc` and `val_auc` again.
train_auc, val_auc= ensemble_accuracy(k_fold_models,max_ensemble, num_folds=5)
print("\n======AUC=========")
print("Train AUC", train_auc)
print("Validation AUC: ",val_auc)

In [None]:
# Pick the best single model. By this point `val_auc` may hold ensemble
# scores per fold rather than per-model scores, so indexing the model list
# with its argmax would not rank the models themselves. Each fold model is
# therefore scored here on the validation part of its own fold.
# NOTE(review): this pairing assumes `cv.kfold_split` yields the same folds,
# in the same order, as during training — confirm the splitters do not
# shuffle.
per_model_val_auc = []
fold_splits = cv.kfold_split(splits=num_folds, stratify=target_cols)
for fold_model, (_, fold_val) in zip(k_fold_models, fold_splits):
    fold_valX = fold_val.iloc[:, :len(input_features)].values
    fold_valY = fold_val.iloc[:, len(input_features):].values
    fold_metric = tf.keras.metrics.AUC()
    fold_metric.update_state(
        fold_valY,
        fold_model.predict(
            tf_dataset_obj.create_dataset(X=fold_valX), verbose=0))
    per_model_val_auc.append(fold_metric.result().numpy())
best_val_model = k_fold_models[np.argmax(per_model_val_auc)]
train_auc, val_auc= ensemble_accuracy([best_val_model],mean_ensemble, num_folds=5)
print("\n======AUC=========")
print("Train AUC", train_auc)
print("Validation AUC: ",val_auc)

## Test data predictions for submission

In [None]:
# Input-only dataset (no targets) built from the encoded test features.
test_dataset = tf_dataset_obj.create_dataset(X=test_df.values)

In [None]:
# Predict the test set with the max ensemble; the mean-ensemble
# alternative is kept below, commented out.
# mean_predictions = mean_ensemble(k_fold_models,test_dataset)
max_predictions = max_ensemble(k_fold_models,test_dataset)

In [None]:
# Shape check of the ensemble predictions before writing the submission.
# print(mean_predictions.shape)
print(max_predictions.shape)

In [None]:
# submission_mean = write_submission(mean_predictions)
# submission_mean.to_csv("submission.csv", index=False)
# submission_mean.head()

In [None]:
# Write the max-ensemble predictions to `submission.csv` (without the
# dataframe index) and preview the result.
submission_max = write_submission(max_predictions)
submission_max.to_csv("submission.csv", index=False)
submission_max.head()