# Libraries

In [None]:
import os

import numpy as np
import pandas as pd
import random

from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA
from sklearn.utils import check_random_state

import tensorflow as tf
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from tensorflow.keras.losses import BinaryCrossentropy
import tensorflow_addons as tfa

from typing import Tuple, List, Callable, Any

from tqdm.notebook import tqdm

import plotly.express as px

# Dataset

In [None]:
# List every file found under the competition input directory
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the competition tables into pandas dataframes.
# The sample submission is read as `test_predictions`; it is later filled with
# the model output and written back as the submission file.
train_features, train_targets, test_features, test_predictions = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/lish-moa/train_features.csv',
        '../input/lish-moa/train_targets_scored.csv',
        '../input/lish-moa/test_features.csv',
        '../input/lish-moa/sample_submission.csv',
    )
)

# Data preprocessing

In [None]:
# Principal component analysis (PCA) fitted on the cell viability data only
# (the columns at positions 776:876 of the raw training features)
pca_cv = PCA().fit(train_features.iloc[:, 776:876])

# One label per principal component: 'PC1' ... 'PC100'
labels_cv = [f'PC{component}' for component in range(1, 101)]

In [None]:
# Preprocess training and testing features datasets
def preprocess(df):
    """Return a model-ready copy of a raw features table.

    Steps, in order:
      1. The columns at positions 776:876 (cell viability variables) are
         projected with the already-fitted ``pca_cv`` and replaced by the
         first 97 principal components ('PC1' ... 'PC97').
      2. ``cp_type``, ``cp_dose`` and ``cp_time`` are mapped to integer codes.
         Values missing from a mapping become NaN (``Series.map`` behaviour).
      3. The ``sig_id`` column is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw features table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The preprocessed copy.
    """
    df = df.copy()

    # PCA - 97 cell viability principal components to replace 100 cell viability variables.
    # The PCA frame reuses df's index so the column-wise concat stays row-aligned
    # even when df does not carry a default RangeIndex.
    pca_cv_data = pca_cv.transform(df.iloc[:, 776:876])
    pca_frame = pd.DataFrame(pca_cv_data, columns=labels_cv, index=df.index).iloc[:, :97]
    df = pd.concat([df.iloc[:, :776], pca_frame], axis=1)

    # Assign numeric labels to categorical values.
    # Plain column assignment replaces the column (and its dtype); with
    # `df.loc[:, col] = ...` recent pandas versions write in place and keep the
    # original object dtype, which turns `df.values` into an object array.
    df['cp_type'] = df['cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1})
    df['cp_dose'] = df['cp_dose'].map({'D1': 0, 'D2': 1})
    df['cp_time'] = df['cp_time'].map({24: 0, 48: 1, 72: 2})

    # Delete ID column
    del df['sig_id']

    return df

# Run the same preprocessing on the training table first, then the testing table
train_features, test_features = map(preprocess, (train_features, test_features))

In [None]:
# Preprocess training targets datasets: remove the ID column in place so that
# only the target columns remain
train_targets.drop(columns='sig_id', inplace=True)

# Predictive model

In [None]:
def create_model(n_features=None, n_targets=None, learning_rate=1e-4):
    """Build and compile a fresh, untrained multi-label classifier.

    The network is a Keras ``Sequential`` stack of three
    BatchNormalization -> Dropout -> weight-normalised Dense stages
    (600 and 300 ReLU units, then one sigmoid unit per target), compiled with
    Adam and binary cross-entropy.

    Parameters
    ----------
    n_features : int, optional
        Number of input features. Defaults to ``train_features.shape[1]``.
    n_targets : int, optional
        Number of output units. Defaults to ``train_targets.shape[1]``.
    learning_rate : float, optional
        Initial Adam learning rate (default 1e-4).

    Returns
    -------
    tf.keras.Sequential
        The compiled model.
    """
    # Fall back to the notebook-level tables when sizes are not given, which
    # keeps the zero-argument call `create_model()` working as before.
    if n_features is None:
        n_features = train_features.shape[1]
    if n_targets is None:
        n_targets = train_targets.shape[1]

    # Keras sequential neural network model
    model = tf.keras.Sequential([

        # Explicit shape tuple (one sample = a flat vector of n_features values)
        tf.keras.layers.Input(shape=(n_features,)),

        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tfa.layers.WeightNormalization(tf.keras.layers.Dense(600, activation="relu")),

        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),
        tfa.layers.WeightNormalization(tf.keras.layers.Dense(300, activation="relu")),

        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),
        tfa.layers.WeightNormalization(tf.keras.layers.Dense(n_targets, activation="sigmoid"))
        ])

    # Model compilation (`learning_rate` is the supported name; `lr` is deprecated)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy')

    return model

In [None]:
# Model structure: build one network and print its layer-by-layer summary
model = create_model()
model.summary()

In [None]:
# Model fitting

# Fitting variables
N_MODELS = 10   # number of cross-validation repeats, each with its own KFold seed
N_SPLITS = 5    # folds per repeat
EPOCHS = 50     # maximum epochs per fit (early stopping may end a fit sooner)
BATCHES = 128   # mini-batch size passed to `fit`
#SEED = 123

# Model seed 
#tf.random.set_seed(SEED)

# Model training tuning
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', 
    factor=0.1, 
    min_lr=1e-5, 
    patience=5, 
    verbose=1, 
    mode='min')    

early_stop = EarlyStopping(
    monitor="val_loss", 
    mode="min", 
    restore_best_weights=True, 
    patience= 5, 
    verbose = 1)

# Float accumulators for the summed predictions. Plain numpy arrays are used so
# that float predictions are never added into the integer-typed target columns.
target_columns = train_targets.columns
train_inputs = train_features.values
train_labels = train_targets.values
test_inputs = test_features.values
train_prediction_sum = np.zeros(train_labels.shape, dtype=float)
test_prediction_sum = np.zeros((test_inputs.shape[0], len(target_columns)), dtype=float)

# Dictionary for model recording 
model_records = dict()

# Fit 'N_MODELS' repeats of 'N_SPLITS' folds
for seed in range(N_MODELS):
    for n, (train, test) in enumerate(KFold(n_splits=N_SPLITS, random_state=seed, shuffle=True).split(train_targets)):
        print(f"Fold: {n+1}")

        # A new model per fold: reusing one model would let it train on rows
        # that serve as validation data in later folds, and would carry a
        # learning rate already lowered by ReduceLROnPlateau into the next fit.
        model = create_model()

        model_record = model.fit(
            train_inputs[train],
            train_labels[train],
            validation_data=(train_inputs[test], train_labels[test]),
            epochs=EPOCHS, 
            batch_size= BATCHES,
            callbacks=[reduce_lr, early_stop], 
            verbose=2)

        # Key by repeat AND fold so that no training history is overwritten
        model_records[f"model_record_{seed+1}_fold_{n+1}"] = model_record

        # Predict testing subset of training data (out-of-fold rows only)
        train_prediction_sum[test] += model.predict(train_inputs[test])

        # Predict external testing data
        test_prediction_sum += model.predict(test_inputs)

# Average all predictions:
# every training row is predicted once per repeat, every external test row
# once per fit (N_SPLITS fits per repeat).
train_predictions = pd.DataFrame(
    train_prediction_sum / N_MODELS,
    index=train_targets.index,
    columns=target_columns)
test_predictions.loc[:, target_columns] = test_prediction_sum / (N_SPLITS * N_MODELS)


# Predictive model performance

In [None]:
# Calculate log loss for training dataset predictions, one value per target column
metrics = [
    log_loss(train_targets.loc[:, column], train_predictions.loc[:, column])
    for column in train_targets.columns
]
print(f"Mean log loss for training dataset: {round(np.mean(metrics), 5)}")

# Features importance

In [None]:
# Supporting functions
def iter_shuffled(X, columns_to_shuffle=None, pre_shuffle=False, random_state=None):
    """Yield versions of ``X`` in which one column selection at a time is shuffled.

    The same working array is yielded on every step: it is modified before the
    ``yield`` and the touched columns are restored from ``X`` afterwards, so a
    consumer must use (or copy) each yielded array before asking for the next.

    Parameters
    ----------
    X : array
        2-D array indexed as ``X[:, columns]``; it is not modified.
    columns_to_shuffle : iterable, optional
        Column selections to process; defaults to every column index of ``X``.
    pre_shuffle : bool
        If True, the rows of a copy of ``X`` are shuffled once up front and
        each step takes its columns from that copy. Otherwise the selected
        column is shuffled anew on each step.
    random_state : int, RandomState or None
        Passed to ``check_random_state``.
    """
    rng = check_random_state(random_state)

    selections = range(X.shape[1]) if columns_to_shuffle is None else columns_to_shuffle

    row_permuted = None
    if pre_shuffle:
        row_permuted = X.copy()
        rng.shuffle(row_permuted)

    working = X.copy()
    for selection in tqdm(selections):
        if pre_shuffle:
            working[:, selection] = row_permuted[:, selection]
        else:
            rng.shuffle(working[:, selection])
        yield working
        # Undo this step so the next one starts again from the original values
        working[:, selection] = X[:, selection]


def get_score_importances(score_func, X, y, n_iter=5, columns_to_shuffle=None, random_state=None):
    """Measure how the score changes when feature columns are shuffled.

    Parameters
    ----------
    score_func : callable
        Called as ``score_func(X, y)``.
    X, y : arrays
        Data the score is evaluated on.
    n_iter : int
        Number of shuffling rounds.
    columns_to_shuffle : iterable, optional
        Forwarded to the shuffling helper.
    random_state : int, RandomState or None
        Passed to ``check_random_state``; the resulting generator is shared by
        all rounds.

    Returns
    -------
    tuple
        ``(base_score, scores_decreases)`` where ``base_score`` is the score
        on the unshuffled data and ``scores_decreases`` holds one list per
        round of ``base_score - shuffled_score`` values.
    """
    rng = check_random_state(random_state)
    base_score = score_func(X, y)

    scores_decreases = [
        _get_scores_shufled(
            score_func,
            X,
            y,
            base_score=base_score,
            columns_to_shuffle=columns_to_shuffle,
            random_state=rng,
        )
        for _ in range(n_iter)
    ]

    return base_score, scores_decreases


def _get_scores_shufled(score_func, X, y, base_score, columns_to_shuffle=None, random_state=None):
    """Return ``base_score - score`` for every shuffled variant of ``X``.

    The variants come from ``iter_shuffled``; each one is scored with
    ``score_func(variant, y)`` as soon as it is produced.
    """
    variants = iter_shuffled(X, columns_to_shuffle, random_state=random_state)
    return [-score_func(variant, y) + base_score for variant in variants]

def metric(y_true, y_pred):
    """Return the mean column-wise log loss between ``y_true`` and ``y_pred``.

    Only columns whose ``y_true`` values sum to more than 1 contribute; the
    remaining columns are skipped. Both arguments are indexed as 2-D arrays.
    """
    column_losses = [
        log_loss(y_true[:, column], y_pred[:, column])
        for column in range(y_pred.shape[1])
        if y_true[:, column].sum() > 1
    ]
    return np.mean(column_losses)

# Permutation importance accumulator: one entry per column of train_features
perm_imp = np.zeros(train_features.shape[1])

# Fit one model on the first fold only: the loop stops at the `break` below,
# so the remaining folds are never used.
for n, (train, test) in enumerate(KFold(n_splits=5, random_state=0, shuffle=True).split(train_targets)):
    print(f'Fold {n}')

    model = create_model()
    
    reduce_lr_loss = ReduceLROnPlateau(
        monitor='val_loss', 
        factor=0.1, 
        patience=5, 
        verbose=1, 
        min_delta=1e-4,  # `min_delta` is the current name of the deprecated `epsilon` argument
        mode='min')
    
    model.fit(
        train_features.values[train],
        train_targets.values[train],
        validation_data=(train_features.values[test], train_targets.values[test]),
        epochs=EPOCHS, 
        batch_size=BATCHES,
        callbacks=[reduce_lr_loss], 
        verbose=2
             )
        
    def _score(X, y):
        # get_score_importances reports `base_score - shuffled_score`, i.e. it
        # treats a higher score as better. Log loss is lower-is-better, so it
        # is negated: shuffling a feature the model relies on then yields a
        # positive importance.
        pred = model.predict(X)
        return -metric(y, pred)

    # Importances are measured on the held-out rows of this fold
    base_score, local_imp = get_score_importances(
        _score, 
        train_features.values[test], 
        train_targets.values[test], 
        n_iter=1, 
        random_state=0)
    
    # Average over shuffling rounds (a single round here)
    perm_imp += np.mean(local_imp, axis=0)
    print('')
    break

In [None]:
# Display the raw permutation importance array (one value per feature column)
perm_imp

In [None]:
# Pair every importance value with its feature name. `perm_imp` has one entry
# per column of train_features, in column order, so the full column list is
# used; skipping the first column would shift every name by one position.
perm_imp_dict = dict(zip(train_features.columns, perm_imp))

# Order the features from the largest to the smallest importance value
perm_imp_dict = dict(sorted(perm_imp_dict.items(), key=lambda x: x[1], reverse=True))

# Single-row table: one column per feature, in sorted order
perm_imp_df = pd.DataFrame(perm_imp_dict, index = pd.RangeIndex(1))

In [None]:
# Bar chart of the first 50 columns of perm_imp_df, i.e. the 50 features with
# the largest importance values (the table is sorted in descending order)
fig = px.bar(
    perm_imp_df.iloc[:,:50].transpose(),
    template="simple_white")

# Layout. After the transpose the feature names form the index, which
# plotly express places on the x axis; the importance values go on the y axis.
fig.update_layout(
    showlegend=False,     
    autosize=False,
    width=900,
    height=300, 
    margin={'l': 20, 'r': 20, 't':  20, 'b': 20},
    xaxis_title = "Features",
    yaxis_title = "Feature importance")
    

fig.show()

In [None]:
# Bar chart of the last 50 columns of perm_imp_df, i.e. the 50 features with
# the smallest importance values (the table is sorted in descending order).
# A negative slice is used so that exactly 50 features are selected whatever
# the total number of columns is.
fig = px.bar(
    perm_imp_df.iloc[:,-50:].transpose(),
    template="simple_white")

# Layout. After the transpose the feature names form the index, which
# plotly express places on the x axis; the importance values go on the y axis.
fig.update_layout(
    showlegend=False,     
    autosize=False,
    width=900,
    height=300, 
    margin={'l': 20, 'r': 20, 't':  20, 'b': 20},
    xaxis_title = "Features",
    yaxis_title = "Feature importance")
    

fig.show()

# Submission file

In [None]:
# Replace all MoA predictions for control perturbations in testing dataset with zeros
# (cp_type == 1 is the code assigned to 'ctl_vehicle' during preprocessing)
is_control = test_features['cp_type'] == 1
test_predictions.loc[is_control, train_targets.columns] = 0

In [None]:
# Display the final prediction table before it is written to disk
test_predictions

In [None]:
# Write the submission file without the dataframe index column
test_predictions.to_csv('submission.csv', index=False)

# Source material
- https://www.kaggle.com/elcaiseri/moa-keras-multilabel-classifier-nn-starter
- https://www.kaggle.com/stanleyjzheng/baseline-nn-with-k-folds