# Prep the environment

In [None]:
# Each line installs one pinned wheel file stored under ../input/nnmodels.
!pip install ../input/nnmodels/joblib-1.0.1-py3-none-any.whl
!pip install ../input/nnmodels/threadpoolctl-2.2.0-py3-none-any.whl
!pip install ../input/nnmodels/scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl
!pip install ../input/nnmodels/tqdm-4.62.0-py2.py3-none-any.whl
!pip install ../input/nnmodels/numpy-1.21.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
!pip install ../input/nnmodels/scipy-1.7.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl
!pip install ../input/nnmodels/torch-1.9.0-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/nnmodels/pytorch_tabnet-3.1.1-py3-none-any.whl

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#TF stuff
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
import keras_tuner as kt
from tensorflow import keras


#XGBoost
from xgboost import XGBClassifier

#Scikit
from sklearn.metrics import log_loss
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, log_loss

#TabNet
import torch
from torch.nn.modules.loss import _WeightedLoss
from torch.nn import BCEWithLogitsLoss
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.multitask import TabNetMultiTaskClassifier
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau


# Scipy
from scipy.special import expit, logit

In [None]:
from sklearn.preprocessing import normalize 

# Reading and understanding the data

Main training and target data components

In [None]:
# Load the training inputs and the scored targets from the competition folder.
train_features_path = '../input/lish-moa/train_features.csv'
train_targets_path = '../input/lish-moa/train_targets_scored.csv'
train_features = pd.read_csv(train_features_path)
train_targets = pd.read_csv(train_targets_path)

Encoding the categorical variables

In [None]:
# One-hot encode the two categorical columns; drop_first keeps a single
# indicator column per (binary) category instead of two redundant ones.
categorical_columns = ['cp_type', 'cp_dose']
train_features_enc = pd.get_dummies(
    train_features,
    columns=categorical_columns,
    drop_first=True,
)
print(train_features_enc.head())

Define the independent variables set **X** and the dependent one **y**


In [None]:
# Everything except the first column of each frame is used as model data:
# X holds the encoded features, y holds the targets.
feature_frame = train_features_enc.iloc[:, 1:]
target_frame = train_targets.iloc[:, 1:]
X = feature_frame.to_numpy()
y = target_frame.to_numpy()

Applying PCA on the independent variables X

In [None]:
# Full-rank PCA, fitted only to obtain the cumulative explained-variance curve.
pca_full = PCA()
pca_full.fit(X)

cum_sum_sv2 = np.cumsum(pca_full.explained_variance_ratio_)

# Reduced PCA used for the actual features. random_state is pinned (same seed
# value used elsewhere in this notebook) because svd_solver='auto' may select
# the randomized solver for an input of this size, which would otherwise make
# X_trim differ from run to run.
pca = PCA(n_components = 50, random_state = 1010)
X_trim = pca.fit_transform(X)
#Concatenation of the PCA features and the original ones
X_pca = np.concatenate([X_trim, X],axis=1)

Preprocessing of the data: Normalization of the independent variables

In [None]:
# Scale every column (axis=0) by its 'max' norm, for the raw features and for
# the PCA-augmented features alike.
column_max_norm = dict(axis=0, norm='max')
X_norm = normalize(X, **column_max_norm)
X_norm_pca = normalize(X_pca, **column_max_norm)

**Adding useful functions**


In [None]:
# A convenient plotting function as we train models
def plot_hist(hist, last = None):
    """Plot the training loss curve and, when present, the validation loss curve.

    Parameters
    ----------
    hist : object
        Must expose a ``history`` dict with a "loss" list; a "val_loss" list
        is plotted too if the dict contains one (it is absent when the model
        was fitted without validation data).
    last : int or None
        Number of trailing epochs to show. ``None`` shows every epoch.
    """
    train_loss = hist.history["loss"]
    if last is None:
        last = len(train_loss)

    legend_labels = ["train"]
    plt.plot(train_loss[-last:])

    # Only draw the validation curve when it was actually recorded.
    val_loss = hist.history.get("val_loss")
    if val_loss is not None:
        plt.plot(val_loss[-last:])
        legend_labels.append("validation")

    # The curves are losses, so label them as such.
    plt.title("model loss")
    plt.ylabel("loss")
    plt.xlabel("epoch")
    plt.legend(legend_labels, loc="upper left")
    plt.show()

# Models implementation

### Model 1: 4-layer neural network

In [None]:
def l4_model(input_shape, no_classes, lr):
    """Build and compile a feed-forward network with four hidden blocks.

    Each hidden block is Dense(128, sigmoid) -> BatchNormalization ->
    Dropout(0.2). The output layer is a sigmoid Dense layer with
    ``no_classes`` units.

    Parameters
    ----------
    input_shape : shape passed to ``tf.keras.Input``.
    no_classes : number of units in the output layer.
    lr : learning rate handed to the Adam optimizer.

    Returns
    -------
    The compiled ``tf.keras.Model`` (binary cross-entropy loss and metric).
    """
    inputs = tf.keras.Input(shape=input_shape)

    hidden = inputs
    for _ in range(4):
        hidden = layers.Dense(128, activation='sigmoid')(hidden)
        hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Dropout(0.2)(hidden)

    outputs = layers.Dense(no_classes, activation='sigmoid')(hidden)
    model = tf.keras.Model(inputs, outputs)

    adam = tf.keras.optimizers.Adam(learning_rate = lr)
    model.compile(
        loss='binary_crossentropy',
        optimizer=adam,
        metrics=['binary_crossentropy'],
    )
    return model

### CV setup for NN
# Disabled: the code below sits inside a bare triple-quoted string, so it is
# evaluated as a string expression and none of it runs.
# As written it would: split X_norm_pca into 10 folds, drop training rows where
# column -2 equals 0 (the "control vehicle" mask), fit l4_model per fold, set
# the predictions of such rows in the fold's held-out split to 0, and report
# the per-fold and average log loss.
"""
losses_NN=[]
kf = KFold(n_splits=10)
tf.random.set_seed(1010)
np.random.seed(1010)

for train_index, test_index in kf.split(X_norm_pca):
    X_train, X_test = X_norm_pca[train_index], X_norm_pca[test_index]
    y_train, y_test = y[train_index], y[test_index]

    control_vehicle_mask = X_train[:,-2] == 0
    X_train = X_train[~control_vehicle_mask,:]
    y_train = y_train[~control_vehicle_mask]

    nnclf = l4_model((925,),206,0.0005)
    hist = nnclf.fit(X_train, y_train, batch_size=512, epochs=50, validation_data=(X_test, y_test), verbose=0)
    plot_hist(hist, last = 20)

    preds = nnclf.predict(X_test) # list of preds per class

    control_mask = X_test[:,-2]==0
    preds[control_mask] = 0

    loss = log_loss(np.ravel(y_test), np.ravel(preds))
    print('Loss: '+str(loss))
    losses_NN.append(loss)

print('Average Loss: '+str(np.average(losses_NN))) 
"""

### Model 2: Residual neural network with three layers and fixed parameters

In [None]:
def l3_res_model(input_shape, no_classes, lr):
    """Build and compile a three-block network whose block outputs are summed.

    Each block is Dense(128, sigmoid) -> BatchNormalization -> Dropout(0.2),
    chained one after another. The outputs of all three blocks are added
    element-wise and the sum feeds a sigmoid Dense layer with ``no_classes``
    units.

    Parameters
    ----------
    input_shape : shape passed to ``tf.keras.Input``.
    no_classes : number of units in the output layer.
    lr : learning rate handed to the Adam optimizer.

    Returns
    -------
    The compiled ``tf.keras.Model`` (binary cross-entropy loss and metric).
    """
    inputs = tf.keras.Input(shape=input_shape)

    block_outputs = []
    current = inputs
    for _ in range(3):
        current = layers.Dense(128, activation='sigmoid')(current)
        current = layers.BatchNormalization()(current)
        current = layers.Dropout(0.2)(current)
        block_outputs.append(current)

    # Skip connections: every block's output contributes to the final sum.
    tot_op = tf.keras.layers.add(block_outputs)
    outputs = layers.Dense(no_classes, activation='sigmoid')(tot_op)
    model = tf.keras.Model(inputs, outputs)

    adam = tf.keras.optimizers.Adam(learning_rate = lr)
    model.compile(
        loss='binary_crossentropy',
        optimizer=adam,
        metrics=['binary_crossentropy'],
    )
    return model

### CV setup for NN
# Disabled: the code below sits inside a bare triple-quoted string, so it is
# evaluated as a string expression and none of it runs.
# As written it would repeat the 10-fold procedure on X_norm_pca with
# l3_res_model: drop training rows where column -2 equals 0, fit per fold,
# set the predictions of such rows in the held-out split to 0, and report the
# per-fold and average log loss.
"""
losses_NN=[]
kf = KFold(n_splits=10)
tf.random.set_seed(1010)
np.random.seed(1010)

for train_index, test_index in kf.split(X_norm_pca):
    X_train, X_test = X_norm_pca[train_index], X_norm_pca[test_index]
    y_train, y_test = y[train_index], y[test_index]

    control_vehicle_mask = X_train[:,-2] == 0
    X_train = X_train[~control_vehicle_mask,:]
    y_train = y_train[~control_vehicle_mask]

    nnclf = l3_res_model((925,),206,0.0005)
    hist = nnclf.fit(X_train, y_train, batch_size=512, epochs=50, validation_data=(X_test, y_test), verbose=0)
    plot_hist(hist, last=20)

    preds = nnclf.predict(X_test) # list of preds per class

    control_mask = X_test[:,-2]==0
    preds[control_mask] = 0

    loss = log_loss(np.ravel(y_test), np.ravel(preds))
    print('Loss: '+str(loss))
    losses_NN.append(loss)

print('Average Loss: '+str(np.average(losses_NN))) 
"""

### TabNet Classifier

In [None]:
# helper class that manually implements logloss for TabNet
class LogitsLogLoss(Metric):
    """Log-loss metric computed from raw (pre-sigmoid) model outputs.

    Registered under the name "logits_ll"; lower values are better
    (``_maximize`` is False).
    """

    def __init__(self):
        self._maximize = False
        self._name = "logits_ll"

    def __call__(self, y_true, y_pred):
        """Return the mean binary cross-entropy of ``expit(y_pred)`` against ``y_true``."""
        eps = 1e-15  # keeps the logarithms finite when a probability hits 0 or 1
        probs = expit(y_pred)
        negative_term = (1 - y_true) * np.log(1 - probs + eps)
        positive_term = y_true * np.log(probs + eps)
        return np.mean(-(negative_term + positive_term))

In [None]:
### CV setup for TabNet
# Disabled: the code below sits inside a bare triple-quoted string, so it is
# evaluated as a string expression and none of it runs.
# As written it would: split X_norm_pca into 3 folds, drop training rows where
# column -2 equals 0, fit a TabNetRegressor with BCEWithLogitsLoss and the
# "logits_ll" evaluation metric, pass the predictions through expit, set the
# predictions of such rows in the held-out split to 0, and report the per-fold
# and average log loss.
"""
losses_tabnet=[]
kf = KFold(n_splits=3)
tf.random.set_seed(1010)
np.random.seed(1010)

for train_index, test_index in kf.split(X_norm_pca):
    X_train_cv, X_test_cv = X_norm_pca[train_index], X_norm_pca[test_index]
    y_train_cv, y_test_cv = y[train_index], y[test_index]

    control_vehicle_mask = X_train_cv[:,-2] == 0
    X_train_cv = X_train_cv[~control_vehicle_mask,:]
    y_train_cv = y_train_cv[~control_vehicle_mask]
    
    clf = TabNetRegressor(optimizer_fn = optim.Adam,
      optimizer_params = dict(lr = 2e-2, weight_decay = 1e-5),
      n_steps = 1, gamma = 1.3, lambda_sparse = 0, n_d = 32, n_a = 32,
      scheduler_params = dict(mode = "min", patience = 5, min_lr = 1e-5, factor = 0.9),
      scheduler_fn = ReduceLROnPlateau, verbose=10, seed=1010)

    clf.fit(
      X_train = X_train_cv,
      y_train = y_train_cv,
      eval_set = [(X_test_cv, y_test_cv)],
      eval_name = ["val"],
      eval_metric = ["logits_ll"],
      max_epochs = 200,
      patience = 50,
      batch_size = 1024, 
      virtual_batch_size = 32,
      num_workers = 1,
      loss_fn = BCEWithLogitsLoss()
    )
    preds_val = clf.predict(X_test_cv)
    preds = expit(preds_val)

    control_mask = X_test_cv[:,-2]==0
    preds[control_mask] = 0

    loss = log_loss(np.ravel(y_test_cv), np.ravel(preds))
    print('Loss: '+str(loss))
    losses_tabnet.append(loss)

print('Average Loss: '+str(np.average(losses_tabnet)))
"""

# Prediction over the whole dataset

In [None]:
# Seed TensorFlow and NumPy before any model is built. The only other seeding
# calls in this notebook are inside the disabled CV strings, so without these
# two lines the Keras models below would be trained unseeded.
tf.random.set_seed(1010)
np.random.seed(1010)

# NOTE(review): the disabled CV code drops rows with column -2 == 0 from the
# training data and zeroes their predictions; the fits below use every row.
# Confirm which behaviour is intended.

#L4 simple NN 
nnclf_l4 = l4_model((925,),206,0.0005)
hist_l4 = nnclf_l4.fit(X_norm_pca, y, batch_size=512, epochs=50, verbose=0)
preds_l4 = nnclf_l4.predict(X_norm_pca) # predictions on the training data itself

#L3 Residual
nnclfl3_r = l3_res_model((925,),206,0.0005)
hist_l3_r = nnclfl3_r.fit(X_norm_pca, y, batch_size=512, epochs=50, verbose=0)
preds_l3_r = nnclfl3_r.predict(X_norm_pca) # predictions on the training data itself

#Tab net
clf = TabNetRegressor(optimizer_fn = optim.Adam,
  optimizer_params = dict(lr = 2e-2, weight_decay = 1e-5),
  n_steps = 1, gamma = 1.3, lambda_sparse = 0, n_d = 32, n_a = 32,
  scheduler_params = dict(mode = "min", patience = 5, min_lr = 1e-5, factor = 0.9),
  scheduler_fn = ReduceLROnPlateau, verbose=10, seed=1010)

# No eval_set is supplied here, so the model is fitted on the whole dataset
# with nothing held out.
clf.fit(
  X_train = X_norm_pca,
  y_train = y,
  eval_metric = ["logits_ll"],
  max_epochs = 200,
  patience = 50,
  batch_size = 1024, 
  virtual_batch_size = 32,
  num_workers = 1,
  loss_fn = BCEWithLogitsLoss()
)
# The network is trained with BCEWithLogitsLoss, so its raw outputs are passed
# through the logistic function to obtain probabilities.
preds_val = clf.predict(X_norm_pca)
preds_tabnet = expit(preds_val)



In [None]:
# Equal-weight blend of the three models' predictions on the training data,
# followed by the log loss of that blend against the training targets.
ensemble_weights = (1/3, 1/3, 1/3)
a_opt, b_opt, c_opt = ensemble_weights
preds_ensemble = a_opt*preds_l4 + b_opt*preds_l3_r + c_opt*preds_tabnet

y_flat = np.ravel(y)
preds_ensemble_flat = np.ravel(preds_ensemble)
log_loss(y_flat, preds_ensemble_flat)

# Final prediction

In [None]:
#Getting Data
test_features = pd.read_csv('../input/lish-moa/test_features.csv')
test_features_enc = pd.get_dummies(test_features, columns=['cp_type', 'cp_dose'], drop_first=True)

# The fitted PCA and the models address features by position, so the encoded
# test frame must have exactly the training columns, in the same order. Fail
# loudly here rather than feeding misaligned features to the models.
assert list(test_features_enc.columns) == list(train_features_enc.columns), \
    "encoded test columns do not match the encoded training columns"

X_test = test_features_enc.iloc[:,1:].to_numpy()


#PCA (reuses the projection fitted on the training data)
X_trim_test = pca.transform(X_test)

#Concatenation of the PCA features and the original ones
X_pca_test = np.concatenate([X_trim_test, X_test],axis=1)

#Data normalization
# Scale the test columns with the norms of the TRAINING columns. Normalizing
# the test matrix by its own column maxima would put it on a different scale
# than the data the models were fitted on. return_norm=True gives back the
# exact per-column norms that normalize() applied to the training matrices.
train_norms = normalize(X, axis=0, norm='max', return_norm=True)[1]
train_norms_pca = normalize(X_pca, axis=0, norm='max', return_norm=True)[1]
X_norm_test = X_test / train_norms
X_norm_pca_test = X_pca_test / train_norms_pca


In [None]:
# Score the prepared test matrix with each of the three fitted models.
preds_l4_test = nnclf_l4.predict(X_norm_pca_test)        # 4-layer network
preds_l3_r_test = nnclfl3_r.predict(X_norm_pca_test)     # residual network

# TabNet returns raw outputs; expit maps them to probabilities.
preds_val_test = clf.predict(X_norm_pca_test)
preds_tabnet_test = expit(preds_val_test)

# Ensemble the model results as the equally weighted mean of the three models
a_opt, b_opt, c_opt = 1/3,1/3,1/3
weighted_l4_test = a_opt*preds_l4_test
weighted_l3_r_test = b_opt*preds_l3_r_test
weighted_tabnet_test = c_opt*preds_tabnet_test
preds_ensemble_test = weighted_l4_test + weighted_l3_r_test + weighted_tabnet_test

In [None]:
# Assemble the submission: one row per test sig_id, one column per target
# (every column of train_targets except the first), then write it to disk.
target_columns = train_targets.columns[1:]
test_ids = test_features["sig_id"].values
submit_df = pd.DataFrame(preds_ensemble_test, columns=target_columns, index=test_ids)
submit_df.index.name = "sig_id"
submit_df.to_csv("submission.csv")