# Mechanisms of Action (MoA) Prediction

A TabNet architecture that performs correctly. The LB score can be pushed to 0.01833 by increasing the number of folds to 7.

The training is also done in this notebook, but I reach the same results when training outside the kernel using only the train + public train set.

Credit: 

PCA / Rank Gauss : [vbmokin](https://www.kaggle.com/vbmokin/moa-pytorch-rankgauss-pca-nn-upgrade-3d-visual)

Initial Tabnet : [optimo](https://www.kaggle.com/optimo/tabnetregressor-2-0-train-infer)

In [None]:
!pip install --no-index --find-links ../input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl pytorch-tabnet

In [None]:
import sys
sys.path.append('../input/iterativestratification')

import numpy as np
import random
import pandas as pd
import os
import copy
import gc

import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_selection import VarianceThreshold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import optuna
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.modules.loss import _WeightedLoss

import warnings
warnings.filterwarnings('ignore')

# Use the fully qualified option name: the bare 'max_columns' pattern is
# ambiguous on pandas versions that also define 'styler.render.max_columns'.
pd.set_option('display.max_columns', 2000)

from tqdm import tqdm

import glob

def sigmoid(x):
    """Apply the logistic function ``1 / (1 + exp(-x))`` to ``x``.

    The exponential is evaluated with ``np.exp``, so NumPy arrays are
    transformed element-wise and scalars yield a scalar.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def seed_everything(seed=42):
    """Seed every random number generator this notebook touches.

    Seeds Python's ``random``, sets ``PYTHONHASHSEED``, seeds NumPy and
    PyTorch (CPU and all CUDA devices), and configures cuDNN for
    deterministic behaviour.

    Parameters
    ----------
    seed : int
        Seed value applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; this call is
    # silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick algorithms by timing them, which can
    # differ between runs; turn it off so `deterministic` is effective.
    torch.backends.cudnn.benchmark = False
    
# Seed all RNGs once, before any data preparation or training runs.
seed_everything(seed=42)
 
def prep_data(n_comp_GENES, n_comp_CELLS, VarianceThreshold_for_FS):
    """Load the MoA CSV files and build the raw and feature-selected matrices.

    Pipeline, in order:

    1. Read train/test features and the scored targets from
       ``../input/lish-moa``.
    2. One-hot encode ``cp_dose`` (first level dropped) and reorder columns
       to genes (``g-*``), cells (``c-*``), then everything else.
    3. RankGauss every gene/cell column: a ``QuantileTransformer`` with a
       normal output distribution is fitted on train and applied to test.
    4. Fit PCA on train+test stacked, separately for genes and cells.
    5. Apply a ``VarianceThreshold`` on train+test stacked, then re-attach
       ``cp_type``, ``cp_time`` and ``cp_dose_D2``.

    Parameters
    ----------
    n_comp_GENES : int
        Number of PCA components computed from the gene columns.
    n_comp_CELLS : int
        Number of PCA components computed from the cell columns.
    VarianceThreshold_for_FS : float
        Variance cut-off; columns with variance not above it are removed.

    Returns
    -------
    tuple
        ``(Xtrain, Xtest, Xtrain_transformed, Xtest_transformed, target)``
        where the first two are the RankGauss-ed frames without PCA and
        the ``*_transformed`` frames are the variance-filtered versions
        including the PCA columns.
    """
    train_features = pd.read_csv('../input/lish-moa/train_features.csv')
    train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')
    n_train = train_features.shape[0]

    Xtrain = pd.get_dummies(train_features.drop('sig_id', axis=1), columns=['cp_dose'], drop_first=True)
    Xtest = pd.get_dummies(test_features.drop('sig_id', axis=1), columns=['cp_dose'], drop_first=True)

    target = train_targets_scored.drop('sig_id', axis=1)

    GENES = [col for col in train_features.columns if col.startswith('g-')]
    CELLS = [col for col in train_features.columns if col.startswith('c-')]

    # Column order becomes: genes, cells, remaining (cp_type, cp_time, ...).
    Xtrain = pd.concat([Xtrain[GENES], Xtrain[CELLS], Xtrain.drop(GENES + CELLS, axis=1)], axis=1)
    Xtest = pd.concat([Xtest[GENES], Xtest[CELLS], Xtest.drop(GENES + CELLS, axis=1)], axis=1)

    # RankGauss - transform to Gauss (fit on train, apply to test)
    for col in tqdm(GENES + CELLS):
        transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")
        Xtrain[col] = transformer.fit_transform(Xtrain[[col]])
        Xtest[col] = transformer.transform(Xtest[[col]])

    # PCA on the gene columns, fitted on train and test stacked.
    gene_pca_cols = [f'pca_G-{i}' for i in range(n_comp_GENES)]
    data = pd.concat([Xtrain[GENES], Xtest[GENES]])
    data2 = PCA(n_components=n_comp_GENES, random_state=42).fit_transform(data)
    Xtrain2 = pd.DataFrame(data2[:n_train], columns=gene_pca_cols)
    Xtest2 = pd.DataFrame(data2[n_train:], columns=gene_pca_cols)

    # PCA on the cell columns, fitted on train and test stacked.
    cell_pca_cols = [f'pca_C-{i}' for i in range(n_comp_CELLS)]
    data = pd.concat([Xtrain[CELLS], Xtest[CELLS]])
    data2 = PCA(n_components=n_comp_CELLS, random_state=42).fit_transform(data)
    Xtrain3 = pd.DataFrame(data2[:n_train], columns=cell_pca_cols)
    Xtest3 = pd.DataFrame(data2[n_train:], columns=cell_pca_cols)

    Xtrain_tot = pd.concat((Xtrain, Xtrain2, Xtrain3), axis=1)
    Xtest_tot = pd.concat((Xtest, Xtest2, Xtest3), axis=1)

    var_thresh = VarianceThreshold(VarianceThreshold_for_FS)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    data = pd.concat([Xtrain_tot, Xtest_tot])
    # NOTE(review): with the column order built above, this slice drops the
    # first two gene columns. Kept as-is for result parity; confirm intent.
    data = data.iloc[:, 2:]
    numeric = data.drop('cp_type', axis=1)
    data_transformed = var_thresh.fit_transform(numeric)
    kept_columns = numeric.columns[var_thresh.variances_ > VarianceThreshold_for_FS]

    Xtrain_transformed = pd.DataFrame(data_transformed[:len(Xtrain)], columns=kept_columns)
    Xtest_transformed = pd.DataFrame(data_transformed[len(Xtrain):], columns=kept_columns)

    # Re-attach the treatment descriptors regardless of the variance filter.
    for meta_col in ('cp_type', 'cp_time', 'cp_dose_D2'):
        Xtrain_transformed[meta_col] = Xtrain[meta_col]
        Xtest_transformed[meta_col] = Xtest[meta_col]

    return Xtrain, Xtest, Xtrain_transformed, Xtest_transformed, target
    
    
class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Targets are pulled towards 0.5 by ``smoothing`` before the loss is
    evaluated: ``t * (1 - smoothing) + 0.5 * smoothing``.

    Parameters
    ----------
    weight : torch.Tensor or None
        Optional rescaling weight forwarded to
        ``F.binary_cross_entropy_with_logits``.
    reduction : str
        ``'mean'`` or ``'sum'`` reduce the element-wise loss accordingly;
        any other value returns the unreduced element-wise loss.
    smoothing : float
        Smoothing factor; must satisfy ``0 <= smoothing < 1``.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        # The parent constructor already stores `weight` and `reduction`.
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets: torch.Tensor, n_labels: int, smoothing=0.0):
        """Return the smoothed targets; ``n_labels`` is accepted but unused."""
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        """Compute the smoothed BCE between logits ``inputs`` and ``targets``."""
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
                                           self.smoothing)
        # Ask for the unreduced loss: the functional defaults to
        # reduction='mean', which would make the 'sum' branch below a no-op
        # on an already-averaged scalar.
        loss = F.binary_cross_entropy_with_logits(inputs, targets, self.weight,
                                                  reduction='none')

        if self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()

        return loss
    
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed one-hot distribution.

    The true class receives probability ``1 - smoothing``; the remaining
    ``smoothing`` mass is split evenly across the other ``classes - 1``
    classes.
    """

    def __init__(self, classes, smoothing=0.0, dim=-1):
        super().__init__()
        self.cls = classes
        self.dim = dim
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing

    def forward(self, pred, target):
        """Return the mean smoothed cross-entropy of ``pred`` vs ``target``.

        ``pred`` holds unnormalised class scores and ``target`` holds the
        integer class index of each row.
        """
        log_probs = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            off_value = self.smoothing / (self.cls - 1)
            soft_targets = torch.zeros_like(log_probs).fill_(off_value)
            # Overwrite the true-class column of each row with the confidence.
            class_index = target.data.unsqueeze(1)
            soft_targets.scatter_(1, class_index, self.confidence)
        per_sample = torch.sum(-soft_targets * log_probs, dim=self.dim)
        return torch.mean(per_sample)

def score(y, yp):
    """Return the mean binary log loss of probabilities ``yp`` against ``y``.

    A small constant is added inside each logarithm to avoid ``log(0)``.
    """
    eps = 10 ** (-8)
    log_likelihood = y * np.log(yp + eps) + (1 - y) * np.log(1 - yp + eps)
    return -np.mean(log_likelihood)

def run_k_fold(folds, seed, Xtrain, Xtest, target, train):
    """Run training once per distinct fold id and aggregate the results.

    Out-of-fold frames are concatenated and test predictions are averaged
    across folds. ``models`` is returned as created (an empty list); nothing
    is ever appended to it.

    NOTE(review): another ``run_k_fold`` is defined further down in this
    file; being bound later, it replaces this definition when the cell runs.
    NOTE(review): the call below passes seven arguments and unpacks three
    values, whereas the ``run_training`` defined in this file takes six
    parameters and returns two values, so this version looks stale.
    """

    oof = []
    predictions = []
    models = []
    for fold in range(len(np.unique(folds))):

        oof_, pred_ , model = run_training(folds, fold, seed, Xtrain, Xtest, target, train)

        predictions.append(pred_)
        oof.append(oof_)

    # Stack the per-fold OOF frames and average the test predictions.
    oof = pd.concat(oof)
    predictions = np.mean(predictions,axis=0)
    return oof, predictions, models

def run_training(fold, seed, Xtrain, Xtest, target, tabnet_params):
    """Train one TabNet model for a single fold; return OOF and test outputs.

    Besides its arguments this function reads the module-level names
    ``folds`` (per-row fold id aligned with ``Xtrain``), ``EPOCHS``,
    ``BATCHSIZE`` and ``GBATCHSIZE``; they must be defined before calling.

    Parameters
    ----------
    fold : fold id used as the validation split.
    seed : seed passed to ``seed_everything`` before training.
    Xtrain, Xtest : feature frames that still contain a ``cp_type`` column.
    target : target frame indexed like ``Xtrain``.
    tabnet_params : dict of keyword arguments for ``TabNetRegressor``.

    Returns
    -------
    (oof, predictions)
        ``oof`` is a DataFrame of ``model.predict`` outputs for the
        validation rows (indexed by their original row labels);
        ``predictions`` holds ``model.predict`` outputs for every test row.
        The caller in this file applies ``sigmoid`` to the test outputs.
    """
    seed_everything(seed)

    # Train on every row outside the fold; validate only on treated samples
    # ('trt_cp') of the held-out fold.
    x_train = Xtrain[folds != fold].drop('cp_type', axis=1)
    x_valid = Xtrain[(folds == fold) & (Xtrain.cp_type == 'trt_cp')].drop('cp_type', axis=1)
    y_train = target.loc[x_train.index]
    y_valid = target.loc[x_valid.index]

    # Remember the validation row labels before dropping down to ndarrays.
    iddx = y_valid.index

    x_train, x_valid, y_train, y_valid = x_train.values, x_valid.values, y_train.values, y_valid.values

    model = TabNetRegressor(**tabnet_params)
    model.fit(X_train=x_train,
              y_train=y_train,
              eval_set=[(x_valid, y_valid)],
              eval_name=["val"],
              # "logits_ll" is the name declared by LogitsLogLoss in this file.
              eval_metric=["logits_ll"],
              max_epochs=EPOCHS,
              patience=20, batch_size=BATCHSIZE, virtual_batch_size=GBATCHSIZE,
              num_workers=1, drop_last=False,
              # use binary cross entropy as this is not a regression problem
              loss_fn=SmoothBCEwLogits(smoothing=0.001))

    x_test = Xtest.drop('cp_type', axis=1).values

    # Out-of-fold outputs for the validation rows, then outputs for the test set.
    oof = model.predict(x_valid)
    oof = pd.DataFrame(oof, index=iddx)
    predictions = model.predict(x_test)
    return oof, predictions

def run_k_fold(NFOLDS, seed, Xtrain, Xtest, target, tabnet_params):
    """Train one model per distinct value in ``NFOLDS`` and aggregate.

    ``NFOLDS`` is treated as a collection of fold ids: each unique value is
    handed to ``run_training`` as the validation fold.

    Returns
    -------
    (oof, predictions)
        ``oof`` is the concatenation of the per-fold out-of-fold frames,
        sorted by index; ``predictions`` is the element-wise mean of the
        per-fold test predictions.
    """
    fold_outputs = [
        run_training(fold_id, seed, Xtrain, Xtest, target, tabnet_params)
        for fold_id in np.unique(NFOLDS)
    ]
    oof_frames = [fold_oof for fold_oof, _ in fold_outputs]
    test_preds = [fold_pred for _, fold_pred in fold_outputs]

    oof = pd.concat(oof_frames).sort_index()
    predictions = np.mean(test_preds, axis=0)
    return oof, predictions

def make_all_preds(Xtrain_transformed, Xtest_transformed, target, folds, train):
    """Run the k-fold training and return index-sorted OOF plus test outputs.

    ``train`` is the TabNet parameter dict forwarded to ``run_k_fold``.
    The seed is taken from the module-level ``seed`` variable, not from an
    argument.
    """
    oof, predictions = run_k_fold(
        folds, seed, Xtrain_transformed, Xtest_transformed, target, train
    )
    oof = oof.sort_index()
    return oof, predictions

from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor

class LogitsLogLoss(Metric):
    """
    LogLoss with sigmoid applied
    """

    def __init__(self):
        self._name = "logits_ll"
        self._maximize = False

    def __call__(self, y_true, y_pred):
        """
        Compute LogLoss of predictions.

        Parameters
        ----------
        y_true: np.ndarray
            Target matrix or vector
        y_pred: np.ndarray
            Raw (pre-sigmoid) score matrix or vector

        Returns
        -------
            float
            LogLoss of predictions vs targets.
        """
        # Map raw scores to probabilities before taking logs.
        probs = 1 / (1 + np.exp(-y_pred))
        # 1e-15 keeps both logarithms finite when a probability hits 0 or 1.
        aux = (1 - y_true) * np.log(1 - probs + 1e-15) + y_true * np.log(probs + 1e-15)
        return np.mean(-aux)

In [None]:
# --- Feature-engineering settings (all three are passed to prep_data) ---
n_comp_GENES = 450
n_comp_CELLS = 65
VarianceThreshold_for_FS = 0.9
# --- TabNet architecture / regularisation (forwarded via tabnet_params) ---
n_d = 32
n_a = 256
n_steps = 1
gamma = 1.5
lambda_sparse = 1e-7
# --- Optimiser settings (forwarded via optimizer_params below) ---
LEARNING_RATE = 8e-3
wd = 1e-5
# NOTE(review): div_factor is not referenced anywhere else in the visible code.
div_factor = 10
# Read as globals by run_training: max_epochs, batch_size, virtual_batch_size.
EPOCHS = 500
BATCHSIZE = 248
GBATCHSIZE = 64
# Runs the full data preparation (reads the CSV files) when this cell executes.
Xtrain, Xtest, Xtrain_transformed, Xtest_transformed, target  = prep_data(n_comp_GENES, n_comp_CELLS, VarianceThreshold_for_FS)
# NOTE(review): BATCHES is not referenced anywhere else in the visible code.
BATCHES = int(len(Xtrain)/BATCHSIZE)
SEED = 0
n_independent= 1
n_shared = 0 

GROUP = 'tabnet'
# Keyword arguments unpacked into TabNetRegressor(**tabnet_params) by run_training.
tabnet_params = dict(
    n_d=n_d, 
    n_a=n_a, 
    n_steps=n_steps,
    n_independent = n_independent,
    gamma=gamma,
    lambda_sparse=lambda_sparse, 
    n_shared = n_shared,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=LEARNING_RATE, weight_decay=wd),
    mask_type='entmax',
    # Settings for the ReduceLROnPlateau scheduler selected on the next line.
    scheduler_params=dict(mode = "min", patience = 7, min_lr = 1e-5, factor = 0.07),
    scheduler_fn=optim.lr_scheduler.ReduceLROnPlateau,
    verbose=10,
    )

In [None]:
# Train the full k-fold pipeline once per seed and collect the test
# probabilities of each run for later averaging.
preds_tabnet = []
GROUP = 'tabnet_4'
seed = 42
hidden_size = 0
NFOLDS = 5
for SEED in [1,2,3,4,5]:

    # make_all_preds reads the module-level `seed`; rebind it so every pass
    # really trains with its own seed (previously SEED was never used and
    # all passes ran with seed 42).
    seed = SEED

    # Per-row fold id, filled from the multilabel-stratified validation splits.
    folds = pd.Series(np.zeros(len(Xtrain)), index = Xtrain.index)
    mskf = MultilabelStratifiedKFold(n_splits=NFOLDS)

    for f, (_, v_idx) in enumerate(mskf.split(X=Xtrain, y=target)):
        folds.loc[v_idx] = int(f)

    # NOTE(review): the untransformed Xtrain/Xtest are used here, not the
    # Xtrain_transformed/Xtest_transformed frames from prep_data - confirm.
    oof, preds = make_all_preds(Xtrain, Xtest, target, folds, train = tabnet_params)
    # Model outputs are raw scores; convert to probabilities before storing.
    preds_tabnet.append(sigmoid(preds))

In [None]:
# Average the per-seed probability matrices into one final prediction matrix.
preds_tabnet = np.mean(preds_tabnet,axis=0)

## Submission

In [None]:
# Build the submission frame: one row per test sample, one column per target.
test_features = pd.read_csv('../input/lish-moa/test_features.csv')
submit = pd.DataFrame(
    preds_tabnet,
    columns=target.columns,
    index=Xtest.index,
)

# Rows that are not 'trt_cp' get all-zero predictions.
not_treated = Xtest.cp_type != 'trt_cp'
submit.loc[not_treated] = 0

# Use sig_id as the index so reset_index() turns it into the first column.
submit.index = test_features.sig_id
submit = submit.reset_index()
submit.to_csv('submission.csv', index=False)