In [None]:
#!pip install --no-index --find-links /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl pytorch-tabnet

In [None]:
import pickle

In [None]:
import sys
sys.path.append('../input/customtabnet')
from pytorch_tabnet_custom.tab_model import TabNetRegressor as TabNetRegressorCustom

In [None]:
import sys
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns

from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.preprocessing import QuantileTransformer

In [None]:
# List the contents of the ../input/lish-moa input directory (the cell displays the result).
os.listdir('../input/lish-moa')

In [None]:
# Load the raw tables from ../input/lish-moa.
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0.
# Squeezing the single remaining column after the read yields the same
# sig_id-indexed Series on both old and new pandas.
groups = pd.read_csv("../input/lish-moa/train_drug.csv", index_col="sig_id").squeeze("columns")

test_features = pd.read_csv('../input/lish-moa/test_features.csv')
sample_submission = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [None]:
# Feature-name lists, selected by column prefix: 'g-' columns and 'c-' columns.
GENES = list(filter(lambda name: name.startswith('g-'), train_features.columns))
CELLS = list(filter(lambda name: name.startswith('c-'), train_features.columns))

In [None]:
import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection._split import _BaseKFold


class MultilabelStratifiedGroupKFold(_BaseKFold):
    """Multilabel-stratified K-fold that keeps "regular" groups inside one fold.

    Groups whose row count is one of ``regular_sizes`` are assigned to folds as
    a whole, stratified on the per-group mean of ``y``. Rows belonging to any
    other group are stratified individually, so such a group may be spread over
    several folds.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds.
    random_state : int or None, default=None
        Forwarded to the underlying ``MultilabelStratifiedKFold``.
    shuffle : bool, default=False
        Forwarded to the underlying ``MultilabelStratifiedKFold``.
    regular_sizes : iterable of int, default=(6, 12, 18)
        Group sizes that are kept together. The default reproduces the sizes
        that were previously hard-coded.
    """

    def __init__(self, n_splits=5, random_state=None, shuffle=False, regular_sizes=(6, 12, 18)):
        super().__init__(n_splits=n_splits, random_state=random_state, shuffle=shuffle)
        self.regular_sizes = regular_sizes

    def _iter_test_indices(self, X=None, y=None, groups=None):
        """Yield, for each fold, the positional indices of its test rows.

        ``y`` and ``groups`` must be pandas objects sharing the same index:
        ``y`` is grouped by ``groups`` and row labels of ``y`` are mapped back
        onto ``groups``.

        Raises
        ------
        ValueError
            If ``y`` or ``groups`` is not provided.
        """
        if y is None or groups is None:
            raise ValueError("MultilabelStratifiedGroupKFold requires both 'y' and 'groups'.")

        cv = MultilabelStratifiedKFold(
            n_splits=self.n_splits,
            random_state=self.random_state,
            shuffle=self.shuffle,
        )

        group_sizes = groups.value_counts()
        is_regular = group_sizes.isin(list(self.regular_sizes))
        regular_groups = group_sizes[is_regular].index.sort_values()
        irregular_groups = group_sizes[~is_regular].index.sort_values()

        # Regular groups: one row per group (mean target vector), so the whole
        # group lands in a single fold. numeric_only=True makes the dropping of
        # non-numeric columns explicit instead of relying on old pandas
        # silently discarding them.
        group_to_fold = {}
        group_targets = y.groupby(groups).mean(numeric_only=True).loc[regular_groups]
        if len(group_targets):
            for fold, (_, test) in enumerate(cv.split(group_targets, group_targets)):
                group_to_fold.update({group: fold for group in group_targets.index[test]})

        # Everything else is stratified row by row.
        sample_to_fold = {}
        sample_targets = y.loc[groups.isin(irregular_groups)]
        if len(sample_targets):
            for fold, (_, test) in enumerate(cv.split(sample_targets, sample_targets)):
                sample_to_fold.update({sample: fold for sample in sample_targets.index[test]})

        # Rows of regular groups get their group's fold; the remaining rows
        # (NaN after the group mapping) get their individual fold.
        folds = groups.map(group_to_fold)
        is_na = folds.isna()
        folds[is_na] = folds[is_na].index.map(sample_to_fold).values

        for i in range(self.n_splits):
            yield np.where(folds == i)[0]

First, I check the distributions of the g-* and c-* features in the train and test sets. They are spiky rather than normally distributed, and they have the same shape in both the train and the test set.

**train set before using RankGauss**

In [None]:
# Preview the distributions of the first (up to) 7 g-* features of the train set.
gnum = train_features[GENES].shape[1]
graphs = []

# Materialise the feature matrix once instead of rebuilding it for every panel.
gene_values = train_features[GENES].values
# Bounded by the real column count rather than a hard-coded 771; the original
# outer loop always stopped after its first pass, so only one row of panels
# was ever drawn.
n_panels = min(7, gnum)

fig, axs = plt.subplots(1, 7, sharey=True)
for k in range(n_panels):
    graph = sns.distplot(gene_values[:, k], ax=axs[k])
    graph.set_title(f"g-{k}")
    graphs.append(graph)

In [None]:
# Preview the distributions of the first (up to) 7 c-* features of the train set.
cnum = train_features[CELLS].shape[1]
graphs = []

# Materialise the feature matrix once instead of rebuilding it for every panel.
cell_values = train_features[CELLS].values
# Bounded by the real column count rather than a hard-coded 100; the original
# outer loop always stopped after its first pass, so only one row of panels
# was ever drawn.
n_panels = min(7, cnum)

fig, axs = plt.subplots(1, 7, sharey=True)
for k in range(n_panels):
    graph = sns.distplot(cell_values[:, k], ax=axs[k])
    graph.set_title(f"c-{k}")
    graphs.append(graph)

**test set before using RankGauss**

In [None]:
# Preview the distributions of the first (up to) 7 g-* features of the test set.
gnum = test_features[GENES].shape[1]
graphs = []

# Materialise the feature matrix once instead of rebuilding it for every panel.
gene_values = test_features[GENES].values
# Bounded by the real column count rather than a hard-coded 771; the original
# outer loop always stopped after its first pass, so only one row of panels
# was ever drawn.
n_panels = min(7, gnum)

fig, axs = plt.subplots(1, 7, sharey=True)
for k in range(n_panels):
    graph = sns.distplot(gene_values[:, k], ax=axs[k])
    graph.set_title(f"g-{k}")
    graphs.append(graph)

In [None]:
# Preview the distributions of the first (up to) 7 c-* features of the test set.
cnum = test_features[CELLS].shape[1]
graphs = []

# Materialise the feature matrix once instead of rebuilding it for every panel.
cell_values = test_features[CELLS].values
# Bounded by the real column count rather than a hard-coded 100; the original
# outer loop always stopped after its first pass, so only one row of panels
# was ever drawn.
n_panels = min(7, cnum)

fig, axs = plt.subplots(1, 7, sharey=True)
for k in range(n_panels):
    graph = sns.distplot(cell_values[:, k], ax=axs[k])
    graph.set_title(f"c-{k}")
    graphs.append(graph)

This may be too simple an idea, but the gene expression data and the cell viability data appear to be controllable by the experimenter, so it seems safe to assume that these data are independent of each other.

Also, since the shape of each distribution is already close to a normal distribution, I don't think it is much of a problem to force it into a Gaussian distribution.

In [None]:
# RankGauss: map every g-*/c-* feature onto a normal distribution using one
# QuantileTransformer per column, fitted on train and test rows together.
feature_columns = GENES + CELLS

# Stack train and test once. The original rebuilt the concatenation of the two
# full frames for every single column.
combined_features = pd.concat([train_features[feature_columns], test_features[feature_columns]])

rank_gauss = {}
for col in feature_columns:
    transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")
    transformer.fit(combined_features[col].values.reshape(-1, 1))
    rank_gauss[col] = transformer

# Persist the fitted transformers and reload them from disk; the context
# managers make sure the file handles are closed.
with open('rank_gauss.pkl', 'wb') as fh:
    pickle.dump(rank_gauss, fh)
with open('rank_gauss.pkl', 'rb') as fh:
    rank_gauss = pickle.load(fh)

# Apply the transformers in place. reshape(-1, 1) / ravel() avoid depending on
# length variables left over from the fitting loop.
for col in feature_columns:
    transformer = rank_gauss[col]
    train_features[col] = transformer.transform(train_features[col].values.reshape(-1, 1)).ravel()
    test_features[col] = transformer.transform(test_features[col].values.reshape(-1, 1)).ravel()

We can confirm that the shapes of the distributions are now close to a normal distribution.

**train set after using RankGauss**

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Parameters
    ----------
    seed : int, default=42
        Seed value applied to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not only the current one (ignored without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run; disable
    # it so that `deterministic = True` is actually effective.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=42)

In [None]:
# Preview the distributions of the first (up to) 7 g-* features of the train set.
gnum = train_features[GENES].shape[1]
graphs = []

# Materialise the feature matrix once instead of rebuilding it for every panel.
gene_values = train_features[GENES].values
# Bounded by the real column count rather than a hard-coded 771; the original
# outer loop always stopped after its first pass, so only one row of panels
# was ever drawn.
n_panels = min(7, gnum)

fig, axs = plt.subplots(1, 7, sharey=True)
for k in range(n_panels):
    graph = sns.distplot(gene_values[:, k], ax=axs[k])
    graph.set_title(f"g-{k}")
    graphs.append(graph)

In [None]:
# Preview the distributions of the first (up to) 7 c-* features of the train set.
cnum = train_features[CELLS].shape[1]
graphs = []

# Materialise the feature matrix once instead of rebuilding it for every panel.
cell_values = train_features[CELLS].values
# Bounded by the real column count rather than a hard-coded 100; the original
# outer loop always stopped after its first pass, so only one row of panels
# was ever drawn.
n_panels = min(7, cnum)

fig, axs = plt.subplots(1, 7, sharey=True)
for k in range(n_panels):
    graph = sns.distplot(cell_values[:, k], ax=axs[k])
    graph.set_title(f"c-{k}")
    graphs.append(graph)

**test set after using RankGauss**

In [None]:
# Preview the distributions of the first (up to) 7 g-* features of the test set.
gnum = test_features[GENES].shape[1]
graphs = []

# Materialise the feature matrix once instead of rebuilding it for every panel.
gene_values = test_features[GENES].values
# Bounded by the real column count rather than a hard-coded 771; the original
# outer loop always stopped after its first pass, so only one row of panels
# was ever drawn.
n_panels = min(7, gnum)

fig, axs = plt.subplots(1, 7, sharey=True)
for k in range(n_panels):
    graph = sns.distplot(gene_values[:, k], ax=axs[k])
    graph.set_title(f"g-{k}")
    graphs.append(graph)

In [None]:
# Preview the distributions of the first (up to) 7 c-* features of the test set.
cnum = test_features[CELLS].shape[1]
graphs = []

# Materialise the feature matrix once instead of rebuilding it for every panel.
cell_values = test_features[CELLS].values
# Bounded by the real column count rather than a hard-coded 100; the original
# outer loop always stopped after its first pass, so only one row of panels
# was ever drawn.
n_panels = min(7, cnum)

fig, axs = plt.subplots(1, 7, sharey=True)
for k in range(n_panels):
    graph = sns.distplot(cell_values[:, k], ax=axs[k])
    graph.set_title(f"c-{k}")
    graphs.append(graph)

It appears that we were able to transform the distribution of each data to resemble a normal distribution, as intended.

So, let's feed the data into the benchmark method to see the improvement.

# PCA features + Existing features

In [None]:
# GENES
n_comp = 600  #<--Update

# Fit PCA on the g-* columns of the train and test rows stacked together.
data = pd.concat([pd.DataFrame(train_features[GENES]), pd.DataFrame(test_features[GENES])])
gen_pca = PCA(n_components=n_comp, random_state=42).fit(data[GENES])

# Persist the fitted PCA and reload it from disk; the context managers make
# sure the file handles are closed.
with open('gen_pca.pkl', 'wb') as fh:
    pickle.dump(gen_pca, fh)
with open('gen_pca.pkl', 'rb') as fh:
    gen_pca = pickle.load(fh)

data2 = gen_pca.transform(data[GENES])
# Train rows were stacked first, test rows last.
train2 = data2[:train_features.shape[0]]
test2 = data2[-test_features.shape[0]:]

train2 = pd.DataFrame(train2, columns=[f'pca_G-{i}' for i in range(n_comp)])
test2 = pd.DataFrame(test2, columns=[f'pca_G-{i}' for i in range(n_comp)])

# Append the components next to the existing features. The column-wise concat
# aligns on the index, so both frames are expected to carry a default 0..n-1
# index at this point.
train_features = pd.concat((train_features, train2), axis=1)
test_features = pd.concat((test_features, test2), axis=1)

In [None]:
#CELLS
n_comp = 50  #<--Update

# Fit PCA on the c-* columns of the train and test rows stacked together.
data = pd.concat([pd.DataFrame(train_features[CELLS]), pd.DataFrame(test_features[CELLS])])
cel_pca = PCA(n_components=n_comp, random_state=42).fit(data[CELLS])

# Persist the fitted PCA and reload it from disk; the context managers make
# sure the file handles are closed.
with open('cel_pca.pkl', 'wb') as fh:
    pickle.dump(cel_pca, fh)
with open('cel_pca.pkl', 'rb') as fh:
    cel_pca = pickle.load(fh)

data2 = cel_pca.transform(data[CELLS])
# Train rows were stacked first, test rows last.
train2 = data2[:train_features.shape[0]]
test2 = data2[-test_features.shape[0]:]

train2 = pd.DataFrame(train2, columns=[f'pca_C-{i}' for i in range(n_comp)])
test2 = pd.DataFrame(test2, columns=[f'pca_C-{i}' for i in range(n_comp)])

# Append the components next to the existing features. The column-wise concat
# aligns on the index, so both frames are expected to carry a default 0..n-1
# index at this point.
train_features = pd.concat((train_features, train2), axis=1)
test_features = pd.concat((test_features, test2), axis=1)

In [None]:
# Display the current (rows, columns) shape of the training feature frame.
train_features.shape

# Feature selection using a variance threshold

In [None]:
from sklearn.feature_selection import VarianceThreshold


# Drop every feature whose variance over train and test rows together does not
# exceed the threshold.
var_thresh = VarianceThreshold(0.8)  #<-- Update
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0; the stacked result is the same.
data = pd.concat([train_features, test_features])
# The first four columns (sig_id, cp_type, cp_time, cp_dose) are not filtered.
var_thresh = var_thresh.fit(data.iloc[:, 4:])

# Persist the fitted selector and reload it from disk; the context managers
# make sure the file handles are closed.
with open('var_thresh.pkl', 'wb') as fh:
    pickle.dump(var_thresh, fh)
with open('var_thresh.pkl', 'rb') as fh:
    var_thresh = pickle.load(fh)

data_transformed = var_thresh.transform(data.iloc[:, 4:])

# Train rows were stacked first, test rows last.
train_features_transformed = data_transformed[ : train_features.shape[0]]
test_features_transformed = data_transformed[-test_features.shape[0] : ]

id_cols = ['sig_id', 'cp_type', 'cp_time', 'cp_dose']

# Rebuild both frames as the four descriptor columns followed by the surviving
# features. The features come back as a bare array, so their columns are
# labelled 0..n-1 from here on.
train_features = pd.DataFrame(train_features[id_cols].values, columns=id_cols)
train_features = pd.concat([train_features, pd.DataFrame(train_features_transformed)], axis=1)

test_features = pd.DataFrame(test_features[id_cols].values, columns=id_cols)
test_features = pd.concat([test_features, pd.DataFrame(test_features_transformed)], axis=1)

train_features.shape


In [None]:
# Attach the scored targets and the drug id to each training row (joined on sig_id).
train = (
    train_features
    .merge(train_targets_scored, on='sig_id')
    .merge(groups, on='sig_id')
)

# Discard the rows whose cp_type is 'ctl_vehicle' from both sets.
train = train.loc[train['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)
test = test_features.loc[test_features['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)

# `target` keeps sig_id alongside the label columns; `groups` is rebound to the
# per-row drug id of the filtered training frame.
target = train[train_targets_scored.columns]
groups = train['drug_id']

In [None]:
# cp_type is no longer needed once the 'ctl_vehicle' rows have been filtered out.
train = train.drop(columns='cp_type')
test = test.drop(columns='cp_type')

In [None]:
# Display the merged training frame (features, targets and drug_id).
train

In [None]:
# Names of the label columns: every column of `target` except sig_id.
target_cols = target.drop(columns='sig_id').columns.tolist()

# CV folds

In [None]:
# Working copy of the training frame. No 'kfold' column is assigned here: the
# group-stratified assignment below is commented out, and the fold column is
# only added by later cells.
folds = train.copy()

#msgkf = MultilabelStratifiedGroupKFold(n_splits=7)

#for f, (t_idx, v_idx) in enumerate(msgkf.split(X=train, y=target, groups=groups)):
#    folds.loc[v_idx, 'kfold'] = int(f)

#folds['kfold'] = folds['kfold'].astype(int)
folds

In [None]:
# Print the shape of each working frame, one per line.
for frame in (train, folds, test, target, sample_submission):
    print(frame.shape)

In [None]:
import torch
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F

class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Parameters
    ----------
    weight : torch.Tensor or None, default=None
        Optional rescaling weight forwarded to
        ``F.binary_cross_entropy_with_logits``.
    reduction : str, default='mean'
        ``'mean'`` or ``'sum'`` reduce the element-wise loss accordingly; any
        other value (e.g. ``'none'``) returns the unreduced loss.
    smoothing : float, default=0.0
        Smoothing factor in ``[0, 1)``; targets are pulled towards 0.5 by it.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        """Return ``targets * (1 - smoothing) + 0.5 * smoothing``.

        ``n_labels`` is accepted for interface compatibility but is not used.
        """
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        """Compute the smoothed BCE loss between logits ``inputs`` and ``targets``."""
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        # Request the element-wise loss and reduce it here. The functional's
        # default reduction is 'mean', which previously collapsed the loss to a
        # scalar before the branches below ran, so 'sum' and 'none' silently
        # returned the mean as well.
        loss = F.binary_cross_entropy_with_logits(inputs, targets, self.weight,
                                                  reduction='none')

        if self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()

        return loss

# Model

In [None]:
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed one-hot target distribution.

    Parameters
    ----------
    classes : int
        Number of classes; the smoothing mass is spread over ``classes - 1``
        non-target classes.
    smoothing : float, default=0.0
        Probability mass taken away from the true class.
    dim : int, default=-1
        Dimension of ``pred`` that indexes the classes.
    """

    def __init__(self, classes, smoothing=0.0, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        """Return the mean smoothed cross-entropy.

        ``pred`` holds unnormalised scores with the classes along ``self.dim``;
        ``target`` holds integer class indices and has the shape of ``pred``
        without that dimension.
        """
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (self.cls - 1))
            # Scatter along the class dimension. The original always used
            # dim 1, which matches self.dim only for 2-D inputs.
            true_dist.scatter_(self.dim, target.unsqueeze(self.dim), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))

In [None]:
# Upper bound on the number of training epochs passed to model.fit.
MAX_EPOCH = 200

# Keyword arguments used to construct every TabNet regressor.
tabnet_params = {
    'n_d': 32,
    'n_a': 32,
    'n_steps': 3,
    'gamma': 1.3,
    'lambda_sparse': 0,
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': {'lr': 2e-2, 'weight_decay': 1e-5},
    'mask_type': 'entmax',
    'scheduler_params': {
        'mode': "min",
        'patience': 5,
        'min_lr': 1e-5,
        'factor': 0.9,
    },
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'verbose': 10,
}

In [None]:
from sklearn.metrics import log_loss
from pytorch_tabnet_custom.metrics import Metric
from sklearn.metrics import roc_auc_score, log_loss

class LogitsLogLoss(Metric):
    """
    Log loss computed on raw model outputs (a sigmoid is applied first).
    """

    def __init__(self):
        self._name = "logits_ll"
        self._maximize = False

    def __call__(self, y_true, y_pred):
        """
        Compute the mean binary log loss of the predictions.

        Parameters
        ----------
        y_true: np.ndarray
            Target matrix or vector
        y_pred: np.ndarray
            Raw (pre-sigmoid) score matrix or vector

        Returns
        -------
            float
            LogLoss of predictions vs targets.
        """
        eps = 1e-15  # keeps both logarithms finite at probabilities of 0 and 1
        probabilities = 1 / (1 + np.exp(-y_pred))
        negative_term = (1 - y_true) * np.log(1 - probabilities + eps)
        positive_term = y_true * np.log(probabilities + eps)
        return np.mean(-(negative_term + positive_term))

# Preprocessing steps

In [None]:
def process_data(data):
    """Return a copy of ``data`` with 'cp_time' and 'cp_dose' one-hot encoded."""
    return pd.get_dummies(data, columns=['cp_time', 'cp_dose'])

In [None]:
# Model inputs: every column of the one-hot encoded frame that is neither a
# target nor one of the bookkeeping columns.
_non_feature_cols = set(target_cols) | {'kfold', 'sig_id', 'drug_id'}
feature_cols = [c for c in process_data(folds).columns if c not in _non_feature_cols]
len(feature_cols)

In [None]:
# HyperParameters

DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 25
BATCH_SIZE = 128
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5
NFOLDS = 5       
EARLY_STOPPING_STEPS = 10
EARLY_STOP = False

num_features=len(feature_cols)
num_targets=len(target_cols)
hidden_size=1500


# K-fold training

In [None]:
def run_tabnet_k_fold(NFOLDS, seed):
    """Train one TabNet model per fold and collect its predictions.

    Reads the notebook-level ``folds`` (must contain a 'kfold' column),
    ``train``, ``test``, ``feature_cols``, ``target_cols``, ``tabnet_params``
    and ``MAX_EPOCH``. Each fitted model is pickled to
    ``tabnet_FOLD{fold}_SEED{seed}.pkl``.

    Parameters
    ----------
    NFOLDS : int
        Number of folds; fold ids ``0 .. NFOLDS - 1`` are looked up in
        ``folds['kfold']``.
    seed : int
        Seed passed to ``seed_everything`` and to the model constructor.

    Returns
    -------
    oof : np.ndarray
        Sigmoid-activated out-of-fold predictions, one row per row of ``train``.
    predictions : np.ndarray
        Sigmoid-activated test predictions averaged over the folds.
    """
    oof = np.zeros((len(train), len(target_cols)))
    predictions = np.zeros((len(test), len(target_cols)))

    # The one-hot encoding does not depend on the fold, so it is done once
    # instead of once per fold.
    train_data = process_data(folds)
    test_data = process_data(test)
    x_test = test_data[feature_cols].values

    for fold in range(NFOLDS):
        seed_everything(seed)
        print(f"FOLD: {fold}")

        in_valid = train_data['kfold'] == fold
        # Index labels of the validation rows. They are used as positions into
        # `oof` below, which relies on `folds` carrying a default 0..n-1 index.
        val_idx = train_data[in_valid].index

        train_df = train_data[~in_valid].reset_index(drop=True)
        valid_df = train_data[in_valid].reset_index(drop=True)

        x_train, y_train = train_df[feature_cols].values, train_df[target_cols].values
        x_valid, y_valid = valid_df[feature_cols].values, valid_df[target_cols].values

        model = TabNetRegressorCustom(seed=seed, **tabnet_params)

        model.fit(X_train=x_train,
                  y_train=y_train,
                  eval_set=[(x_valid, y_valid)],
                  eval_name = ["val"],
                  eval_metric = ["logits_ll"],
                  max_epochs=MAX_EPOCH,
                  patience=20, batch_size=1024, virtual_batch_size=128,
                  num_workers=1, drop_last=False,
                  # use binary cross entropy as this is not a regression problem
                  loss_fn=torch.nn.functional.binary_cross_entropy_with_logits,
                  pretrain='pretrained.pth')

        # Persist the fitted model and continue with the reloaded copy; the
        # context managers make sure the file handles are closed.
        model_path = f"tabnet_FOLD{fold}_SEED{seed}.pkl"
        with open(model_path, 'wb') as fh:
            pickle.dump(model, fh)
        with open(model_path, 'rb') as fh:
            model = pickle.load(fh)

        # The model outputs logits; apply a sigmoid to get probabilities.
        preds_val = model.predict(x_valid)
        oof[val_idx] = 1 / (1 + np.exp(-preds_val))

        #--------------------- PREDICTION---------------------
        preds_test = model.predict(x_test)
        predictions += ((1 / (1 + np.exp(-preds_test))) / NFOLDS)

    return oof, predictions

In [None]:
# Pretraining: fit a PreTrainingModel on the feature matrix alone (no targets)
# and save its weights to 'pretrained.pth', the file that run_tabnet_k_fold
# passes to model.fit via `pretrain=`.
from pytorch_tabnet_custom.pretraining import PreTrainingModel

# Plain multilabel-stratified split (no grouping, fixed random_state=2020).
# Only fold 0 of this split is used below, as the validation set.
folds = train.copy()
mskf = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=2020)
for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
    folds.loc[v_idx, 'kfold'] = int(f)
folds['kfold'] = folds['kfold'].astype(int)
    
fold = 0
train_data = process_data(folds)
test_data = process_data(test)

trn_idx = train_data[train_data['kfold'] != fold].index
val_idx = train_data[train_data['kfold'] == fold].index

train_df = train_data[train_data['kfold'] != fold].reset_index(drop=True)
valid_df = train_data[train_data['kfold'] == fold].reset_index(drop=True)

x_train, y_train  = train_df[feature_cols].values, train_df[target_cols].values
x_valid, y_valid =  valid_df[feature_cols].values, valid_df[target_cols].values

x_test = test_data[feature_cols].values

# Pretraining input: the training rows outside fold 0 plus all test rows.
# NOTE(review): the per-seed folds used for supervised training are built with a
# different splitter, so their validation rows can be part of this matrix
# (features only) — confirm this is intended.
train_test = np.concatenate([x_train, x_test])

# Architecture and optimiser settings repeat the values in `tabnet_params`;
# the output width equals the input width.
clf = PreTrainingModel(
    input_dim=train_test.shape[1],
    output_dim=train_test.shape[1],
    n_d=32, n_a=32, n_steps=3, gamma=1.3,
    lambda_sparse=0, optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
    mask_type='entmax',
    scheduler_params=dict(mode="min",
                        patience=5,
                        min_lr=1e-5,
                        factor=0.9,),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    verbose=10, # "sparsemax"
    )


# The model is fitted with its own input as the target (y_train is X_train),
# and evaluated the same way on the fold-0 validation features.
clf.fit(X_train=train_test,
        y_train=train_test,
        eval_set=[(x_valid, x_valid)],
        eval_name = ["val"],
        max_epochs=2000,
        patience=20, batch_size=1024, virtual_batch_size=128,
        num_workers=1, drop_last=False)

# Copy every weight except 'encoder.final_mapping.weight', prefixing each key
# with 'tabnet.'.
# NOTE(review): presumably this matches the parameter names the custom
# TabNetRegressor expects when loading 'pretrained.pth' — verify against the
# pytorch_tabnet_custom package.
from collections import OrderedDict
state_dict = clf.network.state_dict()
new_state_dict = OrderedDict()
for key, value in state_dict.items():
    if key not in ['encoder.final_mapping.weight']:
        new_key = 'tabnet.'+key
        new_state_dict[new_key] = value
#torch.save(clf.network.state_dict(), 'pretrained.pth')
torch.save(new_state_dict, 'pretrained.pth')

In [None]:
# Averaging on multiple SEEDS

SEED = [0, 1, 2, 3, 4]

prediction_shape = (len(train), len(target_cols))
tab_oof = np.zeros(prediction_shape)
tab_predictions = np.zeros((len(test), len(target_cols)))

for seed in SEED:
    # Fresh group-aware fold assignment for this seed. run_tabnet_k_fold reads
    # the notebook-level `folds`, so the name must be rebound here.
    folds = train.copy()
    mskf = MultilabelStratifiedGroupKFold(n_splits=NFOLDS, shuffle=True, random_state=seed)
    for fold_id, (_, valid_rows) in enumerate(mskf.split(X=train, y=target, groups=groups)):
        folds.loc[valid_rows, 'kfold'] = int(fold_id)
    folds['kfold'] = folds['kfold'].astype(int)

    seed_oof, seed_predictions = run_tabnet_k_fold(NFOLDS, seed)

    # Running mean over the seeds.
    tab_oof += seed_oof / len(SEED)
    tab_predictions += seed_predictions / len(SEED)

# The label columns of `train` are overwritten with the out-of-fold
# predictions; `test` gains the averaged predictions as new columns.
train[target_cols] = tab_oof
test[target_cols] = tab_predictions


In [None]:
# Display the scored-target table as loaded from train_targets_scored.csv.
train_targets_scored

In [None]:
# Number of label columns (sig_id excluded).
len(target_cols)

In [None]:
# Align the out-of-fold predictions with every row of train_targets_scored.
# Rows without a prediction after the left merge are filled with 0.
valid_results = train_targets_scored.drop(columns=target_cols).merge(train[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)

y_true = train_targets_scored[target_cols].values
y_pred = valid_results[target_cols].values

# Mean of the per-target log loss. The divisor is the number of scored targets:
# the original divided by target.shape[1], which also counts the 'sig_id'
# column and therefore under-reported the score.
n_targets = len(target_cols)
score = 0
for i in range(n_targets):
    # labels=[0, 1] keeps log_loss working for a column that contains a single class.
    score_ = log_loss(y_true[:, i], y_pred[:, i], labels=[0, 1])
    score += score_ / n_targets

print("CV log_loss: ", score)
valid_results.to_csv('oof.csv', index=False)

In [None]:
# Peek at the first rows of the out-of-fold prediction table.
valid_results.head()

In [None]:
def log_loss_metric(y_true, y_pred):
    """Return the mean column-wise log loss over the scored targets.

    Parameters
    ----------
    y_true, y_pred : pandas.DataFrame
        Ground truth and predictions; both must contain every target column of
        the notebook-level ``train_targets_scored``.

    Returns
    -------
    float
        Average of the per-target log losses.
    """
    metrics = []
    for _target in train_targets_scored.columns:
        # 'sig_id' is an identifier, not a label: it cannot be cast to float or scored.
        if _target == 'sig_id':
            continue
        metrics.append(log_loss(y_true.loc[:, _target], y_pred.loc[:, _target].astype(float), labels = [0,1]))
    return np.mean(metrics)

In [None]:
# Start from the ids of the sample submission and attach the test predictions.
# Ids without a prediction after the left merge are filled with 0.
test_predictions = test[['sig_id'] + target_cols]
sub = (
    sample_submission
    .drop(columns=target_cols)
    .merge(test_predictions, on='sig_id', how='left')
    .fillna(0)
)
sub.to_csv('submission.csv', index=False)

In [None]:
# Display the (rows, columns) shape of the submission frame.
sub.shape