Original code by @optimo  from https://www.kaggle.com/optimo/tabnetregressor-2-0-train-infer

In [None]:
import sys
sys.path.append('../input/tabnetdevelop/tabnet-develop (1)/tabnet-develop')
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from pytorch_tabnet.tab_model import TabNetRegressor

In [None]:
import optuna
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd 

import os
import random
import sys
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
from tqdm import tqdm
from sklearn.metrics import log_loss
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import roc_auc_score

In [None]:
def seed_everything(seed_value):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch with the same value
    and exports it as ``PYTHONHASHSEED``. When CUDA is available, the CUDA
    generators are seeded too and cuDNN is set to deterministic mode with
    benchmarking turned off.

    Parameters
    ----------
    seed_value : int
        Seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)

    # Nothing GPU-related to configure on a CPU-only machine.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

seed_everything(42)

# Data and minimal preprocessing

In [None]:
def create_folds(seed_count, fold_count, data_path='../input/lish-moa/', drug_count_threshold=18):
    """Build drug-aware, multilabel-stratified fold assignments.

    Reads ``train_features.csv``, ``train_targets_scored.csv`` and
    ``train_drug.csv`` from ``data_path``, keeps only the ``trt_cp`` rows and
    assigns every remaining row to a fold, once per seed:

    * drugs with at most ``drug_count_threshold`` rows are split at drug
      level (stratified on their per-drug mean targets), so all rows of such
      a drug receive the same fold;
    * rows of drugs with more rows than that are stratified individually, so
      those drugs are spread across the folds.

    Parameters
    ----------
    seed_count : int
        Number of fold assignments to generate; seeds ``0 .. seed_count - 1``
        are used as ``random_state``.
    fold_count : int
        Number of folds per assignment.
    data_path : str, optional
        Directory containing the three competition CSV files.
    drug_count_threshold : int, optional
        Row count separating "rare" drugs (``<=``) from "frequent" ones (``>``).

    Returns
    -------
    np.ndarray
        ``np.stack`` of one ``(sig_id, kfold)`` table per seed; index it with
        the seed number to get that seed's assignment.
    """
    folds = []

    # All three files are read from the same directory.
    train_features = pd.read_csv(os.path.join(data_path, 'train_features.csv'))
    train_targets_scored = pd.read_csv(os.path.join(data_path, 'train_targets_scored.csv'))
    train_drug = pd.read_csv(os.path.join(data_path, 'train_drug.csv'))

    # Get rid of "ctl_vehicle" from training.
    # You may comment out the lines below if you do not want to do it.
    train_targets_scored = train_targets_scored.loc[train_features['cp_type'] == 'trt_cp', :]
    train_features = train_features[train_features['cp_type'] == 'trt_cp']

    train_features_drug = train_features.merge(train_drug, on="sig_id", how='left')

    # Remember the target columns (everything after sig_id) before drug_id is
    # added to the targets table; drug_id is only needed for stratifying.
    targets = train_targets_scored.columns[1:]
    train_targets_scored = train_targets_scored.merge(train_drug, on='sig_id', how='left')

    # Split the drug ids into rare drugs (at most `drug_count_threshold` rows)
    # and frequent drugs (more than `drug_count_threshold` rows).
    vc = train_targets_scored.drug_id.value_counts()
    vc1 = vc.loc[vc <= drug_count_threshold].index
    vc2 = vc.loc[vc > drug_count_threshold].index

    # tmp holds one row per rare drug: its targets averaged over all its rows.
    tmp = train_targets_scored.groupby('drug_id')[targets].mean().loc[vc1]
    tmp = tmp.reset_index()
    # Depending on the pandas version the index selected above may be unnamed,
    # in which case reset_index() calls the new column "index".
    tmp = tmp.rename(columns={"index": "drug_id"})

    # tmp1 holds the targets and drug_id of every row belonging to a frequent
    # drug. These rows are stratified individually across all folds, on the
    # assumption that such drugs may also appear in the public/private test sets.
    tmp1 = train_targets_scored[train_targets_scored['drug_id'].isin(vc2)]
    tmp1 = tmp1.reset_index(drop=True)

    for seed in range(seed_count):

        skf = MultilabelStratifiedKFold(n_splits=fold_count, shuffle=True, random_state=seed)
        tmp_copy = tmp.copy()
        tmp1_copy = tmp1.copy()
        train_indices = train_features_drug[['sig_id', 'drug_id']].copy()

        # Rare drugs: assign a fold per drug, then propagate it to the rows.
        for fold, (idxT, idxV) in enumerate(skf.split(X=tmp_copy, y=tmp_copy[targets])):
            tmp_copy.loc[idxV, "kfold"] = fold
        train_indices = train_indices.merge(tmp_copy[['drug_id', 'kfold']], on='drug_id', how="left")

        # Frequent drugs: assign a fold per row.
        for fold, (idxT, idxV) in enumerate(skf.split(X=tmp1_copy, y=tmp1_copy[targets])):
            tmp1_copy.loc[idxV, "kfold"] = fold
        train_indices = train_indices.merge(tmp1_copy[['sig_id', 'kfold']], on='sig_id', how="left")

        # The second merge produced kfold_x (drug level) and kfold_y (row
        # level); each row is expected to have exactly one of them set.
        train_indices['kfold'] = train_indices['kfold_x'].combine_first(train_indices['kfold_y'])
        train_indices.drop(['drug_id', 'kfold_x', 'kfold_y'], inplace=True, axis=1)

        # Add this seed's assignment to the output
        folds.append(train_indices)

    return np.stack(folds)

In [None]:
# Load the competition tables. The test features lose their id column right
# away because only the feature values are fed to the model.
data_path = "../input/lish-moa/"

train = pd.read_csv(data_path + 'train_features.csv')
train_targets_scored = pd.read_csv(data_path + 'train_targets_scored.csv')
submission = pd.read_csv(data_path + 'sample_submission.csv')
test = pd.read_csv(data_path + 'test_features.csv').drop(columns=["sig_id"])

# Optionally discard the control ("ctl_vehicle") rows from the training data;
# features and targets are filtered with the same mask so they stay aligned.
remove_vehicle = True

if remove_vehicle:
    kept_index = train['cp_type'] == 'trt_cp'
    train_targets_scored = train_targets_scored[kept_index].reset_index(drop=True)
    train = train[kept_index].reset_index(drop=True)

# Encode the two categorical columns as 0/1 integers.
train["cp_type"] = (train["cp_type"] == "trt_cp").astype(int)
train["cp_dose"] = (train["cp_dose"] == "D1").astype(int)

test["cp_type"] = (test["cp_type"] == "trt_cp").astype(int)
test["cp_dose"] = (test["cp_dose"] == "D1").astype(int)

X_test = test.to_numpy()

In [None]:
# One seed, 7 folds
folds = 7
# create_folds returns one (sig_id, kfold) table per seed; only seed 0 is used.
folded = create_folds(1, folds)
folded_data = pd.DataFrame(data=folded[0], columns=["sig_id", "kfold"])
# Attach the fold number to both the features and the targets via sig_id,
# then drop sig_id, which is no longer needed once the folds are attached.
train = train.merge(folded_data, on="sig_id", how="left")
train_targets_scored = train_targets_scored.merge(folded_data, on="sig_id", how="left")
train.drop(columns=["sig_id"], inplace=True)
train_targets_scored.drop(columns=["sig_id"], inplace=True)
# Column-wise statistics of the training features. They are computed while
# the "kfold" column is still part of `train`; `std` is used by `objective`
# to scale the Gaussian noise it adds when augmenting the training rows.
mean = np.mean(train, axis=0)
std = np.std(train, axis=0)

# Define custom metric for validation

In [None]:
from sklearn.metrics import log_loss
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import roc_auc_score, log_loss

class LogitsLogLoss(Metric):
    """
    Mean binary log loss for raw model outputs (a sigmoid is applied first)
    """

    def __init__(self):
        # Lower is better, so the metric must not be maximized.
        self._maximize = False
        self._name = "logits_ll"

    def __call__(self, y_true, y_pred):
        """
        Compute the log loss of raw (pre-sigmoid) predictions.

        Parameters
        ----------
        y_true: np.ndarray
            Target matrix or vector
        y_pred: np.ndarray
            Raw score matrix or vector; it is passed through a sigmoid here

        Returns
        -------
            float
            Mean log loss of the predictions vs the targets.
        """
        probabilities = 1 / (1 + np.exp(-y_pred))
        # The 1e-15 offset keeps both logarithms finite at probabilities 0 and 1.
        negative_term = (1 - y_true) * np.log(1 - probabilities + 1e-15)
        positive_term = y_true * np.log(probabilities + 1e-15)
        return np.mean(-(negative_term + positive_term))

## Optuna Objective function with hyperparameters

In [None]:
# Low number of epochs for Optuna
MAX_EPOCH=15

def objective(trial):
    """Optuna objective: mean best validation log loss over the kfold folds.

    For each fold id found in ``train["kfold"]`` a ``TabNetRegressor`` is
    trained on the remaining rows (plus one noisy copy of them) and validated
    on the held-out fold. After every fold the running mean score is reported
    to the trial so that the study's pruner can stop unpromising trials.

    Relies on the module-level ``train``, ``train_targets_scored``, ``std``
    and ``MAX_EPOCH``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_da``, ``n_steps``, ``gamma`` and
        ``batch_size``.

    Returns
    -------
    float
        Mean over the folds of the lowest ``val_logits_ll`` reached in
        training.

    Raises
    ------
    ValueError
        If ``train["kfold"]`` contains no fold ids.
    optuna.TrialPruned
        If the pruner decides to stop the trial after a fold.
    """
    # Hyperparameters kept fixed. To search them instead, sample them from
    # `trial` (e.g. suggest_categorical for mask_type, suggest_float with
    # log=True for lambda_sparse, suggest_int for n_shared).
    mask_type = 'entmax'
    lambda_sparse = 2.6743669818463933e-05
    n_shared = 1

    # Searched hyperparameters.
    n_da = trial.suggest_int("n_da", 44, 64, step=4)
    n_steps = trial.suggest_int("n_steps", 1, 3, step=1)
    gamma = trial.suggest_float("gamma", 1., 1.4, step=0.2)
    # NOTE: despite its name this value is passed to fit() as
    # `virtual_batch_size`; the real batch size stays fixed at 1024.
    batch_size = trial.suggest_int("batch_size", 128, 1024, step=128)

    tabnet_params = dict(n_d=n_da, n_a=n_da, n_steps=n_steps, gamma=gamma,
                         lambda_sparse=lambda_sparse, optimizer_fn=torch.optim.Adam,
                         optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                         mask_type=mask_type, n_shared=n_shared,
                         scheduler_params=dict(mode="min",
                                               patience=5,
                                               min_lr=1e-5,
                                               factor=0.5,),
                         scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                         verbose=0,
                         )

    # Validate on every precomputed fold rather than on a hard-coded count,
    # so no fold is silently left out of the cross-validation score.
    fold_ids = sorted(int(fold) for fold in train["kfold"].dropna().unique())
    if not fold_ids:
        raise ValueError("train['kfold'] contains no fold assignments")

    scores = []
    for step, fold_nb in enumerate(fold_ids):
        print("FOLDS : ", fold_nb)

        X_train = train[train['kfold'] != fold_nb].reset_index(drop=True)
        y_train = train_targets_scored[train_targets_scored["kfold"] != fold_nb]
        X_val = train[train['kfold'] == fold_nb]
        y_val = train_targets_scored[train_targets_scored["kfold"] == fold_nb]

        # Augmentation: append one copy of the training rows perturbed with
        # Gaussian noise whose scale is a tenth of each column's std. The
        # targets are duplicated unchanged to match.
        gauss = np.random.normal(0, std/10, X_train.shape)
        aug_train_X = X_train + gauss
        X_train = pd.concat([X_train, aug_train_X], ignore_index=True)
        y_train = pd.concat([y_train, y_train])

        # The fold column is bookkeeping only and must not reach the model.
        X_train = X_train.drop(columns=["kfold"]).values
        y_train = y_train.drop(columns=["kfold"]).values
        X_val = X_val.drop(columns=["kfold"]).values
        y_val = y_val.drop(columns=["kfold"]).values

        model = TabNetRegressor(**tabnet_params)

        model.fit(X_train=X_train,
                  y_train=y_train,
                  eval_set=[(X_val, y_val)],
                  eval_name = ["val"],
                  eval_metric = ["logits_ll"],
                  max_epochs=MAX_EPOCH,
                  patience=10, batch_size=1024, virtual_batch_size=batch_size,
                  num_workers=1, drop_last=False,
                  # use binary cross entropy as this is not a regression problem
                  loss_fn=torch.nn.functional.binary_cross_entropy_with_logits)

        # Score of the fold: the best validation log loss seen during training.
        scores.append(np.min(model.history["val_logits_ll"]))

        # Report the running mean so the study's pruner can act on it.
        trial.report(float(np.mean(scores)), step)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(scores))

#### You can run around 95 trials in background mode using GPU, without timing out

In [None]:
# Create a minimizing study with a median pruner and run 50 trials of
# `objective`, using all available workers and collecting garbage after
# each trial.
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(pruner=pruner, direction="minimize")
study.optimize(
    objective,
    n_trials=50,
    n_jobs=-1,
    timeout=None,
    gc_after_trial=True,
)

# Summarize the outcome: trial count, best value and its parameters.
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial
print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

### Visualize optimization trials

In [None]:
from optuna.visualization import plot_contour

# Contour plots are only available for parameters the objective actually
# samples from the trial: n_da, n_steps, gamma and batch_size.
plot_contour(study, params=['n_da',
                            'n_steps',
                            'gamma',
                            'batch_size'])

In [None]:
from optuna.visualization import plot_optimization_history

# Objective value of every trial, in the order the trials were run.
plot_optimization_history(study)

In [None]:
from optuna.visualization import plot_slice

# Objective value against each sampled parameter, one panel per parameter.
plot_slice(study)