In [None]:
!pip install ../input/pytorch16gpu/torch-1.6.0cu101-cp37-cp37m-linux_x86_64.whl --quiet

In [None]:
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
sys.path.insert(0, "../input/tabnetdevelop/tabnet-develop")
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import os
import gc
import math
import random
import datetime
import numpy as np
import pandas as pd
from joblib import dump, load
from numba import njit
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as layers
import tensorflow_addons as tfa
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
from scipy.optimize import minimize
from tqdm.notebook import tqdm
from time import time
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import QuantileTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import roc_auc_score

N_STARTS = 3
N_SPLITS = 7
CALCULATE_OOF = False
CALCULATE_OOF_PL = True
FINETUNE = True
POST_PROCESS = True

In [None]:
def seed_everything(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        
seed_everything(42)

# Preprocessing

In [None]:
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')
test_features = pd.read_csv('../input/lish-moa/test_features.csv')

ss = pd.read_csv('../input/lish-moa/sample_submission.csv')

cols = [c for c in ss.columns.values if c != 'sig_id']
GENES = [col for col in train_features.columns if col.startswith('g-')]
CELLS = [col for col in train_features.columns if col.startswith('c-')]

In [None]:
def preprocess(df):
    df.loc[:, 'cp_type'] = df.loc[:, 'cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1})
    df.loc[:, 'cp_dose'] = df.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1})
    del df['sig_id']
    return df

def log_loss_metric(y_true, y_pred):
    y_pred_clip = np.clip(y_pred, 1e-15, 1 - 1e-15)
    return - np.mean(y_true * np.log(y_pred_clip) + (1 - y_true) * np.log(1 - y_pred_clip))

train = preprocess(train_features)
test = preprocess(test_features)

del train_targets['sig_id']
del train_targets_nonscored['sig_id']

# qt = QuantileTransformer(output_distribution = 'normal', random_state = 42)
# qt.fit(pd.concat([pd.DataFrame(train[GENES+CELLS]), pd.DataFrame(test[GENES+CELLS])]))
qt = load('../input/moa-preprocess/qt')
train[GENES+CELLS] = qt.transform(train[GENES+CELLS])
test[GENES+CELLS] = qt.transform(test[GENES+CELLS])

# GENES
n_comp_genes = 600  #<--Update

data = pd.concat([pd.DataFrame(train[GENES]), pd.DataFrame(test[GENES])])
# pca_genes = PCA(n_components = n_comp_genes, random_state = 42)
# data2 = pca_genes.fit_transform(data[GENES])
pca_genes = load('../input/moa-preprocess/pca_genes')
data2 = pca_genes.transform(data[GENES])
train2 = data2[:train.shape[0]]; test2 = data2[-test.shape[0]:]

train2 = pd.DataFrame(train2, columns = [f'pca_G-{i}' for i in range(n_comp_genes)])
test2 = pd.DataFrame(test2, columns = [f'pca_G-{i}' for i in range(n_comp_genes)])

train = pd.concat((train, train2), axis = 1)
test = pd.concat((test, test2), axis = 1)

#CELLS
n_comp_cells = 50  #<--Update

data = pd.concat([pd.DataFrame(train[CELLS]), pd.DataFrame(test[CELLS])])
# pca_cells = PCA(n_components = n_comp_cells, random_state = 42)
# data2 = pca_cells.fit_transform(data[CELLS])
pca_cells = load('../input/moa-preprocess/pca_cells')
data2 = pca_cells.transform(data[CELLS])
train2 = data2[:train.shape[0]]; test2 = data2[-test.shape[0]:]

train2 = pd.DataFrame(train2, columns = [f'pca_C-{i}' for i in range(n_comp_cells)])
test2 = pd.DataFrame(test2, columns = [f'pca_C-{i}' for i in range(n_comp_cells)])

train = pd.concat((train, train2), axis = 1)
test = pd.concat((test, test2), axis = 1)

data = train.append(test)
# var_thresh = VarianceThreshold(0.8)  #<-- Update
# data_transformed = var_thresh.fit_transform(data.iloc[:, 3:])
var_thresh = load('../input/moa-preprocess/var_thresh')
data_transformed = var_thresh.transform(data.iloc[:, 3:])

train_transformed = data_transformed[ : train.shape[0]]
test_transformed = data_transformed[-test.shape[0] : ]

train = pd.DataFrame(train[['cp_type','cp_time','cp_dose']].values.reshape(-1, 3),\
                     columns=['cp_type','cp_time','cp_dose'])

train = pd.concat([train, pd.DataFrame(train_transformed)], axis=1)

test = pd.DataFrame(test[['cp_type','cp_time','cp_dose']].values.reshape(-1, 3),\
                    columns=['cp_type','cp_time','cp_dose'])

test = pd.concat([test, pd.DataFrame(test_transformed)], axis=1)

print(train.shape)
print(test.shape)

train_targets = train_targets.loc[train['cp_type'] == 0].reset_index(drop = True)
train_targets_nonscored = train_targets_nonscored.loc[train['cp_type'] == 0].reset_index(drop = True)
train = train.loc[train['cp_type'] == 0].reset_index(drop = True)

print(train.shape)

top_feats = np.arange(1, train.shape[1])
print(top_feats)

cat_tr, cat_test, numerical_tr, numerical_test = train.loc[:, train.columns[1:3]], test.loc[:, test.columns[1:3]], train.loc[:, train.columns[3:]].values, test.loc[:, test.columns[3:]].values
cat_tr.loc[:, 'cp_time'] = cat_tr.loc[:, 'cp_time'].map({24: 0, 48: 1, 72: 2})
cat_test.loc[:, 'cp_time'] = cat_test.loc[:, 'cp_time'].map({24: 0, 48: 1, 72: 2})
cat_tr = cat_tr.values
cat_test = cat_test.values
targets_tr = train_targets[cols].values.astype(np.float32)

# Utils

In [None]:
def evals(model, X, y, verbose=True):
    with torch.no_grad():
        y_preds = model.predict(X)
        y_preds = torch.clamp(y_preds, 0.0,1.0).detach().numpy()
    score = log_loss_multi(y, y_preds)
    #print("Logloss = ", score)
    return y_preds, score


def inference_fn(model, X ,verbose=True):
    with torch.no_grad():
        y_preds = model.predict( X )
        y_preds = torch.sigmoid(torch.as_tensor(y_preds)).numpy()
    return y_preds

def log_loss_score(actual, predicted,  eps=1e-15):

        """
        :param predicted:   The predicted probabilities as floats between 0-1
        :param actual:      The binary labels. Either 0 or 1.
        :param eps:         Log(0) is equal to infinity, so we need to offset our predicted values slightly by eps from 0 or 1
        :return:            The logarithmic loss between between the predicted probability assigned to the possible outcomes for item i, and the actual outcome.
        """

        
        p1 = actual * np.log(predicted+eps)
        p0 = (1-actual) * np.log(1-predicted+eps)
        loss = p0 + p1

        return -loss.mean()
def log_loss_multi(y_true, y_pred):
    M = y_true.shape[1]
    results = np.zeros(M)
    for i in range(M):
        results[i] = log_loss_score(y_true[:,i], y_pred[:,i])
    return results.mean()
def check_targets(targets):
    ### check if targets are all binary in training set
    
    for i in range(targets.shape[1]):
        if len(np.unique(targets[:,i])) != 2:
            return False
    return True
def auc_multi(y_true, y_pred):
    M = y_true.shape[1]
    results = np.zeros(M)
    for i in range(M):
        try:
            results[i] = roc_auc_score(y_true[:,i], y_pred[:,i])
        except:
            pass
    return results.mean()

import torch
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F

class SmoothBCEwLogits(_WeightedLoss):
    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        loss = F.binary_cross_entropy_with_logits(inputs, targets, self.weight)

        if  self.reduction == 'sum':
            loss = loss.sum()
        elif  self.reduction == 'mean':
            loss = loss.mean()

        return loss
    
sbcewlogits = SmoothBCEwLogits(smoothing = 0.0008)

# TabNet

In [None]:
## TABNET

import torch
import numpy as np
from scipy.sparse import csc_matrix
import time
from abc import abstractmethod
from pytorch_tabnet import tab_network
from pytorch_tabnet.multiclass_utils import unique_labels
from sklearn.metrics import roc_auc_score, mean_squared_error, accuracy_score
from torch.nn.utils import clip_grad_norm_
from pytorch_tabnet.utils import (PredictDataset,
                                  create_dataloaders,
                                  create_explain_matrix)
from sklearn.base import BaseEstimator
from torch.utils.data import DataLoader
from copy import deepcopy
import io
import json
from pathlib import Path
import shutil
import zipfile

class TabModel(BaseEstimator):
    def __init__(self, n_d=8, n_a=8, n_steps=3, gamma=1.3, cat_idxs=[], cat_dims=[], cat_emb_dim=1,
                 n_independent=2, n_shared=2, epsilon=1e-15,  momentum=0.02,
                 lambda_sparse=1e-3, seed=0,
                 clip_value=1, verbose=1,
                 optimizer_fn=torch.optim.Adam,
                 optimizer_params=dict(lr=2e-2),
                 scheduler_params=None, scheduler_fn=None,
                 mask_type="sparsemax",
                 input_dim=None, output_dim=None,
                 device_name='auto'):
        """ Class for TabNet model
        Parameters
        ----------
            device_name: str
                'cuda' if running on GPU, 'cpu' if not, 'auto' to autodetect
        """

        self.n_d = n_d
        self.n_a = n_a
        self.n_steps = n_steps
        self.gamma = gamma
        self.cat_idxs = cat_idxs
        self.cat_dims = cat_dims
        self.cat_emb_dim = cat_emb_dim
        self.n_independent = n_independent
        self.n_shared = n_shared
        self.epsilon = epsilon
        self.momentum = momentum
        self.lambda_sparse = lambda_sparse
        self.clip_value = clip_value
        self.verbose = verbose
        self.optimizer_fn = optimizer_fn
        self.optimizer_params = optimizer_params
        self.device_name = device_name
        self.scheduler_params = scheduler_params
        self.scheduler_fn = scheduler_fn
        self.mask_type = mask_type
        self.input_dim = input_dim
        self.output_dim = output_dim

        self.batch_size = 1024

        self.seed = seed
        torch.manual_seed(self.seed)
        # Defining device
        if device_name == 'auto':
            if torch.cuda.is_available():
                device_name = 'cuda'
            else:
                device_name = 'cpu'
        self.device = torch.device(device_name)
        print(f"Device used : {self.device}")

    @abstractmethod
    def construct_loaders(self, X_train, y_train, X_valid, y_valid,
                          weights, batch_size, num_workers, drop_last):
        """
        Returns
        -------
        train_dataloader, valid_dataloader : torch.DataLoader, torch.DataLoader
            Training and validation dataloaders
        -------
        """
        raise NotImplementedError('users must define construct_loaders to use this base class')

    def init_network(
                     self,
                     input_dim,
                     output_dim,
                     n_d,
                     n_a,
                     n_steps,
                     gamma,
                     cat_idxs,
                     cat_dims,
                     cat_emb_dim,
                     n_independent,
                     n_shared,
                     epsilon,
                     virtual_batch_size,
                     momentum,
                     device_name,
                     mask_type,
                     ):
        self.network = tab_network.TabNet(
            input_dim,
            output_dim,
            n_d=n_d,
            n_a=n_a,
            n_steps=n_steps,
            gamma=gamma,
            cat_idxs=cat_idxs,
            cat_dims=cat_dims,
            cat_emb_dim=cat_emb_dim,
            n_independent=n_independent,
            n_shared=n_shared,
            epsilon=epsilon,
            virtual_batch_size=virtual_batch_size,
            momentum=momentum,
            device_name=device_name,
            mask_type=mask_type).to(self.device)

        self.reducing_matrix = create_explain_matrix(
            self.network.input_dim,
            self.network.cat_emb_dim,
            self.network.cat_idxs,
            self.network.post_embed_dim)

    def fit(self, X_train, y_train, X_valid=None, y_valid=None, loss_fn=None,
            weights=0, max_epochs=100, patience=10, batch_size=1024,
            virtual_batch_size=128, num_workers=0, drop_last=False, pretrain=False, optimizer_params=None):
        """Train a neural network stored in self.network
        Using train_dataloader for training data and
        valid_dataloader for validation.
        Parameters
        ----------
            X_train: np.ndarray
                Train set
            y_train : np.array
                Train targets
            X_train: np.ndarray
                Train set
            y_train : np.array
                Train targets
            weights : bool or dictionnary
                0 for no balancing
                1 for automated balancing
                dict for custom weights per class
            max_epochs : int
                Maximum number of epochs during training
            patience : int
                Number of consecutive non improving epoch before early stopping
            batch_size : int
                Training batch size
            virtual_batch_size : int
                Batch size for Ghost Batch Normalization (virtual_batch_size < batch_size)
            num_workers : int
                Number of workers used in torch.utils.data.DataLoader
            drop_last : bool
                Whether to drop last batch during training
        """
        # update model name

        self.update_fit_params(X_train, y_train, X_valid, y_valid, loss_fn,
                               weights, max_epochs, patience, batch_size,
                               virtual_batch_size, num_workers, drop_last)

        train_dataloader, valid_dataloader = self.construct_loaders(X_train,
                                                                    y_train,
                                                                    X_valid,
                                                                    y_valid,
                                                                    self.updated_weights,
                                                                    self.batch_size,
                                                                    self.num_workers,
                                                                    self.drop_last)
        if not pretrain:
            self.init_network(
                input_dim=self.input_dim,
                output_dim=self.output_dim,
                n_d=self.n_d,
                n_a=self.n_a,
                n_steps=self.n_steps,
                gamma=self.gamma,
                cat_idxs=self.cat_idxs,
                cat_dims=self.cat_dims,
                cat_emb_dim=self.cat_emb_dim,
                n_independent=self.n_independent,
                n_shared=self.n_shared,
                epsilon=self.epsilon,
                virtual_batch_size=self.virtual_batch_size,
                momentum=self.momentum,
                device_name=self.device_name,
                mask_type=self.mask_type
            )
            self.optimizer = self.optimizer_fn(self.network.parameters(),
                                               **self.optimizer_params)
        else:
            self.optimizer = self.optimizer_fn(self.network.parameters(),
                                               **optimizer_params)

        if self.scheduler_fn:
            self.scheduler = self.scheduler_fn(self.optimizer, **self.scheduler_params)
        else:
            self.scheduler = None

        self.losses_train = []
        self.losses_valid = []
        self.learning_rates = []
        self.metrics_train = []
        self.metrics_valid = []

        if self.verbose > 0:
            print("Will train until validation stopping metric",
                  f"hasn't improved in {self.patience} rounds.")
            msg_epoch = f'| EPOCH |  train  |   valid  | total time (s)'
            print('---------------------------------------')
            print(msg_epoch)

        total_time = 0
        while (self.epoch < self.max_epochs and
               self.patience_counter < self.patience):
            starting_time = time.time()
            # updates learning rate history
            self.learning_rates.append(self.optimizer.param_groups[-1]["lr"])

            fit_metrics = self.fit_epoch(train_dataloader, valid_dataloader)

            # leaving it here, may be used for callbacks later
            self.losses_train.append(fit_metrics['train']['loss_avg'])
            self.losses_valid.append(fit_metrics['valid']['total_loss'])
            self.metrics_train.append(fit_metrics['train']['stopping_loss'])
            self.metrics_valid.append(fit_metrics['valid']['stopping_loss'])

            stopping_loss = fit_metrics['valid']['stopping_loss']
            if stopping_loss < self.best_cost:
                self.best_cost = stopping_loss
                self.patience_counter = 0
                # Saving model
                self.best_network = deepcopy(self.network)
                has_improved = True
            else:
                self.patience_counter += 1
                has_improved=False
            self.epoch += 1
            total_time += time.time() - starting_time
            if self.verbose > 0:
                if self.epoch % self.verbose == 0:
                    separator = "|"
                    msg_epoch = f"| {self.epoch:<5} | "
                    msg_epoch += f" {fit_metrics['train']['stopping_loss']:.5f}"
                    msg_epoch += f' {separator:<2} '
                    msg_epoch += f" {fit_metrics['valid']['stopping_loss']:.5f}"
                    msg_epoch += f' {separator:<2} '
                    msg_epoch += f" {np.round(total_time, 1):<10}"
                    msg_epoch += f" {has_improved}"
                    print(msg_epoch)

        if self.verbose > 0:
            if self.patience_counter == self.patience:
                print(f"Early stopping occured at epoch {self.epoch}")
            print(f"Training done in {total_time:.3f} seconds.")
            print('---------------------------------------')

        self.history = {"train": {"loss": self.losses_train,
                                  "metric": self.metrics_train,
                                  "lr": self.learning_rates},
                        "valid": {"loss": self.losses_valid,
                                  "metric": self.metrics_valid}}
        # load best models post training
        self.load_best_model()

        # compute feature importance once the best model is defined
        self._compute_feature_importances(train_dataloader)

    def save_model(self, path):
        """
        Saving model with two distinct files.
        """
        saved_params = {}
        for key, val in self.get_params().items():
            if isinstance(val, type):
                # Don't save torch specific params
                continue
            else:
                saved_params[key] = val

        # Create folder
        Path(path).mkdir(parents=True, exist_ok=True)

        # Save models params
        with open(Path(path).joinpath("model_params.json"), "w", encoding="utf8") as f:
            json.dump(saved_params, f)

        # Save state_dict
        torch.save(self.network.state_dict(), Path(path).joinpath("network.pt"))
        shutil.make_archive(path, 'zip', path)
        shutil.rmtree(path)
        print(f"Successfully saved model at {path}.zip")
        return f"{path}.zip"

    def load_model(self, filepath):

        try:
            try:
                with zipfile.ZipFile(filepath) as z:
                    with z.open("model_params.json") as f:
                        loaded_params = json.load(f)
                    with z.open("network.pt") as f:
                        try:
                            saved_state_dict = torch.load(f)
                        except io.UnsupportedOperation:
                            # In Python <3.7, the returned file object is not seekable (which at least
                            # some versions of PyTorch require) - so we'll try buffering it in to a
                            # BytesIO instead:
                            saved_state_dict = torch.load(io.BytesIO(f.read()))
                            
            except:
                with open(os.path.join(filepath, "model_params.json")) as f:
                        loaded_params = json.load(f)

                saved_state_dict = torch.load(os.path.join(filepath, "network.pt"), map_location="cpu")
 
        except KeyError:
            raise KeyError("Your zip file is missing at least one component")

        #print(loaded_params)
        if torch.cuda.is_available():
            device_name = 'cuda'
        else:
            device_name = 'cpu'
        loaded_params["device_name"] = device_name
        self.__init__(**loaded_params)

        self.init_network(
            input_dim=self.input_dim,
            output_dim=self.output_dim,
            n_d=self.n_d,
            n_a=self.n_a,
            n_steps=self.n_steps,
            gamma=self.gamma,
            cat_idxs=self.cat_idxs,
            cat_dims=self.cat_dims,
            cat_emb_dim=self.cat_emb_dim,
            n_independent=self.n_independent,
            n_shared=self.n_shared,
            epsilon=self.epsilon,
            virtual_batch_size=1024,
            momentum=self.momentum,
            device_name=self.device_name,
            mask_type=self.mask_type
        )
        self.network.load_state_dict(saved_state_dict)
        self.network.eval()
        return

    def fit_epoch(self, train_dataloader, valid_dataloader):
        """
        Evaluates and updates network for one epoch.
        Parameters
        ----------
            train_dataloader: a :class: `torch.utils.data.Dataloader`
                DataLoader with train set
            valid_dataloader: a :class: `torch.utils.data.Dataloader`
                DataLoader with valid set
        """
        train_metrics = self.train_epoch(train_dataloader)
        valid_metrics = self.predict_epoch(valid_dataloader)

        fit_metrics = {'train': train_metrics,
                       'valid': valid_metrics}

        return fit_metrics

    @abstractmethod
    def train_epoch(self, train_loader):
        """
        Trains one epoch of the network in self.network
        Parameters
        ----------
            train_loader: a :class: `torch.utils.data.Dataloader`
                DataLoader with train set
        """
        raise NotImplementedError('users must define train_epoch to use this base class')

    @abstractmethod
    def train_batch(self, data, targets):
        """
        Trains one batch of data
        Parameters
        ----------
            data: a :tensor: `torch.tensor`
                Input data
            target: a :tensor: `torch.tensor`
                Target data
        """
        raise NotImplementedError('users must define train_batch to use this base class')

    @abstractmethod
    def predict_epoch(self, loader):
        """
        Validates one epoch of the network in self.network
        Parameters
        ----------
            loader: a :class: `torch.utils.data.Dataloader`
                    DataLoader with validation set
        """
        raise NotImplementedError('users must define predict_epoch to use this base class')

    @abstractmethod
    def predict_batch(self, data, targets):
        """
        Make predictions on a batch (valid)
        Parameters
        ----------
            data: a :tensor: `torch.Tensor`
                Input data
            target: a :tensor: `torch.Tensor`
                Target data
        Returns
        -------
            batch_outs: dict
        """
        raise NotImplementedError('users must define predict_batch to use this base class')

    def load_best_model(self):
        if self.best_network is not None:
            self.network = self.best_network

    @abstractmethod
    def predict(self, X):
        """
        Make predictions on a batch (valid)
        Parameters
        ----------
            data: a :tensor: `torch.Tensor`
                Input data
            target: a :tensor: `torch.Tensor`
                Target data
        Returns
        -------
            predictions: np.array
                Predictions of the regression problem or the last class
        """
        raise NotImplementedError('users must define predict to use this base class')

    def explain(self, X):
        """
        Return local explanation
        Parameters
        ----------
            data: a :tensor: `torch.Tensor`
                Input data
            target: a :tensor: `torch.Tensor`
                Target data
        Returns
        -------
            M_explain: matrix
                Importance per sample, per columns.
            masks: matrix
                Sparse matrix showing attention masks used by network.
        """
        self.network.eval()

        dataloader = DataLoader(PredictDataset(X),
                                batch_size=self.batch_size, shuffle=False)

        for batch_nb, data in enumerate(dataloader):
            data = data.to(self.device).float()

            M_explain, masks = self.network.forward_masks(data)
            for key, value in masks.items():
                masks[key] = csc_matrix.dot(value.cpu().detach().numpy(),
                                            self.reducing_matrix)

            if batch_nb == 0:
                res_explain = csc_matrix.dot(M_explain.cpu().detach().numpy(),
                                             self.reducing_matrix)
                res_masks = masks
            else:
                res_explain = np.vstack([res_explain,
                                         csc_matrix.dot(M_explain.cpu().detach().numpy(),
                                                        self.reducing_matrix)])
                for key, value in masks.items():
                    res_masks[key] = np.vstack([res_masks[key], value])
        return res_explain, res_masks

    def _compute_feature_importances(self, loader):
        self.network.eval()
        feature_importances_ = np.zeros((self.network.post_embed_dim))
        for data, targets in loader:
            data = data.to(self.device).float()
            M_explain, masks = self.network.forward_masks(data)
            feature_importances_ += M_explain.sum(dim=0).cpu().detach().numpy()

        feature_importances_ = csc_matrix.dot(feature_importances_,
                                              self.reducing_matrix)
        self.feature_importances_ = feature_importances_ / np.sum(feature_importances_)
        
        
class TabNetRegressor(TabModel):

    def construct_loaders(self, X_train, y_train, X_valid, y_valid, weights,
                          batch_size, num_workers, drop_last):
        """
        Returns
        -------
        train_dataloader, valid_dataloader : torch.DataLoader, torch.DataLoader
            Training and validation dataloaders
        -------
        """
        if isinstance(weights, int):
            if weights == 1:
                raise ValueError("Please provide a list of weights for regression.")
        if isinstance(weights, dict):
            raise ValueError("Please provide a list of weights for regression.")

        train_dataloader, valid_dataloader = create_dataloaders(X_train,
                                                                y_train,
                                                                X_valid,
                                                                y_valid,
                                                                weights,
                                                                batch_size,
                                                                num_workers,
                                                                drop_last)
        return train_dataloader, valid_dataloader

    def update_fit_params(self, X_train, y_train, X_valid, y_valid, loss_fn,
                          weights, max_epochs, patience,
                          batch_size, virtual_batch_size, num_workers, drop_last):

        if loss_fn is None:
            self.loss_fn = torch.nn.functional.mse_loss
        else:
            self.loss_fn = loss_fn

        assert X_train.shape[1] == X_valid.shape[1], "Dimension mismatch X_train X_valid"
        self.input_dim = X_train.shape[1]

        if len(y_train.shape) == 1:
            raise ValueError("""Please apply reshape(-1, 1) to your targets
                                if doing single regression.""")
        assert y_train.shape[1] == y_valid.shape[1], "Dimension mismatch y_train y_valid"
        self.output_dim = y_train.shape[1]

        self.updated_weights = weights

        self.max_epochs = max_epochs
        self.patience = patience
        self.batch_size = batch_size
        self.virtual_batch_size = virtual_batch_size
        # Initialize counters and histories.
        self.patience_counter = 0
        self.epoch = 0
        self.best_cost = np.inf
        self.num_workers = num_workers
        self.drop_last = drop_last

    def train_epoch(self, train_loader):
        """
        Trains one epoch of the network in self.network
        Parameters
        ----------
            train_loader: a :class: `torch.utils.data.Dataloader`
                DataLoader with train set
        """

        self.network.train()
        y_preds = []
        ys = []
        total_loss = 0

        for data, targets in train_loader:
            batch_outs = self.train_batch(data, targets)
            y_preds.append(batch_outs["y_preds"].cpu().detach().numpy())
            ys.append(batch_outs["y"].cpu().detach().numpy())
            total_loss += batch_outs["loss"]

        y_preds = np.vstack(y_preds)
        ys = np.vstack(ys)

        #stopping_loss = mean_squared_error(y_true=ys, y_pred=y_preds)
        stopping_loss =log_loss_multi(ys, torch.sigmoid(torch.as_tensor(y_preds)).numpy()  )
        total_loss = total_loss / len(train_loader)
        epoch_metrics = {'loss_avg': total_loss,
                         'stopping_loss': total_loss,
                         }

        if self.scheduler is not None:
            self.scheduler.step()
        return epoch_metrics

    def train_batch(self, data, targets):
        """
        Trains one batch of data
        Parameters
        ----------
            data: a :tensor: `torch.tensor`
                Input data
            target: a :tensor: `torch.tensor`
                Target data
        """
        self.network.train()
        data = data.to(self.device).float()

        targets = targets.to(self.device).float()
        self.optimizer.zero_grad()

        output, M_loss = self.network(data)

        loss = self.loss_fn(output, targets)
        
        loss -= self.lambda_sparse*M_loss

        loss.backward()
        if self.clip_value:
            clip_grad_norm_(self.network.parameters(), self.clip_value)
        self.optimizer.step()

        loss_value = loss.item()
        batch_outs = {'loss': loss_value,
                      'y_preds': output,
                      'y': targets}
        return batch_outs

    def predict_epoch(self, loader):
        """
        Validates one epoch of the network in self.network
        Parameters
        ----------
            loader: a :class: `torch.utils.data.Dataloader`
                    DataLoader with validation set
        """
        y_preds = []
        ys = []
        self.network.eval()
        total_loss = 0

        for data, targets in loader:
            batch_outs = self.predict_batch(data, targets)
            total_loss += batch_outs["loss"]
            y_preds.append(batch_outs["y_preds"].cpu().detach().numpy())
            ys.append(batch_outs["y"].cpu().detach().numpy())

        y_preds = np.vstack(y_preds)
        ys = np.vstack(ys)

        stopping_loss =log_loss_multi(ys, torch.sigmoid(torch.as_tensor(y_preds)).numpy()  ) #mean_squared_error(y_true=ys, y_pred=y_preds)

        total_loss = total_loss / len(loader)
        epoch_metrics = {'total_loss': total_loss,
                         'stopping_loss': stopping_loss}

        return epoch_metrics

    def predict_batch(self, data, targets):
        """
        Make predictions on a batch (valid)
        Parameters
        ----------
            data: a :tensor: `torch.Tensor`
                Input data
            target: a :tensor: `torch.Tensor`
                Target data
        Returns
        -------
            batch_outs: dict
        """
        self.network.eval()
        data = data.to(self.device).float()
        targets = targets.to(self.device).float()

        output, M_loss = self.network(data)
       
        loss = self.loss_fn(output, targets)
        #print(self.loss_fn, loss)
        loss -= self.lambda_sparse*M_loss
        #print(loss)
        loss_value = loss.item()
        batch_outs = {'loss': loss_value,
                      'y_preds': output,
                      'y': targets}
        return batch_outs

    def predict(self, X):
        """
        Make predictions on a batch (valid)
        Parameters
        ----------
            data: a :tensor: `torch.Tensor`
                Input data
            target: a :tensor: `torch.Tensor`
                Target data
        Returns
        -------
            predictions: np.array
                Predictions of the regression problem
        """
        self.network.eval()
        dataloader = DataLoader(PredictDataset(X),
                                batch_size=self.batch_size, shuffle=False)

        results = []
        for batch_nb, data in enumerate(dataloader):
            data = data.to(self.device).float()

            output, M_loss = self.network(data)
            predictions = output.cpu().detach().numpy()
            results.append(predictions)
        res = np.vstack(results)
        return res

In [None]:
class Config(object):
    def __init__(self):
        self.num_class = targets_tr.shape[1]
        self.verbose=False
        #
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.SPLITS = N_SPLITS
        self.EPOCHS = 200
        self.num_ensembling = N_STARTS
        self.seed = 0
        # Parameters model
        self.cat_emb_dim=[1] * cat_tr.shape[1] #to choose
        self.cats_idx = list(range(cat_tr.shape[1]))
        self.cat_dims = [len(np.unique(cat_tr[:, i])) for i in range(cat_tr.shape[1])]
        self.num_numericals = numerical_tr.shape[1]
        # save
        self.save_name = "../input/multilabel-pbestpre-tabnet/tabnet_raw_step1"
        
        self.strategy = "KFOLD" # 
        
cfg = Config()

In [None]:
X_test = np.concatenate([cat_test, numerical_test], axis=1)
if cfg.strategy == "KFOLD":
    oof_preds_all = []
    oof_targets_all = []
    scores_all =  []
    scores_auc_all= []
    preds_test = []
    res = np.zeros(targets_tr.shape)
    for nums, seed in enumerate(range(cfg.num_ensembling)):
        print("## SEED : ", seed)
        mskf = MultilabelStratifiedKFold(n_splits=cfg.SPLITS, random_state=cfg.seed+seed, shuffle=True)
        oof_preds = []
        oof_targets = []
        scores = []
        scores_auc = []
        p = []
        for j, (train_idx, val_idx) in enumerate(mskf.split(np.zeros(len(cat_tr)), targets_tr)):
            print("FOLDS : ", j)

            ## model
            X_train, y_train = torch.as_tensor(np.concatenate([cat_tr[train_idx], numerical_tr[train_idx] ], axis=1)), torch.as_tensor(targets_tr[train_idx])
            X_val, y_val = torch.as_tensor(np.concatenate([cat_tr[val_idx], numerical_tr[val_idx] ], axis=1)), torch.as_tensor(targets_tr[val_idx])
            model = TabNetRegressor(n_d=24, n_a=24, n_steps=1, gamma=1.3, lambda_sparse=0, cat_dims=cfg.cat_dims, cat_emb_dim=cfg.cat_emb_dim, cat_idxs=cfg.cats_idx, optimizer_fn=torch.optim.Adam,
                                   optimizer_params=dict(lr=2e-2), mask_type='entmax', device_name=cfg.device, scheduler_params=dict(milestones=[ 50,100,150], gamma=0.9), scheduler_fn=torch.optim.lr_scheduler.MultiStepLR)
            #'sparsemax'
            
            name = cfg.save_name + f"_fold{j}_{seed}"
            model.load_model(name)
            if CALCULATE_OOF:
            # preds on val
                preds = model.predict(X_val)
                preds = torch.sigmoid(torch.as_tensor(preds)).detach().cpu().numpy()
                score = log_loss_multi(y_val, preds)
                res[val_idx] += preds / cfg.num_ensembling

                ## save oof to compute the CV later
                oof_preds.append(preds)
                oof_targets.append(y_val)
                scores.append(score)
                scores_auc.append(auc_multi(y_val,preds))
                print(f"validation fold {j} : {score}")
                
                
            # preds on test
            temp = model.predict(X_test)
            p.append(torch.sigmoid(torch.as_tensor(temp)).detach().cpu().numpy())
                
        p = np.stack(p)
        preds_test.append(p)
        
        if CALCULATE_OOF:
            oof_preds_all.append(np.concatenate(oof_preds))
            oof_targets_all.append(np.concatenate(oof_targets))
            scores_all.append(np.array(scores))
            scores_auc_all.append(np.array(scores_auc))
            
    preds_test = np.stack(preds_test)

In [None]:
if CALCULATE_OOF:

    if cfg.strategy == "KFOLD":

        for i in range(cfg.num_ensembling): 
            print("CV score fold : ", log_loss_multi(oof_targets_all[i], oof_preds_all[i]))
            print("auc mean : ", sum(scores_auc_all[i])/len(scores_auc_all[i]))
        
    # Overall OOF CV Score
    tr_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv').drop('sig_id', axis = 1)
    res_all = np.zeros(tr_targets[cols].shape)
    res_all[train_features['cp_type'] == 0] = res
    overall_oof_score = log_loss_metric(tr_targets[cols].values, res_all)
    print(f'TabNet Overall OOF CV Score:', overall_oof_score)

In [None]:
submission_tabnet = pd.read_csv('../input/lish-moa/sample_submission.csv')
submission_tabnet[cols] = preds_test.mean(1).mean(0)
submission_tabnet.loc[test['cp_type'] == 1, cols] = 0

# Model Function

In [None]:
def create_mlp_elu(num_columns, hidden_units, dropout_rates, lr):
    
    inp = tf.keras.layers.Input(shape = (num_columns, ), name = 'inp')
    x = tf.keras.layers.BatchNormalization(name = 'bn0')(inp)
    x = tf.keras.layers.Dropout(0.3, name = 'dp0')(x)
    
    for i, units in enumerate(hidden_units):
        
        x = tfa.layers.WeightNormalization(tf.keras.layers.Dense(units, activation = 'elu', name = f'd{i}'), name = f'w{i}')(x)
        x = tf.keras.layers.Dropout(dropout_rates, name = f'dp{i + 1}')(x)
        x = tf.keras.layers.BatchNormalization(name = f'bn{i + 1}')(x)
        
    out = tfa.layers.WeightNormalization(tf.keras.layers.Dense(206, activation = 'sigmoid', 
                                                               bias_initializer=tf.keras.initializers.Constant(6.3), 
                                                               name = 'out206'), name = 'wn_out206')(x)
    
    model = tf.keras.models.Model(inputs = inp, outputs = out)
    
    metrics = [tf.keras.losses.BinaryCrossentropy(name = 'mean_loss')]
    model.compile(optimizer = tfa.optimizers.AdamW(weight_decay = 1e-5, learning_rate = lr),
                  loss = tf.keras.losses.BinaryCrossentropy(label_smoothing = 0.0008), 
                  metrics = metrics, 
                  )
    
    return model

In [None]:
def create_resnet(num_columns, hidden_units, dropout_rates, lr):
    inp = layers.Input(shape = (num_columns,), name = 'inp')

    head_1 = tf.keras.models.Sequential([
        layers.BatchNormalization(),
        layers.Dropout(dropout_rates[0]),
        layers.Dense(hidden_units[0], activation = "elu"), 
        layers.BatchNormalization(),
        layers.Dense(hidden_units[1], activation = "elu")
        ], name = 'Head1') 

    inp1 = head_1(inp)

    head_2 = tf.keras.models.Sequential([
        layers.BatchNormalization(),
        layers.Dropout(dropout_rates[0]),
        layers.Dense(hidden_units[2], "relu"),
        layers.BatchNormalization(),
        layers.Dense(hidden_units[2], "elu"),
        layers.BatchNormalization(),
        layers.Dense(hidden_units[1], "relu"),
        layers.BatchNormalization(),
        layers.Dense(hidden_units[1], "elu")
        ], name = 'Head2')

    inp2 = head_2(inp1)
    inp2_avg = layers.Average(name = 'average')([inp1, inp2]) 

    head_3 = tf.keras.models.Sequential([
        layers.BatchNormalization(),
        layers.Dense(hidden_units[3], kernel_initializer = 'lecun_normal', activation = 'selu'),
        layers.BatchNormalization(),
        layers.Dense(206, kernel_initializer = 'lecun_normal', activation = 'selu'),
        layers.BatchNormalization(),
        ], name = 'Head3')

    out0 = head_3(inp2_avg)
    out = layers.Dense(206, activation = "sigmoid", name = 'out_206')(out0)

    model = tf.keras.models.Model(inputs = inp, outputs = out)
    model.compile(optimizer = tfa.optimizers.AdamW(weight_decay = 1e-5, learning_rate = lr), 
                  loss = tf.keras.losses.BinaryCrossentropy(label_smoothing = 0.0008), 
                  metrics = tf.keras.losses.BinaryCrossentropy(name = 'mean_loss'),
                 )
    
    return model

In [None]:
def create_mlp(num_columns, num_labels, hidden_units, dropout_rates, lr):
    
    inp = tf.keras.layers.Input(shape = (num_columns, ), name = 'inp')
    x = tf.keras.layers.BatchNormalization(name = 'bn0')(inp)
    x = tf.keras.layers.Dropout(dropout_rates[0], name = 'dp0')(x)
    for i in range(len(hidden_units)): 
        x = tfa.layers.WeightNormalization(tf.keras.layers.Dense(hidden_units[i], name = f'd{i}'), name = f'wn{i}')(x)
        x = tf.keras.layers.Activation(tf.keras.activations.swish, name = f'a{i}')(x)
        x = tf.keras.layers.BatchNormalization(name = f'bn{i+1}')(x)
        x = tf.keras.layers.Dropout(dropout_rates[i+1], name = f'dp{i+1}')(x)    
        
    x = tfa.layers.WeightNormalization(tf.keras.layers.Dense(num_labels, 
                                                             bias_initializer = tf.keras.initializers.Constant(6.3), 
                                                             name = f'output_d{num_labels}'), 
                                       name = f'output_wn{num_labels}')(x)
    out = tf.keras.layers.Activation('sigmoid', name = f'output_a{num_labels}')(x)
    
    model = tf.keras.models.Model(inputs = inp, outputs = out)
    model.compile(optimizer = tfa.optimizers.AdamW(weight_decay = 1e-5, learning_rate = lr),
                  loss = tf.keras.losses.BinaryCrossentropy(label_smoothing = 0.0008), 
                  metrics = tf.keras.losses.BinaryCrossentropy(name = 'mean_loss'), 
                 )
    
    return model

In [None]:
def create_rtn(num_columns, num_labels, rethink_iter, num_layers, hidden_units, dropout_rates, lr):
    inp = tf.keras.layers.Input(shape = (num_columns, ), name = 'inp')
    x = tf.keras.layers.BatchNormalization(name = 'bn0')(inp)
    x = tf.keras.layers.Dropout(dropout_rates[0], name = 'dp0')(x)
    x = tf.keras.layers.RepeatVector(rethink_iter, name = 'rv')(x)

    x = tfa.layers.WeightNormalization(tf.keras.layers.Dense(hidden_units[0], name = 'd0'), name = 'wn0')(x)
    x = tf.keras.layers.Activation('swish', name = 'a0')(x)
    
    for i in range(num_layers):
        if i != num_layers - 1:
            x = tf.keras.layers.LSTM(hidden_units[1], return_sequences = True, dropout = dropout_rates[1], name = f'lstm{i}')(x)
        else:
            x = tf.keras.layers.LSTM(hidden_units[1], return_sequences = False, dropout = dropout_rates[1], name = f'lstm{i}')(x)

    x = tfa.layers.WeightNormalization(tf.keras.layers.Dense(num_labels, name = f'd{num_labels}'), name = f'wn{num_labels}')(x)
    out = tf.keras.layers.Activation('sigmoid', name = f'out{num_labels}')(x)

    model = tf.keras.models.Model(inputs = inp, outputs = out)
    model.compile(optimizer = tfa.optimizers.AdamW(weight_decay = 1e-5, learning_rate = lr), 
                  loss = tf.keras.losses.BinaryCrossentropy(label_smoothing = 0.0008), 
                  metrics = tf.keras.losses.BinaryCrossentropy(name = 'mean_loss'),
                 )
    
    return model

# Inference Model

In [None]:
def inference_model(X_train, Y_train_2, Y_nonscored, features, model_name, model_name_2, save_path, num_seeds, num_splits, 
                    model_params, X_test = None, sample_sub_path = None, verbose = 0):
    start_time_all = time()
    oof = Y_train_2.copy()
    oof.loc[:, Y_train_2.columns] = 0
    overall_score = []
    if X_test is not None:
        sub = pd.read_csv(sample_sub_path)
        sub.loc[:, Y_train_2.columns] = 0
    else:
        sub = None
    if 'RTN' in model_name_2:
        model = create_rtn(len(features), 206, **model_params)
    elif 'RESNET' in model_name_2:
        model = create_resnet(len(features), **model_params)
    elif 'ELU' in model_name_2:
        model = create_mlp_elu(len(features), **model_params)
    elif 'MLP' in model_name_2:
        model = create_mlp(len(features), 206, **model_params)
    for nums, seed in enumerate(range(num_seeds)):
        start_time_seed = time()
        tf.random.set_seed(seed)
        np.random.seed(seed)
        random.seed(seed) 
        mean_score = 0
        skf = MultilabelStratifiedKFold(n_splits = num_splits, random_state = seed, shuffle = True)
        for n, (tr, te) in enumerate(skf.split(Y_train_2, Y_train_2)):
            print(f'Model:{model_name}, Seed:{seed}, Fold:{n}', end = '\r')
            start_time_fold = time()
            if CALCULATE_OOF:
                x_tr, x_val = X_train.values[tr][:, features], X_train.values[te][:, features]
                y_tr, y_val = Y_train_2.values[tr], Y_train_2.values[te]
                
            if 'ELU' in model_name_2:
                ckp_path = save_path + f'{model_name}_{seed}_{n}.hdf5' 
            else:
                ckp_path = save_path + f'{model_name}_Seed_{seed}_Fold_{n}.hdf5' 
            model.load_weights(ckp_path)
            
            if X_test is not None:
                x_tt = X_test.values[:, features]
                test_predict = model.predict(x_tt, batch_size = 1024)
                sub.loc[:, Y_train_2.columns] += test_predict / (num_splits * num_seeds)
            
            if CALCULATE_OOF:
                val_predict = model.predict(x_val, batch_size = 1024)
#                 fold_score = hist['val_mean_loss'].min()
                fold_score = log_loss_metric(y_val, val_predict)
                mean_score += fold_score / num_splits
                oof.loc[te, Y_train_2.columns] += val_predict / num_seeds
#                 print(f'[{str(datetime.timedelta(seconds = time() - start_time_fold))[0:7]}] {model_name} Seed {seed}, Fold {n}:', fold_score)
            
#             del model
#             x = gc.collect()
#             K.clear_session()
            
        if CALCULATE_OOF:
            pass
#             print(f'[{str(datetime.timedelta(seconds = time() - start_time_seed))[0:7]}] {model_name} Seed {seed} Mean Score:', mean_score)
    
    if X_test is not None:
        sub.loc[X_test['cp_type'] == 1, Y_train_2.columns] = 0
    
    if CALCULATE_OOF:
        oof.loc[X_train['cp_type'] == 1, Y_train_2.columns] = 0
        overall_score = log_loss_metric(Y_train_2.values, oof[Y_train_2.columns].values)
        print(f'[{str(datetime.timedelta(seconds = time() - start_time_all))[0:7]}] {model_name} OOF Score:', overall_score)
        
    return overall_score, oof, sub

In [None]:
def inference_ensemble(X_train, Y_train_2, model_name, model_name_2, save_path, num_seeds, num_splits, 
                       model_params, X_test = None, sample_sub_path = None, verbose = 0):
    start_time_all = time()
    oof = Y_train_2.copy()
    oof.loc[:, Y_train_2.columns] = 0
    overall_score = []
    if X_test is not None:
        sub = pd.read_csv(sample_sub_path)
        sub.loc[:, Y_train_2.columns] = 0
    else:
        sub = None
    if 'RTN' in model_name_2:
        model = create_rtn(X_train.shape[1], 206, **model_params)
    elif 'RESNET' in model_name_2:
        model = create_resnet(X_train.shape[1], **model_params)
    elif 'ELU' in model_name_2:
        model = create_mlp_elu(X_train.shape[1], **model_params)
    elif 'MLP' in model_name_2:
        model = create_mlp(X_train.shape[1], 206, **model_params)
    for seed in range(num_seeds):
        start_time_seed = time()
        tf.random.set_seed(seed)
        np.random.seed(seed)
        random.seed(seed) 
        mean_score = 0
        skf = MultilabelStratifiedKFold(n_splits = num_splits, random_state = seed, shuffle = True)
        for n, (tr, te) in enumerate(skf.split(Y_train_2, Y_train_2)):
            print(f'Model:{model_name}, Seed:{seed}, Fold:{n}', end = '\r')
            start_time_fold = time()
            if CALCULATE_OOF:
                x_tr, x_val = X_train[tr], X_train[te]
                y_tr, y_val = Y_train_2.values[tr], Y_train_2.values[te]
                
            if 'ELU' in model_name_2:
                ckp_path = save_path + f'{model_name}_{seed}_{n}.hdf5' 
            else:
                ckp_path = save_path + f'{model_name}_Seed_{seed}_Fold_{n}.hdf5'
            model.load_weights(ckp_path)
            
            if X_test is not None:
                x_tt = X_test
                test_predict = model.predict(x_tt, batch_size = 1024)
                sub.loc[:, Y_train_2.columns] += test_predict / (num_splits * num_seeds)
            
            if CALCULATE_OOF:
                val_predict = model.predict(x_val, batch_size = 1024)
#                 fold_score = hist['val_mean_loss'].min()
                fold_score = log_loss_metric(y_val, val_predict)
                mean_score += fold_score / num_splits
                oof.loc[te, Y_train_2.columns] += val_predict / num_seeds
#                 print(f'[{str(datetime.timedelta(seconds = time() - start_time_fold))[0:7]}] {model_name} Seed {seed}, Fold {n}:', fold_score)
            
#             del model
#             x = gc.collect()
#             K.clear_session()

        if CALCULATE_OOF:
            pass
#             print(f'[{str(datetime.timedelta(seconds = time() - start_time_seed))[0:7]}] {model_name} Seed {seed} Mean Score:', mean_score)
    
    if X_test is not None:
        sub.loc[test['cp_type'] == 1, Y_train_2.columns] = 0
    
    if CALCULATE_OOF:
        oof.loc[train['cp_type'] == 1, Y_train_2.columns] = 0
        overall_score = log_loss_metric(Y_train_2.values, oof[Y_train_2.columns].values)
        print(f'[{str(datetime.timedelta(seconds = time() - start_time_all))[0:7]}] {model_name} OOF Score:', overall_score)
        
    return overall_score, oof, sub

In [None]:
model_names = ['ELU_0', 'ELU_1', 'ELU_2', 'RESNET_0', 'RESNET_1', 'RESNET_2', 'MLP_0', 'MLP_1', 'MLP_2', 'RTN']

model_params = [{'hidden_units': [512, 512, 4096],  
                 'dropout_rates': 0.45,
                 'lr': 1e-4,
                }, 
                {'hidden_units': [512, 1024],  
                 'dropout_rates': 0.463,
                 'lr': 1e-4,
                },
                {'hidden_units': [1024, 1024],  
                 'dropout_rates': 0.5, 
                 'lr': 1e-4,
                },
                {'hidden_units': [128, 896, 256, 1024], 
                 'dropout_rates': [0.5615750059111406, 0.381766362379825], 
                 'lr': 1e-4,
                } , 
                {'hidden_units': [256, 1024, 512, 896], 
                 'dropout_rates': [0.5493076960151594, 0.5121645764863383], 
                 'lr': 1e-4,
                }, 
                {'hidden_units': [384, 1024, 256, 896], 
                 'dropout_rates': [0.5923827297670073, 0.37098621422772815], 
                 'lr': 1e-4,
                }, 
                {'hidden_units': [128, 1024],  
                 'dropout_rates': [0.41458519175008285, 0.38992411412605404, 0.10265155152086326], 
                 'lr': 1e-4,
                }, 
                {'hidden_units': [896, 128, 1664],  
                 'dropout_rates': [0.532412647744322, 0.2498368055693044, 0.36619131749273925, 0.1386672227832089], 
                 'lr': 1e-4,
                }, 
                {'hidden_units': [128, 896, 1024, 1792],  
                 'dropout_rates': [0.4131287697962003, 0.33921841003415876, 0.13058255266781393, 0.20075775903486198, 0.3354496944535896],
                 'lr': 1e-4,
                }, 
                {'rethink_iter': 3,  
                 'num_layers': 1, 
                 'hidden_units': [128, 512],  
                 'dropout_rates': [0.3, 0.4], 
                 'lr': 1e-4,
                },]

In [None]:
from time import time

VERBOSE = 0
sample_sub_path = '../input/lish-moa/sample_submission.csv'

overall_oof_scores = []
oof_elu = []
oof_resnet = []
oof_mlp = []
submission_elu = []
submission_resnet = []
submission_mlp = []
for m in range(len(model_params)):
    print(model_names[m], model_params[m])
    if 'ELU' in model_names[m]:
        model_idx = model_names[m].split('_')[1]
        save_path = '../input/multilabel-v2/'
        model_name = f'Model_{model_idx}'
    elif 'RESNET' in model_names[m]:
        model_idx = model_names[m].split('_')[1]
        save_path = '../input/multilabel-v4/'
        model_name = f'Model{model_idx}'
    elif 'MLP' in model_names[m]:
        model_idx = model_names[m].split('_')[1]
        save_path = '../input/miltilabel-mlpstack/'
        model_name = f'Model{model_idx}'
    else:
        save_path = '../input/multilabel-rtn-single/'
        model_name = f'Model0'        
        
    oof_score, res, ss = inference_model(train, train_targets, train_targets_nonscored, top_feats, model_name, model_names[m], 
                                         save_path, N_STARTS, N_SPLITS, model_params[m], test, sample_sub_path, VERBOSE)
    if 'ELU' in model_names[m]:
        oof_elu.append(res)
        submission_elu.append(ss)
    elif 'RESNET' in model_names[m]:
        oof_resnet.append(res)
        submission_resnet.append(ss)
    elif 'MLP' in model_names[m]:
        oof_mlp.append(res)
        submission_mlp.append(ss)
    else:
        oof_rtn = res
        submission_rtn = ss
    
    if CALCULATE_OOF:
        # Overall OOF CV Score
        tr_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv').drop('sig_id', axis = 1)
        res_all = np.zeros(tr_targets[cols].shape)
        res_all[train_features['cp_type'] == 0] = res[cols].values
        overall_oof_score = log_loss_metric(tr_targets[cols].values, res_all)
        overall_oof_scores.append(overall_oof_score)
        print(f'{model_names[m]} Overall OOF CV Score:', overall_oof_score)

In [None]:
if CALCULATE_OOF:
    for n, name in enumerate(model_names):
        print(f'{name} OOF:\t', overall_oof_scores[n])

In [None]:
if CALCULATE_OOF:
    train_new_elu = np.concatenate([oof0[cols].values for oof0 in oof_elu], axis = 1)
    train_new_resnet = np.concatenate([oof0[cols].values for oof0 in oof_resnet], axis = 1)
    train_new_mlp = np.concatenate([oof0[cols].values for oof0 in oof_mlp], axis = 1)
else:
    train_new_elu = np.zeros((train.shape[0], len(cols) * 3))
    train_new_resnet = np.zeros((train.shape[0], len(cols) * 3))
    train_new_mlp = np.zeros((train.shape[0], len(cols) * 3))
    
test_new_elu = np.concatenate([sub[cols].values for sub in submission_elu], axis = 1)
test_new_resnet = np.concatenate([sub[cols].values for sub in submission_resnet], axis = 1)
test_new_mlp = np.concatenate([sub[cols].values for sub in submission_mlp], axis = 1)

In [None]:
model_names = ['ELU_stack', 'RESNET_stack', 'MLP_stack']

model_params = [{'hidden_units': [1024, 1024],  
                 'dropout_rates': 0.336, 
                 'lr': 1e-4,
                },
                {'hidden_units': [1024, 896, 896, 256], 
                 'dropout_rates': [0.5398428318872255, 0.5734093398641228], 
                 'lr': 1e-4, 
                },
                {'hidden_units': [1920, 768],  
                 'dropout_rates': [0.36130273975713795, 0.38130486900003896, 0.44485672673556004], 
                 'lr': 1e-4,
                },]

In [None]:
overall_oof_stack_scores = []
for m in range(len(model_params)):
    print(model_names[m], model_params[m])
    if 'ELU' in model_names[m]:
        save_path = '../input/multilabel-v2/'
        model_name = f'EModel'
        oof_score, res, ss = inference_ensemble(train_new_elu, train_targets, model_name, model_names[m], save_path, 
                                                N_STARTS, N_SPLITS, model_params[m], test_new_elu, sample_sub_path, VERBOSE)
        oof_elu_stack = res.copy()
        submission_elu_stack = ss.copy()
        
    elif 'RESNET' in model_names[m]:
        save_path = '../input/multilabel-v4/'
        model_name = f'EModel_Stack'
        oof_score, res, ss = inference_ensemble(train_new_resnet, train_targets, model_name, model_names[m], save_path, 
                                                N_STARTS, N_SPLITS, model_params[m], test_new_resnet, sample_sub_path, VERBOSE)
        oof_resnet_stack = res.copy()
        submission_resnet_stack = ss.copy()
        
    elif 'MLP' in model_names[m]:
        save_path = '../input/miltilabel-mlpstack/'
        model_name = f'EModel_Stack'
        oof_score, res, ss = inference_ensemble(train_new_mlp, train_targets, model_name, model_names[m], save_path, 
                                                N_STARTS, N_SPLITS, model_params[m], test_new_mlp, sample_sub_path, VERBOSE)
        oof_mlp_stack = res.copy()
        submission_mlp_stack = ss.copy()
    
    if CALCULATE_OOF:
        # Overall OOF CV Score
        tr_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv').drop('sig_id', axis = 1)
        res_all = np.zeros(tr_targets[cols].shape)
        res_all[train_features['cp_type'] == 0] = res[cols].values
        overall_oof_score = log_loss_metric(tr_targets[cols].values, res_all)
        overall_oof_stack_scores.append(overall_oof_score)
        print(f'{model_names[m]} Overall OOF CV Score:', overall_oof_score)

In [None]:
if CALCULATE_OOF:
    for n, name in enumerate(model_names):
        print(f'{name} OOF:\t', overall_oof_stack_scores[n])

# Ensemble

In [None]:
@njit
def post_process(pred, low, high):
    pred_copy = pred.copy()
    num_idx = 0
    for i in range(pred_copy.shape[0]):
        flag = np.zeros(pred_copy.shape[1])
        array = pred_copy[i].copy()
        for j in range(pred_copy.shape[1]):
            if (pred_copy[i, j] <= low) or (pred_copy[i, j] >= high):
                flag[j] = 1
            array[j] = round(array[j])
        if flag.all() and pred_copy[i].any(): #array.any()
            pred_copy[i] = array
            num_idx += 1
    return pred_copy, num_idx

In [None]:
ss = pd.read_csv('../input/lish-moa/sample_submission.csv')
ss[cols] = 0.0877289129911273 * submission_elu_stack[cols].values + \
           0.026620985745766827 * submission_resnet_stack[cols].values + \
           0.04467573105638814 * submission_mlp_stack[cols].values + \
           0.3382592931952609 * submission_rtn[cols].values + \
           0.5027150770114568 * submission_tabnet[cols].values
ss.head(10)

In [None]:
if POST_PROCESS:
    low = 0.012
    high = 0.98

    ss[cols], num_idx = post_process(ss[cols].values, low, high)
    print(num_idx)

In [None]:
del data, data2, train2, test2, data_transformed, train_transformed, test_transformed
if CALCULATE_OOF:
    del tr_targets, overall_oof_score, overall_oof_scores, oof_preds, oof_preds_all, oof_targets, oof_targets_all, oof_score
    del oof_elu_stack, oof_resnet_stack, oof_mlp_stack, oof_elu, oof_resnet, oof_mlp, oof_rtn
rubbish = gc.collect()

# Pseudo Labelling

In [None]:
if FINETUNE:
    pseudo_targets = ss.loc[test['cp_type'] == 0, cols].values
    pseudo_train = test.loc[test['cp_type'] == 0, test.columns].values

In [None]:
if FINETUNE:
    import time
    if cfg.strategy == "KFOLD":
        oof_preds_all = []
        oof_targets_all = []
        scores_all =  []
        scores_auc_all= []
        preds_test = []
        res = np.zeros(targets_tr.shape)
        for nums, seed in enumerate(range(cfg.num_ensembling)):
            print("## SEED : ", seed)
            mskf = MultilabelStratifiedKFold(n_splits=cfg.SPLITS, random_state=cfg.seed+seed, shuffle=True)
            oof_preds = []
            oof_targets = []
            scores = []
            scores_auc = []
            p = []
            for j, (train_idx, val_idx) in enumerate(mskf.split(np.zeros(len(cat_tr)), targets_tr)):
                print("FOLDS : ", j)

                ## model
                x_tr, y_tr = np.concatenate([cat_tr[train_idx], numerical_tr[train_idx]], axis=1), targets_tr[train_idx]
                x_tr, y_tr = np.concatenate([x_tr, X_test[test['cp_type'] == 0, :]]), np.concatenate([y_tr, pseudo_targets])

                X_train, y_train = torch.as_tensor(x_tr), torch.as_tensor(y_tr)
                X_val, y_val = torch.as_tensor(np.concatenate([cat_tr[val_idx], numerical_tr[val_idx]], axis=1)), torch.as_tensor(targets_tr[val_idx])

                model = TabNetRegressor(n_d=24, n_a=24, n_steps=1, gamma=1.3, lambda_sparse=0, cat_dims=cfg.cat_dims, cat_emb_dim=cfg.cat_emb_dim, cat_idxs=cfg.cats_idx, optimizer_fn=torch.optim.Adam,
                                        optimizer_params=dict(lr=2e-2, weight_decay=1e-5), mask_type='entmax', device_name=cfg.device, scheduler_params=dict(milestones=[100, 150], gamma=0.9), 
                                        scheduler_fn=torch.optim.lr_scheduler.MultiStepLR)
                name = cfg.save_name + f"_fold{j}_{seed}"
                model.load_model(name)
                model.fit(X_train=X_train, y_train=y_train, X_valid=X_val, y_valid=y_val, max_epochs=200, patience=5, batch_size=1024, virtual_batch_size=128,
                          num_workers=0, drop_last=False, loss_fn=sbcewlogits, pretrain=True, optimizer_params=dict(lr=1e-4, weight_decay=1e-5))
                model.load_best_model()
                save_name = f"PL_tabnet_raw_step1_fold{j}_{seed}"
                model.save_model(save_name)
                
                # preds on test
                temp = model.predict(X_test)
                p.append(torch.sigmoid(torch.as_tensor(temp)).detach().cpu().numpy())
                
                if CALCULATE_OOF_PL:
                    preds = model.predict(X_val)
                    preds = torch.sigmoid(torch.as_tensor(preds)).detach().cpu().numpy()
                    score = log_loss_multi(y_val, preds)
                    res[val_idx] += preds / cfg.num_ensembling

                    ## save oof to compute the CV later
                    oof_preds.append(preds)
                    oof_targets.append(y_val)
                    scores.append(score)
                    scores_auc.append(auc_multi(y_val,preds))
                    print(f"validation fold {j} : {score}")

            p = np.stack(p)
            preds_test.append(p)
            if CALCULATE_OOF_PL:
                oof_preds_all.append(np.concatenate(oof_preds))
                oof_targets_all.append(np.concatenate(oof_targets))
                scores_all.append(np.array(scores))
                scores_auc_all.append(np.array(scores_auc))

        preds_test = np.stack(preds_test)

In [None]:
if FINETUNE:
    if CALCULATE_OOF_PL:

        if cfg.strategy == "KFOLD":

            for i in range(cfg.num_ensembling): 
                print("CV score fold : ", log_loss_multi(oof_targets_all[i], oof_preds_all[i]))
                print("auc mean : ", sum(scores_auc_all[i])/len(scores_auc_all[i]))

        # Overall OOF CV Score
        tr_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv').drop('sig_id', axis = 1)
        res_all = np.zeros(tr_targets[cols].shape)
        res_all[train_features['cp_type'] == 0] = res
        overall_oof_score = log_loss_metric(tr_targets[cols].values, res_all)
        print(f'TabNet Overall OOF CV Score:', overall_oof_score)
        oof_tabnet2 = res_all

In [None]:
if FINETUNE:

    submission_tabnet2 = pd.read_csv('../input/lish-moa/sample_submission.csv')
    submission_tabnet2[cols] = preds_test.mean(1).mean(0)
    submission_tabnet2.loc[test['cp_type'] == 1, cols] = 0

In [None]:
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from time import time

def train_model(X_train, Y_train_2, Y_nonscored, features, model_name, model_name_2, save_path, num_seeds, num_splits, 
                model_params, X_test = None, sample_sub_path = None, pseudo_labeling = True, verbose = 0):
    start_time_all = time()
    oof = Y_train_2.copy()
    oof.loc[:, Y_train_2.columns] = 0
    overall_score = []
    if X_test is not None:
        sub = pd.read_csv(sample_sub_path)
        sub.loc[:, Y_train_2.columns] = 0
    else:
        sub = None
    if 'RTN' in model_name_2:
        model = create_rtn(len(features), 206, **model_params)
    elif 'RESNET' in model_name_2:
        model = create_resnet(len(features), **model_params)
    elif 'ELU' in model_name_2:
        model = create_mlp_elu(len(features), **model_params)
    elif 'MLP' in model_name_2:
        model = create_mlp(len(features), 206, **model_params)
    for nums, seed in enumerate(range(num_seeds)):
        start_time_seed = time()
        tf.random.set_seed(seed)
        np.random.seed(seed)
        random.seed(seed) 
        mean_score = 0
        skf = MultilabelStratifiedKFold(n_splits = num_splits, random_state = seed, shuffle = True)
        for n, (tr, te) in enumerate(skf.split(Y_train_2, Y_train_2)):
            print(f'Model:{model_name}, Seed:{seed}, Fold:{n}', end = '\r')
            start_time_fold = time()
            x_tr, x_val = X_train.values[tr][:, features], X_train.values[te][:, features]
            y_tr, y_val = Y_train_2.values[tr], Y_train_2.values[te]
            if pseudo_labeling:
                x_tr = np.concatenate([x_tr, pseudo_train[:, features]])
                y_tr = np.concatenate([y_tr, pseudo_targets])

            if X_test is not None:
                x_tt = X_test.values[:, features]
            
            if 'ELU' in model_name_2:
                ckp_path = save_path + f'{model_name}_{seed}_{n}.hdf5' 
            else:
                ckp_path = save_path + f'{model_name}_Seed_{seed}_Fold_{n}.hdf5'
                
            model.load_weights(ckp_path)

            rlr = ReduceLROnPlateau(monitor = 'val_mean_loss', factor = 0.1, patience = 3, 
                                    verbose = verbose, min_delta = 1e-4, mode = 'min')
            ckp = ModelCheckpoint(f'{model_name}_Seed_{seed}_Fold_{n}.hdf5', monitor = 'val_mean_loss', verbose = 0, 
                                  save_best_only = True, save_weights_only = True, mode = 'min')
            es = EarlyStopping(monitor = 'val_mean_loss', min_delta = 1e-4, patience = 5, mode = 'min', 
                               baseline = None, restore_best_weights = True, verbose = verbose)
            history = model.fit(x_tr, y_tr, validation_data = (x_val, y_val), epochs = 1000, 
                                batch_size = 128, callbacks = [rlr, ckp, es], verbose = verbose)
            hist = pd.DataFrame(history.history)
            model.load_weights(f'{model_name}_Seed_{seed}_Fold_{n}.hdf5')
            
            if X_test is not None:
                test_predict = model.predict(x_tt, batch_size = 1024)
                sub.loc[:, Y_train_2.columns] += test_predict / (num_splits * num_seeds)
            
            if CALCULATE_OOF_PL:
                val_predict = model.predict(x_val, batch_size = 1024)
                fold_score = hist['val_mean_loss'].min()
#                 fold_score = log_loss_metric(y_val, val_predict)
                mean_score += fold_score / num_splits
                oof.loc[te, Y_train_2.columns] += val_predict / num_seeds
                print(f'[{str(datetime.timedelta(seconds = time() - start_time_fold))[0:7]}] {model_name} Seed {seed}, Fold {n}:', fold_score)
                  
#             del model
#             x = gc.collect()
#             K.clear_session()
            
        if CALCULATE_OOF_PL:
            print(f'[{str(datetime.timedelta(seconds = time() - start_time_seed))[0:7]}] {model_name} Seed {seed} Mean Score:', mean_score)
    
    if X_test is not None:
        sub.loc[X_test['cp_type'] == 1, Y_train_2.columns] = 0
    
    if CALCULATE_OOF_PL:
        oof.loc[X_train['cp_type'] == 1, Y_train_2.columns] = 0
        overall_score = log_loss_metric(Y_train_2.values, oof[Y_train_2.columns].values)
        print(f'[{str(datetime.timedelta(seconds = time() - start_time_all))[0:7]}] {model_name} OOF Score:', overall_score)
        
    return overall_score, oof, sub

In [None]:
model_names = ['ELU_0', 'ELU_1', 'ELU_2', 'RTN']

model_params = [{'hidden_units': [512, 512, 4096],  
                 'dropout_rates': 0.45,
                 'lr': 1e-4,
                }, 
                {'hidden_units': [512, 1024],  
                 'dropout_rates': 0.463,
                 'lr': 1e-4,
                },
                {'hidden_units': [1024, 1024],  
                 'dropout_rates': 0.5, 
                 'lr': 1e-4,
                },
                {'rethink_iter': 3,  
                 'num_layers': 1, 
                 'hidden_units': [128, 512],  
                 'dropout_rates': [0.3, 0.4], 
                 'lr': 1e-4,
                },]

In [None]:
if FINETUNE:
    Pseudo_Labeling = True
    VERBOSE = 0
    sample_sub_path = '../input/lish-moa/sample_submission.csv'

    overall_oof_scores = []
    oof_elu2 = []
    submission_elu2 = []
    for m in range(len(model_params)):
        print(model_names[m], model_params[m])        
        if 'ELU' in model_names[m]:
            model_idx = model_names[m].split('_')[1]
            save_path = '../input/multilabel-v2/'
            model_name = f'Model_{model_idx}'
        elif 'RTN' in model_names[m]:
            save_path = '../input/multilabel-rtn-single/'
            model_name = f'Model0' 
        
        oof_score, res, ss = train_model(train, train_targets, train_targets_nonscored, top_feats, model_name, model_names[m], save_path, 
                                         N_STARTS, N_SPLITS, model_params[m], test, sample_sub_path, Pseudo_Labeling, VERBOSE)        
        if 'ELU' in model_names[m]:
            oof_elu2.append(res)
            submission_elu2.append(ss)
        elif 'RTN' in model_names[m]:
            oof_rtn2 = res
            submission_rtn2 = ss
        
        if CALCULATE_OOF_PL:
            # Overall OOF CV Score
            tr_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv').drop('sig_id', axis = 1)
            res_all = np.zeros(tr_targets[cols].shape)
            res_all[train_features['cp_type'] == 0] = res[cols].values
            overall_oof_score = log_loss_metric(tr_targets[cols].values, res_all)
            overall_oof_scores.append(overall_oof_score)
            print(f'{model_name} Overall OOF CV Score:', overall_oof_score)

In [None]:
if FINETUNE:
    if CALCULATE_OOF_PL:
        print(f'{model_names} OOF:\t', overall_oof_scores)

In [None]:
def train_ensemble(X_train, Y_train_2, model_name, model_name_2, save_path, num_seeds, num_splits, 
                   model_params, X_test = None, sample_sub_path = None, verbose = 0):
    start_time_all = time()
    oof = Y_train_2.copy()
    oof.loc[:, Y_train_2.columns] = 0
    overall_score = []
    if X_test is not None:
        sub = pd.read_csv(sample_sub_path)
        sub.loc[:, Y_train_2.columns] = 0
    else:
        sub = None
    if 'RTN' in model_name_2:
        model = create_rtn(X_train.shape[1], 206, **model_params)
    elif 'RESNET' in model_name_2:
        model = create_resnet(X_train.shape[1], **model_params)
    elif 'ELU' in model_name_2:
        model = create_mlp_elu(X_train.shape[1], **model_params)
    elif 'MLP' in model_name_2:
        model = create_mlp(X_train.shape[1], 206, **model_params)
    for seed in range(num_seeds):
        start_time_seed = time()
        tf.random.set_seed(seed)
        np.random.seed(seed)
        random.seed(seed) 
        mean_score = 0
        skf = MultilabelStratifiedKFold(n_splits = num_splits, random_state = seed, shuffle = True)
        for n, (tr, te) in enumerate(skf.split(Y_train_2, Y_train_2)):
            print(f'Model:{model_name}, Seed:{seed}, Fold:{n}', end = '\r')
            start_time_fold = time()
            x_tr, x_val = X_train[tr], X_train[te]
            y_tr, y_val = Y_train_2.values[tr], Y_train_2.values[te]
                
            if 'ELU' in model_name_2:
                ckp_path = save_path + f'{model_name}_{seed}_{n}.hdf5' 
            else:
                ckp_path = save_path + f'{model_name}_Seed_{seed}_Fold_{n}.hdf5'
            model.load_weights(ckp_path)
            
            rlr = ReduceLROnPlateau(monitor = 'val_mean_loss', factor = 0.1, patience = 3, 
                                    verbose = verbose, min_delta = 1e-4, mode = 'min')
            ckp = ModelCheckpoint(f'{model_name}_Seed_{seed}_Fold_{n}.hdf5', monitor = 'val_mean_loss', verbose = 0, 
                                  save_best_only = True, save_weights_only = True, mode = 'min')
            es = EarlyStopping(monitor = 'val_mean_loss', min_delta = 1e-4, patience = 5, mode = 'min', 
                               baseline = None, restore_best_weights = True, verbose = verbose)
            history = model.fit(x_tr, y_tr, validation_data = (x_val, y_val), epochs = 1000, 
                                batch_size = 128, callbacks = [rlr, ckp, es], verbose = verbose)
            hist = pd.DataFrame(history.history)
            model.load_weights(f'{model_name}_Seed_{seed}_Fold_{n}.hdf5')
            
            if X_test is not None:
                x_tt = X_test
                test_predict = model.predict(x_tt, batch_size = 1024)
                sub.loc[:, Y_train_2.columns] += test_predict / (num_splits * num_seeds)
            
            if CALCULATE_OOF_PL:
                val_predict = model.predict(x_val, batch_size = 1024)
                fold_score = hist['val_mean_loss'].min()
#                 fold_score = log_loss_metric(y_val, val_predict)
                mean_score += fold_score / num_splits
                oof.loc[te, Y_train_2.columns] += val_predict / num_seeds
                print(f'[{str(datetime.timedelta(seconds = time() - start_time_fold))[0:7]}] {model_name} Seed {seed}, Fold {n}:', fold_score)
            
#             del model
#             x = gc.collect()
#             K.clear_session()

        if CALCULATE_OOF_PL:
            print(f'[{str(datetime.timedelta(seconds = time() - start_time_seed))[0:7]}] {model_name} Seed {seed} Mean Score:', mean_score)
    
    if X_test is not None:
        sub.loc[test['cp_type'] == 1, Y_train_2.columns] = 0
    
    if CALCULATE_OOF_PL:
        oof.loc[train['cp_type'] == 1, Y_train_2.columns] = 0
        overall_score = log_loss_metric(Y_train_2.values, oof[Y_train_2.columns].values)
        print(f'[{str(datetime.timedelta(seconds = time() - start_time_all))[0:7]}] {model_name} OOF Score:', overall_score)
        
    return overall_score, oof, sub

In [None]:
if FINETUNE:
    if CALCULATE_OOF_PL:
        train_new_elu2 = np.concatenate([oof0[cols].values for oof0 in oof_elu2], axis = 1)
    else:
        train_new_elu2 = np.zeros((train.shape[0], len(cols) * 3))

    test_new_elu2 = np.concatenate([sub[cols].values for sub in submission_elu2], axis = 1)

In [None]:
model_names = ['ELU_stack']

model_params = [{'hidden_units': [1024, 1024],  
                 'dropout_rates': 0.336, 
                 'lr': 1e-4,
                },]

In [None]:
if FINETUNE:
    overall_oof_stack_scores = []
    for m in range(len(model_params)):
        print(model_names[m], model_params[m])
        if 'ELU' in model_names[m]:
            save_path = '../input/multilabel-v2/'
            model_name = f'EModel'
            oof_score, res, ss = train_ensemble(train_new_elu2, train_targets, model_name, model_names[m], save_path, 
                                                N_STARTS, N_SPLITS, model_params[m], test_new_elu2, sample_sub_path, VERBOSE)
            oof_elu_stack2 = res.copy()
            submission_elu_stack2 = ss.copy()

        if CALCULATE_OOF_PL:
            # Overall OOF CV Score
            tr_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv').drop('sig_id', axis = 1)
            res_all = np.zeros(tr_targets[cols].shape)
            res_all[train_features['cp_type'] == 0] = res[cols].values
            overall_oof_score = log_loss_metric(tr_targets[cols].values, res_all)
            overall_oof_stack_scores.append(overall_oof_score)
            print(f'{model_names[m]} Overall OOF CV Score:', overall_oof_score)

In [None]:
if CALCULATE_OOF_PL:
    for n, name in enumerate(model_names):
        print(f'{name} OOF:\t', overall_oof_stack_scores[n])

# Final Ensemble

In [None]:
if FINETUNE:
    ss = pd.read_csv('../input/lish-moa/sample_submission.csv')
    ss[cols] = 0.0877289129911273 * submission_elu_stack2[cols].values + \
               0.026620985745766827 * submission_resnet_stack[cols].values + \
               0.04467573105638814 * submission_mlp_stack[cols].values + \
               0.3382592931952609 * submission_rtn2[cols].values + \
               0.5027150770114568 * submission_tabnet2[cols].values

In [None]:
if FINETUNE:
    if POST_PROCESS:
        low = 0.012
        high = 0.98

        ss[cols], num_idx = post_process(ss[cols].values, low, high)
        print(num_idx)

In [None]:
ss.to_csv('submission.csv', index = False)
ss.head(10)