In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
import tensorflow as tf
from tensorflow.keras import layers,regularizers,Sequential,backend,callbacks,optimizers,metrics,Model,losses
import sys
import json
import gc
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from scipy.optimize import dual_annealing, minimize

from tensorflow import keras


In [None]:
# TabNet
!pip install --no-index --find-links /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl pytorch-tabnet
# Iterative Stratification
!pip install /kaggle/input/iterative-stratification/iterative-stratification-master/

### General ###
import os
import copy
import tqdm
import pickle
import random
import warnings
warnings.filterwarnings("ignore")
os.environ["CUDA_LAUNCH_BLOCKING"] = '1'

### Data Wrangling ###
from scipy import stats

### Data Visualization ###
import seaborn as sns
plt.style.use("fivethirtyeight")

### Machine Learning ###
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_selection import VarianceThreshold

### Deep Learning ###
import torch
from torch import nn
import torch.optim as optim
from torch.nn import functional as F
from torch.nn.modules.loss import _WeightedLoss
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
# Tabnet 
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

### Make prettier the prints ###
from colorama import Fore
c_ = Fore.CYAN
m_ = Fore.MAGENTA
r_ = Fore.RED
b_ = Fore.BLUE
y_ = Fore.YELLOW
g_ = Fore.GREEN


In [None]:
seed = 42

def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU), exports
    ``PYTHONHASHSEED``, and - when CUDA is available - seeds the CUDA
    generators and switches cuDNN to deterministic, non-benchmark mode.

    Parameters
    ----------
    seed : int
        Value passed to every seeding call.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Nothing more to configure on a CPU-only machine.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
set_seed(seed)

# Define Preprocessing Functions for each notebook

In [None]:
# Preprocessing for TabNet
def preprocessor_TabNet():
    """Load the CSVs under ``../input/lish-moa/`` and build the TabNet feature set.

    Steps, in order: drop control-vehicle rows, variance-threshold the numeric
    columns, rank-Gauss transform the gene/cell columns, append PCA components,
    one-hot encode ``cp_time``/``cp_dose``, add row-wise statistics, add
    hand-picked cell products and squared features, then merge the scored
    targets and ``drug_id`` into the training frame.

    All transformers (quantile transformer, PCA) are fitted on the training
    frame only and then applied to the test frame.

    Returns
    -------
    tuple
        ``(train, targets, test, submission, target_cols, feature_cols)`` where
        ``train`` is the feature frame merged with the targets and
        ``train_drug``, ``targets`` is the (control-filtered) scored-target
        frame, ``test`` is the test feature frame without ``sig_id``/``cp_type``,
        ``submission`` is the unmodified sample submission, ``target_cols`` are
        the target column names and ``feature_cols`` the model input columns
        of ``train``.
    """
    # Preprocessing switches; every one is hard-coded, so only the branches
    # selected below are ever executed.
    data_path = "../input/lish-moa/"
    no_ctl = True
    scale = "rankgauss"
    variance_threshould = 0.7
    decompo = "PCA"
    ncompo_genes = 80
    ncompo_cells = 10
    encoding = "dummy"
    
    train = pd.read_csv(data_path + "train_features.csv")
    targets = pd.read_csv(data_path + "train_targets_scored.csv")
    test = pd.read_csv(data_path + "test_features.csv")
    train_drug = pd.read_csv(data_path + "train_drug.csv")
    submission = pd.read_csv(data_path + "sample_submission.csv")

    if no_ctl:
        # Drop rows whose cp_type is ctl_vehicle from both train and test.
        print(b_, "not_ctl")
        train = train[train["cp_type"] != "ctl_vehicle"]
        test = test[test["cp_type"] != "ctl_vehicle"]
        # train.index still holds the original row labels here (reset happens
        # below), so it is used positionally to select the matching target rows.
        # NOTE(review): this assumes train_features and train_targets_scored
        # share the same row order - confirm.
        targets = targets.iloc[train.index]
        train.reset_index(drop = True, inplace = True)
        test.reset_index(drop = True, inplace = True)
        targets.reset_index(drop = True, inplace = True)

    # Variance threshold: keep only numeric columns whose training variance is
    # at least `variance_threshould`, and keep the same columns in test.
    cols_numeric = [feat for feat in list(train.columns) if feat not in ["sig_id", "cp_type", "cp_time", "cp_dose"]]
    mask = (train[cols_numeric].var() >= variance_threshould).values
    tmp = train[cols_numeric].loc[:, mask]
    train = pd.concat([train[["sig_id", "cp_type", "cp_time", "cp_dose"]], tmp], axis = 1)
    cols_numeric = [feat for feat in list(train.columns) if feat not in ["sig_id", "cp_type", "cp_time", "cp_dose"]]
    test = pd.concat([test[["sig_id", "cp_type", "cp_time", "cp_dose"]], test.loc[:,cols_numeric]], axis = 1)

    # Surviving gene ("g-") and cell ("c-") columns.
    GENES = [col for col in train.columns if col.startswith("g-")]
    CELLS = [col for col in train.columns if col.startswith("c-")]

    if scale == "rankgauss":
        ### Rank Gauss ###
        # One QuantileTransformer per column, fitted on train, applied to both frames.
        print(b_, "Rank Gauss")
        for col in (GENES + CELLS):
            transformer = QuantileTransformer(n_quantiles=100,random_state=0, output_distribution="normal")   # from optimal commit 9
            vec_len = len(train[col].values)
            vec_len_test = len(test[col].values)
            raw_vec = train[col].values.reshape(vec_len, 1)
            transformer.fit(raw_vec)
        
            train[col] = transformer.transform(raw_vec).reshape(1, vec_len)[0]
            test[col] = transformer.transform(test[col].values.reshape(vec_len_test, 1)).reshape(1, vec_len_test)[0]
    else:
        pass

    if decompo == "PCA":
        print(b_, "PCA")
        # Recomputed before the PCA columns are appended; "pca_g-"/"pca_c-"
        # names do not start with "g-"/"c-", so GENES/CELLS stay raw-only below.
        GENES = [col for col in train.columns if col.startswith("g-")]
        CELLS = [col for col in train.columns if col.startswith("c-")]
    
        # Separate PCAs for genes and cells, seeded from the module-level `seed`.
        pca_genes = PCA(n_components = ncompo_genes, random_state = seed)
        pca_genes_train = pca_genes.fit_transform(train[GENES])
    
    
    
        pca_cells = PCA(n_components = ncompo_cells, random_state = seed)
        pca_cells_train = pca_cells.fit_transform(train[CELLS])
    
        pca_genes_train = pd.DataFrame(pca_genes_train, columns = [f"pca_g-{i}" for i in range(ncompo_genes)])
        pca_cells_train = pd.DataFrame(pca_cells_train, columns = [f"pca_c-{i}" for i in range(ncompo_cells)])
        train = pd.concat([train, pca_genes_train, pca_cells_train], axis = 1)
    
        # Test is only transformed with the PCAs fitted on train.
        pca_genes_test = pca_genes.transform(test[GENES])
        pca_cells_test = pca_cells.transform(test[CELLS])
    
        pca_genes_test = pd.DataFrame(pca_genes_test, columns = [f"pca_g-{i}" for i in range(ncompo_genes)])
        pca_cells_test = pd.DataFrame(pca_cells_test, columns = [f"pca_c-{i}" for i in range(ncompo_cells)])
        test = pd.concat([test, pca_genes_test, pca_cells_test], axis = 1)
    
    else:
        pass

    if encoding == "dummy":
        # One-hot encode the two categorical columns, independently per frame.
        # NOTE(review): train and test only end up with the same dummy columns
        # if both contain the same cp_time/cp_dose categories - confirm.
        print(b_, "One-Hot")
        train = pd.get_dummies(train, columns = ["cp_time", "cp_dose"])
        test = pd.get_dummies(test, columns = ["cp_time", "cp_dose"])
    else:
        pass
# GENES = [col for col in train.columns if col.startswith("g-")]
# CELLS = [col for col in train.columns if col.startswith("c-")]

    # Row-wise statistics over genes, cells and both together. The loop
    # variable `stats` is local to this function and hides the module-level
    # `from scipy import stats` import inside it.
    for stats in tqdm.tqdm(["sum", "mean", "std", "kurt", "skew"]):
        train["g_" + stats] = getattr(train[GENES], stats)(axis = 1)
        train["c_" + stats] = getattr(train[CELLS], stats)(axis = 1)    
        train["gc_" + stats] = getattr(train[GENES + CELLS], stats)(axis = 1)
    
        test["g_" + stats] = getattr(test[GENES], stats)(axis = 1)
        test["c_" + stats] = getattr(test[CELLS], stats)(axis = 1)    
        test["gc_" + stats] = getattr(test[GENES + CELLS], stats)(axis = 1)
    
    # Gene columns that get a squared feature (only those that survived the
    # variance threshold, see the `if feature in GENES` check below).
    gsquarecols=['g-574','g-211','g-216','g-0','g-255','g-577',
             'g-153','g-389','g-60','g-370','g-248','g-167',
             'g-203','g-177','g-301','g-332','g-517','g-6',
             'g-744','g-224','g-162','g-3','g-736','g-486',
             'g-283','g-22','g-359','g-361','g-440','g-335',
             'g-106','g-307','g-745','g-146','g-416','g-298',
             'g-666','g-91','g-17','g-549','g-145','g-157','g-768','g-568','g-396']

    # Hand-picked pairwise products of cell columns, added in place to both frames.
    # These index raw "c-*" columns directly, so a KeyError is raised if any of
    # them was removed by the variance threshold.
    for df in [train, test]:
        df['c52_c42'] = df['c-52'] * df['c-42']
        df['c13_c73'] = df['c-13'] * df['c-73']
        # NOTE(review): the column is named c26_c13 but is computed from c-23
        # and c-13 - confirm which was intended before changing anything,
        # since downstream saved models presumably expect this exact feature.
        df['c26_c13'] = df['c-23'] * df['c-13']
        df['c33_c6'] = df['c-33'] * df['c-6']
        df['c11_c55'] = df['c-11'] * df['c-55']
        df['c38_c63'] = df['c-38'] * df['c-63']
        df['c38_c94'] = df['c-38'] * df['c-94']
        df['c13_c94'] = df['c-13'] * df['c-94']
        df['c4_c52'] = df['c-4'] * df['c-52']
        df['c4_c42'] = df['c-4'] * df['c-42']
        df['c13_c38'] = df['c-13'] * df['c-38']
        df['c55_c2'] = df['c-55'] * df['c-2']
        df['c55_c4'] = df['c-55'] * df['c-4']
        df['c4_c13'] = df['c-4'] * df['c-13']
        df['c82_c42'] = df['c-82'] * df['c-42']
        df['c66_c42'] = df['c-66'] * df['c-42']
        df['c6_c38'] = df['c-6'] * df['c-38']
        df['c2_c13'] = df['c-2'] * df['c-13']
        df['c62_c42'] = df['c-62'] * df['c-42']
        df['c90_c55'] = df['c-90'] * df['c-55']
            
        # Squared features: selected genes (if still present) and every cell column.
        for feature in gsquarecols:
            if feature in GENES:
                df[f'{feature}_squared'] = df[feature] ** 2  
        for feature in CELLS:
            df[f'{feature}_squared'] = df[feature] ** 2  

    # Attach the scored targets and the drug id to each training row.
    train = train.merge(targets, on='sig_id')
    train = train.merge(train_drug, on='sig_id')
    target_cols = [x for x in targets.columns if x != 'sig_id']
    
    # `train_df` is only an alias of `train`; the reset below affects both names.
    train_df = train
    train_df.reset_index(drop=True, inplace=True)
    test.reset_index(drop = True, inplace = True)
    features_to_drop = ["sig_id", "cp_type"]

    # Only the test frame loses the identifier columns; train keeps them and
    # they are excluded through `feature_cols` instead.
    test.drop(features_to_drop, axis = 1, inplace = True)
    feature_cols = [c for c in train_df.columns if c not in targets.columns]
    feature_cols = [c for c in feature_cols if c not in [ 'sig_id', 'drug_id', 'cp_type']]

    return train, targets, test, submission, target_cols, feature_cols 

In [None]:
# Build the TabNet inputs. Note that `target_cols` and `feature_cols` are
# module-level names that later cells of this notebook reassign.
train_df, targets_TabNet, test_TabNet, submission_TabNet, target_cols, feature_cols = preprocessor_TabNet()

In [None]:
# pd.set_option('display.max_columns', None)
# print(train_df.shape)
# display(train_df.head())

In [None]:
# pd.set_option('display.max_columns', None)
# display(targets_TabNet.head(3))
# display(test_TabNet.head(3))
# print(len(target_cols))
# print(len(feature_cols))

In [None]:
from sklearn.model_selection import KFold

def make_cv_folds(train, SEEDS, NFOLDS, DRUG_THRESH):
    """Add one drug-aware fold column per seed to ``train``.

    Drugs that occur at most ``DRUG_THRESH`` times are assigned to a fold as a
    whole (stratified on their mean target vector); rows of more frequent drugs
    are stratified individually. Reads the module-level ``target_cols``.

    Parameters
    ----------
    train : pd.DataFrame
        Must contain ``drug_id``, ``sig_id`` and the ``target_cols`` columns.
        It is modified in place.
    SEEDS : iterable of int
        One ``kfold_<seed>`` column is written per element; the element is also
        the splitter's ``random_state``.
    NFOLDS : int
        Number of folds.
    DRUG_THRESH : int
        Row-count boundary between "rare" and "frequent" drugs.

    Returns
    -------
    pd.DataFrame
        The same ``train`` object with the new ``int8`` fold columns.
    """
    drug_counts = train.drug_id.value_counts()
    rare_drugs = drug_counts.loc[drug_counts <= DRUG_THRESH].index.sort_values()
    frequent_drugs = drug_counts.loc[drug_counts > DRUG_THRESH].index.sort_values()

    for seed_id in SEEDS:
        kfold_col = 'kfold_{}'.format(seed_id)

        # Rare drugs: stratify at drug level so a drug never spans two folds.
        drug_to_fold = {}
        splitter = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed_id)
        drug_means = train.groupby('drug_id')[target_cols].mean().loc[rare_drugs]
        for fold, (_, val_pos) in enumerate(splitter.split(drug_means, drug_means[target_cols])):
            for drug in drug_means.index[val_pos].values:
                drug_to_fold[drug] = fold

        # Frequent drugs: stratify at row level, keyed by sig_id.
        sig_to_fold = {}
        splitter = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed_id)
        frequent_rows = train.loc[train.drug_id.isin(frequent_drugs)].reset_index(drop=True)
        for fold, (_, val_pos) in enumerate(splitter.split(frequent_rows, frequent_rows[target_cols])):
            for sig in frequent_rows.sig_id[val_pos].values:
                sig_to_fold[sig] = fold

        # Drug-level assignment first; rows still unassigned fall back to sig_id.
        train[kfold_col] = train.drug_id.map(drug_to_fold)
        unassigned = train[kfold_col].isna()
        train.loc[unassigned, kfold_col] = train.loc[unassigned, 'sig_id'].map(sig_to_fold)
        train[kfold_col] = train[kfold_col].astype('int8')

    return train

# Fold configuration for the TabNet stage.
SEEDS = [x for x in range(42,49)]  # seeds 42..48; each yields a `kfold_<seed>` column
NFOLDS = 10
DRUG_THRESH = 18  # drugs with at most this many rows are kept whole within one fold

# Adds the fold columns to train_df in place and returns the same frame.
train_df = make_cv_folds(train_df, SEEDS, NFOLDS, DRUG_THRESH)

In [None]:
# display(train_df.head(3))
# print(train_df.shape)

# TabNet Inference
train_TabNet, targets_TabNet, test_TabNet, submission_TabNet


In [None]:
# TabNet inference
import json
import shutil
import zipfile
import io

# Test matrix fed to every loaded TabNet model; uses all columns of
# test_TabNet in their current order.
# NOTE(review): assumes this column order matches the one the saved models
# were trained with - confirm.
X_test = test_TabNet.values


# Not referenced anywhere in the visible part of this notebook.
MAX_EPOCH = 200
# In the visible code only the 'seed' entry of this dict is updated and
# printed inside the inference loop; the models themselves are rebuilt from
# the `model_params.json` stored with each checkpoint.
tabnet_params = dict(
    n_d = 32,
    n_a = 32,
    n_steps = 1,
    gamma = 1.3,
    lambda_sparse = 0,
    optimizer_fn = optim.Adam,
    optimizer_params = dict(lr = 2e-2, weight_decay = 1e-5),
    mask_type = "entmax",
    scheduler_params = dict(
        mode = "min", patience = 5, min_lr = 1e-5, factor = 0.9),
    scheduler_fn = ReduceLROnPlateau,
    seed = seed,
    verbose = 10
)

class LogitsLogLoss(Metric):
    """Mean binary log-loss for predictions given as raw logits.

    A sigmoid is applied to the predictions before the loss is computed.
    """

    def __init__(self):
        self._maximize = False  # lower is better
        self._name = "logits_ll"

    def __call__(self, y_true, y_pred):
        """Compute the mean log-loss of ``y_pred`` against ``y_true``.

        Parameters
        ----------
        y_true: np.ndarray
            Target matrix or vector
        y_pred: np.ndarray
            Raw (pre-sigmoid) score matrix or vector

        Returns
        -------
            float
            Mean log-loss over all elements.
        """
        eps = 1e-15  # keeps log() finite when a probability hits 0 or 1
        probs = 1 / (1 + np.exp(-y_pred))
        negative_term = (1 - y_true) * np.log(1 - probs + eps)
        positive_term = y_true * np.log(probs + eps)
        log_likelihood = negative_term + positive_term
        return np.mean(-log_likelihood)
    
# Run every pretrained TabNet checkpoint (one per seed/fold pair) on X_test and
# collect the sigmoid-activated predictions. No training or validation happens
# here; out-of-fold predictions are loaded from disk after this loop.
test_cv_preds = []
device = ('cuda' if torch.cuda.is_available() else 'cpu')  # loop-invariant
saved_file = '/kaggle/working/model'  # archive base name, overwritten on every iteration
for s in SEEDS:
    tabnet_params['seed'] = s
    for fold_id in range(NFOLDS):
        print(b_,"FOLDS: ", r_, fold_id + 1, y_, 'seed:', tabnet_params['seed'])
        print(g_, '*' * 60, c_)

        saved_path_name = '../input/tabnetfe7seeds424810folds/TabNet_seed_'\
        +str(s)+'_fold_'+str(fold_id+1)

        # The checkpoint is stored as an unpacked directory; re-zip it into the
        # working directory so it can be read like a regular TabNet archive.
        archive_path = shutil.make_archive(saved_file,'zip',saved_path_name)
        print(f'Successfully saved model at {saved_file}.zip')

        # Open the archive through the path make_archive returned instead of a
        # cwd-relative 'model.zip', so this works from any working directory.
        with zipfile.ZipFile(archive_path) as z:
            with z.open('model_params.json') as f:
                loaded_params = json.load(f)
            with z.open('network.pt') as f:
                try:
                    saved_state_dict = torch.load(f, map_location=device)
                except io.UnsupportedOperation:
                    # Zip members may not be seekable; fall back to an
                    # in-memory buffer (the class is io.BytesIO).
                    saved_state_dict = torch.load(io.BytesIO(f.read()), map_location=device)

        # Rebuild the network from the stored constructor parameters, then
        # load the trained weights into it.
        loaded_model = TabNetRegressor(**loaded_params)
        loaded_model._set_network()
        loaded_model.network.load_state_dict(saved_state_dict)
        loaded_model.network.eval()

        ### Predict on test ###
        # The model outputs logits; apply the sigmoid to get probabilities.
        preds_test = loaded_model.predict(X_test)
        test_cv_preds.append(1 / (1 + np.exp(-preds_test)))

# Shape: (len(SEEDS) * NFOLDS, n_test_rows, n_targets).
test_preds_all = np.stack(test_cv_preds)

# Out-of-fold predictions are not recomputed in this notebook; they are loaded
# from the same dataset that holds the saved TabNet checkpoints.
saved_path_name = '../input/tabnetfe7seeds424810folds/'
oof_TabNet_all = np.load(saved_path_name + 'oof_TabNet_all.npy')

In [None]:
# Build the TabNet submission: average the per-model test predictions and put
# them back onto the full list of test sig_ids.
data_path = "../input/lish-moa/"
all_feat = [col for col in submission_TabNet.columns if col not in ["sig_id"]]

# Predictions exist only for non-control rows, so re-read the raw test file
# (this rebinds `test_TabNet` to the unprocessed frame) to recover the sig_ids
# of exactly those rows, in the same order as the rows that were predicted.
test_TabNet = pd.read_csv(data_path + "test_features.csv")
sig_id = test_TabNet[test_TabNet["cp_type"] != "ctl_vehicle"].sig_id.reset_index(drop = True)
# Mean over axis 0 of the stacked predictions, i.e. over all seed/fold models.
tmp = pd.DataFrame(test_preds_all.mean(axis = 0), columns = all_feat)
tmp["sig_id"] = sig_id

# Left-merge onto every test sig_id; rows without a prediction (the control
# rows filtered out above) become NaN and are then set to 0.
submission_TabNet = pd.merge(test_TabNet[["sig_id"]], tmp, on = "sig_id", how = "left")
submission_TabNet.fillna(0, inplace = True)

#submission[all_feat] = tmp.mean(axis = 0)

# Set control to 0
#submission.loc[test["cp_type"] == 0, submission.columns[1:]] = 0
submission_TabNet.to_csv("submission_TabNet.csv", index = None)
submission_TabNet.head()

In [None]:
print(f"{b_}submission_TabNet.shape: {r_}{submission_TabNet.shape}")

In [None]:
import gc
# Drop the TabNet-stage frames that are no longer needed and trigger a
# garbage-collection pass before the next model's data is loaded.
del targets_TabNet, test_TabNet, tmp, train_df
gc.collect()

In [None]:
seed = 42

def seed_everything(seed):
    """Make the Python, NumPy and PyTorch random generators reproducible.

    Also exports ``PYTHONHASHSEED`` and, if a CUDA device is present, seeds the
    CUDA generators and forces deterministic cuDNN behaviour.

    Parameters
    ----------
    seed : int
        Seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

    cuda_present = torch.cuda.is_available()
    if cuda_present:
        for cuda_seeder in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seeder(seed)
        # Trade cuDNN autotuning for run-to-run determinism.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
seed_everything(seed)

def preprocessor_nn_transfer():
    """Load the CSVs under ``../input/lish-moa/`` and build the inputs of the
    transfer-learning network.

    Steps, in order: drop control-vehicle rows and the ``cp_type`` column,
    rank-Gauss transform gene/cell columns, append PCA components,
    variance-threshold all numeric columns (PCA components included), one-hot
    encode ``cp_time``/``cp_dose``, then merge scored targets, non-scored
    targets and ``drug_id`` into the training frame.

    Transformers are fitted on the training frame only and applied to test.

    Returns
    -------
    tuple
        ``(train, test, num_targets, num_aux_targets, num_all_targets,
        target_cols, aux_target_cols, all_target_cols)`` where ``train`` holds
        features plus all targets and ``drug_id``, ``test`` holds ``sig_id``
        plus features, and the remaining items are the names/counts of the
        scored, non-scored and combined target columns.
    """
    variance_threshould = 0.8
    ncompo_genes = 600
    ncompo_cells = 50
    
    data_dir = '../input/lish-moa/'
    train_features = pd.read_csv(data_dir + 'train_features.csv')
    train_targets_scored = pd.read_csv(data_dir + 'train_targets_scored.csv')
    train_targets_nonscored = pd.read_csv(data_dir + 'train_targets_nonscored.csv')
    train_drug = pd.read_csv(data_dir + 'train_drug.csv')
    test_features = pd.read_csv(data_dir + 'test_features.csv')
    # Loaded but not used or returned by this function.
    sample_submission = pd.read_csv(data_dir + 'sample_submission.csv')
    
    # Remove control-vehicle rows; targets are re-attached later by sig_id merge,
    # so they need no explicit filtering here.
    train_features = train_features[train_features['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)
    test_features = test_features[test_features['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)
    # drop cp_type
    train_features = train_features.drop('cp_type', axis=1)
    test_features = test_features.drop('cp_type', axis=1)

    
    GENES = [col for col in train_features.columns if col.startswith('g-')]
    CELLS = [col for col in train_features.columns if col.startswith('c-')]
    
    # Rank Gauss
    # One QuantileTransformer per column, fitted on train, applied to both frames.
    for col in (GENES + CELLS):
        transformer = QuantileTransformer(n_quantiles=100,random_state=0, output_distribution="normal")
        vec_len = len(train_features[col].values)
        vec_len_test = len(test_features[col].values)
        raw_vec = train_features[col].values.reshape(vec_len, 1)
        transformer.fit(raw_vec)

        train_features[col] = transformer.transform(raw_vec).reshape(1, vec_len)[0]
        test_features[col] = transformer.transform(test_features[col].values.reshape(vec_len_test, 1)).reshape(1, vec_len_test)[0]
    
#     print('Rank Gauss')
#     print('train_features: {}'.format(train_features.shape))
#     print('test_features: {}'.format(test_features.shape))
    
    # PCA
    # `seed` below is the module-level variable, not a parameter of this function.
    # GENES
    pca_genes = PCA(n_components = ncompo_genes, random_state = seed)
    pca_genes_train = pca_genes.fit_transform(train_features[GENES])
    # CELLS
    pca_cells = PCA(n_components = ncompo_cells, random_state = seed)
    pca_cells_train = pca_cells.fit_transform(train_features[CELLS])
    #train
    pca_genes_train = pd.DataFrame(pca_genes_train, columns = [f"pca_g-{i}" for i in range(ncompo_genes)])
    pca_cells_train = pd.DataFrame(pca_cells_train, columns = [f"pca_c-{i}" for i in range(ncompo_cells)])
    train_features = pd.concat([train_features, pca_genes_train, pca_cells_train], axis = 1)
    #test
    pca_genes_test = pca_genes.transform(test_features[GENES])
    pca_cells_test = pca_cells.transform(test_features[CELLS])
    
    pca_genes_test = pd.DataFrame(pca_genes_test, columns = [f"pca_g-{i}" for i in range(ncompo_genes)])
    pca_cells_test = pd.DataFrame(pca_cells_test, columns = [f"pca_c-{i}" for i in range(ncompo_cells)])
    test_features = pd.concat([test_features, pca_genes_test, pca_cells_test], axis = 1)

#     print('\n\nPCA')
#     print('train_features: {}'.format(train_features.shape))
#     print('test_features: {}'.format(test_features.shape))
    
    # feature selection
    # Variance threshold over every numeric column present at this point, i.e.
    # raw gene/cell columns and the PCA components; test keeps the same columns.
    cols_numeric = [feat for feat in list(train_features.columns) if feat not in ["sig_id", "cp_time", "cp_dose"]]
    mask = (train_features[cols_numeric].var() >= variance_threshould).values
    tmp = train_features[cols_numeric].loc[:, mask]
    train_features = pd.concat([train_features[["sig_id", "cp_time", "cp_dose"]], tmp], axis = 1)
    cols_numeric = [feat for feat in list(train_features.columns) if feat not in ["sig_id", "cp_time", "cp_dose"]]
    test_features = pd.concat([test_features[["sig_id", "cp_time", "cp_dose"]], test_features.loc[:,cols_numeric]], axis = 1)
    
    # one hot
    # NOTE(review): encoded independently per frame; the resulting columns only
    # match if train and test contain the same categories - confirm.
    train_features = pd.get_dummies(train_features, columns = ['cp_time', 'cp_dose'])
    test_features = pd.get_dummies(test_features, columns = ['cp_time', 'cp_dose'])

#     print('\n\nFeature selection')
#     print('train_features: {}'.format(train_features.shape))
#     print('test_features: {}'.format(test_features.shape))
    
    # Join
    # Default (inner) merges on sig_id: only rows present in every frame remain.
    train = train_features.merge(train_targets_scored, on='sig_id')
    train = train.merge(train_targets_nonscored, on='sig_id')
    train = train.merge(train_drug, on='sig_id')
    test = test_features

#     print('\n\nJoin')
#     print('train: {}'.format(train.shape))
#     print('test: {}'.format(test.shape))
    
    
    # Scored targets first, then the auxiliary (non-scored) ones.
    target_cols = [x for x in train_targets_scored.columns if x != 'sig_id']
    aux_target_cols = [x for x in train_targets_nonscored.columns if x != 'sig_id']
    all_target_cols = target_cols + aux_target_cols

    num_targets = len(target_cols)
    num_aux_targets = len(aux_target_cols)
    num_all_targets = len(all_target_cols)

#     print('\n\nnum_targets: {}'.format(num_targets))
#     print('num_aux_targets: {}'.format(num_aux_targets))
#     print('num_all_targets: {}'.format(num_all_targets))
#     print('\n\n')
#     print(train.shape)
#     print(test.shape)
#     print(sample_submission.shape)
#     display(train.head())
#     print(train.columns.to_list())

    return train, test, num_targets, num_aux_targets, num_all_targets, target_cols, aux_target_cols, all_target_cols

In [None]:
# Build the transfer-learning inputs. This rebinds the module-level
# `target_cols` that the TabNet stage defined earlier.
train, test, num_targets, num_aux_targets, num_all_targets, target_cols, aux_target_cols, all_target_cols = preprocessor_nn_transfer()

In [None]:
class MoADataset:
    """Map-style dataset pairing a 2-D feature array with a 2-D target array.

    Indexing returns ``{'x': features_row, 'y': targets_row}`` as float tensors.
    """

    def __init__(self, features, targets):
        self.features, self.targets = features, targets

    def __len__(self):
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        x = torch.tensor(self.features[idx, :], dtype=torch.float)
        y = torch.tensor(self.targets[idx, :], dtype=torch.float)
        return {'x': x, 'y': y}
    
class TestDataset:
    """Map-style dataset over a 2-D feature array, without targets.

    Indexing returns ``{'x': features_row}`` as a float tensor.
    """

    def __init__(self, features):
        self.features = features

    def __len__(self):
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        row = self.features[idx, :]
        return {'x': torch.tensor(row, dtype=torch.float)}
    

def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    """Train ``model`` for one pass over ``dataloader``.

    The scheduler is stepped after every batch, not once per epoch.

    Parameters
    ----------
    model : torch.nn.Module
    optimizer : optimizer exposing ``zero_grad()`` and ``step()``
    scheduler : object exposing ``step()``
    loss_fn : callable ``(outputs, targets) -> scalar tensor``
    dataloader : iterable of batches shaped like ``{'x': ..., 'y': ...}``
    device : device the batch tensors are moved to

    Returns
    -------
    float
        Mean of the per-batch losses.
    """
    model.train()
    running_loss = 0

    for batch in dataloader:
        optimizer.zero_grad()
        features = batch['x'].to(device)
        labels = batch['y'].to(device)

        batch_loss = loss_fn(model(features), labels)
        batch_loss.backward()

        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)

def valid_fn(model, loss_fn, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` without updating it.

    The forward pass and loss run under ``torch.no_grad()`` (as in
    ``inference_fn``), so no autograd graph is built during validation.

    Parameters
    ----------
    model : torch.nn.Module
        Switched to eval mode by this function.
    loss_fn : callable ``(outputs, targets) -> scalar tensor``
    dataloader : iterable of batches shaped like ``{'x': ..., 'y': ...}``
    device : device the batch tensors are moved to

    Returns
    -------
    tuple[float, np.ndarray]
        Mean of the per-batch losses and the sigmoid-activated predictions of
        all batches concatenated along the first axis.
    """
    model.eval()
    final_loss = 0
    valid_preds = []

    for data in dataloader:
        inputs, targets = data['x'].to(device), data['y'].to(device)

        # Validation never back-propagates, so skip gradient tracking.
        with torch.no_grad():
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)

        final_loss += loss.item()
        valid_preds.append(outputs.sigmoid().detach().cpu().numpy())

    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)
    return final_loss, valid_preds

def inference_fn(model, dataloader, device):
    """Predict probabilities for every batch of ``dataloader``.

    Parameters
    ----------
    model : torch.nn.Module
        Switched to eval mode by this function; expected to output logits.
    dataloader : iterable of batches shaped like ``{'x': ...}``
    device : device the batch tensors are moved to

    Returns
    -------
    np.ndarray
        Sigmoid-activated outputs of all batches, concatenated along axis 0.
    """
    model.eval()
    batch_probs = []

    for batch in dataloader:
        features = batch['x'].to(device)
        with torch.no_grad():
            logits = model(features)
        probs = logits.sigmoid().detach().cpu().numpy()
        batch_probs.append(probs)

    return np.concatenate(batch_probs)


import torch
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F

class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Targets are smoothed towards 0.5 before the loss is computed:
    ``t * (1 - smoothing) + 0.5 * smoothing``.

    Parameters
    ----------
    weight : torch.Tensor or None
        Optional rescaling weight forwarded to
        ``F.binary_cross_entropy_with_logits``.
    reduction : str
        ``'mean'`` (default) or ``'sum'``; any other value returns the
        unreduced element-wise loss.
    smoothing : float
        Smoothing factor in ``[0, 1)``.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        """Return ``targets`` pulled towards 0.5 by ``smoothing``.

        ``n_labels`` is accepted for interface compatibility but not used.
        """
        assert 0 <= smoothing < 1

        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing

        return targets

    def forward(self, inputs, targets):
        """Compute the smoothed BCE of logits ``inputs`` against ``targets``."""
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        # Ask for the element-wise loss: the functional defaults to
        # reduction='mean', which previously collapsed the loss to a scalar
        # before the requested reduction could be applied (so 'sum' and 'none'
        # silently behaved like 'mean').
        loss = F.binary_cross_entropy_with_logits(inputs, targets, self.weight,
            reduction='none')

        if  self.reduction == 'sum':
            loss = loss.sum()
        elif  self.reduction == 'mean':
            loss = loss.mean()

        return loss
    
class Model(nn.Module):
    """Five-layer fully connected network that outputs one logit per target.

    Each stage is BatchNorm -> (Dropout) -> Linear; the first four Linear
    layers are followed by leaky ReLU, the last one is weight-normalised and
    returns raw logits. Attribute names are part of the saved checkpoints'
    state-dict keys and must not be renamed.

    Parameters
    ----------
    num_features : int
        Width of the input vector.
    num_targets : int
        Number of output logits.
    """

    def __init__(self, num_features, num_targets):
        super().__init__()
        self.hidden_size = [1500, 1250, 1000, 750]
        self.dropout_value = [0.5, 0.35, 0.3, 0.25]
        h1, h2, h3, h4 = self.hidden_size
        p2, p3, p4, p5 = self.dropout_value

        # Stage 1: no dropout on the raw input.
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = nn.Linear(num_features, h1)

        # Stage 2
        self.batch_norm2 = nn.BatchNorm1d(h1)
        self.dropout2 = nn.Dropout(p2)
        self.dense2 = nn.Linear(h1, h2)

        # Stage 3
        self.batch_norm3 = nn.BatchNorm1d(h2)
        self.dropout3 = nn.Dropout(p3)
        self.dense3 = nn.Linear(h2, h3)

        # Stage 4
        self.batch_norm4 = nn.BatchNorm1d(h3)
        self.dropout4 = nn.Dropout(p4)
        self.dense4 = nn.Linear(h3, h4)

        # Stage 5: output head.
        self.batch_norm5 = nn.BatchNorm1d(h4)
        self.dropout5 = nn.Dropout(p5)
        self.dense5 = nn.utils.weight_norm(nn.Linear(h4, num_targets))

    def forward(self, x):
        """Map a ``(batch, num_features)`` tensor to ``(batch, num_targets)`` logits."""
        x = F.leaky_relu(self.dense1(self.batch_norm1(x)))

        hidden_stages = (
            (self.batch_norm2, self.dropout2, self.dense2),
            (self.batch_norm3, self.dropout3, self.dense3),
            (self.batch_norm4, self.dropout4, self.dense4),
        )
        for norm, drop, dense in hidden_stages:
            x = F.leaky_relu(dense(drop(norm(x))))

        # No activation on the head: callers apply the sigmoid themselves.
        return self.dense5(self.dropout5(self.batch_norm5(x)))
    
class FineTuneScheduler:
    """Gradual-unfreezing helper for fine-tuning a pretrained ``Model``.

    ``copy_without_top`` clones a model, freezes every layer below the output
    head and replaces the head; ``step`` then unfreezes one frozen layer group
    (top-most first) every ``epochs_per_step`` epochs.

    Parameters
    ----------
    epochs : int
        Total number of fine-tuning epochs, spread evenly over the frozen
        layer groups.
    """

    def __init__(self, epochs):
        self.epochs = epochs
        self.epochs_per_step = 0
        self.frozen_layers = []

    def copy_without_top(self, model, num_features, num_targets, num_targets_new):
        """Clone ``model`` with frozen body and a fresh output head.

        Parameters
        ----------
        model : Model
            Pretrained network whose weights are copied.
        num_features, num_targets : int
            Constructor arguments matching ``model``.
        num_targets_new : int
            Output width of the replacement head.

        Returns
        -------
        Model
            The new model, moved to the module-level ``DEVICE``.
        """
        self.frozen_layers = []

        model_new = Model(num_features, num_targets)
        model_new.load_state_dict(model.state_dict())

        # Freeze all weights below the head. The layer group is identified by
        # the trailing digit of the attribute name ("dense3" -> "3").
        for name, param in model_new.named_parameters():
            layer_index = name.split('.')[0][-1]

            # `layer_index` is a string; comparing it with the int 5 was never
            # true, so the head used to be counted as a frozen layer group.
            if layer_index == '5':
                continue

            param.requires_grad = False

            # Save frozen layer names
            if layer_index not in self.frozen_layers:
                self.frozen_layers.append(layer_index)

        if self.frozen_layers:
            # At least 1, otherwise `epoch % self.epochs_per_step` in step()
            # would divide by zero when epochs < number of frozen groups.
            self.epochs_per_step = max(1, self.epochs // len(self.frozen_layers))
        else:
            self.epochs_per_step = 0  # nothing to unfreeze; step() returns early

        # Replace the top layers with another ones
        model_new.batch_norm5 = nn.BatchNorm1d(model_new.hidden_size[3])
        model_new.dropout5 = nn.Dropout(model_new.dropout_value[3])
        model_new.dense5 = nn.utils.weight_norm(nn.Linear(model_new.hidden_size[-1], num_targets_new))
        model_new.to(DEVICE)
        return model_new

    def step(self, epoch, model):
        """Unfreeze the top-most frozen layer group when ``epoch`` hits a step boundary.

        Does nothing once every group has been unfrozen.
        """
        if len(self.frozen_layers) == 0:
            return

        if epoch % self.epochs_per_step == 0:
            last_frozen_index = self.frozen_layers[-1]

            # Unfreeze parameters of the last frozen layer
            for name, param in model.named_parameters():
                layer_index = name.split('.')[0][-1]

                if layer_index == last_frozen_index:
                    param.requires_grad = True

            del self.frozen_layers[-1]  # Remove the last layer as unfrozen

In [None]:
from sklearn.model_selection import KFold

# HyperParameters
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
# EPOCHS and the optimiser settings below (WEIGHT_DECAY, MAX_LR, DIV_FACTOR,
# PCT_START) are not read by the visible inference code, which only loads
# saved weights.
EPOCHS = 24
BATCH_SIZE = 128

# Keyed by training phase: all targets vs. scored targets only.
WEIGHT_DECAY = {'ALL_TARGETS': 1e-5, 'SCORED_ONLY': 3e-6}
MAX_LR = {'ALL_TARGETS': 1e-2, 'SCORED_ONLY': 3e-3}
DIV_FACTOR = {'ALL_TARGETS': 1e3, 'SCORED_ONLY': 1e2}
PCT_START = 0.1

# These rebind the names used by the TabNet stage; here SEEDS is a count
# (iterated with range()), not a list of seed values.
SEEDS = 7
NFOLDS = 7
DRUG_THRESH = 18

# Model inputs: every column of `train` that is neither a target nor an id.
feature_cols = [c for c in train.columns if c not in all_target_cols]
feature_cols = [c for c in feature_cols if c not in ['kfold', 'sig_id', 'drug_id']]
num_features = len(feature_cols)
# Show model architecture
# (the instance is created but not displayed, since the line below is commented out)
model = Model(num_features, num_all_targets)
# model


def make_cv_folds(train, SEEDS, NFOLDS, DRUG_THRESH):
    """Add ``SEEDS`` drug-aware fold columns (``kfold_0`` ...) to ``train``.

    Unlike the earlier definition of the same name in this notebook, ``SEEDS``
    is an integer count here: seeds ``0 .. SEEDS-1`` are used. Drugs occurring
    at most ``DRUG_THRESH`` times are kept whole within one fold; rows of more
    frequent drugs are stratified individually. Reads the module-level
    ``target_cols``.

    Parameters
    ----------
    train : pd.DataFrame
        Must contain ``drug_id``, ``sig_id`` and the ``target_cols`` columns.
        It is modified in place.
    SEEDS : int
        Number of seeds / fold columns to generate.
    NFOLDS : int
        Number of folds.
    DRUG_THRESH : int
        Row-count boundary between "rare" and "frequent" drugs.

    Returns
    -------
    pd.DataFrame
        The same ``train`` object with the new ``int8`` fold columns.
    """
    counts_by_drug = train.drug_id.value_counts()
    is_rare = counts_by_drug <= DRUG_THRESH
    small_drugs = counts_by_drug.loc[is_rare].index.sort_values()
    large_drugs = counts_by_drug.loc[counts_by_drug > DRUG_THRESH].index.sort_values()

    for seed_id in range(SEEDS):
        kfold_col = 'kfold_{}'.format(seed_id)
        fold_of_drug, fold_of_sig = {}, {}

        # Drug-level stratification on each rare drug's mean target vector.
        stratifier = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed_id)
        mean_targets = train.groupby('drug_id')[target_cols].mean().loc[small_drugs]
        for fold, (_, held_out) in enumerate(stratifier.split(mean_targets, mean_targets[target_cols])):
            fold_of_drug.update(dict.fromkeys(mean_targets.index[held_out].values, fold))

        # Row-level stratification for the frequent drugs.
        stratifier = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed_id)
        large_rows = train.loc[train.drug_id.isin(large_drugs)].reset_index(drop=True)
        for fold, (_, held_out) in enumerate(stratifier.split(large_rows, large_rows[target_cols])):
            fold_of_sig.update(dict.fromkeys(large_rows.sig_id[held_out].values, fold))

        # Rows of rare drugs get their fold via drug_id; the rest via sig_id.
        train[kfold_col] = train.drug_id.map(fold_of_drug)
        missing = train[kfold_col].isna()
        train.loc[missing, kfold_col] = train.loc[missing, 'sig_id'].map(fold_of_sig)
        train[kfold_col] = train[kfold_col].astype('int8')

    return train



# Adds the kfold_0 .. kfold_{SEEDS-1} columns to `train` in place.
train = make_cv_folds(train, SEEDS, NFOLDS, DRUG_THRESH)
display(train.head())

def run_training(fold_id, seed_id):
    """Load the fine-tuned checkpoint for one (seed, fold) and predict on test.

    Despite its name this function does no training: it restores the saved
    ``SCORED_ONLY`` weights and runs inference on the module-level ``test``
    frame.

    Parameters
    ----------
    fold_id : int
        Fold index encoded in the checkpoint file name.
    seed_id : int
        Seed index encoded in the checkpoint file name; also passed to
        ``seed_everything``.

    Returns
    -------
    tuple[float, np.ndarray]
        A constant placeholder ``0.02`` in the out-of-fold slot (no validation
        predictions are produced here) and the test predictions.
    """
    seed_everything(seed_id)

    # Load the fine-tuned model with the best loss.
    # map_location lets checkpoints saved on GPU load on a CPU-only machine.
    model = Model(num_features, num_targets)
    checkpoint_path = f"../input/nn-transfer-learning-oof/SCORED_ONLY_SEED_{seed_id}_FOLD{fold_id}_.pth"
    model.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))
    model.to(DEVICE)

    #--------------------- PREDICTION---------------------
    x_test = test[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)

    predictions = inference_fn(model, testloader, DEVICE)
    print(f"Pretrained_Model_Loaded_SCORED_ONLYSEED: {seed_id}, FOLD: {fold_id} ")
    return 0.02, predictions

In [None]:
def run_k_fold(NFOLDS, seed_id):
    """Aggregate ``run_training`` over all folds of one seed.

    Reads the module-level ``train``, ``test`` and ``target_cols`` to size the
    accumulators.

    Parameters
    ----------
    NFOLDS : int
        Number of folds to iterate over (``0 .. NFOLDS-1``).
    seed_id : int
        Seed index forwarded to ``run_training``.

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        The summed out-of-fold values and the fold-averaged test predictions.
    """
    n_targets = len(target_cols)
    oof = np.zeros((len(train), n_targets))
    predictions = np.zeros((len(test), n_targets))

    for fold_id in range(NFOLDS):
        fold_oof, fold_pred = run_training(fold_id, seed_id)
        # Test predictions are averaged across folds; OOF values are summed.
        predictions += fold_pred / NFOLDS
        oof += fold_oof

    return oof, predictions

from time import time

# Averaging on multiple SEEDS
SEED = [0, 1, 2, 3, 4, 5, 6]
# Accumulators for the seed-averaged out-of-fold values and test predictions.
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

time_begin = time()

# Each seed contributes 1/len(SEED) of its k-fold result.
for seed_id in SEED:
    oof_, predictions_ = run_k_fold(NFOLDS, seed_id)
    oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)

time_diff = time() - time_begin

# Overwrite the target columns of both frames with the averaged values.
# NOTE: for `train` this replaces whatever was stored in `target_cols` before.
train[target_cols] = oof
test[target_cols] = predictions

from datetime import timedelta
# Elapsed wall time as H:MM:SS; not the last expression of the cell, so it is not displayed.
str(timedelta(seconds=time_diff))

data_dir = '../input/lish-moa/'
train_targets_scored = pd.read_csv(data_dir + 'train_targets_scored.csv')
train_targets_scored.head()

# Re-align the values stored in `train[target_cols]` to the full list of
# sig_ids; rows missing from `train` get 0 for every target.
valid_results = train_targets_scored.drop(columns=target_cols).merge(train[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)

y_true = train_targets_scored[target_cols].values
y_pred = valid_results[target_cols].values

score = 0

# Mean of the per-target log losses.
# NOTE: run_training() returns the constant 0.02 in place of real
# out-of-fold predictions, so this score is computed from placeholder values.
for i in range(len(target_cols)):
    score += log_loss(y_true[:, i], y_pred[:, i])

print("CV log_loss: ", score / y_pred.shape[1])

# Build the submission in sample_submission row order; sig_ids missing from
# `test` get 0 for every target.
sample_submission = pd.read_csv(data_dir + 'sample_submission.csv')
sub_nn_transfer = sample_submission.drop(columns=target_cols).merge(test[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
sub_nn_transfer.to_csv('submission_nn_transfer.csv', index=False)

In [None]:
# Load the out-of-fold predictions saved for the transfer-learning model
# and display their shape.
saved_path_name = '../input/nn-transfer-learning-oof/'
oof_nn_transfer = np.load(saved_path_name + 'oof_nn_transfer_all.npy')
oof_nn_transfer.shape

In [None]:
# Release the frames of this section; the next section re-reads its own data.
del sample_submission, test, train, train_targets_scored, valid_results 
gc.collect()

In [None]:
# Sanity check of the transfer-learning submission frame.
print(sub_nn_transfer.shape)
display(sub_nn_transfer.head())


In [None]:
def readin_data_resnet():
    """Load the inputs of the ResNet section.

    Control-vehicle rows (``cp_type == 'ctl_vehicle'``) are removed from all
    training-side outputs. The test frame keeps every row and its ``cp_type``
    column.

    Returns:
        tuple: ``(train_features, labels_train, test_features, targets_drug,
        predictors, cs, gs)`` where

        * ``train_features`` - training features without ``sig_id``,
          ``cp_type``, ``cp_dose`` and ``cp_time``;
        * ``labels_train`` - scored targets as an array, same rows;
        * ``test_features`` - test features without ``sig_id``, ``cp_dose``
          and ``cp_time``;
        * ``targets_drug`` - scored targets merged with the drug table on
          ``sig_id`` (``sig_id`` is kept), same rows as ``train_features``;
        * ``predictors`` - the ``'start_predictors'`` entry of the JSON file;
        * ``cs`` / ``gs`` - boolean masks over ``train_features.columns``
          marking names that contain ``'c-'`` / ``'g-'``.
    """
    # Raw tables
    raw_train = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
    drug_table = pd.read_csv('../input/lish-moa/train_drug.csv')
    scored = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')

    # Index labels of the treated rows; reused below as positions for
    # iloc / numpy indexing.
    keep_rows = raw_train.index[raw_train['cp_type'] != 'ctl_vehicle'].to_list()

    # Drop training data with ctl vehicle
    train_features = raw_train.drop(['sig_id', 'cp_type', 'cp_dose', 'cp_time'], axis=1).iloc[keep_rows]
    targets_drug = scored.merge(drug_table, on='sig_id').iloc[keep_rows]
    labels_train = scored.drop('sig_id', axis=1).values[keep_rows]

    # Import test data
    test_features = pd.read_csv('/kaggle/input/lish-moa/test_features.csv').drop(
        ['sig_id', 'cp_dose', 'cp_time'], axis=1)

    # Import predictors from public kernel
    json_file_path = '../input/ttestpcarfelogisticregression/main_predictors.json'
    with open(json_file_path, 'r') as handle:
        predictors = json.load(handle)['start_predictors']

    column_names = train_features.columns
    cs = column_names.str.contains('c-')
    gs = column_names.str.contains('g-')
    return train_features, labels_train, test_features, targets_drug, predictors, cs, gs

#target_cols = train_targets_scored.drop('sig_id', axis=1).columns.values.tolist()




In [None]:
# Create g-mean, c-mean, genes_pca (2 components), cells_pca (all components)

def logloss(y_true, y_pred):
    """Mean binary cross-entropy with predictions clipped to [p_min, p_max].

    ``p_min`` and ``p_max`` are module-level globals that must be defined
    before this function is called.
    """
    clipped = tf.clip_by_value(y_pred, p_min, p_max)
    positive_term = y_true * backend.log(clipped)
    negative_term = (1 - y_true) * backend.log(1 - clipped)
    return -backend.mean(positive_term + negative_term)

def resnet_preprocessor(train,test,cs,gs):
    """Append PCA and row-mean features, then quantile-transform everything.

    Args:
        train: 2-D array of training features.
        test: 2-D array of test features with the same column layout.
        cs: Column mask selecting the ``c-`` columns.
        gs: Column mask selecting the ``g-`` columns.

    Returns:
        tuple: ``(train, test)`` arrays with the extra columns appended in the
        order [original, gene PCA, cell PCA, c-mean, g-mean], both transformed
        by a QuantileTransformer fitted on the training array only.
    """
    n_gs = 2    # No of gene PCA comps to include
    n_cs = 100  # No of cell PCA comps to include

    train_g, test_g = train[:, gs], test[:, gs]
    train_c, test_c = train[:, cs], test[:, cs]

    # PCA - fitted on train only; the gene PCA is fitted before the cell PCA.
    gene_pca = PCA(n_components=n_gs)
    cell_pca = PCA(n_components=n_cs)
    train_pca_gs = gene_pca.fit_transform(train_g)
    train_pca_cs = cell_pca.fit_transform(train_c)
    test_pca_gs = gene_pca.transform(test_g)
    test_pca_cs = cell_pca.transform(test_c)

    # Append Features: PCA blocks followed by the c-mean and g-mean columns
    train = np.column_stack((train, train_pca_gs, train_pca_cs,
                             train_c.mean(axis=1), train_g.mean(axis=1)))
    test = np.column_stack((test, test_pca_gs, test_pca_cs,
                            test_c.mean(axis=1), test_g.mean(axis=1)))

    # Scaler for numerical values: fit on train, apply to both
    scaler = preprocessing.QuantileTransformer(n_quantiles=100,random_state=0, output_distribution="normal")
    train = scaler.fit_transform(train)
    test = scaler.transform(test)

    return train, test




In [None]:
def resnet_model(n_features, n_features_2, n_labels, label_smoothing = 0.0005):
    """Build and compile the two-input Keras network.

    Args:
        n_features: Width of the first input (``Input1``).
        n_features_2: Width of the second input (``Input2``).
        n_labels: Number of output labels.
        label_smoothing: Label smoothing passed to ``BinaryCrossentropy``.

    Returns:
        A compiled Keras model mapping ``[Input1, Input2]`` to ``n_labels``
        sigmoid outputs, optimised with Adam and tracking ``logloss``.
    """
    input_1 = layers.Input(shape = (n_features,), name = 'Input1')
    input_2 = layers.Input(shape = (n_features_2,), name = 'Input2')

    # Head 1: encodes Input1 into a 256-wide representation.
    head_1 = Sequential([
        layers.BatchNormalization(),
        layers.Dropout(0.25),
        layers.Dense(512, activation="elu"),
        layers.BatchNormalization(),
        layers.Dense(256, activation = "elu")
        ],name='Head1')

    input_3 = head_1(input_1)
    input_3_concat = layers.Concatenate()([input_2, input_3])

    # Head 2: processes Input2 concatenated with the Head 1 output.
    head_2 = Sequential([
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(512, "relu"),
        layers.BatchNormalization(),
        layers.Dense(512, "elu"),
        layers.BatchNormalization(),
        layers.Dense(256, "relu"),
        layers.BatchNormalization(),
        layers.Dense(256, "elu")
        ],name='Head2')

    input_4 = head_2(input_3_concat)
    # Skip connection: element-wise average of the Head 1 and Head 2 outputs
    # (both 256 wide).
    input_4_avg = layers.Average()([input_3, input_4])

    # Head 3: classification head ending in per-label sigmoids.
    head_3 = Sequential([
        layers.BatchNormalization(),
        layers.Dense(256, kernel_initializer='lecun_normal', activation='selu'),
        layers.BatchNormalization(),
        layers.Dense(n_labels, kernel_initializer='lecun_normal', activation='selu'),
        layers.BatchNormalization(),
        layers.Dense(n_labels, activation="sigmoid")
        ],name='Head3')

    output = head_3(input_4_avg)

    # Use keras.Model explicitly: the bare name `Model` is rebound to a
    # PyTorch nn.Module class elsewhere in this notebook, so it cannot be
    # relied on to build a Keras model.
    model = keras.Model(inputs = [input_1, input_2], outputs = output)
    model.compile(optimizer='adam', loss=losses.BinaryCrossentropy(label_smoothing=label_smoothing), metrics=[logloss])

    return model

In [None]:
def resnet_make_cv_folds(targets_drug,target_cols, SEEDS, NFOLDS, DRUG_THRESH):
    """Add one ``kfold_<seed>`` column per seed to ``targets_drug``.

    Drugs with at most ``DRUG_THRESH`` rows are assigned to a fold as a whole
    (stratified on the drug's mean targets). Rows of more frequent drugs are
    stratified individually.

    Args:
        targets_drug: Frame with ``sig_id``, ``drug_id`` and the target
            columns. It is modified in place and also returned.
        target_cols: Names of the target columns used for stratification.
        SEEDS: Iterable of seeds; each is used as the splitter random state
            and as the column-name suffix.
        NFOLDS: Number of folds.
        DRUG_THRESH: Row-count threshold separating rare from frequent drugs.

    Returns:
        The same ``targets_drug`` frame with the int8 fold columns added.
    """
    drug_counts = targets_drug.drug_id.value_counts()
    rare_drugs = drug_counts.loc[drug_counts <= DRUG_THRESH].index.sort_values()
    frequent_drugs = drug_counts.loc[drug_counts > DRUG_THRESH].index.sort_values()

    for seed_id in SEEDS:
        kfold_col = 'kfold_{}'.format(seed_id)

        # Rare drugs: one fold per drug, stratified on the per-drug target means.
        drug_splitter = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed_id)
        drug_profile = targets_drug.groupby('drug_id')[target_cols].mean().loc[rare_drugs]
        drug_to_fold = {}
        for fold, (_, valid_pos) in enumerate(drug_splitter.split(drug_profile, drug_profile[target_cols])):
            drug_to_fold.update(dict.fromkeys(drug_profile.index[valid_pos].values, fold))

        # Frequent drugs: one fold per row, keyed by sig_id.
        row_splitter = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed_id)
        frequent_rows = targets_drug.loc[targets_drug.drug_id.isin(frequent_drugs)].reset_index(drop=True)
        sig_to_fold = {}
        for fold, (_, valid_pos) in enumerate(row_splitter.split(frequent_rows, frequent_rows[target_cols])):
            sig_to_fold.update(dict.fromkeys(frequent_rows.sig_id[valid_pos].values, fold))

        # Assign folds: drug-level mapping first, sig_id-level mapping for the rest.
        targets_drug[kfold_col] = targets_drug.drug_id.map(drug_to_fold)
        unassigned = targets_drug[kfold_col].isna()
        targets_drug.loc[unassigned, kfold_col] = targets_drug.loc[unassigned, 'sig_id'].map(sig_to_fold)
        targets_drug[kfold_col] = targets_drug[kfold_col].astype('int8')

    return targets_drug



In [None]:
# Generate Seeds

def predict_resnet(train_features,test_features,labels_train, targets_drug1,SEEDS, NFOLDS,cs,gs):
    """Run the saved two-input Keras models on the validation folds and the test set.

    For every (seed, fold) the matching saved model is loaded, its
    predictions on that fold's validation rows are added to the out-of-fold
    array, and its test predictions are added to the test average.

    Reads the module-level ``predictors`` (column names of the second model
    input) and ``logloss`` (custom object needed to deserialize the models).

    Args:
        train_features: Training feature frame (control rows already removed).
        test_features: Test feature frame; must still contain ``cp_type``,
            which is dropped before preprocessing.
        labels_train: Training label array; only its shape is used.
        targets_drug1: Frame holding the ``kfold_<seed>`` columns, row-aligned
            with ``train_features``.
        SEEDS: Iterable of seeds identifying fold columns and model folders.
        NFOLDS: Number of folds per seed.
        cs, gs: Column masks forwarded to ``resnet_preprocessor``.

    Returns:
        tuple: ``(oof, y_pred)`` - seed-averaged out-of-fold predictions and
        the test predictions averaged over all folds and seeds.
    """
    n_labels = labels_train.shape[1]
    n_train = train_features.shape[0]
    n_test = test_features.shape[0]
    n_seeds = len(SEEDS)
    n_folds = NFOLDS

    y_pred = np.zeros((n_test, n_labels))
    oof = np.zeros((n_train, n_labels))

    # Preprocessing is fitted once on the full training matrix and shared by
    # every fold.
    train_all, data_test = resnet_preprocessor(train_features.values,
                                               test_features.drop('cp_type', axis=1).values, cs, gs)
    # Second model input for the test set; identical for every fold, so it is
    # built once.
    data_test_2 = test_features[predictors].values

    for seed in SEEDS:
        kfold_col = 'kfold_{}'.format(seed)
        for fold in range(NFOLDS):
            # Positional mask of this fold's validation rows.
            valid_mask = (targets_drug1[kfold_col] == fold).values
            X_valid = train_all[valid_mask]
            X_valid_2 = train_features.loc[valid_mask, predictors].values

            model = keras.models.load_model('/kaggle/input/resnet-001843/TwoHeads_seed_'+str(seed)+'_fold_'+str(fold), custom_objects={'logloss':logloss})

            # OOF predictions: each row is validated once per seed.
            oof[valid_mask] += model.predict([X_valid, X_valid_2]) / n_seeds

            # Test predictions: averaged over every fold of every seed.
            y_pred += model.predict([data_test, data_test_2]) / (n_folds * n_seeds)

    return oof, y_pred



In [None]:
#oof, test_pred = predict_nn_model(train,test,targets)
# Load the ResNet inputs. `predictors` must stay a module-level name because
# predict_resnet() reads it as a global.
train_features,labels_train,test_features,targets_drug,predictors,cs,gs = readin_data_resnet()
# Clipping bounds; logloss() reads these as globals.
p_min = 0.0005
p_max = 0.9995
DRUG_THRESH = 18
n_seeds = 7
# Draw the seed list reproducibly; the values select both the fold columns
# and the saved-model folder names.
np.random.seed(42)
SEEDS = np.random.randint(0,100,size=n_seeds)
#SEEDS = [0,42]
NFOLDS = 10
target_cols = targets_drug.drop(['sig_id','drug_id'], axis=1).columns.values.tolist()
# resnet_make_cv_folds() mutates its input, so hand it a copy.
targets_drug_copy = targets_drug.copy()
targets_drug1 = resnet_make_cv_folds(targets_drug_copy,target_cols, SEEDS, NFOLDS, DRUG_THRESH)

oof_rs, test_pred_rs = predict_resnet(train_features,test_features,labels_train, targets_drug1,SEEDS, NFOLDS,cs,gs)

In [None]:
# Fill the sample submission with the clipped ResNet test predictions.
sub_rs = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')
sub_rs.iloc[:,1:] = np.clip(test_pred_rs,p_min,p_max)

# Set ctl_vehicle to 0
# .iloc is given a plain boolean array: a boolean Series carries index labels,
# which positional indexing does not accept consistently.
ctl_mask = (test_features['cp_type'] == 'ctl_vehicle').to_numpy()
sub_rs.iloc[ctl_mask,1:] = 0

# Save Submission
sub_rs.to_csv('submission_rs.csv', index=False)

In [None]:
%who DataFrame

In [None]:
# Release the ResNet section's training-side frames before the next section
# reloads the raw data.
del targets_drug, targets_drug1, targets_drug_copy, train_features
gc.collect()

In [None]:
## multi-heads

# Raw competition tables for the multi-heads section (these rebind the
# `train_features` / `test_features` names used by the ResNet section).
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')
train_drug = pd.read_csv('../input/lish-moa/train_drug.csv')

test_features = pd.read_csv('../input/lish-moa/test_features.csv')
sample_submission = pd.read_csv('../input/lish-moa/sample_submission.csv')



In [None]:
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch (CPU + CUDA) RNGs.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
    

def remove_ctl(no_ctl,train,test,targets):
    """Optionally drop the control-vehicle samples.

    Args:
        no_ctl: When truthy, rows with ``cp_type == 'ctl_vehicle'`` are
            removed; otherwise the inputs are returned unchanged.
        train: Training features; must contain ``cp_type``.
        test: Test features; must contain ``cp_type``.
        targets: Target frame, row-aligned (by position) with ``train``.

    Returns:
        tuple: ``(train, test, targets)``. When filtering is applied the
        results are new frames with a fresh 0..n-1 index; the inputs are not
        modified.
    """
    if no_ctl:
        # Drop the samples with cp_type == ctl_vehicle
        print('not_ctl')
        keep_train = (train['cp_type'] != 'ctl_vehicle').to_numpy()
        keep_test = (test['cp_type'] != 'ctl_vehicle').to_numpy()
        # targets is filtered positionally with the train mask, so the result
        # does not depend on train carrying a default RangeIndex.
        targets = targets[keep_train].reset_index(drop=True)
        train = train[keep_train].reset_index(drop=True)
        test = test[keep_test].reset_index(drop=True)

    return train,test,targets

def remove_small_variance(variance_threshould,train,test):
    """Keep only the feature columns whose training variance reaches the threshold.

    The four metadata columns (``sig_id``, ``cp_type``, ``cp_time``,
    ``cp_dose``) are always kept and placed first. The column selection is
    derived from ``train`` and applied unchanged to ``test``.

    Returns:
        tuple: ``(train, test)`` as new frames.
    """
    meta_cols = ['sig_id', 'cp_type', 'cp_time', 'cp_dose']
    numeric_cols = [name for name in list(train.columns) if name not in meta_cols]
    keep_mask = (train[numeric_cols].var() >= variance_threshould).values

    def _select(frame):
        # metadata first, then the surviving numeric columns
        return pd.concat([frame[meta_cols], frame[numeric_cols].loc[:, keep_mask]], axis=1)

    return _select(train), _select(test)

def pca_process(ncompo_genes,ncompo_cells,train,test):
    """Append PCA components of the gene and cell columns to both frames.

    Each PCA is fitted on the training rows only (with the module-level
    ``base_seed`` as random state) and then applied to train and test.

    Args:
        ncompo_genes: Number of components for the ``g-`` columns.
        ncompo_cells: Number of components for the ``c-`` columns.
        train, test: Frames holding the ``g-`` / ``c-`` columns.

    Returns:
        tuple: ``(train_df, test_df)`` with ``pca_g-<i>`` and ``pca_c-<i>``
        columns concatenated on the right.
    """
    GENES = [col for col in train.columns if col.startswith('g-')]
    CELLS = [col for col in train.columns if col.startswith('c-')]
    gene_names = [f'pca_g-{i}' for i in range(ncompo_genes)]
    cell_names = [f'pca_c-{i}' for i in range(ncompo_cells)]

    def _project(columns, n_components, names):
        # fit on train, transform both, wrap the results as named frames
        pca = PCA(n_components=n_components, random_state=base_seed).fit(train[columns])
        return (pd.DataFrame(pca.transform(train[columns]), columns=names),
                pd.DataFrame(pca.transform(test[columns]), columns=names))

    train_pca_gene, test_pca_gene = _project(GENES, ncompo_genes, gene_names)
    train_pca_cell, test_pca_cell = _project(CELLS, ncompo_cells, cell_names)

    train_df = pd.concat([train, train_pca_gene, train_pca_cell], axis=1)
    test_df = pd.concat([test, test_pca_gene, test_pca_cell], axis=1)
    return train_df,test_df


def scale_data(scale, train_df,test_df):
    """Scale the numeric, non-PCA feature columns of both frames in place.

    The scaler is always fitted on ``train_df`` and then applied to
    ``test_df``. Metadata columns (``sig_id``, ``cp_type``, ``cp_time``,
    ``cp_dose``) and columns whose name contains ``pca_`` are left untouched.

    Args:
        scale: ``'minmax'`` for min-max scaling to the training range,
            ``'rankgauss'`` for a quantile transform to a normal distribution;
            any other value leaves the data unchanged.
        train_df: Training frame (modified in place).
        test_df: Test frame (modified in place).

    Returns:
        tuple: ``(train_df, test_df)``.
    """
    meta_cols = ['sig_id', 'cp_type', 'cp_time', 'cp_dose']
    cols_numeric = [feat for feat in list(train_df.columns) if feat not in meta_cols and "pca_" not in feat]

    if scale == 'minmax':
        # Min-max normalization using the training minimum and range.
        # NOTE(review): the original branch referenced undefined names and
        # could not run; the column set is assumed to match the rankgauss
        # branch - confirm.
        print('minmax')
        col_min = train_df[cols_numeric].min()
        # Constant columns have a zero range; divide by 1 so they map to 0.
        col_range = (train_df[cols_numeric].max() - col_min).replace(0, 1)
        train_df[cols_numeric] = (train_df[cols_numeric] - col_min) / col_range
        test_df[cols_numeric] = (test_df[cols_numeric] - col_min) / col_range

    elif scale == 'rankgauss':
        # RankGauss
        print('rankgauss')
        scaler = QuantileTransformer(n_quantiles=100,random_state=0, output_distribution="normal")
        train_df[cols_numeric] = scaler.fit_transform(train_df[cols_numeric])
        test_df[cols_numeric] = scaler.transform(test_df[cols_numeric])

    return train_df,test_df

def add_features(train,test):
    """Add row-wise summary statistics of the gene and cell columns.

    For each of sum, mean, std, kurt and skew, three columns are added to
    both frames in place: ``g_<stat>`` (gene columns), ``c_<stat>`` (cell
    columns) and ``gc_<stat>`` (both together).

    Returns:
        tuple: ``(train, test)`` - the same, mutated frames.
    """
    GENES = [col for col in train.columns if col.startswith('g-')]
    CELLS = [col for col in train.columns if col.startswith('c-')]
    column_groups = (('g_', GENES), ('c_', CELLS), ('gc_', GENES + CELLS))

    for stat_name in tqdm.tqdm(['sum', 'mean', 'std', 'kurt', 'skew']):
        for frame in (train, test):
            for prefix, columns in column_groups:
                # e.g. frame[GENES].sum(axis=1)
                frame[prefix + stat_name] = getattr(frame[columns], stat_name)(axis=1)
    return train,test

def process_data(data):
    """Return a copy of ``data`` with ``cp_time`` and ``cp_dose`` one-hot encoded."""
    categorical_cols = ['cp_time','cp_dose']
    return pd.get_dummies(data, columns=categorical_cols)

def data_processing(train,test, targets):
    """Run the multi-heads feature pipeline and report shapes after each step.

    Steps, in order: control removal, low-variance filter, PCA, scaling,
    summary-statistic features. The step settings (``no_ctl``,
    ``variance_threshould``, ``ncompo_genes``, ``ncompo_cells``, ``scale``)
    are read from module-level globals.

    Returns:
        tuple: ``(train_df, test_df, targets)``.
    """
    def _report(label, left, right):
        print(label, left.shape, right.shape)

    train, test, targets = remove_ctl(no_ctl, train, test, targets)
    _report("remove control: ", train, test)

    train, test = remove_small_variance(variance_threshould, train, test)
    _report("remove small_var: ", train, test)

    train_df, test_df = pca_process(ncompo_genes, ncompo_cells, train, test)
    _report("do  PCA:  ", train_df, test_df)

    train_df, test_df = scale_data(scale, train_df, test_df)
    _report("rankGauss: ", train_df, test_df)

    train_df, test_df = add_features(train_df, test_df)
    _report("add stats feature:", train_df, test_df)

    return train_df,test_df,targets

In [None]:
def make_cv_folds(train, SEEDS, NFOLDS, DRUG_THRESH):
    """Add ``kfold_0 .. kfold_<SEEDS-1>`` columns to ``train``.

    Drugs with at most ``DRUG_THRESH`` rows are assigned to a fold as a whole
    (stratified on the drug's mean targets). Rows of more frequent drugs are
    stratified individually. The target columns come from the module-level
    ``target_cols``.

    Args:
        train: Frame with ``sig_id``, ``drug_id`` and the target columns. It
            is modified in place and also returned.
        SEEDS: Number of seeds; seeds ``0 .. SEEDS-1`` are used.
        NFOLDS: Number of folds.
        DRUG_THRESH: Row-count threshold separating rare from frequent drugs.

    Returns:
        The same ``train`` frame with the int8 fold columns added.
    """
    drug_counts = train.drug_id.value_counts()
    rare_drugs = drug_counts.loc[drug_counts <= DRUG_THRESH].index.sort_values()
    frequent_drugs = drug_counts.loc[drug_counts > DRUG_THRESH].index.sort_values()

    for seed_id in range(SEEDS):
        kfold_col = 'kfold_{}'.format(seed_id)

        # Rare drugs: one fold per drug, stratified on the per-drug target means.
        drug_splitter = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed_id)
        drug_profile = train.groupby('drug_id')[target_cols].mean().loc[rare_drugs]
        drug_to_fold = {}
        for fold, (_, valid_pos) in enumerate(drug_splitter.split(drug_profile, drug_profile[target_cols])):
            drug_to_fold.update(dict.fromkeys(drug_profile.index[valid_pos].values, fold))

        # Frequent drugs: one fold per row, keyed by sig_id.
        row_splitter = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed_id)
        frequent_rows = train.loc[train.drug_id.isin(frequent_drugs)].reset_index(drop=True)
        sig_to_fold = {}
        for fold, (_, valid_pos) in enumerate(row_splitter.split(frequent_rows, frequent_rows[target_cols])):
            sig_to_fold.update(dict.fromkeys(frequent_rows.sig_id[valid_pos].values, fold))

        # Assign folds: drug-level mapping first, sig_id-level mapping for the rest.
        train[kfold_col] = train.drug_id.map(drug_to_fold)
        unassigned = train[kfold_col].isna()
        train.loc[unassigned, kfold_col] = train.loc[unassigned, 'sig_id'].map(sig_to_fold)
        train[kfold_col] = train[kfold_col].astype('int8')

    return train

In [None]:
# Preprocessing switches read as globals by data_processing() / pca_process().
no_ctl = True                  # drop cp_type == 'ctl_vehicle' rows
scale = 'rankgauss'            # scaling mode passed to scale_data()
variance_threshould = 0.8      # minimum training variance kept by remove_small_variance()
decompo = 'PCA'
ncompo_genes = 50              # PCA components for the g- columns
ncompo_cells = 8               # PCA components for the c- columns
encoding = 'dummy'
base_seed = 42                 # random_state of the PCAs and the seed set before data_processing()

# HyperParameters

DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE)
EPOCHS = 20
BATCH_SIZE = 128
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5
NFOLDS = 7
EARLY_STOPPING_STEPS = 10
EARLY_STOP = True

hidden_size=1024

# for KFold parameters
# SEEDS is a count (make_cv_folds iterates range(SEEDS)); SEED is the explicit
# list looped over at prediction time. The two must describe the same seeds.
SEEDS = 7
SEED = [0, 1, 2, 3, 4, 5, 6]
DRUG_THRESH = 18

In [None]:
seed_everything(base_seed)
# Full preprocessing pipeline; `targets` is the scored-target frame filtered
# to the same rows as `train`.
train, test, targets = data_processing(train_features,test_features, train_targets_scored) 

target_cols = train_targets_scored.drop('sig_id', axis=1).columns.values.tolist()
print(len(target_cols))

# One frame holding features and targets side by side.
train_comb = train.merge(targets, on='sig_id')
train_comb.drop('cp_type', axis=1, inplace=True)

print("combine train and targets to prepare for kfolds ", train_comb.shape)

# Feature names are taken after one-hot encoding so the cp_time / cp_dose
# dummy columns are included.
feature_cols = [c for c in process_data(train_comb).columns if c not in target_cols]
feature_cols = [c for c in feature_cols if c not in ['kfold','sig_id', 'drug_id']]
print(len(feature_cols))

# Split the features into the groups fed to the two model branches.
gen_cols = [c for c in feature_cols if 'g-' == c[:2]]
cell_cols = [c for c in feature_cols if 'c-' == c[:2]]
pca_gen_cols = [c for c in feature_cols if 'pca_g' == c[:5]]
pca_cell_cols = [c for c in feature_cols if 'pca_c' == c[:5]]
# Everything else: the one-hot columns plus the g_/c_/gc_ statistic columns
# created by add_features().
ohe_cols = [c for c in feature_cols if c not in gen_cols+cell_cols+pca_gen_cols+pca_cell_cols]
print(len(gen_cols), len(cell_cols), len(pca_gen_cols), len(pca_cell_cols),len(ohe_cols))

num_features=len(feature_cols)
num_targets=len(target_cols)

# Attach drug ids and the per-seed fold columns.
folds = train_comb.copy()
folds = folds.merge(train_drug, on='sig_id')
folds = make_cv_folds(folds, SEEDS, NFOLDS, DRUG_THRESH)

In [None]:
class Model(nn.Module):
    """Two-branch MLP: one branch for gene features, one for cell features.

    Each branch is BatchNorm -> Dropout -> Linear -> ReLU, applied twice. The
    gene branch is ``hidden_size`` wide, the cell branch ``hidden_size / 2``.
    The branch outputs are concatenated and passed through
    BatchNorm -> Dropout -> Linear to produce ``num_targets`` raw logits (no
    sigmoid is applied here). All linear layers use weight normalisation.
    """

    def __init__(self, num_gen_features, num_cell_features,num_targets, hidden_size):
        super().__init__()
        half_hidden = int(hidden_size/2)
        merged_size = hidden_size + half_hidden

        # Gene branch
        self.gen_batch_norm1 = nn.BatchNorm1d(num_gen_features)
        self.gen_dropout1 = nn.Dropout(0.2)
        self.gen_dense1 = nn.utils.weight_norm(nn.Linear(num_gen_features, hidden_size))
        self.gen_batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.gen_dropout2 = nn.Dropout(0.3)
        self.gen_dense2 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size))

        # Cell branch (half the width of the gene branch)
        self.cell_batch_norm1 = nn.BatchNorm1d(num_cell_features)
        self.cell_dropout1 = nn.Dropout(0.2)
        self.cell_dense1 = nn.utils.weight_norm(nn.Linear(num_cell_features, half_hidden))
        self.cell_batch_norm2 = nn.BatchNorm1d(half_hidden)
        self.cell_dropout2 = nn.Dropout(0.3)
        self.cell_dense2 = nn.utils.weight_norm(nn.Linear(half_hidden, half_hidden))

        # Output head on the concatenated branches
        self.batch_norm3 = nn.BatchNorm1d(merged_size)
        self.dropout3 = nn.Dropout(0.25)
        self.dense3 = nn.utils.weight_norm(nn.Linear(merged_size, num_targets))

    def forward(self, gen_x, cell_x):
        """Return the logits for a batch of gene and cell feature tensors."""
        genes = self.gen_dropout1(self.gen_batch_norm1(gen_x))
        genes = F.relu(self.gen_dense1(genes))
        genes = self.gen_dropout2(self.gen_batch_norm2(genes))
        genes = F.relu(self.gen_dense2(genes))

        cells = self.cell_dropout1(self.cell_batch_norm1(cell_x))
        cells = F.relu(self.cell_dense1(cells))
        cells = self.cell_dropout2(self.cell_batch_norm2(cells))
        cells = F.relu(self.cell_dense2(cells))

        merged = torch.cat((genes, cells), dim=1)
        return self.dense3(self.dropout3(self.batch_norm3(merged)))

In [None]:
class MoADataset:
    """Indexable dataset yielding gene features, cell features and targets.

    Each item is a dict with float tensors under the keys ``'gen_x'``,
    ``'cell_x'`` and ``'y'``, taken from row ``idx`` of the three arrays.
    """

    def __init__(self, gen_features, cell_features, targets):
        self.gen_features, self.cell_features, self.targets = gen_features, cell_features, targets

    def __len__(self):
        # number of rows of the gene feature array
        return self.gen_features.shape[0]

    def __getitem__(self, idx):
        sources = (
            ('gen_x', self.gen_features),
            ('cell_x', self.cell_features),
            ('y', self.targets),
        )
        return {key: torch.tensor(array[idx, :], dtype=torch.float) for key, array in sources}
    
class TestDataset:
    """Indexable dataset yielding gene and cell features (no targets).

    Each item is a dict with float tensors under the keys ``'gen_x'`` and
    ``'cell_x'``, taken from row ``idx`` of the two arrays.
    """

    def __init__(self, gen_features, cell_features):
        self.gen_features, self.cell_features = gen_features, cell_features

    def __len__(self):
        # number of rows of the gene feature array
        return self.gen_features.shape[0]

    def __getitem__(self, idx):
        sources = (
            ('gen_x', self.gen_features),
            ('cell_x', self.cell_features),
        )
        return {key: torch.tensor(array[idx, :], dtype=torch.float) for key, array in sources}


In [None]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    """Train ``model`` for one pass over ``dataloader``.

    The scheduler is stepped after every batch, not once per epoch.

    Args:
        model: Module called as ``model(gen_x, cell_x)``.
        optimizer: Optimizer updating the model parameters.
        scheduler: Object with a ``step()`` method.
        loss_fn: Callable ``loss_fn(outputs, targets)``.
        dataloader: Sized iterable of dicts with keys ``'gen_x'``,
            ``'cell_x'`` and ``'y'``.
        device: Device the batches are moved to.

    Returns:
        float: Mean of the per-batch losses.
    """
    model.train()
    running_loss = 0

    for batch in dataloader:
        gen_x = batch['gen_x'].to(device)
        cell_x = batch['cell_x'].to(device)
        targets = batch['y'].to(device)

        optimizer.zero_grad()
        loss = loss_fn(model(gen_x, cell_x), targets)
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()

    return running_loss / len(dataloader)

def valid_fn(model, loss_fn, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Module called as ``model(gen_x, cell_x)`` and returning logits.
        loss_fn: Callable ``loss_fn(outputs, targets)``.
        dataloader: Sized iterable of dicts with keys ``'gen_x'``,
            ``'cell_x'`` and ``'y'``.
        device: Device the batches are moved to.

    Returns:
        tuple: ``(final_loss, valid_preds)`` - the mean of the per-batch
        losses and the sigmoid of the logits for all batches, concatenated
        along the first axis as a NumPy array.
    """
    model.eval()
    final_loss = 0
    valid_preds = []

    # No gradients are needed for validation; this avoids building autograd
    # graphs and matches inference_fn.
    with torch.no_grad():
        for data in dataloader:
            gen_x, cell_x, targets = data['gen_x'].to(device), data['cell_x'].to(device), data['y'].to(device)
            outputs = model(gen_x, cell_x)
            loss = loss_fn(outputs, targets)

            final_loss += loss.item()
            valid_preds.append(outputs.sigmoid().detach().cpu().numpy())

    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)

    return final_loss, valid_preds

def inference_fn(model, dataloader, device):
    """Return sigmoid predictions of ``model`` for every batch of ``dataloader``.

    Args:
        model: Module called as ``model(gen_x, cell_x)`` and returning logits.
        dataloader: Iterable of dicts with keys ``'gen_x'`` and ``'cell_x'``.
        device: Device the batches are moved to.

    Returns:
        NumPy array with the per-batch predictions concatenated along the
        first axis.
    """
    model.eval()

    def _predict(batch):
        gen_x = batch['gen_x'].to(device)
        cell_x = batch['cell_x'].to(device)
        with torch.no_grad():
            logits = model(gen_x, cell_x)
        return logits.sigmoid().detach().cpu().numpy()

    return np.concatenate([_predict(batch) for batch in dataloader])

In [None]:

class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Targets are moved towards 0.5: ``t * (1 - smoothing) + 0.5 * smoothing``.

    Args:
        weight: Optional rescaling weight forwarded to
            ``F.binary_cross_entropy_with_logits``.
        reduction: ``'mean'`` (default), ``'sum'``, or any other value to get
            the unreduced element-wise loss.
        smoothing: Smoothing factor in ``[0, 1)``.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        # The parent constructor stores `weight` and `reduction`.
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        """Return the smoothed targets; ``n_labels`` is accepted but unused."""
        assert 0 <= smoothing < 1

        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing

        return targets

    def forward(self, inputs, targets):
        """Compute the smoothed loss of ``inputs`` (logits) against ``targets``."""
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        # Ask for the element-wise loss and reduce it here. With the
        # functional default ('mean') the loss is already a scalar, so 'sum'
        # and 'none' would silently return the mean.
        loss = F.binary_cross_entropy_with_logits(inputs, targets, self.weight,
                                                  reduction='none')

        if self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()

        return loss

In [None]:
def get_oof_predictions(fold, seed):
    """Predict the test set with the saved multi-heads model of one (fold, seed).

    Despite the name, no out-of-fold predictions are produced: the checkpoint
    is loaded and applied to the test frame only.

    Reads the module-level ``test`` frame, the column-group lists
    (``gen_cols``, ``pca_gen_cols``, ``cell_cols``, ``pca_cell_cols``,
    ``ohe_cols``), ``target_cols``, ``hidden_size``, ``BATCH_SIZE`` and
    ``DEVICE``.

    Args:
        fold: Fold number; selects the checkpoint file.
        seed: Seed number; seeds the RNGs and selects the checkpoint file.

    Returns:
        The array returned by ``inference_fn`` for the test loader.
    """
    seed_everything(seed)

    # One-hot encode cp_time / cp_dose so the dummy columns exist.
    test_ = process_data(test)

    gen_input_cols = gen_cols + pca_gen_cols + ohe_cols
    cell_input_cols = cell_cols + pca_cell_cols + ohe_cols

    model = Model(
        num_gen_features=len(gen_input_cols),
        num_cell_features=len(cell_input_cols),
        num_targets=len(target_cols),
        hidden_size=hidden_size)

    model.to(DEVICE)
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only host.
    model.load_state_dict(torch.load(f"../input/jp-multi-heads/FOLD{fold}_SEED{seed}.pth",
                                     map_location=DEVICE))

    #--------------------- PREDICTION---------------------

    gen_x_test, cell_x_test = test_[gen_input_cols].values, test_[cell_input_cols].values
    testdataset = TestDataset(gen_x_test, cell_x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)

    print('predict on test...', end='')
    predictions = inference_fn(model, testloader, DEVICE)
    print('  done.\n')

    return predictions

In [None]:
def run_k_fold(NFOLDS, seed):
    """Average the test predictions of all folds of one seed.

    Args:
        NFOLDS: Number of folds to iterate over (``0 .. NFOLDS-1``).
        seed: Seed forwarded to ``get_oof_predictions``.

    Returns:
        Array of shape ``(len(test), len(target_cols))`` holding the mean of
        the per-fold test predictions.
    """
    predictions = np.zeros((len(test), len(target_cols)))

    for fold in range(NFOLDS):
        predictions += get_oof_predictions(fold, seed) / NFOLDS

    return predictions


# Averaging on multiple SEEDS

# oof = np.zeros((len(train), len(target_cols)))
# Accumulator for the seed-averaged test predictions.
predictions = np.zeros((len(test), len(target_cols)))

# Each seed contributes 1/len(SEED) of its fold-averaged prediction.
for seed in SEED:
    
    predictions_ = run_k_fold(NFOLDS, seed)
#     oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)


In [None]:
predictions.shape

In [None]:
%who DataFrame

In [None]:
# Release the multi-heads frames; `predictions` and `target_cols` are kept for
# the submission cells below.
del folds, sample_submission, targets, test, train, train_comb, train_drug, train_features, train_targets_nonscored, train_targets_scored
gc.collect()

In [None]:
# oof = pd.DataFrame(oof, columns=target_cols)

# train_features = pd.read_csv('../input/lish-moa/train_features.csv')
# sig_id = train_features[train_features['cp_type']!='ctl_vehicle'].sig_id.reset_index(drop=True)
# oof['sig_id'] = sig_id

# oof = pd.merge(train_features[['sig_id']], oof, on='sig_id', how='left')
# oof.fillna(0, inplace=True)
# oof.drop('sig_id', axis=1, inplace=True)

In [None]:
# compute multi-heads CV

# y_true = train_targets_scored[target_cols].values
# y_pred = np.clip(oof.values, 0.0005, 0.9995)

# score = 0
# for i in range(len(target_cols)):
#     score_ = log_loss(y_true[:, i], y_pred[:, i])
#     score += score_ / targets.shape[1]
    
# print("multi-heads CV log_loss: ", score)

In [None]:
# Clip the averaged predictions to [0.0005, 0.9995] and label the columns.
predictions_clip = np.clip(predictions, 0.0005, 0.9995)
predictions_clip = pd.DataFrame(predictions_clip, columns=target_cols)

In [None]:
# Saved out-of-fold predictions of the multi-heads model.
oof_multi_heads_all = np.load( '../input/jp-multi-heads/multi_heads_oof.npy')

In [None]:
# submit multi-heads
test = pd.read_csv('../input/lish-moa/test_features.csv')
# The predictions were made on the test rows left after removing
# cp_type == 'ctl_vehicle'; rebuild that sig_id order to label them.
sig_id = test[test['cp_type']!='ctl_vehicle'].sig_id.reset_index(drop=True)
predictions_clip['sig_id'] = sig_id

# Left-merge back onto every test sig_id; rows without a prediction (the
# control rows) are filled with 0.
sub_mh = pd.merge(test[['sig_id']], predictions_clip, on='sig_id', how='left')
sub_mh.fillna(0, inplace=True)
sub_mh.to_csv('submission_mh.csv', index=False)

In [None]:
import datetime
import pandas as pd
from time import time
# from autograd import grad
# import autograd.numpy as np
import numpy as np
from numba import njit
from scipy.optimize import minimize, fsolve

# CPMP's logloss from https://www.kaggle.com/c/lish-moa/discussion/183010
def log_loss_numpy(y_pred):
    """Mean log loss of ``y_pred`` against the module-level ``y_true``.

    Both arrays are flattened; predictions are clipped to
    ``[1e-15, 1 - 1e-15]`` before taking logarithms.
    """
    labels = np.asarray(y_true).ravel()
    probs = np.clip(np.asarray(y_pred).ravel(), 1e-15, 1 - 1e-15)
    per_element = np.where(labels == 1, - np.log(probs), - np.log(1 - probs))
    return per_element.mean()

def func_numpy_metric(weights):
    """Log loss of the weighted blend of the module-level ``oof`` stack.

    ``weights`` is contracted with the first axis of ``oof`` (one entry per
    model).
    """
    blended = np.tensordot(weights, oof, axes=(0, 0))
    return log_loss_numpy(blended)

def grad_func(weights):
    """Gradient of the blend log loss with respect to each model weight.

    Uses the module-level ``oof`` (stack of per-model predictions, clipped to
    ``[1e-15, 1 - 1e-15]``) and ``y_true``. For model ``i`` the blend is split
    into its own contribution ``weights[i] * oof[i]`` and the weighted sum of
    all other models; the closed-form derivative is averaged over all
    elements.

    Returns:
        1-D array with one gradient entry per model.
    """
    n_models = oof.shape[0]
    clipped = np.clip(oof, 1e-15, 1 - 1e-15)
    gradients = np.zeros(n_models)

    for i in range(n_models):
        own = clipped[i]
        w = weights[i]

        # weighted sum of every other model's predictions
        others = np.zeros((oof.shape[1], oof.shape[2]))
        for j in range(n_models):
            if j == i:
                continue
            others += weights[j] * clipped[j]

        numerator = -y_true*own+(own**2)*w+own*others
        denominator = (own**2)*(w**2)+2*own*others*w-own*w+(others**2)-others
        gradients[i] = -np.mean(numerator/denominator)

    return gradients

@njit
def grad_func_jit(weights):
    """Numba-compiled counterpart of `grad_func`: gradient of the blended log loss.

    Reads the module-level `oof` and `y_true` arrays and returns an array with
    one partial derivative per model (length `oof.shape[0]`).

    NOTE(review): Numba treats global arrays as compile-time constants, so this
    function presumably keeps using the `oof` / `y_true` that existed when it
    was first compiled -- re-run this definition if those arrays change; confirm.
    """
    # Clamp predictions to [1e-15, 1 - 1e-15]; same range as the np.clip call in grad_func.
    oof_clip = np.minimum(1 - 1e-15, np.maximum(oof, 1e-15))
    gradients = np.zeros(oof.shape[0])
    for i in range(oof.shape[0]):
        # a: labels, b: predictions of model i, c: weighted sum of every other model.
        a, b, c = y_true, oof_clip[i], np.zeros((oof.shape[1], oof.shape[2]))
        for j in range(oof.shape[0]):
            if j != i:
                c += weights[j] * oof_clip[j]
        # With p = b*weights[i] + c the numerator is b*(p - a) and the denominator is p**2 - p.
        gradients[i] = -np.mean((-a*b+(b**2)*weights[i]+b*c)/((b**2)*(weights[i]**2)+2*b*c*weights[i]-b*weights[i]+(c**2)-c))
    return gradients

In [None]:
# Label matrix: every column of train_targets_scored.csv except the sig_id index.
y_true = pd.read_csv('../input/lish-moa/train_targets_scored.csv', index_col = 'sig_id').values

# Saved out-of-fold predictions, one .npy file per model in the blend.
oof_dict = {
    'Model 1': '../input/nn-transfer-learning-oof/oof_nn_transfer_all.npy',
    'Model 2': '../input/tabnetfe7seeds424810folds/oof_TabNet_all.npy',
    'Resnet': '../input/prediction/resnet_oof.npy',
    'Multi_heads': '../input/jp-multi-heads/multi_heads_oof.npy',
}

# Stack them along a leading model axis, in dict order; each file must match
# the shape of y_true.
oof = np.zeros((len(oof_dict),) + y_true.shape)
for i, oof_path in enumerate(oof_dict.values()):
    oof[i] = np.load(oof_path)

In [None]:
%%time

# Score each model's out-of-fold predictions on its own.
log_loss_scores = {}
for key, model_oof in zip(oof_dict, oof):
    score_oof = log_loss_numpy(model_oof)
    log_loss_scores[key] = score_oof
    print(f'{key} CV:\t', score_oof)
print('-' * 50)

In [None]:
# Equal weight for every model; used to exercise the gradient functions.
test_weights = np.full(oof.shape[0], 1 / oof.shape[0])

In [None]:
# %timeit -r 10 grad_func(test_weights)

In [None]:
# %timeit -r 10 grad_func_jit(test_weights)

In [None]:
# Find the blend weights that minimise the OOF log loss, with every weight
# bounded to [0, 1] and all weights constrained to sum to 1.
tol = 1e-10
init_guess = [1 / oof.shape[0]] * oof.shape[0]  # start from the equal-weight blend
bnds = [(0, 1) for _ in range(oof.shape[0])]
cons = {'type': 'eq',
        'fun': lambda x: np.sum(x) - 1,  # zero exactly when the weights sum to 1
        'jac': lambda x: [1] * len(x)}   # gradient of the constraint above

print('Inital Blend OOF:', func_numpy_metric(init_guess))
start_time = time()
res_scipy = minimize(fun = func_numpy_metric,
                     x0 = init_guess,
                     method = 'SLSQP',
                     jac = grad_func_jit, # pure-NumPy alternative: grad_func
                     bounds = bnds,
                     constraints = cons,
                     tol = tol)
print(f'[{str(datetime.timedelta(seconds = time() - start_time))[2:7]}] Optimised Blend OOF:', res_scipy.fun)
print('Optimised Weights:', res_scipy.x)
# The weights feed straight into the final submission, so make a
# non-converged run visible instead of silently using its result.
if not res_scipy.success:
    print('WARNING: optimiser did not converge:', res_scipy.message)

In [None]:
# The equality constraint is only satisfied numerically, so compare the
# absolute deviation from 1. A signed difference would also accept sums that
# fall far below 1 (e.g. 0.3 - 1 <= tol is True).
weights_sum = np.sum(res_scipy.x)
print('Check the sum of all weights:', weights_sum)
if abs(weights_sum - 1) <= tol:
    print('Great! The sum of all weights equals to 1!')
else:
    print('Manual adjustion is needed to modify the weights.')

In [None]:
sub = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')

# Optimised weights, in the order the models appear in oof_dict:
# nn-transfer, TabNet, ResNet, multi-heads.
w_nn, w_tabnet, w_resnet, w_mh = res_scipy.x[:4]

# Weighted sum of the per-model test predictions (every column after sig_id).
# (A fixed 0.4 / 0.2 / 0.4 blend of the first three models was an earlier alternative.)
blend = (
    w_nn * sub_nn_transfer.iloc[:, 1:].values
    + w_tabnet * submission_TabNet.iloc[:, 1:].values
    + w_resnet * sub_rs.iloc[:, 1:].values
    + w_mh * sub_mh.iloc[:, 1:].values
)
sub.iloc[:, 1:] = np.clip(blend, 0.00005, 0.99995)

# Set ctl_vehicle to 0
sub.iloc[test_features['cp_type'] == 'ctl_vehicle', 1:] = 0

# Save Submission
sub.to_csv('submission.csv', index=False)

In [None]:
# Delete submission_nn_transfer.csv from the working directory
# (presumably a per-model file no longer needed once submission.csv is written).
!rm submission_nn_transfer.csv

In [None]:
# Delete submission_rs.csv from the working directory
# (presumably a per-model file no longer needed once submission.csv is written).
!rm submission_rs.csv

In [None]:
# Delete submission_TabNet.csv from the working directory
# (presumably a per-model file no longer needed once submission.csv is written).
!rm submission_TabNet.csv

In [None]:
# Delete submission_mh.csv (the multi-heads submission saved earlier in this
# notebook) from the working directory.
!rm submission_mh.csv

In [None]:
# Show the final blended submission frame as the cell output.
sub