# Source 

- Pytorch 1.6 : https://pytorch.org/docs/stable/
- iterative-stratification : https://github.com/trent-b/iterative-stratification for stratified K fold multilabel

# Approach :
Inference script : 
https://www.kaggle.com/ludovick/inference-moa-baseline-mlp-kfold-10/edit/run/41997446

Neural network for a multi-label classification task, written in PyTorch:
- Stratified K-fold (10 folds) or stratified shuffle split
- BCE loss
- the optional (non-scored) labels are used as an auxiliary target during training (after filtering), but not for inference
- gradient accumulation (not tested yet)
- version 14: add weight_norm from https://www.kaggle.com/nicohrubec/pytorch-multilabel-neural-network

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
! pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
!pip install iterative-stratification


In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import StratifiedKFold
import numpy as np
import os
import random
import sys
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
from tqdm import tqdm
from sklearn.metrics import log_loss

In [None]:
def seed_everything(seed_value):
    """Seed the Python, NumPy and PyTorch random generators with `seed_value`.

    Also exports the seed as PYTHONHASHSEED and, when CUDA is available,
    seeds the CUDA generators and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # Deterministic kernels, no auto-tuner
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything(42)

# Data


In [None]:
# Raw competition tables, read from the Kaggle input directory.
train = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')
test_features = pd.read_csv('../input/lish-moa/test_features.csv')
submission = pd.read_csv('../input/lish-moa/sample_submission.csv')

remove_vehicle = True

if not remove_vehicle:
    train_features = train
else:
    # Keep only the rows whose cp_type is 'trt_cp'. The mask is built once from
    # the unfiltered feature table and applied to features and both target tables
    # (assumes the three tables share the same row order/index -- TODO confirm).
    is_treated = train['cp_type']=='trt_cp'
    train_features = train.loc[is_treated].reset_index(drop=True)
    train_targets_scored = train_targets_scored.loc[is_treated].reset_index(drop=True)
    train_targets_nonscored = train_targets_nonscored.loc[is_treated].reset_index(drop=True)

In [None]:
# Display the training feature table (after the optional vehicle filtering).
train_features

Check the different values that each categorical feature can take, and whether there are any NaN/inf values.

In [None]:
# Print the value counts of the first three feature columns (the categorical
# ones), train first and then test, to compare their distributions.
col_features = list(train_features.columns)[1:]
for cat_col in col_features[:3]:
    print(train_features[cat_col].value_counts())
    print(test_features[cat_col].value_counts())


In [None]:
# Count NaN values (all feature columns) and infinite values (columns from
# index 3 onwards, i.e. the numerical ones) in the test and train tables.
print(test_features[col_features].isna().sum().values.sum())
print(np.isinf(test_features[col_features[3:]].values).sum()) # only for numerical value
print(train_features[col_features].isna().values.sum())
print(np.isinf(train_features[col_features[3:]].values).sum()) # only for numerical value

# Targets data

In [None]:
# Display the scored target table.
train_targets_scored

As we can see, the labels are highly imbalanced; it may be necessary to use a weighted loss function to help the model.

In [None]:
# ratio for each label

def get_ratio_labels(df):
    """Return the usable label columns of `df` and their negative/positive ratios.

    The first column of `df` is skipped. A column with a single distinct value
    is discarded; for every other column the ratio count(0) / count(1) is
    recorded. The number of discarded columns is printed.

    Returns:
        (kept_columns, ratios) where `ratios` is an int32 array (fractional
        parts are truncated by the cast), aligned with `kept_columns`.
    """
    kept, dropped, ratios = [], [], []
    for name in list(df.columns)[1:]:
        counts = df[name].value_counts()
        if len(counts) == 1:
            dropped.append(name)
            continue
        kept.append(name)
        ratios.append(counts[0] / counts[1])

    print(f"remove {len(dropped)} columns")
    return kept, np.array(ratios).astype(np.int32)

# Scored label columns that are kept, with their truncated 0/1 count ratios.
columns, ratios = get_ratio_labels(train_targets_scored)
print(ratios)


In [None]:
# Display the non-scored (optional) target table.
train_targets_nonscored

In [None]:
# Same filtering for the non-scored targets; single-valued columns are dropped.
columns_nonscored, ratios_nonscored = get_ratio_labels(train_targets_nonscored)
print(ratios_nonscored)
print(len(columns_nonscored), len(ratios_nonscored))

### It seems that some optional labels take only a single value, so we discard them

# Dataset

In [None]:
# Global max/min of the numerical columns (index 3 onwards) in train and test.
print(train_features[col_features[3:]].max().values.max())
print(train_features[col_features[3:]].min().values.min())
print(test_features[col_features[3:]].min().values.min())
print(test_features[col_features[3:]].max().values.max())

We could normalize the numerical values later, but that is left for another version.

In [None]:
# Number of numerical feature columns.
len(col_features[3:])

# Dataloader

In [None]:
# Integer codes for the categorical columns. Codes start at 1; the embedding
# layers of the models in this notebook are created with padding_idx=0.
mapping = {"cp_type":{"trt_cp": 1, "ctl_vehicle":2},
               "cp_time":{48:1, 72:2, 24:3},
               "cp_dose":{"D1":1, "D2":2}}

def transform_data(train, test, col, mapping, normalize=True, removed_vehicle=False):
    """
        Split the train/test frames into encoded categorical and numerical arrays.

        The first 3 entries of `col` are the categorical columns, the remaining
        ones the numerical features. Categorical values are translated through
        `mapping[column][value]`. When `removed_vehicle` is set, the first
        categorical column is left out and only col[1:3] are encoded.
        With `normalize`, numerical values are rescaled with the fixed
        bounds [-10, 10]: (x + 10) / 20.

        Returns categories_tr, categories_test, numerical_tr, numerical_test.
    """
    cat_cols = col[1:3] if removed_vehicle else col[:3]
    num_cols = col[3:]

    def encode(frame):
        # One integer-coded column per categorical feature, stacked side by side
        encoded = [frame[name].apply(lambda raw, name=name: mapping[name][raw]).values
                   for name in cat_cols]
        return np.stack(encoded, axis=1)

    categories_tr = encode(train)
    categories_test = encode(test)

    numerical_tr = train[num_cols].values
    numerical_test = test[num_cols].values

    if normalize:
        lower, upper = -10., 10.
        numerical_tr = (numerical_tr - lower) / (upper - lower)
        numerical_test = (numerical_test - lower) / (upper - lower)

    return categories_tr, categories_test, numerical_tr, numerical_test
# Feature columns: everything except the first column (presumably the sample id -- confirm).
col_features = list(train_features.columns)[1:]
# Encoded categorical / raw numerical arrays (normalization is switched off here).
cat_tr, cat_test, numerical_tr, numerical_test = transform_data(train_features, test_features, col_features, mapping, normalize=False, removed_vehicle=remove_vehicle)
# Target matrices restricted to the columns kept by get_ratio_labels, as float32.
targets_tr = train_targets_scored[columns].values.astype(np.float32)
targets2_tr = train_targets_nonscored[columns_nonscored].values.astype(np.float32)

In [None]:
class MOADataset(Dataset):
    """Dataset yielding (categorical, numerical[, label[, aux label]]) tensors.

    `x_cats` and `x_nums` are indexable containers of equal length; `y` and
    `y2` are optional label containers. `y2` is only returned when `y` is
    also provided.
    """

    def __init__(self, x_cats, x_nums, y=None, y2=None):
        self.cats, self.nums = x_cats, x_nums
        self.y, self.y2 = y, y2

    def __len__(self):
        return len(self.cats)

    def __getitem__(self, index):
        # Categorical codes as int64 (embedding indices), everything else float32
        sample = [
            torch.as_tensor(self.cats[index], dtype=torch.long),
            torch.as_tensor(self.nums[index], dtype=torch.float),
        ]
        if self.y is None:
            return tuple(sample)

        sample.append(torch.as_tensor(self.y[index], dtype=torch.float))
        if self.y2 is not None:
            sample.append(torch.as_tensor(self.y2[index], dtype=torch.float))
        return tuple(sample)

# Model

In [None]:
class MOA_MLP(nn.Module):
    """MLP over two embedded categorical features and the numerical features.

    Args:
        num_cats: number of embedding rows per categorical feature; only the
            first two entries are used.
        cats_emb_size: embedding width per categorical feature; only the first
            two entries are used.
        num_numericals: number of numerical input features.
        hidden_size_numericals: width of the first hidden layer; the second
            hidden layer has half that width.
        num_class: number of main output logits.
        aux: number of auxiliary output logits, or None for no auxiliary head.

    forward(x_cat, x_num) returns the main logits, or a tuple
    (main logits, auxiliary logits) when `aux` was given.
    """

    def __init__(self, num_cats=(2, 3, 2), cats_emb_size=(2, 2, 2), num_numericals=872, hidden_size_numericals=2048,
                 num_class=206, aux=None):
        super().__init__()
        self.cat_emb1 = nn.Embedding(num_cats[0], cats_emb_size[0], padding_idx=0)
        self.cat_emb2 = nn.Embedding(num_cats[1], cats_emb_size[1], padding_idx=0)

        # forward() concatenates exactly two embeddings, so the input width must
        # count only those two. Using sum(cats_emb_size) broke the model whenever
        # a third size was supplied (as in the default arguments).
        in_features = cats_emb_size[0] + cats_emb_size[1] + num_numericals

        self.norms = nn.BatchNorm1d(in_features)
        self.dropout = nn.Dropout(0.2)

        self.proj = nn.utils.weight_norm(nn.Linear(in_features, hidden_size_numericals))
        self.norm_proj = nn.BatchNorm1d(hidden_size_numericals)
        self.dropout2 = nn.Dropout(0.5)

        hd_1 = hidden_size_numericals // 2
        self.extractor = nn.Sequential(nn.utils.weight_norm(nn.Linear(hidden_size_numericals, hd_1)),
                                       nn.PReLU(),
                                       nn.BatchNorm1d(hd_1),
                                       nn.Dropout(0.5))
        self.cls = nn.utils.weight_norm(nn.Linear(hd_1, num_class))
        self.cls_aux = None
        if aux is not None:
            self.cls_aux = nn.utils.weight_norm(nn.Linear(hd_1, aux))

    def forward(self, x_cat, x_num):
        # x_cat[:, 0] and x_cat[:, 1] are the integer codes of the two categorical features
        cat_features = torch.cat([self.cat_emb1(x_cat[:, 0]), self.cat_emb2(x_cat[:, 1])], dim=1)
        all_features = torch.cat([cat_features, x_num], dim=1)
        all_features = self.norms(all_features)
        all_features = self.dropout(all_features)

        proj_features = self.proj(all_features)
        proj_features = self.norm_proj(F.relu(proj_features))
        proj_features = self.dropout2(proj_features)

        features_reduced = self.extractor(proj_features)

        outputs = self.cls(features_reduced)
        if self.cls_aux is not None:
            outputs2 = self.cls_aux(features_reduced)
            return outputs, outputs2
        return outputs
    
class MOA_MLPv2(nn.Module):
    """MLP variant that projects the numerical features before fusing them
    with three embedded categorical features.

    forward(x_cat, x_num) reads x_cat[:, 0], x_cat[:, 1] and x_cat[:, 2] as
    embedding indices. It returns the main logits, or a tuple
    (main logits, auxiliary logits) when `aux` was given.
    """

    def __init__(self, num_cats=[2,3,2] , cats_emb_size=[2,2,2], num_numericals=872, hidden_size_numericals=2048,
                num_class=206, aux=None):
        super().__init__()
        fused_width = 2048
        half = hidden_size_numericals // 2
        quarter = half // 2

        # Layers are created in the same order as before so that a seeded
        # initialization and the state_dict keys stay unchanged.
        self.cat_emb1 = nn.Embedding(num_cats[0], cats_emb_size[0], padding_idx=0)
        self.cat_emb2 = nn.Embedding(num_cats[1], cats_emb_size[1], padding_idx=0)
        self.cat_emb3 = nn.Embedding(num_cats[2], cats_emb_size[2], padding_idx=0)

        self.projection_numericals = nn.Linear(num_numericals, hidden_size_numericals)
        self.norm_numericals = nn.BatchNorm1d(hidden_size_numericals)
        self.dropout = nn.Dropout(0.5)

        self.proj = nn.Linear(sum(cats_emb_size) + hidden_size_numericals, fused_width)
        self.norm_proj = nn.BatchNorm1d(fused_width)

        # Two identical stages: Linear -> ReLU -> BatchNorm -> Dropout
        stages = []
        for n_in, n_out in ((fused_width, half), (half, quarter)):
            stages += [nn.Linear(n_in, n_out), nn.ReLU(), nn.BatchNorm1d(n_out), nn.Dropout(0.25)]
        self.extractor = nn.Sequential(*stages)

        self.cls = nn.Linear(quarter, num_class)
        self.cls_aux = nn.Linear(quarter, aux) if aux is not None else None

    def forward(self, x_cat, x_num):
        embeddings = (self.cat_emb1, self.cat_emb2, self.cat_emb3)
        cat_features = torch.cat([emb(x_cat[:, k]) for k, emb in enumerate(embeddings)], dim=1)

        num_features = self.norm_numericals(F.relu(self.projection_numericals(x_num)))

        fused = self.dropout(torch.cat([cat_features, num_features], dim=1))
        fused = self.norm_proj(F.relu(self.proj(fused)))

        features_reduced = self.extractor(fused)
        outputs = self.cls(features_reduced)
        if self.cls_aux is None:
            return outputs
        return outputs, self.cls_aux(features_reduced)

# Utils

In [None]:
from torch.cuda.amp import GradScaler, autocast
def train_one_epoch(model, dataloader, cfg, optimizer, loss_fn, loss_fn_aux=None, accumulation=1, with_aux_class=False, verbose=True):
    """Run one training pass of `model` over `dataloader`.

    Args:
        model: module called as model(x1, x2); returns logits, or a
            (logits, aux_logits) pair when `with_aux_class` is True.
        dataloader: sized iterable of batches (x1, x2, labels[, labels2]).
        cfg: object exposing `device` and `use_apex` (the latter switches
            torch's native autocast / gradient scaling on).
        optimizer: optimizer stepped every `accumulation` batches.
        loss_fn: element-wise loss for the main head (reduced with mean here).
        loss_fn_aux: element-wise loss for the auxiliary head; required when
            `with_aux_class` is True.
        accumulation: number of batches whose gradients are accumulated before
            each optimizer step.
        with_aux_class: train the auxiliary head too (loss = main + 0.5 * aux).
        verbose: show the tqdm progress bar.

    Raises:
        ValueError: if `with_aux_class` is True and `loss_fn_aux` is None.
    """
    if with_aux_class and loss_fn_aux is None:
        raise ValueError("loss_fn_aux is required when with_aux_class=True")

    model.train()
    # A disabled scaler is a pass-through: scale() returns the loss unchanged,
    # step() just calls optimizer.step() and update() does nothing.
    scaler = GradScaler(enabled=cfg.use_apex)
    optimizer.zero_grad()
    num_batches = len(dataloader)
    N = 0.
    total_loss = 0.
    # `~verbose` was always truthy for a bool, which kept the bar permanently off.
    t = tqdm(dataloader, disable=not verbose)
    for i, batch in enumerate(t):
        x1 = batch[0].to(cfg.device)
        x2 = batch[1].to(cfg.device)
        labels = batch[2].to(cfg.device)
        if with_aux_class:
            labels2 = batch[3].to(cfg.device)

        with autocast(cfg.use_apex):
            if with_aux_class:
                outputs, outputs2 = model(x1, x2)
                loss1 = loss_fn(outputs, labels).mean(0).mean()
                loss2 = loss_fn_aux(outputs2, labels2).mean(0).mean()
                loss = loss1 + 0.5*loss2
            else:
                outputs = model(x1, x2)
                loss = loss_fn(outputs, labels).mean(0).mean()

        # Running average is tracked on the un-divided loss
        N += len(x1)
        total_loss += (loss.item() * len(x1))

        scaler.scale(loss / accumulation).backward()

        # Step on every full accumulation group and also on the last batch, so
        # the gradients of a trailing partial group are not thrown away.
        if (i + 1) % accumulation == 0 or (i + 1) == num_batches:
            scaler.step(optimizer)
            # Updates the scale for next iteration.
            scaler.update()
            optimizer.zero_grad()

            t.set_description("Loss : {0}".format(total_loss/N))
            t.refresh()
            
            
def evals(model, dataloader, cfg, loss_fn, loss_fn_aux=None, with_aux_class=False, verbose=True):
    """Evaluate `model` on `dataloader` without gradient tracking.

    Args:
        model: module called as model(x1, x2); returns logits, or a
            (logits, aux_logits) pair when `with_aux_class` is True.
        dataloader: iterable of batches (x1, x2, labels, ...).
        cfg: object exposing `device` and `use_apex`.
        loss_fn: element-wise loss for the main head, used for the progress
            bar's running loss.
        loss_fn_aux: accepted for signature compatibility; the auxiliary loss
            does not take part in evaluation.
        with_aux_class: whether the model returns a (main, aux) pair.
        verbose: show the tqdm progress bar.

    Returns:
        (y_preds, y_targets, score): sigmoid probabilities of the main head,
        the matching labels, and log_loss_multi(y_targets, y_preds).
    """
    model.eval()
    N = 0.
    total_loss = 0.

    y_preds = []
    y_targets = []
    # `~verbose` was always truthy for a bool, which kept the bar permanently off.
    t = tqdm(dataloader, disable=not verbose)
    with torch.no_grad():
        for batch in t:
            x1 = batch[0].to(cfg.device)
            x2 = batch[1].to(cfg.device)
            labels = batch[2].to(cfg.device)

            with autocast(cfg.use_apex):
                if with_aux_class:
                    # Only the main head is scored; the auxiliary output is ignored
                    outputs, _ = model(x1, x2)
                else:
                    outputs = model(x1, x2)
                loss = loss_fn(outputs, labels).mean(0).mean()

            N += len(x1)
            total_loss += (loss.item() * len(x1))

            t.set_description("Loss : {0}".format(total_loss/N))
            t.refresh()

            y_preds.append(torch.sigmoid(outputs).detach().cpu().numpy())
            y_targets.append(labels.detach().cpu().numpy())
    y_preds = np.concatenate(y_preds, axis=0)
    y_targets = np.concatenate(y_targets, axis=0)
    score = log_loss_multi(y_targets, y_preds)
    return y_preds, y_targets, score



def inference_fn(model, dataloader, cfg, with_aux_class=True, verbose=True):
    """Predict main-head probabilities for every batch of `dataloader`.

    Args:
        model: module called as model(x1, x2); returns logits, or a
            (logits, aux_logits) pair when `with_aux_class` is True.
        dataloader: iterable of batches whose first two items are (x1, x2).
        cfg: object exposing `device` and `use_apex`.
        with_aux_class: whether the model returns a (main, aux) pair.
        verbose: show the tqdm progress bar.

    Returns:
        numpy array with the sigmoid of the main logits, batches concatenated
        along axis 0.
    """
    model.eval()

    y_preds = []
    with torch.no_grad():
        # `~verbose` was always truthy for a bool, which kept the bar permanently off.
        for batch in tqdm(dataloader, disable=not verbose):
            x1 = batch[0].to(cfg.device)
            x2 = batch[1].to(cfg.device)

            with autocast(cfg.use_apex):
                if with_aux_class:
                    # The auxiliary output is not used at inference time
                    outputs, _ = model(x1, x2)
                else:
                    outputs = model(x1, x2)

            y_preds.append(torch.sigmoid(outputs).detach().cpu().numpy())
    y_preds = np.concatenate(y_preds, axis=0)
    return y_preds

In [None]:
def log_loss_score(actual, predicted,  eps=1e-15):
    """
    Mean binary log loss between `actual` labels and `predicted` probabilities.

    :param actual:      The binary labels. Either 0 or 1.
    :param predicted:   The predicted probabilities as floats between 0-1
    :param eps:         Offset added inside both logarithms so that log(0) is never evaluated
    :return:            -mean(actual * log(predicted + eps) + (1 - actual) * log(1 - predicted + eps))
    """
    positive_term = actual * np.log(predicted + eps)
    negative_term = (1 - actual) * np.log(1 - predicted + eps)
    return -(negative_term + positive_term).mean()

In [None]:
# log(0) is -inf and -inf * 0 is nan: this is why log_loss_score offsets its inputs by eps.
np.log(0)*0

In [None]:
def log_loss_multi(y_true, y_pred):
    """Average of log_loss_score computed independently on each column.

    `y_true` and `y_pred` are 2-D arrays with one column per label.
    """
    n_labels = y_true.shape[1]
    # Per-label losses are stored as float64 before averaging
    per_label = np.fromiter(
        (log_loss_score(y_true[:, j], y_pred[:, j]) for j in range(n_labels)),
        dtype=np.float64,
        count=n_labels,
    )
    return per_label.mean()
        

In [None]:
def train_fold(fold, model, tr_dataloader, val_dataloader, cfg, optimizer, reducer, loss_fn, loss_fn_aux=None, accumulation=1, with_aux_class=False):
    """Train `model` for cfg.EPOCHS epochs on one fold and keep the best epoch.

    After each epoch the model is evaluated on `val_dataloader`, the
    validation score is passed to `reducer.step`, and whenever the score
    improves the weights are saved to cfg.save_name + f"_{fold}.pth".

    Returns:
        (best_preds, best_targets, best_score): validation predictions,
        targets and score of the best epoch. The previous version returned
        the last epoch's score, which did not match the returned predictions.

    NOTE: `model` itself is left with the weights of the last epoch; the best
    weights are only available from the saved checkpoint.
    """
    best_score = np.inf
    best_preds = None
    best_targets = None
    for e in range(cfg.EPOCHS):
        train_one_epoch(model, tr_dataloader, cfg, optimizer, loss_fn, loss_fn_aux=loss_fn_aux, accumulation=accumulation, with_aux_class=with_aux_class, verbose=cfg.verbose)
        preds, targets, score = evals(model, val_dataloader, cfg, loss_fn, loss_fn_aux=loss_fn_aux, with_aux_class=with_aux_class, verbose=cfg.verbose)
        reducer.step(score)
        if score < best_score:
            print("## Epochs {0} : Improvement from {1} to {2}".format(e, best_score, score))
            best_score = score
            best_preds = preds
            best_targets = targets
            torch.save(model.state_dict(), cfg.save_name + f"_{fold}.pth")
    print("## FOLD {0} : best results : {1}".format(fold, best_score))
    return best_preds, best_targets, best_score

In [None]:
def inference_fold(folds, model, test_loader,cfg):
    """Run inference with the checkpoint of each fold and collect the predictions.

    For fold = 0 .. folds-1 the weights stored at cfg.save_name + f"_{fold}.pth"
    are loaded into `model` (mapped onto cfg.device, so checkpoints written on
    another device still load) and `inference_fn` is run on `test_loader`.

    Returns:
        list with one prediction array per fold.

    NOTE(review): the K-fold script calls train_fold with fold=f"_{j}_{seed}",
    which produces different file names than the integer pattern used here --
    confirm which naming is intended before using this helper with that run.
    """
    preds = []
    for fold in range(folds):
        name = cfg.save_name + f"_{fold}.pth"
        state = torch.load(name, map_location=cfg.device)
        model.load_state_dict(state)
        preds.append(inference_fn(model, test_loader, cfg))

    return preds

In [None]:
def check_targets(targets):
    """Return True when every column of `targets` has exactly two distinct values."""
    n_cols = targets.shape[1]
    return all(len(np.unique(targets[:, j])) == 2 for j in range(n_cols))

In [None]:
def auc_multi(y_true, y_pred):
    """Mean of the per-column ROC AUC between `y_true` and `y_pred`.

    Columns for which roc_auc_score raises ValueError (presumably because the
    AUC is undefined, e.g. a single class in `y_true` -- confirm against the
    installed scikit-learn version) keep an AUC of 0 and still count in the
    mean, as before. Any other exception now propagates instead of being
    silently swallowed.
    """
    M = y_true.shape[1]
    results = np.zeros(M)
    for i in range(M):
        try:
            results[i] = roc_auc_score(y_true[:, i], y_pred[:, i])
        except ValueError:
            # Score not computable for this column: leave it at 0
            pass
    return results.mean()

# script

In [None]:
def check_labels(A):
    """Return a (1, n_columns) float mask: 1 where a column of `A` has exactly
    two distinct values, 0 otherwise."""
    is_binary = [len(np.unique(A[:, j])) == 2 for j in range(A.shape[1])]
    return np.asarray(is_binary, dtype=np.float64).reshape(1, -1)

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold, MultilabelStratifiedShuffleSplit
from sklearn.metrics import roc_auc_score

In [None]:
class Config(object):
    """Run settings and model hyper-parameters.

    Several values are read from notebook globals (targets_tr, targets2_tr,
    cat_tr, numerical_tr, remove_vehicle), so those must exist before the
    class is instantiated.
    """
    def __init__(self):
        # Output sizes of the main head and of the auxiliary head
        self.num_class = targets_tr.shape[1]
        self.aux_class = targets2_tr.shape[1]
        # Passed to torch's autocast / GradScaler by the training helpers
        self.use_apex=False
        self.verbose=False
        #
        self.batch_size = 128
        self.device = "cpu"
        self.SPLITS = 10
        self.EPOCHS = 100
        # Parameters model
        # Embedding rows per categorical feature: number of codes + 1 (index 0 is the padding index)
        self.num_cats=[3+1,2+1] if remove_vehicle else [2+1,3+1,2+1]
        self.cats_emb_size=[1]* cat_tr.shape[1] #to choose
        # The earlier duplicate assignments of the two values below (including a
        # hidden size of 1024) were immediately overwritten and have been removed.
        self.num_numericals= numerical_tr.shape[1]
        self.hidden_size_numericals=2048 # to choose
        self.num_ensembling = 1
        # save
        self.seed = 42
        self.save_name = f"MOA_mlp-KFOLD{self.SPLITS}"

        # "KFOLD" selects the K-fold script; any other value selects the shuffle-split script
        self.strategy = "KFOLD"
cfg = Config()
# aux_class is set from an array shape in Config, so this flag is True in practice
cfg.with_aux_class = cfg.aux_class is not None
print(cfg.num_class, cfg.aux_class,cfg.with_aux_class)

In [None]:
# Element-wise BCE on logits (reduction="none"); the training/eval helpers
# reduce it themselves with .mean(0).mean(). One instance per output head.
loss_fn = nn.BCEWithLogitsLoss(reduction="none")#, pos_weight=torch.as_tensor(ratios))
loss_fn_aux = nn.BCEWithLogitsLoss(reduction="none")#, pos_weight=torch.as_tensor(ratios))

In [None]:
# Unlabelled test set, iterated in order (shuffle=False) so predictions stay aligned with the rows.
test_dataset = MOADataset(cat_test, numerical_test)
test_dataloader= DataLoader(test_dataset, batch_size=cfg.batch_size, shuffle=False, num_workers=2)

In [None]:
if cfg.strategy == "KFOLD":
    # Accumulators with one entry per seed of the ensemble.
    oof_preds_all = []
    oof_targets_all = []
    scores_all = []
    scores_auc_all = []
    preds_test = []
    masks = []
    for seed in range(cfg.num_ensembling):
        # A different split per seed: random_state is offset by the seed index.
        mskf = MultilabelStratifiedKFold(n_splits=cfg.SPLITS, random_state=cfg.seed+seed, shuffle=True)
        # Per-fold accumulators for this seed.
        oof_preds = []
        oof_targets = []
        scores = []
        scores_auc = []
        p = []
        temp_mask = []
        # Only the targets drive the stratification; the X argument is a placeholder of the right length.
        for j, (train_idx, val_idx) in enumerate(mskf.split(np.zeros(len(cat_tr)), targets_tr)):
            print("FOLDS : ", j)

            ## model
            # A fresh model, optimizer and LR scheduler are created for every fold.

            model = MOA_MLP(num_cats=cfg.num_cats , cats_emb_size=cfg.cats_emb_size, num_numericals=cfg.num_numericals, hidden_size_numericals=cfg.hidden_size_numericals,
                        num_class=cfg.num_class, aux=cfg.aux_class).to(cfg.device)
            
            optimizer = optim.Adam(model.parameters(), lr = 5e-3, weight_decay=1e-5, amsgrad=True)#optim.SGD(model.parameters(), lr=1e-2, weight_decay=5e-4, momentum=0.9, nesterov=True) 
            reducer = ReduceLROnPlateau(optimizer, mode='min', factor=0.1,threshold=1e-3, patience=3,  min_lr=5e-6, eps=1e-08, verbose=True)
            ## Create dataset then dataloader
            if cfg.with_aux_class:
                train_dataset = MOADataset(cat_tr[train_idx], numerical_tr[train_idx], y=targets_tr[train_idx], y2=targets2_tr[train_idx])
                val_dataset = MOADataset(cat_tr[val_idx], numerical_tr[val_idx], y=targets_tr[val_idx], y2=targets2_tr[val_idx])
            else:
                train_dataset = MOADataset(cat_tr[train_idx], numerical_tr[train_idx], y=targets_tr[train_idx])
                val_dataset = MOADataset(cat_tr[val_idx], numerical_tr[val_idx], y=targets_tr[val_idx] )
            train_dataloader =    DataLoader(train_dataset, batch_size=cfg.batch_size, shuffle=True, num_workers=2)
            val_dataloader =    DataLoader(val_dataset, batch_size=cfg.batch_size, shuffle=False, num_workers=2)

            ## train fold
            # The fold tag f"_{j}_{seed}" becomes part of the checkpoint file name written by train_fold.
            preds , targets, score = train_fold(f"_{j}_{seed}", model, train_dataloader, val_dataloader, cfg, optimizer, reducer, loss_fn, 
                                                loss_fn_aux=loss_fn_aux, accumulation=1, with_aux_class=cfg.with_aux_class)

            ## save oof to compute the CV later
            # Mask of labels that take two distinct values in this fold's training split.
            temp_mask.append(check_labels(targets_tr[train_idx]))
            oof_preds.append(preds)
            oof_targets.append(targets)
            scores.append(score)
            scores_auc.append(auc_multi(targets,preds))
            # NOTE(review): train_fold saves the best epoch to disk but does not reload it, so
            # `model` holds the last epoch's weights here -- confirm this is intended.
            # inference_fn is called with its default with_aux_class=True.
            p.append(inference_fn(model, test_dataloader, cfg,verbose=False))

        # Collapse this seed's folds: out-of-fold arrays are concatenated row-wise.
        oof_preds_all.append(np.concatenate(oof_preds))
        oof_targets_all.append(np.concatenate(oof_targets))
        scores_all.append(scores)
        scores_auc_all.append(scores_auc)
        preds_test.append(np.array(p))
        masks.append(np.stack(temp_mask))
        

    # Stack everything along a leading seed axis.
    preds_test = np.stack(preds_test)
    oof_preds_all = np.stack(oof_preds_all)
    oof_targets_all = np.stack(oof_targets_all)
    scores_all = np.stack(scores_all)
    scores_auc_all = np.stack(scores_auc_all)
    masks = np.stack(masks)

In [None]:
if cfg.strategy == "KFOLD":
    # One report per seed: log loss over all out-of-fold predictions and the mean of the per-fold AUCs.
    for i in range(oof_preds_all.shape[0]):
        print("CV score : ", log_loss_multi(oof_targets_all[i], oof_preds_all[i]))
        print("auc mean : ", sum(scores_auc_all[i])/len(scores_auc_all[i]))

In [None]:
if cfg.strategy != "KFOLD":
    # i counts the accepted splits; j is the index of the candidate split.
    i = 0
    # Up to 1000 candidate 90/10 splits; the loop stops once cfg.SPLITS of them were accepted.
    mskf = MultilabelStratifiedShuffleSplit(n_splits=1000, test_size=0.1, random_state=0)
    oof_preds = []
    oof_targets = []
    scores = []
    scores_auc = []
    for j, (train_idx, val_idx) in enumerate(mskf.split(np.zeros(len(cat_tr)), targets_tr)):
        if i == cfg.SPLITS:
            break
            
        # Skip candidate splits whose training part has a label without two distinct values.
        if not check_targets(targets_tr[train_idx]):
            continue
        print("FOLDS : ", i, j)

        ## model
        # NOTE(review): unlike the K-fold branch, the model is not moved to cfg.device here.
        # NOTE(review): MOA_MLPv2 reads num_cats[2] and x_cat[:, 2]; with remove_vehicle=True
        # cfg.num_cats has only two entries, so this looks like it would fail -- confirm.

        model = MOA_MLPv2(num_cats=cfg.num_cats , cats_emb_size=cfg.cats_emb_size, num_numericals=cfg.num_numericals, hidden_size_numericals=cfg.hidden_size_numericals,
                    num_class=cfg.num_class, aux=cfg.aux_class)
        optimizer = optim.Adam(model.parameters(), lr = 1e-3, amsgrad=True)#optim.SGD(model.parameters(), lr=1e-2, weight_decay=5e-4, momentum=0.9, nesterov=True) 
        reducer = ReduceLROnPlateau(optimizer, mode='min', factor=0.1,threshold=1e-3, patience=3,  min_lr=1e-5, eps=1e-08, verbose=True)
        ## Create dataset then dataloader
        if cfg.with_aux_class:
            train_dataset = MOADataset(cat_tr[train_idx], numerical_tr[train_idx], y=targets_tr[train_idx], y2=targets2_tr[train_idx])
            val_dataset = MOADataset(cat_tr[val_idx], numerical_tr[val_idx], y=targets_tr[val_idx], y2=targets2_tr[val_idx])
        else:
            train_dataset = MOADataset(cat_tr[train_idx], numerical_tr[train_idx], y=targets_tr[train_idx])
            val_dataset = MOADataset(cat_tr[val_idx], numerical_tr[val_idx], y=targets_tr[val_idx] )
        train_dataloader =    DataLoader(train_dataset, batch_size=cfg.batch_size, shuffle=True, num_workers=2)
        val_dataloader =    DataLoader(val_dataset, batch_size=cfg.batch_size, shuffle=False, num_workers=2)

        ## train fold
        preds , targets, score = train_fold(i, model, train_dataloader, val_dataloader, cfg, optimizer, reducer, loss_fn, 
                                            loss_fn_aux=loss_fn_aux, accumulation=1, with_aux_class=cfg.with_aux_class)

        ## save oof to compute the CV later
        oof_preds.append(preds)
        oof_targets.append(targets)
        scores.append(score)
        scores_auc.append(auc_multi(targets,preds))
        i+=1
        #break

In [None]:
if cfg.strategy != "KFOLD":
    # Pool the validation predictions of all accepted splits and report log loss and mean AUC.
    oof_preds = np.concatenate(oof_preds)
    oof_targets = np.concatenate(oof_targets)
    print("CV score : ", log_loss_multi(oof_targets, oof_preds))
    print("auc mean : ", sum(scores_auc)/len(scores_auc))
    print(oof_preds.shape, oof_targets.shape, targets_tr.shape)

In [None]:
# Per-fold scores of the most recent run (in the K-fold branch: those of the last seed).
print(scores)

In [None]:
# Architecture of the last model that was built.
print(model)

In [None]:
# Sum the test predictions over folds (axis 1) and seeds (axis 0), then divide each label
# by the number of folds whose training split had that label with two distinct values.
# `preds_test` and `masks` only exist after the K-fold branch has run.
# NOTE(review): a label whose mask is 0 in every fold would give a division by zero -- confirm it cannot happen.
preds_test2 = preds_test.sum(1).sum(0)/masks.sum(1).sum(0) 

In [None]:
# Write the averaged predictions into the kept scored label columns.
submission[columns] = preds_test2#preds_test.mean(1).mean(0)
# Force every prediction to 0 for test rows whose cp_type is 'ctl_vehicle'.
submission.loc[test_features['cp_type']=='ctl_vehicle', submission.columns[1:]] = 0

In [None]:
# Display the final submission table.
submission