# About this notebook
- PyTorch NNet + Entity Embeddings starter code
- Mish activation
- Weighted-average fusion of the two branches using a softmax layer instead of `torch.cat`
- BCEweighted loss + label smoothing

# Possible improvements
- Use ArcFace, or add a triplet loss alongside BCE, to improve the score

# Acknowledgements
- Y.NAKAMA's great [notebook](https://www.kaggle.com/yasufuminakama/herbarium-2020-pytorch-resnet18-train/notebook)
- XGBoost Starter - [0.793] [link](https://www.kaggle.com/code/cdeotte/xgboost-starter-0-793)
- Stack Overflow answer on the weighted-average layer (late fusion) [link](https://stackoverflow.com/questions/62877879/implementing-late-fusion-in-keras)

If this notebook is helpful, feel free to upvote :)

In [None]:
!pip install -q --upgrade wandb

# Library

In [None]:
import os
import glob
import gc
import random
import math
import time
from tqdm import tqdm

import numpy as np
import pandas as pd


from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, SequentialSampler, RandomSampler
from torch.optim import Adam, SGD
from fastai.layers import SigmoidRange
from torch.optim.lr_scheduler import (CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau,
OneCycleLR)

from torch.cuda.amp import autocast, GradScaler


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device=("cuda" if torch.cuda.is_available() else "cpu")

# Directory settings

In [None]:
# Directory that receives the training log, model checkpoints and the OOF CSV.
OUTPUT_DIR = './'
# exist_ok=True replaces the separate exists() check and its check-then-create race.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Data Loading

In [None]:
# Load the training table from the Kaggle input dataset into a DataFrame.
train = pd.read_parquet("../input/amex-data-engineering/train.parquet")

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# (rows, columns) of the training table.
train.shape

# Utils

In [None]:
# ====================================================
# Utils
# ====================================================
def amex_metric(y_true, y_pred, return_components=False) -> float:
    """Compute the Amex metric from label and prediction ndarrays.

    The metric is the mean of two components, both computed on rows ranked
    by descending prediction with target-0 rows weighted 20 and all other
    rows weighted 1:

    * G - the weighted Gini of the ranking divided by the weighted Gini of
      a ranking by the true target.
    * D - the share of positives found among the leading rows whose
      cumulative weight stays within 4% of the total weight.

    Args:
        y_true: array of targets (flattened with ``ravel``).
        y_pred: array of predictions (flattened with ``ravel``).
        return_components: when true, return ``(G, D, 0.5 * (G + D))``.

    Returns:
        ``0.5 * (G + D)``, or the three-tuple above.

    Side effects:
        Prints G, D and the combined value.
    """
    def _attach_weights(frame):
        # Negatives (target == 0) count 20x; everything else counts once.
        frame['weight'] = frame['target'].apply(lambda t: 20 if t == 0 else 1)

    def top_four_percent_captured(frame) -> float:
        """Recall of positives inside the top 4% of cumulative weight."""
        _attach_weights(frame)
        cutoff = int(0.04 * frame['weight'].sum())
        frame['weight_cumsum'] = frame['weight'].cumsum()
        is_positive = frame['target'] == 1
        within_cutoff = frame['weight_cumsum'] <= cutoff
        return (is_positive & within_cutoff).sum() / is_positive.sum()

    def weighted_gini(frame) -> float:
        """Weighted Gini of `frame` in its current row order."""
        _attach_weights(frame)
        weighted_target = frame['target'] * frame['weight']
        frame['random'] = (frame['weight'] / frame['weight'].sum()).cumsum()
        frame['cum_pos_found'] = weighted_target.cumsum()
        frame['lorentz'] = frame['cum_pos_found'] / weighted_target.sum()
        frame['gini'] = (frame['lorentz'] - frame['random']) * frame['weight']
        return frame['gini'].sum()

    def normalized_weighted_gini(frame) -> float:
        """Gini of the given ranking relative to ranking by the true target."""
        ideal = pd.DataFrame({'target': frame.target, 'prediction': frame.target})
        ideal = ideal.sort_values('prediction', ascending=False)
        return weighted_gini(frame) / weighted_gini(ideal)

    ranked = pd.DataFrame({'target': y_true.ravel(), 'prediction': y_pred.ravel()})
    ranked = ranked.sort_values('prediction', ascending=False)
    g = normalized_weighted_gini(ranked)
    d = top_four_percent_captured(ranked)
    combined = 0.5 * (g + d)
    print("G: {:.6f}, D: {:.6f}, ALL: {:6f}".format(g, d, 0.5*(g+d)))
    if return_components:
        return g, d, combined
    return combined

def get_score(y_true, y_pred):
    """Score predictions against labels; thin wrapper around `amex_metric`."""
    return amex_metric(y_true, y_pred)

def init_logger(log_file=OUTPUT_DIR+'train.log'):
    """Return a logger that writes bare messages to the console and to a file.

    Args:
        log_file: path of the log file; defaults to ``train.log`` inside
            ``OUTPUT_DIR``.

    Returns:
        The ``logging.Logger`` named after this module, at level INFO, with
        exactly one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger() returns the same object on every call, so handlers left
    # over from an earlier call (e.g. a re-run notebook cell) would make
    # every message appear several times. Detach and close them first.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# Shared logger for the run; uses init_logger's default log file path.
LOGGER = init_logger()


def seed_torch(seed=42):
    """Seed the Python, NumPy and PyTorch random generators.

    Also exports PYTHONHASHSEED and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: integer seed applied to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False
    
# NOTE(review): seeds with 45 while CFG.seed is 42 - confirm which value is intended.
seed_torch(seed=45)

In [None]:
# `*_last` columns that are treated as categorical instead of numeric.
# Kept in one place so the two feature lists below cannot drift apart.
_CAT_LAST_FEATURES = frozenset([
    "B_30_last", "B_38_last", "D_114_last", "D_116_last", "D_117_last", "D_120_last",
    "D_126_last", "D_63_last", "D_64_last", "D_66_last", "D_68_last",
])
_CAT_SUFFIXES = ('count', 'nunique')
_NUM_SUFFIXES = ('mean', 'std', 'min', 'max', 'last')

# Categorical inputs: count/nunique aggregates plus the categorical `*_last` columns.
CAT_FEATURES = [
    col for col in train.columns
    if col.split("_")[-1] in _CAT_SUFFIXES or col in _CAT_LAST_FEATURES
]
# Numeric inputs: mean/std/min/max/last aggregates that are not categorical.
NUM_FEATURES = [
    col for col in train.columns
    if col.split("_")[-1] in _NUM_SUFFIXES and col not in _CAT_LAST_FEATURES
]

# Label Encoding

In [None]:
def encode(df,cols):
    """Label-encode the given columns of `df` in place.

    Each column name is printed as it is processed.

    Args:
        df: DataFrame whose columns are overwritten with integer codes.
        cols: iterable of column names to encode.

    Returns:
        ``(df, encoders)`` where ``encoders`` maps each column name to its
        fitted ``LabelEncoder``.
    """
    fitted = {}
    for name in cols:
        print(name)
        encoder = LabelEncoder().fit(df[name].values)
        df[name] = encoder.transform(df[name].values)
        fitted[name] = encoder

    return df, fitted

# Encode the categorical columns of `train`; the fitted encoders are discarded here.
train ,_ = encode(train, CAT_FEATURES)

# Configuration

In [None]:
# Experiment configuration; the values are read as class attributes (e.g. CFG.epochs).
class CFG:
    # When True, train_fn runs the forward pass under autocast and uses a GradScaler.
    apex=False
    # When True, the cell below cuts the run to 2 epochs on a 1000-row sample.
    debug=False
    # Progress is printed every `print_freq` steps (and on the last step).
    print_freq=100
    # Worker processes for both DataLoaders.
    num_workers=2
    scheduler='CosineAnnealingWarmRestarts' # ['ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts','OneCycleLR']
    epochs=20
    # CosineAnnealingLR params
    cosanneal_params={
        'T_max':6,
        'eta_min':1e-5,
        'last_epoch':-1
    }
    #ReduceLROnPlateau params
    reduce_params={
        'mode':'min',
        'factor':0.2,
        'patience':4,
        'eps':1e-6,
        'verbose':True
    }
    # CosineAnnealingWarmRestarts params
    cosanneal_res_params={
        'T_0':5,
        'eta_min':1e-4,
        'T_mult':1,
        'last_epoch':-1
    }
    # OneCycleLR params
    # NOTE(review): no total_steps (or epochs + steps_per_epoch) is given here, and
    # the training loop never steps a OneCycleLR - verify before selecting it.
    onecycle_params={
        'pct_start':0.1,
        'div_factor':1e2,
        'max_lr':1e-3
    }
    batch_size=32
    # Learning rate and weight decay passed to Adam.
    lr=4e-3
    weight_decay=1e-3
    # The optimizer steps once every this many batches; the loss is divided accordingly.
    gradient_accumulation_steps=2
    # Max norm handed to clip_grad_norm_.
    max_grad_norm=1000
    # NOTE(review): target_size and target_col are not referenced elsewhere in this file.
    target_size=1
    # Folds are enumerated with range(nfolds); only those listed in trn_folds are trained.
    nfolds=5
    trn_folds=[0, 1, 2, 3, 4]
    target_col="target"
    # Used in checkpoint file names and as the W&B run name / group.
    model_name="nnet"
    # main() only trains when this is True.
    train=True
    # Number of distinct values in each categorical column of `train`.
    cat_dims = [train[col].nunique() for col in CAT_FEATURES]
    # (num_embeddings, embedding_dim) per categorical column; width is min(50, (dim+1)//2).
    cat_embs = [(dim, min(50,(dim+1)//2)) for dim in cat_dims]
    #num_features=len(num_columns)
    # random_state for the debug sample.
    seed=42
    
# Debug mode: train for 2 epochs on a reproducible 1000-row sample.
if CFG.debug:
    CFG.epochs=2
    # DataFrame.ffill() is the supported spelling of fillna(method="ffill"),
    # whose `method` argument is deprecated in pandas 2.x.
    train=train.sample(n=1000, random_state=CFG.seed).reset_index(drop=True).ffill()

# CV Split

In [None]:
# Assign every row of `train` to exactly one validation fold.
# The fold count and seed come from CFG so the split stays consistent with
# main(), which iterates over range(CFG.nfolds).
# NOTE(review): this is a plain KFold, so the target passed to split() is
# ignored; switch to StratifiedKFold if stratification was intended.
kfold = KFold(n_splits=CFG.nfolds, shuffle=True, random_state=CFG.seed)
# split() yields positional indices. Filling a positional array avoids using
# them as index labels, which is only correct for a 0..n-1 index, and avoids
# the float/NaN intermediate column.
fold_ids = np.zeros(len(train), dtype=int)
for fold, (trn_idx, vld_idx) in enumerate(kfold.split(train, train["target"])):
    fold_ids[vld_idx] = fold
train["folds"] = fold_ids

del kfold, fold_ids
_ = gc.collect()

# W&B

In [None]:
# Read the W&B API key from the Kaggle secret named "wandb_key" and log in with it.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("wandb_key")

import wandb
wandb.login(key=wandb_api)

def class2dict(f):
    """Return ``{name: value}`` for every attribute of `f` listed by ``dir``
    whose name does not begin with a double underscore."""
    return {attr: getattr(f, attr) for attr in dir(f) if attr[:2] != '__'}

# Start the W&B run, recording every CFG attribute as the run configuration.
run = wandb.init(project="Amex", 
                 name=CFG.model_name,
                 config=class2dict(CFG),
                 group=CFG.model_name,
                 job_type="train")

# Dataset

In [None]:
class AmexDataset(Dataset):
    """Dataset serving one row of `df` at a time as a dict of tensors.

    Each item has three entries:
        'num_data': the NUM_FEATURES values as a float tensor,
        'cat_data': the CAT_FEATURES values as an int tensor,
        'y': the "target" value as a float tensor.
    """

    def __init__(self, df):
        self.df = df
        # Pull the columns out as arrays once so item access is a plain index.
        self.num_data = df[NUM_FEATURES].values
        self.cat_data = df[CAT_FEATURES].values
        self.targets = df["target"].values

    def __len__(self):
        return self.num_data.shape[0]

    def __getitem__(self, idx):
        return {
            'num_data': torch.tensor(self.num_data[idx], dtype=torch.float),
            'cat_data': torch.tensor(self.cat_data[idx], dtype=torch.int),
            'y': torch.tensor(self.targets[idx], dtype=torch.float),
        }

# Model

In [None]:
# Learnable weighted average of several same-shaped tensors (late fusion of model outputs).
# https://stackoverflow.com/questions/62877879/implementing-late-fusion-in-keras
class WeightedAverage(nn.Module):
    """Average a list of tensors with learnable, softmax-normalised weights.

    Args:
        n_output: number of tensors that will be fused in ``forward``.
    """

    def __init__(self, n_output):
        super().__init__()
        self.n_output = n_output
        # Initialise as a (1, n_output) matrix, then add a leading axis so the
        # parameter has shape (1, 1, n_output).
        initial = nn.init.xavier_normal_(torch.empty(1, self.n_output))
        self.w = nn.Parameter(initial.unsqueeze(0))

    def forward(self, inputs):
        """Fuse `inputs`, a list of (n_batch, n_feat) tensors, into one
        (n_batch, n_feat) tensor."""
        # (n_batch, n_feat, n_inputs): one slice per input along a new last axis.
        stacked = torch.stack(list(inputs), dim=-1)
        # (1, 1, n_inputs); the weights sum to one over the last axis.
        weights = torch.softmax(self.w, dim=-1)
        return (weights * stacked).sum(dim=-1)
    
class NNet(nn.Module):
    """Two-branch network for categorical and numeric features.

    Categorical columns go through one embedding each and a linear layer;
    numeric columns go through batch norm, dropout and a Mish-activated
    linear layer. The two 100-wide branches are fused by `WeightedAverage`
    and passed through a batch-normed MLP that outputs one value per row.

    Args:
        cfg: object exposing ``cat_embs``, a list of
            ``(num_embeddings, embedding_dim)`` pairs.
    """

    def __init__(self, cfg):
        super().__init__()

        self.emb = nn.ModuleList(
            [nn.Embedding(n_codes, width) for n_codes, width in cfg.cat_embs]
        )
        self.w_avg = WeightedAverage(n_output=2)
        self.mish_activation = nn.Mish()

        # Width of the concatenated embeddings and of the numeric input.
        self.no_of_embs = sum(width for _, width in cfg.cat_embs)
        self.no_of_num = len(NUM_FEATURES)
        self.dense_cat = nn.Linear(self.no_of_embs, 100)

        self.batch_norm1 = nn.BatchNorm1d(self.no_of_num)
        self.dropout1 = nn.Dropout(0.2)
        self.dense1 = nn.Linear(self.no_of_num, 100)

        self.batch_norm2 = nn.BatchNorm1d(100)
        self.dense2 = nn.Linear(100, 32)

        self.batch_norm3 = nn.BatchNorm1d(32)
        self.dense3 = nn.Linear(32, 16)

        self.batch_norm4 = nn.BatchNorm1d(16)
        self.dense4 = nn.Linear(16, 1)

    def forward(self, cat, num):
        """Return one value per row for categorical codes `cat` and numeric
        features `num`."""
        # Categorical branch: embed column i with embedding i, then project.
        embedded = torch.cat(
            [layer(cat[:, col]) for col, layer in enumerate(self.emb)], 1
        )
        cat_branch = self.dense_cat(self.dropout1(embedded))

        # Numeric branch (the same dropout module is reused).
        num_branch = self.dropout1(self.batch_norm1(num))
        num_branch = self.mish_activation(self.dense1(num_branch))

        # Fuse the two branches by learned weighted average.
        hidden = self.w_avg([cat_branch, num_branch])

        # Shared head: two normalised, Mish-activated layers and a final linear.
        hidden = self.mish_activation(self.dense2(self.batch_norm2(hidden)))
        hidden = self.mish_activation(self.dense3(self.batch_norm3(hidden)))
        return self.dense4(self.batch_norm4(hidden))
    
class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Args:
        weight: optional rescaling weight passed to
            ``F.binary_cross_entropy_with_logits``.
        reduction: ``'mean'``, ``'sum'``, or anything else for the
            unreduced element-wise loss.
        smoothing: amount in ``[0, 1)`` by which targets are pulled
            towards 0.5.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        """Return ``targets * (1 - smoothing) + 0.5 * smoothing``.

        `n_labels` is accepted for signature compatibility and is not used.
        """
        assert 0 <= smoothing < 1

        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing

        return targets

    def forward(self, inputs, targets):
        """Return the smoothed BCE loss of logits `inputs` against `targets`."""
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        # Ask for the element-wise loss: the functional defaults to
        # reduction='mean', which would collapse the loss to a scalar and
        # make the 'sum' / unreduced settings below have no effect.
        loss = F.binary_cross_entropy_with_logits(inputs, targets, self.weight,
                                                  reduction='none')

        if self.reduction == 'sum':
            return loss.sum()
        if self.reduction == 'mean':
            return loss.mean()
        return loss

# Helper functions

In [None]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Keep the latest value and the running weighted mean of a series.

    Attributes:
        val: most recent value passed to ``update``.
        sum: weighted sum of all values since the last ``reset``.
        count: total weight since the last ``reset``.
        avg: ``sum / count``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as the latest value, counted `n` times."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format `s` seconds as ``'<minutes>m <seconds>s'`` with the seconds
    truncated to an integer."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '{:d}m {:d}s'.format(minutes, int(seconds))


def timeSince(since, percent):
    """Return ``'<elapsed> (remain <estimate>)'`` for work begun at `since`.

    Args:
        since: start time as returned by ``time.time()``.
        percent: fraction of the work completed so far; the total duration
            is extrapolated as ``elapsed / percent``.
    """
    elapsed = time.time() - since
    remaining = elapsed / percent - elapsed
    return '{} (remain {})'.format(asMinutes(elapsed), asMinutes(remaining))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Train `model` for one epoch and return the sample-weighted mean loss.

    Gradients are accumulated over ``CFG.gradient_accumulation_steps``
    batches; at each accumulation boundary they are clipped to
    ``CFG.max_grad_norm`` and the optimizer steps. When ``CFG.apex`` is set
    the forward pass runs under autocast with a GradScaler.

    Args:
        fold: fold number, used only in the W&B metric names.
        train_loader: iterable of dicts with 'num_data', 'cat_data' and 'y'.
        model: module called as ``model(cat, num)``.
        criterion: loss called as ``criterion(logits, labels)``.
        optimizer: optimizer updating `model`.
        epoch: zero-based epoch number, used only for printing.
        scheduler: read for the current learning rate; it is not stepped here.
        device: device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses weighted by batch size.
    """
    scaler = GradScaler() if CFG.apex else None
    losses = AverageMeter()
    accum_steps = CFG.gradient_accumulation_steps
    # switch to train mode
    model.train()
    start = time.time()
    # Drop gradients left over from an incomplete accumulation window of a
    # previous call so they do not leak into this epoch's first update.
    optimizer.zero_grad()
    # No update has happened yet, so there is no gradient norm to report.
    grad_norm = float('nan')
    for step, dct in enumerate(train_loader):
        num = dct['num_data'].to(device)
        cat = dct['cat_data'].to(device)
        labels = dct['y'].to(device)
        batch_size = labels.size(0)
        if CFG.apex:
            with autocast():
                y_preds = model(cat, num)
                loss = criterion(y_preds.squeeze(1), labels)
        else:
            y_preds = model(cat, num)
            loss = criterion(y_preds.squeeze(1), labels)
        # record loss (before it is scaled down for accumulation)
        losses.update(loss.item(), batch_size)
        if accum_steps > 1:
            loss = loss / accum_steps
        if CFG.apex:
            scaler.scale(loss).backward()
        else:
            loss.backward()
        if (step + 1) % accum_steps == 0:
            if CFG.apex:
                # Gradients are still multiplied by the loss scale; unscale
                # them so the clipping threshold applies to the true values.
                scaler.unscale_(optimizer)
            # Clip once per update, on the fully accumulated gradients.
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            if CFG.apex:
                scaler.step(optimizer)
                scaler.update()
            else:
                optimizer.step()
            optimizer.zero_grad()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f} '
                  'LR: {lr:.6f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_last_lr()[0]))
        wandb.log({f"[fold{fold}] loss": losses.val,
                   f"[fold{fold}] lr": scheduler.get_last_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Evaluate `model` on `valid_loader` without computing gradients.

    Args:
        valid_loader: iterable of dicts with 'num_data', 'cat_data' and 'y'.
        model: module called as ``model(cat, num)``.
        criterion: loss called as ``criterion(logits, labels)``.
        device: device the batch tensors are moved to.

    Returns:
        ``(mean_loss, predictions)``: the batch-size-weighted mean loss and
        the sigmoid of the model outputs for every batch, concatenated in
        loader order as a NumPy array.
    """
    losses = AverageMeter()
    # switch to evaluation mode
    model.eval()
    preds = []
    start = time.time()
    for step, dct in enumerate(valid_loader):
        num = dct['num_data'].to(device)
        cat = dct['cat_data'].to(device)
        labels = dct['y'].to(device)
        batch_size = labels.size(0)
        # Forward pass and loss both run without autograd bookkeeping.
        with torch.no_grad():
            y_preds = model(cat, num)
            loss = criterion(y_preds.squeeze(1), labels)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        losses.update(loss.item(), batch_size)
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

# Train loop

In [None]:
# ====================================================
# Train loop
# ====================================================
def train_loop(folds, fold):
    """Train and validate one fold, checkpointing the best epochs.

    Rows whose ``folds`` value equals `fold` form the validation set; all
    other rows are used for training. After every epoch the model is
    validated; whenever the score or the validation loss improves, the
    weights and that epoch's validation predictions are saved under
    ``OUTPUT_DIR``.

    Args:
        folds: DataFrame holding the features, "target" and a "folds" column.
        fold: number of the fold to hold out.

    Returns:
        The validation rows (index reset) with two added columns:
        "preds_score" (predictions of the best-score epoch) and
        "preds_loss" (predictions of the best-loss epoch).
    """
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    is_valid = folds['folds'] == fold

    train_folds = folds.loc[~is_valid].reset_index(drop=True)
    valid_folds = folds.loc[is_valid].reset_index(drop=True)
    valid_labels = valid_folds["target"].values

    train_dataset = AmexDataset(train_folds)
    valid_dataset = AmexDataset(valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size, 
                              shuffle=True, 
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, 
                              batch_size=CFG.batch_size, 
                              shuffle=False, 
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # scheduler 
    # ====================================================
    def get_scheduler(optimizer):
        """Build the scheduler named by CFG.scheduler."""
        if CFG.scheduler=='ReduceLROnPlateau':
            return ReduceLROnPlateau(optimizer, **CFG.reduce_params)
        if CFG.scheduler=='CosineAnnealingLR':
            return CosineAnnealingLR(optimizer, **CFG.cosanneal_params)
        if CFG.scheduler=='CosineAnnealingWarmRestarts':
            return CosineAnnealingWarmRestarts(optimizer, **CFG.cosanneal_res_params)
        if CFG.scheduler=='OneCycleLR':
            # NOTE(review): CFG.onecycle_params carries no total_steps and
            # this scheduler is never stepped below - verify before use.
            return OneCycleLR(optimizer, **CFG.onecycle_params)
        # Fail with a clear message instead of an UnboundLocalError.
        raise ValueError(f"Unknown scheduler: {CFG.scheduler!r}")

    # ====================================================
    # model & optimizer
    # ====================================================
    model = NNet(CFG)
    model.to(device)

    optimizer = Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # loop
    # ====================================================
    valid_loss = nn.BCEWithLogitsLoss()
    train_loss = SmoothBCEwLogits(smoothing=0.0001)
    # The metric can be negative, so start below every possible score;
    # otherwise a fold that never reaches 0 would save no checkpoint.
    best_score = -np.inf
    best_loss = np.inf
    # Predictions of the best epochs are kept in memory instead of being
    # read back from the checkpoint files after training.
    best_score_preds = None
    best_loss_preds = None

    for epoch in range(CFG.epochs):

        start_time = time.time()

        avg_loss = train_fn(fold, train_loader, model, train_loss, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, valid_loss, device)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, (CosineAnnealingLR, CosineAnnealingWarmRestarts)):
            scheduler.step()

        # scoring
        score = get_score(valid_labels, preds)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        wandb.log({f"[fold{fold}] epoch": epoch+1, 
                   f"[fold{fold}] avg_train_loss": avg_loss, 
                   f"[fold{fold}] avg_val_loss": avg_val_loss,
                   f"[fold{fold}] score": score})

        if score >= best_score:
            best_score = score
            best_score_preds = preds
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(), 
                        'preds_score': preds},
                        OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best_score.pth')

        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            best_loss_preds = preds
            LOGGER.info(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
            torch.save({'model': model.state_dict(), 
                        'preds_loss': preds},
                        OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best_loss.pth')

    for column, best in (("preds_score", best_score_preds),
                         ("preds_loss", best_loss_preds)):
        if best is None:
            # No epoch qualified (e.g. zero epochs or NaN metrics).
            LOGGER.info(f'fold {fold}: no best epoch recorded for {column}; filling with NaN')
            valid_folds[column] = np.nan
        else:
            # valid_fn returns one column per row; flatten to a plain column.
            valid_folds[column] = np.ravel(best)

    return valid_folds

In [None]:
# ====================================================
# main
# ====================================================
def main():
    """Train the configured folds and report out-of-fold scores.

    When ``CFG.train`` is set, every fold in ``range(CFG.nfolds)`` that is
    listed in ``CFG.trn_folds`` is trained with `train_loop`; per-fold and
    overall scores are logged and the combined out-of-fold frame is written
    to ``oof_df.csv`` in ``OUTPUT_DIR``. The W&B run is finished at the end.
    """

    def get_result(result_df):
        """Log the score of the best-score and best-loss predictions."""
        preds_score = result_df['preds_score'].values
        preds_loss = result_df['preds_loss'].values
        labels = result_df["target"].values
        score = get_score(labels, preds_score)
        score_loss = get_score(labels, preds_loss)
        LOGGER.info(f'Score with best score weights: {score:<.4f}')
        LOGGER.info(f'Score with best loss weights: {score_loss:<.4f}')

    if CFG.train:
        # train
        # Collect the per-fold frames and concatenate once, instead of
        # repeatedly concatenating onto an initially empty DataFrame.
        fold_frames = []
        for fold in range(CFG.nfolds):
            if fold not in CFG.trn_folds:
                continue
            _oof_df = train_loop(train, fold)
            fold_frames.append(_oof_df)
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
        if fold_frames:
            oof_df = pd.concat(fold_frames)
            # CV result
            LOGGER.info("========== CV ==========")
            get_result(oof_df)
            # save result
            oof_df.to_csv(OUTPUT_DIR+'oof_df.csv', index=False)
        else:
            # Nothing was trained, so there is nothing to score or save.
            LOGGER.info("No folds selected for training; skipping CV result")

    wandb.finish()

In [None]:
# Run the training pipeline when this file is executed as a script or notebook.
if __name__ == "__main__":
    main()