This notebook makes it easy to experiment with the following implementations:
Optimizer : ['Adam', 'Ranger']
Scheduler : ['ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts', 'OneCycleLR']
Models    : Any Timm Model
Loss Fn   : ['CrossEntropyLoss', 'LabelSmoothingLoss', 'FocalLoss', 'FocalCosineLoss', 'SymmetricCrossEntropyLoss', 'BiTemperedLoss', 'TaylorCrossEntropyLoss']

Thanks to https://www.kaggle.com/piantic/train-cassava-starter-using-various-loss-funcs/data and @piantic for sharing all the loss functions.

Any feedback is appreciated!

In [None]:
# ====================================================
# Required Installations
# ====================================================
!pip install --quiet timm
!pip install --quiet '../input/pytorch-ranger'

# Library

In [None]:
# ====================================================
# Library
# ====================================================
import random
import os
import math
import time

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import imread
import numpy as np
import cv2
from sklearn.model_selection import GroupKFold, StratifiedKFold

import torch
import torch.nn as nn
import torchvision
from torchvision import models as tvmodels
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torch.nn.functional as F
from tqdm import tqdm
import timm

from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold,StratifiedKFold

import albumentations as A
from albumentations import Compose
from albumentations.pytorch import ToTensorV2


from PIL import Image, ImageOps, ImageEnhance, ImageChops

from pytorch_ranger import Ranger

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Run Config

In [None]:
# ====================================================
# Configurations
# ====================================================
class CFG:
    """Central run configuration: model, data, optimisation and augmentation settings.

    The attributes are read as plain class attributes (``CFG.X``) throughout
    the notebook; the class is never instantiated.
    """
    # Model params
    N_FOLDS = 5
    MODEL_NAME = 'tf_efficientnet_b4_ns' # Recommended : ['tf_efficientnet_b3_ns', 'tf_efficientnet_b4_ns', 'resnext50_32x4d']
    pretrained = True
    N_CLASSES = 5

    # Test-time augmentation: number of augmented views per image / whether to run the TTA cell
    N_TTA = 4
    TEST_TTA = True

    # Image size
    HEIGHT = 256
    WIDTH = 256
    CHANNELS = 3

    # Training params
    BATCH_SIZE = 32
    EPOCHS = 3
    LR = 5e-4
    LR_MIN = 1e-6
    weight_decay = 1e-6
    eps = 1e-8
    PATIENCE = 4  # epochs without improvement before early stopping

    # BiTemperedLoss (t1 < 1.0 for boundedness, t2 > 1.0 for tail heaviness)
    T1 = 0.8
    T2 = 1.3
    LABEL_SMOOTH = 0.2

    # CosineAnnealingWarmRestarts
    T_0 = 10

    # CosineAnnealingLR
    T_max = 10

    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to 0 (load in the main process) so DataLoader gets an int.
    NUM_WORKERS = os.cpu_count() or 0

    scheduler_name = 'OneCycleLR' # ['ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts', 'OneCycleLR']
    criterion_name = 'BiTemperedLoss' # ['CrossEntropyLoss', 'LabelSmoothingLoss', 'FocalLoss', 'FocalCosineLoss', 'SymmetricCrossEntropyLoss', 'BiTemperedLoss', 'TaylorCrossEntropyLoss']
    optimizer_name = 'Ranger' # ['Adam', 'Ranger']

    #TRAIN_FOLDS = [0,1,2,3,4]
    TRAIN_FOLDS = [0] # Folds to be trained
    model_print = False # Whether the model architecture is printed
    TRAIN_AUG_TYPE = 'autoaugment' # ['train', 'lightaug', 'autoaugment']
    VALID_AUG_TYPE = 'valid' # ['valid']

    IMG_MEAN = [0.485, 0.456, 0.406] # Mean for the normalization transform
    IMG_STD = [0.229, 0.224, 0.225] # Std for the normalization transform
    SEED = 1234

# Loss Functions

In [None]:
def log_t(u, t):
    """Tempered logarithm of `u` at temperature `t` (plain log when t == 1)."""
    if t != 1.0:
        one_minus_t = 1.0 - t
        return (u.pow(one_minus_t) - 1.0) / one_minus_t
    return u.log()

def exp_t(u, t):
    """Tempered exponential of `u` at temperature `t` (plain exp when t == 1)."""
    if t != 1:
        one_minus_t = 1.0 - t
        # relu clips the base at zero before the power is taken.
        base = (1.0 + one_minus_t * u).relu()
        return base.pow(1.0 / one_minus_t)
    return u.exp()

def compute_normalization_fixed_point(activations, t, num_iters):
    """Normalization value per example via fixed-point iteration (t > 1.0).

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (> 1.0 for tail heaviness).
      num_iters: Number of fixed-point iterations to run.
    Return: A tensor shaped like `activations` with the last dimension being 1.
    """
    # Shift by the per-example maximum; it is added back at the end.
    max_act, _ = torch.max(activations, -1, keepdim=True)
    shifted = activations - max_act

    current = shifted
    for _ in range(num_iters):
        partition = exp_t(current, t).sum(dim=-1, keepdim=True)
        current = shifted * partition.pow(1.0 - t)

    final_partition = exp_t(current, t).sum(dim=-1, keepdim=True)
    return max_act - log_t(1.0 / final_partition, t)

def compute_normalization_binary_search(activations, t, num_iters):
    """Normalization value per example via bisection (t < 1.0).

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (< 1.0 for finite support).
      num_iters: Number of bisection steps to run.
    Return: A tensor of same rank as `activations` with the last dimension being 1.
    """
    max_act, _ = torch.max(activations, -1, keepdim=True)
    shifted = activations - max_act

    # Number of classes whose shifted activation exceeds -1 / (1 - t).
    in_support = shifted > -1.0 / (1.0 - t)
    effective_dim = in_support.to(torch.int32).sum(
        dim=-1, keepdim=True).to(activations.dtype)

    partition_shape = activations.shape[:-1] + (1,)
    lower = torch.zeros(partition_shape, dtype=activations.dtype, device=activations.device)
    upper = -log_t(1.0 / effective_dim, t) * torch.ones_like(lower)

    for _ in range(num_iters):
        midpoint = (upper + lower) / 2.0
        prob_mass = exp_t(shifted - midpoint, t).sum(dim=-1, keepdim=True)
        # 1.0 where the probabilities sum to less than one, else 0.0.
        mass_below_one = (prob_mass < 1.0).to(activations.dtype)
        lower = torch.reshape(
            lower * mass_below_one + (1.0 - mass_below_one) * midpoint,
            partition_shape)
        upper = torch.reshape(
            upper * (1.0 - mass_below_one) + mass_below_one * midpoint,
            partition_shape)

    return (upper + lower) / 2.0 + max_act

class ComputeNormalization(torch.autograd.Function):
    """
    Autograd function wrapping the normalization solvers with a hand-written
    backward pass. See compute_normalization.
    """
    @staticmethod
    def forward(ctx, activations, t, num_iters):
        # Bisection for t < 1.0, fixed-point iteration otherwise.
        solver = (compute_normalization_binary_search if t < 1.0
                  else compute_normalization_fixed_point)
        normalization_constants = solver(activations, t, num_iters)

        ctx.save_for_backward(activations, normalization_constants)
        ctx.t = t
        return normalization_constants

    @staticmethod
    def backward(ctx, grad_output):
        activations, normalization_constants = ctx.saved_tensors
        temperature = ctx.t
        probs = exp_t(activations - normalization_constants, temperature)
        # Probabilities raised to the power t, renormalised over classes.
        powered = probs.pow(temperature)
        escorts = powered / powered.sum(dim=-1, keepdim=True)

        # No gradients for the non-tensor arguments t and num_iters.
        return escorts * grad_output, None, None

def compute_normalization(activations, t, num_iters=5):
    """Normalization value for each example, with a custom backward pass.

    Delegates to ComputeNormalization.apply.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      num_iters: Number of iterations to run the method.
    Return: A tensor of same rank as activation with the last dimension being 1.
    """
    normalization_constants = ComputeNormalization.apply(activations, t, num_iters)
    return normalization_constants

def tempered_sigmoid(activations, t, num_iters = 5):
    """Tempered sigmoid, computed as a two-class tempered softmax.

    Args:
      activations: Activations for the positive class for binary classification.
      t: Temperature, forwarded to tempered_softmax.
      num_iters: Number of iterations to run the method.
    Returns:
      A probabilities tensor for the positive class.
    """
    # Pair every activation with a zero logit for the implicit negative class.
    negative_logits = torch.zeros_like(activations)
    two_class_logits = torch.stack((activations, negative_logits), dim=-1)
    two_class_probs = tempered_softmax(two_class_logits, t, num_iters)
    return two_class_probs[..., 0]


def tempered_softmax(activations, t, num_iters=5):
    """Tempered softmax over the last dimension.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      t: Temperature; t == 1.0 falls back to the ordinary softmax.
      num_iters: Number of iterations used to compute the normalization.
    Returns:
      A probabilities tensor.
    """
    if t != 1.0:
        shift = compute_normalization(activations, t, num_iters)
        return exp_t(activations - shift, t)
    return activations.softmax(dim=-1)

def bi_tempered_binary_logistic_loss(activations,
        labels,
        t1,
        t2,
        label_smoothing = 0.0,
        num_iters=5,
        reduction='mean'):

    """Bi-Tempered binary logistic loss, expressed as a two-class problem.

    Args:
      activations: A tensor containing activations for class 1.
      labels: A tensor with shape as activations, containing probabilities for class 1
      t1: Temperature 1 (< 1.0 for boundedness).
      t2: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
      label_smoothing: Label smoothing
      num_iters: Number of iterations to run the method.
      reduction: Forwarded to bi_tempered_logistic_loss.
    Returns:
      A loss tensor.
    """
    positive = labels.to(activations.dtype)
    # Class 1 first, then an implicit class 0 with zero logit.
    two_class_labels = torch.stack((positive, 1.0 - labels.to(activations.dtype)), dim=-1)
    two_class_logits = torch.stack((activations, torch.zeros_like(activations)), dim=-1)
    return bi_tempered_logistic_loss(
        two_class_logits,
        two_class_labels,
        t1,
        t2,
        label_smoothing=label_smoothing,
        num_iters=num_iters,
        reduction=reduction,
    )

def bi_tempered_logistic_loss(activations,
        labels,
        t1 = CFG.T1,
        t2 = CFG.T2,
        label_smoothing=CFG.LABEL_SMOOTH,
        num_iters=5,
        reduction = 'mean'):

    """Bi-Tempered Logistic Loss.

    Args:
      activations: A multi-dimensional tensor with last dimension `num_classes`.
      labels: A tensor with shape and dtype as activations (onehot),
        or a long tensor of one dimension less than activations (pytorch standard)
      t1: Temperature 1 (< 1.0 for boundedness). Default ``CFG.T1``.
      t2: Temperature 2 (> 1.0 for tail heaviness, < 1.0 for finite support).
        Default ``CFG.T2``.
      label_smoothing: Label smoothing parameter between [0, 1).
        Default ``CFG.LABEL_SMOOTH``.
      num_iters: Number of iterations to run the method. Default 5.
      reduction: ``'none'`` | ``'mean'`` | ``'sum'``. Default ``'mean'``.
        ``'none'``: No reduction is applied, return shape is shape of
        activations without the last dimension.
        ``'mean'``: Loss is averaged over all examples.
        ``'sum'``: Loss is summed over all examples.
    Returns:
      A loss tensor.
    Raises:
      ValueError: If `reduction` is not one of the supported values.
    """
    # Fail early instead of silently returning None for a mistyped reduction.
    if reduction not in ('none', 'sum', 'mean'):
        raise ValueError(
            f"reduction must be 'none', 'sum' or 'mean', got {reduction!r}")

    if len(labels.shape) < len(activations.shape):  # class indices, not one-hot
        labels_onehot = torch.zeros_like(activations)
        # Scatter along the class (last) dimension so inputs with more than
        # two dimensions are handled as well; identical to dim 1 for 2-D input.
        labels_onehot.scatter_(-1, labels[..., None], 1)
    else:
        labels_onehot = labels

    if label_smoothing > 0:
        num_classes = labels_onehot.shape[-1]
        labels_onehot = ( 1 - label_smoothing * num_classes / (num_classes - 1) ) \
                * labels_onehot + \
                label_smoothing / (num_classes - 1)

    probabilities = tempered_softmax(activations, t2, num_iters)

    # The 1e-10 keeps log_t finite where a label entry is exactly zero.
    loss_values = labels_onehot * log_t(labels_onehot + 1e-10, t1) \
            - labels_onehot * log_t(probabilities, t1) \
            - labels_onehot.pow(2.0 - t1) / (2.0 - t1) \
            + probabilities.pow(2.0 - t1) / (2.0 - t1)
    loss_values = loss_values.sum(dim = -1) #sum over classes

    if reduction == 'none':
        return loss_values
    if reduction == 'sum':
        return loss_values.sum()
    return loss_values.mean()

class BiTemperedLogistic(nn.Module):
    """nn.Module wrapper around bi_tempered_logistic_loss with fixed temperatures."""

    def __init__(self, T1 = CFG.T1, T2 = CFG.T2, LABEL_SMOOTH = CFG.LABEL_SMOOTH):
        super().__init__()
        self.T1, self.T2 = T1, T2
        self.LABEL_SMOOTH = LABEL_SMOOTH

    def forward(self, logits,labels):
        """Return the bi-tempered loss of `logits` against `labels`."""
        loss_kwargs = dict(t1=self.T1, t2=self.T2, label_smoothing=self.LABEL_SMOOTH)
        return bi_tempered_logistic_loss(logits, labels, **loss_kwargs)
    
class SymmetricCrossEntropy(nn.Module):
    """Weighted sum of cross entropy and a second, clamped cross-entropy term.

    Args:
        alpha: Weight of the standard cross-entropy term.
        beta: Weight of the second ("rce") term.
        num_classes: Number of classes used to one-hot encode the targets.
    """

    def __init__(self, alpha=0.1, beta=1.0, num_classes= 5):
        super(SymmetricCrossEntropy, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.num_classes = num_classes

    def forward(self, logits, targets, reduction='mean'):
        """Compute the loss.

        Args:
            logits: Tensor indexed as (batch, classes) by the dim-1 softmax below.
            targets: Integer class indices.
            reduction: ``'mean'`` | ``'sum'`` | anything else leaves the
                second term unreduced (and is passed to ``F.cross_entropy``).
        """
        # Build the one-hot targets on the same device/dtype as the logits
        # instead of forcing .cuda(), so the loss also works on CPU.
        onehot_targets = F.one_hot(targets, self.num_classes).to(logits.dtype)
        ce_loss = F.cross_entropy(logits, targets, reduction=reduction)
        # NOTE(review): this term is -onehot * log(softmax), i.e. another
        # cross entropy; the reverse-CE of the Symmetric Cross Entropy paper
        # is -softmax * log(onehot) -- confirm which one is intended.
        rce_loss = (-onehot_targets*logits.softmax(1).clamp(1e-7, 1.0).log()).sum(1)
        if reduction == 'mean':
            rce_loss = rce_loss.mean()
        elif reduction == 'sum':
            rce_loss = rce_loss.sum()
        return self.alpha * ce_loss + self.beta * rce_loss
    
class LabelSmoothingLoss(nn.Module):
    """Cross entropy against a smoothed target distribution.

    The true class receives ``1 - smoothing``; every other class receives
    ``smoothing / (classes - 1)``.
    """

    def __init__(self, classes=5, smoothing=0.0, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        """Return the batch-mean smoothed cross entropy of `pred` vs `target`."""
        log_probs = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            off_value = self.smoothing / (self.cls - 1)
            soft_targets = torch.full_like(log_probs, off_value)
            # Overwrite the true-class slot with the confidence mass.
            soft_targets.scatter_(1, target.data.unsqueeze(1), self.confidence)
        per_sample = torch.sum(-soft_targets * log_probs, dim=self.dim)
        return per_sample.mean()
    
class FocalLoss(nn.Module):
    """Multi-class focal loss built on cross entropy.

    Args:
        alpha: Global scaling factor of the loss.
        gamma: Focusing exponent applied to ``(1 - p_t)``.
        reduce: If True return the batch mean, otherwise the per-sample losses.
    """

    def __init__(self, alpha=1, gamma=2, reduce=True):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduce = reduce

    def forward(self, inputs, targets):
        """Return the focal loss of logits `inputs` against class indices `targets`."""
        # Keep the cross entropy per sample: the focal weight (1 - p_t)^gamma
        # must be computed from each sample's own probability, not from the
        # batch-averaged loss.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')

        pt = torch.exp(-ce_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * ce_loss

        if self.reduce:
            return torch.mean(F_loss)
        return F_loss
        
class FocalCosineLoss(nn.Module):
    """Cosine-embedding loss plus a weighted focal cross-entropy term.

    Args:
        alpha: Scaling factor of the focal term.
        gamma: Focusing exponent of the focal term.
        xent: Weight of the focal term relative to the cosine term.
    """

    def __init__(self, alpha=1, gamma=2, xent=.1):
        super(FocalCosineLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

        self.xent = xent

        # Target "+1" for cosine_embedding_loss. Registered as a buffer so it
        # follows the module across devices instead of being pinned to CUDA.
        self.register_buffer("y", torch.tensor([1.0]))

    def forward(self, input, target, reduction="mean"):
        """Return the combined loss.

        Args:
            input: Logits indexed as (batch, classes).
            target: Integer class indices.
            reduction: ``'mean'`` | ``'sum'`` | ``'none'``.
        """
        onehot = F.one_hot(target, num_classes=input.size(-1)).to(input.dtype)
        cosine_loss = F.cosine_embedding_loss(
            input, onehot, self.y.to(input.device), reduction=reduction)

        cent_loss = F.cross_entropy(F.normalize(input), target, reduction="none")
        pt = torch.exp(-cent_loss)
        focal_loss = self.alpha * (1-pt)**self.gamma * cent_loss

        # Reduce the focal term the same way as the cosine term so both
        # summands have matching shapes.
        if reduction == "mean":
            focal_loss = torch.mean(focal_loss)
        elif reduction == "sum":
            focal_loss = torch.sum(focal_loss)

        return cosine_loss + self.xent * focal_loss
    
class TaylorSoftmax(nn.Module):
    '''
    Softmax variant that replaces exp(x) with its Taylor polynomial of even
    order n (autograd version).
    '''
    def __init__(self, dim=1, n=2):
        super(TaylorSoftmax, self).__init__()
        assert n % 2 == 0
        self.dim = dim
        self.n = n

    def forward(self, x):
        '''
        usage similar to nn.Softmax:
            >>> mod = TaylorSoftmax(dim=1, n=4)
            >>> inten = torch.randn(1, 32, 64, 64)
            >>> out = mod(inten)
        '''
        # Accumulate 1 + x + x^2/2! + ... + x^n/n!
        series = torch.ones_like(x)
        factorial = 1.
        order = 1
        while order <= self.n:
            factorial *= order
            series = series + x.pow(order) / factorial
            order += 1
        normaliser = series.sum(dim=self.dim, keepdims=True)
        return series / normaliser
    
class TaylorCrossEntropyLoss(nn.Module):
    '''
    Negative log-likelihood computed on Taylor-softmax probabilities
    (autograd version).
    '''
    def __init__(self, n=2, ignore_index=-1, reduction='mean'):
        super(TaylorCrossEntropyLoss, self).__init__()
        assert n % 2 == 0
        self.taylor_softmax = TaylorSoftmax(dim=1, n=n)
        self.reduction = reduction
        self.ignore_index = ignore_index

    def forward(self, logits, labels):
        '''
        usage similar to nn.CrossEntropyLoss:
            >>> crit = TaylorCrossEntropyLoss(n=4)
            >>> inten = torch.randn(1, 10, 64, 64)
            >>> label = torch.randint(0, 10, (1, 64, 64))
            >>> out = crit(inten, label)
        '''
        probabilities = self.taylor_softmax(logits)
        return F.nll_loss(
            probabilities.log(),
            labels,
            reduction=self.reduction,
            ignore_index=self.ignore_index,
        )

# Utils

In [None]:
# ====================================================
# Utils
# ====================================================
def retrieve_df(df,name,idx):
    """Return column `name` at positional rows `idx`, re-indexed from 0."""
    selected = df[name].iloc[idx]
    return selected.reset_index(drop=True)

def accuracy_metric(input, targs):
    """Accuracy of predictions `input` against targets `targs` (both moved to CPU)."""
    y_true = targs.cpu()
    y_pred = input.cpu()
    return accuracy_score(y_true, y_pred)

def seed_everything(seed):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # cover every GPU, not only the current one
    torch.backends.cudnn.deterministic = True
    # benchmark mode auto-tunes (possibly non-deterministic) kernels, which
    # defeats the deterministic flag above, so it must stay off.
    torch.backends.cudnn.benchmark = False
    
#Choose Criterions for the Training Loop
# Each supported CFG.criterion_name maps to the loss class implementing it.
# Classes (not instances) are stored so only the selected loss is constructed.
_CRITERIONS = {
    'BiTemperedLoss': BiTemperedLogistic,
    'SymmetricCrossEntropyLoss': SymmetricCrossEntropy,
    'CrossEntropyLoss': nn.CrossEntropyLoss,
    'LabelSmoothingLoss': LabelSmoothingLoss,
    'FocalLoss': FocalLoss,
    'FocalCosineLoss': FocalCosineLoss,
    'TaylorCrossEntropyLoss': TaylorCrossEntropyLoss,
}
if CFG.criterion_name not in _CRITERIONS:
    # Fail here rather than with an AttributeError on CFG.criterion later on.
    raise ValueError(
        f"Unknown criterion_name {CFG.criterion_name!r}; "
        f"expected one of {sorted(_CRITERIONS)}")
# Separate instances for training and validation, as before.
CFG.criterion = _CRITERIONS[CFG.criterion_name]()
CFG.val_criterion = _CRITERIONS[CFG.criterion_name]()
    
def GetScheduler(scheduler_name,optimizer,steps_per_epoch=None):
    """Build the learning-rate scheduler selected by `scheduler_name`.

    Args:
        scheduler_name: One of 'ReduceLROnPlateau', 'CosineAnnealingLR',
            'CosineAnnealingWarmRestarts', 'OneCycleLR'.
        optimizer: Optimizer whose learning rate is scheduled.
        steps_per_epoch: Optimizer steps per epoch for 'OneCycleLR'. When
            omitted, falls back to the module-level ``batches + 1`` set by
            the training loop.
    Returns:
        The configured scheduler.
    Raises:
        ValueError: If `scheduler_name` is not supported.
    """
    if scheduler_name == 'OneCycleLR':
        if steps_per_epoch is None:
            steps_per_epoch = batches + 1
        return torch.optim.lr_scheduler.OneCycleLR(optimizer,max_lr = 1e-2,epochs = CFG.EPOCHS,steps_per_epoch = steps_per_epoch,pct_start = 0.3)
    elif scheduler_name == 'CosineAnnealingWarmRestarts':
        return torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0 = CFG.T_0, T_mult=1, eta_min=0, last_epoch=-1)
    elif scheduler_name == 'CosineAnnealingLR':
        return torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = CFG.T_max, eta_min=0, last_epoch=-1)
    elif scheduler_name == 'ReduceLROnPlateau':
        return torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,factor=0.1, patience=1, threshold=0.0001, cooldown=0, min_lr=CFG.LR_MIN, eps=CFG.eps)
    # Previously fell through and returned None, failing later at .step().
    raise ValueError(f"Unknown scheduler_name {scheduler_name!r}")
    
def GetOptimizer(optimizer_name,parameters):
    """Build the optimizer selected by `optimizer_name`.

    Args:
        optimizer_name: 'Adam' or 'Ranger'.
        parameters: Iterable of parameters to optimise.
    Returns:
        The configured optimizer, using CFG.LR / CFG.weight_decay / CFG.eps.
    Raises:
        ValueError: If `optimizer_name` is not supported.
    """
    if optimizer_name == 'Adam':
        return torch.optim.Adam(parameters, lr=CFG.LR, weight_decay=CFG.weight_decay, amsgrad=False)
    elif optimizer_name == 'Ranger':
        return Ranger(parameters,lr = CFG.LR,alpha = 0.5, k = 6,N_sma_threshhold = 5,betas = (0.95,0.999),eps=CFG.eps,weight_decay=CFG.weight_decay)
    # Previously fell through and returned None.
    raise ValueError(f"Unknown optimizer_name {optimizer_name!r}")

# Seed every RNG once, before any dataset split or model is created.
SEED = CFG.SEED
seed_everything(SEED)  
# Use the first CUDA device when available, otherwise the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Datasets

In [None]:
# ====================================================
# Datasets
# ====================================================
class GetData(Dataset):
    """Image dataset whose loading/augmentation is selected by `Type`.

    Args:
        Dir: Directory containing the image files.
        FNames: Indexable collection of file names relative to `Dir`.
        labels: Indexable collection of labels aligned with `FNames`.
        Type: Mode string, matched by substring in this order: 'train',
            'lightaug', 'autoaugment', 'valid', 'tr-tst', 'test'.
    """

    def __init__(self, Dir, FNames, labels,Type):
        self.dir = Dir
        self.fnames = FNames
        self.lbs = labels
        self.type = Type
        self.auto_augment = timm.data.auto_augment.auto_augment_transform('originalr',None)

    def __len__(self):
        return len(self.fnames)

    def __getitem__(self, index):
        """Return ``(image, label)``, or ``(image, file name)`` in 'test' mode.

        Raises:
            ValueError: If the dataset type matches none of the known modes.
        """
        path = os.path.join(self.dir, self.fnames[index])
        if "train" in self.type:
            aug_data = train_transforms(image = imread(path))
            return aug_data['image'], self.lbs[index]
        elif "lightaug" in self.type:
            aug_data = light_transforms(image = imread(path))
            return aug_data['image'], self.lbs[index]
        elif "autoaugment" in self.type:
            # The timm auto-augment policy is applied to a PIL image; the
            # result is converted to a float32 array for the albumentations step.
            aug_image = self.auto_augment(Image.open(path))
            aug_data = image_net_post(image = np.asarray(aug_image,dtype = np.float32))
            return aug_data['image'], self.lbs[index]
        elif "valid" in self.type:
            aug_data = valid_transforms(image = imread(path))
            return aug_data['image'], self.lbs[index]
        elif "tr-tst" in self.type:
            # Raw image plus label; augmentation is left to the caller.
            return imread(path), self.lbs[index]
        elif "test" in self.type:
            # Raw image plus its file name.
            return imread(path), self.fnames[index]
        # Previously an unknown type silently returned None.
        raise ValueError(f"Unknown dataset type {self.type!r}")

# CV SPLIT

In [None]:
# ====================================================
# CV Split
# ====================================================
DATA_PATH = '../input/cassava-leaf-disease-classification/'
TRAIN_DIR = DATA_PATH + 'train_images/'
DATA_PATH_2019 = '../input/cassava-leaf-disease-merged/'
TRAIN_DIR_2019 = DATA_PATH_2019 + 'train/'
TEST_DIR = DATA_PATH + 'test_images/'

# Validation folds are drawn from train.csv only, so that no 2019 image can
# end up in a validation split; all 2019 rows are added to every training split.
train_df = pd.read_csv(DATA_PATH + 'train.csv')
train_df_merged = pd.read_csv(DATA_PATH_2019 + 'merged.csv')
train_df_2019 = train_df_merged.loc[train_df_merged.source == 2019]
skf = StratifiedKFold(n_splits=CFG.N_FOLDS, shuffle=True, random_state=CFG.SEED)
folds = list(skf.split(np.arange(train_df.shape[0]), train_df['label']))

# The training indices are later used with train_df_merged.iloc, so the 2019
# rows must be addressed by their position inside train_df_merged. Positions
# counted inside the filtered train_df_2019 frame would select unrelated rows.
idx_2019 = np.flatnonzero((train_df_merged.source == 2019).to_numpy())
folds_2019 = [idx_2019 for _ in range(CFG.N_FOLDS)]

for i in range(CFG.N_FOLDS):
    (idxT,idxV) = folds[i]
    # NOTE(review): idxT holds positions in train_df but is applied to
    # train_df_merged; this assumes merged.csv starts with the train.csv rows
    # in the same order -- confirm against the dataset.
    folds[i] = (np.concatenate((idxT,folds_2019[i])),idxV)
    (idxT,idxV) = folds[i]
    # Class counts of the training and validation part of each fold.
    print(np.bincount(train_df_merged['label'].iloc[idxT]),np.bincount(train_df['label'].iloc[idxV]))

# Augmentations

In [None]:
# ====================================================
# Augmentations
# ====================================================
# Shared building blocks reused by the pipelines below.
# Normalisation with the configured mean/std on 0-255 pixel values.
Aug_Norm = A.Normalize(mean=CFG.IMG_MEAN, std=CFG.IMG_STD, max_pixel_value=255.0, p=1.0)
# Random rectangular dropout, applied with probability 0.4; hole sizes are
# 8-11% of the configured image size.
Drop_Rand = A.CoarseDropout(max_holes=16, max_height=int(0.11*CFG.HEIGHT), max_width=int(0.11*CFG.WIDTH),
                            min_holes=4, min_height=int(0.08*CFG.HEIGHT), min_width=int(0.08*CFG.WIDTH),
                            fill_value=0, mask_fill_value=0, always_apply=False, p=0.4)
# Plain random crop to the configured size (no rescaling).
Rand_Crop = A.RandomCrop(height= CFG.HEIGHT, width = CFG.WIDTH,always_apply=True, p=1.0)
# Random crop that is resized to the configured size.
Resize_Crop = A.RandomResizedCrop(CFG.HEIGHT, CFG.WIDTH,p=1.0)
# Pipeline used by GetData for the 'train' type: flips, colour jitter, affine
# jitter, crop, dropout, normalisation, tensor conversion.
train_transforms = Compose([
            A.Transpose(p=0.5),
            A.HorizontalFlip(p=0.5),
            A.VerticalFlip(p=0.5),
            A.ColorJitter(brightness=0.10, contrast=0.2, saturation=0.2, hue=0.00, always_apply=False, p=0.5),
            A.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.2, rotate_limit=45, p=0.5),
            Resize_Crop,
            Drop_Rand,
            Aug_Norm,   
            ToTensorV2(p=1.0),
        ], p=1.)

# Pipeline used by GetData for the 'lightaug' type: as above, without colour
# jitter and dropout.
light_transforms = Compose([
            A.Transpose(p=0.5),
            A.HorizontalFlip(p=0.5),
            A.VerticalFlip(p=0.5),
            A.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.2, rotate_limit=45, p=0.5),
            Resize_Crop,
            Aug_Norm,   
            ToTensorV2(p=1.0),
        ], p=1.)

# Pipeline used by GetData for the 'valid' type.
# NOTE(review): this still contains random crops and flips, so validation
# metrics are not deterministic -- confirm this is intended.
valid_transforms = Compose([
            Resize_Crop,
            A.HorizontalFlip(p=0.5),
            A.VerticalFlip(p=0.5),
            Aug_Norm,   
            ToTensorV2(p=1.0),
        ], p=1.)

# Pipeline applied to each test-time-augmentation view.
test_aug = Compose([
            A.HorizontalFlip(p=0.5),
            A.VerticalFlip(p=0.5),
            A.ShiftScaleRotate(p = 1.0),
            A.ColorJitter(brightness=0.1, contrast=0.2, saturation=0.2, hue=0.00, always_apply=False, p=1.0),
            Rand_Crop,
            Aug_Norm,
            ToTensorV2(p=1.0)
        ], p=1.)

# Post-processing applied by GetData after timm auto-augment ('autoaugment' type).
image_net_post = Compose([
            Rand_Crop,
            Drop_Rand,
            Aug_Norm,    
            ToTensorV2(p=1.0)
        ], p=1.)

class UnNormalize(object):
    """Undo a per-channel mean/std normalisation, modifying the tensor in place."""

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Normalized tensor image of size (C, H, W).
        Returns:
            Tensor: The same tensor object with each channel multiplied by
            its std and shifted by its mean.
        """
        stats = zip(self.mean, self.std)
        for channel, (channel_mean, channel_std) in zip(tensor, stats):
            # Inverse of the normalisation step t.sub_(m).div_(s)
            channel.mul_(channel_std).add_(channel_mean)
        return tensor

In [None]:
# Preview: show the first 16 training images after train_transforms in a 4x4 grid.
plt.figure(figsize=(10, 10))
inv_normalize = UnNormalize(mean = CFG.IMG_MEAN,std = CFG.IMG_STD)
for i in range(16):
    ax = plt.subplot(4, 4, i + 1)
    image = imread(TRAIN_DIR+train_df['image_id'].iloc[i])
    aug_image = train_transforms(image = image)['image']
    # Undo the normalisation so the colours are displayable.
    aug_image = inv_normalize(aug_image)
    # Channel-first tensor -> channel-last array for imshow.
    aug_image = np.transpose(aug_image.numpy(),[1,2,0])
    plt.imshow(aug_image)
    label = train_df['label'].iloc[i]
    plt.title(label)
    plt.axis("off")

# Model

In [None]:
# ====================================================
# Model
# ====================================================
class CassavaNet(nn.Module):
    """timm backbone with its last child module replaced by dropout + linear head.

    Args:
        model_name: Name passed to ``timm.create_model``.
        pretrained: Whether timm should load pretrained weights.
    """

    def __init__(self, model_name=CFG.MODEL_NAME, pretrained=CFG.pretrained):
        super().__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained)
        # Width of the features feeding the original classifier: use `fc`
        # when the model has one, otherwise the last child module. Only a
        # missing attribute triggers the fallback; other errors propagate.
        try:
            self.n_features = self.model.fc.in_features
        except AttributeError:
            self.n_features = list(self.model.children())[-1].in_features
        # Keep every child except the last one as the feature extractor.
        self.model = torch.nn.Sequential(*(list(self.model.children())[:-1]))
        self.drop_fc = nn.Dropout(p = 0.3)
        self.out = nn.Linear(self.n_features, CFG.N_CLASSES)

    def forward(self, x):
        """Return class logits for the image batch `x`."""
        x = self.model(x)
        x = self.drop_fc(x)
        return self.out(x)

In [None]:
# Build the model once with the default configuration; the architecture is
# printed only when CFG.model_print is enabled.
model = CassavaNet()
if CFG.model_print:
    print(model)

# Train Loop

In [None]:
# ====================================================
# Train Loop
# ====================================================
torch.cuda.empty_cache()
for fold,(idxT, idxV) in enumerate(folds):
    if fold not in CFG.TRAIN_FOLDS:
        continue
    print(fold)
    #______INSTANTIATE TRAINING DATASETS_____
    # NOTE(review): x_train comes from merged.csv but the images are read from
    # TRAIN_DIR; TRAIN_DIR_2019 is defined and never used -- confirm the 2019
    # images are reachable from TRAIN_DIR.
    x_train = retrieve_df(train_df_merged,'image_id',idxT)
    y_train = retrieve_df(train_df_merged,'label',idxT)
    x_val = retrieve_df(train_df,'image_id',idxV)
    y_val = retrieve_df(train_df,'label',idxV)
    train_set = GetData(TRAIN_DIR, x_train, y_train, Type = CFG.TRAIN_AUG_TYPE)
    train_loader = DataLoader(train_set, batch_size=CFG.BATCH_SIZE, shuffle=True, num_workers=CFG.NUM_WORKERS,pin_memory = True)
    val_set = GetData(TRAIN_DIR, x_val, y_val, Type = CFG.VALID_AUG_TYPE)
    # Validation order does not matter, so no shuffling is needed.
    val_loader = DataLoader(val_set, batch_size=CFG.BATCH_SIZE, shuffle=False, num_workers=CFG.NUM_WORKERS,pin_memory = True)
    # `batches` is also read by GetScheduler (OneCycleLR steps per epoch).
    batches = math.floor(len(x_train)/CFG.BATCH_SIZE)
    val_batches = math.floor(len(x_val)/CFG.BATCH_SIZE)

    #INSTANTIATE FOLD MODEL
    model = CassavaNet(model_name=CFG.MODEL_NAME, pretrained=CFG.pretrained).to(DEVICE)
    criterion = CFG.criterion.to(DEVICE)
    val_criterion = CFG.val_criterion.to(DEVICE)

    optimizer = GetOptimizer(CFG.optimizer_name, model.parameters())
    scheduler = GetScheduler(CFG.scheduler_name, optimizer)
    # ReduceLROnPlateau needs a metric and is stepped once per epoch on the
    # validation loss; every other scheduler keeps the per-batch stepping.
    step_per_batch = CFG.scheduler_name != 'ReduceLROnPlateau'

    saved_model = None
    best_val_acc = 0.0
    best_val_loss = 1e3
    fold_patience = 0
    for epoch in range(CFG.EPOCHS):
        #______TRAINING______
        tr_loss = 0.0
        cur_step = 0
        scores = []
        model.train()
        progress = tqdm(enumerate(train_loader), desc="Loss: ", total=len(train_loader))
        for i, (images,labels) in progress:
            images = images.to(DEVICE)
            labels = labels.to(DEVICE)

            logits = model(images)
            loss = criterion(logits, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # argmax over the class dimension; softmax does not change it.
            preds = logits.argmax(dim = 1)
            scores.append(accuracy_metric(preds,labels))

            tr_loss += loss.detach().item()
            cur_step = i+1
            trn_epoch_result = dict()
            trn_epoch_result['Epoch'] = epoch
            trn_epoch_result['train_loss'] = round(tr_loss/cur_step, 4)
            trn_epoch_result['train_acc'] = round(np.mean(scores), 4)

            if step_per_batch:
                scheduler.step()
            progress.set_description(str(trn_epoch_result))
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        #______VALIDATION_______
        val_loss = 0.0
        cur_step = 0
        scores = []
        model.eval()
        val_progress = tqdm(enumerate(val_loader), desc="Loss: ", total=len(val_loader))
        with torch.no_grad():
            for i, (images,labels) in val_progress:
                images = images.to(DEVICE)
                labels = labels.to(DEVICE)

                logits = model(images)
                val_loss_value = val_criterion(logits,labels)
                val_loss += val_loss_value.detach().item()

                preds = logits.argmax(dim = 1)
                scores.append(accuracy_metric(preds,labels))

                cur_step = i + 1
                val_epoch_result = dict()
                val_epoch_result['Epoch'] = epoch
                val_epoch_result['val_loss'] = round(val_loss/cur_step, 4)

                val_epoch_result['val_acc'] = round(np.mean(scores), 4)
                val_progress.set_description(str(val_epoch_result))

        epoch_val_loss = val_loss/max(cur_step, 1)
        epoch_val_acc = np.mean(scores)
        if not step_per_batch:
            scheduler.step(epoch_val_loss)

        if epoch_val_loss < best_val_loss or epoch_val_acc > best_val_acc:
            fold_patience = 0
            # Track each best independently so that an improvement in one
            # metric does not overwrite the other with a worse value.
            best_val_loss = min(best_val_loss, epoch_val_loss)
            best_val_acc = max(best_val_acc, epoch_val_acc)
            checkpoint_name = f'{CFG.MODEL_NAME}_f{fold}_b{round(epoch_val_acc, 4)}.pth'
            torch.save({'model': model.state_dict(), 
                        'preds': preds},
                        checkpoint_name)
            # Remove the previous checkpoint, unless the new file has the same
            # name (same rounded accuracy) and has just overwritten it.
            if saved_model is not None and saved_model != checkpoint_name:
                os.remove("./"+saved_model)
            saved_model = checkpoint_name
            print(f'Model Saved at {round(epoch_val_acc, 5)} accuracy')
        else:
            fold_patience += 1
            if fold_patience >= CFG.PATIENCE:
                print(f'Early stopping due to model not improving for {CFG.PATIENCE} epochs')
                break
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# TTA Validation

In [None]:
# ====================================================
# TTA Validation
# ====================================================
if CFG.TEST_TTA:
    submission = pd.DataFrame()
    list_files = os.listdir(TRAIN_DIR)
    submission['image_id'] = pd.Series(list_files)
    submission['label'] = 0
    if saved_model is None:
        raise RuntimeError("No checkpoint was saved by the training loop; nothing to evaluate")
    models= []
    # The weights come from the checkpoint, so skip the pretrained download.
    best_model = CassavaNet(pretrained=False)
    info = torch.load(saved_model,map_location = torch.device(DEVICE))
    best_model.load_state_dict(info['model'])
    models.append(best_model)
    # Move to the device and switch to eval mode once, not once per image.
    for tta_model in models:
        tta_model.to(DEVICE)
        tta_model.eval()

    #IF TTA AVERAGED
    TEST = False
    if not TEST:
        start_time = time.time()
        BATCH_SIZE = 1
        val_set = GetData(TRAIN_DIR, x_val, y_val, Type = 'tr-tst')
        val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=8,pin_memory = True)

        scores = []
        N_TTA = CFG.N_TTA
        class_ids = range(CFG.N_CLASSES)
        count = {c: 0 for c in class_ids}        # samples seen per true class
        right_count = {c: 0 for c in class_ids}  # correct predictions per true class
        val_tta = {'Acc_'+str(c): 0 for c in class_ids}
        val_tta["Acc_T"] = 0
        tta_progress = tqdm(enumerate(val_loader), desc="Loss: ", total=len(x_val))
        # Rows: true class, columns: predicted class.
        confusion_matrix = np.zeros((CFG.N_CLASSES,CFG.N_CLASSES))
        with torch.no_grad():
            for i, (images,labels) in tta_progress:
                # Batch size is 1: take the single raw image without assuming
                # a fixed 600x800 resolution.
                img_np = images[0].numpy()
                aug_images = np.zeros((N_TTA,CFG.CHANNELS,CFG.HEIGHT,CFG.WIDTH))
                for aug_no in range(N_TTA):
                    aug_data = test_aug(image = img_np)
                    aug_images[aug_no,:,:,:] = aug_data['image'].numpy()
                aug_images = torch.from_numpy(aug_images).to(torch.float32).to(DEVICE)

                voting = np.zeros((len(models),N_TTA,CFG.N_CLASSES))
                for model_no, model in enumerate(models):
                    logits = model(aug_images)
                    voting[model_no,:,:] = F.softmax(logits, dim=1).cpu().numpy()

                # Average over the augmented views, then over the models.
                voting = np.sum(voting,axis = 1) / N_TTA
                voting = np.sum(voting,axis = 0) / len(models)
                label = int(np.argmax(voting))
                true_label = int(labels[0])
                count[true_label] += 1
                confusion_matrix[true_label,label] += 1
                if label == true_label:
                    right_count[label] += 1
                # Only the true class's counters changed, so refresh that
                # class's running accuracy (count is >= 1 at this point).
                val_tta['Acc_'+str(true_label)] = round(right_count[true_label]/count[true_label],3)
                val_tta['Acc_T'] = round(sum(right_count.values())/sum(count.values()),4)
                tta_progress.set_description(str(val_tta))
        print(time.time()-start_time)
        print(round(sum(right_count.values())/sum(count.values()),4))
        # Row-normalise without touching the raw counts; rows of classes with
        # no samples stay 0 instead of becoming NaN.
        row_totals = confusion_matrix.sum(axis=1, keepdims=True)
        new_cm = np.divide(confusion_matrix, row_totals,
                           out=np.zeros_like(confusion_matrix), where=row_totals > 0)
        print("Confusion Matrix:")
        print(new_cm)