Version 13: EfficientB3NS

Version 14: Skipped because I am superstitious and do not like this number LOL

Version 15: EfficientnetB5NS

Version 16: Changed albumentations and image size to native resolution. Changed to Normalization mean using [this](https://www.kaggle.com/c/cassava-leaf-disease-classification/discussion/207450).

Version 17: Fixed error in albumentation pipeline for normalization.

Version 18-20: Fixing bugs because I want to implement `LabelSmoothing`.

Version 21: 7 Jan fixed label smoothing classes from 2 to 5

Version 22: LIES! Training for more epochs doesn't seem to help on my pipeline! Maybe my scheduler is bad!!! Let us try something else, maybe `Cosine Annealing Warm Restart`? Changed to 256 image size and returned to the normal mean/std for augs.

# Foreword

This is a pipeline for training with PyTorch. If anyone finds any improvement, please comment in the notebook! I have spent a lot of time constructing this pipeline for image classification tasks because I am a strong believer in code reusability. The purpose is to change a minimal amount of code every time you run a new training. Neat and clean code takes one a long way, as ML/DL is more than just knowing the theory; it also includes software engineering.

Much thanks to @shonenkov as my `Trainer Class` is largely modified from his previous works! Also, thanks to the first Quadruple GM @abhishek as his [youtube videos](https://www.youtube.com/user/abhisheksvnit) inspired me to (try my best to) write clean code.

PS: It does not look as neat in notebook format, but I made a GitHub link to store individual classes into respective files, which is a better practice. I will also further attach my detailed explanation of each line when I have the time.


Do give a like/upvote if you feel this is useful to you!

# Check GPU

In [None]:
# Capture the output lines of `nvidia-smi` through the IPython shell escape.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# str.find returns -1 when the substring is absent, so >= 0 means the word
# 'failed' appears somewhere in the command output.
if gpu_info.find('failed') >= 0:
    print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
    print('and then re-execute this cell.')
else:
    print(gpu_info)

# Dependencies and Imports

In [None]:
!pip install -q yamale==3.0.4
!pip install -q scikit-learn==0.23.2
!pip install -q torch==1.7.0
!pip install -q torchvision==0.8.1
!pip install -q albumentations==0.5.1
!pip install -q torchtoolbox==0.1.5

In [None]:
import datetime
import gc
import os
import random
import sys
import time
import warnings
from abc import ABC, abstractmethod
from collections import Counter
from glob import glob
from typing import *
from typing import List, Optional
import albumentations
import cv2

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytz
import seaborn as sns
import sklearn
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtoolbox
import torchvision
import yamale
from albumentations.pytorch.transforms import ToTensorV2
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from torch.optim import *
from torch.utils.data import DataLoader, Dataset, Subset
from torchtoolbox.transform import Cutout
from tqdm import tqdm


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

sys.path.append('../input/hongnangeffnet/gen-efficientnet-pytorch-master-hongnan')
sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')
sys.path.append('../input/autoaug')
import geffnet
import timm
from auto_augment import AutoAugment, Cutout

# Config

In [None]:
class GlobalConfig:
    """Central table of hyper-parameters, component names and paths.

    The ``scheduler``, ``optimizer`` and ``criterion`` attributes each name
    one key of the matching ``*_params`` dict; the Trainer instantiates the
    named class with that entry's keyword arguments.
    """

    seed = 1992
    num_classes = 5
    batch_size = 32
    n_epochs = 10

    
    # Name of the active scheduler; its kwargs are looked up in
    # scheduler_params below and unpacked into the constructor.
    scheduler = 'CosineAnnealingWarmRestarts'
    scheduler_params = {'StepLR': {'step_size':2, 'gamma':0.3, 'last_epoch':-1, 'verbose':True},
                
                'ReduceLROnPlateau': {'mode':'max', 'factor':0.5, 'patience':0, 'threshold':0.0001,
                                      'threshold_mode':'rel', 'cooldown':0, 'min_lr':1e-6,
                                      'eps':1e-08, 'verbose':True},
                
                'CosineAnnealingWarmRestarts': {'T_0':10, 'T_mult':1, 'eta_min':1e-6, 'last_epoch':-1,
                                                'verbose':True}}
    
    # When to call scheduler.step(): after every training batch
    # (train_step_scheduler) and/or once per epoch after validation
    # (val_step_scheduler).
    train_step_scheduler = False  
    val_step_scheduler = True
    
    # Name of the active optimizer and the kwargs for each supported one.
    optimizer = 'AdamW'
    optimizer_params = {'AdamW':{'lr':1e-3, 'betas':(0.9,0.999), 'eps':1e-08,
                                 'weight_decay':0.001,'amsgrad':False},
                       'Adam':{'lr':1e-4, 'betas':(0.9,0.999), 'eps':1e-08,
                                 'weight_decay':0.001,'amsgrad':False}}

    # Name of the active loss and the kwargs for each supported one.
    # NOTE(review): no 'FocalCosineLoss' class is visible in this part of the
    # file -- confirm it is defined before selecting it.
    criterion = 'CrossEntropyLoss'
    criterion_params = {'CrossEntropyLoss': {'weight':None,'size_average':None,
                                             'ignore_index':-100,'reduce':None,
                                             'reduction':'mean'},
                        'LabelSmoothingLoss': {'classes':5, 'smoothing':0.05, 'dim':-1},
                        'FocalCosineLoss': {'alpha':1, 'gamma':2 , 'xent':0.1}}
    
    # image_size is the height/width used by the augmentation pipelines.
    image_size = 256
    resize = 256
    # Crop size per image size; only referenced by a commented-out
    # augmentation in the visible code.
    crop_size = {128:110, 256:200, 456:384, 512:400}
    # verbose toggles console output; progress is printed every
    # verbose_step batches.
    verbose = 1
    verbose_step = 1
    # NOTE(review): make_folds passes a literal 5 to StratifiedKFold rather
    # than reading num_folds -- keep the two in sync.
    num_folds = 5
    image_col_name = 'image_id'
    class_col_name = 'label'
    # NOTE(review): 'test_path' points at a single .jpg file, while
    # Cassava.__getitem__ joins paths['test_path'] with an image id as if it
    # were a directory -- confirm which one is intended.
    paths = {'train_path': '../input/cassava-jpeg-256x256/kaggle/train_images_jpeg',
             'test_path': '../input/cassava-leaf-disease-classification/test_images/2216849948.jpg',
             'csv_path': '../input/cassava-leaf-disease-classification/train.csv',
             'log_path': './log.txt',
             'save_path': './',
             'model_weight_path_folder': '../input/efficientnet-weights'}

    # effnet is read by CustomEfficientNet (geffnet) and used in checkpoint
    # file names; model_name is read by the timm-based model classes.
    effnet = 'tf_efficientnet_b4_ns'
    model_name = 'resnext50_32x4d'
    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
config=GlobalConfig

# Seeding

In [None]:
def seed_all(seed: int = 1930):
    """Seed every random number generator the pipeline relies on.

    Sets PYTHONHASHSEED, seeds Python's ``random``, NumPy and PyTorch
    (CPU and CUDA), and switches cuDNN to deterministic settings
    (deterministic on, benchmark off, cuDNN itself disabled).

    Args:
        seed: The seed value applied to all generators.
    """
    print("Using Seed Number {}".format(seed))

    # Interpreter-level and NumPy sources of randomness.
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: the global one plus the CUDA ones.
    for seeder in (torch.manual_seed,
                   torch.cuda.manual_seed_all,
                   torch.cuda.manual_seed):
        seeder(seed)

    # cuDNN flags chosen for reproducibility.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    cudnn.enabled = False


def seed_worker(_worker_id):
    """Seed NumPy and ``random`` from PyTorch's current initial seed.

    Intended as a ``worker_init_fn``; the worker id argument is ignored.
    The torch seed is reduced modulo 2**32 before being passed on.
    """
    derived_seed = torch.initial_seed() % (2 ** 32)
    random.seed(derived_seed)
    np.random.seed(derived_seed)
    
seed_all(seed=config.seed)

# Utilities

In [None]:
def get_file_type(image_folder_path: str,
                  allowed_extensions: Optional[List] = None):
    """Return the single (lower-cased) file extension used in a folder.

    Args:
        image_folder_path: Directory whose entries are inspected.
        allowed_extensions: Extensions (with leading dot) that are accepted;
            defaults to ``['.jpg', '.png', '.jpeg']``.

    Returns:
        The extension shared by every entry of the folder, e.g. ``'.jpg'``.

    Raises:
        AssertionError: If the folder does not contain exactly one distinct
            extension, or if that extension is not allowed.
    """
    if allowed_extensions is None:
        allowed_extensions = ['.jpg', '.png', '.jpeg']

    extension_counts = Counter(
        os.path.splitext(file_name)[1].lower()
        for file_name in os.listdir(image_folder_path))
    found_extensions = sorted(extension_counts)

    # The whole message lives inside one parenthesised expression so that it
    # is really attached to the assert, and it lists what was found.
    assert len(found_extensions) == 1, (
        "The extension in the folder should all be the same, "
        "but found {} extensions: {}".format(len(found_extensions),
                                             found_extensions))

    extension_type = found_extensions[0]
    assert extension_type in allowed_extensions, (
        "Extension {!r} is not one of the allowed extensions {}".format(
            extension_type, allowed_extensions))
    return extension_type


''' Consider modifying this function below to check if the dataframe's
image id column has extension or not '''


def check_df_ext(df: pd.DataFrame,
                 col_name: str,
                 allowed_extensions: Optional[List] = None):
    """Report whether the image ids in a data frame column carry an extension.

    Args:
        df: Data frame holding the image ids.
        col_name: Name of the column with the image ids.
        allowed_extensions: Accepted extensions (with leading dot); defaults
            to ``['.jpg', '.png', '.jpeg']``.

    Returns:
        ``False`` when the ids have no extension, ``True`` when they all share
        one allowed extension.

    Raises:
        AssertionError: If the ids do not all share the same extension, or if
            the shared extension is not allowed.
    """
    permitted = (['.jpg', '.png', '.jpeg']
                 if allowed_extensions is None else allowed_extensions)

    # splitext yields "" for ids without an extension.
    unique_extensions = {
        os.path.splitext(image_id)[1].lower()
        for image_id in df[col_name].tolist()
    }
    assert len(unique_extensions) == 1, "The extension in the image id should all be the same"

    (shared_extension,) = unique_extensions
    if shared_extension == "":
        return False

    assert shared_extension in permitted
    return True

# Make Folds (Cross Validation)

In [None]:
def make_folds(train_csv: pd.DataFrame, config: type, cv_schema=None) -> pd.DataFrame:
    """Assign a stratified cross-validation fold number to every row.

    Args:
        train_csv: Data frame with the image id and class label columns named
            by ``config.image_col_name`` and ``config.class_col_name``.
        config: Configuration object; ``seed``, the two column names and
            (optionally) ``num_folds`` are read from it. When ``num_folds`` is
            absent, 5 folds are used.
        cv_schema: Currently unused.

    Returns:
        A copy of ``train_csv`` with an extra integer ``'fold'`` column whose
        values run from 1 to the number of folds.
    """
    # TODO: add options for cv_schema.
    df_folds = train_csv.copy()
    n_splits = getattr(config, "num_folds", 5)
    skf = StratifiedKFold(n_splits, shuffle=True, random_state=config.seed)

    # The splitter yields positional indices, so the fold ids are collected
    # in a positional array; this stays correct even when the frame's index
    # is not 0..n-1.
    fold_ids = np.zeros(len(df_folds), dtype=int)
    splits = skf.split(X=df_folds[config.image_col_name],
                       y=df_folds[config.class_col_name])
    for fold_number, (_train_idx, val_idx) in enumerate(splits, start=1):
        fold_ids[val_idx] = fold_number
    df_folds['fold'] = fold_ids

    print(df_folds.groupby(['fold', config.class_col_name]).size())

    return df_folds

train_csv = pd.read_csv(config.paths['csv_path']) 
df_folds = make_folds(train_csv, config)
df_folds

# Augmentations

In [None]:
class Augmentation(ABC):
    """Interface for objects that transform a single image."""

    @abstractmethod
    def augment(self, image):
        """Augment an image and return the result."""
        
class AlbumentationsAugmentation(Augmentation):
    """Augmentation backed by an albumentations ``Compose`` pipeline."""

    def __init__(self, transforms: albumentations.core.composition.Compose):
        """Store the composed pipeline that ``augment`` will call."""
        self.transforms = transforms

    def augment(self, image):
        """Run the pipeline on ``image`` and return its ``"image"`` output."""
        return self.transforms(image=image)["image"]

In [None]:
class augment_config:
    """Lists of albumentations transforms for each stage of the pipeline.

    Each list is applied in order; every list ends with ``ToTensorV2``.
    Sizes come from the module-level ``config.image_size``.
    """

    # Training: random crop/rotation/flips/cutout, then resize, normalise
    # and convert to a tensor.
    train_augmentations =  [#albumentations.RandomSizedCrop(min_max_height=(config.crop_size[config.image_size],config.crop_size[config.image_size]),height=config.image_size, width=config.image_size, p=0.5),
                            albumentations.RandomResizedCrop(height=config.image_size, width=config.image_size),                    
                            albumentations.RandomRotate90(p=0.5),
                            albumentations.VerticalFlip(p=0.5),
                            albumentations.HorizontalFlip(p=0.5),
                            albumentations.Cutout(p=0.5),
                            albumentations.Resize(height=config.image_size, width=config.image_size, p=1.0),
                            albumentations.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], max_pixel_value=255.0, p=1.0),
                            ToTensorV2(p=1.0)]

    # Validation: deterministic resize + the same normalisation as training.
    val_augmentations = [albumentations.Resize(height=config.image_size, width=config.image_size, p=1.0),
                         albumentations.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], max_pixel_value=255.0, p=1.0),
                         ToTensorV2(p=1.0)]

    # NOTE(review): unlike the train/val lists this one has no Normalize
    # step -- confirm that is intended before using it for inference.
    test_augmentations = [albumentations.Resize(height=config.image_size, width=config.image_size, p=1.0),
                          ToTensorV2(p=1.0)]
    

def get_albu_transforms(config):
    """Build the composed training and validation pipelines.

    The transform lists are taken from ``augment_config``; the ``config``
    argument is accepted but not read here.

    Returns:
        A ``(transforms_train, transforms_val)`` tuple of ``Compose`` objects.
    """
    train_steps = list(augment_config.train_augmentations)
    val_steps = list(augment_config.val_augmentations)

    transforms_train = albumentations.Compose(train_steps, p=1.0)
    transforms_val = albumentations.Compose(val_steps, p=1.0)
    return transforms_train, transforms_val

# Dataset

In [None]:
class Cassava(torch.utils.data.Dataset):

    """Dataset yielding ``(image_id, image, label)`` for the Cassava images.

    ``transforms`` is expected to expose an ``augment(image)`` method (see
    ``Augmentation``). When ``meta_features`` is given, the image slot of the
    returned tuple becomes ``(image, meta)``.
    """

    def __init__(self,
                 df: pd.DataFrame,
                 config: type,
                 transforms: type = None,
                 test: bool = False,
                 transform_norm: bool = False, meta_features=None):
        """Construct a Cassava dataset.

        Args:
            df: Data frame with one row per image.
            config: Configuration object; ``paths``, ``image_col_name`` and
                ``class_col_name`` are read from it.
            transforms: Object with an ``augment(image)`` method, or ``None``.
            test: When true, images are read from ``paths['test_path']`` and
                a placeholder label is returned.
            transform_norm: Set to true when ``transforms`` normalises the
                image itself; otherwise pixels are scaled to [0, 1] here.
            meta_features: Optional column names returned alongside the image.
        """

        self.df = df
        self.config = config
        self.transforms = transforms
        self.test = test
        self.transform_norm = transform_norm
        self.meta_features = meta_features

        if self.transforms is None:
            assert self.transform_norm is False
            print('Transforms is None and Transform Normalization is not '
                  'initialized!')

        # The extension is inferred from the training folder and only appended
        # when the ids in the data frame do not already carry one.
        self.image_extension = get_file_type(
            image_folder_path=config.paths['train_path'], allowed_extensions=None)
        self.df_has_ext = check_df_ext(df=self.df, col_name=config.image_col_name)

        if self.df_has_ext is True:
            self.image_extension = ""

    def __len__(self):
        """Get the dataset length."""
        return len(self.df)

    def __getitem__(self, idx: int):
        """Load, convert and transform the image at position ``idx``.

        Returns:
            ``(image_id, image, label)``, or ``(image_id, (image, meta), label)``
            when ``meta_features`` was given.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        image_id = self.df[self.config.image_col_name].values[idx]

        if self.test:
            # A test data frame may have no label column, so a placeholder
            # label keeps the returned tuple shape uniform.
            label = torch.zeros(1)
            image_folder = self.config.paths['test_path']
        else:
            label = torch.as_tensor(
                data=self.df[self.config.class_col_name].values[idx],
                dtype=torch.int64,
                device=None)
            image_folder = self.config.paths['train_path']

        image_path = os.path.join(
            image_folder, "{}{}".format(image_id, self.image_extension))

        # cv2.imread signals failure by returning None instead of raising.
        image = cv2.imread(image_path)
        if image is None:
            raise FileNotFoundError(
                "Could not read image file: {}".format(image_path))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transform_norm is False:
            image = image.astype(np.float32) / 255.0

        if self.transforms is not None:
            image = self.transforms.augment(image)
        else:
            image = torch.as_tensor(data=image,
                                    dtype=torch.float32,
                                    device=None)

        if self.meta_features is not None:
            meta = np.array(self.df.iloc[idx][self.meta_features].values, dtype=np.float32)
            return image_id, (image, meta), label

        return image_id, image, label


# Model Creation

This is a modified version of [Ross Wightman's geffnet](https://github.com/rwightman/gen-efficientnet-pytorch). I merely added some lines of code so that one can load one's own pretrained weights. Also, if one wants to use [his timm module instead](https://github.com/rwightman/pytorch-image-models), then one just needs to change the code below ever so slightly.

In [None]:
class CustomEfficientNet(nn.Module):
    """geffnet backbone with its classifier replaced by a new linear head."""

    def __init__(self, config: type, pretrained: bool=True):
        """Create the backbone named by ``config.effnet``.

        The classifier is swapped for a linear layer with
        ``config.num_classes`` outputs.
        """
        super().__init__()
        self.config = config

        backbone = geffnet.create_model(
            model_name=config.effnet,
            pretrained=pretrained,
            model_weight_path_folder=config.paths['model_weight_path_folder'])
        self.model = backbone

        head_in_features = backbone.classifier.in_features
        backbone.classifier = nn.Linear(head_in_features, config.num_classes)

    def forward(self, input_neurons):
        """Return the backbone's output for ``input_neurons``."""
        # TODO: add dropout layers, or the likes.
        return self.model(input_neurons)

Using `timm` module instead of `geffnet` module (both are from the same author).

In [None]:
print("Available Vision Transformer Models: ")
timm.list_models("vit*")

In [None]:

class CustomResNext(nn.Module):
    """timm backbone whose ``fc`` layer is replaced by a new linear head."""

    def __init__(self,config, pretrained=False):
        """Create the timm model named by ``config.model_name``.

        The ``fc`` layer is replaced by a linear layer with one output per
        class (``config.num_classes``).
        """
        super().__init__()
        self.config = config
        self.model = timm.create_model(model_name=config.model_name, pretrained=pretrained)
        n_features = self.model.fc.in_features
        # The head must emit one logit per class, not one per pixel of
        # image_size.
        self.model.fc = nn.Linear(n_features, config.num_classes)

    def forward(self, x):
        """Return the model output for the batch ``x``."""
        return self.model(x)

In [None]:
class CustomEfficientNet_pytorch_image_models(nn.Module):
    """timm backbone whose ``classifier`` is replaced by a new linear head."""

    def __init__(self, config: type, pretrained: bool=True):
        """Create the timm model named by ``config.model_name``.

        The ``classifier`` attribute of the created model is swapped for a
        linear layer with ``config.num_classes`` outputs.
        """
        super().__init__()
        self.config = config

        backbone = timm.create_model(model_name=config.model_name,
                                     pretrained=pretrained)
        self.model = backbone

        head_in_features = backbone.classifier.in_features
        backbone.classifier = nn.Linear(head_in_features, config.num_classes)

    def forward(self, input_neurons):
        """Return the backbone's output for ``input_neurons``."""
        # TODO: add dropout layers, or the likes.
        return self.model(input_neurons)

# Meters

In [None]:
class AverageLossMeter:
    """Track the latest batch loss and the sample-weighted running average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all accumulated statistics."""
        self.curr_batch_avg_loss = 0
        self.avg = 0
        self.running_total_loss = 0
        self.count = 0

    def update(self, curr_batch_avg_loss: float, batch_size: int):
        """Fold one batch into the running average.

        Args:
            curr_batch_avg_loss: Mean loss over the batch.
            batch_size: Number of samples in the batch; used as the weight of
                this batch in the running average.
        """
        self.curr_batch_avg_loss = curr_batch_avg_loss
        self.running_total_loss += curr_batch_avg_loss * batch_size
        self.count += batch_size
        self.avg = self.running_total_loss / self.count

class AccuracyMeter:
    """Accumulate per-batch accuracy into a sample-weighted average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all accumulated statistics."""
        self.score = 0
        self.count = 0
        self.sum = 0

    def update(self, y_true, y_pred, batch_size=1):
        """Fold one batch of predictions into the running totals.

        Args:
            y_true: Ground-truth labels of the batch.
            y_pred: Predicted labels of the batch.
            batch_size: Number of samples in the batch.
        """
        self.batch_size = batch_size
        self.count += self.batch_size
        # accuracy_score is already a fraction for this batch, so it is
        # weighted by the batch size before being summed.
        self.score = sklearn.metrics.accuracy_score(y_true, y_pred)
        self.sum += self.score * self.batch_size

    @property
    def avg(self):
        """Average accuracy over all samples seen so far (0.0 before any update)."""
        if self.count == 0:
            # Nothing accumulated yet: report 0 rather than dividing by zero,
            # as AverageLossMeter does for its own average.
            self.avg_score = 0.0
        else:
            self.avg_score = self.sum / self.count
        return self.avg_score

# Loss

In [None]:
# ====================================================
# Label Smoothing
# ====================================================
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed target distribution.

    The true class receives probability ``1 - smoothing``; the remaining
    ``smoothing`` mass is spread evenly over the other ``classes - 1`` classes.
    """

    def __init__(self, classes=2, smoothing=0.0, dim=-1):
        """Store the smoothing configuration.

        Args:
            classes: Number of classes.
            smoothing: Probability mass taken away from the true class.
            dim: Dimension of the input that indexes the classes.
        """
        super().__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.classes = classes
        self.dim = dim

    def forward(self, input, target):
        """Return the mean smoothed cross-entropy.

        Args:
            input: Raw logits with the classes along ``self.dim``.
            target: Integer class indices, shaped like ``input`` without the
                class dimension.
        """
        pred = input.log_softmax(dim=self.dim)
        with torch.no_grad():
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (self.classes - 1))
            # Scatter along the class dimension itself (not a fixed dim 1) so
            # the loss also works when the classes are not on axis 1.
            true_dist.scatter_(self.dim, target.unsqueeze(self.dim), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))

# Callbacks

In [None]:
from enum import Enum
from typing import Union



class Mode(Enum):
    """Direction of improvement for a monitored quantity.

    Each member's value is the worst possible score in that direction and is
    used as the initial best score.
    """

    MIN = np.inf
    MAX = -np.inf


class EarlyStopping:

    """Stop training once the monitored score stops improving."""

    mode_dict = {'min': np.inf, 'max': -np.inf}

    def __init__(self,
                 patience: int = 5,
                 mode: Mode = Mode.MIN,
                 min_delta: float = 1e-5):
        """Construct an EarlyStopping instance.

        Arguments:
            patience : Number of consecutive epochs with no improvement after
                       which training will be stopped. (Default = 5)
            mode : ``Mode.MIN`` or ``Mode.MAX`` (the strings "min" / "max"
                   are accepted as well). In min mode, training stops when
                   the monitored quantity has stopped decreasing; in max
                   mode, when it has stopped increasing.
            min_delta : Minimum change in the monitored quantity to
                        qualify as an improvement.
        """
        if isinstance(mode, str):
            # Accept the spelled-out names as a convenience.
            mode = Mode[mode.upper()]

        self.patience = patience
        self.mode = mode
        self.min_delta = min_delta
        self.stopping_counter = 0
        self.early_stop = False
        self.best_score = mode.value

    def improvement(self, curr_epoch_score: Union[float, int],
                    curr_best_score: Union[float, int]):
        """Return whether the current score beats the best by ``min_delta``."""
        if self.mode == Mode.MIN:
            return curr_epoch_score <= (curr_best_score - self.min_delta)

        return curr_epoch_score >= (curr_best_score + self.min_delta)

    @property
    def monitor_op(self):
        """The worst possible score for the configured mode."""
        return self.mode.value

    def should_stop(self, curr_epoch_score):
        """Update the stopping state with this epoch's score.

        Arguments:
            curr_epoch_score : The value of the metric or loss being
                               monitored for this epoch.

        Returns:
            A ``(best_score, early_stop)`` tuple: the best score seen so far
            and whether training should stop.
        """
        if self.improvement(curr_epoch_score=curr_epoch_score,
                            curr_best_score=self.best_score):
            self.best_score = curr_epoch_score
            # Patience counts consecutive epochs without improvement, so any
            # improvement starts the count over.
            self.stopping_counter = 0
        else:
            self.stopping_counter += 1
            print("Early Stopping Counter {} out of {}".format(
                self.stopping_counter, self.patience))

        if self.stopping_counter >= self.patience:
            print("Early Stopping and since it is early stopping, we will not "
                  "save the model since the metric has not improved for {} "
                  "epochs".format(self.patience))
            # The Trainer checks this flag and stops training once it is set.
            self.early_stop = True

        return self.best_score, self.early_stop

# Trainer (Alex Style)

In [None]:
class Trainer:

    """A class to perform model training."""

    def __init__(self, model, config, early_stopping=None):
        """Construct a Trainer instance.

        Args:
            model: The model to train.
            config: Configuration object naming the criterion, optimizer and
                scheduler classes and holding their keyword arguments.
            early_stopping: Optional object whose ``should_stop`` method is
                consulted after every validation epoch.

        Raises:
            AttributeError: If ``config.criterion`` names neither a
                ``torch.nn`` class nor a class defined in this module.
        """
        self.model = model

        self.config = config
        self.early_stopping = early_stopping
        self.epoch = 0
        self.best_auc = 0
        self.best_acc = 0
        self.best_loss = np.inf

        # Built-in losses live in torch.nn; custom ones (for example a loss
        # class defined in this file) are looked up among the module globals.
        criterion_cls = getattr(torch.nn, config.criterion, None)
        if criterion_cls is None:
            criterion_cls = globals().get(config.criterion)
        if criterion_cls is None:
            raise AttributeError(
                "Unknown criterion {!r}: not found in torch.nn or in this "
                "module".format(config.criterion))
        self.criterion = criterion_cls(**config.criterion_params[config.criterion])
        if isinstance(self.criterion, torch.nn.Module):
            self.criterion = self.criterion.to(self.config.device)

        self.optimizer = getattr(torch.optim, config.optimizer)(
            self.model.parameters(), **config.optimizer_params[config.optimizer])
        self.scheduler = getattr(torch.optim.lr_scheduler, config.scheduler)(
            optimizer=self.optimizer, **config.scheduler_params[config.scheduler])

        self.val_predictions = None
        self.monitored_metrics = None
        # The date is part of the best-accuracy checkpoint file name.
        self.date = datetime.datetime.now(pytz.timezone("Asia/Singapore")).strftime("%Y-%m-%d")

        self.log("Trainer prepared. We are using {} device.".format(
            self.config.device))

    def fit(self, train_loader, val_loader, fold: int):
        """Train and validate the model on one fold.

        Runs up to ``config.n_epochs`` epochs. After each epoch the best
        validation loss, accuracy and ROC AUC are tracked; a checkpoint is
        written whenever validation accuracy improves, and (when early
        stopping is enabled) whenever validation loss improves.

        Args:
            train_loader: Loader yielding training batches.
            val_loader: Loader yielding validation batches.
            fold: Fold number, used in log messages and checkpoint names.

        Returns:
            The checkpoint dict loaded from the best-accuracy checkpoint file.
        """
        self.log("Training on Fold {} and using {}".format(fold, self.config.effnet))

        save_dir = self.config.paths["save_path"]
        best_acc_path = os.path.join(
            save_dir,
            "{}_{}_best_acc_fold_{}.pt".format(self.date, self.config.effnet, fold))
        best_loss_path = os.path.join(
            save_dir,
            "{}_best_loss_fold_{}.pt".format(self.config.effnet, fold))

        for _epoch in range(self.config.n_epochs):
            # Log the learning rate and a timestamp at the start of the epoch.
            lr = self.optimizer.param_groups[0]["lr"]
            timestamp = datetime.datetime.now(pytz.timezone("Asia/Singapore")).strftime("%Y-%m-%d %H-%M-%S")
            self.log("\n{}\nLR: {}".format(timestamp, lr))

            # Train for one epoch and time it.
            train_start_time = time.time()
            avg_train_loss, avg_train_acc_score = self.train_one_epoch(
                train_loader)
            train_end_time = time.time()

            train_elapsed_time = time.strftime(
                "%H:%M:%S", time.gmtime(train_end_time - train_start_time))
            self.log(
                "[RESULT]: Train. Epoch {} | Avg Train Summary Loss: {:.6f} | "
                "Train Accuracy: {:6f} | Time Elapsed: {}".format(
                    self.epoch + 1, avg_train_loss, avg_train_acc_score,
                    train_elapsed_time))

            # Validate and time it; the softmax predictions are kept so they
            # can be stored in the checkpoint.
            val_start_time = time.time()
            avg_val_loss, avg_val_acc_score, val_predictions, val_roc_auc = \
                self.valid_one_epoch(val_loader)
            self.val_predictions = val_predictions
            val_end_time = time.time()
            val_elapsed_time = time.strftime(
                "%H:%M:%S", time.gmtime(val_end_time - val_start_time))

            self.log("[RESULT]: Validation. Epoch: {} | "
                     "Avg Validation Summary Loss: {:.6f} | "
                     "Validation Accuracy: {:.6f} | Validation ROC: {:.6f} | Time Elapsed: {}".format(
                         self.epoch + 1, avg_val_loss, avg_val_acc_score,
                         val_roc_auc,
                         val_elapsed_time))

            # The metric watched by early stopping and ReduceLROnPlateau.
            self.monitored_metrics = avg_val_acc_score

            # Update the loss and AUC records before any checkpoint is
            # written, so saved checkpoints carry the current values.
            loss_improved = avg_val_loss < self.best_loss
            if loss_improved:
                self.best_loss = avg_val_loss
            if val_roc_auc > self.best_auc:
                self.best_auc = val_roc_auc

            if self.early_stopping is not None:
                _best_score, early_stop = self.early_stopping.should_stop(
                    curr_epoch_score=self.monitored_metrics)
                if loss_improved:
                    self.save(best_loss_path)
                if early_stop:
                    break

            if self.best_acc < avg_val_acc_score:
                self.best_acc = avg_val_acc_score
                self.save(best_acc_path)

            if self.config.val_step_scheduler:
                if isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                    self.scheduler.step(self.monitored_metrics)
                else:
                    self.scheduler.step()

            # Epoch finished: advance the epoch counter.
            self.epoch += 1

        # Training for this fold is over; reload the best-accuracy checkpoint
        # written above and hand it back for further use.
        curr_fold_best_checkpoint = self.load(best_acc_path)
        return curr_fold_best_checkpoint

    def train_one_epoch(self, train_loader):
        """Run one optimisation pass over ``train_loader``.

        Returns:
            A ``(average loss, average accuracy)`` tuple for the epoch.
        """
        self.model.train()

        loss_meter = AverageLossMeter()
        acc_meter = AccuracyMeter()
        epoch_start = time.time()
        device = self.config.device

        for step, (_image_ids, images, labels) in enumerate(train_loader):
            images = images.to(device).float()
            labels = labels.to(device)
            batch_size = labels.shape[0]

            # Forward pass and loss.
            logits = self.model(images)
            loss = self.criterion(input=logits, target=labels)
            loss_meter.update(loss.item(), batch_size)

            # Backward pass and parameter update.
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # Batch accuracy from the arg-max of the softmax probabilities.
            probabilities = torch.nn.Softmax(dim=1)(input=logits).to("cpu").detach().numpy()
            acc_meter.update(labels.cpu().numpy(),
                             np.argmax(a=probabilities, axis=1),
                             batch_size=batch_size)

            if self.config.train_step_scheduler:
                self.scheduler.step()

            end_time = time.time()

            if self.config.verbose and (step % self.config.verbose_step) == 0:
                print(
                    f"Train Steps {step}/{len(train_loader)}, "
                    f"summary_loss: {loss_meter.avg:.3f}, "
                    f"acc: {acc_meter.avg:.3f} "
                    f"time: {(end_time - epoch_start):.3f}",
                    end="\r",
                )

        return loss_meter.avg, acc_meter.avg

    def valid_one_epoch(self, val_loader):
        """Evaluate the model on ``val_loader`` without gradient tracking.

        Returns:
            A tuple ``(average loss, average accuracy, softmax predictions,
            ROC AUC)``; the predictions are the concatenated per-batch
            softmax outputs.
        """
        self.model.eval()

        # Running metrics for this epoch.
        summary_loss = AverageLossMeter()
        accuracy_scores = AccuracyMeter()

        start_time = time.time()

        val_gt_label_list, val_preds_softmax_list, val_preds_roc_list, val_preds_argmax_list = [], [], [], []

        with torch.no_grad():
            for step, (_image_ids, images, labels) in enumerate(val_loader):

                images = images.to(self.config.device).float()
                labels = labels.to(self.config.device)
                batch_size = labels.shape[0]
                logits = self.model(images)
                loss = self.criterion(input=logits, target=labels)
                summary_loss.update(loss.item(), batch_size)
                y_true = labels.cpu().numpy()
                softmax_preds = torch.nn.Softmax(dim=1)(input=logits).to("cpu").numpy()
                # Column 1 is used as the score in the two-class ROC case.
                positive_class_preds = softmax_preds[:,1]
                y_preds = np.argmax(a=softmax_preds, axis=1)
                accuracy_scores.update(y_true, y_preds, batch_size=batch_size)
                val_preds_roc_list.append(positive_class_preds)
                val_gt_label_list.append(y_true)
                val_preds_softmax_list.append(softmax_preds)
                val_preds_argmax_list.append(y_preds)
                end_time = time.time()

                # Read the verbosity settings from this trainer's own config
                # rather than from a module-level variable.
                if self.config.verbose and (step % self.config.verbose_step) == 0:
                    print(
                        f"Validation Steps {step}/{len(val_loader)}, " +
                        f"summary_loss: {summary_loss.avg:.3f}, val_acc: {accuracy_scores.avg:.6f} "
                        + f"time: {(end_time - start_time):.3f}",
                        end="\r",
                    )

        val_gt_label_array = np.concatenate(val_gt_label_list, axis=0)
        val_preds_softmax_array = np.concatenate(val_preds_softmax_list, axis=0)
        val_preds_argmax_array = np.concatenate(val_preds_argmax_list, axis=0)
        val_preds_roc_array = np.concatenate(val_preds_roc_list, axis=0)
        print(val_gt_label_array)
        print(val_preds_argmax_array)

        if self.config.num_classes > 2:
            # Multi-class: one-vs-rest AUC over the full softmax matrix.
            val_roc_auc_score = sklearn.metrics.roc_auc_score(
                y_true=val_gt_label_array,
                y_score=val_preds_softmax_array,
                multi_class='ovr')
        else:
            val_roc_auc_score = sklearn.metrics.roc_auc_score(
                y_true=val_gt_label_array, y_score=val_preds_roc_array)

        return summary_loss.avg, accuracy_scores.avg, val_preds_softmax_array, val_roc_auc_score

    def save_model(self, path):
        """Save the trained model."""
        self.model.eval()
        torch.save(self.model.state_dict(), path)

    # will save the weight for the best val loss and corresponding oof preds
    def save(self, path):
        """Save the weight for the best evaluation loss."""
        self.model.eval()
        torch.save(
            {
                "model_state_dict": self.model.state_dict(),
                "optimizer_state_dict": self.optimizer.state_dict(),
                "scheduler_state_dict": self.scheduler.state_dict(),
                "best_acc": self.best_acc,
                "best_auc": self.best_auc,
                "best_loss": self.best_loss,
                "epoch": self.epoch,
                "oof_preds": self.val_predictions,
            },
            path,
        )

    def load(self, path):
        """Load a model checkpoint from the given path."""
        checkpoint = torch.load(path)
        return checkpoint


    def log(self, message):
        """Log a message."""
        if self.config.verbose:
            print(message)
        with open(self.config.paths['log_path'], "a+") as logger:
            logger.write(f"{message}\n")

# Train on Folds

In [None]:
def train_on_fold(df_folds: pd.DataFrame, config, fold: int,
                  debug: bool = True, debug_samples: int = 32 * 32):
    """Train a fresh model on one cross-validation fold and return its OOF rows.

    Rows of ``df_folds`` whose ``"fold"`` column equals ``fold`` form the
    validation split; every other row forms the training split.

    Args:
        df_folds: Frame containing a ``"fold"`` column plus whatever columns
            the ``Cassava`` dataset reads.
        config: Project configuration object. This function reads
            ``device``, ``batch_size`` and ``num_classes`` from it and passes
            it on to the model, transform, dataset and trainer constructors.
        fold: Fold id to hold out for validation.
        debug: When true (the default, matching the previously hard-coded
            behaviour) both splits are randomly sub-sampled to at most
            ``debug_samples`` rows for a quick run.
        debug_samples: Upper bound on the rows kept per split in debug mode.

    Returns:
        The validation frame with one extra column per class (named
        ``"0"``, ``"1"``, ...) filled from the checkpoint's ``"oof_preds"``
        array, and a ``"preds"`` column holding its row-wise argmax.
    """
    model = CustomEfficientNet(config=config, pretrained=True)
    # Alternative backbone: CustomResNext(config=config, pretrained=True)
    model.to(config.device)

    transforms_train, transforms_val = get_albu_transforms(config)

    train_df = df_folds[df_folds["fold"] != fold]
    val_df = df_folds[df_folds["fold"] == fold]
    if debug:
        # DataFrame.sample raises ValueError when asked for more rows than
        # the frame holds (replace=False), so cap the request at the split
        # size instead of always asking for a fixed number of rows.
        train_df = train_df.sample(min(debug_samples, len(train_df)))
        val_df = val_df.sample(min(debug_samples, len(val_df)))
    else:
        train_df = train_df.reset_index(drop=True)
        val_df = val_df.reset_index(drop=True)

    train_set = Cassava(
        train_df,
        config,
        transforms=AlbumentationsAugmentation(transforms=transforms_train),
        transform_norm=True,
        meta_features=None,
    )
    train_loader = DataLoader(
        train_set,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=4,
        worker_init_fn=seed_worker,
    )

    val_set = Cassava(
        val_df,
        config,
        transforms=AlbumentationsAugmentation(transforms=transforms_val),
        transform_norm=True,
        meta_features=None,
    )
    # shuffle=False keeps the validation batches in val_df row order, which
    # the positional column assignment below relies on.
    val_loader = DataLoader(
        val_set,
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=4,
        worker_init_fn=seed_worker,
    )

    trainer = Trainer(model=model, config=config)

    # NOTE(review): presumably `fit` returns the best checkpoint dict for
    # this fold -- confirm in Trainer.fit.
    curr_fold_best_checkpoint = trainer.fit(train_loader, val_loader, fold)

    oof_preds = curr_fold_best_checkpoint["oof_preds"]
    class_columns = [str(c) for c in range(config.num_classes)]
    val_df[class_columns] = oof_preds
    val_df["preds"] = oof_preds.argmax(1)

    return val_df



def get_acc_score(config, result_df):
    """Compute the accuracy of the predictions stored in ``result_df``.

    Args:
        config: Object whose ``class_col_name`` names the ground-truth column.
        result_df: Frame with a ``"preds"`` column and the ground-truth column.

    Returns:
        The value returned by ``sklearn.metrics.accuracy_score``.
    """
    predicted = result_df["preds"].values
    truth = result_df[config.class_col_name].values
    return sklearn.metrics.accuracy_score(y_true=truth, y_pred=predicted)

def get_roc_score(config, result_df):
    """Get the ROC AUC of the predictions stored in ``result_df``.

    For more than two classes the full score matrix (columns ``"0"`` ...
    ``str(num_classes - 1)``) is passed with one-vs-rest averaging, because
    ``roc_auc_score`` needs an ``(n_samples, n_classes)`` ``y_score`` for
    multi-class targets. Otherwise only the score column named after the
    largest label is used, as before.

    Args:
        config: Object providing ``class_col_name`` (ground-truth column
            name) and ``num_classes``.
        result_df: Frame holding the ground-truth column and the per-class
            score columns named by class index.

    Returns:
        The value returned by ``sklearn.metrics.roc_auc_score``.
    """
    labels = result_df[config.class_col_name].values
    if config.num_classes > 2:
        score_columns = [str(c) for c in range(config.num_classes)]
        scores = result_df[score_columns].values
    else:
        # Binary case: score of the positive (largest) label only.
        scores = result_df[str(np.max(labels))].values
    return sklearn.metrics.roc_auc_score(
        y_true=labels, y_score=scores, multi_class='ovr'
    )

def train_loop(df_folds: pd.DataFrame,config,fold_num: int = None,train_one_fold=False):
    """Train on one fold or on every fold and collect the OOF predictions.

    The out-of-fold frame is written to ``oof.csv`` in the working directory
    and also returned.

    Args:
        df_folds: Frame with the fold assignment, forwarded to
            ``train_on_fold``.
        config: Project configuration object; ``num_folds`` is read here and
            the object is forwarded to ``train_on_fold`` / ``get_acc_score``.
        fold_num: Fold to train when ``train_one_fold`` is true; ignored
            otherwise.
        train_one_fold: Train only ``fold_num`` instead of folds
            ``1..config.num_folds``.

    Returns:
        The concatenated out-of-fold frame (a single fold's frame when
        ``train_one_fold`` is true).

    Raises:
        ValueError: If ``train_one_fold`` is true but ``fold_num`` is None.
    """
    if train_one_fold:
        if fold_num is None:
            raise ValueError("fold_num must be given when train_one_fold=True")
        oof_df = train_on_fold(df_folds=df_folds, config=config, fold=fold_num)
        curr_fold_best_score = get_acc_score(config, oof_df)
        print("Fold {} OOF Score is {}".format(fold_num,
                                               curr_fold_best_score))
    else:
        # The CV score is the average of the per-fold validation metric.
        cv_score_list = []
        fold_frames = []
        # Fold ids start at 1.
        for fold in range(1, config.num_folds + 1):
            _oof_df = train_on_fold(df_folds=df_folds, config=config, fold=fold)
            fold_frames.append(_oof_df)
            curr_fold_best_score = get_acc_score(config, _oof_df)
            cv_score_list.append(curr_fold_best_score)
            print("\n\n\nOOF Score for Fold {}: {}\n\n\n".format(fold, curr_fold_best_score))

        # Concatenate once instead of growing a frame inside the loop.
        oof_df = pd.concat(fold_frames) if fold_frames else pd.DataFrame()

        print("CV score", np.mean(cv_score_list))
        print("Variance", np.var(cv_score_list))
        print("Five Folds OOF", get_acc_score(config, oof_df))
    oof_df.to_csv("oof.csv")
    return oof_df


In [None]:
# Run the training loop for fold 1 only.
train_fold_1 = train_loop(
    df_folds=df_folds,
    config=config,
    fold_num=1,
    train_one_fold=True,
)
#train_fold_2 = train_loop(df_folds=df_folds, config=config, fold_num=2, train_one_fold=True)