This inference notebook is part of a body of work heavily inspired by the brilliant notebooks of @abhishek and @cdeotte in [SIIM-ISIC Melanoma Classification](https://www.kaggle.com/c/siim-isic-melanoma-classification). The training part is [here](https://www.kaggle.com/dunklerwald/pytorch-efficientnet-with-tta-training).

**UPDATE**:
- switched to noisy-student
- added simple upsampling option (disabled by default)
- added confusion matrix

**UPDATE1**:
- switched to CosineAnnealingWarmRestarts
- upgraded to image size 512
- switched back to B4 effnet

**UPDATE(FINAL)**:
- added weight decay
- switched to smoothed cross entropy loss

**TO DO**: Looking at the training-progress plots for image size 512 in the training kernel, there may be some room for improvement by applying more regularization and reducing the learning rate.

In [None]:
# Put the attached pytorch-image-models source tree on the import path;
# presumably this is what lets `import timm` in the next cell resolve without
# installing the package (verify against the attached dataset layout).
package_path = '../input/pytorch-image-models/pytorch-image-models-master'
import sys
sys.path.append(package_path)    

In [None]:
import os
import torch
import albumentations

import numpy as np
import pandas as pd
import warnings

import time
import datetime

import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots

import torch
import torch.nn as nn
from torch.nn import functional as F

from sklearn import metrics
from sklearn import model_selection


from PIL import Image
from PIL import ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True


#import efficientnet_pytorch
import timm

warnings.simplefilter('ignore')
%matplotlib inline

In [None]:
# ---------------------------------------------------------------------------
# Notebook configuration.
# Only some of these values are read by the inference code visible in this
# notebook; the rest are presumably kept for parity with the training notebook
# (TODO confirm before removing any of them).
# ---------------------------------------------------------------------------

# Not referenced by the visible inference code.
n_epochs = 10
n_patience = 5
# Number of per-fold checkpoints that are loaded and averaged at the end.
n_folds = 3
# Not referenced by the visible inference code.
train_bsize = 24
valid_bsize = 48
# Batch size of the test DataLoader built in predict().
test_bsize = 48
# Passed to set_seed() below.
seed = 42

# EfficientNet variant index -> value used as `in_features` of the linear
# classification head in the EfficientNet wrapper class.
effnet_output = {0: 1280, 1: 1280, 2: 1408, 3: 1536, 4: 1792, 5: 2048, 6: 2304, 7: 2560}

# Side length (pixels) of the crop fed to the model; also scales the hole
# sizes of the dropout-style augmentations below.
IMG_SIZE = 512
# Selects both the timm model name and the entry of `effnet_output`.
EFFNET_MODEL = 4

# Random augmentations; predict() prepends these to a random-resized crop and
# normalisation when test-time augmentation is enabled.
AUGMENTATION =[albumentations.ShiftScaleRotate(shift_limit=0.2, scale_limit=0.2, rotate_limit=15, border_mode=0, p=0.6),
              albumentations.Flip(p=0.5),
              albumentations.RandomRotate90(p=0.5),
              albumentations.RandomBrightness(limit=0.2, p=0.6),
              albumentations.RandomContrast(limit=0.2, p=0.6),
              albumentations.HueSaturationValue(hue_shift_limit=20, sat_shift_limit=20, val_shift_limit=20, p=0.6),
              albumentations.CoarseDropout(max_holes=8, max_height=int(IMG_SIZE*0.2), max_width=int(IMG_SIZE*0.2), p=0.6),
              albumentations.Cutout(num_holes=1, max_h_size=int(IMG_SIZE*0.33), max_w_size=int(IMG_SIZE*0.33), p=0.6)
              ]   

# Whether predict() is called with test-time augmentation, and how many
# augmented passes are averaged per fold when it is.
IS_TTA = True
TTA = 5

# Not referenced by the visible inference code.
UPSAMPLE = False
N_UPSAMPLE = 1

# Not referenced by the visible inference code; the trailing comments list the
# alternative values the author considered.
SCHEDULER_NAME = 'CosineAnnealingLR' # ReduceLROnPlateau, CosineAnnealingLR, CustomSchedulerLR
LOSS_FN_NAME = 'CrossEntropyLoss' # WeightedFocalLoss
SMOOTHING = 0.05

# Not referenced by the visible inference code.
DISPLAY_PLOT= True

def set_seed(seed):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and CUDA),
    switches cuDNN to deterministic algorithms and exports ``PYTHONHASHSEED``.

    Args:
        seed: integer seed applied to every generator.
    """
    # Local import: the notebook's import cell does not import `random`.
    import random

    # Python's own generator was previously left unseeded, so anything that
    # draws from it (some albumentations versions do, for the TTA transforms)
    # produced different results on every run.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # NOTE(review): hash randomisation is fixed at interpreter start-up, so
    # this presumably only affects subprocesses spawned afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Seed the generators once, up front, with the configured `seed`.
set_seed(seed)

In [None]:
# Competition data directory: train.csv, sample_submission.csv and the
# test_images/ folder are read relative to it.
path = '../input/cassava-leaf-disease-classification/'
# Directory holding the trained per-fold checkpoints (model_fold_<k>.bin).
trained_path = '../input/cassava-b4-512-final/'

In [None]:
# create folds
df = pd.read_csv(path + 'train.csv')
N_CLASSES = df.label.nunique()

In [None]:
class ClassificationDataset:
    """Indexable dataset that loads an image from disk and pairs it with a target.

    Args:
        image_paths: sequence of image file paths.
        targets: sequence of labels, indexed in parallel with ``image_paths``.
        resize: optional ``(height, width)`` pair; ``None`` keeps the original
            image size.
        augmentations: optional callable invoked as ``augmentations(image=arr)``
            that returns a mapping with an ``"image"`` entry (the albumentations
            calling convention).
    """

    def __init__(self, image_paths, targets, resize, augmentations=None):
        self.image_paths = image_paths
        self.targets = targets
        self.resize = resize
        self.augmentations = augmentations

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.image_paths)

    def __getitem__(self, item):
        """Load, optionally resize and augment, and tensorise one sample.

        Returns:
            dict with ``"image"`` (float32 tensor in channel-first layout) and
            ``"targets"`` (tensor built from the matching target).
        """
        # The context manager closes the file handle deterministically, and
        # convert("RGB") guarantees three channels so that grayscale, palette
        # or RGBA files do not break the channel transpose below.
        with Image.open(self.image_paths[item]) as raw_image:
            image = raw_image.convert("RGB")
        targets = self.targets[item]

        if self.resize is not None:
            # PIL expects (width, height) while `resize` is (height, width).
            image = image.resize(
                (self.resize[1], self.resize[0]), resample=Image.BILINEAR
            )

        image = np.array(image)
        if self.augmentations is not None:
            image = self.augmentations(image=image)["image"]

        # HWC -> CHW, the layout the convolutional backbone consumes.
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)
        return {
            "image": torch.tensor(image),
            "targets": torch.tensor(targets),
        }


class ClassificationDataLoader:
    """Thin factory that wraps a ClassificationDataset in a torch DataLoader.

    The constructor arguments are stored and forwarded unchanged to
    ``ClassificationDataset``; ``fetch`` then builds the actual loader.
    """

    def __init__(self, image_paths, targets, resize, augmentations=None):
        self.image_paths = image_paths
        self.targets = targets
        self.resize = resize
        self.augmentations = augmentations
        self.dataset = ClassificationDataset(
            image_paths=image_paths,
            targets=targets,
            resize=resize,
            augmentations=augmentations,
        )

    def fetch(self, batch_size, num_workers, drop_last=False, shuffle=True, tpu=False):
        """Build a ``torch.utils.data.DataLoader`` over the wrapped dataset.

        Args:
            batch_size: number of samples per batch.
            num_workers: number of loader worker processes.
            drop_last: whether to drop a trailing incomplete batch.
            shuffle: whether to reshuffle the data on every pass.
            tpu: accepted for interface compatibility; not used here.

        Returns:
            The configured DataLoader (no custom sampler is installed).
        """
        loader_options = dict(
            batch_size=batch_size,
            sampler=None,
            drop_last=drop_last,
            shuffle=shuffle,
            num_workers=num_workers,
        )
        return torch.utils.data.DataLoader(self.dataset, **loader_options)

In [None]:
class Engine:
    """Static helpers that run one training, evaluation or inference pass.

    Every batch yielded by the data loader must be a dict of tensors, and the
    model must be callable as ``model(**batch)`` returning a
    ``(predictions, loss, accuracy)`` triple.

    NOTE(review): ``train`` and ``evaluate`` rely on an ``AverageMeter`` class
    that is not defined in the visible part of this notebook; only ``predict``
    is usable as-is here.
    """

    @staticmethod
    def _to_device(batch, device):
        """Return a copy of ``batch`` with every tensor moved to ``device``."""
        return {key: value.to(device) for key, value in batch.items()}

    @staticmethod
    def _to_numpy(predictions):
        """Detach a prediction tensor and return it as a NumPy array on the host."""
        return predictions.detach().cpu().numpy()

    @staticmethod
    def train(
        data_loader,
        model,
        optimizer,
        device,
        scheduler=None,
        accumulation_steps=1,
        fp16=True,
    ):
        """Run one training pass over ``data_loader``.

        Args:
            data_loader: iterable of dict batches exposing ``batch_size``.
            model: module returning ``(predictions, loss, accuracy)``.
            optimizer: optimizer stepped every ``accumulation_steps`` batches.
            device: device the batch tensors are moved to.
            scheduler: optional LR scheduler, stepped after each optimizer step.
            accumulation_steps: number of batches whose gradients are summed
                before the optimizer steps.
            fp16: run the forward pass under CUDA autocast with a GradScaler.

        Returns:
            ``(per-batch prediction arrays, mean loss, mean accuracy)``.
        """
        losses = AverageMeter()
        accuracies = AverageMeter()
        final_predictions = []
        model.train()

        scaler = torch.cuda.amp.GradScaler() if fp16 else None

        # Start from clean gradients, then clear them after *every* optimizer
        # step. Previously the clear was skipped after the very first step when
        # accumulation_steps == 1, so batch 0's gradients were added to
        # batch 1's update.
        optimizer.zero_grad()

        for b_idx, data in enumerate(data_loader):
            data = Engine._to_device(data, device)
            if fp16:
                with torch.cuda.amp.autocast():
                    predictions, loss, accuracy = model(**data)
            else:
                predictions, loss, accuracy = model(**data)

            final_predictions.append(Engine._to_numpy(predictions))

            if fp16:
                # Scale the loss so small fp16 gradients do not underflow.
                scaler.scale(loss).backward()
            else:
                loss.backward()

            if (b_idx + 1) % accumulation_steps == 0:
                if fp16:
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    optimizer.step()
                if scheduler is not None:
                    scheduler.step()
                optimizer.zero_grad()

            losses.update(loss.item(), data_loader.batch_size)
            accuracies.update(accuracy.item(), data_loader.batch_size)

        return final_predictions, losses.avg, accuracies.avg

    @staticmethod
    def evaluate(data_loader, model, device):
        """Run one gradient-free validation pass.

        Returns:
            ``(per-batch prediction arrays, mean loss, mean accuracy)``.
        """
        losses = AverageMeter()
        accuracies = AverageMeter()
        final_predictions = []
        model.eval()
        with torch.no_grad():
            for data in data_loader:
                data = Engine._to_device(data, device)
                predictions, loss, accuracy = model(**data)
                final_predictions.append(Engine._to_numpy(predictions))

                losses.update(loss.item(), data_loader.batch_size)
                accuracies.update(accuracy.item(), data_loader.batch_size)

        return final_predictions, losses.avg, accuracies.avg

    @staticmethod
    def predict(data_loader, model, device):
        """Run gradient-free inference and collect the raw model outputs.

        Args:
            data_loader: iterable of dict batches of tensors.
            model: module whose first return value is the prediction tensor.
            device: device the batch tensors are moved to.

        Returns:
            List with one NumPy prediction array per batch, in loader order.
        """
        model.eval()
        final_predictions = []

        with torch.no_grad():
            for data in data_loader:
                data = Engine._to_device(data, device)
                # Loss and accuracy are ignored at inference time.
                predictions, _, _ = model(**data)
                final_predictions.append(Engine._to_numpy(predictions))

        return final_predictions

In [None]:
class EfficientNet(nn.Module):
    """timm ``tf_efficientnet_b<N>_ns`` backbone followed by dropout and a linear head.

    The variant ``N`` comes from the module-level ``EFFNET_MODEL`` and the
    head's input width from ``effnet_output``. The backbone is created without
    pretrained weights; they are expected to be loaded from a checkpoint.

    Args:
        num_classes: size of the output layer.
    """

    def __init__(self, num_classes):
        super().__init__()
        backbone_name = f"tf_efficientnet_b{EFFNET_MODEL}_ns"
        self.base_model = timm.create_model(backbone_name, pretrained=False)
        self.dropout = nn.Dropout(0.2)
        head_width = effnet_output[EFFNET_MODEL]
        self.out = nn.Linear(head_width, num_classes, bias=True)

    def forward(self, image, targets=None):
        """Compute class scores for a 4-D batch of images.

        ``targets`` is accepted so the model can be called as
        ``model(**batch)`` but is not used.

        Returns:
            ``(scores, None, None)``; the loss and accuracy slots are always
            ``None`` in this inference-only wrapper.
        """
        n_images, _channels, _height, _width = image.shape

        feature_maps = self.base_model.forward_features(image)
        # Global average pooling, then flatten to one feature vector per image.
        pooled = F.adaptive_avg_pool2d(feature_maps, 1)
        features = pooled.reshape(n_images, -1)
        scores = self.out(self.dropout(features))

        return scores, None, None

In [None]:
def predict(fold = 0, apply_tta = False):
    """Score every test image with the checkpoint trained on one fold.

    Reads the module-level ``test`` DataFrame (columns ``image_id`` and
    ``label``), loads ``model_fold_<fold>.bin`` from ``trained_path`` and runs
    inference over the images in ``path + "test_images/"``.

    Args:
        fold: index of the fold checkpoint to load.
        apply_tta: when true, average the scores of ``TTA`` randomly augmented
            passes; otherwise do a single centre-cropped pass.

    Returns:
        With TTA, an array of shape ``(n_images, 1, N_CLASSES)``; without TTA,
        a list with one score array per batch. Either way, iterating the
        result yields 2-D arrays that can be stacked along axis 0.
    """
    print('=' * 20, 'Fold', fold, '=' * 20)
    test_data_path = path + "test_images/"
    df = test
    # Fall back to the CPU when no GPU is visible instead of crashing on the
    # previously hard-coded "cuda" device.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_path = trained_path + f"model_fold_{fold}.bin"

    mean = (0.485, 0.456, 0.406)
    std = (0.229, 0.224, 0.225)
    normalize = albumentations.Normalize(mean, std, max_pixel_value=255.0, always_apply=True)
    if apply_tta:
        # Random augmentations followed by a random crop: every pass over the
        # loader sees a different view of each image.
        aug = albumentations.Compose(
            AUGMENTATION
            + [albumentations.RandomResizedCrop(IMG_SIZE, IMG_SIZE, always_apply=True), normalize]
        )
    else:
        aug = albumentations.Compose(
            [albumentations.CenterCrop(IMG_SIZE, IMG_SIZE, always_apply=True), normalize]
        )

    images = [os.path.join(test_data_path, x) for x in df.image_id.values]
    targets = df.label.values

    test_loader = ClassificationDataLoader(
        image_paths=images,
        targets=targets,
        resize=None,
        augmentations=aug,
    ).fetch(
        batch_size=test_bsize,
        drop_last=False,
        num_workers=4,
        shuffle=False
    )

    model = EfficientNet(num_classes=N_CLASSES)
    # map_location lets a checkpoint saved from a GPU load on whichever device
    # was selected above.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)

    # PREDICT
    print('Predicting...')
    if apply_tta:
        predictions = np.zeros([len(images), N_CLASSES])

        for _ in range(TTA):
            tta_predictions = Engine.predict(test_loader, model, device=device)
            tta_predictions = np.vstack(tta_predictions)
            predictions += tta_predictions / TTA
        # Keep the historical (n_images, 1, N_CLASSES) layout callers iterate over.
        predictions = predictions.reshape((len(images), 1, N_CLASSES))
    else:
        predictions = Engine.predict(test_loader, model, device=device)

    return predictions

In [None]:
# The sample submission doubles as the list of test images: predict() reads
# its `image_id` and `label` columns, and `label` is overwritten with the
# final predictions further down.
test = pd.read_csv(path + "sample_submission.csv")

In [None]:
# Ensemble: sum the class scores of every fold model, average them and take
# the highest-scoring class per image.
final_preds = None

for i in range(n_folds):
    preds = predict(fold=i, apply_tta=IS_TTA)
    # predict() yields 2-D score chunks (per batch, or per image with TTA).
    # Stack them in one call instead of re-copying a growing array per chunk.
    temp_preds = np.vstack(list(preds))
    if final_preds is None:
        final_preds = temp_preds
    else:
        final_preds = final_preds + temp_preds

final_preds = final_preds / n_folds
final_preds = final_preds.argmax(axis=1)

# Write the predicted class ids into the submission frame and save it.
test["label"] = final_preds
test.to_csv('submission.csv', index=False)

In [None]:
# Display the first rows of the submission frame as a quick sanity check.
test.head()