In [1]:
import random
import os, sys
from importlib import import_module

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Subset
from torch.optim import SGD, Adam, AdamW
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR

sys.path.append(os.path.abspath('..'))
from dataset import MaskBaseDataset
from model import *

def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
seed_everything(42)

In [2]:
# -- parameters
img_root = '/mnt/ssd/data/mask/resized_data/'
label_path = '/mnt/ssd/data/mask/metadata.csv'

model_name = "VGG19"
use_pretrained = True
freeze_backbone = False

val_split = 0.4
batch_size = 64
num_workers = 4
num_classes = 3

num_epochs = 100
lr = 1e-4
lr_decay_step = 10

train_log_interval = 20
name = "02_vgg"

# -- settings
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

## Loss

### Cross Entropy Loss

In [3]:
class CrossEntropyLoss(nn.Module):
    def __init__(self, weight=None, reduction='mean'):
        nn.Module.__init__(self)
        self.weight = weight
        self.reduction = reduction

    def forward(self, input_tensor, target_tensor):
        log_prob = F.log_softmax(input_tensor, dim=-1)
        prob = torch.exp(log_prob)
        return F.nll_loss(
            log_prob,
            target_tensor,
            weight=self.weight,
            reduction=self.reduction
        )

### Focal Loss

In [4]:
# https://discuss.pytorch.org/t/is-this-a-correct-implementation-for-focal-loss-in-pytorch/43327/8
class FocalLoss(nn.Module):
    def __init__(self, weight=None,
                 gamma=2., reduction='mean'):
        nn.Module.__init__(self)
        self.weight = weight
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, input_tensor, target_tensor):
        log_prob = F.log_softmax(input_tensor, dim=-1)
        prob = torch.exp(log_prob)
        return F.nll_loss(
            ((1 - prob) ** self.gamma) * log_prob,
            target_tensor,
            weight=self.weight,
            reduction=self.reduction
        )

### Label Smoothing Loss

In [5]:
class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes=3, smoothing=0.0, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (self.cls - 1))
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))

### F1 Loss

In [6]:
# https://gist.github.com/SuperShinyEyes/dcc68a08ff8b615442e3bc6a9b55a354
class F1Loss(nn.Module):
    def __init__(self, classes=3, epsilon=1e-7):
        super().__init__()
        self.classes = classes
        self.epsilon = epsilon
    def forward(self, y_pred, y_true):
        assert y_pred.ndim == 2
        assert y_true.ndim == 1
        y_true = F.one_hot(y_true, self.classes).to(torch.float32)
        y_pred = F.softmax(y_pred, dim=1)

        tp = (y_true * y_pred).sum(dim=0).to(torch.float32)
        tn = ((1 - y_true) * (1 - y_pred)).sum(dim=0).to(torch.float32)
        fp = ((1 - y_true) * y_pred).sum(dim=0).to(torch.float32)
        fn = (y_true * (1 - y_pred)).sum(dim=0).to(torch.float32)

        precision = tp / (tp + fp + self.epsilon)
        recall = tp / (tp + fn + self.epsilon)

        f1 = 2 * (precision * recall) / (precision + recall + self.epsilon)
        f1 = f1.clamp(min=self.epsilon, max=1 - self.epsilon)
        return 1 - f1.mean()

In [7]:
criterion = CrossEntropyLoss()

## Optimizer

In [8]:
# -- model
model_cls = getattr(import_module("model"), model_name)
model = model_cls(
    num_classes=num_classes,
    pretrained=use_pretrained,
    freeze=freeze_backbone
).to(device)

In [9]:
# -- SGD optimizer

optimizer = SGD(model.parameters(), lr=lr, weight_decay=5e-4)

In [10]:
# -- Adam optimizer

optimizer = Adam(model.parameters(), lr=lr, weight_decay=5e-4)

In [11]:
list(model.named_children())

[('net',
  VGG(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU(inplace=True)
      (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (9): ReLU(inplace=True)
      (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (12): ReLU(inplace=True)
      (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, c

In [12]:
# -- optimizer: Different Learning Rates on different layers

train_params = [{'params': getattr(model.net, 'features').parameters(), 'lr': lr / 10, 'weight_decay':5e-4},
                {'params': getattr(model.net, 'classifier').parameters(), 'lr': lr, 'weight_decay':5e-4}]
optimizer = Adam(train_params)

## Scheduler

In [13]:
# -- scheduler: StepLR

scheduler = StepLR(optimizer, lr_decay_step, gamma=0.5)

In [14]:
# -- scheduler: ReduceLROnPlateau

scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=10)

In [15]:
# -- scheduler: CosineAnnealingLR

scheduler = CosineAnnealingLR(optimizer, T_max=2, eta_min=0.)

## Metric
Example Code is from scikit-learn tutorial codes

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_true = [0, 1, 2, 0, 1, 2]
y_pred = [0, 2, 1, 0, 0, 1]

### Accuracy

In [17]:
accuracy_score(y_true, y_pred)

0.3333333333333333

In [18]:
accuracy_score(y_true, y_pred, normalize=False)

2

### Precision & Recall

In [19]:
precision = precision_score(y_true, y_pred, average='macro')
precision

0.2222222222222222

In [20]:
recall = recall_score(y_true, y_pred, average='macro')
recall

0.3333333333333333

### F1 Score

In [21]:
2 * (precision * recall) / (precision + recall)

0.26666666666666666

In [22]:
f1_score(y_true, y_pred, average='macro')

0.26666666666666666

## Training process

In [23]:
dataset = MaskBaseDataset(img_root, label_path, 'train')
n_val = int(len(dataset) * val_split)
n_train = len(dataset) - n_val
train_set, val_set = torch.utils.data.random_split(dataset, [n_train, n_val])
val_set.dataset.set_phase("test")  # todo : fix

train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=True,
)

val_loader = torch.utils.data.DataLoader(
    val_set,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=True,
)

### Callback - Checkpoint, Early Stopping

In [24]:
# -- Callback1: Save Checkpoints by Accuracy
# Continue to the training code

# -- Callback2: Early Stopping
patience = 10
counter = 0
# Continue to the training code

### Training Method - Gradient Accumulation

In [25]:
# -- Gradient Accumulation
accumulation_steps = 2
# Continue to the training code

### Training Loop

In [None]:
os.makedirs(os.path.join(os.getcwd(), 'results', name), exist_ok=True)

counter = 0
best_val_acc = 0
best_val_loss = np.inf
for epoch in range(num_epochs):
    # train loop
    model.train()
    loss_value = 0
    matches = 0
    for idx, train_batch in enumerate(train_loader):
        inputs, labels = train_batch
        inputs = inputs.to(device)
        labels = labels.to(device)

        outs = model(inputs)
        preds = torch.argmax(outs, dim=-1)
        loss = criterion(outs, labels)

        loss.backward()
        
        # -- Gradient Accumulation
        if (idx+1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        loss_value += loss.item()
        matches += (preds == labels).sum().item()
        if (idx + 1) % train_log_interval == 0:
            train_loss = loss_value / train_log_interval
            train_acc = matches / batch_size / train_log_interval
            current_lr = scheduler.get_last_lr()
            print(
                f"Epoch[{epoch}/{num_epochs}]({idx + 1}/{len(train_loader)}) || "
                f"training loss {train_loss:4.4} || training accuracy {train_acc:4.2%} || lr {current_lr}"
            )

            loss_value = 0
            matches = 0

    scheduler.step()

    # val loop
    with torch.no_grad():
        print("Calculating validation results...")
        model.eval()
        val_loss_items = []
        val_acc_items = []
        for val_batch in val_loader:
            inputs, labels = val_batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            outs = model(inputs)
            preds = torch.argmax(outs, dim=-1)

            loss_item = criterion(outs, labels).item()
            acc_item = (labels == preds).sum().item()
            val_loss_items.append(loss_item)
            val_acc_items.append(acc_item)

        val_loss = np.sum(val_loss_items) / len(val_loader)
        val_acc = np.sum(val_acc_items) / len(val_set)
        
        # Callback1
        if val_loss < best_val_loss:
            best_val_loss = val_loss
        if val_acc > best_val_acc:
            print("New best model for val accuracy! saving the model..")
            torch.save(model.state_dict(), f"results/{name}/{epoch:03}_accuracy_{val_acc:4.2%}.ckpt")
            best_val_acc = val_acc
            counter = 0
        else:
            counter += 1
        # Callback2
        if counter > patience:
            print("Early Stopping...")
            break
        
        
        print(
            f"[Val] acc : {val_acc:4.2%}, loss: {val_loss:4.2} || "
            f"best acc : {best_val_acc:4.2%}, best loss: {best_val_loss:4.2}"
        )

Epoch[0/100](20/287) || training loss 0.6277 || training accuracy 72.81% || lr [1e-05, 0.0001]
Epoch[0/100](40/287) || training loss 0.2611 || training accuracy 91.02% || lr [1e-05, 0.0001]
Epoch[0/100](60/287) || training loss 0.1992 || training accuracy 93.28% || lr [1e-05, 0.0001]
Epoch[0/100](80/287) || training loss 0.1265 || training accuracy 96.17% || lr [1e-05, 0.0001]
Epoch[0/100](100/287) || training loss 0.1084 || training accuracy 96.88% || lr [1e-05, 0.0001]
Epoch[0/100](120/287) || training loss 0.09347 || training accuracy 96.95% || lr [1e-05, 0.0001]
Epoch[0/100](140/287) || training loss 0.0781 || training accuracy 97.19% || lr [1e-05, 0.0001]
Epoch[0/100](160/287) || training loss 0.1003 || training accuracy 96.88% || lr [1e-05, 0.0001]
Epoch[0/100](180/287) || training loss 0.05327 || training accuracy 98.20% || lr [1e-05, 0.0001]
Epoch[0/100](200/287) || training loss 0.0734 || training accuracy 97.58% || lr [1e-05, 0.0001]
Epoch[0/100](220/287) || training loss 0.0