In [1]:
# Notebook setup: auto-reload edited project modules and render matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

# 0. Imports

In [2]:
from imgaug import augmenters as iaa
import torch.optim as optim
import torch

torch.backends.cudnn.benchmark = True

from dataloader import get_dataloader
from preprocess import transformer, img_transform, reverse_img_transform
from models import RetinaNet
from loss import FocalLoss
from train import Trainer, load_components
from inference import InferenceTransform
from evaluation import support_evaluate_model
from utils import Visualiser
from callbacks import Callback
import pickle
import gc

from apex import amp

In [3]:
# Run the garbage collector explicitly (generation argument 2) before the objects below are created.
gc.collect(2)

25

# 1. Parameters

In [4]:
# Project and experiment identifiers; both are handed to the Callback further down.
project_name = "gatletag/Local-Object-Detection-Tests"
experiment_name = "local_test1"

# File-system locations for images, annotations and lookup tables.
# NOTE: in this configuration the train_* and valid_* entries point at the same
# validation files, so training and validation run on identical data.
dir_params = dict(
    # training split
    train_images_dir="dataset/validation",
    train_bbox_dir="data_info/valid/annotations/valid-anno.json",
    train_idx_to_id_dir="data_info/valid/annotations/valid-idx_to_id.pkl",
    # validation split
    valid_images_dir="dataset/validation",
    valid_bbox_dir="data_info/valid/annotations/valid-anno.json",
    valid_idx_to_id_dir="data_info/valid/annotations/valid-idx_to_id.pkl",
    # pickled class lookup tables
    clsids_to_idx_dir="data_info/clsids_to_idx.pkl",
    clsids_to_names_dir="data_info/clsids_to_names.pkl",
)

In [5]:
# Single source of truth for every tunable value used by the cells below.
# Key order is kept as-is because the whole dict is passed on to the Callback.
hyper_params = dict(
    # --- speed ---
    num_workers=8,
    device="cuda",

    # --- dataloader ---
    bs=16,
    img_dim=512,

    # --- anchors ---
    ratios=[1/3, 1/2, 1, 2],
    scales=[0.25, 1, 2],

    # --- network ---
    backbone="resnet50",
    num_classes=501,
    pretrained=True,
    freeze_bn=True,

    # --- loss ---
    alpha=0.25,
    gamma=2.0,
    IoU_bkgrd=0.4,
    IoU_pos=0.5,
    regress_factor=[0.1, 0.1, 0.2, 0.2],

    # --- optimizer / scheduler ---
    lr=3e-4,
    min_lr=1e-6,
    patience=100,
    decay_factor=0.3,

    # --- training ---
    epochs=1,
    checkpoint_dir="temp3/final.pth",
    save_dir="temp4",
    fine_tune=False,

    # --- evaluation ---
    cls_thresh=0.1,
    overlap=0.5,
)


# 2. Initialisations

### 2.1 Class Info

In [6]:
# Load the pickled class lookup tables. The files are opened with context managers so the
# handles are closed deterministically instead of being left to the garbage collector.
# NOTE(review): pickle.load executes arbitrary code from the file - only point these paths
# at trusted, locally generated files.
with open(dir_params["clsids_to_names_dir"], "rb") as names_file:
    clsids_to_names = pickle.load(names_file)
with open(dir_params["clsids_to_idx_dir"], "rb") as idx_file:
    clsids_to_idx = pickle.load(idx_file)

# Invert class-id -> index, then chain through clsids_to_names to get index -> name.
idx_to_cls_ids = {idx: cls_id for cls_id, idx in clsids_to_idx.items()}
idx_to_names = {idx: clsids_to_names[cls_id] for idx, cls_id in idx_to_cls_ids.items()}

### 2.2 Dataset Info

In [7]:
# Training augmentation pipeline: resize to 105 % of img_dim, apply gamma / affine / flip /
# crop jitter, then resize to exactly img_dim x img_dim.
train_seq = iaa.Sequential([
    iaa.Resize(dict(
        height=int(hyper_params["img_dim"] * 1.05),
        width=int(hyper_params["img_dim"] * 1.05),
    )),
    iaa.GammaContrast((0.9, 1.1)),
    iaa.Affine(rotate=(-5, 5), scale=(0.90, 1.10)),
    iaa.Fliplr(0.5),
    iaa.CropAndPad(percent=(-0.05, 0.00)),
    iaa.Resize(dict(height=hyper_params["img_dim"], width=hyper_params["img_dim"])),
])
train_transform_fn = transformer(train_seq, img_transform)

# NOTE(review): the two positional booleans are True here and False for the validation
# loader - presumably shuffle / drop-last style switches; confirm in dataloader.get_dataloader.
train_dl = get_dataloader(
    dir_params["train_images_dir"],
    dir_params["train_bbox_dir"],
    dir_params["train_idx_to_id_dir"],
    clsids_to_idx,
    train_transform_fn,
    hyper_params["bs"],
    True,
    hyper_params["num_workers"],
    True,
)

# Validation pipeline: no random augmentation, only a resize to img_dim x img_dim.
valid_seq = iaa.Sequential([
    iaa.Resize(dict(height=hyper_params["img_dim"], width=hyper_params["img_dim"])),
])
valid_transform_fn = transformer(valid_seq, img_transform)

# Same call shape as the training loader, with both positional booleans set to False.
valid_dl = get_dataloader(
    dir_params["valid_images_dir"],
    dir_params["valid_bbox_dir"],
    dir_params["valid_idx_to_id_dir"],
    clsids_to_idx,
    valid_transform_fn,
    hyper_params["bs"],
    False,
    hyper_params["num_workers"],
    False,
)

### 2.4 Loss Info

In [8]:
# Build the focal loss from the loss-related hyper-parameters plus the device.
# The tuple below fixes the positional order in which the values are passed.
criterion = FocalLoss(*(
    hyper_params[key]
    for key in ("alpha", "gamma", "IoU_bkgrd", "IoU_pos", "regress_factor", "device")
))

### 2.3 Model Info

In [9]:
# Instantiate the detector. The criterion is passed to the model, which is why the loss
# cell has to run before this one.
retinanet = RetinaNet(
    hyper_params["backbone"], hyper_params["num_classes"],
    hyper_params["ratios"], hyper_params["scales"],
    device=hyper_params["device"],
    pretrained=hyper_params["pretrained"],
    freeze_bn=hyper_params["freeze_bn"],
    prior=0.01,
    feature_size=256,
    pyramid_levels=[3, 4, 5, 6, 7],
    criterion=criterion,
)

# TODO: Move this over to training file
def set_parameter_requires_grad(
        model,
        trainable_modules=("fpn", "regressionModel", "classificationModel")):
    """Freeze every parameter that does not belong to one of ``trainable_modules``.

    A parameter's owner is taken to be the first dot-separated component of its
    name as reported by ``model.named_parameters()``. Parameters whose owner is
    not listed get ``requires_grad = False``; parameters of listed owners are
    left untouched (they are not force-enabled).

    Args:
        model: object exposing ``named_parameters()`` yielding ``(name, param)`` pairs.
        trainable_modules: iterable of top-level attribute names to keep trainable.
            A single string is treated as one name. Defaults to the three names
            that were previously hard-coded.

    Returns:
        None. The parameters are modified in place.
    """
    # Guard against a bare string, which would otherwise be iterated character by character.
    if isinstance(trainable_modules, str):
        trainable_modules = (trainable_modules,)
    keep_trainable = frozenset(trainable_modules)

    for name, param in model.named_parameters():
        owner = name.split('.', 1)[0]
        if owner not in keep_trainable:
            param.requires_grad = False

# In fine-tune mode, freeze everything except the submodules kept trainable by
# set_parameter_requires_grad. The flag is tested by truthiness rather than `== True`.
if hyper_params["fine_tune"]:
    set_parameter_requires_grad(retinanet)

### 2.5 Optimizer Info

In [10]:
# SGD with momentum over all model parameters; the learning rate comes from hyper_params.
optimizer = optim.SGD(
    retinanet.parameters(),
    lr=hyper_params["lr"],
    momentum=0.9,
    weight_decay=1e-4,
)

# Plateau scheduler in 'min' mode, configured from the decay_factor / patience / min_lr
# hyper-parameters.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=hyper_params["decay_factor"],
    patience=hyper_params["patience"],
    verbose=True,
    min_lr=hyper_params["min_lr"],
)

### 2.6 Visualisation Info

In [11]:
# Visualiser gets the class count, the index -> name table and the inverse image transform.
vis = Visualiser(hyper_params["num_classes"], idx_to_names, reverse_img_transform)

### 2.7 Inference Info

In [12]:
# Inference transform gets both index lookup tables and the same regress_factor the loss uses.
inferencer = InferenceTransform(idx_to_names, idx_to_cls_ids, hyper_params["regress_factor"])

# 3. Prepare Training

In [13]:
# Move the model to the configured device before restoring state and initialising amp.
retinanet = retinanet.to(hyper_params["device"])

# Restore from the checkpoint path in hyper_params; the model, optimizer and scheduler are
# all handed to load_components.
load_components(retinanet, optimizer, scheduler, hyper_params["checkpoint_dir"])

# Apex mixed precision at opt_level O1; the returned model and optimizer replace the originals.
retinanet, optimizer = amp.initialize(retinanet, optimizer, opt_level="O1")

# Wrap in DataParallel (after amp.initialize) only when more than one CUDA device is visible.
if torch.cuda.device_count() > 1:
    print("Using Multiple GPUs")
    retinanet = torch.nn.DataParallel(retinanet, device_ids=range(torch.cuda.device_count()))
# NOTE(review): the model was already moved to this device at the top of the cell; this
# second .to() looks redundant - confirm before removing.
retinanet = retinanet.to(hyper_params["device"])


# Callback receives the project / experiment identifiers, the full hyper-parameter dict
# and the save directory.
cb = Callback(project_name, experiment_name, hyper_params, hyper_params["save_dir"])

# Evaluation settings are the "overlap" and "cls_thresh" entries copied out of hyper_params.
eval_params = {key: hyper_params[key] for key in ("overlap", "cls_thresh")}

# Wire everything built above into the Trainer (all arguments are positional).
trainer = Trainer(
    retinanet,
    train_dl, valid_dl,
    optimizer, scheduler, criterion,
    hyper_params["device"],
    inferencer,
    hyper_params["num_classes"],
    eval_params,
    cb,
    vis,
)

Loading from checkpoint: temp3/final.pth
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Using Multiple GPUs
LOD-83
https://ui.neptune.ml/gatletag/Local-Object-Detection-Tests/e/LOD-83


In [14]:
# Run training for the configured number of epochs; the output below shows [VALID] progress lines as well.
trainer.train(hyper_params["epochs"])

Started Training
Started Epoch: 0
0:00:20.439681 : 0 / 2182
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
0:00:26.965808 : 10 / 2182
0:00:32.636722 : 20 / 2182
0:00:38.392405 : 30 / 2182
0:00:44.283270 : 40 / 2182
0:00:50.039885 : 50 / 2182
0:00:55.830644 : 60 / 2182
0:01:01.469948 : 70 / 2182
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
0:01:07.296155 : 80 / 2182
0:01:13.135126 : 90 / 2182
0:01:18.895243 : 100 / 2182
0:01:24.731936 : 110 / 2182
0:01:30.416616 : 120 / 2182
0:01:36.172319 : 130 / 2182
0:01:41.983930 : 140 / 2182
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
0:01:47.774008 : 150 / 2182
0:01:53.527762 : 160 / 2182
0:01:59.264256 : 170 / 2182
0:02:05.148280 : 180 / 2182
0:02:10.896548 : 190 / 2182
0:02:16.518932 : 200 / 2182
0:02:22.245612 : 210 / 2182
0:02:27.905260 : 220 / 2182
0:02:33.556727 : 230 / 2182
0:02:39.219760 : 240 / 2182
0:02:45.100806 : 250 / 2182
0:02:50.76

[VALID] 0:03:10.460506 : 440 / 2183
[VALID] 0:03:14.777974 : 450 / 2183
[VALID] 0:03:19.081153 : 460 / 2183
[VALID] 0:03:23.221529 : 470 / 2183
[VALID] 0:03:27.423603 : 480 / 2183
[VALID] 0:03:31.840870 : 490 / 2183
[VALID] 0:03:35.990251 : 500 / 2183
[VALID] 0:03:40.147353 : 510 / 2183
[VALID] 0:03:44.428569 : 520 / 2183
[VALID] 0:03:48.887496 : 530 / 2183
[VALID] 0:03:53.155668 : 540 / 2183
[VALID] 0:03:57.391175 : 550 / 2183
[VALID] 0:04:01.775405 : 560 / 2183
[VALID] 0:04:05.922627 : 570 / 2183
[VALID] 0:04:10.209390 : 580 / 2183
[VALID] 0:04:14.282432 : 590 / 2183
[VALID] 0:04:18.627472 : 600 / 2183
[VALID] 0:04:22.759677 : 610 / 2183
[VALID] 0:04:26.879634 : 620 / 2183
[VALID] 0:04:31.198412 : 630 / 2183
[VALID] 0:04:35.314563 : 640 / 2183
[VALID] 0:04:39.554838 : 650 / 2183
[VALID] 0:04:43.615963 : 660 / 2183
[VALID] 0:04:47.912707 : 670 / 2183
[VALID] 0:04:52.215635 : 680 / 2183
[VALID] 0:04:56.518359 : 690 / 2183
[VALID] 0:05:00.806536 : 700 / 2183
[VALID] 0:05:04.978863 : 710