In [1]:
# Notebook setup: load the autoreload extension, set it to mode 2, and
# render matplotlib figures inline in the notebook.
%load_ext autoreload
%autoreload 2
%matplotlib inline

# 0. Imports

In [2]:
from imgaug import augmenters as iaa
import torch.optim as optim
import torch

torch.backends.cudnn.benchmark = True

from dataloader import get_dataloader
from preprocess import transformer, img_transform, reverse_img_transform
from models import RetinaNet
from loss import FocalLoss
from train import Trainer, load_components
from inference import InferenceTransform
from evaluation import support_evaluate_model
from utils import Visualiser
from callbacks import Callback
import pickle
import gc

from apex import amp

In [3]:
# Run a full garbage-collection pass (generation 2) before building anything;
# the displayed value is the count returned by gc.collect.
gc.collect(2)

25

# 1. Parameters

In [4]:
# Identifiers handed to the experiment-tracking Callback further down.
project_name = "gatletag/Local-Object-Detection-Tests"
experiment_name = "local_test1"

# File-system locations for images, annotations and class lookup tables.
# NOTE(review): the train_* entries point at the same validation files as the
# valid_* entries -- this looks like a deliberate quick local test; confirm
# before using this notebook for a real training run.
dir_params = dict(
    # training split
    train_images_dir="dataset/validation",
    train_bbox_dir="data_info/valid/annotations/valid-anno.json",
    train_idx_to_id_dir="data_info/valid/annotations/valid-idx_to_id.pkl",
    # validation split
    valid_images_dir="dataset/validation",
    valid_bbox_dir="data_info/valid/annotations/valid-anno.json",
    valid_idx_to_id_dir="data_info/valid/annotations/valid-idx_to_id.pkl",
    # class lookup tables
    clsids_to_idx_dir="data_info/clsids_to_idx.pkl",
    clsids_to_names_dir="data_info/clsids_to_names.pkl",
)

In [5]:
# Single configuration mapping read by every later cell (data loading, model,
# loss, optimizer, training and evaluation).
hyper_params = dict(
    # speed parameters
    num_workers=8,
    device="cuda",

    # dataloader parameters
    bs=16,
    img_dim=512,

    # anchor parameters
    ratios=[1/3, 1/2, 1, 2],
    scales=[0.25, 1, 2],

    # network parameters
    backbone="resnet50",
    num_classes=501,
    pretrained=True,
    freeze_bn=True,

    # loss parameters
    alpha=0.25,
    gamma=2.0,
    IoU_bkgrd=0.4,
    IoU_pos=0.5,
    regress_factor=[0.1, 0.1, 0.2, 0.2],

    # optimizer parameters
    lr=0.001,
    min_lr=0.000001,
    patience=100,
    decay_factor=0.3,

    # training parameters
    epochs=2,
    checkpoint_dir="temp/final.pth",
    save_dir="temp",
    fine_tune=True,

    # evaluation parameters
    cls_thresh=0.10,
    overlap=0.5,
)


# 2. Initialisations

### 2.1 Class Info

In [6]:
# Load the class lookup tables. The files are opened in `with` blocks so the
# handles are closed deterministically instead of being left to the garbage
# collector.
# NOTE(security): pickle.load can execute arbitrary code; only point these
# paths at files produced by this project.
with open(dir_params["clsids_to_names_dir"], "rb") as names_file:
    clsids_to_names = pickle.load(names_file)
with open(dir_params["clsids_to_idx_dir"], "rb") as idx_file:
    clsids_to_idx = pickle.load(idx_file)

# Invert class-id -> index, then map each index to its class name.
idx_to_cls_ids = {idx: cls_id for cls_id, idx in clsids_to_idx.items()}
idx_to_names = {idx: clsids_to_names[cls_id] for idx, cls_id in idx_to_cls_ids.items()}

### 2.2 Dataset Info

In [7]:
# Side length fed to the network, and the slightly enlarged size (x1.05) that
# training images are resized to before the geometric augmentations.
_img_dim = hyper_params["img_dim"]
_enlarged_dim = int(_img_dim * 1.05)

# Training pipeline: enlarge, jitter contrast, rotate/scale, random horizontal
# flip, crop/pad, then resize to the final network input size.
_train_augmenters = [
    iaa.Resize({"height": _enlarged_dim, "width": _enlarged_dim}),
    iaa.GammaContrast((0.9, 1.1)),
    iaa.Affine(rotate=(-5, 5), scale=(0.90, 1.10)),
    iaa.Fliplr(0.5),
    iaa.CropAndPad(percent=(-0.05, 0.00)),
    iaa.Resize({"height": _img_dim, "width": _img_dim}),
]
train_seq = iaa.Sequential(_train_augmenters)
train_transform_fn = transformer(train_seq, img_transform)

# NOTE(review): the two positional booleans (True/True here, False/False for
# validation) are presumably shuffle and drop_last -- confirm against
# get_dataloader's signature.
train_dl = get_dataloader(
    dir_params["train_images_dir"],
    dir_params["train_bbox_dir"],
    dir_params["train_idx_to_id_dir"],
    clsids_to_idx,
    train_transform_fn,
    hyper_params["bs"],
    True,
    hyper_params["num_workers"],
    True,
)

# Validation pipeline: resize only, no random augmentation.
_valid_augmenters = [
    iaa.Resize({"height": _img_dim, "width": _img_dim}),
]
valid_seq = iaa.Sequential(_valid_augmenters)
valid_transform_fn = transformer(valid_seq, img_transform)

valid_dl = get_dataloader(
    dir_params["valid_images_dir"],
    dir_params["valid_bbox_dir"],
    dir_params["valid_idx_to_id_dir"],
    clsids_to_idx,
    valid_transform_fn,
    hyper_params["bs"],
    False,
    hyper_params["num_workers"],
    False,
)

### 2.4 Loss Info (defined before 2.3 because the model is constructed with the criterion)

In [8]:
# FocalLoss takes its settings positionally, in exactly this order.
_focal_loss_keys = ("alpha", "gamma", "IoU_bkgrd", "IoU_pos", "regress_factor", "device")
criterion = FocalLoss(*(hyper_params[key] for key in _focal_loss_keys))

### 2.3 Model Info

In [9]:
# Positional arguments, in order: backbone name, number of classes, anchor
# ratios, anchor scales. Everything else is passed by keyword.
_retinanet_args = [hyper_params[key] for key in ("backbone", "num_classes", "ratios", "scales")]

retinanet = RetinaNet(
    *_retinanet_args,
    device=hyper_params["device"],
    pretrained=hyper_params["pretrained"],
    freeze_bn=hyper_params["freeze_bn"],
    prior=0.01,
    feature_size=256,
    pyramid_levels=[3, 4, 5, 6, 7],
    # The loss object is handed to the model itself, so it must exist already.
    criterion=criterion,
)

# TODO: Move this over to training file
def set_parameter_requires_grad(model, trainable_modules=("fpn", "regressionModel", "classificationModel")):
    """Freeze every parameter that does not belong to a trainable sub-module.

    A parameter is kept as-is when the first dotted component of its name
    (as reported by ``model.named_parameters()``) is listed in
    ``trainable_modules``; every other parameter gets ``requires_grad = False``.
    The function only ever disables gradients -- it never re-enables them.

    Args:
        model: object exposing ``named_parameters()`` (e.g. a ``torch.nn.Module``).
        trainable_modules: iterable of top-level attribute names whose
            parameters are left untouched. Defaults to the three names that
            were previously hard-coded.

    Returns:
        None. ``model`` is modified in place.
    """
    keep = frozenset(trainable_modules)
    for name, param in model.named_parameters():
        # Names look like "<top_level_module>.<rest>"; only the first part matters.
        if name.split('.')[0] not in keep:
            param.requires_grad = False

# When fine-tuning is enabled, freeze parameters via
# set_parameter_requires_grad before the optimizer is built.
if hyper_params["fine_tune"]:
    set_parameter_requires_grad(retinanet)

### 2.5 Optimizer Info

In [10]:
# SGD over all model parameters; momentum and weight decay are fixed here,
# only the learning rate comes from hyper_params.
optimizer = optim.SGD(
    retinanet.parameters(),
    lr=hyper_params["lr"],
    momentum=0.9,
    weight_decay=1e-4,
)

# Reduce the learning rate by `decay_factor` when the monitored value stops
# decreasing for `patience` scheduler steps, never going below `min_lr`.
_plateau_kwargs = dict(
    mode='min',
    factor=hyper_params['decay_factor'],
    patience=hyper_params["patience"],
    verbose=True,
    min_lr=hyper_params["min_lr"],
)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, **_plateau_kwargs)

### 2.6 Visualisation Info

In [11]:
# Visualiser receives the class count, the index -> name table and the
# function that undoes the image preprocessing.
vis = Visualiser(hyper_params["num_classes"], idx_to_names, reverse_img_transform)

### 2.7 Inference Info

In [12]:
# InferenceTransform receives both index lookup tables plus the same
# regress_factor that is given to the loss.
inferencer = InferenceTransform(idx_to_names, idx_to_cls_ids, hyper_params["regress_factor"])

# 3. Prepare Training

In [13]:
# Wire everything together for training. The statements below are
# order-sensitive: device placement -> checkpoint load -> apex AMP
# initialisation -> optional multi-GPU wrap -> Trainer construction.

# Move the model to the configured device before any state is loaded.
retinanet = retinanet.to(hyper_params["device"])

# Load from the checkpoint path (the cell output reports
# "Loading from checkpoint: temp/final.pth").
# NOTE(review): presumably restores model/optimizer/scheduler state in place;
# the apex checkpointing examples load state dicts *after* amp.initialize --
# confirm this order is intended.
load_components(retinanet, optimizer, scheduler, hyper_params["checkpoint_dir"])

# Mixed precision via apex at opt level O1 (automatic casts, dynamic loss
# scaling according to the printed defaults). Rebinds both names.
retinanet, optimizer = amp.initialize(retinanet, optimizer, opt_level="O1")

# Wrap in DataParallel across every visible GPU when more than one is present.
if torch.cuda.device_count() > 1:
    print("Using Multiple GPUs")
    retinanet = torch.nn.DataParallel(retinanet, device_ids=range(torch.cuda.device_count()))
# NOTE(review): second .to() call; the model was already moved above, so this
# only matters for the (possibly new) DataParallel wrapper -- likely redundant.
retinanet = retinanet.to(hyper_params["device"])


# Experiment callback; the cell output shows it printing an experiment id and a
# Neptune URL for project_name.
cb = Callback(project_name, experiment_name, hyper_params, hyper_params["save_dir"])

# Subset of hyper_params forwarded to the Trainer for evaluation.
eval_params = {
    "overlap":hyper_params["overlap"],
    "cls_thresh":hyper_params["cls_thresh"]
}

# Trainer takes all of its collaborators positionally, in this order.
trainer = Trainer(
        retinanet, 
        train_dl, 
        valid_dl, 
        optimizer, 
        scheduler, 
        criterion, 
        hyper_params["device"],
        inferencer, 
        hyper_params["num_classes"],
        eval_params,
        cb,
        vis
    )

Loading from checkpoint: temp/final.pth
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Using Multiple GPUs
LOD-79
https://ui.neptune.ml/gatletag/Local-Object-Detection-Tests/e/LOD-79


In [14]:
# Run training for the configured number of epochs. The "Gradient overflow"
# lines in the output come from the dynamic loss scaler lowering its scale.
trainer.train(hyper_params["epochs"])

Started Training
Started Epoch: 0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
0:00:22.796237 : 0 / 100
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4096.0
0:00:37.043305 : 10 / 100
0:00:51.474983 : 20 / 100
0:01:05.384628 : 30 / 100
0:01:19.276098 : 40 / 100
0:01:33.274959 : 50 / 100
0:01:47.184525 : 60 / 100
0:02:01.041681 : 70 / 100
0:02:15.022724 : 80 / 100
0:02:28.926209 : 90 / 100
Ended Train Epoch in (hours): 0:02:41.489188
[VALID] 0:00:02.864453 : 0 / 100
[VALID] 0:00:11.996232 : 10 / 100
[VALID] 0:00:20.886543 : 20 / 100
[VALID] 0:00:29.727993 : 30 / 100
[VALID] 0:00:38.565853 : 40 / 100
[VALID] 0:00:47.378727 : 50 / 100
[VALID] 0:00:56.335509 : 60 / 100
[VALID] 0:01:05.119619 : 70 / 100
[VALID] 0:01:13.862506 : 80 / 100
[VALID] 0:01:22.567850 : 90 / 100
Epoch