In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load IPython's autoreload extension in mode 2, so edits to the project modules
# imported by this notebook are reloaded before each cell runs (no kernel restart).
%load_ext autoreload
%autoreload 2

In [2]:
from imgaug import augmenters as iaa
import torch.optim as optim
import torch

# Enable cuDNN's benchmark mode (it times candidate convolution algorithms and
# reuses the fastest one).
# NOTE(review): this normally pays off only when input shapes stay constant; the
# pipelines in this notebook resize images to hyper_params["img_dim"] — confirm
# that every batch really has the same shape.
torch.backends.cudnn.benchmark = True

from dataloader import get_dataloader, get_balanced_dataloader
from preprocess import transformer, img_transform, reverse_img_transform
from models import RetinaNet
from loss import FocalLoss
from train import Trainer, load_components
from inference import InferenceTransform
from utils import Visualiser
from callbacks import Callback
import pickle
import gc
import json

from apex import amp

import argparse

# Run a full (generation 2) garbage-collection pass. Because this is the last
# expression of the cell, its return value — the number of unreachable objects
# found — is what the notebook shows as the cell output.
gc.collect(2)



5

In [3]:
# The config location is hard-coded for interactive use. The commented-out lines
# below keep the argparse equivalent for running this notebook as a script:
#
# parser = argparse.ArgumentParser(description='Train Retinanet')
# parser.add_argument('--config-file-dir', type=str, help='config file location')
# args = parser.parse_args()

# Path to the JSON experiment config (a file, despite the `_dir` suffix: it is
# passed straight to open() in the next cell).
configs_dir = "configs/balanced_subset0_2_config.json" # script form: args.config_file_dir

In [4]:
# Parse the experiment configuration. The context manager closes the file handle
# as soon as the JSON has been read, instead of leaving it open until the file
# object happens to be garbage-collected.
with open(configs_dir, 'r') as config_file:
    parameters = json.load(config_file)

## 1. Parameters

In [5]:
# Split the loaded config into its three top-level sections.
hyper_params, dir_params, project_params = (
    parameters[section] for section in ("hyperparams", "dirparams", "projectconfig")
)

# Project / experiment identifiers; both are handed to the Callback later on.
project_name, experiment_name = (
    project_params[field] for field in ("project_name", "experiment_name")
)

## 2. Initialisations

In [6]:
# Class lookup tables stored as JSON. Each file is read inside a context manager
# so its handle is closed as soon as parsing finishes.
with open(dir_params["clsids_to_names_dir"], 'r') as names_file:
    clsids_to_names = json.load(names_file)
with open(dir_params["clsids_to_idx_dir"], 'r') as idx_file:
    clsids_to_idx = json.load(idx_file)

# Invert class-id -> index, then chain through clsids_to_names to obtain
# index -> class name.
idx_to_cls_ids = {idx: cls_id for cls_id, idx in clsids_to_idx.items()}
idx_to_names = {idx: clsids_to_names[cls_id] for idx, cls_id in idx_to_cls_ids.items()}

## 3. Dataloaders

In [7]:
# Final square side length of every image, plus a 5% larger size that the training
# pipeline resizes to before it crops back down.
img_dim = hyper_params["img_dim"]
enlarged_dim = int(img_dim * 1.05)

# Training augmentation: upscale, jitter gamma, small rotate/scale, horizontal flip
# with probability 0.5, crop up to 5% per side, then resize to the final size.
train_seq = iaa.Sequential([
    iaa.Resize({"height": enlarged_dim, "width": enlarged_dim}),
    iaa.GammaContrast((0.9, 1.1)),
    iaa.Affine(rotate=(-5, 5), scale=(0.90, 1.10)),
    iaa.Fliplr(0.5),
    iaa.CropAndPad(percent=(-0.05, 0.00)),
    iaa.Resize({"height": img_dim, "width": img_dim}),
])
train_transform_fn = transformer(train_seq, img_transform)

# NOTE(review): the two positional booleans are presumably shuffle / drop_last —
# confirm against get_balanced_dataloader's signature.
train_dl = get_balanced_dataloader(
    dir_params["train_images_dir"],
    dir_params["train_bbox_dir"],
    dir_params["train_dict_clsid_to_list_imgs_dir"],
    dir_params["dict_distributions"],
    clsids_to_idx,
    dir_params["num_items"],
    train_transform_fn,
    hyper_params["bs"],
    True,
    hyper_params["num_workers"],
    True,
)

# Validation only resizes to the final size; no random augmentation.
valid_seq = iaa.Sequential([iaa.Resize({"height": img_dim, "width": img_dim})])
valid_transform_fn = transformer(valid_seq, img_transform)
valid_dl = get_dataloader(
    dir_params["valid_images_dir"],
    dir_params["valid_bbox_dir"],
    dir_params["valid_idx_to_id_dir"],
    clsids_to_idx,
    valid_transform_fn,
    hyper_params["bs"],
    False,
    hyper_params["num_workers"],
    False,
)

## 4. Loss

In [8]:
# FocalLoss takes its settings positionally; they are looked up from the
# hyper-parameter config in exactly this order.
focal_loss_keys = ("alpha", "gamma", "IoU_bkgrd", "IoU_pos", "regress_factor", "device")
criterion = FocalLoss(*(hyper_params[key] for key in focal_loss_keys))

## 5. Model

In [9]:
# Positional constructor arguments, read from the config in this order.
retinanet_args = [
    hyper_params[key] for key in ("backbone", "num_classes", "ratios", "scales")
]
# Keyword options: the first three come from the config, the rest are fixed here.
retinanet_options = {
    "device": hyper_params["device"],
    "pretrained": hyper_params["pretrained"],
    "freeze_bn": hyper_params["freeze_bn"],
    "prior": 0.01,
    "feature_size": 256,
    "pyramid_levels": [3, 4, 5, 6, 7],
    "criterion": criterion,
}
retinanet = RetinaNet(*retinanet_args, **retinanet_options)

In [10]:
def set_parameter_requires_grad(model, trainable_prefixes=("fpn", "regressionModel", "classificationModel")):
    """Freeze every parameter that does not belong to a trainable top-level submodule.

    A parameter is kept trainable when the first dotted component of its name (as
    reported by ``model.named_parameters()``) is listed in ``trainable_prefixes``.
    All other parameters get ``requires_grad = False``. Parameters that are already
    frozen are never re-enabled.

    Args:
        model: Object exposing ``named_parameters()`` that yields ``(name, param)``
            pairs, e.g. a ``torch.nn.Module``.
        trainable_prefixes: Top-level attribute names whose parameters stay
            trainable. A single string is treated as one name. Defaults to the
            FPN and the two heads used by this notebook.

    Returns:
        None. The model is modified in place.
    """
    # A bare string would otherwise be split into characters by frozenset().
    if isinstance(trainable_prefixes, str):
        trainable_prefixes = (trainable_prefixes,)
    trainable = frozenset(trainable_prefixes)

    for name, param in model.named_parameters():
        # Only the first component is compared, e.g. "fpn.P5_1.weight" -> "fpn".
        top_level = name.partition('.')[0]
        if top_level not in trainable:
            param.requires_grad = False

# When fine-tuning is requested, freeze all parameters except those under the
# model's `fpn`, `regressionModel` and `classificationModel` submodules.
# NOTE(review): `== True` only matches values equal to True (a bool, or 1);
# presumably the config stores a JSON boolean — confirm before simplifying this
# to a plain truthiness test.
if hyper_params["fine_tune"]==True:
    set_parameter_requires_grad(retinanet)
# Move the model to the configured device.
retinanet = retinanet.to(hyper_params["device"])

## 6. Optimizer

In [11]:
# SGD with momentum and weight decay; the base learning rate comes from the config.
optimizer = optim.SGD(
    retinanet.parameters(),
    lr=hyper_params["lr"],
    momentum=0.9,
    weight_decay=1e-4,
)

# Disabled alternative, kept for reference:
#   optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
#       factor=hyper_params['decay_factor'], patience=hyper_params["patience"],
#       verbose=True, min_lr=hyper_params["min_lr"])

# Cyclic learning rate between `lr` and 10x `lr`: the ramp up takes 30% and the ramp
# down 70% of (batches per epoch x epochs) scheduler steps.
# NOTE(review): this spans a single cycle over the run only if the trainer steps
# the scheduler once per batch — confirm in train.py.
# NOTE(review): no `gamma` is passed; with CyclicLR's default gamma of 1.0 the
# 'exp_range' scaling factor stays at 1 — confirm whether a decay was intended.
total_batches = len(train_dl) * hyper_params["epochs"]
scheduler = torch.optim.lr_scheduler.CyclicLR(
    optimizer,
    base_lr=hyper_params["lr"],
    max_lr=hyper_params["lr"] * 10,
    step_size_up=int(total_batches * 0.3),
    step_size_down=int(total_batches * 0.7),
    mode='exp_range',
)

## 7. Visualisations

In [12]:
# The Visualiser receives, positionally: the class count, the index -> class-name
# table, and reverse_img_transform (presumably the inverse of img_transform —
# confirm in preprocess.py).
vis = Visualiser(hyper_params["num_classes"], idx_to_names, reverse_img_transform)


## 8. Inference

In [13]:
# The inference transform receives, positionally: index -> class name,
# index -> class id, and the same regress_factor that is given to the loss.
inferencer = InferenceTransform(idx_to_names, idx_to_cls_ids, hyper_params["regress_factor"])

## 9. Prepare Training

In [14]:
# Hand model, optimizer and scheduler to load_components together with the
# configured checkpoint path (presumably this restores their saved state — the
# cell output reports a checkpoint being loaded), then wrap model and optimizer
# for mixed-precision training at apex opt level O1.
# NOTE(review): apex's checkpointing guidance recommends calling the
# load_state_dict methods after amp.initialize; here loading happens first —
# confirm this ordering is intentional.
load_components(retinanet, optimizer, scheduler, hyper_params["checkpoint_dir"])
retinanet, optimizer = amp.initialize(retinanet, optimizer, opt_level="O1")

# Replicate the model across every visible GPU when more than one is present.
if torch.cuda.device_count() > 1:
    print("Using Multiple GPUs")
    retinanet = torch.nn.DataParallel(retinanet, device_ids=range(torch.cuda.device_count())) 
# NOTE(review): the model was already moved to this device in the model cell;
# this second .to() looks redundant — confirm before removing.
retinanet = retinanet.to(hyper_params["device"])

# Callback built from the project/experiment names, the full hyper-parameter dict
# and the save directory (presumably handles experiment logging — a Neptune run
# URL is printed when this cell executes).
cb = Callback(project_name, experiment_name, hyper_params, hyper_params["save_dir"])

# Evaluation thresholds forwarded to the Trainer.
eval_params = {
    "overlap":hyper_params["overlap"],
    "cls_thresh":hyper_params["cls_thresh"]
}

# Bundle the model, data loaders, optimisation objects and helpers for training.
trainer = Trainer(
        retinanet, 
        train_dl, 
        valid_dl, 
        optimizer, 
        scheduler, 
        criterion, 
        hyper_params["device"],
        inferencer, 
        hyper_params["num_classes"],
        eval_params,
        cb,
        vis
    )

Loading from checkpoint: temp_subset0/final.pth
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Using Multiple GPUs
OBV-23
https://ui.neptune.ml/gatletag/ObjectDetectionVM/e/OBV-23


## 10. Training

In [15]:
# Run training for the number of epochs set in the config.
total_epochs = hyper_params["epochs"]
trainer.train(total_epochs)

Started Training
Started Epoch: 0
0:00:20.311298 : 0 / 625
Saving checkpoint to: temp_subset1/batch-0.pth
0:00:25.714676 : 10 / 625
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
0:00:31.519499 : 20 / 625
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
0:00:37.078363 : 30 / 625
0:00:42.578318 : 40 / 625
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4096.0
0:00:48.336377 : 50 / 625
0:00:53.855667 : 60 / 625
0:00:59.547722 : 70 / 625
0:01:05.576483 : 80 / 625
0:01:11.284652 : 90 / 625
0:01:16.810005 : 100 / 625
0:01:23.010627 : 110 / 625
0:01:28.634089 : 120 / 625
0:01:34.398034 : 130 / 625
0:01:39.670341 : 140 / 625
0:01:45.439549 : 150 / 625
0:01:51.283162 : 160 / 625
0:02:00.759345 : 170 / 625
0:02:06.486337 : 180 / 625
0:02:12.206102 : 190 / 625
0:02:17.756951 : 200 / 625
0:02:23.628626 : 210 / 625
0:02:29.384763 : 2

0:06:00.318225 : 595 / 625
0:06:06.058796 : 605 / 625
0:06:11.238306 : 615 / 625
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 512.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 256.0
Ended Train Epoch in (hours): 0:06:16.007337
Saving checkpoint to: temp_subset1/epoch-3.pth
[VALID] 0:00:05.667291 : 0 / 7
Epoch completed in (hours): 0:06:24.793316
aPs:  {'background': 0.0, 'Butterfly': 0.7942650082427809, 'Ladybug': 0.2976648106580695, 'Caterpillar': 0.3996718386768343}
Started Epoch: 4
0:00:07.892249 : 0 / 625
0:00:14.791538 : 10 / 625
0:00:20.636052 : 20 / 625
0:00:26.272409 : 30 / 625
0:00:32.201145 : 40 / 625
0:00:41.393283 : 50 / 625
0:00:46.839903 : 60 / 625
0:00:52.461734 : 70 / 625
0:00:58.234641 : 80 / 625
0:01:04.140579 : 90 / 625
0:01:09.973421 : 100 / 625
0:01:15.837182 : 110 / 625
0:01:21.433337 : 120 / 625
0:01:27.097383 : 130 / 625
0:01:32.953521 : 140 / 625
0:01:38.812258 : 150 / 625
0:01:44.306968 : 160 / 625
0:01:50