In [None]:
import torch
import torchvision
import albumentations as A
from albumentations.pytorch import ToTensorV2
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
!pip3 install git+https://github.com/rpmcruz/objdetect.git

In [None]:
import objdetect as od

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

## Data augmentation

Let's use [Albumentations](https://albumentations.ai/) for this purpose.

In [None]:
# Augmentation pipeline: resize slightly past the 256x256 target, take a random
# 256x256 crop, randomly flip, jitter brightness/contrast, normalize and convert
# the image to a tensor.
transform = A.Compose(
    [
        A.Resize(height=int(256*1.1), width=int(256*1.1)),
        A.RandomCrop(height=256, width=256),
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1, p=1),
        A.Normalize(),
        ToTensorV2(),
    ],
    # the bounding boxes are transformed together with the image; their labels
    # travel in the 'classes' field
    bbox_params=A.BboxParams(format='albumentations', label_fields=['classes']),
)

# Undoes a normalization with mean m and std s by normalizing with mean=-m/s and std=1/s.
# NOTE(review): assumes the defaults of A.Normalize() are the mean/std values used here -- confirm.
inv_normalize = torchvision.transforms.Normalize(
    mean=(-0.485/0.229, -0.456/0.224, -0.406/0.225),
    std=(1/0.229, 1/0.224, 1/0.225),
)

## Dataset loader

Here, we will subclass the dataset class that comes with TorchVision. Something to keep in mind is the format and units of your bounding boxes. <u>We recommend using the 0-1 normalized x1y1x2y2 format.</u>

In [None]:
class MyVOCDetection(torchvision.datasets.VOCDetection):
    """VOC detection dataset whose samples are dictionaries.

    Every sample is a dict with the keys ``'image'`` (the image as a NumPy
    array), ``'bboxes'`` (x1y1x2y2 boxes divided by the image width/height) and
    ``'classes'`` (indices into ``labels``). When a ``dict_transform`` is
    provided, it is called with the dict entries as keyword arguments and its
    result is returned instead.
    """

    # class names; the class index of an object is the position of its name here
    labels = ['person', 'bird', 'cat', 'cow', 'dog', 'horse', 'sheep', 'aeroplane', 'bicycle', 'boat', 'bus', 'car', 'motorbike', 'train', 'bottle', 'chair', 'diningtable', 'pottedplant', 'sofa', 'tvmonitor']

    def __init__(self, root, split, dict_transform=None, download=False):
        """Forward ``root``/``split``/``download`` to ``VOCDetection`` and keep the transform."""
        super().__init__(root, image_set=split, download=download)
        self.dict_transform = dict_transform

    def __getitem__(self, i):
        """Return sample ``i`` as a dict (after ``dict_transform``, if one was given)."""
        image, xml = super().__getitem__(i)
        width, height = image.size
        rows = []
        label_ids = []
        for obj in xml['annotation']['object']:
            box = obj['bndbox']
            # scale the pixel coordinates into the 0-1 range
            rows.append((
                float(box['xmin']) / width,
                float(box['ymin']) / height,
                float(box['xmax']) / width,
                float(box['ymax']) / height,
            ))
            label_ids.append(self.labels.index(obj['name']))
        sample = {
            'image': np.array(image),
            'bboxes': torch.tensor(rows),
            'classes': torch.tensor(label_ids),
        }
        if not self.dict_transform:
            return sample
        return self.dict_transform(**sample)

In [None]:
# 'train' split stored under ./data, with the augmentation pipeline applied to every sample
ds = MyVOCDetection(root='data', split='train', dict_transform=transform, download=True)

Let's look at the first sample:

In [None]:
# Fetch one sample and list the fields it contains
sample_index = 0
d = ds[sample_index]
print(d.keys())

In [None]:
# Undo the normalization for display and show class names instead of indices
sample_image = inv_normalize(d['image'])
sample_labels = [ds.labels[k] for k in d['classes']]
od.utils.plot(sample_image, d['bboxes'], sample_labels, grid=(8, 8))
plt.show()

Naturally, the number of bounding boxes varies from image to image, so they cannot be stacked into a single batch tensor. We therefore need to specify a `collate_fn` function that defines how the batches should be created.

In [None]:
# Shuffled batches of 16 samples, assembled by objdetect's collate function
tr = torch.utils.data.DataLoader(
    dataset=ds,
    batch_size=16,
    shuffle=True,
    collate_fn=od.utils.collate_fn,
)

## Model

We will prepare a one-stage model that, for each location in the grid, predicts whether there is an object (score) and, if so, the object's class and bounding box. Like the object detection models that come with torchvision (see e.g. [FCOS](https://pytorch.org/vision/stable/models/generated/torchvision.models.detection.fcos_resnet50_fpn.html#torchvision.models.detection.fcos_resnet50_fpn)), the behavior depends on whether the model is in `train` or `eval` mode, but we don't do exactly what they do. In `train` mode, we return the *unprocessed* scores/bboxes/classes grids. In `eval` mode, we return the *processed* bboxes/classes in the form of lists.

![](model.svg)

In [None]:
class MyModel(torch.nn.Module):
    """One-stage detector: a ResNet-50 backbone followed by three 1x1 conv heads.

    For every location of the backbone's output grid the heads predict an
    objectness score, the class logits and four bounding-box values.
    """

    def __init__(self, num_classes=20):
        """Build the backbone and the prediction heads.

        Args:
            num_classes: number of output channels of the classification head
                (defaults to 20, the number of entries in the VOC label list).
        """
        super().__init__()
        resnet = torchvision.models.resnet50(weights='DEFAULT')
        # number of channels produced by the last convolutional stage (2048 for
        # ResNet-50), read from the classifier instead of being hard-coded
        in_channels = resnet.fc.in_features
        # drop the last two children (pooling + classifier) to keep a spatial feature grid
        self.backbone = torch.nn.Sequential(*list(resnet.children())[:-2])
        self.scores = torch.nn.Conv2d(in_channels, 1, 1)
        self.classes = torch.nn.Conv2d(in_channels, num_classes, 1)
        self.bboxes = torch.nn.Conv2d(in_channels, 4, 1)

    def forward(self, x, threshold=0.5):
        """Run the detector on a batch of images.

        Args:
            x: batch of input images.
            threshold: minimum sigmoid score for a grid location to be kept
                (only used in evaluation mode).

        Returns:
            In training mode, the raw ``(scores, bboxes, classes)`` grids.
            In evaluation mode, ``(bboxes, classes)`` as returned by
            ``od.post.NMS``.
        """
        x = self.backbone(x)
        scores = self.scores(x)
        classes = self.classes(x)
        bboxes = self.bboxes(x)
        if not self.training:
            # when in evaluation mode, convert the output grid into a list of bboxes/classes
            scores = torch.sigmoid(scores)
            hasobjs = scores >= threshold
            scores = od.grid.inv_scores(hasobjs, scores)
            bboxes = od.grid.inv_offset_logsize_bboxes(hasobjs, bboxes)
            classes = od.grid.inv_classes(hasobjs, classes)
            bboxes, classes = od.post.NMS(scores, bboxes, classes)
            return bboxes, classes
        return scores, bboxes, classes

## Training

In [None]:
epochs = 10
model = MyModel().to(device)

# the score loss is reduced to a scalar by the criterion itself ...
scores_loss = torch.nn.BCEWithLogitsLoss()
# ... while the bbox/class losses are kept per element (reduction='none'),
# since the training loop weights them by the target scores before averaging
bboxes_loss = torch.nn.MSELoss(reduction='none')
classes_loss = torch.nn.CrossEntropyLoss(reduction='none')

optimizer = torch.optim.Adam(params=model.parameters())

In [None]:
model.train()
grid_shape = (8, 8)    # size of the target grids
num_batches = len(tr)  # used to average the per-batch losses over one epoch
for epoch in range(epochs):
    avg_loss = 0
    for imgs, targets in tqdm(tr):
        imgs = imgs.to(device)
        preds_scores, preds_bboxes, preds_classes = model(imgs)

        # turn the ground-truth boxes/classes of the batch into target grids
        slices = od.grid.slices_center_locations(*grid_shape, targets['bboxes'])
        scores = od.grid.scores(*grid_shape, slices, device=device)
        bboxes = od.grid.offset_logsize_bboxes(*grid_shape, slices, targets['bboxes'], device=device)
        classes = od.grid.classes(*grid_shape, slices, targets['classes'], device=device)

        # the bbox/class losses are weighted by the target scores before averaging
        # NOTE(review): this relies on `scores` broadcasting against the per-element
        # losses -- confirm the shapes returned by od.grid
        objectness_term = scores_loss(preds_scores, scores)
        bboxes_term = (scores * bboxes_loss(preds_bboxes, bboxes)).mean()
        classes_term = (scores * classes_loss(preds_classes, classes)).mean()
        loss_value = objectness_term + bboxes_term + classes_term

        optimizer.zero_grad()
        loss_value.backward()
        optimizer.step()
        avg_loss = avg_loss + float(loss_value) / num_batches
    print(f'Epoch {epoch+1}/{epochs} - Avg loss: {avg_loss}')

In [None]:
# Switch to evaluation mode so the model returns processed bboxes/classes
model.eval()
imgs = torch.stack([ds[i]['image'] for i in range(12)])
# Inference only: disable autograd so no computation graph is built or kept in memory
with torch.no_grad():
    bboxes, classes = model(imgs.to(device))

In [None]:
# Show the 12 images on a 3x4 panel together with their predicted boxes and class indices
for i, img in enumerate(imgs):
    plt.subplot(3, 4, i + 1)
    predicted_boxes = bboxes[i].detach().cpu()
    predicted_classes = [int(k) for k in classes[i]]
    od.utils.plot(inv_normalize(img), predicted_boxes, predicted_classes)
plt.show()