# Dataset

In [1]:
import os
import torchvision.transforms as T


# --- Paths ---
# Root folder handed to torchvision's VOCDetection; also holds the class-name mapping file.
DATA_PATH = 'data'
CLASSES_PATH = os.path.join(DATA_PATH, 'classes.json')

# --- Training hyper-parameters ---
BATCH_SIZE = 32
EPOCHS = 135
WARMUP_EPOCHS = 0           # Added to EPOCHS for the total epoch count and to the LR-schedule milestones
LEARNING_RATE = 1E-4

# --- Numerical / image settings ---
EPSILON = 1E-6              # Small constant used to avoid division by zero and sqrt(0)
IMAGE_SIZE = (88, 88)       # Size every image is resized to; bounding boxes are rescaled to match

# --- YOLO grid layout: each grid cell stores C class scores followed by B boxes of 5 values ---
S = 4       # Divide each image into a SxS grid
B = 2       # Number of bounding boxes to predict
C = 1      # Number of classes in the dataset

In [2]:
import torch
import json
import os
import matplotlib.patches as patches
import torchvision.transforms as T
from PIL import ImageDraw, ImageFont
from matplotlib import pyplot as plt


def get_iou(p, a):
    """Computes the IOU of every box in `p` against every box in `a`, cell by cell.

    Both inputs are indexed as (batch, S, S, depth) tensors in the C + 5*B layout
    read by `bbox_attr`. The result has shape (batch, S, S, B, B): entry
    [..., i, j] is the IOU of the i-th box of `p` with the j-th box of `a`.
    Pairs whose union is exactly zero yield an IOU of 0.
    """
    pred_tl, pred_br = bbox_to_coords(p)        # each (batch, S, S, B, 2)
    true_tl, true_br = bbox_to_coords(a)

    pair_shape = (-1, -1, -1, B, B, 2)

    def as_pairs(pred_corner, true_corner):
        # Broadcast so that axis 3 walks over boxes of `p` and axis 4 over boxes of `a`
        return (
            pred_corner.unsqueeze(4).expand(pair_shape),
            true_corner.unsqueeze(3).expand(pair_shape),
        )

    # The intersection rectangle spans from the larger top-left to the smaller bottom-right
    inter_tl = torch.max(*as_pairs(pred_tl, true_tl))
    inter_br = torch.min(*as_pairs(pred_br, true_br))

    # Negative side lengths mean the boxes do not overlap on that axis
    sides = torch.clamp(inter_br - inter_tl, min=0.0)
    intersection = sides[..., 0] * sides[..., 1]                # (batch, S, S, B, B)

    pred_area = (bbox_attr(p, 2) * bbox_attr(p, 3)).unsqueeze(4).expand_as(intersection)
    true_area = (bbox_attr(a, 2) * bbox_attr(a, 3)).unsqueeze(3).expand_as(intersection)

    union = pred_area + true_area - intersection

    # Guard against 0 / 0: force such pairs to 0 / EPSILON
    degenerate = (union == 0.0)
    union[degenerate] = EPSILON
    intersection[degenerate] = 0.0

    return intersection / union


def bbox_to_coords(t):
    """Converts boxes from center format [x, y, width, height] to corner pairs.

    Returns a tuple (top_left, bottom_right); each stacks its (x, y) pair along a
    new axis 4, so a 4-D input produces two 5-D tensors with a trailing size of 2.
    """
    center_x, center_y = bbox_attr(t, 0), bbox_attr(t, 1)
    box_w, box_h = bbox_attr(t, 2), bbox_attr(t, 3)

    top_left = torch.stack((center_x - box_w / 2.0, center_y - box_h / 2.0), dim=4)
    bottom_right = torch.stack((center_x + box_w / 2.0, center_y + box_h / 2.0), dim=4)
    return top_left, bottom_right


def scheduler_lambda(epoch):
    """Returns the learning-rate multiplier for `epoch`.

    The multiplier is 1 for the first 75 epochs after warm-up, 0.1 until
    epoch 105 after warm-up, and 0.01 from then on.
    """
    # (milestone measured from the end of warm-up, multiplier used before that milestone)
    for milestone, factor in ((75, 1), (105, 0.1)):
        if epoch < WARMUP_EPOCHS + milestone:
            return factor
    return 0.01


def load_class_dict():
    """Loads the class mapping stored at CLASSES_PATH.

    If the file does not exist yet, an empty mapping is written to disk and returned.
    """
    if not os.path.exists(CLASSES_PATH):
        empty_mapping = {}
        save_class_dict(empty_mapping)
        return empty_mapping

    with open(CLASSES_PATH, 'r') as handle:
        return json.load(handle)


def load_class_array():
    """Returns the class names as a list, positioned by the index stored for each name."""
    name_to_index = load_class_dict()
    names = [None] * len(name_to_index)
    for name, index in name_to_index.items():
        names[index] = name
    return names


def save_class_dict(obj):
    """Writes `obj` as indented JSON to CLASSES_PATH, creating the parent folder if needed.

    Args:
        obj: JSON-serialisable object (the class-name -> index mapping).
    """
    folder = os.path.dirname(CLASSES_PATH)
    # dirname is '' when CLASSES_PATH has no directory part; os.makedirs('') would raise.
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    if folder:
        os.makedirs(folder, exist_ok=True)
    with open(CLASSES_PATH, 'w') as file:
        json.dump(obj, file, indent=2)


def get_dimensions(label):
    """Returns the (width, height) of the annotated image as integers."""
    size_info = label['annotation']['size']
    return tuple(int(size_info[key]) for key in ('width', 'height'))


def get_bounding_boxes(label):
    """Extracts every annotated object as (name, (x_min, x_max, y_min, y_max)).

    Coordinates are rescaled from the original image size to IMAGE_SIZE and
    truncated to integers. Note the tuple order: both x values come first.
    """
    src_width, src_height = get_dimensions(label)
    x_scale = IMAGE_SIZE[0] / src_width
    y_scale = IMAGE_SIZE[1] / src_height

    # Annotation key paired with the scale factor of its axis, in output order
    fields = (('xmin', x_scale), ('xmax', x_scale), ('ymin', y_scale), ('ymax', y_scale))

    found = []
    for entry in label['annotation']['object']:
        raw_box = entry['bndbox']
        scaled = tuple(int(int(raw_box[key]) * scale) for key, scale in fields)
        found.append((entry['name'], scaled))
    return found


def bbox_attr(data, i):
    """Returns the Ith attribute of each bounding box in data.

    The last axis holds C class entries followed by boxes of 5 values each, so
    attribute I of successive boxes sits at offsets C + I, C + I + 5, ...
    """
    return data[..., slice(C + i, None, 5)]


def scale_bbox_coord(coord, center, scale):
    """Scales the distance between `coord` and `center` by `scale`, keeping `center` fixed."""
    offset = coord - center
    return offset * scale + center


def get_overlap(a, b):
    """Returns proportion overlap between two boxes in the form (tl, width, height, confidence, class).

    The overlap is the intersection area divided by each box's own area; the larger
    of the two ratios is returned, so a small box fully inside a big one scores 1.0.
    A box with zero area contributes a ratio of 0.

    Args:
        a, b: Boxes as (tl, width, height, confidence, class), where `tl` is an
            (x, y) pair. Numeric fields may be Python numbers or 0-dim tensors.

    Returns:
        The overlap proportion as a Python float.
    """

    def unpack(box):
        tl, width, height, _, _ = box
        # float() accepts both plain numbers and 0-dim tensors, so the arithmetic
        # below never mixes tensors with Python scalars (torch.max rejects that mix).
        return float(tl[0]), float(tl[1]), float(width), float(height)

    a_x, a_y, a_width, a_height = unpack(a)
    b_x, b_y, b_width, b_height = unpack(b)

    # Intersection rectangle: largest top-left corner to smallest bottom-right corner
    overlap_w = max(0.0, min(a_x + a_width, b_x + b_width) - max(a_x, b_x))
    overlap_h = max(0.0, min(a_y + a_height, b_y + b_height) - max(a_y, b_y))
    intersection = overlap_w * overlap_h

    def covered_fraction(area):
        # A zero-area box cannot be covered; this also avoids dividing by zero
        return 0.0 if area == 0 else intersection / area

    return max(
        covered_fraction(a_width * a_height),
        covered_fraction(b_width * b_height)
    )


In [3]:
def plot_boxes(data, labels, classes, color='orange', min_confidence=0.2, max_overlap=0.5, file=None):
    """Plots bounding boxes on the given image.

    Decodes the boxes stored in `labels`, keeps those whose confidence exceeds
    `min_confidence`, applies non-maximum suppression and draws the survivors.

    Args:
        data: Image tensor indexed as (channels, height, width); converted with ToPILImage.
        labels: Grid tensor indexed as (rows, cols, C + 5 * boxes_per_cell).
        classes: Currently unused; every box is labelled 'car' (class index 0).
        color: Outline colour of the boxes and fill colour of the text background.
        min_confidence: Boxes with class score * box confidence at or below this are skipped.
        max_overlap: Boxes overlapping an already kept box by more than this are discarded.
        file: Output path. When None the image is shown instead of saved;
            '.png' is appended when the path does not already end with it.
    """
    grid_size_x = data.size(dim=2) / S
    grid_size_y = data.size(dim=1) / S
    m = labels.size(dim=0)
    n = labels.size(dim=1)
    boxes_per_cell = (labels.size(dim=2) - C) // 5

    bboxes = []
    for i in range(m):
        for j in range(n):
            for k in range(boxes_per_cell):
                bbox_start = 5 * k + C
                bbox_end = 5 * (k + 1) + C
                bbox = labels[i, j, bbox_start:bbox_end]
                class_index = 0
                confidence = labels[i, j, class_index].item() * bbox[4].item()          # pr(c) * IOU
                if confidence > min_confidence:
                    width = bbox[2] * IMAGE_SIZE[0]
                    height = bbox[3] * IMAGE_SIZE[1]
                    # Box centre is stored relative to its grid cell (row i, column j)
                    tl = (
                        bbox[0] * IMAGE_SIZE[0] + j * grid_size_x - width / 2,
                        bbox[1] * IMAGE_SIZE[1] + i * grid_size_y - height / 2
                    )
                    bboxes.append([tl, width, height, confidence, class_index])

    # Sort by highest to lowest confidence
    bboxes = sorted(bboxes, key=lambda x: x[3], reverse=True)

    # Calculate the overlap between each pair of boxes
    num_boxes = len(bboxes)
    overlap = [[0 for _ in range(num_boxes)] for _ in range(num_boxes)]
    for i in range(num_boxes):
        for j in range(num_boxes):
            overlap[i][j] = get_overlap(bboxes[i], bboxes[j])

    # Non-maximum suppression and render image
    image = T.ToPILImage()(data)
    draw = ImageDraw.Draw(image)
    discarded = set()
    for i in range(num_boxes):
        if i not in discarded:
            tl, width, height, confidence, class_index = bboxes[i]

            # Discard other boxes of the same class that overlap this one too much
            for j in range(num_boxes):
                other_class = bboxes[j][4]
                if j != i and other_class == class_index and overlap[i][j] > max_overlap:
                    discarded.add(j)

            # Annotate image; hand PIL plain floats rather than 0-dim tensors
            left, top = float(tl[0]), float(tl[1])
            right, bottom = left + float(width), top + float(height)
            draw.rectangle(((left, top), (right, bottom)), outline=color)
            text_pos = (max(0, left), max(0, top - 11))
            text = f'car {round(confidence * 100, 1)}%'
            text_bbox = draw.textbbox(text_pos, text)
            draw.rectangle(text_bbox, fill=color)
            draw.text(text_pos, text)
    if file is None:
        image.show()
    else:
        output_dir = os.path.dirname(file)
        # dirname is '' for a bare file name; os.makedirs('') would raise
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        if not file.endswith('.png'):
            file += '.png'
        image.save(file)

In [4]:
import torch
import random
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from tqdm import tqdm
from torchvision.datasets.voc import VOCDetection
from torch.utils.data import Dataset


class YoloPascalVocDataset(Dataset):
    """Pascal VOC 2012 detection data restricted to images that contain a 'car'.

    Each item is a triple (image, ground_truth, original_image):
      * image: the resized image tensor, normalised when `normalize` is set;
      * ground_truth: an (S, S, 5 * B + C) tensor holding C class entries followed
        by B boxes of (x, y, width, height, confidence) for every grid cell;
      * original_image: the resized image tensor before normalisation.
    """

    def __init__(self, set_type, normalize=False, augment=False):
        """Loads the VOC split ('train' -> 'train', 'test' -> 'val') and indexes the car images.

        `augment` is stored but not used by the code in this class.
        """
        assert set_type in {'train', 'test'}
        voc_split = 'train' if set_type == 'train' else 'val'
        to_resized_tensor = T.Compose([
            T.ToTensor(),
            T.Resize(IMAGE_SIZE)
        ])
        self.dataset = VOCDetection(
            root=DATA_PATH,
            year='2012',
            image_set=voc_split,
            download=True,
            transform=to_resized_tensor
        )
        self.normalize = normalize
        self.augment = augment
        self.classes = {"car": 0}

        # Keep only the positions of samples with at least one 'car' annotation
        self.indices = [
            position
            for position in range(len(self.dataset))
            if any(
                name == 'car'
                for name, _ in get_bounding_boxes(self.dataset[position][1])
            )
        ]

    def __getitem__(self, i):
        """Returns (image, ground_truth, original_image) for the i-th car image."""
        image, annotation = self.dataset[self.indices[i]]
        original_data = image
        data = image
        if self.normalize:
            data = TF.normalize(image, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        # Image tensors are (channels, height, width)
        cell_width = data.size(dim=2) / S
        cell_height = data.size(dim=1) / S

        boxes_in_cell = {}                  # (row, col) -> number of box slots already filled
        class_of_cell = {}                  # (row, col) -> class name the cell is assigned to
        ground_truth = torch.zeros((S, S, 5 * B + C))

        for name, coords in get_bounding_boxes(annotation):
            if name != 'car':
                continue
            assert name in self.classes, f"Unrecognized class '{name}'"
            class_index = self.classes[name]
            x_min, x_max, y_min, y_max = coords

            # Grid cell that contains the centre of the box
            mid_x = (x_max + x_min) / 2
            mid_y = (y_max + y_min) / 2
            col = int(mid_x // cell_width)
            row = int(mid_y // cell_height)
            if not (0 <= col < S and 0 <= row < S):
                continue

            # A cell keeps the class it was first assigned; other classes are skipped
            cell = (row, col)
            if cell in class_of_cell and name != class_of_cell[cell]:
                continue

            one_hot = torch.zeros(C)
            one_hot[class_index] = 1.0
            ground_truth[row, col, :C] = one_hot
            class_of_cell[cell] = name

            slot = boxes_in_cell.get(cell, 0)
            if slot >= B:
                continue

            bbox_truth = (
                (mid_x - col * cell_width) / IMAGE_SIZE[0],      # X offset from the cell's left edge
                (mid_y - row * cell_height) / IMAGE_SIZE[1],     # Y offset from the cell's top edge
                (x_max - x_min) / IMAGE_SIZE[0],                 # Width
                (y_max - y_min) / IMAGE_SIZE[1],                 # Height
                1.0                                              # Confidence
            )

            # Copy this box into its own slot and every slot after it, leaving earlier
            # slots untouched, so no trailing slot stays all-zero (a "dead" box).
            first_value = 5 * slot + C
            ground_truth[row, col, first_value:] = torch.tensor(bbox_truth).repeat(B - slot)
            boxes_in_cell[cell] = slot + 1

        return data, ground_truth, original_data

    def __len__(self):
        """Number of images that contain at least one car."""
        return len(self.indices)


# Display data
# Sanity check: build the training split and print the first (image, ground truth) pair.
obj_classes = load_class_array()
train_set = YoloPascalVocDataset('train', normalize=True, augment=False)

# Running statistics over the samples visited by the loop below.
# NOTE: the `break` stops after the first sample, so they only ever cover one item.
negative_labels = 0
smallest = 0
largest = 0
for data, label, _ in train_set:
    negative_labels += torch.sum(label < 0).item()          # Count of negative ground-truth entries
    smallest = min(smallest, torch.min(data).item())
    largest = max(largest, torch.max(data).item())
    print(data)
    print(label)
    break
    # plot_boxes(data, label, obj_classes, max_overlap=float('inf'))
# print('num_negatives', negative_labels)
# print('dist', smallest, largest)

Using downloaded and verified file: data/VOCtrainval_11-May-2012.tar
Extracting data/VOCtrainval_11-May-2012.tar to data
tensor([[[ 1.1490, -0.6957, -0.7360,  ..., -0.5899, -1.0907, -1.5877],
         [ 1.5868, -0.5431, -0.6931,  ..., -0.9154, -1.3376, -1.5065],
         [ 1.8262,  0.2371,  0.4580,  ..., -1.3887, -1.6523, -1.8551],
         ...,
         [ 0.5374,  0.5428,  0.5538,  ...,  0.2864,  0.6001,  0.5623],
         [ 0.4863,  0.5045,  0.4972,  ..., -0.1126, -0.0948,  0.0390],
         [ 0.4158,  0.4906,  0.5705,  ..., -0.0849, -0.1190, -0.1542]],

        [[ 1.5046, -0.4341, -0.5148,  ..., -0.3460, -0.9647, -1.5150],
         [ 1.9263, -0.2801, -0.5617,  ..., -0.8313, -1.3558, -1.5984],
         [ 2.1506,  0.6540,  0.6640,  ..., -1.5073, -1.7435, -1.8723],
         ...,
         [ 0.7836,  0.8049,  0.8075,  ...,  0.4754,  0.8146,  0.8095],
         [ 0.7426,  0.7635,  0.7472,  ...,  0.0874,  0.0620,  0.1910],
         [ 0.6749,  0.7299,  0.8382,  ...,  0.1041,  0.0446,  0.0553

# Model

In [5]:
import torch.nn as nn

class TinyissimoYOLO(nn.Module):
    """Small YOLO-style detector: four conv/pool stages followed by two linear layers.

    Maps a (batch, 3, 88, 88) image batch to a (batch, S, S, 5 * B + C) grid of
    predictions. The first linear layer is sized for 88x88 inputs.
    """

    def __init__(self):
        super().__init__()
        self.depth = B * 5 + C              # Values predicted per grid cell

        layers = [
            # padding=3 with a 3x3 kernel grows the 88x88 input to 92x92
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=3),                   # Conv 1
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),                                  # 92 -> 46

            nn.Conv2d(16, 32, kernel_size=3, padding=1),                           # Conv 2
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),                                  # 46 -> 23

            nn.Conv2d(32, 64, kernel_size=3, padding=1),                           # Conv 3
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),                                  # 23 -> 11

            nn.Conv2d(64, 128, kernel_size=3, padding=1),                          # Conv 4
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),                                  # 11 -> 5
        ]

        layers += [
            nn.Flatten(),
            nn.Linear(128*5*5, 256),                            # Linear 1 (128 channels of 5x5)
            nn.ReLU(),
            nn.Linear(256, S * S * self.depth),                      # Linear 2
        ]

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Runs the network and reshapes the flat output to (batch, S, S, depth)."""
        # Call the module itself rather than .forward() so hooks registered on
        # self.model (e.g. by summary/profiling tools) are executed.
        return torch.reshape(
            self.model(x),
            (x.size(dim=0), S, S, self.depth)
        )


In [12]:
# Shape check: push one random image through an untrained model.
# Create a dummy input of the same size as your images
dummy_input = torch.randn(1, 3, 88, 88)          # (batch, channels, height, width)

# Create an instance of your model and run the dummy input through it
model = TinyissimoYOLO()
output = model(dummy_input)
# Expected shape is (batch, S, S, 5 * B + C), per the reshape in TinyissimoYOLO.forward
print(output.shape)

torch.Size([1, 4, 4, 11])


In [7]:
from torchinfo import summary
# Per-layer output shapes and parameter counts for a single 88x88 RGB image.
# NOTE: this rebinds the module-level name `model` to a fresh, untrained instance.
model = TinyissimoYOLO()
summary(model, (1, 3, 88, 88))

Layer (type:depth-idx)                   Output Shape              Param #
TinyissimoYOLO                           [1, 4, 4, 11]             --
├─Sequential: 1-1                        --                        --
│    └─Conv2d: 2-1                       [1, 16, 92, 92]           448
│    └─ReLU: 2-2                         [1, 16, 92, 92]           --
│    └─MaxPool2d: 2-3                    [1, 16, 46, 46]           --
│    └─Conv2d: 2-4                       [1, 32, 46, 46]           4,640
│    └─ReLU: 2-5                         [1, 32, 46, 46]           --
│    └─MaxPool2d: 2-6                    [1, 32, 23, 23]           --
│    └─Conv2d: 2-7                       [1, 64, 23, 23]           18,496
│    └─ReLU: 2-8                         [1, 64, 23, 23]           --
│    └─MaxPool2d: 2-9                    [1, 64, 11, 11]           --
│    └─Conv2d: 2-10                      [1, 128, 11, 11]          73,856
│    └─ReLU: 2-11                        [1, 128, 11, 11]          --
│  

# Loss

In [8]:
import torch
from torch import nn as nn
from torch.nn import functional as F


class SumSquaredErrorLoss(nn.Module):
    """YOLO-style sum-squared-error loss over a grid of box predictions.

    Combines position, size, confidence and class terms. Coordinate terms are
    weighted by `l_coord`, and the confidence term of boxes that are not
    responsible for an object is weighted by `l_noobj`.
    """

    def __init__(self):
        super().__init__()
        self.l_coord = 5
        self.l_noobj = 0.5

    def forward(self, p, a):
        """Returns the summed loss divided by the number of samples in the batch.

        Args:
            p: Predictions, indexed as (batch, S, S, C + 5 * B).
            a: Ground truth with the same layout as `p`.
        """
        # Calculate IOU of each predicted bbox against the ground truth bbox
        iou = get_iou(p, a)                     # (batch, S, S, B, B)
        max_iou = torch.max(iou, dim=-1)[0]     # (batch, S, S, B)

        # Get masks
        bbox_mask = bbox_attr(a, 4) > 0.0
        p_template = bbox_attr(p, 4) > 0.0      # Only used as a bool template of the right shape
        obj_i = bbox_mask[..., 0:1]         # 1 if grid I has any object at all
        responsible = torch.zeros_like(p_template).scatter_(       # (batch, S, S, B)
            -1,
            torch.argmax(max_iou, dim=-1, keepdim=True),                # (batch, S, S, 1)
            value=1                         # 1 if bounding box is "responsible" for predicting the object
        )
        obj_ij = obj_i * responsible        # 1 if object exists AND bbox is responsible
        noobj_ij = ~obj_ij                  # Otherwise, confidence should be 0

        # XY position losses
        x_losses = mse_loss(
            obj_ij * bbox_attr(p, 0),
            obj_ij * bbox_attr(a, 0)
        )
        y_losses = mse_loss(
            obj_ij * bbox_attr(p, 1),
            obj_ij * bbox_attr(a, 1)
        )
        pos_losses = x_losses + y_losses

        # Bbox dimension losses, compared on square roots. Predictions may be negative,
        # so the root is taken of the magnitude and the sign restored; EPSILON keeps
        # the argument of sqrt strictly positive.
        p_width = bbox_attr(p, 2)
        a_width = bbox_attr(a, 2)
        width_losses = mse_loss(
            obj_ij * torch.sign(p_width) * torch.sqrt(torch.abs(p_width) + EPSILON),
            obj_ij * torch.sqrt(a_width)
        )
        p_height = bbox_attr(p, 3)
        a_height = bbox_attr(a, 3)
        height_losses = mse_loss(
            obj_ij * torch.sign(p_height) * torch.sqrt(torch.abs(p_height) + EPSILON),
            obj_ij * torch.sqrt(a_height)
        )
        dim_losses = width_losses + height_losses

        # Confidence losses (target confidence is 1 for responsible boxes, 0 otherwise)
        obj_confidence_losses = mse_loss(
            obj_ij * bbox_attr(p, 4),
            obj_ij * torch.ones_like(max_iou)
        )
        noobj_confidence_losses = mse_loss(
            noobj_ij * bbox_attr(p, 4),
            torch.zeros_like(max_iou)
        )

        # Classification losses
        class_losses = mse_loss(
            obj_i * p[..., :C],
            obj_i * a[..., :C]
        )

        total = self.l_coord * (pos_losses + dim_losses) \
                + obj_confidence_losses \
                + self.l_noobj * noobj_confidence_losses \
                + class_losses
        # Normalise by the actual batch size so partial batches (e.g. evaluation or a
        # loader without drop_last) are averaged correctly.
        return total / p.size(dim=0)


def mse_loss(a, b):
    """Sum of squared differences between `a` and `b`.

    Both tensors are collapsed to 2-D (all leading axes merged, last axis kept);
    `b` is then expanded to the collapsed shape of `a` before the comparison.
    """
    rows_a = a.flatten(end_dim=-2)
    rows_b = b.flatten(end_dim=-2).expand_as(rows_a)
    return F.mse_loss(rows_a, rows_b, reduction='sum')

# Train

In [9]:
import torch
import os
import numpy as np
from tqdm import tqdm
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader

# Use the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): anomaly detection adds overhead to every backward pass; consider
# disabling it once training is known to be stable.
torch.autograd.set_detect_anomaly(True)         # Check for nan loss
writer = SummaryWriter()
now = datetime.now()                            # Timestamp used later to name the output folder

model = TinyissimoYOLO().to(device)
loss_function = SumSquaredErrorLoss()
optimizer = torch.optim.Adam(
        model.parameters(),
        lr=LEARNING_RATE
    )


train_set = YoloPascalVocDataset('train', normalize=True, augment=False)
# drop_last=True: every batch contains exactly BATCH_SIZE samples
train_loader = DataLoader(
        train_set,
        batch_size=BATCH_SIZE,
        num_workers=8,
        persistent_workers=True,
        drop_last=True,
        shuffle=True
    )



Using downloaded and verified file: data/VOCtrainval_11-May-2012.tar
Extracting data/VOCtrainval_11-May-2012.tar to data




In [10]:
# Pull a single batch to confirm the tensor shapes produced by the loader.
# NOTE: rebinds the module-level names `data` and `label`.
data, label, _ = next(iter(train_loader))
print(f"Data shape: {data.shape}")
print(f"Label shape: {label.shape}")

Data shape: torch.Size([32, 3, 88, 88])
Label shape: torch.Size([32, 4, 4, 11])


In [45]:
# Create folders
# Output layout: models/yolo_v1/<MM_DD_YYYY>/<HH_MM_SS>/weights, named after the
# timestamp captured in `now`.
root = os.path.join(
    'models',
    'yolo_v1',
    now.strftime('%m_%d_%Y'),
    now.strftime('%H_%M_%S')
)
weight_dir = os.path.join(root, 'weights')
# exist_ok makes re-running this cell safe without a separate (racy) existence check
os.makedirs(weight_dir, exist_ok=True)

In [46]:
# Metrics
# Both arrays start as 2 x 0 so that columns can be appended along axis 1.
train_losses = np.empty(shape=(2, 0))
train_errors = np.empty(shape=(2, 0))


def save_metrics():
    """Saves the current metric arrays under `root`, one file per metric."""
    for file_stem, values in (('train_losses', train_losses), ('train_errors', train_errors)):
        np.save(os.path.join(root, file_stem), values)


In [47]:
# Main training loop: one pass over `train_loader` per epoch.
for epoch in tqdm(range(WARMUP_EPOCHS + EPOCHS), desc='Epoch'):
    model.train()
    train_loss = 0
    for data, labels, _ in tqdm(train_loader, desc='Train', leave=False):
        data = data.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        # Call the module rather than .forward() so registered hooks are executed
        predictions = model(data)
        loss = loss_function(predictions, labels)
        loss.backward()
        optimizer.step()

        # Accumulate the mean loss over the batches of this epoch
        train_loss += loss.item() / len(train_loader)

        del data, labels

    # Step and graph scheduler once an epoch
    # writer.add_scalar('Learning Rate', scheduler.get_last_lr()[0], epoch)
    # scheduler.step()

    # Append one column: (epoch index, mean training loss)
    train_losses = np.append(train_losses, [[epoch], [train_loss]], axis=1)
    writer.add_scalar('Loss/train', train_loss, epoch)

save_metrics()
torch.save(model.state_dict(), os.path.join(weight_dir, 'final'))

Epoch: 100%|██████████| 135/135 [03:57<00:00,  1.76s/it]


# Inference