In [None]:
# implementation of YOLO (v1) from this paper: https://arxiv.org/pdf/1506.02640

In [None]:
# imports
import copy

import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from PIL import Image
from torchvision.datasets import VOCDetection
from torchvision.models import googlenet
from tqdm import tqdm

In [None]:
"""
In the YOLO paper, the first 20 conv layers of the network, followed by an
average-pooling layer and an FC layer, are pre-trained on the ImageNet 1000-class
competition dataset for about a week; this pre-trained network is then used as the backbone.
Open question: why does YOLO use a pre-trained backbone?

This pre-trained model is then used for the detection task. It is augmented with a few
additional layers for better performance: the final detection model extends the pre-trained
model by adding four conv layers and two fully connected layers with randomly initialized
weights. Also, the typical input resolution for pre-trained models is 224x224 pixels, but
detection often requires fine-grained visual information, so the input resolution of the
network is increased from 224 × 224 to 448 × 448.
"""

In [None]:
# choose the compute device: CUDA when it is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# report which device was picked
if device.type == "cuda":
    print(f"device found {torch.cuda.get_device_name()}")
else:
    print("gpu not found")

In [None]:
# get the dataset
root = './'  # VOC 2012 is downloaded to / read from the current directory

# Image pipeline: PIL image -> tensor -> 448x448 (the detection input resolution
# described in the notes above) -> per-channel normalization.
# NOTE(review): this transform is applied to the image only; presumably the box
# coordinates in the annotation stay in original pixel units — confirm they are
# rescaled wherever the targets are encoded.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Resize((448, 448)),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Pascal VOC 2012 detection data, "train" split
train_dataset = VOCDetection(
    root=root,
    year="2012",
    image_set="train",
    download=True,
    transform=transform,
)

In [None]:
# combine conv layer, batch norm, dropout and leaky relu
class ConvLayer(nn.Module):
    """Conv2d -> BatchNorm2d -> Dropout -> LeakyReLU building block.

    Args:
        in_channels: number of channels of the input feature map.
        out_channels: number of channels produced by the convolution.
        kernel_size: convolution kernel size.
        stride: convolution stride.
        padding: zero-padding added to the input.
        dropout: drop probability of the Dropout layer. Defaults to 0.5, the
            value that used to be hard-coded; pass 0.0 to disable dropout.
        negative_slope: LeakyReLU slope for negative inputs. Defaults to 0.1,
            the value that used to be hard-coded.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding,
                 dropout=0.5, negative_slope=0.1):
        super(ConvLayer, self).__init__()
        # The Dropout module is always present (even with dropout=0.0) so the
        # positions inside the Sequential, and therefore the state_dict keys,
        # never depend on the hyper-parameters.
        self.conv_layer = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding),
            nn.BatchNorm2d(out_channels),
            nn.Dropout(dropout),
            nn.LeakyReLU(negative_slope),
        )

    def forward(self, x):
        """Run ``x`` through conv, batch norm, dropout and the activation."""
        return self.conv_layer(x)


In [None]:
def clones(layer, N):
    """Return a list with ``N`` independent deep copies of ``layer``.

    Each element is a separate object, so copies of an ``nn.Module`` do not
    share parameters with each other or with the original ``layer``.
    """
    return [copy.deepcopy(layer) for _ in range(N)]

# build lots of CNNs from scratch!!!
# (out_channels, kernel_size, stride, padding)

def get_custom_model(arch):
    """Build a convolutional backbone from an architecture config.

    Each entry of ``arch`` is one of:
      * ``"M"``: a 2x2 max-pool with stride 2;
      * a tuple ``(out_channels, kernel_size, stride, padding)``: one ConvLayer;
      * a list ``[conv_tuple, ..., num_repeats]``: the conv tuples, in order,
        repeated ``num_repeats`` times.

    Every repetition of a list entry gets its own ConvLayer instances. Reusing
    the same module objects for each repetition would tie their weights and
    batch-norm statistics together instead of stacking independent layers.

    Args:
        arch: iterable of config entries as described above.

    Returns:
        An ``nn.Sequential`` of the configured layers; the first conv layer is
        built for a 3-channel input.

    Raises:
        ValueError: if a string entry other than ``"M"`` is encountered.
    """
    in_channels = 3
    layers = []

    for entry in arch:
        if isinstance(entry, str):
            if entry != "M":
                raise ValueError(f"unknown layer spec: {entry!r}")
            layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        elif isinstance(entry, list):
            # last element is the repeat count, everything before it is a conv spec
            *conv_specs, num_repeats = entry
            for _ in range(num_repeats):
                for out_channels, kernel_size, stride, padding in conv_specs:
                    # fresh module per repetition; in_channels follows the chain
                    layers.append(ConvLayer(in_channels, out_channels, kernel_size, stride, padding))
                    in_channels = out_channels
        else:
            out_channels, kernel_size, stride, padding = entry
            layers.append(ConvLayer(in_channels, out_channels, kernel_size, stride, padding))
            in_channels = out_channels
    return nn.Sequential(*layers)


In [None]:
# architecture config from image in the paper 
# Entry formats understood by get_custom_model:
#   (out_channels, kernel_size, stride, padding)  -> one conv layer
#   "M"                                           -> 2x2 max-pool, stride 2
#   [conv_tuple, conv_tuple, repeats]             -> the pair repeated `repeats` times
# Trailing comments give the feature-map shape AFTER each entry for a 3x448x448 input.
CUSTOM_ARCH = [
    # --- stage 1 ---
    (64, 7, 2, 3),                          # -> 64x224x224
    "M",                                    # -> 64x112x112
    # --- stage 2 ---
    (192, 3, 1, 1),                         # -> 192x112x112
    "M",                                    # -> 192x56x56
    # --- stage 3 ---
    (128, 1, 1, 0),                         # -> 128x56x56
    (256, 3, 1, 1),                         # -> 256x56x56
    (256, 1, 1, 0),                         # -> 256x56x56
    (512, 3, 1, 1),                         # -> 512x56x56
    "M",                                    # -> 512x28x28
    # --- stage 4 ---
    [(256, 1, 1, 0), (512, 3, 1, 1), 4],    # -> 512x28x28
    (512, 1, 1, 0),                         # -> 512x28x28
    (1024, 3, 1, 1),                        # -> 1024x28x28
    "M",                                    # -> 1024x14x14
    # --- stage 5 ---
    [(512, 1, 1, 0), (1024, 3, 1, 1), 2],   # -> 1024x14x14
    (1024, 3, 1, 1),                        # -> 1024x14x14
    (1024, 3, 2, 1),                        # -> 1024x7x7
    # --- stage 6 ---
    (1024, 3, 1, 1),                        # -> 1024x7x7
    (1024, 3, 1, 1),                        # -> 1024x7x7
]

In [None]:
class YOLOv1(nn.Module):
    """YOLOv1 detector: a convolutional backbone followed by a fully connected head.

    Args:
        grid_size: ``(rows, cols)`` of the prediction grid; a single int is
            treated as a square grid.
        num_bnb_boxes: number of bounding boxes predicted per grid cell.
        num_classes: number of object classes.
        use_pretrained: if True, use torchvision's pre-trained GoogLeNet as the
            backbone; otherwise build the backbone from ``CUSTOM_ARCH``.

    ``forward`` returns a tensor of shape
    ``(batch, rows, cols, num_bnb_boxes * 5 + num_classes)``.
    """

    def __init__(self, grid_size=(7, 7), num_bnb_boxes=2, num_classes=20, use_pretrained=True):
        super(YOLOv1, self).__init__()
        if isinstance(grid_size, int):
            grid_size = (grid_size, grid_size)
        rows, cols = grid_size
        self.grid_size = (rows, cols)
        self.num_bnb_boxes = num_bnb_boxes
        self.num_classes = num_classes
        if use_pretrained:
            print("loading pre-trained googlenet")
            self.backbone = self._load_pretrained_backbone()
            print("backbone loaded")
        else:
            print("Building the custom model from architecture config..")
            # just the 20 layers or 24 layers?
            self.backbone = self._load_custom_model()
            print("custom model loaded")

        # get fcs (sized from both grid dimensions so it matches the view in forward)
        self.fcs = self._create_fcs(self.grid_size, num_bnb_boxes, num_classes)

    def _load_pretrained_backbone(self):
        """Return pre-trained GoogLeNet converted into a feature extractor.

        The stock model ends in global average pooling, dropout and a
        1000-way classifier, whose output cannot feed the
        ``1024 * rows * cols`` wide first FC layer of the head. The pooling is
        therefore replaced by one that produces a ``rows x cols`` map, and the
        dropout / classifier are replaced by identities, so the backbone emits
        ``1024 * rows * cols`` features per image.
        """
        backbone = googlenet(weights='DEFAULT')
        backbone.avgpool = nn.AdaptiveAvgPool2d(self.grid_size)
        backbone.dropout = nn.Identity()
        backbone.fc = nn.Identity()
        return backbone

    def _load_custom_model(self):
        """Build the from-scratch backbone described by ``CUSTOM_ARCH``."""
        return get_custom_model(CUSTOM_ARCH)

    def _create_fcs(self, split_size, num_bnb_boxes, num_classes):
        """Create the fully connected detection head.

        Args:
            split_size: grid size, either an int (square grid) or a
                ``(rows, cols)`` pair.
            num_bnb_boxes: boxes per grid cell.
            num_classes: number of classes.

        Returns:
            ``nn.Sequential`` mapping ``1024 * rows * cols`` backbone features
            to ``rows * cols * (num_bnb_boxes * 5 + num_classes)`` outputs.
        """
        if isinstance(split_size, int):
            rows = cols = split_size
        else:
            rows, cols = split_size
        B, C = num_bnb_boxes, num_classes
        cells = rows * cols
        return nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024 * cells, 4096),
            nn.Dropout(0.5),
            nn.LeakyReLU(0.1),
            nn.Linear(4096, cells * (B * 5 + C))
        )

    def forward(self, x):
        """Run the backbone and head, then reshape to one vector per grid cell."""
        # x shape is [B, C, H, W]
        # pass it thru the backbone
        x = self.backbone(x)
        # pass it thru FC layers
        x = self.fcs(x)
        rows, cols = self.grid_size
        return x.view(-1, rows, cols, self.num_bnb_boxes * 5 + self.num_classes)
        

In [None]:
# Build the detector with the from-scratch backbone (CUSTOM_ARCH); no pre-trained weights are loaded.
model = YOLOv1(use_pretrained=False)