In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class YOLOv2(nn.Module):
    """YOLOv2 detection head on top of a Darknet-19 backbone.

    The backbone feature map is passed through a single 1x1 convolution that
    produces ``num_bboxes * (5 + num_classes)`` raw values per grid cell:
    5 box terms (x, y, w, h, confidence) plus ``num_classes`` class scores
    for each of the ``num_bboxes`` boxes. No activation or anchor decoding is
    applied in this module; ``anchors`` is only stored for downstream use.
    """

    def __init__(self, num_classes: int, anchors, grid_size: int = 13, num_bboxes: int = 5):
        """Build the backbone and the 1x1 detection convolution.

        Args:
            num_classes: Number of object classes (C).
            anchors: Anchor box dimensions; stored as-is on ``self.anchors``.
            grid_size: Nominal side length of the output grid. Kept as an
                attribute for callers; ``forward`` takes the actual grid
                size from the feature map.
            num_bboxes: Number of boxes predicted per grid cell (B).
        """
        super().__init__()

        # Darknet-19 feature extractor (implemented in darknet19.ipynb).
        # NOTE(review): the 1024 input channels below assume the backbone
        # emits a 1024-channel feature map -- confirm against Darknet19.
        self.backbone = Darknet19()

        # Final detection layer: 1x1 convolution with B * (5 + C) output
        # channels, i.e. every box gets its own (x, y, w, h, confidence)
        # and its own C class scores.
        self.det_conv = nn.Conv2d(
            1024,
            num_bboxes * (5 + num_classes),
            kernel_size=1,
            stride=1,
            padding=0,
        )

        self.grid_size = grid_size
        self.num_bboxes = num_bboxes
        self.num_classes = num_classes
        self.anchors = anchors  # List of anchor box dimensions

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the backbone and detection head on a batch of inputs.

        Args:
            x: Input batch accepted by the backbone.

        Returns:
            Raw predictions of shape ``(batch, H, W, B * (5 + C))`` where
            ``H x W`` is the spatial size of the backbone feature map
            (``grid_size x grid_size`` at the nominal input resolution).
            The result is a permuted view of the convolution output, so it
            is not contiguous in memory.
        """
        features = self.backbone(x)

        # Conv output is already (batch, B * (5 + C), H, W); no reshape is
        # needed, and not forcing H == W == grid_size lets the fully
        # convolutional head run at other input resolutions.
        predictions = self.det_conv(features)

        # Move channels last so the per-cell prediction vector is the final
        # axis, which makes slicing out box parameters straightforward.
        return predictions.permute(0, 2, 3, 1)
