In [None]:
import torch
import torch.nn as nn

# Define the YOLOv3 model, which combines Darknet53 and the detection head
class YOLOv3(nn.Module):
    """YOLOv3 model: a Darknet53 backbone followed by one detection head per scale.

    Three detection heads are built, expecting 1024-, 512- and 256-channel
    inputs respectively (intended for the 13x13, 26x26 and 52x52 grids).
    Each head sees only its own backbone feature map; no feature fusion
    between scales is performed in this class.

    NOTE(review): ``Darknet53`` and ``ConvBlock`` are defined outside this
    file. ``forward`` assumes the backbone returns a list/tuple with one
    feature map per head, in head order (1024, 512, 256 channels) -- confirm
    against the backbone implementation.

    Args:
        num_classes: Number of object classes predicted per anchor.
        num_anchors: Number of anchors predicted per grid cell at each scale.
    """

    # Input channel count expected by each detection head, in head order.
    _HEAD_IN_CHANNELS = (1024, 512, 256)

    def __init__(self, num_classes, num_anchors=3):
        super().__init__()
        self.backbone = Darknet53()  # implemented in darnet-53.ipynb

        # One detection head per scale, in the order given by _HEAD_IN_CHANNELS.
        self.det_head = nn.ModuleList([
            self._make_detection_head(in_channels, num_anchors, num_classes)
            for in_channels in self._HEAD_IN_CHANNELS
        ])

    def _make_detection_head(self, in_channels, num_anchors, num_classes):
        """Create a detection head for one scale.

        Args:
            in_channels: Channel count of the feature map fed to this head.
            num_anchors: Number of anchors predicted per grid cell.
            num_classes: Number of object classes predicted per anchor.

        Returns:
            An ``nn.Sequential`` of two ``ConvBlock`` layers followed by a 1x1
            convolution with ``num_anchors * (4 + 1 + num_classes)`` output
            channels (4 box terms, 1 objectness term, plus class scores).
        """
        return nn.Sequential(
            ConvBlock(in_channels, 512, stride=1),
            ConvBlock(512, 1024, stride=1),
            # 1x1 convolution producing the raw per-anchor predictions.
            nn.Conv2d(1024, num_anchors * (4 + 1 + num_classes), kernel_size=1),
        )

    def forward(self, x):
        """Run the backbone and apply each detection head to its own scale.

        Args:
            x: Input passed unchanged to the backbone.

        Returns:
            A list with one raw prediction tensor per detection head, in the
            same order as ``self.det_head``.

        Raises:
            TypeError: If the backbone does not return a list or tuple of
                feature maps.
            ValueError: If the number of feature maps differs from the number
                of detection heads.
        """
        features = self.backbone(x)

        # The heads expect different channel counts, so a single feature map
        # cannot be shared between them: one map per head is required.
        if not isinstance(features, (list, tuple)):
            raise TypeError(
                "backbone must return a list or tuple with one feature map "
                f"per detection head, got {type(features).__name__}"
            )
        if len(features) != len(self.det_head):
            raise ValueError(
                f"expected {len(self.det_head)} feature maps from the "
                f"backbone, got {len(features)}"
            )

        # Pair each head with the feature map at the same position.
        return [head(feature) for head, feature in zip(self.det_head, features)]
