In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class YOLOv2(nn.Module):
    """YOLOv2 detection head on top of a Darknet-19 backbone.

    The backbone's feature map is passed through a 1x1 convolution that emits
    ``num_bboxes * (5 + num_classes)`` channels per grid cell: for every box
    slot the raw values ``(x, y, w, h, confidence)`` followed by one score per
    class.

    Args:
        num_classes: Number of object classes (C).
        anchors: Anchor box dimensions. Stored on the instance only; neither
            ``forward`` nor ``predict`` reads it.
        grid_size: Nominal grid size (S). Stored on the instance only; the
            actual grid is taken from the feature map at run time.
        num_bboxes: Number of boxes predicted per grid cell (B).
    """

    def __init__(self, num_classes, anchors, grid_size=13, num_bboxes=5):
        super().__init__()

        # Darknet-19 backbone; the class is defined outside this file
        # (darknet-19.ipynb). The detection conv below expects it to produce
        # 1024 channels.
        self.backbone = Darknet19()

        # Final detection layer: 1x1 convolution producing B * (5 + C)
        # channels, i.e. (x, y, w, h, confidence) plus C class scores for each
        # of the B boxes of a cell.
        self.det_conv = nn.Conv2d(1024, num_bboxes * (5 + num_classes), kernel_size=1, stride=1, padding=0)

        self.grid_size = grid_size
        self.num_bboxes = num_bboxes
        self.num_classes = num_classes
        self.anchors = anchors  # List of anchor box dimensions

    def forward(self, x):
        """Return raw predictions of shape (batch, rows, cols, B * (5 + C)).

        No activation is applied; the values are the detection conv's outputs
        with the channel axis moved last.
        """
        features = self.backbone(x)
        output = self.det_conv(features)

        # det_conv already yields (batch, B*(5+C), rows, cols), so no reshape
        # is needed. Forcing the map to grid_size x grid_size here would
        # reject other input resolutions and silently scramble any map that
        # merely has the same number of cells (e.g. 169x1 instead of 13x13).
        return output.permute(0, 2, 3, 1)

    def predict(self, x, threshold=0.5):
        """Run the network and keep boxes whose confidence exceeds ``threshold``.

        Sigmoid is applied to the confidence and class scores. The box
        coordinates are returned as raw network outputs (they are not decoded
        against the anchors), and no Non-Maximum Suppression is performed.
        The forward pass runs under ``torch.no_grad()``, so the returned
        tensors carry no autograd history.

        Args:
            x: Input batch, passed to ``forward`` unchanged.
            threshold: Minimum sigmoid confidence (exclusive) for a box to be
                kept.

        Returns:
            A list with one entry per image. Each entry is a list, in
            row-major cell order, with one tuple
            ``(confidence, box_coords, class_probs)`` per grid cell that has
            at least one kept box; the tensors have shapes ``(K,)``,
            ``(K, 4)`` and ``(K, C)`` where K is the number of kept boxes in
            that cell.
        """
        with torch.no_grad():
            output = self.forward(x)
            batch_size, rows, cols, _ = output.shape

            # Split the channel axis into (box slot, per-box values) once for
            # the whole batch instead of once per cell.
            per_box = output.reshape(batch_size, rows, cols, self.num_bboxes, 5 + self.num_classes)
            box_coords = per_box[..., :4]  # x, y, w, h (raw)
            box_confidence = torch.sigmoid(per_box[..., 4])
            class_probs = torch.sigmoid(per_box[..., 5:])

            keep = box_confidence > threshold
            # Cells with at least one surviving box.
            occupied = keep.any(dim=-1)

        predictions = []
        for i in range(batch_size):
            image_predictions = []
            # nonzero() lists indices in row-major order, matching a nested
            # loop over rows then columns.
            for j, k in occupied[i].nonzero().tolist():
                selected = keep[i, j, k]
                image_predictions.append((
                    box_confidence[i, j, k][selected],
                    box_coords[i, j, k][selected],
                    class_probs[i, j, k][selected],
                ))
            predictions.append(image_predictions)

        return predictions

In [None]:
import torch
import torch.nn.functional as F

def yolo_loss(predictions, targets, anchors, num_classes, grid_size=13, num_bboxes=5, lambda_coord=5, lambda_noobj=0.5):
    """
    Sum the confidence, coordinate and classification terms of the YOLO v2 loss.

    Both inputs are viewed as
    (batch_size, grid_size, grid_size, num_bboxes, 5 + num_classes). Along the
    last axis, indices 0-3 are the box (x, y, w, h), index 4 is the confidence
    and the remaining entries are the per-class values.

    :param predictions: Network output, viewable as (batch_size, grid_size, grid_size, B * (5 + C))
    :param targets: Ground truth, viewable as (batch_size, grid_size, grid_size, B * (5 + C))
    :param anchors: List of anchor box sizes; accepted but not used in the computation
    :param num_classes: Number of classes (C)
    :param grid_size: Size of the grid (S)
    :param num_bboxes: Number of bounding boxes per grid cell (B)
    :param lambda_coord: Weight of the coordinate term
    :param lambda_noobj: Weight of the "no object" part of the confidence term
    :return: Total loss as a scalar tensor
    """
    per_box_shape = (predictions.size(0), grid_size, grid_size, num_bboxes, 5 + num_classes)

    def _split(tensor):
        # -> (boxes, confidence, classes) taken along the last axis
        tensor = tensor.view(*per_box_shape)
        return tensor[..., :4], tensor[..., 4], tensor[..., 5:]

    pred_boxes, pred_conf, pred_class = _split(predictions)
    target_boxes, target_conf, target_class = _split(targets)

    confidence_term = compute_confidence_loss(pred_conf, target_conf, target_boxes, lambda_noobj)
    coordinate_term = compute_coord_loss(pred_boxes, target_boxes, target_conf, lambda_coord)
    class_term = compute_class_loss(pred_class, target_class, target_conf)

    return confidence_term + coordinate_term + class_term


def compute_confidence_loss(pred_conf, target_conf, target_boxes, lambda_noobj):
    """
    Confidence loss: summed binary cross-entropy on logits, split by target value.

    Entries whose target confidence equals 1 are weighted by 1, entries whose
    target confidence equals 0 are weighted by lambda_noobj; entries with any
    other target value contribute nothing. target_boxes is accepted but not
    used.
    """
    def _summed_bce(selector):
        # BCE-with-logits summed over the entries picked by the boolean mask
        return F.binary_cross_entropy_with_logits(pred_conf[selector], target_conf[selector], reduction='sum')

    object_term = _summed_bce(target_conf == 1)
    background_term = _summed_bce(target_conf == 0)

    return object_term + lambda_noobj * background_term


def compute_coord_loss(pred_boxes, target_boxes, target_conf, lambda_coord):
    """
    Coordinate loss: squared error over (x, y, w, h), summed (not averaged)
    and scaled by lambda_coord.

    Only boxes whose target confidence equals 1 take part.
    """
    has_object = target_conf == 1

    predicted = pred_boxes[has_object]
    expected = target_boxes[has_object]

    squared_error = F.mse_loss(predicted, expected, reduction='sum')
    return lambda_coord * squared_error


def compute_class_loss(pred_class, target_class, target_conf):
    """
    Classification loss: cross-entropy summed over boxes that contain an object.

    Only boxes whose target confidence equals 1 take part. The label of a box
    is the index of the largest entry of its target class vector, and
    pred_class is passed to cross-entropy as unnormalized scores (logits).
    """
    has_object = target_conf == 1

    # Collapse each target class vector to the index of its maximum
    labels = target_class[has_object].max(dim=-1).indices
    logits = pred_class[has_object]

    num_classes = logits.size(-1)
    return F.cross_entropy(logits.view(-1, num_classes), labels.view(-1), reduction='sum')
