# Imports

In [183]:
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

import torchvision.transforms as transforms
import torchvision.transforms.functional as FT

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from collections import Counter

import os

from PIL import Image

from tqdm import tqdm

In [28]:
import warnings

def my_formatwarning(message, category, filename, lineno, line=None):
  print(message, category)
  # lineno is the line number you are looking for
  print('file:', filename, 'line number:', lineno) 
  ...

warnings.formatwarning = my_formatwarning

# Yolo V1 archietecture

* There 24 convolution layers, Each layer is comprised of convolution => batchNorm => Leaky ReLU
* After some of the convolution layers, there are maxpool layers with kernel and stride as (2, 2)
* After convolution layers, there are 2 Fully-connected layers. Output of the last convolution layer is supposed to be Batch_size * S * S * 1024 that is first flattened out to Batch_size * (S * S * 1024). and then fed to the fully-connected layers. Output of the last fully-connected layer is Batch_size * (S * S * (C + B * 5)) which is reshaped to Batch_size * S * S * (C + B * 5)  

In [184]:
architecture_config = [
    (7, 64, 2, 3),
    "M",
    (3, 192, 1, 1),
    "M",
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),
    (3, 1024, 1, 1),
    (3, 1024, 1, 1),
]


class CNNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, **kwargs):
        super(CNNBlock, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        return self.leakyrelu(self.batchnorm(self.conv(x)))


class Yolov1(nn.Module):
    def __init__(self, in_channels=3, **kwargs):
        super(Yolov1, self).__init__()
        self.architecture = architecture_config
        self.in_channels = in_channels
        self.darknet = self._create_conv_layers(self.architecture)
        self.fcs = self._create_fcs(**kwargs)

    def forward(self, x):
        x = self.darknet(x)
        return self.fcs(torch.flatten(x, start_dim=1))

    def _create_conv_layers(self, architecture):
        layers = []
        in_channels = self.in_channels

        for x in architecture:
            if type(x) == tuple:
                layers += [
                    CNNBlock(
                        in_channels, x[1], kernel_size=x[0], stride=x[2], padding=x[3],
                    )
                ]
                in_channels = x[1]

            elif type(x) == str:
                layers += [nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))]

            elif type(x) == list:
                conv1 = x[0]
                conv2 = x[1]
                num_repeats = x[2]

                for _ in range(num_repeats):
                    layers += [
                        CNNBlock(
                            in_channels,
                            conv1[1],
                            kernel_size=conv1[0],
                            stride=conv1[2],
                            padding=conv1[3],
                        )
                    ]
                    layers += [
                        CNNBlock(
                            conv1[1],
                            conv2[1],
                            kernel_size=conv2[0],
                            stride=conv2[2],
                            padding=conv2[3],
                        )
                    ]
                    in_channels = conv2[1]

        return nn.Sequential(*layers)

    def _create_fcs(self, split_size, num_boxes, num_classes):
        S, B, C = split_size, num_boxes, num_classes

        # In original paper this should be
        # nn.Linear(1024*S*S, 4096),
        # nn.LeakyReLU(0.1),
        # nn.Linear(4096, S*S*(B*5+C))

        return nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024 * S * S, 496),
            nn.Dropout(0.0),
            nn.LeakyReLU(0.1),
            nn.Linear(496, S * S * (C + B * 5)),
        )

# Intersection over Union(IOU)

In [226]:
def intersection_over_union(boxes_preds : torch.Tensor, boxes_labels : torch.Tensor,
                            box_format : str = "midpoint") -> torch.Tensor :
    """
    Calculates intersection over union
    Parameters:
        boxes_preds (tensor): Predictions of Bounding Boxes (BATCH_SIZE, 4)
        boxes_labels (tensor): Correct labels of Bounding Boxes (BATCH_SIZE, 4)
        box_format (str): midpoint/corners, if boxes (x,y,w,h) or (x1,y1,x2,y2)
    Returns:
        tensor: Intersection over union for all examples
    """
    epsilon = 1e-6

    if box_format == "midpoint":
        box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
        box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
        box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
        box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2
        
        box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
        box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
        box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
        box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2

    if box_format == "corners":
        box1_x1 = boxes_preds[..., 0:1]
        box1_y1 = boxes_preds[..., 1:2]
        box1_x2 = boxes_preds[..., 2:3]
        box1_y2 = boxes_preds[..., 3:4]  # (N, 1)
        
        box2_x1 = boxes_labels[..., 0:1]
        box2_y1 = boxes_labels[..., 1:2]
        box2_x2 = boxes_labels[..., 2:3]
        box2_y2 = boxes_labels[..., 3:4]

    #box1 is the predicted box and box2 is the ground truth box
    x1 = torch.max(box1_x1, box2_x1)
    y1 = torch.max(box1_y1, box2_y1)
    
    x2 = torch.min(box1_x2, box2_x2)
    y2 = torch.min(box1_y2, box2_y2)

    # .clamp(0) is for the case when they do not intersect
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
    box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))

    # 1e-6 is for numberical stability, nothing else
    union_area = box1_area + box2_area - intersection

    return intersection / (union_area + epsilon)

# Non-Max Supression(NMS)

1. Choose all the predicted box of a class, call it bboxes
2. Remove all the boxes that have confidence score less than some threshold, call it valid_boxes
3. Sort valid_boxes in the monotonically decreasing order of confidence score
3. while valid_boxes is not empty
    1. Pick a box from the top, call it chosen_box. Remove it from valid_boxes
    2. for b in valid_boxes,
        1. if IOU(b, chosen_box) > iou_threshold then, remove b from valid_boxes

In [227]:
def non_max_suppression(boxes : torch.Tensor, 
                        iou_threshold : float, threshold : float, 
                        box_format : str = "midpoint") -> torch.Tensor:
    """
    Does Non Max Suppression given bboxes
    Parameters:
        bboxes (tensor): list of lists containing all bboxes with each bboxes
        specified as [class_pred, prob_score, x1, y1, x2, y2]
        iou_threshold (float): threshold where predicted bboxes is correct
        threshold (float): threshold to remove predicted bboxes (independent of IoU) 
        box_format (str): "midpoint" or "corners" used to specify bboxes
    Returns:
        tensor: bboxes after performing NMS given a specific IoU threshold
    """

    boxes = boxes.tolist()
    boxes = sorted(boxes, key= lambda x: x[1], reverse= True)
    
    boxes_after_nms = []
    while len(boxes) > 0 and boxes[0][1] > threshold:
        chosen_box = boxes.pop(0)

        boxes = [
            box
            for box in boxes
            if box[0] != chosen_box[0] or
               intersection_over_union(torch.tensor(chosen_box[2:]),
                                       torch.tensor(box[2:]),
                                       box_format=box_format).item()
            < iou_threshold
        ]
        boxes_after_nms.append(chosen_box)
    
    return torch.tensor(boxes_after_nms, requires_grad= False)

# Mean Average Precision(mAP)

In [229]:
def mean_average_precision_gaurav(pred_boxes : torch.Tensor, true_boxes : torch.Tensor, 
                                  num_classes : int, iou_threshold : float =0.5, 
                                  box_format : str = "midpoint") -> float:
    
    pred_boxes = pred_boxes.tolist()
    true_boxes = true_boxes.tolist()
    epsilon    = 1e-6
    
    sum_precision = 0
    num_precision = 0
    
    for c in range(num_classes):
        class_pred = []
        class_true = []

        for box in pred_boxes:
            if box[1] == c:
                class_pred.append(box)
        
        for box in true_boxes:
            if box[1] == c:
                class_true.append(box)

        total_class_boxes = len(class_true)
        class_pred        = sorted(class_pred, key= lambda x : x[2], reverse= True)
        
        true_positive  = 0
        false_positive = 0
        precision = [1]
        recall    = [0]
        
        for p_box in class_pred:
            best_iou     = 0
            best_box_idx = -1
            
            for idx, gt_box in enumerate(class_true):
                if gt_box[0] == p_box[0]:
                    iou = intersection_over_union(
                                torch.tensor(p_box[3:]),
                                torch.tensor(gt_box[3:]),
                                box_format=box_format,
                          ).item()
                    if iou >= best_iou:
                        best_iou     = iou
                        best_box_idx = idx

            if best_box_idx >= 0 and best_iou >= iou_threshold:
                true_positive += 1
                class_true.pop(best_box_idx)
            else:
                false_positive += 1
            
            #Numerical stability acually not required in precision as box is goingto be either true_positive or false_positive
            #so denominator can't be zero
            precision.append(true_positive / (true_positive + false_positive + epsilon))
            recall.append(true_positive / (total_class_boxes + epsilon))

        if len(class_pred) > 0:
            sum_precision += torch.trapz(torch.tensor(precision, requires_grad= False),
                                         torch.tensor(recall, requires_grad= False)).item()
            num_precision += 1
    
    return sum_precision / (num_precision + epsilon)

In [228]:
def mean_average_precision(
    pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint", num_classes=20
):
    """
    Calculates mean average precision 
    Parameters:
        pred_boxes (list): list of lists containing all bboxes with each bboxes
        specified as [train_idx, class_prediction, prob_score, x1, y1, x2, y2]
        true_boxes (list): Similar as pred_boxes except all the correct ones 
        iou_threshold (float): threshold where predicted bboxes is correct
        box_format (str): "midpoint" or "corners" used to specify bboxes
        num_classes (int): number of classes
    Returns:
        float: mAP value across all classes given a specific IoU threshold 
    """

    # list storing all AP for respective classes
    average_precisions = []

    # used for numerical stability later on
    epsilon = 1e-6

    for c in range(num_classes):
        detections = []
        ground_truths = []

        # Go through all predictions and targets,
        # and only add the ones that belong to the
        # current class c
        for detection in pred_boxes:
            if detection[1] == c:
                detections.append(detection)

        for true_box in true_boxes:
            if true_box[1] == c:
                ground_truths.append(true_box)

        # find the amount of bboxes for each training example
        # Counter here finds how many ground truth bboxes we get
        # for each training example, so let's say img 0 has 3,
        # img 1 has 5 then we will obtain a dictionary with:
        # amount_bboxes = {0:3, 1:5}
        amount_bboxes = Counter([gt[0] for gt in ground_truths])

        # We then go through each key, val in this dictionary
        # and convert to the following (w.r.t same example):
        # ammount_bboxes = {0:torch.tensor[0,0,0], 1:torch.tensor[0,0,0,0,0]}
        for key, val in amount_bboxes.items():
            amount_bboxes[key] = torch.zeros(val)

        # sort by box probabilities which is index 2
        detections.sort(key=lambda x: x[2], reverse=True)
        TP = torch.zeros((len(detections)))
        FP = torch.zeros((len(detections)))
        total_true_bboxes = len(ground_truths)
        
        # If none exists for this class then we can safely skip
        if total_true_bboxes == 0:
            continue

        for detection_idx, detection in enumerate(detections):
            # Only take out the ground_truths that have the same
            # training idx as detection
            ground_truth_img = [
                bbox for bbox in ground_truths if bbox[0] == detection[0]
            ]

            num_gts = len(ground_truth_img)
            best_iou = 0

            for idx, gt in enumerate(ground_truth_img):
                iou = intersection_over_union(
                    torch.tensor(detection[3:]),
                    torch.tensor(gt[3:]),
                    box_format=box_format,
                ).item()

                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = idx

            if best_iou > iou_threshold:
                # only detect ground truth detection once
                if amount_bboxes[detection[0]][best_gt_idx] == 0:
                    # true positive and add this bounding box to seen
                    TP[detection_idx] = 1
                    amount_bboxes[detection[0]][best_gt_idx] = 1
                else:
                    FP[detection_idx] = 1

            # if IOU is lower then the detection is a false positive
            else:
                FP[detection_idx] = 1

        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)
        recalls = TP_cumsum / (total_true_bboxes + epsilon)
        precisions = torch.divide(TP_cumsum, (TP_cumsum + FP_cumsum + epsilon))
        precisions = torch.cat((torch.tensor([1]), precisions))
        recalls = torch.cat((torch.tensor([0]), recalls))
        # torch.trapz for numerical integration
        average_precisions.append(torch.trapz(precisions, recalls))

    return sum(average_precisions) / len(average_precisions)

In [193]:
def get_box(pred : torch.Tensor,
            S : int, B : int, C : int,
            threshold : float = 0.5, iou_threshold : float = 0.5,
            use_nms : bool = False,
            box_format : str = 'midpoint') -> torch.Tensor:
    """
    pred : tensor(Batch_size, S * S, C + B * 5)
    """
    
    """
    input : pred : tensor(Batch_size, S * S, C + B * 5)

    1. For each grid-point,find the best_box which have the following properties
        i. class identifier of the box
        ii. confidence score for the class that it is predicting
        iii. box coordinates, which should be normalized relative to the entire image
             model infference predicts the boxes that are normalized to the grid cell

     output : temp : tensor(Batch_size, S * S, 6[pred_class, conf_score, box_coords])            
    """
    #i. and ii.
    best_conf = pred[..., C].unsqueeze(-1) 
    best_box  = pred[..., C + 1 : C + 5]

    for b in range(B):
        box_index = C + b * 5

        curr_conf = pred[..., box_index].unsqueeze(-1)
        curr_box  = pred[..., box_index + 1 : box_index + 5]

        confs = torch.cat((best_conf, curr_conf), dim= -1)
        max_conf, max_conf_idx = torch.max(confs, dim= -1, keepdims= True)

        best_conf = max_conf
        best_box  = (1 - max_conf_idx) * best_box + max_conf_idx * curr_box
    
    best_conf, pred_class = torch.max(best_conf * pred[..., :C], 
                                      dim= -1, keepdims= True)
    # iii.
    index = torch.arange(S)
    batch_size = best_box.shape[0]

    x = index.repeat(S, 1)
    y = x.permute(1, 0)
    x = x.repeat(batch_size, 1, 1).unsqueeze(dim= -1)
    y = y.repeat(batch_size, 1, 1).unsqueeze(dim= -1)

    grid_cell_index = torch.cat((x, y), dim= -1)

    best_box[..., :2] = (1 / S) * (grid_cell_index + best_box[..., :2])
    best_box[..., 2:] = (1 / S) * best_box[..., 2:]
    
    # output
    temp = torch.cat((pred_class, best_conf, best_box), dim= -1)
    
    """
    input : temp : tensor(Batch_size, S, S, 6[pred_class, conf_score, box_coords])  
    
    2. Convert the to the tensor(total_boxes, 6[pred_class, conf_score, box_coords])
        i. for each sample convert it to"""
    batch_size  = temp.shape[0]
    final_boxes = []
    
    for b in range(batch_size):
        boxes = temp[b].reshape(S * S, -1)
        
        if use_nms == True:
            boxes = non_max_suppression(
                            boxes, 
                            iou_threshold, threshold,
                            box_format)

        sample_id = torch.tensor([[b] for i in range(boxes.shape[0])], requires_grad= False)
        nms_boxes = torch.cat((sample_id, boxes), dim= 1)
        
        if b == 0:
            final_boxes = nms_boxes
        else:
            final_boxes = torch.cat((final_boxes, nms_boxes), dim= 0)
    
    return final_boxes

# YOLO Loss

![Loss Function](https://i.stack.imgur.com/IddFu.png)

In [194]:
def YoloLoss(S : float, B : float, C : float, 
             predictions : torch.Tensor, target : torch.Tensor) -> torch.Tensor:
        
        epsilon = 1e-6
        lambda_noobj = 0.5
        lambda_coord = 5
        mse = nn.MSELoss(reduction="sum")

        # predictions are shaped BATCH_SIZE * (S * S * (C + B * 5)) when inputted
        # they are re-shaped to BATCH_SIZE * S * S * (C + B * 5)
        predictions = predictions.reshape(-1, S, S, C + B * 5)

        # Calculate IoU for the two predicted bounding boxes with target bbox
        best_iou = intersection_over_union(
                        predictions[..., C + 1 : C + 5],
                        target[..., C + 1 : C + 5]
                   )
        best_iou_box = predictions[..., C : C + 5]
        
        
        for i in range(B):
            curr_box_index  = C + i * 5  
            next_box_index  = curr_box_index + 5
            
            curr_iou     = intersection_over_union(
                                predictions[..., curr_box_index + 1 : next_box_index],
                                target[..., C + 1 : C + 5]
                            )
            curr_iou_box = predictions[..., curr_box_index : next_box_index]
            
            ious                 = torch.cat((best_iou, curr_iou), dim= -1)
            max_iou, max_iou_idx = torch.max(ious, dim= -1, keepdims= True)
            
            best_iou     = max_iou
            best_iou_box = max_iou_idx * curr_iou_box + (1 - max_iou_idx) * best_iou_box

        # In paper this is Iobj_i
        exists_box = target[..., C].unsqueeze(-1)
        
        box_predictions = best_iou_box
        box_targets     = torch.clone(target[..., C : C + 5])
        
        # ======================== #
        #   FOR BOX COORDINATES    #
        # ======================== #
        
        # Take sqrt of width, height of boxes
        box_predictions[..., 3:5] = torch.sign(box_predictions[..., 3:5]) * \
                                    torch.sqrt(torch.abs(box_predictions[..., 3:5] + epsilon))
            
        box_targets[..., 3:5]     = torch.sqrt(box_targets[..., 3:5])

        # end_dim = -2 means the tensors will converted to (BATCH_SIZE * S * S) * 4, 
        # where 4 coordinates are (x, y, sqrt(w), sqrt(h))
        box_loss = mse(
                        torch.flatten(exists_box * box_predictions, end_dim= -2)[..., 1:5],
                        torch.flatten(exists_box * box_targets, end_dim= -2)[..., 1:5],
                   )

        # ==================== #
        #   FOR OBJECT LOSS    #
        # ==================== #

        object_loss = mse(
                           torch.flatten(box_predictions, end_dim= -2)[..., 0],
                           torch.flatten(box_targets, end_dim= -2)[..., 0],
                      )

        # ======================= #
        #   FOR NO OBJECT LOSS    #
        # ======================= #

        no_object_loss = 0
        
        for i in range(B):
            curr_box_index  = C + i * 5  
            
            no_object_loss = mse(
                                  torch.flatten((1 - exists_box) * predictions[..., curr_box_index : curr_box_index + 1], start_dim=1),
                                  torch.flatten((1 - exists_box) * target[..., C : C + 1], start_dim=1),
                              )

        # ================== #
        #   FOR CLASS LOSS   #
        # ================== #

        class_loss = mse(
                          torch.flatten(exists_box * predictions[..., :C], end_dim=-2),
                          torch.flatten(exists_box * target[..., :C], end_dim=-2),
                     )

        loss = (
              lambda_coord * box_loss  # first two rows in paper
            + object_loss  # third row in paper
            + lambda_noobj * no_object_loss  # forth row
            + class_loss  # fifth row
        )

        return loss

# Custom Dataset Creation

In the notebook space there are two folders, Image and labels. For each x.jpg in the Image folder there is a x.txt file in labels folder. x.txt file contain several lines where each line represents a bounding box for the object of certain class. the x.txt file looks like the following

1. *class_label, x, y, w, h*
2. .
3. .
4. .



1. x, y : mid-points of bounding box and are normalized
2. w, h : width and height of the bounding box and normalized
3. class_label : class of the object in the bounding box defined as 0 to C - 1

There are also ceratain .csv file. which maps an image file in the Image folder to it label file in the labels folder. Namely there is train.csv that is used for the traning purposes and test.csv for testing purposes.


To create ground truth labels for the image we need to resize the image to *(448, 448)* and convert it to tensor.

After that the image is logically divided into *S * S* size grid, since the bounding box coordinates are normalized between 0 and 1 therefore the size of each grid will be *(1/S * 1/S)*. Each grid cell is a tensor of size *C + 5*. 

So the grid for the image is like G[S][S][C + 5], all the values are initially 0.

We need to assign each of the bounding box in the labels files to a grid cell and logically it is assigned to the grid cell in which its mid-point resides. To find the grid cell of the bounding box we do

i, j = int ( y / (1 / S)), int(x / (1 / S))

Also, we need to find the position of x and y relative to the start of the grid and and that is done as following

x', y' =  x * S - j, y * S - i

Now, 
1. G[i][j][class_label] = 1,
2. G[i][j][C] = 1, confidence that there is an object in the cell
3. G[i][j][C + 1] = x'
4. G[i][j][C + 2] = y'
5. G[i][j][C + 3] = w
6. G[i][j][C + 4] = h

This is done for all the boxes in the labels file

In [195]:
class VOCDataset(torch.utils.data.Dataset):
    def __init__(
        self, csv_file, img_dir, label_dir, S, B, C, transform= None,
    ):
        self.annotations = pd.read_csv(csv_file)
        self.img_dir     = img_dir
        self.label_dir   = label_dir
        self.transform   = transform
        self.S = S
        self.B = B
        self.C = C

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        label_path = os.path.join(self.label_dir, self.annotations.iloc[index, 1])
        boxes      = []
        with open(label_path) as f:
            for label in f.readlines():
                class_label, x, y, width, height = [
                    float(x) if float(x) != int(float(x)) else int(x)
                    for x in label.replace("\n", "").split()
                ]
                boxes.append([class_label, x, y, width, height])

        img_path = os.path.join(self.img_dir, self.annotations.iloc[index, 0])
        image    = Image.open(img_path)
        boxes    = torch.tensor(boxes)

        if self.transform:
            image, boxes = self.transform(image, boxes)

        # Convert To Cells
        label_matrix = torch.zeros((self.S, self.S, self.C + 5 * self.B))
        for box in boxes:
            class_label, x, y, width, height = box.tolist()
            class_label = int(class_label)
            
            i, j = int(self.S * y), int(self.S * x)
            x_cell, y_cell = self.S * x - j, self.S * y - i

            """
            Calculating the width and height of cell of bounding box,
            relative to the cell is done by the following, with
            width as the example:
            
            width_pixels = (width*self.image_width)
            cell_pixels = (self.image_width)
            
            Then to find the width relative to the cell is simply:
            width_pixels/cell_pixels, simplification leads to the
            formulas below.
            """
            width_cell, height_cell = (
                width * self.S,
                height * self.S,
            )

            # If no object already found for specific cell i,j
            # Note: This means we restrict to ONE object
            # per cell!
            if label_matrix[i, j, self.C] == 0:
                # Set that there exists an object
                label_matrix[i, j, self.C] = 1

                # Box coordinates
                box_coordinates = torch.tensor(
                    [x_cell, y_cell, width_cell, height_cell]
                )

                label_matrix[i, j, self.C + 1 : self.C + 5] = box_coordinates

                # Set one hot encoding for class_label
                label_matrix[i, j, class_label] = 1

        return image, label_matrix

# Macros

In [235]:
seed = 123
torch.manual_seed(seed)

# Hyperparameters etc. 
LEARNING_RATE = 2e-5
DEVICE = "cuda" if torch.cuda.is_available else "cpu"
# DEVICE = "cpu"
BATCH_SIZE = 16 # 64 in original paper but I don't have that much vram, grad accum?
WEIGHT_DECAY = 0
EPOCHS = 20
NUM_WORKERS = 2
PIN_MEMORY = True
LOAD_MODEL = False
LOAD_MODEL_FILE = "./overfit.pth.tar"
IMG_DIR = "../input/pascalvoc-yolo/images"
LABEL_DIR = "../input/pascalvoc-yolo/labels"


split_size    = 7
box_per_cell  = 2
classes       = 20
iou_threshold = 0.5 #Threshold for the consideration that boxes ar
threshold     = 0.005 #Theshold for the confidence score
box_format    = 'midpoint'

# Creating dataset and the dataloader

In [236]:
"""
This custom class is created so that if some wierd transform comes where as a result
of the transform we need to change the box as well

by default we could have done transforms.Compose([....])
"""
class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, bboxes):
        for t in self.transforms:
            img, bboxes = t(img), bboxes

        return img, bboxes

transform = Compose([transforms.Resize((448, 448)), transforms.ToTensor(),])

train_dataset = VOCDataset(
    "../input/pascalvoc-yolo/100examples.csv",
    transform=transform,
    img_dir=IMG_DIR,
    label_dir=LABEL_DIR,
    S= split_size, B= box_per_cell, C= classes
)

test_dataset = VOCDataset(
    "../input/pascalvoc-yolo/test.csv", 
    transform=transform, 
    img_dir=IMG_DIR,
    label_dir=LABEL_DIR,
    S= split_size, B= box_per_cell, C= classes
)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
    shuffle=True,
    drop_last=False,
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
    shuffle=True,
    drop_last=False,
)

# Model, Optimizer and the loss function

In [237]:
#1. Defining the model. Moving it to the CUDA if present
model = Yolov1(split_size= split_size, num_boxes= box_per_cell, num_classes= classes)

#2. Defining the optimizer
optimizer = optim.Adam(
    model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY
)

#3. Defining the loss function
def customeLossFunction(pred : torch.Tensor, label : torch.Tensor) -> torch.Tensor:
    return YoloLoss(S= split_size, B= box_per_cell, C= classes,
                    predictions= pred, target= label)
    
loss_fn = customeLossFunction

#4. Defining the metric
def customMetric(model : nn.Module, loader : DataLoader) -> float:
    pred_boxes = 0
    true_boxes = 0
    
    model.eval()
    for idx, (img, label) in enumerate(loader):
        pred = model(img)
        if idx == 0:
            pred_boxes = pred
            true_boxes = label
        else:
            pred_boxes = torch.cat((pred_boxes, pred), dim= 0)
            true_boxes = torch.cat((true_boxes, label), dim= 0)
    model.train()

    batch_size = pred_boxes.shape[0]
    pred_boxes = torch.reshape(pred_boxes, (batch_size, split_size, split_size, -1))
    
    pred_boxes = get_box(pred_boxes, S= split_size, B= box_per_cell, C= classes,
                         threshold= threshold, iou_threshold= iou_threshold,
                         use_nms= True,
                         box_format= box_format)
    
    true_boxes = get_box(true_boxes, S= split_size, B= box_per_cell, C= classes,
                         threshold= threshold, iou_threshold= iou_threshold,
                         use_nms= False,
                         box_format= box_format)

    return mean_average_precision_gaurav(pred_boxes, true_boxes, 
                                      num_classes= classes,
                                      iou_threshold= iou_threshold, box_format= box_format)
    
Metric = customMetric

if LOAD_MODEL:
    load_checkpoint(torch.load(LOAD_MODEL_FILE), model, optimizer)

In [238]:
def train_step(model, loader, loss_fn, opt):
    batch_loss = []
    
    model.train()
    for img, label in loader:
        pred = model(img)

        loss = loss_fn(pred, label)
        batch_loss.append(loss.item())
        
        opt.zero_grad()
        loss.backward()
        opt.step()
    
    return sum(batch_loss)/len(batch_loss) 

In [None]:
load_checkpoint(torch.load(LOAD_MODEL_FILE), model, optimizer)

for epoch in range(EPOCHS):

    step_loss = train_step(model, train_loader, loss_fn, optimizer)
    mAP       = Metric(model, train_loader)

    print(f"epoch: {epoch}, Loss: {step_loss}, mAP: {mAP}")

checkpoint = {
        "state_dict": model.state_dict(),
        "optimizer": optimizer.state_dict(),
}
save_checkpoint(checkpoint, filename=LOAD_MODEL_FILE)

=> Loading checkpoint
epoch: 0, Loss: 661.6145760672433, mAP: 0.0
epoch: 1, Loss: 444.2152317592076, mAP: 1.4318439218946404e-05
epoch: 2, Loss: 356.4292471749442, mAP: 0.0
epoch: 3, Loss: 290.91578783307756, mAP: 1.5782826023796552e-05
epoch: 4, Loss: 239.77950286865234, mAP: 0.012361927411071385
epoch: 5, Loss: 204.75702558244978, mAP: 0.01862617431017064


In [213]:
def plot_image(image, boxes):
    """Plots predicted bounding boxes on the image"""
    im = np.array(image)
    height, width, _ = im.shape

    # Create figure and axes
    fig, ax = plt.subplots(1)
    # Display the image
    ax.imshow(im)

    # box[0] is x midpoint, box[2] is width
    # box[1] is y midpoint, box[3] is height

    # Create a Rectangle potch
    for box in boxes:
        box = box[2:]
        assert len(box) == 4, "Got more values than in x, y, w, h, in a box!"
        upper_left_x = box[0] - box[2] / 2
        upper_left_y = box[1] - box[3] / 2
        rect = patches.Rectangle(
            (upper_left_x * width, upper_left_y * height),
            box[2] * width,
            box[3] * height,
            linewidth=5,
            edgecolor="r",
            facecolor="none",
        )
        # Add the patch to the Axes
        ax.add_patch(rect)

    plt.show()

def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)


def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

# Test code to plot the ground truth boxes

In [None]:


IMG_DIR       = "../input/pascalvoc-yolo/images"
LABEL_DIR     = "../input/pascalvoc-yolo/labels"
img_label_map = pd.read_csv('../input/pascalvoc-yolo/100examples.csv')
index         = 1
preprocess = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.ToTensor()
])

box_coord = []

label_path = os.path.join(LABEL_DIR, img_label_map.iloc[index, 1])

with open(label_path) as f:
    for label in f.readlines():
        class_label, x, y, width, height = [
            float(x) if float(x) != int(float(x)) else int(x)
            for x in label.replace("\n", "").split()
        ]
        
        box_coord.append([0, 0, x, y, width, height])

img_path = os.path.join(IMG_DIR, img_label_map.iloc[index, 0])
img = Image.open(img_path)
img = preprocess(img)

print(box_coord)
plot_image(img.permute(1, 2, 0), box_coord)


