#### Setup Code

In [1]:
%load_ext autoreload
%autoreload 2

##### Google Colab Setup
We need to run a few commands to set up our environment on Google Colab. If you are running this notebook on a local machine, you can skip this section. Run the following cell to mount your Google Drive.

In [2]:
# Colab only: mount Google Drive at /content/drive so the files uploaded there
# become visible on the notebook's filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import sys

# TODO: Fill in the Google Drive path where you uploaded the assignment
# Example: If you create a 'Test' folder and put all the files under 'example' folder, then 'Test/example'
# GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'Test/example'
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'GIT/tutorials/utils/'

# Anchor the path at the Drive mount point ('/content/drive') instead of the
# current working directory: a relative sys.path entry silently stops working
# as soon as the working directory changes (e.g. after a `%cd`).
GOOGLE_DRIVE_PATH = os.path.join('/content', 'drive', 'My Drive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)

# Make the custom packages (data, colab_utils, ...) importable. The guard keeps
# the cell idempotent so re-running it does not stack duplicate entries.
if GOOGLE_DRIVE_PATH not in sys.path:
    sys.path.append(GOOGLE_DRIVE_PATH)

# Listing the folder doubles as a check that the path above is correct.
print(os.listdir(GOOGLE_DRIVE_PATH))

['__pycache__', 'for_knn.py', 'linear_classifier.py', 'custom_model_utils', 'Convolutional_Neural_Network', '_utils.py', 'save.py', '_word_processing.py', '_layers.py', 'enc2dec', 'data', 'models', 'colab_utils', 'visualize.py']


### Load PASCAL VOC 2007 data

PASCAL VOC 2007 download info: https://pytorch.org/vision/main/generated/torchvision.datasets.VOCDetection.html

In [4]:
from torch.utils.data import DataLoader

# custom packages
from data.pascal_voc import PASCALVOC, inverse_image
from data.cv_utils import detection_visualizer

In [5]:
# Build the 'train' and 'val' splits of PASCAL VOC 2007 with the project's
# PASCALVOC wrapper. With download=True and root='.', the archive is fetched
# into the current working directory; the second call reuses the already
# downloaded file but extracts it again (see the log below).
# NOTE(review): image_size=224 presumably resizes every image to 224x224 --
# confirm in data/pascal_voc.py.
train_dataset = PASCALVOC(root='.', year='2007', split='train', download=True, image_size=224)
val_dataset = PASCALVOC(root='.', year='2007', split='val', download=True, image_size=224)

Downloading http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar to ./VOCtrainval_06-Nov-2007.tar


100%|██████████| 460032000/460032000 [00:13<00:00, 34776579.86it/s]


Extracting ./VOCtrainval_06-Nov-2007.tar to .
Using downloaded and verified file: ./VOCtrainval_06-Nov-2007.tar
Extracting ./VOCtrainval_06-Nov-2007.tar to .


In [6]:
import multiprocessing
# Batched loaders over both splits, using one worker process per CPU core and
# pinned host memory for the produced batches.
# NOTE(review): shuffle is left at the DataLoader default (False), so
# train_loader yields samples in dataset order -- confirm this is intended
# before using it for training.
train_loader = DataLoader(train_dataset, batch_size=32, pin_memory=True, num_workers=multiprocessing.cpu_count())
# Validation is served one image per batch.
val_loader = DataLoader(val_dataset, batch_size=1, pin_memory=True, num_workers=multiprocessing.cpu_count())

## Faster R-CNN (2)

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# custom packages
import colab_utils.object_detection as utils
import data.cv_utils as cv_utils

In [8]:
def calculate_iou(boxes1, boxes2):
    """
    Compute the pairwise intersection-over-union between two sets of boxes.

    Boxes are given by their corner coordinates (x1, y1, x2, y2).

    Args:
        boxes1: Tensor of shape (N, 4).
        boxes2: Tensor of shape (M, 4).

    Returns:
        Tensor of shape (M, N) where entry [j, i] is the IoU between
        boxes2[j] and boxes1[i]. Note the orientation: rows follow `boxes2`,
        columns follow `boxes1`. Pairs whose union area is not positive
        (e.g. two zero-area boxes) get an IoU of 0 instead of NaN. Empty
        inputs are supported and produce an empty (M, N) result.
    """
    # Broadcast instead of materialising N*M repeated copies of the boxes:
    # first has shape (1, N, 4), second has shape (M, 1, 4).
    first = boxes1[None, :, :]
    second = boxes2[:, None, :]

    # Area of each individual box.
    first_area = (first[..., 2] - first[..., 0]) * (first[..., 3] - first[..., 1])
    second_area = (second[..., 2] - second[..., 0]) * (second[..., 3] - second[..., 1])

    # Corners of the intersection rectangle, shape (M, N).
    inter_x1 = torch.max(first[..., 0], second[..., 0])
    inter_y1 = torch.max(first[..., 1], second[..., 1])
    inter_x2 = torch.min(first[..., 2], second[..., 2])
    inter_y2 = torch.min(first[..., 3], second[..., 3])

    # Non-overlapping pairs have a negative extent; clamp it to zero.
    inter_area = (inter_x2 - inter_x1).clamp(min=0) * (inter_y2 - inter_y1).clamp(min=0)

    union_area = first_area + second_area - inter_area

    # Guard the division: a non-positive union would otherwise yield NaN/inf.
    iou = inter_area / union_area
    return torch.where(union_area > 0, iou, torch.zeros_like(iou))

In [9]:
# Small hand-made example: two sets of three boxes, each row being the
# (x1, y1, x2, y2) corners that calculate_iou indexes.
boxes1 = torch.Tensor([[10, 10, 90, 90], [60, 60, 80, 80], [30, 30, 70, 70]])
boxes2 = torch.Tensor([[10, 10, 90, 90], [20, 20, 40, 40], [60, 60, 80, 80]])

# result_iou[j, i] is the IoU of boxes2[j] with boxes1[i] (rows follow boxes2).
result_iou = calculate_iou(boxes1, boxes2)

In [10]:
def match_anchors_to_gt(anchor_boxes, gt_boxes, iou_thresholds=(0.3, 0.6)):
    """
    Assign to every anchor its best-overlapping ground-truth box, or a
    background / neutral marker when the overlap is too small.

    Args:
        anchor_boxes: Tensor of shape (A, 4) holding anchor corners
            (x1, y1, x2, y2).
        gt_boxes: Tensor of shape (G, 5); columns 0-3 are box corners and
            column 4 is the class. Rows whose class is -1 are treated as
            padding and dropped.
        iou_thresholds: Pair (low, high). An anchor whose best IoU is
            <= low becomes background; one strictly between low and high
            becomes neutral; anything else keeps its matched GT box.

    Returns:
        Tensor of shape (A, 5). Each row is the matched GT row, or is filled
        with -1 (background) or -1e8 (neutral). When no valid GT box exists,
        every row is -1.
    """
    # Filter empty GT boxes:
    gt_boxes = gt_boxes[gt_boxes[:, 4] != -1]

    # If no GT boxes are available, match all anchors to background and return.
    # One row of five -1 values per anchor (4 box coordinates + class), built in
    # the anchors' dtype/device. Joining the box and class parts must happen
    # along the column axis (dim=1); the inputs are 2-D, so dim=2 raised.
    if len(gt_boxes) == 0:
        return anchor_boxes.new_full((anchor_boxes.shape[0], 5), -1)

    # Match matrix => pairwise IoU of anchors (rows) and GT boxes (columns).
    # calculate_iou puts its second argument on the rows, hence the argument
    # order below.
    match_matrix = calculate_iou(gt_boxes[:, :4], anchor_boxes)

    # Find matched ground-truth instance per anchor:
    match_quality, matched_idxs = match_matrix.max(dim=1)
    # Indexing with an index tensor copies, so the in-place writes below do not
    # touch the caller's gt_boxes.
    matched_gt_boxes = gt_boxes[matched_idxs]

    # Set boxes with low IoU threshold to background (-1).
    matched_gt_boxes[match_quality <= iou_thresholds[0]] = -1

    # Set boxes between the two thresholds to neutral (-1e8).
    neutral_idxs = (match_quality > iou_thresholds[0]) & (match_quality < iou_thresholds[1])
    matched_gt_boxes[neutral_idxs] = -1e8
    return matched_gt_boxes


In [11]:
# Visual check of the anchor matching on the first training sample.
# train_dataset[0] yields a 3-tuple; the first element is not needed here.
_, image, gt_boxes = train_dataset[0]
# NOTE(review): generate_anchors lives in colab_utils.object_detection; it
# presumably returns an (A, 4) tensor of anchor corners for a stride-16
# feature map -- confirm against that module.
anchors = utils.generate_anchors(stride=16, ratios=[0.5, 1.0, 2.0], scales=[8])
matched_gt_boxes = match_anchors_to_gt(anchors, gt_boxes, iou_thresholds=(0.3, 0.6))

# Indices of anchors whose matched class (column 4) is positive.
# NOTE(review): `> 0` leaves out class index 0, whereas sample_training treats
# `>= 0` as foreground -- confirm which one is intended.
fg_idxs_p4 = (matched_gt_boxes[:, 4] > 0).nonzero()

# Draw each positive anchor together with the GT box it was matched to.
for fg_idx in fg_idxs_p4:
  # fg_idx is a one-element index tensor, so indexing keeps a leading
  # dimension of size 1; [0] removes it again.
  dummy_vis_boxes = [anchors[fg_idx][0], matched_gt_boxes[fg_idx][0]]

  print(f"Unlabeled red box is positive anchor: {anchors[fg_idx][0]}")
  cv_utils.detection_visualizer(
      inverse_image(image),
      train_dataset.idx_to_class,
      bbox=dummy_vis_boxes,
    )

Output hidden; open in https://colab.research.google.com to view.

In [12]:
def sample_training(gt_boxes, num_samples, fg_fraction):
    """
    Randomly pick foreground and background anchors for one training step.

    Args:
        gt_boxes: Tensor of shape (A, 5) of matched GT rows; column 4 holds
            the class. Values >= 0 mark foreground and -1 marks background;
            any other value is never sampled.
        num_samples: Total number of anchors requested.
        fg_fraction: Desired share of foreground anchors among the samples.

    Returns:
        Tuple (fg_idx, bg_idx) of 1-D index tensors into `gt_boxes`. Fewer
        than `num_samples` indices are returned when there are not enough
        background rows to fill the remainder.
    """
    labels = gt_boxes[:, 4]
    foreground = torch.nonzero(labels >= 0).squeeze(1)
    background = torch.nonzero(labels == -1).squeeze(1)

    # The foreground quota is capped by what is actually available; whatever
    # is left of the budget goes to the background.
    fg_quota = int(num_samples * fg_fraction)
    num_fg = fg_quota if fg_quota < foreground.numel() else foreground.numel()
    num_bg = num_samples - num_fg

    # Shuffle both pools (foreground first, then background) and keep a prefix.
    fg_order = torch.randperm(foreground.numel(), device=foreground.device)
    bg_order = torch.randperm(background.numel(), device=background.device)

    return foreground[fg_order[:num_fg]], background[bg_order[:num_bg]]

In [13]:
class RegionProposalNetwork(nn.Module):
    """
    Region proposal network: predicts objectness scores and box deltas on a
    backbone feature map, turns them into proposals and computes the
    objectness / box-regression losses on a sample of anchors.

    The heavy lifting is delegated to helpers from
    `colab_utils.object_detection` (imported as `utils`).

    Args:
        feat_dim: Channel count of the incoming feature map (forwarded to
            `utils.PredictionNetwork`).
        out_dim: Forwarded to `utils.PredictionNetwork`.
        num_anchors: Forwarded to `utils.PredictionNetwork`.
        img_size, stride, ratios, scales, pre_nms_topN, post_nms_topN, thresh:
            Forwarded to `utils.ProposalLayer`.
        iou_thresholds: (low, high) IoU thresholds used when matching anchors
            to ground-truth boxes.
    """

    def __init__(self, feat_dim,
                 out_dim=512,
                 num_anchors=3,
                 img_size=(224, 224),
                 stride=16,
                 ratios=[0.5, 1.0, 2.0],
                 scales=[8],
                 pre_nms_topN=400,
                 post_nms_topN=100,
                 thresh=0.7,
                 iou_thresholds=(0.3, 0.6)):
        super().__init__()

        # Pass the constructor arguments through; with the defaults this is
        # the configuration that used to be hard-coded here.
        self.prediction = utils.PredictionNetwork(feat_dim,
                                                  out_dim=out_dim,
                                                  num_anchors=num_anchors)

        # list(...) hands the layer its own copy, so the shared mutable default
        # lists in the signature can never be modified through it.
        self.proposal = utils.ProposalLayer(img_size=img_size,
                                            stride=stride,
                                            ratios=list(ratios),
                                            scales=list(scales),
                                            pre_nms_topN=pre_nms_topN,
                                            post_nms_topN=post_nms_topN,
                                            thresh=thresh)

        # Number of anchors sampled per image when computing the losses.
        self.batch_size_per_image = 1
        self.iou_thresholds = iou_thresholds

    def forward(self, base_feat, gt_boxes):
        """
        Run the RPN on a batch of feature maps and compute its losses.

        Args:
            base_feat: Backbone feature map; dimension 0 is the batch.
            gt_boxes: Ground-truth boxes for the batch, forwarded to
                `utils.get_match_anchor`.
                NOTE(review): presumably (batch, G, 5) with rows
                (x1, y1, x2, y2, class) -- confirm against the helper.

        Returns:
            Tuple (proposals, loss_obj, loss_box): the output of the proposal
            layer, and the summed objectness and box-regression losses, each
            divided by `batch_size_per_image * batch_size`.
        """
        batch_size = base_feat.size(0)
        # Total number of anchors to sample; also the loss normaliser.
        total_batch_size = self.batch_size_per_image * batch_size

        # Step 1: Predict deltas / objectness, build proposals and match every
        # anchor to a GT box.
        bbox_deltas, scores = self.prediction(base_feat)
        proposals, anchors = self.proposal(scores, bbox_deltas)
        matched_gt = utils.get_match_anchor(anchors, gt_boxes, self.iou_thresholds)

        # Collapse the batch dimension so anchors of all images share one axis.
        bbox_deltas = bbox_deltas.reshape(-1, 4)
        scores = scores.reshape(-1)
        anchors = anchors.reshape(-1, 4)
        matched_gt = matched_gt.reshape(-1, 5)

        # Step 2: Sample foreground / background anchors and build objectness
        # targets (1 for foreground, 0 for background).
        fg_idx, bg_idx = utils.sample_training(matched_gt, total_batch_size, fg_fraction=0.5)
        idx = torch.cat((fg_idx, bg_idx), 0)

        sampled_gt_fg = torch.ones_like(fg_idx)
        sampled_gt_bg = torch.zeros_like(bg_idx)
        sampled_gt_objectness = torch.cat((sampled_gt_fg, sampled_gt_bg), 0).float()

        # Step 3: Compute GT targets for box regression.
        sampled_anchor = anchors[idx]
        sampled_matched_gt = matched_gt[idx]
        sampled_scores = scores[idx]
        sampled_bbox_deltas = bbox_deltas[idx]

        sampled_gt_deltas = utils.get_deltas_from_anchors(sampled_anchor, sampled_matched_gt)

        # Step 4: Calculate objectness and box reg losses per sampled anchor.
        loss_box = F.l1_loss(sampled_bbox_deltas, sampled_gt_deltas, reduction="none")
        # Targets equal to -1e8 carry no regression signal: zero their loss.
        # NOTE(review): presumably get_deltas_from_anchors emits -1e8 for
        # background anchors -- confirm against the helper.
        loss_box[sampled_gt_deltas == -1e8] *= 0

        loss_obj = F.binary_cross_entropy_with_logits(sampled_scores, sampled_gt_objectness, reduction="none")

        # Normalise by the requested sample count, not by the number of
        # anchors actually sampled.
        loss_obj = loss_obj.sum() / total_batch_size
        loss_box = loss_box.sum() / total_batch_size

        return proposals, loss_obj, loss_box