<a href="https://colab.research.google.com/github/skj092/Computer_Vision_Lab/blob/main/FasterRCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch, torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchsummary import summary

In [2]:
# Number of classes handed to the replacement box predictor below.
num_classes = 2

# Faster R-CNN with a ResNet-50 FPN backbone, initialised from the COCO checkpoint.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# Swap the pre-trained box predictor for a fresh one. The new head has to accept
# the same feature width as the classification layer it replaces, so read that
# width off the existing head first.
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth


  0%|          | 0.00/160M [00:00<?, ?B/s]

In [3]:
# Display the detector's module tree (transform, backbone, ...).
model

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [4]:
# Print only the body of the backbone (the IntermediateLayerGetter inside BackboneWithFPN).
print(model.backbone.body)

IntermediateLayerGetter(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): FrozenBatchNorm2d(64, eps=0.0)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): FrozenBatchNorm2d(64, eps=0.0)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): FrozenBatchNorm2d(256, eps=0.0)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): FrozenBatchNorm2d(256, eps=0.0)
      )
    )
    (1): Bottleneck(
      (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
     

In [5]:
# Pull the feature extractor out on its own so it can be inspected in isolation.
backbone = model.backbone.body

# Per-layer output shapes and parameter counts for a single 3x224x224 input.
# torchsummary defaults to device="cuda" and builds a CUDA input whenever a GPU
# is visible; the model in this notebook is never moved off the CPU, so pin the
# device explicitly to keep this cell working on GPU runtimes as well.
summary(backbone, (3, 224, 224), device="cpu")

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
 FrozenBatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]           4,096
 FrozenBatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
 FrozenBatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]          16,384
FrozenBatchNorm2d-12          [-1, 256, 56, 56]             512
           Conv2d-13          [-1, 256, 56, 56]          16,384
FrozenBatchNorm2d-14          [-1, 256,

In [6]:
# Push one random 3x224x224 image through the backbone and check what comes back.
# The tensor is named `dummy_input` rather than `input` so the builtin input()
# is not shadowed for the rest of the notebook session.
dummy_input = torch.randn(1, 3, 224, 224)
output = backbone(dummy_input)
print(type(output))  # the result is a collections.OrderedDict

<class 'collections.OrderedDict'>


In [7]:
# List the names under which the backbone stores its feature maps.
print(output.keys()) # odict_keys(['0', '1', '2', '3'])


odict_keys(['0', '1', '2', '3'])


In [8]:
# Shape of every feature map, walking the keys in the order printed above.
for level in ('0', '1', '2', '3'):
    print(output[level].shape)

torch.Size([1, 256, 56, 56])
torch.Size([1, 512, 28, 28])
torch.Size([1, 1024, 14, 14])
torch.Size([1, 2048, 7, 7])


In [9]:
# RoIAlign pooler that reads only the feature map named '1' and produces a
# 7x7 crop per box.
roi_pooler = torchvision.ops.MultiScaleRoIAlign(
    featmap_names=['1'],
    output_size=7,
    sampling_ratio=2,
)

In [10]:
# Ten random boxes: draw four values in [0, 256) per box, then add columns 0-1
# onto columns 2-3 so the second corner is never smaller than the first
# (the (x1, y1, x2, y2) layout the RoI pooler expects).
boxes = torch.rand(10, 4) * 256
boxes[:, 2:] += boxes[:, :2]
boxes

tensor([[ 80.3295,  18.0511, 245.9047, 209.7267],
        [ 12.1265, 235.6114, 193.9242, 387.4028],
        [118.6245,  28.6781, 138.0499,  92.0241],
        [215.1286,  35.0997, 458.7510, 141.4454],
        [210.6844, 173.9176, 438.6125, 197.6967],
        [242.4908, 117.9391, 347.7723, 267.2972],
        [121.4364, 150.5512, 298.4742, 182.4809],
        [155.6185, 189.0894, 278.6864, 444.7293],
        [ 93.7294,  82.7825,  95.9084, 185.7234],
        [148.7873,  30.1920, 214.0348, 224.0678]])

In [11]:
# One (height, width) entry per image; passed to the RoI pooler alongside the boxes.
# NOTE(review): the `output` feature maps pooled with this were computed from a
# 224x224 input, yet the image is declared as 512x512 here. The pooler derives
# its box-to-feature scale from these sizes, so confirm the mismatch is intended.
image_sizes = [(512, 512)]

In [12]:
# Pool a fixed-size crop for every box from the backbone features; boxes are
# passed as a list with one tensor per image.
out = roi_pooler(output, [boxes], image_sizes)

In [13]:
# Shape of the pooled result.
out.shape

torch.Size([10, 512, 7, 7])

In [14]:
# Display the detector's module tree once more.
model

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [16]:
# Rebind `model` to a Faster R-CNN built with default arguments; this replaces
# the detector with the customised head created earlier in the notebook.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn()

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]

# Debugging Model

In [17]:
# Synthetic training batch: 4 random 3x600x1200 images with 11 boxes each.
images, boxes = torch.rand(4, 3, 600, 1200), torch.rand(4, 11, 4)
# Add the first corner onto the last two columns so that x2 >= x1 and y2 >= y1.
boxes[:, :, 2:4] += boxes[:, :, 0:2]
# One integer label in [1, 90] per box.
labels = torch.randint(1, 91, (4, 11))

# Unstack the batch into a list of per-image tensors and build a parallel list
# of target dicts, one per image.
images = list(images)
targets = [
    {'boxes': image_boxes, 'labels': image_labels}
    for image_boxes, image_labels in zip(boxes, labels)
]

In [18]:
# Call the detector with both images and targets. This rebinds `output`, which
# earlier held the backbone feature maps.
output = model(images, targets)

In [19]:
# Check the container type returned by the call above.
type(output)

dict

In [20]:
# List the entries of the returned dict.
output.keys()

dict_keys(['loss_classifier', 'loss_box_reg', 'loss_objectness', 'loss_rpn_box_reg'])

# Backbone Output

In [29]:
# Feed a random batch of four 3x600x1200 images straight into the backbone body
# and report the container type, the feature-map names, and each map's shape.
backbone = model.backbone.body
images = torch.rand(4, 3, 600, 1200)
out_backbone = backbone(images)
print(type(out_backbone))
print(out_backbone.keys())
for feature_map in out_backbone.values():
    print(feature_map.shape)

<class 'collections.OrderedDict'>
odict_keys(['0', '1', '2', '3'])
torch.Size([4, 256, 150, 300])
torch.Size([4, 512, 75, 150])
torch.Size([4, 1024, 38, 75])
torch.Size([4, 2048, 19, 38])


# FPN Output

In [36]:
# Run the backbone feature maps through the FPN and report the container type,
# the level names, and the shape of every level.
fpn = model.backbone.fpn
fpn_out = fpn(out_backbone)
print(type(fpn_out))
print(fpn_out.keys())
for level_features in fpn_out.values():
    print(level_features.shape)

<class 'collections.OrderedDict'>
odict_keys(['0', '1', '2', '3', 'pool'])
torch.Size([4, 256, 150, 300])
torch.Size([4, 256, 75, 150])
torch.Size([4, 256, 38, 75])
torch.Size([4, 256, 19, 38])
torch.Size([4, 256, 10, 19])


# RPN 