<a href="https://colab.research.google.com/github/skj092/Object-Detection-with-Oxford-IIIT-Pet-Dataset/blob/main/iiit_object_detection_pets_fastai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from fastai.vision.all import *

In [2]:
path = untar_data(URLs.PETS)
Path.BASE_PATH  = path
path.ls()

(#2) [Path('annotations'),Path('images')]

In [14]:
import os
import torch
import torchvision.transforms as T
from torch.utils.data import Dataset
from PIL import Image
import xml.etree.ElementTree as ET

class CustomObjectDetectionDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.xml_files = [file for file in os.listdir(os.path.join(path, 'annotations/xmls')) if file.endswith('.xml')]

    def __len__(self):
        return len(self.xml_files)

    def __getitem__(self, idx):
        xml_file = os.path.join(self.root_dir, 'annotations/xmls', self.xml_files[idx])
        img_name = os.path.splitext(self.xml_files[idx])[0] + '.jpg'
        img_path = os.path.join(self.root_dir, 'images', img_name)

        # Load image
        img = Image.open(img_path).convert("RGB").resize((224,224))

        # Load and parse XML annotation
        tree = ET.parse(xml_file)
        root = tree.getroot()

        # Extract image size
        width = int(root.find('size').find('width').text)
        height = int(root.find('size').find('height').text)

        # Initialize lists for target data
        boxes = []
        labels = []

        # Extract bounding box information
        for obj in root.findall('object'):
            label = obj.find('name').text
            xmin = int(obj.find('bndbox').find('xmin').text)
            ymin = int(obj.find('bndbox').find('ymin').text)
            xmax = int(obj.find('bndbox').find('xmax').text)
            ymax = int(obj.find('bndbox').find('ymax').text)

            # Append bounding box coordinates and label
            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(label)

        # Convert boxes and labels to tensors
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.tensor([labels.index(label) for label in labels], dtype=torch.int64)

        # Calculate area (optional)
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        # Define iscrowd (optional)
        iscrowd = torch.zeros((len(boxes),), dtype=torch.int64)

        # Create target dictionary
        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": torch.tensor([idx]),
            "area": area,
            "iscrowd": iscrowd
        }

        if self.transform:
            img, target = self.transform(img, target)

        return img, target


In [15]:
import matplotlib.pyplot as plt
import numpy as np

def visualize_dataset_sample(dataset, index, class_names=None):
    """
    Visualizes a single data sample from a dataset.

    Args:
        dataset (Dataset): The PyTorch dataset containing the data samples.
        index (int): The index of the data sample to visualize.
        class_names (list): Optional list of class names corresponding to label indices.
    """
    sample = dataset[index]
    image, target = sample

    image = np.array(image)
    boxes = target["boxes"].numpy()
    labels = target["labels"].numpy()

    plt.figure(figsize=(8, 6))
    plt.imshow(image)
    plt.axis('off')

    for box, label in zip(boxes, labels):
        xmin, ymin, xmax, ymax = box
        rect = plt.Rectangle(
            (xmin, ymin), xmax - xmin, ymax - ymin, fill=False, edgecolor='red', linewidth=2)
        plt.gca().add_patch(rect)

        if class_names:
            label_name = class_names[label]
        else:
            label_name = str(label)

        plt.text(xmin, ymin, label_name, backgroundcolor='red', color='white', fontsize=8)

    plt.show()

In [16]:
# visualize_dataset_sample(ds, 1, class_names=['dog', 'cat'])

In [17]:
from torchvision.transforms import v2 as T


def get_transform(train):
    transforms = []
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
    transforms.append(T.ToDtype(torch.float))
    transforms.append(T.ToTensor())
    return T.Compose(transforms)

In [18]:
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/engine.py")
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/utils.py")
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_utils.py")
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_eval.py")
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/transforms.py")

0

In [19]:
import torch
from torch.utils.data import random_split, DataLoader, Subset

ds = CustomObjectDetectionDataset(path, transform=get_transform(train=True))


# Define the dataset size and the desired split ratio
dataset_size = len(ds)
validation_split = 0.2  # 20% of the data will be used for validation

# Calculate the sizes of the training and validation sets
valid_size = int(validation_split * dataset_size)
train_size = dataset_size - valid_size

# Use random_split to split the dataset into train and validation subsets
train_subset, valid_subset = random_split(ds, [train_size, valid_size])

# Create DataLoader objects for train and validation sets
batch_size = 32  # You can adjust this to your preference
train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_subset, batch_size=batch_size, shuffle=False)

# Optionally, you can create a complete dataset for train and validation
train_ds = Subset(ds, train_subset.indices)
valid_ds = Subset(ds, valid_subset.indices)

len(train_ds), len(valid_ds)



(2949, 737)

In [20]:
def collate_fn(batch):
    return tuple(zip(*batch))

train_dl=DataLoader(train_ds,batch_size=batch_size,shuffle=True,num_workers=2,
                    pin_memory=True if torch.cuda.is_available else False,
                    collate_fn=collate_fn)
val_dl=DataLoader(valid_ds,batch_size=batch_size,shuffle=False,num_workers=2,
                  pin_memory=True if torch.cuda.is_available else False,
                  collate_fn=collate_fn)

len(train_dl), len(val_dl)


(93, 24)

In [21]:
xb, yb = next(iter(train_dl))
xb[0], yb[0]

(tensor([[[0.5098, 0.4078, 0.4039,  ..., 0.3725, 0.3216, 0.1961],
          [0.5059, 0.4471, 0.3961,  ..., 0.3490, 0.3176, 0.1725],
          [0.4980, 0.4706, 0.3804,  ..., 0.3490, 0.3059, 0.1725],
          ...,
          [0.0627, 0.0549, 0.0471,  ..., 0.1020, 0.1020, 0.1294],
          [0.0627, 0.0353, 0.0549,  ..., 0.1020, 0.0980, 0.1176],
          [0.0627, 0.0392, 0.0549,  ..., 0.1137, 0.1137, 0.1216]],
 
         [[0.6353, 0.4627, 0.3294,  ..., 0.3098, 0.2392, 0.1137],
          [0.6314, 0.5098, 0.3333,  ..., 0.2863, 0.2353, 0.0980],
          [0.6196, 0.5529, 0.3373,  ..., 0.2824, 0.2196, 0.1059],
          ...,
          [0.0980, 0.1020, 0.1059,  ..., 0.1294, 0.1333, 0.1373],
          [0.1059, 0.0980, 0.1098,  ..., 0.1373, 0.1333, 0.1255],
          [0.0980, 0.1059, 0.1059,  ..., 0.1490, 0.1412, 0.1333]],
 
         [[0.6745, 0.4196, 0.1922,  ..., 0.2196, 0.1882, 0.1020],
          [0.6706, 0.4745, 0.2078,  ..., 0.1961, 0.1804, 0.0745],
          [0.6471, 0.5294, 0.2235,  ...,

In [22]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# load a model pre-trained on COCO
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")

# replace the classifier with a new one, that has
# num_classes which is user-defined
num_classes = 3  # 1 class (person) + background
# get number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features
# replace the pre-trained head with a new one
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

In [26]:
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

# load a pre-trained model for classification and return
# only the features
backbone = torchvision.models.mobilenet_v2(weights="DEFAULT").features
# ``FasterRCNN`` needs to know the number of
# output channels in a backbone. For mobilenet_v2, it's 1280
# so we need to add it here
backbone.out_channels = 1280

# let's make the RPN generate 5 x 3 anchors per spatial
# location, with 5 different sizes and 3 different aspect
# ratios. We have a Tuple[Tuple[int]] because each feature
# map could potentially have different sizes and
# aspect ratios
anchor_generator = AnchorGenerator(
    sizes=((32, 64, 128, 256, 512),),
    aspect_ratios=((0.5, 1.0, 2.0),)
)

# let's define what are the feature maps that we will
# use to perform the region of interest cropping, as well as
# the size of the crop after rescaling.
# if your backbone returns a Tensor, featmap_names is expected to
# be [0]. More generally, the backbone should return an
# ``OrderedDict[Tensor]``, and in ``featmap_names`` you can choose which
# feature maps to use.
roi_pooler = torchvision.ops.MultiScaleRoIAlign(
    featmap_names=['0'],
    output_size=7,
    sampling_ratio=2,
)

# put the pieces together inside a Faster-RCNN model
model = FasterRCNN(
    backbone,
    num_classes=3,
    rpn_anchor_generator=anchor_generator,
    box_roi_pool=roi_pooler,
)

In [24]:
# import utils


# model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")

# # For Training
# images, targets = next(iter(train_dl))
# images = list(image for image in images)
# targets = [{k: v for k, v in t.items()} for t in targets]
# output = model(images, targets)  # Returns losses and detections
# print(output)

# # For inference
# model.eval()
# x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
# predictions = model(x)  # Returns predictions
# print(predictions[0])

In [27]:
from engine import train_one_epoch, evaluate
from torch.utils.data import Subset, random_split

# train on the GPU or on the CPU, if a GPU is not available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# our dataset has two classes only - background and person
num_classes = 3
# use our dataset and defined transformations
ds = CustomObjectDetectionDataset(path, transform=get_transform(train=True))
ds = Subset(ds, range(500))


# Define the dataset size and the desired split ratio
dataset_size = len(ds)
validation_split = 0.2  # 20% of the data will be used for validation

# Calculate the sizes of the training and validation sets
valid_size = int(validation_split * dataset_size)
train_size = dataset_size - valid_size

# Use random_split to split the dataset into train and validation subsets
train_subset, valid_subset = random_split(ds, [train_size, valid_size])

# Create DataLoader objects for train and validation sets
batch_size = 4  # You can adjust this to your preference
train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_subset, batch_size=batch_size, shuffle=False)

# Optionally, you can create a complete dataset for train and validation
train_ds = Subset(ds, train_subset.indices)
valid_ds = Subset(ds, valid_subset.indices)

train_dl=DataLoader(train_ds,batch_size=batch_size,shuffle=True,num_workers=2,
                    pin_memory=True if torch.cuda.is_available else False,
                    collate_fn=collate_fn)
val_dl=DataLoader(valid_ds,batch_size=batch_size,shuffle=False,num_workers=2,
                  pin_memory=True if torch.cuda.is_available else False,
                  collate_fn=collate_fn)

# get the model using our helper function
# model = get_model_instance_segmentation(num_classes)
# model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")

# move model to the right device
model.to(device)

# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(
    params,
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005
)

# and a learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.1
)

# let's train it for 5 epochs
num_epochs = 1

for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    train_one_epoch(model, optimizer, train_dl, device, epoch, print_freq=10)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    # evaluate(model, val_dl, device=device)

print("That's it!")

Epoch: [0]  [  0/100]  eta: 0:01:56  lr: 0.000055  loss: 1.8654 (1.8654)  loss_classifier: 1.0862 (1.0862)  loss_box_reg: 0.0000 (0.0000)  loss_objectness: 0.7220 (0.7220)  loss_rpn_box_reg: 0.0572 (0.0572)  time: 1.1624  data: 0.1864  max mem: 5586
Epoch: [0]  [ 10/100]  eta: 0:00:57  lr: 0.000560  loss: 1.8076 (3.4857)  loss_classifier: 1.0524 (1.0219)  loss_box_reg: 0.0000 (0.0000)  loss_objectness: 0.7114 (0.7116)  loss_rpn_box_reg: 0.0327 (1.7522)  time: 0.6422  data: 0.0197  max mem: 5903
Epoch: [0]  [ 20/100]  eta: 0:00:49  lr: 0.001065  loss: 1.5803 (2.4634)  loss_classifier: 0.8116 (0.7958)  loss_box_reg: 0.0000 (0.0000)  loss_objectness: 0.6951 (0.6862)  loss_rpn_box_reg: 0.0440 (0.9813)  time: 0.5960  data: 0.0024  max mem: 5903
Epoch: [0]  [ 30/100]  eta: 0:00:43  lr: 0.001569  loss: 0.9486 (1.9282)  loss_classifier: 0.2615 (0.5992)  loss_box_reg: 0.0000 (0.0000)  loss_objectness: 0.6105 (0.6424)  loss_rpn_box_reg: 0.0680 (0.6866)  time: 0.6013  data: 0.0014  max mem: 5903
