# 230129 - TorchVision Object Detection Tutorial

[Object detection tutorial link](https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html)  
[Image labelling tool link](https://labelstud.io/)

To run Label Studio in Docker:
```
docker run -it -p 8080:8080 -v $(pwd)/mydata:/label-studio/data heartexlabs/label-studio:latest
```
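If the host CPU architecture differs from the image's (`linux/amd64`), you may need to run
```
export DOCKER_DEFAULT_PLATFORM=linux/amd64
```
before the `docker run` command above, to enable architecture emulation.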

In [None]:
!pip install torchvision
!pip install pycocotools

## 1. Define Dataset

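Download the [Penn-Fudan pedestrian dataset](https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip) with `wget` and extract it with `unzip` in the same directory as this notebook (the code below expects a `PennFudanPed/` folder).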

Then write a `torch.utils.data.Dataset` for it.

In [7]:
from pathlib import Path
from typing import List, Tuple

import numpy as np
import torch
from PIL import Image

from detection.engine import train_one_epoch, evaluate
from detection.utils import collate_fn
import detection.transforms as T

def bbox_from_mask(mask: np.ndarray):
    """Return the tight bounding box around the truthy pixels of a mask.

    Args:
        mask: Boolean array indexed as (row, column), i.e. (height, width).

    Returns:
        ``[xmin, ymin, xmax, ymax]`` in pixel coordinates, where x runs along
        columns and y along rows. Both extremes are inclusive pixel indices.
    """
    # Indices of every truthy pixel: first array holds rows, second holds columns.
    coords = np.nonzero(mask)
    rows = coords[0]
    cols = coords[1]
    return [cols.min(), rows.min(), cols.max(), rows.max()]

class PennFudanDataset(torch.utils.data.Dataset):
    """Penn-Fudan pedestrian dataset yielding ``(image, target)`` pairs.

    ``root`` must contain a ``PNGImages`` directory (the images) and a
    ``PedMasks`` directory (one instance mask per image). Images and masks are
    paired by their position in the sorted file listings, so both directories
    must contain matching files in matching order.

    Args:
        root: Path of the extracted dataset directory.
        transforms: Callable taking and returning ``(image, target)``, or
            ``None`` to return the untransformed pair.
    """

    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
        # Keep regular files only: rglob("*") also yields directories, which
        # would otherwise be indexed as if they were images or masks.
        self.image_paths = sorted(
            p for p in (Path(root) / "PNGImages").rglob("*") if p.is_file()
        )
        self.mask_paths = sorted(
            p for p in (Path(root) / "PedMasks").rglob("*") if p.is_file()
        )

    def __getitem__(self, idx: int):
        """Load the ``idx``-th image and build its detection target dict."""
        # Get image and mask paths.
        image_path = self.image_paths[idx]
        mask_path = self.mask_paths[idx]

        # Load image.
        image = Image.open(image_path).convert("RGB")

        # Load mask. NB object_ids denote distinct objects, not classes.
        mask = np.array(Image.open(mask_path))
        object_ids = np.unique(mask)  # Distinct values in the mask.
        # Value 0 is the background. Filter it by value rather than by
        # position so a mask without background pixels loses no object.
        object_ids = object_ids[object_ids != 0]
        # One boolean (height, width) mask per object id.
        masks = mask == object_ids[:, None, None]

        # Extract bounding box coordinates from each segmentation mask.
        n_objects = len(object_ids)
        bboxes = [bbox_from_mask(object_mask) for object_mask in masks]
        # reshape keeps the (N, 4) layout even when the image has no objects,
        # so the column indexing used for "area" below stays valid.
        bboxes = torch.as_tensor(bboxes, dtype=torch.float32).reshape(-1, 4)

        # Set iscrowd = False for all objects.
        iscrowd = torch.zeros((n_objects,), dtype=torch.int64)

        # Package into dict.
        target = {
            "boxes": bboxes,  # Bounding boxes as [xmin, ymin, xmax, ymax].
            "labels": torch.ones((n_objects,), dtype=torch.int64),  # Class labels (only one class for PennFudan).
            "masks": torch.as_tensor(masks, dtype=torch.uint8),  # Segmentation masks.
            "image_id": torch.tensor([idx]),
            "area": (bboxes[:, 3] - bboxes[:, 1]) * (bboxes[:, 2] - bboxes[:, 0]),
            # The torchvision detection reference code (coco_utils) reads the
            # key "iscrowd"; the original "is_crowd" spelling is kept as an
            # alias so existing readers of this dict keep working.
            "iscrowd": iscrowd,
            "is_crowd": iscrowd,
        }

        # Apply transformations if applicable.
        if self.transforms is not None:
            image, target = self.transforms(image, target)

        return image, target

    def __len__(self):
        """Return the number of images found under ``PNGImages``."""
        return len(self.image_paths)
    
    
def get_dataloaders(
    root: str = 'PennFudanPed',
    num_test: int = 50,
    batch_size: int = 2,
    num_workers: int = 4,
) -> Tuple[torch.utils.data.DataLoader, torch.utils.data.DataLoader]:
    """Build the training and testing data loaders for the PennFudan dataset.

    Args:
        root: Directory holding the extracted dataset.
        num_test: Number of randomly chosen samples held out for testing.
        batch_size: Batch size of the training loader (the test loader always
            uses a batch size of 1).
        num_workers: Worker processes used by each loader.

    Returns:
        ``(data_loader, data_loader_test)``.
    """
    # Two views of the same files: only the training view gets the
    # train-time transforms.
    dataset = PennFudanDataset(root, get_transform(train=True))
    dataset_test = PennFudanDataset(root, get_transform(train=False))

    # Create train/test splits from one shared permutation so the two subsets
    # never overlap. The split point is clamped at 0 so that a dataset with
    # fewer than num_test samples puts everything in the test split.
    indices = torch.randperm(len(dataset)).tolist()
    split = max(len(indices) - num_test, 0)
    dataset = torch.utils.data.Subset(dataset, indices[:split])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[split:])

    # Training and testing loaders.
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        collate_fn=collate_fn,
    )
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test,
        batch_size=1,
        shuffle=False,
        num_workers=num_workers,
        collate_fn=collate_fn,
    )
    return data_loader, data_loader_test

## 2. Create Model

The model we'll be using is Mask-RCNN, which is like Faster-RCNN except it also outputs a segmentation mask.

![](https://pytorch.org/tutorials/_static/img/tv_tutorial/tv_image03.png)

In [8]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor


def get_model_instance_segmentation(
    num_classes: int,
) -> torchvision.models.detection.mask_rcnn.MaskRCNN:
    """
    Build a Mask R-CNN whose prediction heads are sized for `num_classes`.

    num_classes includes the background class, e.g. PennFudan has
    two classes - background and person.

    Calling the returned model (per the torchvision Mask R-CNN documentation):
    * Input is a List[FloatTensor] of three-channel images, which may differ in size.
    * In eval mode it returns a List[Dict], one per image, with keys
      "boxes", "labels", "scores" and "masks".
    * In training mode it also takes the targets and returns a dict of losses.
    """
    # Start from the instance segmentation model pre-trained on COCO.
    net = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT")
    heads = net.roi_heads

    # Swap the pre-trained box head for one with the requested class count,
    # keeping the input width of the existing classifier layer.
    box_in_features = heads.box_predictor.cls_score.in_features
    heads.box_predictor = FastRCNNPredictor(
        in_channels=box_in_features,
        num_classes=num_classes,
    )

    # Swap the pre-trained mask head likewise.
    mask_in_channels = heads.mask_predictor.conv5_mask.in_channels
    hidden_units = 256  # Number of hidden layer units in the new mask head.
    heads.mask_predictor = MaskRCNNPredictor(
        mask_in_channels,
        hidden_units,
        num_classes,
    )

    return net

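The training code needs the helper modules in `references/detection` of the [torchvision repository](https://github.com/pytorch/vision). Clone the repo and copy its `references/detection` directory into the same directory as this notebook, so that it can be imported as `detection`.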

In [9]:
import detection.transforms as T

def get_transform(train):
    """Compose the image/target transforms applied to each dataset sample.

    Args:
        train: When truthy, a random horizontal flip (probability 0.5) is
            appended as augmentation.

    Returns:
        A ``T.Compose`` that converts the PIL image to a float tensor and,
        for training, randomly flips it.
    """
    pipeline = [
        T.PILToTensor(),
        T.ConvertImageDtype(torch.float),
    ]
    if train:
        pipeline.append(T.RandomHorizontalFlip(0.5))

    # No resizing is done here.
    # NOTE(review): torchvision detection models are documented to resize
    # their inputs internally - confirm before adding a resize step.
    return T.Compose(pipeline)

In [10]:
def get_optimizer_and_lr_scheduler(params: List[torch.nn.parameter.Parameter]):
    """Create the SGD optimizer and its step learning-rate schedule.

    Args:
        params: Parameters the optimizer should update.

    Returns:
        ``(optimizer, lr_scheduler)``: SGD with lr 0.005, momentum 0.9 and
        weight decay 0.0005, plus a StepLR that multiplies the learning rate
        by 0.1 every 3 scheduler steps.
    """
    sgd = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
    step_decay = torch.optim.lr_scheduler.StepLR(sgd, step_size=3, gamma=0.1)
    return sgd, step_decay

## 3. Create Training Loop

In [11]:
# Create main function which performs the training and validation.

def main():
    """Train the instance segmentation model and evaluate it after each epoch."""
    # Config: background + person, trained for ten epochs.
    n_classes = 2
    n_epochs = 10

    # Prefer the GPU; fall back to the CPU when CUDA is unavailable.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Data.
    train_loader, test_loader = get_dataloaders()

    # Model, moved to the chosen device.
    model = get_model_instance_segmentation(n_classes)
    model.to(device)

    # Optimise only the parameters that require gradients.
    trainable = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer, scheduler = get_optimizer_and_lr_scheduler(trainable)

    for epoch in range(n_epochs):
        # One pass over the training data, logging every 10 iterations.
        train_one_epoch(model, optimizer, train_loader, device, epoch, print_freq=10)

        # Advance the learning-rate schedule once per epoch.
        scheduler.step()

        # Score the model on the held-out test split.
        evaluate(model, test_loader, device=device)

In [None]:
# Run the full training and evaluation loop; the log lines that follow are its output.
main()

Epoch: [0]  [ 0/60]  eta: 0:17:21  lr: 0.000090  loss: 5.6852 (5.6852)  loss_classifier: 0.6384 (0.6384)  loss_box_reg: 0.3683 (0.3683)  loss_mask: 4.6103 (4.6103)  loss_objectness: 0.0614 (0.0614)  loss_rpn_box_reg: 0.0068 (0.0068)  time: 17.3612  data: 0.3620
Epoch: [0]  [10/60]  eta: 0:11:57  lr: 0.000936  loss: 1.5781 (2.7810)  loss_classifier: 0.4137 (0.4127)  loss_box_reg: 0.2194 (0.2163)  loss_mask: 0.9623 (2.1290)  loss_objectness: 0.0136 (0.0184)  loss_rpn_box_reg: 0.0050 (0.0045)  time: 14.3515  data: 0.0379
