## Image Object detection - Introduction
The data consists of images extracted from three videos that contain starfish. These videos were captured by surfers using underwater cameras. We will be:
- splitting the data into train (video_0 and video_1) and val (video_2)
- implement pytorch dataset.
- implement transforms for object detection.
- implement pytorch lightning data loader.
- implement metrics - f2score
- implement Lightning module for training and validation.
- train the model for n_epochs. 

This is a very minimalistic implementation. I got a validation f-score of 0.431.

I have used this Pytorch [Faster-RCNN](https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html) tutorial and refactored the code using lightning module.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

import torch 
import torchvision
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pytorch_lightning as pl

from torchvision.transforms import functional as F
from torchvision.transforms import transforms as T
from torchvision.models.detection.anchor_utils import AnchorGenerator
from torchvision.models.detection.faster_rcnn import RPNHead, MultiScaleRoIAlign, TwoMLPHead, FastRCNNPredictor

from torch import nn, Tensor
from typing  import Optional, Dict, Tuple, List, Union
from dataclasses import dataclass, asdict 
from pathlib import Path, PosixPath
from PIL import Image
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.progress import TQDMProgressBar

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Record the library versions used for this run.
print(f"lightning: {pl.__version__}")
print(f"torch: {torch.__version__}")
print(f"torchvision: {torchvision.__version__}")

### List all the files

In [None]:
# Directory holding the competition data; every later path is built from it.
root = Path("/kaggle/input/tensorflow-great-barrier-reef/")
print(root)
# Last expression of the cell: the notebook displays the directory entries.
list(root.iterdir())

- read the train dataset

In [None]:
# Load the per-frame metadata; the cells below read its "video_id",
# "video_frame" and "annotations" columns.
df = pd.read_csv(root / "train.csv")
print(df.shape)
# Last expression of the cell: the notebook displays the first rows.
df.head()

## Create train and validation data
- We can ignore images which don't have starfish (for training), as these cannot be used for training object detection models.
- we will consider "video_0", "video_1" for training and "video_2" for validation

In [None]:
# Keep only the frames that carry at least one annotation.
has_annotations = df["annotations"] != "[]"
df_only_annots = df[has_annotations].reset_index(drop=True)

# video_2 is held out for validation; every other video is used for training.
is_val_video = df_only_annots["video_id"] == 2
train_df = df_only_annots[~is_val_video].reset_index(drop=True)
val_df = df_only_annots[is_val_video].reset_index(drop=True)
print(df_only_annots.shape, train_df.shape, val_df.shape)

## Define the dataset. 

In [None]:
## Define a dataset 
class GBRDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(image, target)`` pairs for detection.

    Args:
        root: Dataset directory; images are read from
            ``root / "train_images" / "video_<id>" / "<frame>.jpg"``.
        df: DataFrame with ``video_id``, ``video_frame`` and ``annotations``
            columns. ``annotations`` is the string form of a list of dicts
            with ``x``, ``y``, ``width`` and ``height`` keys.
        transforms: Optional callable ``(image, target) -> (image, target)``.
    """

    def __init__(self, root, df, transforms):
        self.root = root
        self.df = df
        self.transforms = transforms

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _parse_boxes(annotations):
        """Turn an annotation string into an ``(N, 4)`` float32 xyxy tensor."""
        # Local import keeps this cell runnable without touching the import cell.
        import ast

        # literal_eval only accepts Python literals, so text coming from the
        # CSV can never execute code (unlike eval).
        records = ast.literal_eval(annotations)
        corners = [
            [r["x"], r["y"], r["x"] + r["width"], r["y"] + r["height"]]
            for r in records
        ]
        # Explicit reshape keeps the (0, 4) shape when a frame has no boxes,
        # so the column indexing in __getitem__ keeps working.
        return torch.as_tensor(corners, dtype=torch.float32).reshape(len(corners), 4)

    def __getitem__(self, idx):
        """Load frame ``idx`` and build its detection target dict.

        Returns:
            ``(image, target)`` where ``target`` holds ``boxes`` (xyxy),
            ``labels`` (all ones: single foreground class), ``image_id``,
            ``area`` and ``iscrowd`` (all zeros). The pair is passed through
            ``self.transforms`` when one is set.
        """
        meta = self.df.iloc[idx]
        video_id = f"video_{meta['video_id']}"
        ## read image
        loc = self.root / "train_images" / video_id / (str(meta["video_frame"]) + ".jpg")
        img = Image.open(loc).convert("RGB")

        ## get bbox
        bbox = self._parse_boxes(meta["annotations"])
        num_boxes = len(bbox)

        target = {
            "boxes": bbox,
            "labels": torch.ones((num_boxes,), dtype=torch.int64),
            "image_id": torch.tensor([idx]),
            "area": (bbox[:, 3] - bbox[:, 1]) * (bbox[:, 2] - bbox[:, 0]),
            # suppose all instances are not crowd
            "iscrowd": torch.zeros((num_boxes,), dtype=torch.int64),
        }

        if self.transforms is not None:
            img, target = self.transforms(img, target)
        return img, target

In [None]:
## check if it is working 
# Build the dataset without transforms, so the sample image is whatever
# GBRDataset.__getitem__ loads (a PIL image converted to RGB).
train_ds = GBRDataset(root, train_df, None)
img, target = train_ds[0]
# Last expression of the cell: the notebook displays the image size.
img.size

## transforms 
- RandomHorizontalFlip 
- Compose 
- ToTensor

In [None]:
class Compose:
    """Chain several ``(image, target)`` transforms into one callable."""

    def __init__(self, transforms):
        # Sequence of callables, applied in the order given.
        self.transforms = transforms

    def __call__(self, image, target):
        """Feed the pair through every transform and return the final pair."""
        sample = (image, target)
        for transform in self.transforms:
            img, tgt = transform(*sample)
            sample = (img, tgt)
        return sample


class RandomHorizontalFlip(T.RandomHorizontalFlip):
    """Horizontal flip that also mirrors the target's bounding boxes."""

    def forward(
        self, image: Tensor, target: Optional[Dict[str, Tensor]] = None
    ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
        """Flip ``image`` with probability ``self.p``.

        When the flip happens and a target is given, the x-coordinates of
        ``target["boxes"]`` are mirrored in place around the image width.
        """
        if not (torch.rand(1) < self.p):
            return image, target

        image = F.hflip(image)
        if target is None:
            return image, target

        if isinstance(image, Tensor):
            # Tensor images: the two dims after the first are (height, width).
            _, width = image.shape[1:]
        else:
            # Otherwise the image exposes .size as (width, height).
            width, _ = image.size

        boxes = target["boxes"]
        # Mirroring swaps the roles of x_min and x_max.
        boxes[:, [0, 2]] = width - boxes[:, [2, 0]]
        return image, target


class ToTensor(nn.Module):
    """Convert an image to a channel-first tensor; the target passes through."""

    def forward(
        self, image: Tensor, target: Optional[Dict[str, Tensor]] = None
    ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
        """Return ``(tensor_image, target)``.

        The image goes through a NumPy array, then ``F.convert_image_dtype``
        with its default dtype, and finally has its last axis moved to the
        front.
        """
        pixels = np.array(image)
        converted = F.convert_image_dtype(torch.as_tensor(pixels))
        channel_first = converted.permute((2, 0, 1))
        return channel_first, target
    
def collate_fn(batch):
    """Transpose a batch of samples into per-field tuples.

    A list of ``(image, target)`` pairs becomes ``(images, targets)``, so
    differently sized images are never stacked into one tensor.
    """
    fields = zip(*batch)
    return tuple(fields)

## Define the Lightning dataloader

In [None]:
@dataclass
class GDRDataLoader(pl.LightningDataModule):
    """Lightning data module serving the train and validation GBRDataset splits.

    Declared as a dataclass: the generated ``__init__`` stores the fields and
    then runs ``__post_init__``, which initialises the Lightning base class and
    builds both datasets right away, so the loaders work without an explicit
    ``setup()`` call.
    """
    # NOTE(review): @dataclass with the default eq=True sets __hash__ to None,
    # so instances are unhashable - confirm Lightning never needs to hash this.
    root: PosixPath
    train_df: pd.DataFrame
    val_df: pd.DataFrame
    train_batch_size: int=4
    val_batch_size: int=1
    # Only forwarded to setup(), which ignores it.
    # presumably a Lightning stage name (string) - TODO confirm
    stage: Optional[str] = None

    def __post_init__(self):
        # The dataclass __init__ does not call the base initialiser, so do it here.
        super().__init__()
        self.setup(self.stage)

    def setup(self, stage=None):        
        """Create both datasets; ``stage`` is accepted but not used."""
        self.train_ds = GBRDataset(self.root, self.train_df, self.get_transform(train=True))
        self.val_ds = GBRDataset(self.root, self.val_df, self.get_transform(train=False))

    def train_dataloader(self):
        """Return the shuffled training loader (tuple-style collate)."""
        train_loader = torch.utils.data.DataLoader(self.train_ds, batch_size=self.train_batch_size, shuffle=True, collate_fn=collate_fn, num_workers=2)
        return train_loader
    
    def val_dataloader(self):
        """Return the validation loader in dataset order (no shuffling)."""
        val_loader = torch.utils.data.DataLoader(self.val_ds, batch_size=self.val_batch_size, shuffle=False, collate_fn=collate_fn, num_workers=2)
        return val_loader
    
    @staticmethod
    def get_transform(train):
        """Build the transform pipeline: ToTensor, plus a random flip when training."""
        transforms = []
        transforms.append(ToTensor())
        if train:
            # Augmentation is applied to the training split only.
            transforms.append(RandomHorizontalFlip(0.5))
        return Compose(transforms)

In [None]:
## check dataset outputs with transforms
# Same dataset as before, now with the training transform pipeline attached.
train_ds = GBRDataset(root, train_df, GDRDataLoader.get_transform(True))
img, target = train_ds[0]
# Last expression of the cell: the notebook displays the tensor shape.
img.shape

In [None]:
## check the dataloader outputs
# Default batch sizes are used here (train_batch_size=4, val_batch_size=1).
dl = GDRDataLoader(root, train_df, val_df)
# Pull a single batch; collate_fn yields a tuple of images and a tuple of targets.
images, targets = next(iter(dl.train_dataloader()))
print([i.shape for i in images])

## Metric implementation
- bboxes = [x, y, x, y, c]
- pred_bbox = [x, y, x, y, c, score]

In this case, since we have only one class, lets ignore c and make it 
- bboxes = [x, y, x, y]
- pred_bbox = [score, x, y, x, y]

where score is the probability score. It is a hyper-parameter to tune. Since the competition guidelines said it is okay to have false positives in order to not miss any of the actual starfish, we will keep this `score` as small as possible.

### Pseudo-algorithm for the metric
```markdown
- For a particular IOU threshold [0.3:0.8:0.05]
-   for each image
-     Get bboxes and pred_bboxes of the image. 
-     calculate total fp, fn, tp of the image.
-   aggregate the score. 
```

I copied the following functions from [here](https://www.kaggle.com/bamps53/competition-metric-implementation?scriptVersionId=81087805)

In [None]:
def calc_iou(bboxes1, bboxes2, bbox_mode='xywh'):
    """Pairwise IoU between two sets of boxes.

    Args:
        bboxes1: (N, 4) array.
        bboxes2: (M, 4) array.
        bbox_mode: 'xywh' (default) converts both inputs to corner format
            first; any other value treats them as xyxy already.

    Returns:
        (N, M) array of IoU values. Box extents use the inclusive-pixel
        convention (width = x2 - x1 + 1). The inputs are not modified.
    """
    assert len(bboxes1.shape) == 2 and bboxes1.shape[1] == 4
    assert len(bboxes2.shape) == 2 and bboxes2.shape[1] == 4

    # Work on copies so the in-place xywh -> xyxy conversion stays local.
    boxes_a = bboxes1.copy()
    boxes_b = bboxes2.copy()
    if bbox_mode == 'xywh':
        boxes_a[:, 2:] += boxes_a[:, :2]
        boxes_b[:, 2:] += boxes_b[:, :2]

    # Column vectors (N, 1) for the first set, row vectors (1, M) for the
    # second, so every binary operation broadcasts to (N, M).
    a_x1, a_y1, a_x2, a_y2 = (boxes_a[:, k:k + 1] for k in range(4))
    b_x1, b_y1, b_x2, b_y2 = (boxes_b[:, k:k + 1].T for k in range(4))

    overlap_w = np.maximum(np.minimum(a_x2, b_x2) - np.maximum(a_x1, b_x1) + 1, 0)
    overlap_h = np.maximum(np.minimum(a_y2, b_y2) - np.maximum(a_y1, b_y1) + 1, 0)
    intersection = overlap_w * overlap_h

    area_a = (a_x2 - a_x1 + 1) * (a_y2 - a_y1 + 1)
    area_b = (b_x2 - b_x1 + 1) * (b_y2 - b_y1 + 1)
    return intersection / (area_a + area_b - intersection)

def f_beta(tp, fp, fn, beta=2):
    """Return the F-beta score computed from raw counts.

    F_beta = (1 + beta^2) * tp / ((1 + beta^2) * tp + beta^2 * fn + fp)

    Only the false negatives are weighted by beta^2; false positives count
    once. With beta=2 this makes a miss four times as costly as a false alarm.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.
        beta: Recall weight (2 gives the F2 score).

    Returns:
        The score in [0, 1]; 0.0 when all three counts are zero.
    """
    beta_sq = beta ** 2
    weighted_tp = (1 + beta_sq) * tp
    denominator = weighted_tp + beta_sq * fn + fp
    if denominator == 0:
        # No ground truth and no predictions: the score is undefined, report 0.
        return 0.0
    return weighted_tp / denominator

def calc_is_correct_at_iou_th(gt_bboxes, pred_bboxes, iou_th, verbose=False):
    """Greedily match predictions to ground truth at one IoU threshold.

    Predictions are visited in the order given (callers sort them by
    confidence). A prediction whose best IoU against the still-unmatched
    ground-truth boxes exceeds ``iou_th`` is a true positive and consumes
    that ground-truth box; otherwise it is a false positive.

    Args:
        gt_bboxes: (N, 4) numpy array in xyxy format.
        pred_bboxes: (M, 5) numpy array in conf+xyxy format.
        iou_th: IoU threshold a match must exceed.
        verbose: Unused; kept for interface compatibility.

    Returns:
        Tuple ``(tp, fp, fn)``.
    """
    # np.delete returns a new array, but copy anyway so the caller's array is
    # never aliased.
    unmatched_gt = gt_bboxes.copy()

    tp = 0
    fp = 0
    for k, pred_bbox in enumerate(pred_bboxes):
        if len(unmatched_gt) == 0:
            # Nothing left to match: only the predictions not yet visited are
            # extra false positives (the ones before k were already counted).
            fp += len(pred_bboxes) - k
            break
        ious = calc_iou(unmatched_gt, pred_bbox[None, 1:], bbox_mode="xyxy")
        if ious.max() > iou_th:
            tp += 1
            unmatched_gt = np.delete(unmatched_gt, ious.argmax(), axis=0)
        else:
            fp += 1

    fn = len(unmatched_gt)
    return tp, fp, fn

def calc_is_correct(gt_bboxes, pred_bboxes):
    """
    Sum tp/fp/fn for one image over all IoU thresholds.

    gt_bboxes: (N, 4) np.array in xyxy format
    pred_bboxes: (N, 5) np.array in conf+xyxy format
    """
    num_gt, num_pred = len(gt_bboxes), len(pred_bboxes)
    if num_gt == 0 or num_pred == 0:
        # Nothing can be matched: every prediction is a false positive and
        # every ground-truth box a false negative (both are 0 when both are empty).
        return 0, num_pred, num_gt

    # Highest confidence first, so confident predictions get first pick.
    by_confidence = pred_bboxes[:, 0].argsort()[::-1]
    ranked_preds = pred_bboxes[by_confidence]

    # Micro-averaging: add up the raw counts across every threshold and let the
    # caller compute a single F-score from the sums.
    # https://peltarion.com/knowledge-center/documentation/evaluation-view/classification-loss-metrics/micro-f1-score
    totals = [0, 0, 0]
    for iou_th in np.arange(0.3, 0.85, 0.05):
        counts = calc_is_correct_at_iou_th(gt_bboxes, ranked_preds, iou_th)
        for slot, count in enumerate(counts):
            totals[slot] += count

    tps, fps, fns = totals
    return tps, fps, fns

def calc_f2_score(gt_bboxes_list, pred_bboxes_list, verbose=False):
    """
    F2 score over a set of images, from counts pooled across all of them.

    gt_bboxes_list: list of (N, 4) np.array in xyxy format
    pred_bboxes_list: list of (N, 5) np.array in conf+xyxy format
    verbose: print the per-image counts when True
    """
    total_tp = total_fp = total_fn = 0
    for gt_bboxes, pred_bboxes in zip(gt_bboxes_list, pred_bboxes_list):
        tp, fp, fn = calc_is_correct(gt_bboxes, pred_bboxes)
        total_tp, total_fp, total_fn = total_tp + tp, total_fp + fp, total_fn + fn
        if not verbose:
            continue
        num_gt = len(gt_bboxes)
        num_pred = len(pred_bboxes)
        print(f'num_gt:{num_gt:<3} num_pred:{num_pred:<3} tp:{tp:<3} fp:{fp:<3} fn:{fn:<3}')
    return f_beta(total_tp, total_fp, total_fn, beta=2)

## Define LightningModule - Network, optimizer and forward function

In [None]:
@dataclass
class ModelParams:
    """Keyword arguments for torchvision's ``fasterrcnn_resnet50_fpn``.

    The training module converts an instance with ``asdict`` and unpacks it
    into the model constructor. Field names follow
    https://github.com/pytorch/vision/blob/93ec8bfd31ac6aed58b79d7764070fcc2a1dfd51/torchvision/models/detection/faster_rcnn.py
    """

    num_classes: int = 2
    pretrained_backbone: bool = True
    min_size: int = 800
    max_size: int = 1333

    # RPN parameters
    rpn_anchor_generator: Optional[AnchorGenerator] = None
    rpn_head: Optional[RPNHead] = None
    rpn_pre_nms_top_n_train: int = 2000
    rpn_pre_nms_top_n_test: int = 1000
    rpn_post_nms_top_n_train: int = 2000
    rpn_post_nms_top_n_test: int = 1000
    rpn_nms_thresh: float = 0.7
    rpn_fg_iou_thresh: float = 0.7
    rpn_bg_iou_thresh: float = 0.3
    rpn_batch_size_per_image: int = 256
    rpn_positive_fraction: float = 0.5
    rpn_score_thresh: float = 0.0

    # Box parameters
    box_roi_pool: Optional[MultiScaleRoIAlign] = None
    box_head: Optional[TwoMLPHead] = None
    box_predictor: Optional[FastRCNNPredictor] = None
    box_score_thresh: float = 0.05
    box_nms_thresh: float = 0.5
    box_detections_per_img: int = 100
    box_fg_iou_thresh: float = 0.5
    box_bg_iou_thresh: float = 0.5
    box_batch_size_per_image: int = 512
    box_positive_fraction: float = 0.25
    bbox_reg_weights: Optional[List[float]] = None

In [None]:
## define train params. 

@dataclass
class TrainParams:
    """Training hyper-parameters: batch sizes, SGD settings and run length."""

    # Batch sizes
    val_batch_size: int = 1
    train_batch_size: int = 4

    ## optimizers
    lr: float = 0.01
    momentum: float = 0.9
    weight_decay: float = 0.0005

    ## stuff
    epochs: int = 20
    gpus: Union[int, List[int]] = 1

In [None]:
class GDRTrainer(pl.LightningModule):
    """Lightning module that trains and validates a Faster R-CNN detector.

    Args:
        cfg: Mapping of training settings; ``lr``, ``momentum`` and
            ``weight_decay`` are read when building the optimizer.

    The network is torchvision's ``fasterrcnn_resnet50_fpn`` built from the
    defaults in ``ModelParams``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.model = torchvision.models.detection.fasterrcnn_resnet50_fpn(**asdict(ModelParams()))

    def training_step(self, batch, batch_idx):
        """Run one training batch and return the summed detection loss.

        The individual loss components are returned detached under
        ``"outputs"`` so they can be averaged at the end of the epoch.
        """
        images, targets = batch
        images = [image.to(self.device) for image in images]
        targets = [{k: v.to(self.device) for k, v in t.items()} for t in targets]
        # With targets supplied, the model's output is used as a dict of losses.
        loss_dict = self.model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        self.log("train_loss", losses, prog_bar=True)
        return {"loss": losses, "outputs": {k: v.detach() for k, v in loss_dict.items()}}

    def validation_step(self, batch, batch_idx):
        """Predict on one validation batch; keep the first image's result.

        NOTE: only element 0 of the batch is kept, so this assumes a
        validation batch size of 1; larger batches would drop images.
        """
        images, targets = batch
        images = [image.to(self.device) for image in images]
        targets = [{k: v.to(self.device) for k, v in t.items()} for t in targets]
        ## we are using image size of 1
        ## Example: {'boxes': tensor([], device='cuda:0', size=(0, 4)), 'labels': tensor([], device='cuda:0', dtype=torch.int64), 'scores': tensor([], device='cuda:0')}
        preds = self.model(images, targets)[0]
        return {"preds": preds, "targets": targets[0]}

    def training_epoch_end(self, outputs):
        self.epoch_end(outputs, "train")

    def validation_epoch_end(self, outputs):
        self.epoch_end(outputs, "val")

    def epoch_end(self, outputs, phase):
        """Aggregate the per-step outputs and log epoch-level metrics.

        Args:
            outputs: List of dicts returned by the matching ``*_step``.
            phase: ``"val"`` logs the F2 score and the prediction count;
                ``"train"`` logs the mean total loss and each mean component.
        """
        if phase == "val":
            preds = [i["preds"] for i in outputs]
            # The metric expects one (M, 5) array per image: score first, then xyxy.
            pred_bboxes = [torch.hstack([i["scores"].reshape(-1, 1), i["boxes"]]).cpu().numpy() for i in preds]
            target_bboxes = [i["targets"]["boxes"].cpu().numpy() for i in outputs]
            pred_count = sum(i.shape[0] for i in pred_bboxes)
            f2_score = calc_f2_score(target_bboxes, pred_bboxes, False)
            self.log("val_f2_score", f2_score, prog_bar=True)
            self.log("pred_count", torch.as_tensor(pred_count).float(), prog_bar=True)

        elif phase == "train":
            final_train_loss = torch.mean(torch.stack([i["loss"] for i in outputs]))
            self.log("train_epoch_loss", final_train_loss, prog_bar=True)

            for loss_type in ["loss_classifier", "loss_box_reg", "loss_objectness", "loss_rpn_box_reg"]:
                final_loss = torch.mean(torch.stack([i["outputs"][loss_type] for i in outputs]))
                # Log this component's own mean, not the total loss.
                self.log(f"train_{loss_type}", final_loss, prog_bar=True)

    def configure_optimizers(self):
        """Return SGD plus a StepLR schedule stepped once per epoch."""
        optimizer = torch.optim.SGD(self.model.parameters(), lr=self.cfg["lr"], momentum=self.cfg["momentum"], weight_decay=self.cfg["weight_decay"])
        # and a learning rate scheduler
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
        return {"optimizer": optimizer, "lr_scheduler": {"scheduler": lr_scheduler, "interval": "epoch"}}

In [None]:
# Build the config, the Lightning module and the data module from one
# TrainParams instance so the batch sizes stay consistent.
tc = TrainParams() ## train_configs 
model = GDRTrainer(cfg=asdict(tc))
dl = GDRDataLoader(root, train_df, val_df, train_batch_size=tc.train_batch_size, val_batch_size=tc.val_batch_size)
# Keep the 5 checkpoints with the highest validation F2 score, plus the last one.
checkpoint_callback = ModelCheckpoint(
        monitor="val_f2_score",
        save_top_k=5,
        filename="{epoch}-{step}-{val_f2_score:.3f}",
        save_last=True,
        mode="max", 
    )

## Train the model 

In [None]:
# Trainer configured from TrainParams; metrics are written as TensorBoard logs
# under /kaggle/working/lightning_logs/exp1/.
trainer = pl.Trainer(
        gpus=tc.gpus,
        max_epochs=tc.epochs,
        callbacks=[checkpoint_callback, TQDMProgressBar(refresh_rate=2)],
        logger=pl.loggers.TensorBoardLogger(f"/kaggle/working/lightning_logs/exp1/", name="faster_rcnn"))

In [None]:
# Start training; the data module supplies the train and validation loaders.
trainer.fit(model, dl)

## End Notes. 
We have trained the model for 20 epochs. In the next tutorial, we will use the best model for inference and make a submission.