This notebook is inspired by a notebook from a
[previous competition on object detection](https://www.kaggle.com/pestipeti/pytorch-starter-fasterrcnn-train/notebook)

This is a training notebook.
The notebook to infer the model can be found [here](https://www.kaggle.com/palash97/gbr-fasterrcnn-pytorch-inference/)

## Import Libraries

In [None]:
import ast
import os
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
from PIL import Image
from tqdm import tqdm

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

import torch
import torchvision
import torch.nn as nn

from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection import FasterRCNN

## Load train data

In [None]:
# Path to the competition's training annotation table.
train_csv_path = '../input/tensorflow-great-barrier-reef/train.csv'

# Read the table into a DataFrame and preview its first rows.
train_df = pd.read_csv(filepath_or_buffer=train_csv_path)
train_df.head()

In [None]:
# Number of rows in the training table (shown as the cell's output value).
len(train_df)

## Add image path for each row in the train dataframe

The image path for a given video_id and video_frame has the form: **video_{video_id}/{video_frame}.jpg**

In [None]:
def add_image_path(row):
    """Build the relative image path for one row of the train dataframe.

    Args:
        row: Mapping-like object (e.g. a dataframe row) providing the
            'video_id' and 'video_frame' entries.

    Returns:
        str: Path of the form "video_<video_id>/<video_frame>.jpg".
    """
    return f"video_{row['video_id']!s}/{row['video_frame']!s}.jpg"

# Derive the relative image path of every row and store it as a new column.
train_df['image_path'] = train_df.apply(add_image_path, axis=1)
train_df.head()

## Build CustomDataset

Since we are using a FasterRCNN model: <br>
During **training**, the model expects **input tensors** and **targets** (a list of dictionaries). <br>
During **inference**, the model expects only **input tensors**.

More info on input~output of FasterRCNN model - [here](https://pytorch.org/vision/main/generated/torchvision.models.detection.fasterrcnn_resnet50_fpn.html)

In [None]:
class CustomDataset(Dataset):
    """Detection dataset yielding ``(image, target)`` pairs for FasterRCNN training.

    Each item is:
      * image  -- tensor produced by the transform (or by a bare ``ToTensorV2``),
                  built from an RGB float32 image scaled to [0, 1];
      * target -- dict with 'boxes' (float32, shape (N, 4), xyxy format),
                  'labels' (int64, shape (N,), all ones) and 'image_id'.

    Args:
        image_dir: Directory that the dataframe's 'image_path' values are relative to.
        train_df: DataFrame with an 'image_path' column and an 'annotations' column
            holding a list of {'x', 'y', 'width', 'height'} dicts (or its string form).
        transform: Albumentations pipeline called with ``image``, ``bboxes`` and
            ``labels`` keyword arguments.
    """

    def __init__(self, image_dir, train_df, transform):
        self.train_df = train_df
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (= samples) in the dataframe."""
        return len(self.train_df)

    @staticmethod
    def _parse_boxes(annots):
        """Parse an annotations entry into a float32 (N, 4) array of xyxy boxes.

        ``ast.literal_eval`` is used instead of ``eval`` so that text coming from
        the CSV file can only ever be interpreted as a Python literal.
        """
        records = ast.literal_eval(annots) if isinstance(annots, str) else annots
        boxes = np.array(
            [[r['x'], r['y'], r['width'], r['height']] for r in records],
            dtype=np.float32,
        ).reshape(-1, 4)  # keeps shape (0, 4) when there are no annotations

        # Convert xywh to xyxy: (x_max, y_max) = (x_min, y_min) + (w, h)
        boxes[:, 2:] += boxes[:, :2]
        return boxes

    def __getitem__(self, index):
        """Load the sample at position ``index``.

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        # Positional lookup: a DataLoader produces indices 0..len-1, which only
        # coincide with index labels when the dataframe has a default index.
        row = self.train_df.iloc[index]

        ##################
        # Read the image #
        ##################
        image_path = os.path.join(self.image_dir, row["image_path"])
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising
            raise FileNotFoundError(f"Could not read image: {image_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image /= 255.0    # (fasterrcnn model expects input to be in range [0-1])
        height, width = image.shape[:2]

        ##########################
        # Get the bounding boxes #
        ##########################
        # Shape of boxes: (num_of_bounding_boxes, 4), columns: (x_min, y_min, x_max, y_max)
        boxes = self._parse_boxes(row["annotations"])

        ##############################
        # Create a target dictionary #
        ##############################
        # Label 1: COTS-Fish
        # Label 0: Background (default)
        target = {
            'boxes': torch.as_tensor(boxes, dtype=torch.float32),
            'labels': torch.ones((boxes.shape[0],), dtype=torch.int64),
            # Some extra info
            'image_id': torch.tensor([index]),
        }

        ############################
        # Transform the input data #
        ############################
        # Boxes that extend beyond the image borders are not passed through the
        # augmentation pipeline; such samples are returned un-augmented instead.
        out_of_bounds = bool(
            (boxes[:, 0] < 0).any()
            or (boxes[:, 1] < 0).any()
            or (boxes[:, 2] > width).any()
            or (boxes[:, 3] > height).any()
        )
        if out_of_bounds:
            image = ToTensorV2()(image=image)['image']
            return image, target

        # Plain Python lists are handed to the transform rather than torch tensors.
        sample = self.transform(
            image=image,
            bboxes=boxes.tolist(),
            labels=target['labels'].tolist(),
        )
        image = sample['image']

        if len(boxes) > 0:
            new_boxes = np.asarray(sample['bboxes'], dtype=np.float32).reshape(-1, 4)
            target['boxes'] = torch.as_tensor(new_boxes)
            # Keep one label per returned box.
            target['labels'] = torch.ones((new_boxes.shape[0],), dtype=torch.int64)

        return image, target

In [None]:
# Augmentation pipeline: random flip followed by conversion to a CHW tensor.
# The probability is passed by keyword: in the Albumentations 1.x API the first
# positional parameter of a transform is `always_apply`, so `A.Flip(0.5)` would
# set that flag to a truthy value and flip every sample.
transform = A.Compose(
    [
        A.Flip(p=0.5),
        ToTensorV2(),
    ],
    bbox_params={'format': 'pascal_voc', 'label_fields': ['labels']},
)


image_dir = '../input/tensorflow-great-barrier-reef/train_images'

dataset = CustomDataset(image_dir=image_dir, train_df=train_df, transform=transform)

## Create PyTorch DataLoader

In [None]:
# Number of samples per mini-batch drawn by the DataLoader.
BATCH_SIZE = 4

def collate_fn(batch):
    """Regroup a list of per-sample tuples into one tuple per field.

    For a batch of ``(image, target)`` pairs this returns
    ``((image_0, image_1, ...), (target_0, target_1, ...))`` without stacking,
    so every sample keeps its own number of boxes.
    """
    columns = zip(*batch)
    return tuple(columns)

# Shuffled loader over the training dataset; the custom collate function keeps
# each batch as tuples of per-sample items instead of stacking them.
train_loader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)

## Visualize one sample from our train_loader

In [None]:
MAX_TRIES = 10  # stop searching after this many batches without an annotated image

# Draw batches until a randomly picked image contains at least one box.
# A single pass over the loader is used instead of rebuilding its iterator
# on every attempt.
num_bboxes = 0
for _, (images, targets) in zip(range(MAX_TRIES), train_loader):
    # images and targets are tuples with one entry per sample
    idx = np.random.randint(0, len(images))  # len(images): the batch may be smaller than BATCH_SIZE
    img = images[idx]
    target = targets[idx]
    num_bboxes = len(target['boxes'])
    if num_bboxes > 0:
        break

if num_bboxes > 0:
    print(img.shape)
    print(target.keys())
    print(target['boxes'])

    # CHW tensor -> HWC array; cv2 drawing needs a contiguous, writable buffer
    img = np.ascontiguousarray(img.permute(1, 2, 0).numpy())
    fig, ax = plt.subplots(1, 1, figsize=(16, 8))
    for box in target['boxes'].numpy():
        c1, c2 = (int(box[0]), int(box[1])), (int(box[2]), int(box[3]))
        # The image is float in [0, 1], so pure red is (1, 0, 0)
        cv2.rectangle(img, c1, c2, (1.0, 0.0, 0.0), 3)

    ax.set_title(f"image_id: {target['image_id'].item()}")
    ax.imshow(img)
    plt.show()
else:
    print(':(')

## Load the model pretrained on COCO

In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the detector with pretrained weights and place it on the chosen device.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True).to(device)

In [None]:
# Two output classes: background plus the single foreground class (cots).
num_classes = 2

# Width of the feature vector that feeds the existing classification layer.
in_features = model.roi_heads.box_predictor.cls_score.in_features

# Swap the pretrained box head for a new one sized for our classes.
new_box_head = FastRCNNPredictor(in_features, num_classes)
model.roi_heads.box_predictor = new_box_head.to(device)

## Test the model architecture on sample data 

In [None]:
# Model needs list of input tensors and list of targets
images = list(image.to(device) for image in images)
targets = [{k: v.long().to(device) for k, v in t.items()} for t in targets]

In [None]:
# Called with targets, the model returns its individual loss terms keyed by name.
loss_dict = model(images, targets)

# Total loss = sum of all terms; shown as a plain Python number.
losses = sum(loss_dict.values())
loss = losses.item()
loss

Awesome!!

## Let's train the model

We will use gradient accumulation to work around CUDA out-of-memory errors: the batch size has to be kept very small, so gradients from several consecutive batches are accumulated before each optimizer step.

More on gradient accumulation - [here](https://towardsdatascience.com/i-am-so-done-with-cuda-out-of-memory-c62f42947dca#:~:text=one%20by%20one.-,Gradient%20Accumulation,-This%20solution%20has)

In [None]:
# Hand only the trainable (requires_grad) parameters to the optimizer.
params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = torch.optim.Adam(params=params, lr=1e-3)

In [None]:
def train_epoch(model, scaler, data_loader, device, optimizer, gradient_accumulations=32):
    """Train the model for one pass over ``data_loader`` with gradient accumulation.

    Args:
        model: Detection model that returns a dict of losses when called with
            ``(images, targets)``.
        scaler: Gradient scaler used for the mixed-precision backward pass and
            optimizer step.
        data_loader: Iterable yielding ``(images, targets)`` batches, each a
            sequence of per-sample items.
        device: Device the batch tensors are moved to.
        optimizer: Optimizer stepped once per accumulation window.
        gradient_accumulations: Number of batches whose gradients are summed
            before each optimizer step.

    Returns:
        Mean of the per-batch total losses, or NaN when the loader is empty.
    """
    model.train()
    losses = []
    pending = False  # True while accumulated gradients have not been applied yet

    for batch_idx, (images, targets) in enumerate(tqdm(data_loader)):
        images = [image.to(device) for image in images]
        # Boxes must stay floating point (an int64 cast would truncate the
        # coordinates); the remaining target entries are integer tensors.
        targets = [
            {k: (v.float() if k == 'boxes' else v.long()).to(device) for k, v in t.items()}
            for t in targets
        ]

        with autocast():
            loss_dict = model(images, targets)
            loss = sum(loss_dict.values())

        # Divide so the accumulated gradient is an average over the window.
        scaler.scale(loss / gradient_accumulations).backward()
        pending = True

        if (batch_idx + 1) % gradient_accumulations == 0:
            scaler.step(optimizer)
            scaler.update()
            model.zero_grad()
            pending = False

        losses.append(loss.item())

    # Apply the gradients of a trailing, incomplete accumulation window so they
    # are neither lost nor carried over into the next epoch.
    if pending:
        scaler.step(optimizer)
        scaler.update()
        model.zero_grad()

    return np.mean(losses) if losses else float('nan')

In [None]:
# Train
EPOCHS = 5

scaler = GradScaler()
model.zero_grad()

for epoch in range(EPOCHS):
    print(f'Epoch: {epoch+1}/{EPOCHS}')
    train_loss = train_epoch(model, scaler, train_loader, device, optimizer)
    print(f'Train Loss: {train_loss}')

## Save the model

In [None]:
# Persist the model's state dict (its weights) rather than the whole model object.
trained_weights = model.state_dict()
torch.save(trained_weights, 'gbr_fasterrcnn_resnet50_fpn.pth.tar')