In [None]:
import ast

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision as tv
from PIL import Image, ImageDraw

# **Faster R-CNN Implementation** - in PyTorch
Thomas Hopkins

We will start by reading in the training and testing metadata.

In [None]:
# Root directory of the dataset; every CSV and image path below is built from it.
BASE_DIR = '/kaggle/input/tensorflow-great-barrier-reef'

# Read the train/test metadata tables (one CSV per split) in a single pass.
training_data, testing_data = (
    pd.read_csv(f'{BASE_DIR}/{split}.csv') for split in ('train', 'test')
)

# Summarise the columns, dtypes and non-null counts of the training metadata.
training_data.info()

Now we can view what a single image looks like with the correct bounding box.

*Note*: `im.show()` does not work for me in Firefox on Ubuntu, so the annotated image is saved to `fig.jpg` and re-opened to display it inline.

In [None]:
# Pick a single metadata row to visualise (the row index is hard-coded).
example = training_data.iloc[5100, :]
video_id = example['video_id']
video_frame = example['video_frame']
sequence = example['sequence']
sequence_frame = example['sequence_frame']
image_id = example['image_id']

# The annotations column stores a Python-literal string (a list of dicts).
# ast.literal_eval parses literals only, so unlike eval it cannot execute
# arbitrary code embedded in the CSV.
# Only the first box is used; [0] raises IndexError if the row has no boxes.
annotations = ast.literal_eval(example['annotations'])[0]
print(f'Image ID: {image_id}')

# Convert (x, y, width, height) into the corner form expected by draw.rectangle.
xy = [annotations['x'], annotations['y'], annotations['x'] + annotations['width'], annotations['y'] + annotations['height']]
with Image.open(f'{BASE_DIR}/train_images/video_{video_id}/{video_frame}.jpg') as im:
    draw = ImageDraw.Draw(im)
    draw.rectangle(xy, outline='red', width=2)
    # Save the annotated frame so it can be re-opened and displayed below.
    im.save('fig.jpg')
Image.open('fig.jpg')

Let's also check how many images we have and the dimensions of an input image.

In [None]:
# Number of metadata rows in the training table.
print(f'Number of images: {len(training_data)}')

# Convert the example frame to a tensor so its dimensions can be inspected.
transform = tv.transforms.PILToTensor()
with Image.open(f'{BASE_DIR}/train_images/video_{video_id}/{video_frame}.jpg') as im:
    tensor = transform(im)

# The print below labels the three tensor dimensions as channels, height, width.
print(f'Channels: {tensor.shape[0]}\nHeight: {tensor.shape[1]}\nWidth: {tensor.shape[2]}')

Now that we see what kind of data we are working with and the problem we are trying to solve, we can implement a neural network that will solve this problem. The network architecture I chose is from the paper *Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks*. I believe this should yield decent results without too much tweaking necessary. Here is my implementation of this architecture in PyTorch:

In [None]:
class VGG(nn.Module):
    ''' Convolution-only VGG-16 trunk (no fully connected layers).

    Thirteen 3x3 convolutions with padding 1, each followed by a ReLU, are
    grouped into five stages; every stage ends with a 2x2 max-pool. The input
    must have 3 channels and the output has 512 channels.
    '''

    # Output channels of each convolution in order; 'M' marks a 2x2 max-pool.
    _LAYOUT = (64, 64, 'M',
               128, 128, 'M',
               256, 256, 256, 'M',
               512, 512, 512, 'M',
               512, 512, 512, 'M')

    def __init__(self):
        super().__init__()
        layers = []
        in_channels = 3
        for spec in self._LAYOUT:
            if spec == 'M':
                layers.append(nn.MaxPool2d(2))
                continue
            layers.append(nn.Conv2d(in_channels, spec, 3, padding=1))
            layers.append(nn.ReLU())
            in_channels = spec
        # Layers are appended in the same order as a hand-written Sequential,
        # so sub-module indices (and state_dict keys) are unchanged.
        self.feature_extractor = nn.Sequential(*layers)

    def forward(self, x):
        ''' Apply the convolutional stack to `x` and return the feature map. '''
        return self.feature_extractor(x)


class RPN(nn.Module):
    ''' Region Proposal Network

    A 3x3 convolution followed by two sibling 1x1 convolutions that emit, for
    every spatial location of a 512-channel feature map, `2 * k` objectness
    values and `4 * k` box-regression values.

    Parameters
    ----------
    k : int
        Number of anchors per spatial location; sets the output channel
        counts (2 * k for classification, 4 * k for regression).
    '''
    def __init__(self, k=9):
        # nn.Module must be initialised before sub-modules are assigned,
        # otherwise the assignments below raise AttributeError.
        super().__init__()
        self.preprocess = nn.Sequential(
            # padding=1 keeps the spatial size of the input feature map so
            # each output location lines up with one feature-map location.
            nn.Conv2d(512, 512, 3, padding=1),
            nn.ReLU())
        self.classifier = nn.Conv2d(512, 2 * k, 1)
        self.regressor = nn.Conv2d(512, 4 * k, 1)
        
    def transform(self, bbox_preds):
        ''' Converts predictions into [x1, y1, x2, y2] coordinates

        Not implemented yet: currently returns None.
        '''
        pass
    
    def forward(self, x):
        ''' Return `(obj_preds, bbox_preds)` for a 512-channel feature map.

        `obj_preds` has 2 * k channels and `bbox_preds` has 4 * k channels;
        both keep the batch and spatial dimensions of `x`.
        '''
        x = self.preprocess(x)
        obj_preds = self.classifier(x)
        bbox_preds = self.regressor(x)
        # TODO: convert the raw classification and regression maps into
        # proposals (the original note was left unfinished).
        return obj_preds, bbox_preds
    

class FastRCNN(nn.Module):
    ''' Fast R-CNN Network

    Pools each proposal from a 512-channel feature map to
    `crop_size x crop_size`, passes it through two 1024-unit fully connected
    layers, and predicts 2 class scores and 4 box values per proposal.

    Parameters
    ----------
    crop_size : int
        Side length of the pooled region; also fixes the input width of the
        first linear layer (512 * crop_size * crop_size).
    '''
    def __init__(self, crop_size=7):
        # nn.Module must be initialised before sub-modules are assigned,
        # otherwise the assignments below raise AttributeError.
        super().__init__()
        self.feature_extractor = nn.Sequential(
            nn.Flatten(),
            nn.Linear(512 * crop_size * crop_size, 1024),
            nn.ReLU(),
            nn.Linear(1024, 1024),
            nn.ReLU())
        self.crop_size = crop_size
        # 4 is refined bounding boxes for detecting starfish
        self.regressor = nn.Linear(1024, 4)
        # 2 is for whether the box contains the starfish or not
        # might not need this since RPN detects object or not
        self.classifier = nn.Linear(1024, 2)
         
    def forward(self, x, proposals):
        ''' Return `(class_preds, bbox_preds)` for every proposal.

        `x` is the feature map and `proposals` the boxes handed to
        `tv.ops.roi_pool`; the accepted box formats are defined by that
        function.
        '''
        x = tv.ops.roi_pool(x, proposals, output_size=self.crop_size)
        x = self.feature_extractor(x)
        bbox_preds = self.regressor(x)
        class_preds = self.classifier(x)
        return class_preds, bbox_preds


class FasterRCNN(nn.Module):
    ''' Faster R-CNN: VGG features -> RPN -> Fast R-CNN head.

    `forward` returns `(proposals, outputs)`, where `proposals` is whatever
    the RPN returns and `outputs` is the Fast R-CNN head's result.
    '''
    def __init__(self):
        # nn.Module must be initialised before sub-modules are assigned,
        # otherwise the assignments below raise AttributeError.
        super().__init__()
        self.VGG = VGG()
        self.RPN = RPN()
        self.FastRCNN = FastRCNN()
        
    def forward(self, x):
        ''' Run the three stages in sequence on an image batch `x`. '''
        x = self.VGG(x)
        # NOTE(review): the RPN currently returns its raw
        # (obj_preds, bbox_preds) tuple, which is passed straight through to
        # roi_pool inside FastRCNN. This presumably needs to be decoded into
        # box coordinates first (see the TODO in RPN.forward) - confirm.
        proposals = self.RPN(x)
        outputs = self.FastRCNN(x, proposals)
        return proposals, outputs

With that out of the way, we can now define the multi-task loss functions that we will use to optimize the network.

This will be a combination of smooth $L_1$ (for bounding box prediction) and binary cross-entropy (for classification).

In [None]:
class RPNLoss(nn.Module):
    ''' Computes combined loss for classification and regression

    The result is a single scalar:

        (1 / n_cls) * sum(BCE-with-logits over all class predictions)
        + (l / n_reg) * sum(class_truth * smooth-L1 over box coordinates)

    so only entries whose class label is non-zero contribute to the
    regression term.
    '''
    def __init__(self, n_cls=256, n_reg=2400, l=10):
        ''' Defines regularization parameters (usually batch size and total # of anchors)

        n_cls normalises the classification term, n_reg the regression term,
        and l weights the regression term relative to classification.
        '''
        # Without this call the module cannot be invoked like a function.
        super().__init__()
        self.n_cls = n_cls
        self.n_reg = n_reg
        self.l = l
    
    def forward(self, predictions, targets):
        ''' Return the scalar loss.

        predictions : (bbox_preds, class_preds) - class_preds are logits.
        targets     : (bbox_truth, class_truth) - class_truth must be a float
                      tensor with the same shape as class_preds.

        NOTE(review): RPN.forward returns (obj_preds, bbox_preds), i.e. the
        opposite order - reorder before passing its output here.
        '''
        bbox_preds, class_preds = predictions[0], predictions[1]
        bbox_truth, class_truth = targets[0], targets[1]

        # Sum rather than mean: the explicit 1/n_cls and l/n_reg factors below
        # are the normalisation, so a mean here would normalise twice.
        cls_term = F.binary_cross_entropy_with_logits(
            class_preds, class_truth, reduction='sum')

        # Keep the per-coordinate losses so each can be masked by its label
        # before reducing; masking a pre-reduced scalar yields a non-scalar.
        reg_per_coord = F.smooth_l1_loss(bbox_preds, bbox_truth, reduction='none')
        mask = class_truth
        if mask.dim() == reg_per_coord.dim() - 1:
            # One label per box: add a trailing axis to broadcast over coords.
            mask = mask.unsqueeze(-1)
        reg_term = (mask * reg_per_coord).sum()

        return cls_term / self.n_cls + (self.l / self.n_reg) * reg_term

Finally, we can perform the 4-step training process, which proceeds as follows:
1. train the RPN end-to-end, using pre-trained ImageNet model for VGG-16 (?).
2. train the Fast R-CNN using proposals generated by RPN (RPN is fixed now), VGG-16 is re-initialized with pre-trained ImageNet model (?). 
3. share VGG-16 with RPN and fine-tune only the RPN parameters.
4. keeping VGG-16 fixed, fine-tune Fast R-CNN parameters.

(?) means I am unsure that I need to do this.

In [None]:
def convert_annotations(annotations):
    ''' Convert annotations from string to array

    Parameters
    ----------
    annotations : str
        Python-literal list of dicts with keys 'x', 'y', 'width', 'height'
        (e.g. "[{'x': 1, 'y': 2, 'width': 3, 'height': 4}]"), or '[]'.

    Returns
    -------
    torch.Tensor
        Shape (N, 4) with one row [center_x, center_y, width, height] per
        box. An empty annotation list gives an int64 tensor of shape (0, 4)
        so callers can index columns either way.
    '''
    # literal_eval only accepts Python literals, so unlike eval it cannot
    # execute code embedded in the annotation string.
    list_of_bbox = ast.literal_eval(annotations)
    if not list_of_bbox:
        return torch.empty((0, 4), dtype=torch.long)
    # Centers use floor division, so they stay integers for integer inputs.
    return torch.tensor([[d['x'] + d['width'] // 2, d['y'] + d['height'] // 2,
                          d['width'], d['height']]
                         for d in list_of_bbox])
    

def get_anchor_boxes(width, height, 
                     aspect_ratios=[[1, 1], [1, 2], [2, 1]],
                     scales=[128, 256, 512]):
    ''' Generate the set of possible anchor boxes (size (W * H * k) x 4)

    Work in progress: at the moment this only prints the outer product of
    the scales with the flattened aspect ratios and returns None.
    '''
    # TODO: continue working on generating all possible (valid) anchor boxes
    # The size output of VGG is (1x512x22x40) with an input image of size
    # (1x3x720x1280), so we need to figure out the anchor points (x, y) that
    # each coordinate of 22x40 maps to in 720x1280. Then we can generate the
    # set of possible boxes and check for validity.
    # Valid boxes do not overlap with the image boundaries.

    # Candidate grid positions along each axis (not used yet).
    x_positions = torch.arange(1, width)
    y_positions = torch.arange(1, height)

    ratio_values = torch.tensor(aspect_ratios).flatten()
    scale_values = torch.tensor(scales)

    # One row per scale, one column per flattened ratio component.
    scaled_sides = torch.outer(scale_values, ratio_values)
    print(scaled_sides)
    

def sample_annotations(true_annotations, sample_size,
                       iou_positive_thresh=0.7, iou_negative_thresh=0.3,
                       aspect_ratios=[[1, 1], [1, 2], [2, 1]], scales=[128, 256, 512]):
    ''' Randomly generates positive/negative anchor boxes

    Placeholder: sampling is not implemented yet. The function currently
    prints `sample_size` followed by `true_annotations` and returns None;
    the threshold, ratio and scale arguments are accepted but unused.
    '''
    for value in (sample_size, true_annotations):
        print(value)
    

def get_images(dataframe, transform=tv.transforms.PILToTensor(),
               num_annotations=256):
    ''' Generator which loads an image for training

    Draws one random row from `dataframe`, opens the matching frame under
    BASE_DIR, applies `transform` to it and yields a single pair
    `(image, sampled_annotations)`; the generator is then exhausted.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Metadata with 'video_id', 'video_frame' and 'annotations' columns.
    transform : callable
        Applied to the opened PIL image.
    num_annotations : int
        Passed to `sample_annotations` as its `sample_size`.
    '''
    # TODO: Look into generating a batch from a single image.
    #     This would be something like a batch size of 8 where there are
    #     positive and negative bounding boxes. I.E. bounding boxes that
    #     contain a starfish are positive, bounding boxes that do not are
    #     negative. Striking a balance in positive and negative samples
    #     here is important. The data has many images without any bounding boxes.
    #     Not sure how to handle this just yet.
    row = dataframe.sample().iloc[0]
    image_path = f'{BASE_DIR}/train_images/video_{row["video_id"]}/{row["video_frame"]}.jpg'
    with Image.open(image_path) as im:
        x = transform(im)
    y = convert_annotations(row['annotations'])
    # sample_annotations takes (true_annotations, sample_size, ...): pass the
    # boxes and the sample size, not the image tensor.
    yield x, sample_annotations(y, num_annotations)

In [None]:
# Preprocessing pipeline: PIL image -> tensor -> float tensor.
transform = tv.transforms.Compose([
    tv.transforms.PILToTensor(),
    tv.transforms.ConvertImageDtype(torch.float),
])

# get_images is a generator, so next() pulls its sample directly.
im, annotations = next(get_images(training_data, transform=transform))
print(im.shape)

# Smoke-test the VGG trunk: add a batch dimension and check the output shape.
test_net = VGG()
features = test_net(im.unsqueeze(0))
print(features.shape)