This is the second notebook on the TensorFlow Great Barrier Reef Competition. The first one can be found here: https://www.kaggle.com/scr0ll0/great-barrier-reef-eda-animation

# Part 2. Modeling

Our goal this time is to create a baseline model. To do this, we'll train a Faster R-CNN model and validate it with stratified K-fold cross-validation (`StratifiedKFold`).

In [None]:
import cv2
import time
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import torch
import torchvision
from torch.utils.data import DataLoader, Dataset
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection import FasterRCNN
import ast
from sklearn.model_selection import train_test_split, StratifiedKFold

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Root directory of the training frames (one sub-folder per video).
BASE_DIR = "../input/tensorflow-great-barrier-reef/train_images/"
# Number of passes over the training data made by train_model().
NUM_EPOCHS = 3

In [None]:
# Load the competition's train and test metadata tables.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tensorflow-great-barrier-reef/train.csv',
        '../input/tensorflow-great-barrier-reef/test.csv',
    )
)

In [None]:
# The CSV stores each frame's annotations as the string form of a Python
# literal; parse every cell back into the object it represents.
train['annotations'] = train['annotations'].apply(ast.literal_eval)

In [None]:
# Build each frame's image path relative to BASE_DIR:
# "video_<video_id>/<video_frame>.jpg".
video_dir = "video_" + train['video_id'].astype(str)
frame_file = train['video_frame'].astype(str) + ".jpg"
train['image_path'] = video_dir + "/" + frame_file

Before loading up a model, we need to split the training data into training and validation sets. This is not as easy as randomly dividing the images into two sets, because the images are consecutive frames of videos. In addition, we know that there are 20 sequences, but they vary widely in length, so splitting by sequence is not ideal either.

Instead, we'll do the following:

1. Define a subsequence as a run of consecutive frames within a sequence that either all have annotations or all have none.
2. Number all of the different subsequences.
3. Use Stratified K-Folds to split the data into training and validation sets. 

Credit to Julián Peller for coming up with this subsequence idea. https://www.kaggle.com/julian3833/reef-a-cv-strategy-subsequences?scriptVersionId=80623179

In [None]:
def subsequence(train=None):
    """Label every frame with the id of the subsequence it belongs to.

    A subsequence is a maximal run of consecutive rows that share the same
    ``sequence`` value and the same annotation state (all rows have at least
    one annotation, or all rows have none).

    Two columns are written to ``train`` in place:

    * ``subsequence_number`` -- integer id of the run, starting at 1 and
      increasing in row order.
    * ``has_annotations`` -- ``True`` where the row's ``annotations`` entry
      is non-empty.

    Args:
        train: DataFrame with ``annotations`` (one sized container per row)
            and ``sequence`` columns, ordered so that the frames of a
            sequence are consecutive. When omitted, the notebook-level
            ``train`` frame is looked up at call time.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    if train is None:
        # Resolve the default when the function is called rather than when it
        # is defined, so a reloaded ``train`` frame is picked up.
        train = globals()['train']

    has_annotations = train['annotations'].str.len() > 0

    # A row opens a new run when its annotation state differs from the row
    # above, or when it belongs to a different sequence. The first row always
    # qualifies because comparing against the shifted-in NaN is True.
    state = has_annotations.astype(int)
    state_changed = state != state.shift(1)
    new_sequence = train['sequence'] != train['sequence'].shift(1)
    run_starts = state_changed | new_sequence

    # Counting the run starts seen so far numbers the runs 1, 2, 3, ...
    # This depends only on row order, not on the index labels, and gives a
    # final row that starts its own run a separate id.
    train['subsequence_number'] = run_starts.cumsum().astype(int)
    train['has_annotations'] = has_annotations
    return train

In [None]:
# Add the subsequence id and annotation flag to every frame, then preview the
# last rows of the table.
train = subsequence(train=train)
train.tail(n=5)

In [None]:
# One row per subsequence: whether any of its frames has annotations (as 0/1)
# and how many frames it contains.
grouped = train.groupby("subsequence_number")
summary = grouped.agg({'has_annotations': 'max', 'video_frame': 'count'})
train_split = summary.astype(int).reset_index()
train_split

In [None]:
# Assign every subsequence to exactly one of 10 validation folds, stratified
# on whether the subsequence contains annotations. Each frame inherits the
# fold of its subsequence, so a run of consecutive frames is never split
# between training and validation data.
# A fixed random_state makes the shuffled split reproducible between runs.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
for fold, (_, val_idx) in enumerate(kf.split(train_split['subsequence_number'], train_split['has_annotations'])):
    val_subsequences = train_split['subsequence_number'].iloc[val_idx]
    train.loc[train['subsequence_number'].isin(val_subsequences), 'fold'] = fold
# The column is created by partial assignment (float dtype); every row has a
# fold once the loop has finished, so it can be cast to int.
train['fold'] = train['fold'].astype(int)
train.head()

All good. Now to deal with training.

Credit to Julián Peller again for most of the code that follows, which trains and evaluates a Faster R-CNN model; I will mainly be adapting it for K-fold cross-validation. Note that this code will not work on more recent environments, as it will treat certain tensors as Doubles when Floats are expected. https://www.kaggle.com/julian3833/reef-starter-torch-fasterrcnn-train-lb-0-416/notebook

In [None]:
class ReefDataset:
    """Map-style dataset that yields ``(image, target)`` pairs for detection.

    Each row of ``df`` describes one frame. The columns read here are
    ``image_path`` (path relative to ``BASE_DIR``) and ``annotations`` (the
    frame's boxes with ``x``, ``y``, ``width`` and ``height`` fields --
    presumably a list of dicts; confirm against the loading code).

    ``target`` is a dict with the keys ``boxes`` (``[x_min, y_min, x_max,
    y_max]`` rows, float32), ``area``, ``image_id``, ``labels`` (all ones)
    and ``iscrowd`` (all zeros).
    """

    # Frame size in pixels used by can_augment() for its bounds check.
    IMAGE_WIDTH = 1280
    IMAGE_HEIGHT = 720

    def __init__(self, df, transforms=None):
        """Store the frame table and the optional albumentations pipeline."""
        self.df = df
        self.transforms = transforms

    def can_augment(self, boxes):
        """Return True when every box lies inside the frame.

        Frames with a box outside the image skip the augmentation pipeline
        in ``__getitem__`` and are only converted to a tensor.
        """
        box_outside_image = (
            (boxes[:, 0] < 0).any()
            or (boxes[:, 1] < 0).any()
            or (boxes[:, 2] > self.IMAGE_WIDTH).any()
            or (boxes[:, 3] > self.IMAGE_HEIGHT).any()
        )
        return not box_outside_image

    def get_boxes(self, row):
        """Return the row's boxes as a float array of corner coordinates.

        The stored ``x, y, width, height`` values are converted to
        ``x_min, y_min, x_max, y_max``. A row without annotations gives an
        array of shape ``(0, 4)``.
        """
        boxes = pd.DataFrame(row['annotations'], columns=['x', 'y', 'width', 'height']).astype(float).values
        boxes[:, 2] = boxes[:, 0] + boxes[:, 2]
        boxes[:, 3] = boxes[:, 1] + boxes[:, 3]
        return boxes

    def get_image(self, row):
        """Load the row's frame as a float32 RGB array scaled to [0, 1].

        Raises:
            FileNotFoundError: if OpenCV cannot read the image file.
        """
        path = f'{BASE_DIR}/{row["image_path"]}'
        image = cv2.imread(path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread signals failure by returning None; fail here with
            # the path instead of with an opaque error inside cvtColor.
            raise FileNotFoundError(f'Could not read image: {path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image /= 255.0
        return image

    @staticmethod
    def _boxes_to_tensor(bboxes):
        """Convert a sequence of 4-value boxes to a float32 ``(N, 4)`` tensor.

        Going through a float32 NumPy array fixes the dtype regardless of the
        scalar type the augmentation library returns, and the reshape keeps
        the ``(0, 4)`` shape when no boxes are left.
        """
        array = np.asarray(bboxes, dtype=np.float32).reshape(-1, 4)
        return torch.as_tensor(array)

    def __getitem__(self, i):
        """Return the ``(image, target)`` pair for the ``i``-th row of ``df``."""
        row = self.df.iloc[i]
        image = self.get_image(row)
        boxes = self.get_boxes(row)
        n_boxes = boxes.shape[0]
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        target = {
            'boxes': torch.as_tensor(boxes, dtype=torch.float32),
            'area': torch.as_tensor(area, dtype=torch.float32),
            'image_id': torch.tensor([i]),
            'labels': torch.ones((n_boxes,), dtype=torch.int64),
            'iscrowd': torch.zeros((n_boxes,), dtype=torch.int64),
        }
        if self.transforms and self.can_augment(boxes):
            # Hand albumentations plain Python lists rather than torch
            # tensors; this is the input format its bbox handling documents.
            sample = self.transforms(
                image=image,
                bboxes=boxes.tolist(),
                labels=target['labels'].tolist(),
            )
            image = sample['image']
            target['boxes'] = self._boxes_to_tensor(sample['bboxes'])
        else:
            image = ToTensorV2(p=1.0)(image=image)['image']
        return image, target

    def __len__(self):
        """Return the number of frames in the dataset."""
        return len(self.df)

In [None]:
def get_train_transform():
    """Return the training augmentation pipeline.

    The pipeline applies a random flip with probability 0.5 and then converts
    the image to a tensor. Boxes are expected in ``pascal_voc`` format
    (``x_min, y_min, x_max, y_max``) with their classes in the ``labels``
    field.
    """
    return A.Compose([
        # Pass the probability by keyword: the first positional parameter of
        # the transform constructor is not ``p`` in every albumentations
        # release, so ``A.Flip(0.5)`` can end up flipping every image.
        A.Flip(p=0.5),
        ToTensorV2(p=1.0)
    ], bbox_params={'format': 'pascal_voc', 'label_fields': ['labels']})


def get_valid_transform():
    """Return the validation pipeline: tensor conversion only, no augmentation.

    Boxes are declared in ``pascal_voc`` format with their classes in the
    ``labels`` field, the same declaration the training pipeline uses.
    """
    bbox_config = {'format': 'pascal_voc', 'label_fields': ['labels']}
    steps = [ToTensorV2(p=1.0)]
    return A.Compose(steps, bbox_params=bbox_config)

In [None]:
def collate_fn(batch):
    """Regroup a list of samples into one tuple per sample field.

    A batch of ``(image, target)`` pairs becomes ``(images, targets)``, each
    a tuple with one entry per sample. Nothing is stacked, so samples may
    differ in size (for example in their number of boxes).
    """
    per_field = zip(*batch)
    return tuple(per_field)

In [None]:
def get_model():
    """Build a pretrained Faster R-CNN (ResNet-50 FPN) with a fresh box head.

    The box predictor is replaced by one with two output classes, and the
    model is moved to ``DEVICE`` before being returned.
    """
    detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    # Keep the input width of the existing classification layer so the new
    # predictor fits the same ROI-head features.
    head_in_features = detector.roi_heads.box_predictor.cls_score.in_features
    detector.roi_heads.box_predictor = FastRCNNPredictor(head_in_features, 2)
    detector.to(DEVICE)
    return detector

In [None]:
# Build the detector at notebook level; train_model() reads this global `model`.
model = get_model()

In [None]:
def train_model(fold):
    """Train the global ``model`` for ``NUM_EPOCHS`` epochs on one fold.

    Reads the notebook-level ``model``, ``dl_train`` and ``dl_val``. After
    every epoch it computes the mean validation loss, saves the model's
    state dict to ``fasterrcnn_resnet50_fpn-e{epoch}-fold{fold}.bin`` and
    prints a one-line summary.

    Args:
        fold: Identifier of the current fold; only used in the checkpoint
            file name.

    Returns:
        List with the mean validation loss of each epoch, in order.
    """
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.0025, momentum=0.9, weight_decay=0.0005)
    # No learning-rate schedule is used; assign a scheduler here to enable
    # the per-epoch step below.
    lr_scheduler = None
    n_batches, n_batches_val = len(dl_train), len(dl_val)
    validation_losses = []
    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        # torchvision detection models return the loss dictionary only in
        # training mode, so make that mode explicit.
        model.train()
        loss_accum = 0.0
        for images, targets in dl_train:
            images = [image.to(DEVICE) for image in images]
            targets = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
            loss_accum += losses.item()
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
        if lr_scheduler is not None:
            lr_scheduler.step()
        # The model stays in training mode for validation because that is the
        # mode in which it returns losses; no_grad() disables gradient
        # tracking, and no optimizer step is taken.
        val_loss_accum = 0.0
        with torch.no_grad():
            for images, targets in dl_val:
                images = [image.to(DEVICE) for image in images]
                targets = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]
                val_loss_dict = model(images, targets)
                val_batch_loss = sum(loss for loss in val_loss_dict.values())
                val_loss_accum += val_batch_loss.item()
        # An empty loader gives NaN instead of raising ZeroDivisionError.
        val_loss = val_loss_accum / n_batches_val if n_batches_val else float('nan')
        train_loss = loss_accum / n_batches if n_batches else float('nan')
        validation_losses.append(val_loss)
        chk_name = f'fasterrcnn_resnet50_fpn-e{epoch}-fold{fold}.bin'
        torch.save(model.state_dict(), chk_name)
        elapsed = time.time() - time_start
        print(f"[Epoch {epoch+1:2d} / {NUM_EPOCHS:2d}] Train loss: {train_loss:.3f}. Val loss: {val_loss:.3f} --> {chk_name}  [{elapsed:.0f} secs]")
    return validation_losses

In [None]:
# Train one model per fold: fold i is held out for validation and every other
# fold is used for training. Only frames with at least one annotation are kept.
for i in sorted(int(fold_id) for fold_id in train['fold'].unique()):
    fold_train = train[train['fold'] != i]
    fold_val = train[train['fold'] == i]
    fold_train = fold_train[fold_train.annotations.str.len() > 0].reset_index(drop=True)
    fold_val = fold_val[fold_val.annotations.str.len() > 0].reset_index(drop=True)
    ds_train = ReefDataset(fold_train, get_train_transform())
    ds_val = ReefDataset(fold_val, get_valid_transform())
    # Shuffle the training frames: rows are consecutive video frames, so
    # unshuffled batches would consist of near-identical images.
    dl_train = DataLoader(ds_train, batch_size=8, shuffle=True, num_workers=4, collate_fn=collate_fn)
    dl_val = DataLoader(ds_val, batch_size=8, shuffle=False, num_workers=4, collate_fn=collate_fn)
    # Start every fold from a freshly initialised model. Reusing one model
    # would let later folds validate on frames it was already trained on.
    model = get_model()
    train_model(i)

I'll be moving to a separate notebook where the predictions will be made.