In [1]:
# docs: https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html
# dependencies: pip install tqdm pandas pillow torch torchvision

import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

import torch
import torchvision
from torchvision import transforms
import torchvision.transforms.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision.models.detection import fasterrcnn_resnet50_fpn

### Params

In [2]:
DATSET_FILE = "../../data/oid/parquets/train_remote.parquet"

DEVICE = "cuda:0"
BATCH_SIZE = 8
START_EPOCH = 0
TRAIN_HEAD_ONLY = False

NUM_EPOCHS = 5
TQDM_ITERS = 100
TQDM_INTERVAL = 60

TEST_RUN = True
TEST_SIZE = 0.1
PRECISION = 3

### Dataset


In [3]:
class DetectionDataset(Dataset):
    def __init__(self, df, device):
        self.df = df
        self.transform = transforms.Compose(
            [
                transforms.ToTensor(),
            ]
        )
        self.device = device

    def __len__(self):
        return self.df.shape[0]

    def resize(
        self, image: Image.Image, min_size: int = 800, max_size: int = 1333
    ) -> Image.Image:
        orig_width, orig_height = image.size
        min_orig_size = float(min((orig_width, orig_height)))
        max_orig_size = float(max((orig_width, orig_height)))

        scale = min_size / min_orig_size
        if max_orig_size * scale > max_size:
            scale = max_size / max_orig_size

        new_width = int(round(orig_width * scale))
        new_height = int(round(orig_height * scale))

        return F.resize(image, (new_height, new_width))

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        image = self.resize(Image.open(row["image_path"]))
        width, height = image.size
        image = self.transform(image.convert("RGB")).to(self.device)
        boxes = (
            torch.Tensor(
                np.array([width, height, width, height]) * np.array(row["bbox"])
            )
            .reshape(-1, 4)
            .to(self.device)
        )
        labels = torch.tensor([row["class"]], dtype=torch.int64).to(self.device)

        target = {"boxes": boxes, "labels": labels}

        return image, target

### Metrics

In [4]:
def calc_intersection(box1, box2):
    x1_1, y1_1, x2_1, y2_1 = box1
    x1_2, y1_2, x2_2, y2_2 = box2

    x_left = max(x1_1, x1_2)
    y_top = max(y1_1, y1_2)
    x_right = min(x2_1, x2_2)
    y_bottom = min(y2_1, y2_2)

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    intersection_area = (x_right - x_left) * (y_bottom - y_top)
    box1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
    box2_area = (x2_2 - x1_2) * (y2_2 - y1_2)

    iou_value = intersection_area / float(box1_area + box2_area - intersection_area)
    return iou_value


def eval_segmentation(model, dataloader):
    boxes_true, boxes_pred = [], []
    labels_true, labels_pred = [], []
    for images, targets in dataloader:
        images = list(image.to(DEVICE) for image in images)
        target = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]
        with torch.no_grad():
            predict = model(images)

        for j in range(len(predict)):
            target_boxes, pred_boxes = target[j]["boxes"], predict[j]["boxes"]
            boxes_true.append(target_boxes[0].to("cpu").numpy())
            boxes_pred.append(
                pred_boxes[0].to("cpu").numpy() if len(pred_boxes) else None
            )

            target_label, pred_label = target[j]["labels"], predict[j]["labels"]
            labels_true.append(target_label[0].to("cpu"))
            labels_pred.append(pred_label[0].to("cpu") if len(pred_label) else -1)

    metrics = {}
    metrics["intersection"] = float(
        np.mean(
            [
                calc_intersection(box_true, box_pred) if box_pred is not None else 0
                for box_true, box_pred in zip(boxes_true, boxes_pred)
            ]
        )
    )
    metrics["precision"] = precision_score(
        labels_true, labels_pred, average="macro", zero_division=1
    )
    metrics["recall"] = recall_score(
        labels_true, labels_pred, average="macro", zero_division=1
    )
    metrics["f1"] = f1_score(labels_true, labels_pred, average="macro", zero_division=1)
    return metrics

### Train


In [5]:
# load parquet
df = pd.read_parquet(DATSET_FILE)
train_df, test_df = train_test_split(df, test_size=TEST_SIZE, random_state=0)
train_dataset, test_dataset = DetectionDataset(train_df[:10], DEVICE), DetectionDataset(
    test_df, DEVICE
)

In [6]:
# create dataloader
def collate_fn(batch):
    return tuple(zip(*batch))


train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn
)
test_loader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn
)

In [7]:
# create model
model = fasterrcnn_resnet50_fpn(pretrained=True)

num_classes = df["class"].max() + 1
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = (
    torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes)
)
if START_EPOCH > 0:
    model.load_state_dict(torch.load(f"fasterrcnn{START_EPOCH}e.pth"))

model.to(DEVICE)



FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [8]:
# create optimizer
if TRAIN_HEAD_ONLY:
    for param in model.backbone.parameters():
        param.requires_grad = False

params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.001, weight_decay=0)

In [9]:
# train loop
for epoch in range(START_EPOCH, NUM_EPOCHS):
    model.train()
    for images, targets in tqdm(
        train_loader, miniters=TQDM_ITERS, mininterval=TQDM_INTERVAL
    ):
        images = list(image.to(DEVICE) for image in images)
        targets = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]

        optimizer.zero_grad()

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        losses.backward()
        optimizer.step()

    model.eval()
    metrics = eval_segmentation(model, test_loader)

    print(
        f"Epoch #{epoch} Loss: {round(losses.item(), PRECISION)} "
        + f"Intersection: {round(metrics['intersection'], PRECISION)} "
        + f"Precision: {round(metrics['precision'], PRECISION)} "
        + f"Recall: {round(metrics['recall'], PRECISION)} "
        + f"F1: {round(metrics['f1'], PRECISION)}"
    )
    torch.save(model.state_dict(), f"fasterrcnn{epoch+1}e.pth")

    if TEST_RUN:
        break

100%|██████████| 2/2 [00:01<00:00,  1.03it/s]


Epoch #0 Loss: 6.08 Intersection: 0.0 Precision: 0.985 Recall: 0.015 F1: 0.0
