In this notebook, I plan to learn more about object detection by working through the notebooks listed below. Most of the code is taken from them, with only minor changes - all credit goes to the original authors.

Relevant notebooks:

- https://www.kaggle.com/code/pestipeti/pytorch-starter-fasterrcnn-train
- https://www.kaggle.com/code/nvnnghia/fasterrcnn-pseudo-labeling

In [None]:
import os
import re
import cv2
import time
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.cuda.amp as amp
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SequentialSampler
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

from matplotlib import pyplot as plt

In [None]:
# Experiment-wide settings: dataset locations, training length, mixed-precision
# switch, checkpoint file name, score threshold and log directory.
cfg = dict(
    DIR_INPUT='/kaggle/input/global-wheat-detection',
    DIR_TRAIN='/kaggle/input/global-wheat-detection/train',
    DIR_TEST='/kaggle/input/global-wheat-detection/test',
    num_epochs=5,
    use_amp=True,
    model_file='best_loss_min.pth',
    detection_threshold=0.5,
    log_dir='./logs/',
)

# Make sure the log directory exists before any run tries to append to it.
os.makedirs(cfg['log_dir'], exist_ok=True)

In [None]:
def load_data(cfg):
    """Read ``train.csv`` and expand its ``bbox`` strings into numeric columns.

    The CSV is looked up under ``cfg["DIR_INPUT"]``. Each ``bbox`` entry is a
    bracketed, comma-separated string; the brackets are stripped and the four
    numbers are stored in new ``x``, ``y``, ``w`` and ``h`` columns. The
    original ``bbox`` column is removed from the returned DataFrame.
    """
    csv_path = f'{cfg["DIR_INPUT"]}/train.csv'
    frame = pd.read_csv(csv_path)

    # One float array per row, stacked into a 2-D (rows, values) matrix.
    parsed_rows = [np.fromstring(raw[1:-1], sep=',') for raw in frame['bbox']]
    coords = np.stack(parsed_rows)

    for position, name in enumerate(('x', 'y', 'w', 'h')):
        frame[name] = coords[:, position]

    del frame['bbox']
    return frame

In [None]:
# Load the training annotations once; the fold assignment and run() below
# read this module-level DataFrame.
train_df = load_data(cfg)
# Preview the first rows (rendered as the cell output in a notebook).
train_df.head()

In [None]:
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold

# train_df holds one row per bounding box, so a plain KFold over rows can put
# boxes of the same image into different folds. That leaks the image between
# training and validation and leaves each side with only part of its ground
# truth. Grouping by image_id keeps every image inside a single fold.
kf = GroupKFold(n_splits=5)
train_df['fold'] = -1
fold_col = train_df.columns.get_loc('fold')
for fold, (train_idx, valid_idx) in enumerate(kf.split(train_df, groups=train_df['image_id'])):
    # split() yields positional indices, so assign by position rather than by label.
    train_df.iloc[valid_idx, fold_col] = fold

train_df.tail()

In [None]:
class WheatDataset(Dataset):
    """Detection dataset yielding ``(image, target, image_id)`` per image.

    ``df`` must contain an ``image_id`` column plus ``x``, ``y``, ``w``, ``h``
    box columns (one row per box). Images are read from
    ``{image_dir}/{image_id}.jpg``. ``transforms`` is an optional callable that
    is invoked with ``image``, ``bboxes`` and ``labels`` keyword arguments and
    returns a mapping with at least ``image`` and ``bboxes``.
    """

    def __init__(self, df, image_dir, transforms=None):
        super().__init__()

        # One dataset item per distinct image, in order of first appearance.
        self.image_ids = df['image_id'].unique()
        self.df = df
        self.image_dir = image_dir
        self.transforms = transforms

    def __len__(self):
        """Return the number of distinct images."""
        return self.image_ids.shape[0]

    def __getitem__(self, index):
        """Load image ``index`` and build its detection target.

        Returns:
            ``(image, target, image_id)`` where ``target`` holds ``boxes``
            (float32 tensor of shape ``(N, 4)`` in ``x1, y1, x2, y2`` order),
            ``labels``, ``image_id``, ``area`` and ``iscrowd``. ``area`` is
            computed from the boxes before any transform is applied.

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        image_id = self.image_ids[index]
        records = self.df[self.df['image_id'] == image_id]

        image_path = f'{self.image_dir}/{image_id}.jpg'
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {image_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image /= 255.0

        # astype() copies, so the in-place edits below never touch self.df.
        boxes = records[['x', 'y', 'w', 'h']].values.astype(np.float64)
        # Convert (x, y, w, h) to (x1, y1, x2, y2).
        boxes[:, 2] = boxes[:, 0] + boxes[:, 2]
        boxes[:, 3] = boxes[:, 1] + boxes[:, 3]

        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        area = torch.as_tensor(area, dtype=torch.float32)

        # there is only one class
        labels = torch.ones((records.shape[0],), dtype=torch.int64)

        # suppose all instances are not crowd
        iscrowd = torch.zeros((records.shape[0],), dtype=torch.int64)

        target = {}
        target['labels'] = labels
        # target['masks'] = None
        target['image_id'] = torch.tensor([index])
        target['area'] = area
        target['iscrowd'] = iscrowd

        if self.transforms:
            sample = self.transforms(image=image, bboxes=boxes, labels=labels)
            image = sample['image']
            boxes = sample['bboxes']

        # Always hand back a float32 (N, 4) tensor: with or without transforms,
        # for list/tuple or array transform output, and for an empty box list.
        target['boxes'] = torch.as_tensor(
            np.asarray(boxes, dtype=np.float32)
        ).reshape(-1, 4)

        return image, target, image_id

In [None]:
def get_train_transform():
    """Build the training augmentation pipeline.

    Applies a random flip with probability 0.5 and then converts the image to
    a tensor. Boxes are treated as ``pascal_voc`` and kept in sync with the
    ``labels`` field.
    """
    return A.Compose([
        # The probability must be passed by keyword: the first positional
        # parameter of the transform is ``always_apply``, so ``A.Flip(0.5)``
        # would flip every sample instead of half of them.
        A.Flip(p=0.5),
        ToTensorV2(p=1.0)
    ], bbox_params={'format': 'pascal_voc', 'label_fields': ['labels']})

def get_valid_transform():
    """Build the validation pipeline: tensor conversion only, no augmentation.

    Boxes are declared as ``pascal_voc`` with ``labels`` as the label field.
    """
    bbox_settings = {'format': 'pascal_voc', 'label_fields': ['labels']}
    steps = [ToTensorV2(p=1.0)]
    return A.Compose(steps, bbox_params=bbox_settings)

def collate_fn(batch):
    """Transpose a batch of per-sample tuples into a tuple of per-field tuples.

    Lets samples of differing sizes travel together without being stacked.
    """
    per_field = zip(*batch)
    return tuple(per_field)
    
def train_func(model, loader_train, optimizer, scaler, device):
    """Train ``model`` for one pass over ``loader_train``.

    Args:
        model: detection model that returns a dict of losses when called with
            ``(images, targets)`` in training mode.
        loader_train: iterable of ``(images, targets, image_ids)`` batches.
        optimizer: optimizer updating the model parameters.
        scaler: gradient scaler for mixed precision, or ``None`` to train in
            full precision without autocast.
        device: device the images and target tensors are moved to.

    Returns:
        The mean of the per-batch total losses.
    """
    model.train()
    # A missing scaler means mixed precision is disabled for this run.
    use_amp = scaler is not None

    train_loss = []
    bar = tqdm(loader_train)
    for images, targets, image_ids in bar:
        optimizer.zero_grad()
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        with amp.autocast(enabled=use_amp):
            loss_dict = model(images, targets)
            loss = sum(loss_dict.values())

        train_loss.append(loss.item())
        if use_amp:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()

        # Show a running mean over the most recent 30 batches.
        bar.set_description(f'smth:{np.mean(train_loss[-30:]):.4f}')
    return np.mean(train_loss)

# def valid_func(model, loader_valid, device):
#     model.eval()
    
#     result = []
#     testdf_psuedo = []
#     bar = tqdm(loader_valid)
#     with torch.no_grad():
#         for images, targets, image_ids in bar:
#             images = list(image.to(device) for image in images)
#             targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            
#             outputs = model(images)
            
#             for i, image in enumerate(images):
                
#                 boxes = outputs[i]['boxes'].data.cpu().numpy()
#                 scores = outputs[i]['scores'].data.cpu().numpy()

#                 boxes = boxes[scores >= cfg['detection_threshold']].astype(np.int32)
#                 scores = scores[scores >= cfg['detection_threshold']]
#                 image_id = image_ids[i]

#                 boxes[:, 2] = boxes[:, 2] - boxes[:, 0]
#                 boxes[:, 3] = boxes[:, 3] - boxes[:, 1]

#                 for box in boxes:
#                     #print(box)
#                     result = {
#                         'image_id': 'nvnn'+image_id,
#                         'width': 1024,
#                         'height': 1024,
#                         'source': 'nvnn',
#                         'x': box[0],
#                         'y': box[1],
#                         'w': box[2],
#                         'h': box[3]
#                     }
                    
#                     testdf_psuedo.append(result)
#     return testdf_pseudo

        
def run(fold):
    """Train a Faster R-CNN model with ``fold`` held out.

    Rows of the module-level ``train_df`` whose ``fold`` column differs from
    ``fold`` form the training set. One line per epoch is printed and appended
    to ``log.txt`` under ``cfg['log_dir']``. Whenever the epoch's mean training
    loss improves, the model weights are saved to a per-fold checkpoint whose
    name is derived from ``cfg['model_file']`` (e.g. ``best_loss_min_fold0.pth``).

    Args:
        fold: index of the fold to hold out.
    """
    log_file = os.path.join(cfg['log_dir'], 'log.txt')

    # Include the fold in the checkpoint name so that successive folds do not
    # overwrite each other's weights.
    model_root, model_ext = os.path.splitext(cfg['model_file'])
    model_file = f'{model_root}_fold{fold}{model_ext}'

    train_ = train_df[train_df['fold'] != fold].reset_index(drop=True)
    valid_ = train_df[train_df['fold'] == fold].reset_index(drop=True)
    dataset_train = WheatDataset(train_, cfg['DIR_TRAIN'], get_train_transform())
    dataset_valid = WheatDataset(valid_, cfg['DIR_TRAIN'], get_valid_transform())

    loader_train = DataLoader(
        dataset_train,
        batch_size=16,
        # Reshuffle every epoch so batches are not always composed of the
        # same images in the same order.
        shuffle=True,
        num_workers=4,
        collate_fn=collate_fn
    )

    # Not consumed yet: the validation step below is still disabled.
    loader_valid = DataLoader(
        dataset_valid,
        batch_size=8,
        shuffle=False,
        num_workers=4,
        collate_fn=collate_fn
    )

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    num_classes = 2  # 1 class (wheat) + background
    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    model.to(device)

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, cfg['num_epochs'])
    scaler = torch.cuda.amp.GradScaler() if cfg['use_amp'] else None

    loss_min = np.inf

    for epoch in range(1, cfg['num_epochs']+1):
        train_loss = train_func(model, loader_train, optimizer, scaler, device)
        # valid_loss = valid_func(model, loader_valid, device)

        content = time.ctime() + ' ' + f'Fold {fold}, Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {train_loss:.5f}.'
        print(content)
        with open(log_file, 'a') as appender:
            appender.write(content + '\n')

        if train_loss < loss_min:
            print(f'loss_min ({loss_min:.6f} --> {train_loss:.6f}). Saving model ...')
            torch.save(model.state_dict(), model_file)
            loss_min = train_loss

        # Step the scheduler after the epoch's optimizer updates. Passing the
        # number of completed epochs sets the learning rate for the next epoch,
        # so each epoch trains with the same rate as when the scheduler was
        # stepped with (epoch - 1) at the top of the loop.
        scheduler_cosine.step(epoch)

In [None]:
if __name__ == "__main__":
    # Train one model per fold, holding out folds 0 through 4 in turn.
    for held_out_fold in range(5):
        run(held_out_fold)