In [1]:
import torch
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torch.utils.data import DataLoader, Dataset
import os
from PIL import Image
import time
from torchvision.ops import box_iou
from collections import defaultdict

class CustomDataset(Dataset):
    def __init__(self, image_dir, label_dir, transforms=None):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.transforms = transforms
        self.images = list(sorted(os.listdir(image_dir)))
        self.labels = list(sorted(os.listdir(label_dir)))

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_path = os.path.join(self.image_dir, self.images[idx])
        label_path = os.path.join(self.label_dir, self.labels[idx])

        img = Image.open(img_path).convert("RGB")
        labels, boxes = self.parse_annotations(label_path)

        # Ensure the bounding boxes are valid
        boxes = [self.validate_box(box) for box in boxes]

        if self.transforms is not None:
            img = self.transforms(img)

        target = {}
        target["boxes"] = torch.tensor(boxes, dtype=torch.float32)
        target["labels"] = torch.tensor(labels, dtype=torch.int64)

        # Debugging output
        # print(f"Image ID: {idx}")
        # print(f"Labels: {labels}, Boxes: {boxes}")

        return img, target

    def parse_annotations(self, ann_path):
        labels = []
        boxes = []
        with open(ann_path, 'r') as f:
            for line in f:
                data = line.strip().split()
                label = int(data[0])
                box = [float(x) for x in data[1:]]
                labels.append(label)
                boxes.append(box)
        return labels, boxes

    def validate_box(self, box):
        # Ensure the box coordinates are valid
        xmin, ymin, xmax, ymax = box
        if xmax < xmin:
            xmin, xmax = xmax, xmin
        if ymax < ymin:
            ymin, ymax = ymax, ymin
        return [xmin, ymin, xmax, ymax]


In [2]:
# Define a function to collate data samples
def collate_fn(batch):
    return tuple(zip(*batch))

# Define transforms
transforms = torchvision.transforms.Compose([
    torchvision.transforms.Resize((256, 256)),
    torchvision.transforms.ToTensor()
])

# Define the dataset paths
train_images_path = "../pidray/train/images"
train_ann_path = "../pidray/train/labels"
test_images_path = "../pidray/test/images"
test_ann_path = "../pidray/test/labels"

# Create datasets
train_dataset = CustomDataset(train_images_path, train_ann_path, transforms=transforms)
test_dataset = CustomDataset(test_images_path, test_ann_path, transforms=transforms)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=12, shuffle=False, collate_fn=collate_fn)

In [3]:
# Define the Faster R-CNN model
backbone = torchvision.models.mobilenet_v2(weights="MobileNet_V2_Weights.DEFAULT").features
backbone.out_channels = 1280
anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                   aspect_ratios=((0.5, 1.0, 2.0),))
roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'], output_size=7, sampling_ratio=2)

num_classes = 12 + 1  # 12 classes (foreground) + 1 class (background)
model = FasterRCNN(backbone, num_classes=num_classes, rpn_anchor_generator=anchor_generator, box_roi_pool=roi_pooler)

In [4]:
# Use CPU for debugging
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
    
model.to(device)

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 4060


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidu

In [5]:
# Define the optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

In [6]:
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10):
    model.train()
    header = 'Epoch: [{}]'.format(epoch)
    for i, (images, targets) in enumerate(data_loader):
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if i % print_freq == 0:
            print('{}\tBatch: [{}]\tLoss: {:.4f}'.format(header, i, losses.item()))

def evaluate(model, data_loader, device):
    model.eval()
    header = 'Test:'
    all_boxes = []
    all_labels = []
    all_scores = []
    all_gt_boxes = []
    all_gt_labels = []
    
    with torch.no_grad():
        for i, (images, targets) in enumerate(data_loader):
            images = [image.to(device) for image in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            outputs = model(images)
            
            for output, target in zip(outputs, targets):
                all_boxes.append(output['boxes'].cpu())
                all_labels.append(output['labels'].cpu())
                all_scores.append(output['scores'].cpu())
                all_gt_boxes.append(target['boxes'].cpu())
                all_gt_labels.append(target['labels'].cpu())

    # Calculate mAP
    map_value = calculate_map(all_boxes, all_labels, all_scores, all_gt_boxes, all_gt_labels)
    print(f"mAP: {map_value:.4f}")

def calculate_map(all_boxes, all_labels, all_scores, all_gt_boxes, all_gt_labels, iou_threshold=0.5):
    # Flatten all predictions and ground truths
    pred_boxes = torch.cat(all_boxes, dim=0)
    pred_labels = torch.cat(all_labels, dim=0)
    pred_scores = torch.cat(all_scores, dim=0)
    gt_boxes = torch.cat(all_gt_boxes, dim=0)
    gt_labels = torch.cat(all_gt_labels, dim=0)
    
    average_precisions = []
    
    for c in range(1, num_classes):  # Skip background class
        c_pred_indices = pred_labels == c
        c_gt_indices = gt_labels == c
        
        c_pred_boxes = pred_boxes[c_pred_indices]
        c_pred_scores = pred_scores[c_pred_indices]
        c_gt_boxes = gt_boxes[c_gt_indices]
        
        if c_gt_boxes.numel() == 0:
            continue
        
        sorted_indices = torch.argsort(c_pred_scores, descending=True)
        c_pred_boxes = c_pred_boxes[sorted_indices]
        
        true_positives = torch.zeros(c_pred_boxes.size(0))
        false_positives = torch.zeros(c_pred_boxes.size(0))
        detected_gt_boxes = torch.zeros(c_gt_boxes.size(0))
        
        for pred_idx, pred_box in enumerate(c_pred_boxes):
            ious = box_iou(pred_box.unsqueeze(0), c_gt_boxes).squeeze(0)
            max_iou, max_iou_idx = ious.max(0)
            
            if max_iou >= iou_threshold and not detected_gt_boxes[max_iou_idx]:
                true_positives[pred_idx] = 1
                detected_gt_boxes[max_iou_idx] = 1
            else:
                false_positives[pred_idx] = 1
        
        tp_cumsum = torch.cumsum(true_positives, dim=0)
        fp_cumsum = torch.cumsum(false_positives, dim=0)
        
        precisions = tp_cumsum / (tp_cumsum + fp_cumsum + 1e-6)
        recalls = tp_cumsum / c_gt_boxes.size(0)
        
        precisions = torch.cat([torch.tensor([1]), precisions])
        recalls = torch.cat([torch.tensor([0]), recalls])
        
        average_precisions.append(torch.trapz(precisions, recalls))
    
    return sum(average_precisions) / len(average_precisions)

def main():
    num_epochs = 10
    for epoch in range(num_epochs):
        train_one_epoch(model, optimizer, train_loader, device, epoch)
        evaluate(model, test_loader, device)
        torch.save(model.state_dict(),f'model_{epoch}.pt')

if __name__ == "__main__":
    main()

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.44 GiB. GPU 