In [2]:
import cv2 as cv
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os

from safetensors import torch
from torch.utils.data import DataLoader, Dataset
from torchvision.io import read_image
import torch
from torchvision.transforms import transforms

In [3]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS device available")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA device available")
else:
    device = torch.device("cpu")
    print("CPU device available")

CUDA device available


In [4]:
train_dir = './data/object_detection/train/'
val_dir = './data/object_detection/val/'

In [5]:
class CustomDataset(Dataset):
    def __init__(self, root, transform=None):
        self.root = root
        self.transform = transform
        self.images = list(sorted(os.listdir(self.root + 'images/')))
        self.labels = list(sorted(os.listdir(self.root + 'labels/')))
    
    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, idx):
        img = read_image(self.root + 'images/' + self.images[idx]).float() / 255.0
        image_height, image_width, _ = img.shape
        
        label_path = self.root + 'labels/'+ self.labels[idx]
        boxes = []
        labels = []
        
        with open(label_path, 'r') as f:
            for line in f:
                data = line.strip().split()
                label = data[0]
                class_id, x_center, y_center, width, height = map(float, data)
                
                # Convert relative coordinates to absolute coordinates
                x_center *= image_width
                y_center *= image_height
                width *= image_width
                height *= image_height
                
                # Calculate top-left and bottom-right coordinates
                x1 = int(x_center - width / 2)
                y1 = int(y_center - height / 2)
                x2 = int(x_center + width / 2)
                y2 = int(y_center + height / 2)
                
                if (x2 > x1 and 
                    y2 > y1 and 
                    0 <= x1 < image_width and 
                    0 <= y1 < image_height and 
                    0 <= x2 < image_width and 
                    0 <= y2 < image_height):
                    boxes.append([x1, y1, x2, y2])
                    labels.append(int(label))
            
            boxes = torch.tensor(boxes, dtype=torch.float32, device=device)
            labels = torch.tensor(labels, dtype=torch.int64, device=device)
            img = img.to(device)
            target = {"boxes": boxes, "labels": labels}
            
            #TODO: from each image, get random patches for a image as NEGATIVE examples using iou
            
            if self.transform:
                img = self.transform(img)
            
            return img, target

In [6]:
img_transform = transforms.Compose([
    transforms.Resize((224, 224)),
])

In [7]:
train_dataset = CustomDataset(train_dir, img_transform)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))

val_dataset = CustomDataset(val_dir, img_transform)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))

In [8]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2, FasterRCNN_ResNet50_FPN_V2_Weights

# Load a pre-trained model
model = fasterrcnn_resnet50_fpn_v2(weights=FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT)

# Get the number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features

# Define the number of classes (including the background)
num_classes = 2  # 1 class (object) + background

# Replace the pre-trained head with a new one
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

In [9]:
model.to(device)

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
       

In [10]:
from tqdm import tqdm
import torch
import torch.optim as optim

# Device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Move model to the right device
model.to(device)

# Parameters
params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
num_epochs = 10

# Training loop
for epoch in range(1, num_epochs + 1):
    skipped_images = 0
    epoch_loss = 0.0
    model.train()
    for images, targets in tqdm(train_dataloader):
        # Filter out images with no bounding boxes
        valid_targets = [t for t in targets if t['boxes'].numel() > 0]
        valid_images = [images[idx] for idx, t in enumerate(targets) if t['boxes'].numel() > 0]

        skipped_images += len(targets) - len(valid_targets)

        # Ensure the images and valid targets are aligned
        if not valid_images:
            continue

        valid_images = list(img.to(device) for img in valid_images)
        valid_targets = [{k: v.to(device) for k, v in t.items()} for t in valid_targets]

        loss_dict = model(valid_images, valid_targets)
        batch_losses = sum(loss for loss in loss_dict.values())
        
        optimizer.zero_grad()
        batch_losses.backward()
        optimizer.step()
        
        epoch_loss += batch_losses.item()

    # Calculate average loss
    average_loss = epoch_loss / (len(train_dataloader) - skipped_images)
    print(f"Epoch: {epoch}, Loss: {average_loss:.4f}, Skipped images: {skipped_images}")

100%|██████████| 100/100 [00:29<00:00,  3.34it/s]


Epoch: 1, Loss: 57.0521, Skipped images: 55


100%|██████████| 100/100 [00:26<00:00,  3.80it/s]


Epoch: 2, Loss: 41.9617, Skipped images: 55


100%|██████████| 100/100 [00:24<00:00,  4.05it/s]


Epoch: 3, Loss: 33.2916, Skipped images: 55


100%|██████████| 100/100 [00:25<00:00,  3.98it/s]


Epoch: 4, Loss: 30.6183, Skipped images: 55


100%|██████████| 100/100 [00:24<00:00,  4.06it/s]


Epoch: 5, Loss: 30.9575, Skipped images: 55


100%|██████████| 100/100 [00:25<00:00,  4.00it/s]


Epoch: 6, Loss: 29.1937, Skipped images: 55


100%|██████████| 100/100 [00:24<00:00,  4.03it/s]


Epoch: 7, Loss: 29.3622, Skipped images: 55


100%|██████████| 100/100 [00:25<00:00,  3.91it/s]


Epoch: 8, Loss: 27.1946, Skipped images: 55


100%|██████████| 100/100 [00:24<00:00,  4.05it/s]


Epoch: 9, Loss: 25.3571, Skipped images: 55


100%|██████████| 100/100 [00:24<00:00,  4.00it/s]

Epoch: 10, Loss: 24.7029, Skipped images: 55





In [11]:
torch.save(model.state_dict(), 'car_detection_model.pth')

In [None]:
model.load_state_dict(torch.load('car_detection_model.pth', map_location=device))

In [None]:
from torchvision.ops import box_iou


def compute_iou(box1, box2):
    return box_iou(box1, box2)

model.eval()
skipped_images = 0
for images, targets in tqdm(val_dataloader):
    # Filter out images with no bounding boxes
    valid_targets = [t for t in targets if t['boxes'].numel() > 0]
    valid_images = [images[idx] for idx, t in enumerate(targets) if t['boxes'].numel() > 0]

    skipped_images += len(targets) - len(valid_targets)

    # Ensure the images and valid targets are aligned
    if not valid_images:
        continue

    valid_images = list(img.to(device) for img in valid_images)
    valid_targets = [{k: v.to(device) for k, v in t.items()} for t in valid_targets]
    
    for image, target in zip(images, valid_targets):
        image = image.unsqueeze(0)
        outputs = model(image)
        print(outputs)