# Project: Deep Learning Systems
## Detection Transformer

Import Necessary Libraries

In [1]:
import os
import random
import numpy as np
from collections import defaultdict
from PIL import Image
import time  # For measuring time

# PyTorch imports
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR

# Transformers imports
from transformers import DetrForObjectDetection, DetrConfig, DetrImageProcessor

# torchvision imports
from torchvision.transforms import functional as F

# pycocotools imports
from pycocotools.coco import COCO

# Matplotlib imports
import matplotlib.pyplot as plt

# timm (PyTorch Image Models) import
import timm

# Import necessary libraries
import torch
from transformers import DetrImageProcessor
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import matplotlib.pyplot as plt
from PIL import Image
import os

  from .autonotebook import tqdm as notebook_tqdm
2024-12-11 06:53:59.754885: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733900039.776358   79718 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733900039.783117   79718 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-11 06:53:59.809735: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Data Preparation

In [2]:
# Define paths
data_folder = "Self Driving Car/export/"
annotation_file = os.path.join(data_folder, "_annotations.coco.json")

# Load COCO Annotations
coco = COCO(annotation_file)

# Print dataset statistics
total_images = len(coco.getImgIds())
print(f"Total images in the dataset: {total_images}")
print("Keys in the COCO annotations:", list(coco.dataset.keys()))

loading annotations into memory...
Done (t=0.80s)
creating index...
index created!
Total images in the dataset: 29800
Keys in the COCO annotations: ['info', 'licenses', 'categories', 'images', 'annotations']


#### **Inspect the annotation file to make sure it is suitable fro DETR**

In [3]:
print ("--------------------------------------------------------")
# Check the structure of the JSON file
print("Keys in the COCO annotations:", list(coco.dataset.keys()))

print ("--------------------------------------------------------")
print ("--------------------------------------------------------")
# Total number of images
num_images = len(coco.getImgIds())
print(f"Total images in the dataset: {num_images}")

# Total number of annotations
num_annotations = len(coco.getAnnIds())
print(f"Total annotations in the dataset: {num_annotations}")

# Total number of categories
num_categories = len(coco.getCatIds())
print(f"Total categories in the dataset: {num_categories}")

# List all categories
categories = coco.loadCats(coco.getCatIds())
print("Categories:", [cat['name'] for cat in categories])

print ("--------------------------------------------------------")
print ("--------------------------------------------------------")

# Map image IDs to annotations
img_to_anns = defaultdict(list)
for ann in coco.loadAnns(coco.getAnnIds()):
    img_to_anns[ann['image_id']].append(ann['category_id'])

# Calculate the number of unique classes per image
classes_per_image = [len(set(img_to_anns[img_id])) for img_id in coco.getImgIds()]

# Statistics
max_classes = max(classes_per_image)
min_classes = min(classes_per_image)
avg_classes = np.mean(classes_per_image)

print(f"Max classes per image: {max_classes}")
print(f"Min classes per image: {min_classes}")
print(f"Average classes per image: {avg_classes:.2f}")

print ("--------------------------------------------------------")
print ("--------------------------------------------------------")
# Example image entry
sample_image_id = coco.getImgIds()[0]
sample_image = coco.loadImgs(sample_image_id)[0]
print("Sample image entry:")
print(sample_image)

# Example annotation
sample_ann_id = coco.getAnnIds(imgIds=sample_image_id)[0]
sample_annotation = coco.loadAnns(sample_ann_id)[0]
print("Sample annotation entry:")
print(sample_annotation)


print ("--------------------------------------------------------")
print ("--------------------------------------------------------")




--------------------------------------------------------
Keys in the COCO annotations: ['info', 'licenses', 'categories', 'images', 'annotations']
--------------------------------------------------------
--------------------------------------------------------
Total images in the dataset: 29800
Total annotations in the dataset: 194539
Total categories in the dataset: 12
Categories: ['obstacles', 'biker', 'car', 'pedestrian', 'trafficLight', 'trafficLight-Green', 'trafficLight-GreenLeft', 'trafficLight-Red', 'trafficLight-RedLeft', 'trafficLight-Yellow', 'trafficLight-YellowLeft', 'truck']
--------------------------------------------------------
--------------------------------------------------------
Max classes per image: 7
Min classes per image: 0
Average classes per image: 1.88
--------------------------------------------------------
--------------------------------------------------------
Sample image entry:
{'id': 0, 'license': 1, 'file_name': '1478897026627294725_jpg.rf.6828a4e82

The output confirms that the dataset is in a **COCO-compatible format** and is suitable for use with **DETR**.

---

#### **1. Key Details About the Dataset**
- **Total Images**: 29,800
- **Total Annotations**: 194,539
- **Total Categories**: 12
- **Categories**:
  ```plaintext
  ['obstacles', 'biker', 'car', 'pedestrian', 'trafficLight', 
   'trafficLight-Green', 'trafficLight-GreenLeft', 'trafficLight-Red',
   'trafficLight-RedLeft', 'trafficLight-Yellow', 'trafficLight-YellowLeft', 'truck']
  ```

---

#### **2. Classes Per Image**
- **Max Classes per Image**: 7
  - Some images have up to 7 unique object categories.
- **Min Classes per Image**: 0
  - Some images have no objects annotated (annotations might be missing or labeled as empty).
- **Average Classes per Image**: 1.88
  - On average, each image contains nearly 2 unique categories.

---

#### **3. Annotation Format (Sample Entry)**
- **Image Entry**:
  ```python
  {'id': 0, 'license': 1, 'file_name': '1478897026627294725_jpg.rf.6828a4e821cbab4c2c277d74df291f00.jpg', 
   'height': 512, 'width': 512, 'date_captured': '2021-06-09T12:24:25+00:00'}
  ```
  - Contains `id`, `file_name`, and dimensions (`height` and `width`), which are essential for DETR training.

- **Annotation Entry**:
  ```python
  {'id': 0, 'image_id': 0, 'category_id': 2, 'bbox': [140, 262, 21, 25.5], 
   'area': 535.5, 'segmentation': [], 'iscrowd': 0}
  ```
  - **`image_id`** links the annotation to the corresponding image.
  - **`category_id`** identifies the object category.
  - **`bbox`** is in the correct COCO format `[x_min, y_min, width, height]`.
  - **`area`** is calculated (but not mandatory for DETR).
  - **`iscrowd`** is set to `0`, which is the expected value for single objects (not groups).

In [4]:
# Dataset splits
num_train = 25330
num_val = 2235
num_test = 2235

# Shuffle and split the dataset
all_image_ids = coco.getImgIds()
random.seed(42)  # Ensure reproducibility
random.shuffle(all_image_ids)

train_ids = all_image_ids[:num_train]
val_ids = all_image_ids[num_train:num_train + num_val]
test_ids = all_image_ids[num_train + num_val:num_train + num_val + num_test]

print(f"Training images: {len(train_ids)}, Validation images: {len(val_ids)}, Testing images: {len(test_ids)}")

Training images: 25330, Validation images: 2235, Testing images: 2235


### Create the Dataset Class

In [5]:
# Define the custom COCO dataset
class COCODataset(Dataset):
    def __init__(self, image_dir, coco, processor, image_ids):
        self.image_dir = image_dir
        self.coco = coco
        self.processor = processor
        self.image_ids = image_ids

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        image_id = self.image_ids[idx]
        image_info = self.coco.loadImgs(image_id)[0]
        image_path = os.path.join(self.image_dir, image_info["file_name"])
        image = Image.open(image_path).convert("RGB")

        # Get annotations
        ann_ids = self.coco.getAnnIds(imgIds=image_id)
        anns = self.coco.loadAnns(ann_ids)
        annotations = []

        for ann in anns:
            bbox = ann["bbox"]  # [x_min, y_min, width, height]
            area = bbox[2] * bbox[3]  # width * height
            annotations.append({
                "bbox": bbox,
                "category_id": ann["category_id"],  # Preserve original category IDs
                "area": area,
                "iscrowd": ann.get("iscrowd", 0),
            })

        # Prepare the target dictionary
        target = {"image_id": image_id, "annotations": annotations}

        # Process the image and annotations
        encoding = self.processor(images=image, annotations=target, return_tensors="pt")
        pixel_values = encoding["pixel_values"].squeeze(0)
        labels = encoding["labels"][0]
        return pixel_values, labels

### Prepare Datasets

In [6]:
# Path to images
image_dir = data_folder

# Load pre-trained processor
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")

# Create datasets for each split
train_dataset = COCODataset(image_dir, coco, processor, train_ids)
val_dataset = COCODataset(image_dir, coco, processor, val_ids)
test_dataset = COCODataset(image_dir, coco, processor, test_ids)

print(f"Datasets prepared: {len(train_dataset)} training, {len(val_dataset)} validation, {len(test_dataset)} testing.")

Datasets prepared: 25330 training, 2235 validation, 2235 testing.


### Create DataLoaders

In [7]:
# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))

print("DataLoaders ready.")

DataLoaders ready.


In [8]:
# Test a batch
for pixel_values, targets in train_loader:
    print(f"Pixel values shape: {pixel_values[0].shape}")
    print(f"Target example: {targets[0]}")
    break

The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


Pixel values shape: torch.Size([3, 800, 800])
Target example: {'size': tensor([800, 800]), 'image_id': tensor([3684]), 'class_labels': tensor([2, 2, 2, 2, 2, 2, 2]), 'boxes': tensor([[0.8896, 0.5536, 0.2207, 0.2792],
        [0.3986, 0.4909, 0.0354, 0.0600],
        [0.4305, 0.4890, 0.0406, 0.0600],
        [0.5262, 0.4988, 0.0875, 0.1500],
        [0.6637, 0.5074, 0.1125, 0.1750],
        [0.7467, 0.4942, 0.0990, 0.0900],
        [0.7479, 0.4912, 0.0896, 0.0683]]), 'area': tensor([39455.5547,  1360.0001,  1560.0000,  8400.0000, 12600.0000,  5700.0000,
         3917.7778]), 'iscrowd': tensor([0, 0, 0, 0, 0, 0, 0]), 'orig_size': tensor([512, 512])}


### Model Setup and Initial Training

#### **Modify the Classification Head**

In [9]:
# Initialize DETR with a custom configuration
num_classes = 12 # 12 object classes + 1 background (created automatically)
config = DetrConfig(num_labels=num_classes,ignore_mismatched_sizes=True, backbone="resnet50")
model = DetrForObjectDetection(config)

# Create the Backbone Using timm
# Create the pre-trained timm ResNet-50 backbone
timm_backbone = timm.create_model("resnet50", pretrained=True, features_only=True, out_indices=(1, 2, 3, 4))

# Transfer Weights and Buffers
for name_backbone, parameter_backbone in timm_backbone.named_parameters():
    for name, parameter in model.model.backbone.conv_encoder.model.named_parameters():
        if name_backbone == name:
            parameter.data.copy_(parameter_backbone.data)

# Transfer buffers (e.g., running mean and variance in BatchNorm)
for name_backbone, buffer_backbone in timm_backbone.named_buffers():
    for name, buffer in model.model.backbone.conv_encoder.model.named_buffers():
        if name_backbone == name:
            buffer.data.copy_(buffer_backbone.data)

print((model.model.backbone.conv_encoder.model.conv1.weight == timm_backbone.conv1.weight).all())

print("Number of classes:", model.class_labels_classifier.out_features)


tensor(True)
Number of classes: 13


#### Fine-Tuning the Model

Since our dataset is relatively small, freezing the ResNet backbone will prevent overfitting.

In [10]:
# Freeze the backbone of the DETR model
for param in model.model.backbone.parameters():
    param.requires_grad = False

Let's check the architechture of the model.

In [11]:
print(model)

DetrForObjectDetection(
  (model): DetrModel(
    (backbone): DetrConvModel(
      (conv_encoder): DetrConvEncoder(
        (model): FeatureListNet(
          (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
          (bn1): DetrFrozenBatchNorm2d()
          (act1): ReLU(inplace=True)
          (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
          (layer1): Sequential(
            (0): Bottleneck(
              (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (bn1): DetrFrozenBatchNorm2d()
              (act1): ReLU(inplace=True)
              (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
              (bn2): DetrFrozenBatchNorm2d()
              (drop_block): Identity()
              (act2): ReLU(inplace=True)
              (aa): Identity()
              (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      

Set Up Optimizer and Learning Rate Scheduler

In [12]:
# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)

# Learning rate scheduler
scheduler = StepLR(optimizer, step_size=20, gamma=0.1)  # Updated step_size for 100 epochs
torch.cuda.empty_cache()

#### **Training Loop**

In [13]:
# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop for 100 epochs
num_epochs = 50  # Updated from 10 to 100

# Record the total training start time
total_training_start_time = time.time()

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    # Record the start time for the epoch
    epoch_start_time = time.time()

    for step, (pixel_values, targets) in enumerate(train_loader):
        # Move pixel values to device
        pixel_values = torch.stack(pixel_values).to(device)

        # Filter valid targets
        valid_targets = []
        for t in targets:
            if "class_labels" in t and len(t["class_labels"]) > 0:
                valid_targets.append({
                    "class_labels": t["class_labels"].to(device),
                    "boxes": t["boxes"].to(device)
                })

        # Add properly formatted dummy targets if necessary
        while len(valid_targets) < len(pixel_values):
            valid_targets.append({
                "class_labels": torch.empty((0,), dtype=torch.int64).to(device),
                "boxes": torch.empty((0, 4), dtype=torch.float32).to(device)
            })

        # Skip the batch if no valid targets exist
        if all(len(t["class_labels"]) == 0 for t in valid_targets):
            print(f"Skipping batch {step}: No valid targets.")
            continue

        # Forward pass
        outputs = model(pixel_values=pixel_values, labels=valid_targets)
        loss = outputs.loss  # Loss is computed internally by DETR

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Step the learning rate scheduler
    scheduler.step()

    # Record the end time for the epoch and calculate the duration
    epoch_end_time = time.time()
    epoch_duration = epoch_end_time - epoch_start_time

    # Print epoch summary with time taken
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_loader):.4f}, Time: {epoch_duration:.2f} seconds")

# Record the total training end time and calculate the duration
total_training_end_time = time.time()
total_training_duration = total_training_end_time - total_training_start_time

# Print total training time
print(f"Total Training Time: {total_training_duration:.2f} seconds")

Epoch 1/50, Loss: 3.6737, Time: 1362.32 seconds
Skipping batch 1153: No valid targets.
Skipping batch 4476: No valid targets.
Skipping batch 6255: No valid targets.
Epoch 2/50, Loss: 3.0280, Time: 1309.49 seconds
Skipping batch 1666: No valid targets.
Epoch 3/50, Loss: 2.9384, Time: 1314.49 seconds
Epoch 4/50, Loss: 2.8998, Time: 1335.51 seconds
Epoch 5/50, Loss: 2.8662, Time: 1298.14 seconds
Skipping batch 1577: No valid targets.
Skipping batch 2594: No valid targets.
Skipping batch 4338: No valid targets.
Epoch 6/50, Loss: 2.8346, Time: 1332.59 seconds
Skipping batch 2248: No valid targets.
Skipping batch 4461: No valid targets.
Epoch 7/50, Loss: 2.8127, Time: 1333.54 seconds
Skipping batch 282: No valid targets.
Skipping batch 733: No valid targets.
Skipping batch 6021: No valid targets.
Epoch 8/50, Loss: 2.7891, Time: 1286.97 seconds
Epoch 9/50, Loss: 2.7606, Time: 1287.29 seconds
Skipping batch 3982: No valid targets.
Skipping batch 5660: No valid targets.
Epoch 10/50, Loss: 2.738

Save the fine-tuned model

In [14]:
model.save_pretrained("DETR/detr-finetuned")
processor.save_pretrained("DETR/detr-processor")

['DETR/detr-processor/preprocessor_config.json']

### Model Evaluation

### Explainability Techniques

##### Grad-CAM for DETR

##### Saliency Maps

### Conclusion