# Project: Deep Learning Systems
## Detection Transformer
#### In this implementation I fine tune the model first with backbone frozen on a higher learning rate and then train the whole network including the backbone on a very small learning rate.

Import Necessary Libraries

In [1]:
# Standard library imports
import os
import random
import time  # For measuring time

# Third-party imports
# NumPy
import numpy as np

# Matplotlib
import matplotlib.pyplot as plt

# PIL (Python Imaging Library)
from PIL import Image

# PyTorch imports
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR

# torchvision imports
from torchvision.transforms import functional as F

# Transformers imports
from transformers import DetrForObjectDetection, DetrConfig, DetrImageProcessor

# pycocotools imports
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

# timm (PyTorch Image Models) import
import timm

# Collections
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm
2024-12-12 12:38:36.097608: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734007116.114464  158992 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734007116.119566  158992 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-12 12:38:36.138704: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Data Preparation

In [2]:
# Define paths
data_folder = "Self Driving Car/export/"
annotation_file = os.path.join(data_folder, "_annotations.coco.json")

# Load COCO Annotations
coco = COCO(annotation_file)

# Print dataset statistics
total_images = len(coco.getImgIds())
print(f"Total images in the dataset: {total_images}")
print("Keys in the COCO annotations:", list(coco.dataset.keys()))

loading annotations into memory...
Done (t=0.66s)
creating index...
index created!
Total images in the dataset: 29800
Keys in the COCO annotations: ['info', 'licenses', 'categories', 'images', 'annotations']


#### **Inspect the annotation file to make sure it is suitable fro DETR**

In [3]:
print ("--------------------------------------------------------")
# Check the structure of the JSON file
print("Keys in the COCO annotations:", list(coco.dataset.keys()))

print ("--------------------------------------------------------")
print ("--------------------------------------------------------")
# Total number of images
num_images = len(coco.getImgIds())
print(f"Total images in the dataset: {num_images}")

# Total number of annotations
num_annotations = len(coco.getAnnIds())
print(f"Total annotations in the dataset: {num_annotations}")

# Total number of categories
num_categories = len(coco.getCatIds())
print(f"Total categories in the dataset: {num_categories}")

# List all categories
categories = coco.loadCats(coco.getCatIds())
print("Categories:", [cat['name'] for cat in categories])

print ("--------------------------------------------------------")
print ("--------------------------------------------------------")

# Map image IDs to annotations
img_to_anns = defaultdict(list)
for ann in coco.loadAnns(coco.getAnnIds()):
    img_to_anns[ann['image_id']].append(ann['category_id'])

# Calculate the number of unique classes per image
classes_per_image = [len(set(img_to_anns[img_id])) for img_id in coco.getImgIds()]

# Statistics
max_classes = max(classes_per_image)
min_classes = min(classes_per_image)
avg_classes = np.mean(classes_per_image)

print(f"Max classes per image: {max_classes}")
print(f"Min classes per image: {min_classes}")
print(f"Average classes per image: {avg_classes:.2f}")

print ("--------------------------------------------------------")
print ("--------------------------------------------------------")
# Example image entry
sample_image_id = coco.getImgIds()[0]
sample_image = coco.loadImgs(sample_image_id)[0]
print("Sample image entry:")
print(sample_image)

# Example annotation
sample_ann_id = coco.getAnnIds(imgIds=sample_image_id)[0]
sample_annotation = coco.loadAnns(sample_ann_id)[0]
print("Sample annotation entry:")
print(sample_annotation)


print ("--------------------------------------------------------")
print ("--------------------------------------------------------")




--------------------------------------------------------
Keys in the COCO annotations: ['info', 'licenses', 'categories', 'images', 'annotations']
--------------------------------------------------------
--------------------------------------------------------
Total images in the dataset: 29800
Total annotations in the dataset: 194539
Total categories in the dataset: 12
Categories: ['obstacles', 'biker', 'car', 'pedestrian', 'trafficLight', 'trafficLight-Green', 'trafficLight-GreenLeft', 'trafficLight-Red', 'trafficLight-RedLeft', 'trafficLight-Yellow', 'trafficLight-YellowLeft', 'truck']
--------------------------------------------------------
--------------------------------------------------------
Max classes per image: 7
Min classes per image: 0
Average classes per image: 1.88
--------------------------------------------------------
--------------------------------------------------------
Sample image entry:
{'id': 0, 'license': 1, 'file_name': '1478897026627294725_jpg.rf.6828a4e82

The output confirms that the dataset is in a **COCO-compatible format** and is suitable for use with **DETR**.

---

#### **1. Key Details About the Dataset**
- **Total Images**: 29,800
- **Total Annotations**: 194,539
- **Total Categories**: 12
- **Categories**:
  ```plaintext
  ['obstacles', 'biker', 'car', 'pedestrian', 'trafficLight', 
   'trafficLight-Green', 'trafficLight-GreenLeft', 'trafficLight-Red',
   'trafficLight-RedLeft', 'trafficLight-Yellow', 'trafficLight-YellowLeft', 'truck']
  ```

---

#### **2. Classes Per Image**
- **Max Classes per Image**: 7
  - Some images have up to 7 unique object categories.
- **Min Classes per Image**: 0
  - Some images have no objects annotated (annotations might be missing or labeled as empty).
- **Average Classes per Image**: 1.88
  - On average, each image contains nearly 2 unique categories.

---

#### **3. Annotation Format (Sample Entry)**
- **Image Entry**:
  ```python
  {'id': 0, 'license': 1, 'file_name': '1478897026627294725_jpg.rf.6828a4e821cbab4c2c277d74df291f00.jpg', 
   'height': 512, 'width': 512, 'date_captured': '2021-06-09T12:24:25+00:00'}
  ```
  - Contains `id`, `file_name`, and dimensions (`height` and `width`), which are essential for DETR training.

- **Annotation Entry**:
  ```python
  {'id': 0, 'image_id': 0, 'category_id': 2, 'bbox': [140, 262, 21, 25.5], 
   'area': 535.5, 'segmentation': [], 'iscrowd': 0}
  ```
  - **`image_id`** links the annotation to the corresponding image.
  - **`category_id`** identifies the object category.
  - **`bbox`** is in the correct COCO format `[x_min, y_min, width, height]`.
  - **`area`** is calculated (but not mandatory for DETR).
  - **`iscrowd`** is set to `0`, which is the expected value for single objects (not groups).

In [4]:
# Dataset splits
num_train = 25330
num_val = 2235
num_test = 2235

# Shuffle and split the dataset
all_image_ids = coco.getImgIds()
random.seed(42)  # Ensure reproducibility
random.shuffle(all_image_ids)

train_ids = all_image_ids[:num_train]
val_ids = all_image_ids[num_train:num_train + num_val]
test_ids = all_image_ids[num_train + num_val:num_train + num_val + num_test]

print(f"Training images: {len(train_ids)}, Validation images: {len(val_ids)}, Testing images: {len(test_ids)}")

Training images: 25330, Validation images: 2235, Testing images: 2235


### Create the Dataset Class

In [5]:
# Define the custom COCO dataset
class COCODataset(Dataset):
    def __init__(self, image_dir, coco, processor, image_ids):
        self.image_dir = image_dir
        self.coco = coco
        self.processor = processor
        self.image_ids = image_ids

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        image_id = self.image_ids[idx]
        image_info = self.coco.loadImgs(image_id)[0]
        image_path = os.path.join(self.image_dir, image_info["file_name"])
        image = Image.open(image_path).convert("RGB")

        # Get annotations
        ann_ids = self.coco.getAnnIds(imgIds=image_id)
        anns = self.coco.loadAnns(ann_ids)
        annotations = []

        for ann in anns:
            bbox = ann["bbox"]  # [x_min, y_min, width, height]
            area = bbox[2] * bbox[3]  # width * height
            annotations.append({
                "bbox": bbox,
                "category_id": ann["category_id"],  # Preserve original category IDs
                "area": area,
                "iscrowd": ann.get("iscrowd", 0),
            })

        # Prepare the target dictionary
        target = {"image_id": image_id, "annotations": annotations}

        # Process the image and annotations
        encoding = self.processor(images=image, annotations=target, return_tensors="pt")
        pixel_values = encoding["pixel_values"].squeeze(0)
        labels = encoding["labels"][0]
        return pixel_values, labels

### Prepare Datasets

In [6]:
# Path to images
image_dir = data_folder

# Load pre-trained processor
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")

# Create datasets for each split
train_dataset = COCODataset(image_dir, coco, processor, train_ids)
val_dataset = COCODataset(image_dir, coco, processor, val_ids)
test_dataset = COCODataset(image_dir, coco, processor, test_ids)

print(f"Datasets prepared: {len(train_dataset)} training, {len(val_dataset)} validation, {len(test_dataset)} testing.")

Datasets prepared: 25330 training, 2235 validation, 2235 testing.


### Create DataLoaders

In [7]:
# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))

print("DataLoaders ready.")

DataLoaders ready.


In [8]:
# Test a batch
for pixel_values, targets in train_loader:
    print(f"Pixel values shape: {pixel_values[0].shape}")
    print(f"Target example: {targets[0]}")
    break

The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


Pixel values shape: torch.Size([3, 800, 800])
Target example: {'size': tensor([800, 800]), 'image_id': tensor([24418]), 'class_labels': tensor([2, 2, 2, 2, 2, 2, 2]), 'boxes': tensor([[0.0996, 0.5605, 0.1992, 0.2148],
        [0.2197, 0.5493, 0.0645, 0.1338],
        [0.2827, 0.5352, 0.0850, 0.1133],
        [0.3262, 0.5288, 0.0469, 0.0850],
        [0.3545, 0.5234, 0.0332, 0.0820],
        [0.6870, 0.5322, 0.0967, 0.1152],
        [0.8516, 0.5488, 0.2539, 0.2383]]), 'area': tensor([27392.5781,  5518.7988,  6159.6680,  2548.8281,  1743.1641,  7130.1270,
        38720.7031]), 'iscrowd': tensor([0, 0, 0, 0, 0, 0, 0]), 'orig_size': tensor([512, 512])}


### Model Setup and Initial Training

#### **Modify the Classification Head**

In [9]:
# Initialize DETR with a custom configuration
num_classes = 12  # 12 object classes + 1 background (created automatically)
config = DetrConfig(num_labels=num_classes, ignore_mismatched_sizes=True, backbone="resnet50")
model = DetrForObjectDetection(config)

# Create the Backbone Using timm
# Create the pre-trained timm ResNet-50 backbone
timm_backbone = timm.create_model("resnet50", pretrained=True, features_only=True, out_indices=(1, 2, 3, 4))

# Transfer Weights and Buffers
for name_backbone, parameter_backbone in timm_backbone.named_parameters():
    for name, parameter in model.model.backbone.conv_encoder.model.named_parameters():
        if name_backbone == name:
            parameter.data.copy_(parameter_backbone.data)

# Transfer buffers (e.g., running mean and variance in BatchNorm)
for name_backbone, buffer_backbone in timm_backbone.named_buffers():
    for name, buffer in model.model.backbone.conv_encoder.model.named_buffers():
        if name_backbone == name:
            buffer.data.copy_(buffer_backbone.data)

print((model.model.backbone.conv_encoder.model.conv1.weight == timm_backbone.conv1.weight).all())
print("Number of classes:", model.class_labels_classifier.out_features)

tensor(True)
Number of classes: 13


#### Stage 1: Fine-Tuning the Model

Since our dataset is relatively small, freezing the ResNet backbone will prevent overfitting.

In [10]:
# Stage 1: Freeze the backbone
for param in model.model.backbone.parameters():
    param.requires_grad = False

Let's check the architechture of the model.

In [11]:
print(model)

DetrForObjectDetection(
  (model): DetrModel(
    (backbone): DetrConvModel(
      (conv_encoder): DetrConvEncoder(
        (model): FeatureListNet(
          (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
          (bn1): DetrFrozenBatchNorm2d()
          (act1): ReLU(inplace=True)
          (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
          (layer1): Sequential(
            (0): Bottleneck(
              (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (bn1): DetrFrozenBatchNorm2d()
              (act1): ReLU(inplace=True)
              (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
              (bn2): DetrFrozenBatchNorm2d()
              (drop_block): Identity()
              (act2): ReLU(inplace=True)
              (aa): Identity()
              (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      

Set Up Optimizer and Learning Rate Scheduler

In [12]:
# Optimizer for Stage 1
optimizer_stage1 = AdamW(model.parameters(), lr=0.001, weight_decay=1e-3)

# Learning rate scheduler for Stage 1
scheduler_stage1 = StepLR(optimizer_stage1, step_size=10, gamma=0.1)

# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

DetrForObjectDetection(
  (model): DetrModel(
    (backbone): DetrConvModel(
      (conv_encoder): DetrConvEncoder(
        (model): FeatureListNet(
          (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
          (bn1): DetrFrozenBatchNorm2d()
          (act1): ReLU(inplace=True)
          (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
          (layer1): Sequential(
            (0): Bottleneck(
              (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (bn1): DetrFrozenBatchNorm2d()
              (act1): ReLU(inplace=True)
              (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
              (bn2): DetrFrozenBatchNorm2d()
              (drop_block): Identity()
              (act2): ReLU(inplace=True)
              (aa): Identity()
              (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      

#### **Stage 1: Training Loop**

In [13]:
# Stage 1: Train the model with a frozen backbone
num_epochs_stage1 = 10  # Initial training epochs with frozen backbone

print("Starting Stage 1: Training with frozen backbone...")
total_training_start_time = time.time()
for epoch in range(num_epochs_stage1):
    model.train()
    total_loss = 0
    epoch_start_time = time.time()

    for step, (pixel_values, targets) in enumerate(train_loader):
        pixel_values = torch.stack(pixel_values).to(device)

        # Filter valid targets
        valid_targets = []
        for t in targets:
            if "class_labels" in t and len(t["class_labels"]) > 0:
                valid_targets.append({
                    "class_labels": t["class_labels"].to(device),
                    "boxes": t["boxes"].to(device)
                })

        while len(valid_targets) < len(pixel_values):
            valid_targets.append({
                "class_labels": torch.empty((0,), dtype=torch.int64).to(device),
                "boxes": torch.empty((0, 4), dtype=torch.float32).to(device)
            })

        if all(len(t["class_labels"]) == 0 for t in valid_targets):
            #print(f"Skipping batch {step}: No valid targets.")
            continue

        # Forward pass
        outputs = model(pixel_values=pixel_values, labels=valid_targets)
        loss = outputs.loss

        # Backward pass
        optimizer_stage1.zero_grad()
        loss.backward()
        optimizer_stage1.step()

        total_loss += loss.item()

    scheduler_stage1.step()

    epoch_end_time = time.time()
    print(f"Epoch {epoch + 1}/{num_epochs_stage1}, Loss: {total_loss / len(train_loader):.4f}, Time: {epoch_end_time - epoch_start_time:.2f} seconds")


Starting Stage 1: Training with frozen backbone...
Epoch 1/10, Loss: 3.5675, Time: 1259.18 seconds
Epoch 2/10, Loss: 3.3067, Time: 1291.24 seconds
Epoch 3/10, Loss: 3.2784, Time: 1299.65 seconds
Epoch 4/10, Loss: 3.2585, Time: 1363.01 seconds
Epoch 5/10, Loss: 3.2090, Time: 1301.09 seconds
Epoch 6/10, Loss: 3.2017, Time: 1305.78 seconds
Epoch 7/10, Loss: 3.2035, Time: 1295.48 seconds
Epoch 8/10, Loss: 3.2247, Time: 1290.83 seconds
Epoch 9/10, Loss: 3.1858, Time: 1257.32 seconds
Epoch 10/10, Loss: 3.1841, Time: 1294.08 seconds


#### Stage 2: Fine Tuning

Setup Optimizer

In [14]:
# Stage 2: Unfreeze the backbone and train the entire model
print("Starting Stage 2: Training with unfrozen backbone...")

for param in model.model.backbone.parameters():
    param.requires_grad = True

# Optimizer for Stage 2
optimizer_stage2 = AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)

# Learning rate scheduler for Stage 2
scheduler_stage2 = StepLR(optimizer_stage2, step_size=10, gamma=0.1)

Starting Stage 2: Training with unfrozen backbone...


#### **Stage 2: Training Loop**

In [15]:
num_epochs_stage2 = 5  # Remaining epochs for fine-tuning

for epoch in range(num_epochs_stage2):
    model.train()
    total_loss = 0
    epoch_start_time = time.time()

    for step, (pixel_values, targets) in enumerate(train_loader):
        pixel_values = torch.stack(pixel_values).to(device)

        valid_targets = []
        for t in targets:
            if "class_labels" in t and len(t["class_labels"]) > 0:
                valid_targets.append({
                    "class_labels": t["class_labels"].to(device),
                    "boxes": t["boxes"].to(device)
                })

        while len(valid_targets) < len(pixel_values):
            valid_targets.append({
                "class_labels": torch.empty((0,), dtype=torch.int64).to(device),
                "boxes": torch.empty((0, 4), dtype=torch.float32).to(device)
            })

        if all(len(t["class_labels"]) == 0 for t in valid_targets):
            #print(f"Skipping batch {step}: No valid targets.")
            continue

        outputs = model(pixel_values=pixel_values, labels=valid_targets)
        loss = outputs.loss

        optimizer_stage2.zero_grad()
        loss.backward()
        optimizer_stage2.step()

        total_loss += loss.item()

    scheduler_stage2.step()

    epoch_end_time = time.time()
    print(f"Epoch {epoch + 1}/{num_epochs_stage2}, Loss: {total_loss / len(train_loader):.4f}, Time: {epoch_end_time - epoch_start_time:.2f} seconds")


Epoch 1/5, Loss: 3.1169, Time: 1459.90 seconds
Epoch 2/5, Loss: 3.1069, Time: 1409.05 seconds
Epoch 3/5, Loss: 3.1050, Time: 1454.91 seconds
Epoch 4/5, Loss: 3.0984, Time: 1459.00 seconds
Epoch 5/5, Loss: 3.0973, Time: 1433.01 seconds


In [16]:
total_training_end_time = time.time()
print(f"Total Training Time: {total_training_end_time - total_training_start_time:.2f} seconds")

Total Training Time: 20173.56 seconds


Save the fine-tuned model

In [17]:
model.save_pretrained("DETR_BackboneTrained/detr-finetuned")
processor.save_pretrained("DETR_BackboneTrained/detr-processor")

['DETR_BackboneTrained/detr-processor/preprocessor_config.json']

#### Stage 3: Fine Tuning

In [18]:
# Stage 3: Additional fine-tuning for 10 epochs
print("Starting Stage 3: Additional fine-tuning...")
num_epochs_stage3 = 10  # Additional fine-tuning epochs

for epoch in range(num_epochs_stage3):
    model.train()
    total_loss = 0
    epoch_start_time = time.time()

    for step, (pixel_values, targets) in enumerate(train_loader):
        pixel_values = torch.stack(pixel_values).to(device)

        valid_targets = []
        for t in targets:
            if "class_labels" in t and len(t["class_labels"]) > 0:
                valid_targets.append({
                    "class_labels": t["class_labels"].to(device),
                    "boxes": t["boxes"].to(device)
                })

        while len(valid_targets) < len(pixel_values):
            valid_targets.append({
                "class_labels": torch.empty((0,), dtype=torch.int64).to(device),
                "boxes": torch.empty((0, 4), dtype=torch.float32).to(device)
            })

        if all(len(t["class_labels"]) == 0 for t in valid_targets):
            print(f"Skipping batch {step}: No valid targets.")
            continue

        outputs = model(pixel_values=pixel_values, labels=valid_targets)
        loss = outputs.loss

        optimizer_stage2.zero_grad()
        loss.backward()
        optimizer_stage2.step()

        total_loss += loss.item()

    scheduler_stage2.step()

    epoch_end_time = time.time()
    print(f"Epoch {epoch + 1}/{num_epochs_stage3}, Loss: {total_loss / len(train_loader):.4f}, Time: {epoch_end_time - epoch_start_time:.2f} seconds")

total_training_end_time = time.time()
print(f"Total Training Time: {total_training_end_time - total_training_start_time:.2f} seconds")

Starting Stage 3: Additional fine-tuning...
Skipping batch 1057: No valid targets.
Skipping batch 3036: No valid targets.
Skipping batch 3213: No valid targets.
Epoch 1/10, Loss: 3.0967, Time: 1460.79 seconds
Skipping batch 4472: No valid targets.
Epoch 2/10, Loss: 3.0974, Time: 1430.20 seconds
Skipping batch 5051: No valid targets.
Skipping batch 5250: No valid targets.
Epoch 3/10, Loss: 3.0951, Time: 1430.22 seconds
Skipping batch 2406: No valid targets.
Skipping batch 5561: No valid targets.
Skipping batch 6085: No valid targets.
Epoch 4/10, Loss: 3.0914, Time: 1434.37 seconds
Epoch 5/10, Loss: 3.0917, Time: 1437.66 seconds
Epoch 6/10, Loss: 3.0952, Time: 1420.01 seconds
Skipping batch 1972: No valid targets.
Epoch 7/10, Loss: 3.0934, Time: 1413.08 seconds
Skipping batch 6090: No valid targets.
Epoch 8/10, Loss: 3.0899, Time: 1436.00 seconds
Skipping batch 5718: No valid targets.
Epoch 9/10, Loss: 3.0849, Time: 1447.30 seconds
Skipping batch 2590: No valid targets.
Epoch 10/10, Loss

In [19]:
# Save the model after Stage 3
model.save_pretrained("DETR_BackboneTrained_30epochs")
processor.save_pretrained("DETR_BackboneTrained_30epochs/detr-processor")

['DETR_BackboneTrained_30epochs/detr-processor/preprocessor_config.json']

#### Stage 4: Fine Tuning

In [20]:
# Stage 4: Additional fine-tuning for 10 more epochs
print("Starting Stage 4: Additional fine-tuning...")
num_epochs_stage4 = 10  # Additional fine-tuning epochs

for epoch in range(num_epochs_stage4):
    model.train()
    total_loss = 0
    epoch_start_time = time.time()

    for step, (pixel_values, targets) in enumerate(train_loader):
        pixel_values = torch.stack(pixel_values).to(device)

        valid_targets = []
        for t in targets:
            if "class_labels" in t and len(t["class_labels"]) > 0:
                valid_targets.append({
                    "class_labels": t["class_labels"].to(device),
                    "boxes": t["boxes"].to(device)
                })

        while len(valid_targets) < len(pixel_values):
            valid_targets.append({
                "class_labels": torch.empty((0,), dtype=torch.int64).to(device),
                "boxes": torch.empty((0, 4), dtype=torch.float32).to(device)
            })

        if all(len(t["class_labels"]) == 0 for t in valid_targets):
            #print(f"Skipping batch {step}: No valid targets.")
            continue

        outputs = model(pixel_values=pixel_values, labels=valid_targets)
        loss = outputs.loss

        optimizer_stage2.zero_grad()
        loss.backward()
        optimizer_stage2.step()

        total_loss += loss.item()

    scheduler_stage2.step()

    epoch_end_time = time.time()
    print(f"Epoch {epoch + 1}/{num_epochs_stage4}, Loss: {total_loss / len(train_loader):.4f}, Time: {epoch_end_time - epoch_start_time:.2f} seconds")

Starting Stage 4: Additional fine-tuning...
Epoch 1/10, Loss: 3.0896, Time: 1428.01 seconds
Epoch 2/10, Loss: 3.0886, Time: 1438.90 seconds
Epoch 3/10, Loss: 3.0916, Time: 1438.33 seconds
Epoch 4/10, Loss: 3.0906, Time: 1527.62 seconds
Epoch 5/10, Loss: 3.0868, Time: 1526.95 seconds
Epoch 6/10, Loss: 3.0909, Time: 1503.87 seconds
Epoch 7/10, Loss: 3.0930, Time: 1453.90 seconds
Epoch 8/10, Loss: 3.0886, Time: 1423.42 seconds
Epoch 9/10, Loss: 3.0916, Time: 1496.36 seconds
Epoch 10/10, Loss: 3.0915, Time: 1488.25 seconds


In [21]:
# Save the model after Stage 4
model.save_pretrained("DETR_BackboneTrained_40epochs")
processor.save_pretrained("DETR_BackboneTrained_40epochs/detr-processor-stage4")

['DETR_BackboneTrained_40epochs/detr-processor-stage4/preprocessor_config.json']

#### Stage 5: Fine Tuning

In [22]:
# Stage 5: Additional fine-tuning for 10 more epochs
print("Starting Stage 5: Final fine-tuning stage...")
num_epochs_stage5 = 10  # Additional fine-tuning epochs

for epoch in range(num_epochs_stage5):
    model.train()
    total_loss = 0
    epoch_start_time = time.time()

    for step, (pixel_values, targets) in enumerate(train_loader):
        pixel_values = torch.stack(pixel_values).to(device)

        valid_targets = []
        for t in targets:
            if "class_labels" in t and len(t["class_labels"]) > 0:
                valid_targets.append({
                    "class_labels": t["class_labels"].to(device),
                    "boxes": t["boxes"].to(device)
                })

        while len(valid_targets) < len(pixel_values):
            valid_targets.append({
                "class_labels": torch.empty((0,), dtype=torch.int64).to(device),
                "boxes": torch.empty((0, 4), dtype=torch.float32).to(device)
            })

        if all(len(t["class_labels"]) == 0 for t in valid_targets):
            #print(f"Skipping batch {step}: No valid targets.")
            continue

        outputs = model(pixel_values=pixel_values, labels=valid_targets)
        loss = outputs.loss

        optimizer_stage2.zero_grad()
        loss.backward()
        optimizer_stage2.step()

        total_loss += loss.item()

    scheduler_stage2.step()

    epoch_end_time = time.time()
    print(f"Epoch {epoch + 1}/{num_epochs_stage5}, Loss: {total_loss / len(train_loader):.4f}, Time: {epoch_end_time - epoch_start_time:.2f} seconds")

Starting Stage 5: Final fine-tuning stage...
Epoch 1/10, Loss: 3.0906, Time: 1438.83 seconds
Epoch 2/10, Loss: 3.0874, Time: 1450.17 seconds
Epoch 3/10, Loss: 3.0899, Time: 1429.25 seconds
Epoch 4/10, Loss: 3.0862, Time: 1431.89 seconds
Epoch 5/10, Loss: 3.0882, Time: 1439.48 seconds
Epoch 6/10, Loss: 3.0882, Time: 1443.86 seconds
Epoch 7/10, Loss: 3.0939, Time: 1433.91 seconds
Epoch 8/10, Loss: 3.0893, Time: 1428.52 seconds
Epoch 9/10, Loss: 3.0928, Time: 1473.26 seconds
Epoch 10/10, Loss: 3.0915, Time: 1435.82 seconds


In [23]:
# Final training duration
final_training_end_time = time.time()
print(f"Total Training Time Across All Stages: {final_training_end_time - total_training_start_time:.2f} seconds")

Total Training Time Across All Stages: 108374.85 seconds


In [25]:
# Save the model after Stage 5
model.save_pretrained("DETR_BackboneTrained_50epochs")
processor.save_pretrained("DETR_BackboneTrained_50epochs/detr-processor-stage5")

['DETR_BackboneTrained_50epochs/detr-processor-stage5/preprocessor_config.json']

#### Stage 6 Fine-tuning

In [26]:
# Stage 6: Train the entire network (including the backbone) for 30 epochs
print("Starting Stage 6: Training the entire network for 30 epochs...")

# Unfreeze the backbone
for param in model.model.backbone.parameters():
    param.requires_grad = True

# Define a new optimizer for Stage 6
optimizer_stage6 = AdamW(model.parameters(), lr=5e-3, weight_decay=1e-3)  
scheduler_stage6 = StepLR(optimizer_stage6, step_size=10, gamma=0.1) 


Starting Stage 6: Training the entire network for 30 epochs...


In [27]:
# Number of epochs for Stage 6
num_epochs_stage6 = 30

# Start training for Stage 6
for epoch in range(num_epochs_stage6):
    model.train()
    total_loss = 0
    epoch_start_time = time.time()

    for step, (pixel_values, targets) in enumerate(train_loader):
        # Move pixel values to the device
        pixel_values = torch.stack(pixel_values).to(device)

        # Filter valid targets
        valid_targets = []
        for t in targets:
            if "class_labels" in t and len(t["class_labels"]) > 0:
                valid_targets.append({
                    "class_labels": t["class_labels"].to(device),
                    "boxes": t["boxes"].to(device)
                })

        # Add properly formatted dummy targets if necessary
        while len(valid_targets) < len(pixel_values):
            valid_targets.append({
                "class_labels": torch.empty((0,), dtype=torch.int64).to(device),
                "boxes": torch.empty((0, 4), dtype=torch.float32).to(device)
            })

        # Skip the batch if no valid targets exist
        if all(len(t["class_labels"]) == 0 for t in valid_targets):
            #print(f"Skipping batch {step}: No valid targets.")
            continue

        # Forward pass
        outputs = model(pixel_values=pixel_values, labels=valid_targets)
        loss = outputs.loss

        # Backward pass
        optimizer_stage6.zero_grad()
        loss.backward()
        optimizer_stage6.step()

        total_loss += loss.item()

    # Step the learning rate scheduler
    scheduler_stage6.step()

    # Record epoch duration
    epoch_end_time = time.time()
    print(f"Epoch {epoch + 1}/{num_epochs_stage6}, Loss: {total_loss / len(train_loader):.4f}, Time: {epoch_end_time - epoch_start_time:.2f} seconds")

Epoch 1/30, Loss: 3.6027, Time: 1414.52 seconds
Epoch 2/30, Loss: 3.5686, Time: 1452.51 seconds
Epoch 3/30, Loss: 3.6112, Time: 1418.59 seconds
Epoch 4/30, Loss: 3.6268, Time: 1424.36 seconds
Epoch 5/30, Loss: 3.5275, Time: 1436.30 seconds
Epoch 6/30, Loss: 3.5985, Time: 1488.77 seconds
Epoch 7/30, Loss: 3.5090, Time: 1437.03 seconds
Epoch 8/30, Loss: 3.5450, Time: 1469.96 seconds
Epoch 9/30, Loss: 3.5824, Time: 1477.51 seconds
Epoch 10/30, Loss: 3.5784, Time: 1439.48 seconds
Epoch 11/30, Loss: 3.2199, Time: 1456.53 seconds
Epoch 12/30, Loss: 3.1701, Time: 1465.53 seconds
Epoch 13/30, Loss: 3.1668, Time: 1481.28 seconds
Epoch 14/30, Loss: 3.1625, Time: 1457.98 seconds
Epoch 15/30, Loss: 3.1657, Time: 1431.68 seconds
Epoch 16/30, Loss: 3.1648, Time: 1468.02 seconds
Epoch 17/30, Loss: 3.1702, Time: 1448.78 seconds
Epoch 18/30, Loss: 3.1626, Time: 1459.74 seconds
Epoch 19/30, Loss: 3.1610, Time: 1500.99 seconds
Epoch 20/30, Loss: 3.1572, Time: 1447.51 seconds
Epoch 21/30, Loss: 3.1346, Ti

In [28]:
# Save the model and processor after Stage 6
model.save_pretrained("DETR_FullNetworkTrained_80epochs/detr-finetuned")
processor.save_pretrained("DETR/detr-processor-stage6")

# Final training time for Stage 6
stage6_end_time = time.time()
print(f"Stage 6 Training Time: {stage6_end_time - total_training_start_time:.2f} seconds")

Stage 6 Training Time: 222919.86 seconds


### Model Evaluation

### Explainability Techniques

##### Grad-CAM for DETR

##### Saliency Maps

### Conclusion