In [2]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
%pip install pycocotools opencv-python matplotlib

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Using cached https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp310-cp310-win_amd64.whl.metadata (28 kB)
Collecting torchvision
  Using cached https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp310-cp310-win_amd64.whl.metadata (6.3 kB)
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp310-cp310-win_amd64.whl.metadata (6.8 kB)
Collecting filelock (from torch)
  Using cached https://download.pytorch.org/whl/filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
Collecting networkx (from torch)
  Using cached https://download.pytorch.org/whl/networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Collecting fsspec (from torch)
  Using cached https://download.pytorch.org/whl/fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Collecting sympy==1.13.1 (from torch)
  Using cached https://download.pytorch.org/whl/sympy-1.13.1-py3-none-any.whl (6.2 MB)



In [7]:
# All imports are handled here

import os
import torch
import torchvision
from torch.utils.data import DataLoader
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.datasets import CocoDetection
from torchvision.transforms import functional as F

# We set up the dataset for Faster RCNN here

class CocoDetectionForFasterRCNN(CocoDetection):
    """COCO dataset that yields ``(image, target)`` pairs in the torchvision detection format.

    Each target is a dict with:

    * ``"boxes"``    -- float32 tensor of shape ``(N, 4)`` in ``[x1, y1, x2, y2]`` order
    * ``"labels"``   -- int64 tensor of shape ``(N,)`` holding the COCO ``category_id``
    * ``"image_id"`` -- tensor holding the COCO image id

    Args:
        img_folder: Directory containing the images.
        ann_file: Path to the COCO annotation JSON file.
        transforms: Optional callable applied to the image only (the boxes are
            not transformed), e.g. ``torchvision.transforms.functional.to_tensor``.
    """

    def __init__(self, img_folder, ann_file, transforms=None):
        super().__init__(img_folder, ann_file)
        # Stored under a private name; the image transform is applied manually
        # in __getitem__ after the target has been built.
        self._transforms = transforms

    def __getitem__(self, idx):
        """Return the image at ``idx`` together with its detection target dict."""
        img, targets = super().__getitem__(idx)
        img_id = self.ids[idx]

        boxes = []
        labels = []
        for obj in targets:
            # Crowd regions are not individual instances; leave them out.
            if obj.get("iscrowd", 0) != 0:
                continue
            x, y, w, h = obj["bbox"]
            # torchvision detection models assert that every ground-truth box
            # has positive width and height, so drop degenerate annotations.
            if w <= 0 or h <= 0:
                continue
            # COCO stores [x, y, width, height]; the model expects corner format.
            boxes.append([x, y, x + w, y + h])
            labels.append(obj["category_id"])

        # reshape(-1, 4) keeps the (N, 4) layout even when N == 0; without it an
        # image with no usable annotations yields a tensor of shape (0,), which
        # the model rejects.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        labels = torch.as_tensor(labels, dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": torch.tensor([img_id])
        }

        if self._transforms:
            img = self._transforms(img)

        return img, target

# Dataset layout: every location is derived from a single root folder

DATASET_PATH = "dataset_coco"
TRAIN_PATH, VAL_PATH = (os.path.join(DATASET_PATH, name) for name in ("train", "val"))
TRAIN_ANN, VAL_ANN = (
    os.path.join(DATASET_PATH, "annotations", name)
    for name in ("instances_train.json", "instances_val.json")
)
# Output size of the box-classification head. Per the original author's note,
# the annotations hold two real categories plus one unused category that was
# included by accident.
# NOTE(review): torchvision treats label 0 as background -- confirm that the
# category ids in the annotation files all fall in 0..2.
NUM_CLASSES = 3

# Prefer the GPU when one is visible to PyTorch
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Training and validation datasets; images are converted to tensors on access

train_dataset = CocoDetectionForFasterRCNN(
    img_folder=TRAIN_PATH, ann_file=TRAIN_ANN, transforms=F.to_tensor
)
val_dataset = CocoDetectionForFasterRCNN(
    img_folder=VAL_PATH, ann_file=VAL_ANN, transforms=F.to_tensor
)

def collate_fn(batch):
    """Transpose a batch of ``(image, target)`` samples into per-field tuples.

    Detection images can differ in size, so samples are kept as tuples
    instead of being stacked into a single tensor.

    Args:
        batch: Sequence of equally long sample tuples.

    Returns:
        A tuple with one inner tuple per sample field; ``()`` for an empty batch.
    """
    columns = zip(*batch)
    return tuple(columns)

# Seed PyTorch's RNG so the shuffle order and the freshly initialised box head
# are repeatable between runs (GPU kernels may still be non-deterministic).
SEED = 42
torch.manual_seed(SEED)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

# Load the Faster R-CNN ResNet-50 FPN model

from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights

# Start from the COCO-pretrained detector and swap the box predictor for one
# sized to this dataset's NUM_CLASSES.
weights = FasterRCNN_ResNet50_FPN_Weights.COCO_V1
model = fasterrcnn_resnet50_fpn(weights=weights)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, NUM_CLASSES)
model.to(device)

# Train the model here

import torch.optim as optim

# Hand the optimizer only the parameters that actually receive gradients.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.SGD(trainable_params, lr=0.005, momentum=0.9, weight_decay=0.0005)

num_epochs = 20
model.train()

for epoch in range(num_epochs):
    total_loss = 0.0
    for images, targets in train_loader:
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # In training mode the model returns a dict of loss terms; the
        # optimisation objective is their sum.
        loss_dict = model(images, targets)
        loss = sum(loss_dict.values())

        # Stop immediately on NaN/inf instead of writing it into the weights.
        if not torch.isfinite(loss):
            raise RuntimeError(
                f"Non-finite loss in epoch {epoch + 1}: "
                f"{ {k: v.item() for k, v in loss_dict.items()} }"
            )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Sum of the per-batch losses over the epoch (not an average).
    print(f"Epoch [{epoch+1}/{num_epochs}] - Total Loss: {total_loss:.4f}")


Using device: cuda
loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Epoch [1/20] - Total Loss: 44.9601
Epoch [2/20] - Total Loss: 28.5018
Epoch [3/20] - Total Loss: 23.4342
Epoch [4/20] - Total Loss: 21.2129
Epoch [5/20] - Total Loss: 19.9704
Epoch [6/20] - Total Loss: 18.1485
Epoch [7/20] - Total Loss: 17.0658
Epoch [8/20] - Total Loss: 16.8957
Epoch [9/20] - Total Loss: 15.0604
Epoch [10/20] - Total Loss: 14.3655
Epoch [11/20] - Total Loss: 13.5928
Epoch [12/20] - Total Loss: 12.9297
Epoch [13/20] - Total Loss: 12.7256
Epoch [14/20] - Total Loss: 12.4759
Epoch [15/20] - Total Loss: 11.6398
Epoch [16/20] - Total Loss: 11.3505
Epoch [17/20] - Total Loss: 11.6008
Epoch [18/20] - Total Loss: 10.7517
Epoch [19/20] - Total Loss: 10.4242
Epoch [20/20] - Total Loss: 9.8146


In [9]:
# Persist only the learned weights (the state_dict), not the module object itself.
torch.save(obj=model.state_dict(), f="fasterrcnn_model.pth")
print("✅ Model saved to fasterrcnn_model.pth")

✅ Model saved to fasterrcnn_model.pth


In [8]:
import os
import json
import time
import torch
from torch.utils.data import DataLoader
from torchvision.datasets import CocoDetection
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.transforms import functional as F
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

# Locations of the test images, their COCO annotation file and the trained checkpoint

TEST_IMG_DIR = "dataset_coco/test"
TEST_ANN_FILE = "dataset_coco/annotations/instances_test.json"
MODEL_PATH = "fasterrcnn_model.pth"  # same file name the save cell writes to
# Size of the box-classification head; must equal the value used when the
# checkpoint was trained. Per the original note, it includes one unused class.
NUM_CLASSES = 3

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Custom wrapper for the COCO dataset

class CocoDetectionForFRCNN(CocoDetection):
    """COCO dataset that yields ``(image_tensor, target)`` pairs for Faster R-CNN.

    The image is always converted with ``F.to_tensor``. The target is a dict with:

    * ``"boxes"``    -- float32 tensor of shape ``(N, 4)`` in ``[x1, y1, x2, y2]`` order
    * ``"labels"``   -- int64 tensor of shape ``(N,)`` holding the COCO ``category_id``
    * ``"image_id"`` -- tensor holding the COCO image id
    """

    def __getitem__(self, idx):
        """Return the tensor image at ``idx`` together with its target dict."""
        img, targets = super().__getitem__(idx)
        img_id = self.ids[idx]
        boxes = []
        labels = []
        for obj in targets:
            # Crowd regions are not individual instances; leave them out.
            if obj.get("iscrowd", 0) != 0:
                continue
            x, y, w, h = obj["bbox"]
            # Drop zero-area annotations; torchvision detection models reject
            # ground-truth boxes without positive width and height.
            if w <= 0 or h <= 0:
                continue
            # COCO stores [x, y, width, height]; convert to corner format.
            boxes.append([x, y, x + w, y + h])
            labels.append(obj["category_id"])
        # reshape(-1, 4) keeps the (N, 4) layout even for images with no usable
        # annotations, where as_tensor alone would produce shape (0,).
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": torch.tensor([img_id])
        }
        return F.to_tensor(img), target

def collate_fn(batch):
    """Regroup a list of samples into one tuple per sample field.

    Keeps variable-sized detection images as separate items rather than
    stacking them into a single tensor.

    Args:
        batch: Sequence of equally long sample tuples, e.g. ``(image, target)``.

    Returns:
        Tuple of tuples, one per field; ``()`` when ``batch`` is empty.
    """
    return tuple(field for field in zip(*batch))

def get_model_size(model_path):
    """Return the on-disk size of ``model_path`` in MiB, rounded to two decimals.

    Args:
        model_path: Path of an existing file.

    Returns:
        File size in units of 1024 * 1024 bytes.

    Raises:
        OSError: If the file does not exist or cannot be accessed.
    """
    bytes_per_mib = 1024 * 1024
    return round(os.stat(model_path).st_size / bytes_per_mib, 2)

def count_parameters(model):
    """Count the scalar elements of all parameters that require gradients.

    Args:
        model: Object whose ``parameters()`` yields items exposing
            ``requires_grad`` and ``numel()``.

    Returns:
        Total number of trainable elements (0 when there are none).
    """
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

# Reload the model here

# Rebuild the architecture with the same replaced box head as the training
# cell so that the checkpoint's keys and shapes line up.
model = fasterrcnn_resnet50_fpn(weights=None)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, NUM_CLASSES)
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine;
# weights_only restricts unpickling to tensors and plain containers.
model.load_state_dict(torch.load(MODEL_PATH, map_location=device, weights_only=True))
model.to(device)
model.eval()

# Load the test set here

test_dataset = CocoDetectionForFRCNN(TEST_IMG_DIR, TEST_ANN_FILE)
if len(test_dataset) == 0:
    # Fail early: the per-image timing below would otherwise divide by zero.
    raise RuntimeError(f"No images found in {TEST_ANN_FILE}; nothing to evaluate.")
test_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn)

# Code for inference and mAP evaluation

print("Running inference on test set...")
predictions = []
start = time.time()

for imgs, targets in test_loader:
    imgs = [img.to(device) for img in imgs]
    with torch.no_grad():
        outputs = model(imgs)

    for output, target in zip(outputs, targets):
        image_id = int(target['image_id'])
        boxes = output['boxes'].cpu().tolist()
        scores = output['scores'].cpu().tolist()
        labels = output['labels'].cpu().tolist()

        for box, score, label in zip(boxes, scores, labels):
            predictions.append({
                "image_id": image_id,
                "category_id": int(label),
                # The model emits [x1, y1, x2, y2]; COCO results use [x, y, w, h].
                "bbox": [box[0], box[1], box[2] - box[0], box[3] - box[1]],
                "score": float(score)
            })

# Wall-clock time of the whole loop (data loading and result conversion
# included), averaged over the number of test images.
total_time = time.time() - start
avg_time = total_time / len(test_dataset)

# Save predictions and compute the COCO mAP

with open("frcnn_test_predictions.json", "w") as f:
    json.dump(predictions, f)

if not predictions:
    # COCO.loadRes fails with an opaque IndexError on an empty result list,
    # so report the real problem instead.
    raise RuntimeError(
        "The model produced no detections on the test set; "
        "COCO evaluation cannot be run on an empty result list."
    )

coco_gt = COCO(TEST_ANN_FILE)
coco_dt = coco_gt.loadRes("frcnn_test_predictions.json")

evaluator = COCOeval(coco_gt, coco_dt, iouType='bbox')
evaluator.evaluate()
evaluator.accumulate()
evaluator.summarize()

# Evaluate all results here

# stats[0] is AP @ IoU=0.50:0.95 and stats[1] is AP @ IoU=0.50, matching the
# first two lines printed by summarize().
mAP_50_95 = evaluator.stats[0]
mAP_50 = evaluator.stats[1]
model_file_size = get_model_size(MODEL_PATH)
param_count = count_parameters(model) / 1e6  # in millions

# Summary

print(f"\nmAP@0.5:         {mAP_50:.4f}")
print(f"mAP@0.5:0.95:    {mAP_50_95:.4f}")
print(f"Inference Speed: {avg_time * 1000:.2f} ms/image")
print(f"Model File Size: {model_file_size:.2f} MB")
print(f"Trainable Params: {param_count:.2f} million")

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
Running inference on test set...
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.03s).
Accumulating evaluation results...
DONE (t=0.02s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.756
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.908
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.908
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.689
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.760
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.630
 Average Recall     (AR) @[ 