In [2]:
import torch

# Report the PyTorch build and the CUDA/cuDNN stack it was compiled against
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

print(f"cuDNN version: {torch.backends.cudnn.version()}")

PyTorch version: 2.8.0+cu128
CUDA available: True
CUDA version: 12.8
cuDNN version: 91002


In [3]:
import onnxruntime

# Show the installed ONNX Runtime version and the device it reports, one per line
print(onnxruntime.__version__, onnxruntime.get_device(), sep="\n")

1.22.0
GPU


<br>

# Object Detection with Faster R-CNN and ONNX Runtime GPU

This notebook demonstrates how to:
1. Export a pre-trained Faster R-CNN model to ONNX format
2. Run the model on video using ONNX Runtime with GPU acceleration
3. Visualize object detection results in real time

<br>

## 1. Export Faster R-CNN model to ONNX format

In [4]:
from pathlib import Path

import torch
import torchvision
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights

# Define model input dimensions (also used by the preprocessing and
# post-processing cells further down)
model_width = 800
model_height = 800

# Destination of the exported graph. Create the folder up front so the export
# does not fail on a fresh checkout where "models/" does not exist yet.
onnx_path = Path("models/fasterrcnn.onnx")
onnx_path.parent.mkdir(parents=True, exist_ok=True)

# Load pretrained Faster R-CNN model.
# `pretrained=True` is deprecated in torchvision; the `weights=` enum is the
# supported replacement, and COCO_V1 is the checkpoint `pretrained=True` mapped to.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(
    weights=FasterRCNN_ResNet50_FPN_Weights.COCO_V1
)
model.eval()  # inference mode: required before tracing/exporting

# Create dummy input used to trace the model: one RGB image in NCHW layout
dummy_input = torch.randn(1, 3, model_height, model_width)

# Export the model to ONNX format
torch.onnx.export(
    model,
    dummy_input,
    str(onnx_path),
    opset_version=11,
    input_names=["images"],
    output_names=["boxes", "labels", "scores"],
    dynamic_axes={
        "images": {0: "batch_size"},
        # The number of detections varies per image, so axis 0 of every
        # output is declared dynamic.
        "boxes": {0: "num_detections"},
        "labels": {0: "num_detections"},
        "scores": {0: "num_detections"}
    }
)

print(f"Faster R-CNN model exported to models/fasterrcnn.onnx with input size {model_width}x{model_height}")

  torch.onnx.export(
  * torch.tensor(scale_factors[i], dtype=torch.float32)
  boxes_x = torch.min(boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
  boxes_y = torch.min(boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
  assert condition, message
  torch.tensor(s, dtype=torch.float32, device=boxes.device)
  / torch.tensor(s_orig, dtype=torch.float32, device=boxes.device)


Faster R-CNN model exported to models/fasterrcnn.onnx with input size 800x800


<br>

## 2. Run Faster R-CNN model on video with ONNX Runtime GPU

In [5]:
import onnxruntime as ort
import cv2
import numpy as np
import time

# Report which execution providers this ONNX Runtime build exposes
available_providers = ort.get_available_providers()
print("Available ONNX Runtime providers:", available_providers)
print("CUDA provider available:", 'CUDAExecutionProvider' in available_providers)

# Create an inference session for the exported Faster R-CNN graph, requesting CUDA
session = ort.InferenceSession("models/fasterrcnn.onnx", providers=['CUDAExecutionProvider'])
print("Session providers:", session.get_providers())

# The graph has a single image input; keep its metadata for session.run()
input_details = session.get_inputs()[0]
print(f"Input name: {input_details.name}")

# List every graph output with its declared (possibly symbolic) shape
for idx, out_meta in enumerate(session.get_outputs()):
    print(f"Output {idx}: {out_meta.name}, shape: {out_meta.shape}")

# Read the class names, one per line; the list index is used as the label id
with open('resources/coco_labels_rcnn.txt', 'r') as labels_file:
    coco_classes = [entry.strip() for entry in labels_file]
print(f"Loaded {len(coco_classes)} class names")
print(f"Using model input dimensions: {model_width}x{model_height}")

Available ONNX Runtime providers: ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
CUDA provider available: True
Session providers: ['CUDAExecutionProvider', 'CPUExecutionProvider']
Input name: images
Output 0: boxes, shape: ['num_detections', 4]
Output 1: labels, shape: ['num_detections']
Output 2: scores, shape: ['num_detections']
Loaded 81 class names
Using model input dimensions: 800x800


[0;93m2025-08-24 16:20:49.851928577 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-08-24 16:20:49.851958398 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


In [6]:
def preprocess_frame(frame, target_size=(model_width, model_height)):
    """Convert an OpenCV BGR frame into a batched CHW float32 array.

    Args:
        frame: Image in BGR channel order, as returned by ``cv2.VideoCapture.read``.
        target_size: ``(width, height)`` passed to ``cv2.resize``. The default is
            captured from ``model_width``/``model_height`` when this function
            is defined.

    Returns:
        ``np.float32`` array with a leading batch axis of size 1, channels
        first, with values divided by 255 (assumes 8-bit input pixels).
    """
    # BGR -> RGB, then resize to the network input size
    rgb_resized = cv2.resize(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), target_size)

    # Scale pixel values into [0, 1]
    scaled = rgb_resized.astype(np.float32) / 255.0

    # HWC -> CHW, then prepend the batch axis
    return scaled.transpose(2, 0, 1)[np.newaxis, ...]

def postprocess_detections(outputs, original_shape, conf_threshold=0.5, model_size=None):
    """Filter Faster R-CNN detections by score and rescale boxes to the frame.

    Args:
        outputs: Sequence of exactly three arrays ``(boxes, labels, scores)``,
            in the order the model was exported with. A leading batch axis
            on any of them is stripped.
        original_shape: Shape of the original frame; only the first two
            entries ``(height, width)`` are used.
        conf_threshold: Detections with ``score <= conf_threshold`` are dropped.
        model_size: ``(width, height)`` of the model input that the boxes are
            expressed in. When ``None`` (default) the notebook-level
            ``model_width``/``model_height`` are used, which matches the
            previous behaviour.

    Returns:
        ``(boxes, labels, scores)`` for the surviving detections, with boxes
        in original-frame pixel coordinates. If ``outputs`` does not contain
        exactly three entries, a message is printed and three empty lists
        are returned.
    """
    if len(outputs) != 3:
        # Unknown export layout: report it and return nothing rather than guess.
        print(f"Unexpected number of outputs: {len(outputs)}")
        return [], [], []

    boxes, labels, scores = (np.asarray(out) for out in outputs)

    # Remove batch dimension if present
    if boxes.ndim == 3:
        boxes = boxes[0]
    if labels.ndim == 2:
        labels = labels[0]
    if scores.ndim == 2:
        scores = scores[0]

    # Filter by confidence threshold. Boolean indexing returns copies, so the
    # in-place scaling below never touches the caller's arrays.
    keep = scores > conf_threshold
    filtered_boxes = boxes[keep]
    filtered_labels = labels[keep]
    filtered_scores = scores[keep]

    # In-place multiplication by a float fails on integer arrays; promote first.
    if not np.issubdtype(filtered_boxes.dtype, np.floating):
        filtered_boxes = filtered_boxes.astype(np.float32)

    # Scale boxes from model-input coordinates to original image coordinates
    orig_h, orig_w = original_shape[:2]
    if len(filtered_boxes) > 0:
        if model_size is None:
            # Resolved lazily so the globals are only needed when there is
            # something to scale, as in the original implementation.
            model_size = (model_width, model_height)
        net_w, net_h = model_size
        filtered_boxes[:, [0, 2]] *= orig_w / net_w  # Scale x coordinates
        filtered_boxes[:, [1, 3]] *= orig_h / net_h  # Scale y coordinates

    return filtered_boxes, filtered_labels, filtered_scores

In [7]:
# Process video with Faster R-CNN
# You can change the video path and confidence threshold here
video_path = "resources/test_video_street.mp4"
confidence_threshold = 0.5

# Open video. Fail loudly if it cannot be opened: otherwise the loop below is
# skipped and the cell would still report "Video processing completed!".
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    cap.release()
    raise RuntimeError(f"Could not open video: {video_path}")
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# FPS calculation variables (perf_counter is monotonic, so the measured
# interval cannot go backwards if the system clock is adjusted)
fps_counter = 0
fps_start_time = time.perf_counter()
fps_display = 0.0

# Determine device string for display
device_str = "GPU" if 'CUDAExecutionProvider' in session.get_providers() else "CPU"
print(f"\nRunning Faster R-CNN with ONNX Runtime on {device_str}...")
print("Press 'q' to quit the video display")

try:
    # Loop through video frames
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Preprocess frame
        input_tensor = preprocess_frame(frame)

        # Run inference; a failure on one frame is reported and the frame is
        # still displayed without detections.
        try:
            outputs = session.run(None, {input_details.name: input_tensor})

            # Post-process
            boxes, labels, scores = postprocess_detections(outputs, frame.shape, conf_threshold=confidence_threshold)

            # Draw detections
            for box, label, score in zip(boxes, labels, scores):
                if len(box) >= 4:
                    x1, y1, x2, y2 = map(int, box[:4])

                    # Ensure coordinates are within frame bounds
                    x1, y1 = max(0, x1), max(0, y1)
                    x2, y2 = min(width, x2), min(height, y2)

                    # Draw bounding box
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

                    # Draw label (fall back to a generic name for unknown ids)
                    class_name = coco_classes[int(label)] if int(label) < len(coco_classes) else f"class_{int(label)}"
                    label_text = f"{class_name}: {score:.2f}"
                    label_size = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)[0]
                    cv2.rectangle(frame, (x1, y1 - label_size[1] - 10),
                                  (x1 + label_size[0], y1), (0, 255, 0), -1)
                    cv2.putText(frame, label_text, (x1, y1 - 5),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)

        except Exception as e:
            print(f"Inference error: {e}")

        # Add title at top middle of screen
        title_text = "Faster R-CNN Object Detection"
        title_size = cv2.getTextSize(title_text, cv2.FONT_HERSHEY_SIMPLEX, 1.0, 2)[0]
        title_x = (width - title_size[0]) // 2
        cv2.rectangle(frame, (title_x - 10, 10), (title_x + title_size[0] + 10, 50), (0, 0, 0), -1)
        cv2.putText(frame, title_text, (title_x, 40), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2)

        # Calculate FPS as an average over the last 10 frames
        fps_counter += 1
        if fps_counter % 10 == 0:
            fps_end_time = time.perf_counter()
            elapsed = fps_end_time - fps_start_time
            if elapsed > 0:  # avoid ZeroDivisionError on a zero-length interval
                fps_display = 10 / elapsed
            fps_start_time = fps_end_time

        # Draw FPS; the backdrop is at least as wide as before and grows with
        # the text so the label is never wider than its background.
        fps_text = f"FPS: {fps_display:.1f} (Faster R-CNN-{device_str})"
        fps_size = cv2.getTextSize(fps_text, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)[0]
        fps_box_right = max(280, 10 + fps_size[0] + 5)
        cv2.rectangle(frame, (5, height - 40), (fps_box_right, height - 10), (0, 0, 0), -1)
        cv2.putText(frame, fps_text, (10, height - 20), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

        # Display
        cv2.imshow("Faster R-CNN ONNX Runtime", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release everything, even when the loop is interrupted or raises
    cap.release()
    cv2.destroyAllWindows()

print("Video processing completed!")


Running Faster R-CNN with ONNX Runtime on GPU...
Press 'q' to quit the video display
Video processing completed!
