In [1]:
import torch

# Report the PyTorch build and the CUDA / cuDNN versions it exposes.
# All values are collected first, then printed one "caption value" pair per line.
environment_report = (
    ("PyTorch version:", torch.__version__),
    ("CUDA available:", torch.cuda.is_available()),
    ("CUDA version:", torch.version.cuda),
    ("cuDNN version:", torch.backends.cudnn.version()),
)
for caption, value in environment_report:
    print(caption, value)

PyTorch version: 2.8.0+cu128
CUDA available: True
CUDA version: 12.8
cuDNN version: 91002


In [2]:
import onnxruntime

# Show the installed ONNX Runtime version, then the device string it reports,
# each on its own line.
print(onnxruntime.__version__, onnxruntime.get_device(), sep="\n")

1.22.0
GPU


<br>

# Instance Segmentation with Mask R-CNN and ONNX Runtime GPU

This notebook demonstrates how to:
1. Export a pre-trained Mask R-CNN model to ONNX format
2. Run the model on video using ONNX Runtime with GPU acceleration
3. Visualize instance segmentation results with colored masks

<br>

## 1. Export Mask R-CNN model to ONNX format

In [3]:
import os

import torch
import torchvision

# Define model input dimensions
model_width = 800
model_height = 800

# Destination of the exported model (single source of truth for the path)
onnx_model_path = "models/maskrcnn.onnx"

# The exporter writes straight to this path, so make sure the target
# directory exists on a fresh checkout.
os.makedirs(os.path.dirname(onnx_model_path), exist_ok=True)

# Load pretrained Mask R-CNN model.
# `pretrained=True` is deprecated in torchvision; the explicit weights enum
# requests the same COCO checkpoint without the deprecation warning.
weights = torchvision.models.detection.MaskRCNN_ResNet50_FPN_Weights.COCO_V1
model = torchvision.models.detection.maskrcnn_resnet50_fpn(weights=weights)
model.eval()

# Create dummy input - typical input size for Mask R-CNN
dummy_input = torch.randn(1, 3, model_height, model_width)

# Export the model to ONNX format.
# Only the leading axis of each tensor is declared dynamic: the number of
# detections varies from frame to frame.
torch.onnx.export(
    model,
    dummy_input,
    onnx_model_path,
    opset_version=11,
    input_names=["images"],
    output_names=["boxes", "labels", "scores", "masks"],
    dynamic_axes={
        "images": {0: "batch_size"},
        "boxes": {0: "num_detections"},
        "labels": {0: "num_detections"},
        "scores": {0: "num_detections"},
        "masks": {0: "num_detections"}
    }
)

print(f"Mask R-CNN model exported to {onnx_model_path} with input size {model_width}x{model_height}")

  torch.onnx.export(
  * torch.tensor(scale_factors[i], dtype=torch.float32)
  boxes_x = torch.min(boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
  boxes_y = torch.min(boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
  assert condition, message
  torch.tensor(s, dtype=torch.float32, device=boxes.device)
  / torch.tensor(s_orig, dtype=torch.float32, device=boxes.device)
  return torch.tensor(M + 2 * padding).to(torch.float32) / torch.tensor(M).to(torch.float32)
  _C._jit_pass_onnx_node_shape_type_inference(node, params_dict, opset_version)
  _C._jit_pass_onnx_graph_shape_type_inference(
  _C._jit_pass_onnx_graph_shape_type_inference(


Mask R-CNN model exported to models/maskrcnn.onnx with input size 800x800


<br>

## 2. Run Mask R-CNN model on video with ONNX Runtime GPU

In [4]:
import onnxruntime as ort
import cv2
import numpy as np
import time

# Report which execution providers this ONNX Runtime build offers
available_providers = ort.get_available_providers()
print("Available ONNX Runtime providers:", available_providers)
print("CUDA provider available:", 'CUDAExecutionProvider' in available_providers)

# Create the inference session for the exported Mask R-CNN model,
# requesting the CUDA execution provider
session = ort.InferenceSession("models/maskrcnn.onnx", providers=['CUDAExecutionProvider'])
print("Session providers:", session.get_providers())

# Keep the first model input's metadata; its name keys the feed dict at inference time
input_details = session.get_inputs()[0]
print(f"Input name: {input_details.name}")

# List every model output together with its declared shape
for out_index, out_meta in enumerate(session.get_outputs()):
    print(f"Output {out_index}: {out_meta.name}, shape: {out_meta.shape}")

# Read the class names, one per line; a name's position in the list is the
# label id it is looked up by later.
# NOTE(review): the recorded run loaded 81 names - confirm that the file's line
# order matches the label ids the exported model emits, otherwise names will be
# shifted or fall back to "class_<id>".
with open('resources/coco_labels_rcnn.txt', 'r') as label_file:
    coco_classes = [raw_line.strip() for raw_line in label_file]
print(f"Loaded {len(coco_classes)} class names")
print(f"Using model input dimensions: {model_width}x{model_height}")

Available ONNX Runtime providers: ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
CUDA provider available: True
Session providers: ['CUDAExecutionProvider', 'CPUExecutionProvider']
Input name: images
Output 0: boxes, shape: ['num_detections', 4]
Output 1: labels, shape: ['num_detections']
Output 2: scores, shape: ['num_detections']
Output 3: masks, shape: ['num_detections', 1, 'Unsqueezemasks_dim_2', 'Unsqueezemasks_dim_3']
Loaded 81 class names
Using model input dimensions: 800x800


[0;93m2025-08-24 16:21:43.598821202 [W:onnxruntime:, transformer_memcpy.cc:83 ApplyImpl] 2 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2025-08-24 16:21:43.599753135 [W:onnxruntime:, transformer_memcpy.cc:83 ApplyImpl] 12 Memcpy nodes are added to the graph sub_graph4 for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2025-08-24 16:21:43.605679264 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-08-24

In [5]:
# Define helper functions for video processing
def preprocess_frame(frame, target_size=(model_width, model_height)):
    """Turn a BGR video frame into a float32 batch of one image in CHW layout.

    Args:
        frame: Image array in BGR channel order (it is converted with
            cv2.COLOR_BGR2RGB).
        target_size: (width, height) handed to cv2.resize. The default is the
            notebook's model_width / model_height as they were when this
            function was defined.

    Returns:
        float32 array with a leading batch axis of size 1, channels first,
        values divided by 255.
    """
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Resize, then scale pixel values into [0, 1]
    scaled = cv2.resize(rgb, target_size).astype(np.float32) / 255.0

    # HWC -> CHW, then prepend the batch axis
    return scaled.transpose(2, 0, 1)[np.newaxis, ...]

def postprocess_detections(outputs, original_shape, conf_threshold=0.5):
    """Filter raw Mask R-CNN outputs by score and map them onto the original frame.

    Args:
        outputs: Sequence whose first four items are boxes (N, 4), labels (N,),
            scores (N,) and masks (N, 1, H, W).
        original_shape: Shape of the original frame; only the first two
            entries (height, width) are used.
        conf_threshold: Detections with a score not strictly above this value
            are dropped.

    Returns:
        Tuple (boxes, labels, scores, masks) holding only the kept detections.
        When at least one detection is kept, boxes are rescaled from the
        notebook-level model_width / model_height to the frame size and each
        mask is resized to the frame size.

    Side effects:
        Prints the number of kept detections on every call.
    """
    boxes, labels, scores, masks = outputs[0], outputs[1], outputs[2], outputs[3]

    # Drop a leading batch axis where one is present
    if boxes.ndim == 3:
        boxes = boxes[0]
    if labels.ndim == 2:
        labels = labels[0]
    if scores.ndim == 2:
        scores = scores[0]
    # (N, 1, H, W) -> (N, H, W)
    if masks.ndim == 4 and masks.shape[1] == 1:
        masks = masks.squeeze(1)

    # Boolean indexing yields copies, so the in-place scaling below never
    # touches the arrays passed in by the caller.
    keep = scores > conf_threshold
    kept_boxes, kept_labels, kept_scores, kept_masks = (
        arr[keep] for arr in (boxes, labels, scores, masks)
    )

    print(f"Detections after filtering: {len(kept_scores)}")

    frame_h, frame_w = original_shape[:2]
    if len(kept_boxes) == 0:
        # Nothing to rescale
        return kept_boxes, kept_labels, kept_scores, kept_masks

    # Model-input coordinates -> frame coordinates
    kept_boxes[:, [0, 2]] *= frame_w / model_width   # x1, x2
    kept_boxes[:, [1, 3]] *= frame_h / model_height  # y1, y2

    if len(kept_masks) > 0:
        kept_masks = np.array(
            [cv2.resize(single_mask, (frame_w, frame_h)) for single_mask in kept_masks]
        )

    return kept_boxes, kept_labels, kept_scores, kept_masks

def apply_mask_overlay(frame, mask, color=(0, 255, 0), alpha=0.15):
    """Tint the pixels selected by one mask, leaving every other pixel untouched.

    Blending the whole frame against an overlay that is black outside the
    mask would dim all unmasked pixels by `alpha`, and repeatedly so when
    called once per object. The blend is therefore restricted to the masked
    pixels.

    Args:
        frame: Image array of shape (H, W, C); assumed to hold 0-255 values
            (e.g. uint8 frames read by cv2). It is not modified.
        mask: Array of shape (H, W); pixels with a value above 0.5 are tinted.
        color: Per-channel tint, in the same channel order as `frame`.
        alpha: Tint opacity in [0, 1]; 0 leaves the frame unchanged.

    Returns:
        A new array with the same shape and dtype as `frame`.
    """
    # Create binary mask (True where mask > 0.5)
    mask_binary = np.asarray(mask) > 0.5

    result = frame.copy()
    if not mask_binary.any():
        return result

    # Blend in float to avoid uint8 wrap-around, then round back
    tint = np.asarray(color, dtype=np.float32)
    blended = result[mask_binary].astype(np.float32) * (1.0 - alpha) + tint * alpha
    result[mask_binary] = np.clip(np.rint(blended), 0, 255).astype(frame.dtype)
    return result

def apply_all_masks_at_once(frame, masks, color=(0, 255, 0), alpha=0.15):
    """Tint the union of several masks in a single pass.

    Overlapping masks are merged first so that a pixel covered by several
    objects is tinted only once. The blend is restricted to the merged
    region: blending the whole frame against an overlay that is black outside
    the masks would dim every unmasked pixel by `alpha`.

    Args:
        frame: Image array of shape (H, W, C); assumed to hold 0-255 values
            (e.g. uint8 frames read by cv2). It is not modified.
        masks: Iterable of (H, W) arrays; values above 0.5 count as "inside".
            Masks whose shape differs from the frame's (H, W) are skipped.
        color: Per-channel tint, in the same channel order as `frame`.
        alpha: Tint opacity in [0, 1]; 0 leaves the frame unchanged.

    Returns:
        A new array with the same shape and dtype as `frame`.
    """
    frame_hw = (frame.shape[0], frame.shape[1])

    # Union of all masks that match the frame size
    combined_mask = np.zeros(frame_hw, dtype=bool)
    for mask in masks:
        if mask.shape == frame_hw:
            combined_mask |= mask > 0.5

    result = frame.copy()
    if not combined_mask.any():
        return result

    # Blend in float to avoid uint8 wrap-around, then round back
    tint = np.asarray(color, dtype=np.float32)
    blended = result[combined_mask].astype(np.float32) * (1.0 - alpha) + tint * alpha
    result[combined_mask] = np.clip(np.rint(blended), 0, 255).astype(frame.dtype)
    return result

# Define colors for different classes (BGR format: each tuple is (blue, green, red))
colors = [
    (0, 255, 0),    # Green
    (255, 0, 0),    # Blue
    (0, 0, 255),    # Red
    (255, 255, 0),  # Cyan
    (255, 0, 255),  # Magenta
    (0, 255, 255),  # Yellow
    (128, 0, 128),  # Purple
    (0, 165, 255),  # Orange  (was given in RGB order)
    (255, 128, 0),  # Light Blue  (was given in RGB order)
    (0, 128, 128)   # Olive  (was given in RGB order)
]

In [6]:
# Process video with Mask R-CNN
# You can change the video path and confidence threshold here
video_path = "resources/test_video_street.mp4"
confidence_threshold = 0.5

# Visualization options
show_masks = True  # Set to False to completely disable masks
mask_alpha = 0.2   # Mask transparency (0.2 = 20% opacity)
mask_color = (0, 255, 0)  # Single color (green) for all masks

# Number of frames between FPS read-out refreshes
fps_update_interval = 10

# Open video and fail loudly if it cannot be read; otherwise the loop below
# would be skipped and the cell would still report success.
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    cap.release()
    raise RuntimeError(f"Could not open video: {video_path}")

# Nominal size from the container metadata; refreshed from each decoded frame below
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# FPS calculation variables
fps_counter = 0
fps_start_time = time.time()
fps_display = 0.0

# Determine device string for display
device_str = "GPU" if 'CUDAExecutionProvider' in session.get_providers() else "CPU"
print(f"\nRunning Mask R-CNN with ONNX Runtime on {device_str}...")
print("Press 'q' to quit the video display")

try:
    # Loop through video frames
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Use the size of the frame actually decoded: the masks are resized to
        # frame.shape, so comparing against possibly different metadata values
        # could silently discard every mask.
        height, width = frame.shape[:2]

        # Preprocess frame
        input_tensor = preprocess_frame(frame)

        # Run inference. Errors are reported per frame and playback continues
        # (best effort), so one bad frame does not abort the whole video.
        try:
            outputs = session.run(None, {input_details.name: input_tensor})

            # Post-process
            boxes, labels, scores, masks = postprocess_detections(
                outputs, frame.shape, conf_threshold=confidence_threshold
            )

            # First, apply all masks at once so overlapping objects are tinted only once
            if show_masks and len(masks) > 0:
                # Make sure we have valid masks before applying
                valid_masks = [mask for mask in masks if mask.shape == (height, width)]
                if valid_masks:
                    frame = apply_all_masks_at_once(frame, valid_masks, mask_color, mask_alpha)

            # Then draw bounding boxes and labels
            for box, label, score in zip(boxes, labels, scores):
                if len(box) >= 4:
                    x1, y1, x2, y2 = map(int, box[:4])

                    # Ensure coordinates are within frame bounds
                    x1, y1 = max(0, x1), max(0, y1)
                    x2, y2 = min(width, x2), min(height, y2)

                    # Draw bounding box
                    cv2.rectangle(frame, (x1, y1), (x2, y2), mask_color, 3)

                    # Draw label
                    class_name = coco_classes[int(label)] if int(label) < len(coco_classes) else f"class_{int(label)}"
                    label_text = f"{class_name}: {score:.2f}"
                    label_size = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)[0]

                    # Normally the label sits just above the box; when the box
                    # touches the top edge, push it down so it stays visible.
                    label_bottom = max(y1, label_size[1] + 10)
                    cv2.rectangle(frame, (x1, label_bottom - label_size[1] - 10),
                                  (x1 + label_size[0] + 10, label_bottom), mask_color, -1)
                    cv2.putText(frame, label_text, (x1 + 5, label_bottom - 5),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

        except Exception as e:
            print(f"Inference error: {e}")

        # Add title at top middle of screen
        title_text = "Mask R-CNN Instance Segmentation"
        title_size = cv2.getTextSize(title_text, cv2.FONT_HERSHEY_SIMPLEX, 1.0, 2)[0]
        title_x = (width - title_size[0]) // 2
        cv2.rectangle(frame, (title_x - 10, 10), (title_x + title_size[0] + 10, 50), (0, 0, 0), -1)
        cv2.putText(frame, title_text, (title_x, 40), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2)

        # Calculate and display FPS (refreshed every fps_update_interval frames)
        fps_counter += 1
        if fps_counter % fps_update_interval == 0:
            fps_end_time = time.time()
            elapsed = fps_end_time - fps_start_time
            if elapsed > 0:  # guard against a zero-length interval on coarse clocks
                fps_display = fps_update_interval / elapsed
            fps_start_time = fps_end_time

        # Draw FPS
        fps_text = f"FPS: {fps_display:.1f} (Mask R-CNN-{device_str})"
        cv2.rectangle(frame, (5, height - 40), (280, height - 10), (0, 0, 0), -1)
        cv2.putText(frame, fps_text, (10, height - 20), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

        # Display
        cv2.imshow("Mask R-CNN ONNX Runtime", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release everything, even if the kernel is interrupted or a frame raises
    cap.release()
    cv2.destroyAllWindows()

print("Video processing completed!")


Running Mask R-CNN with ONNX Runtime on GPU...
Press 'q' to quit the video display
Detections after filtering: 6
Detections after filtering: 7
Detections after filtering: 9
Detections after filtering: 5
Detections after filtering: 5
Detections after filtering: 6
Detections after filtering: 7
Detections after filtering: 8
Detections after filtering: 8
Detections after filtering: 7
Detections after filtering: 11
Detections after filtering: 8
Detections after filtering: 6
Detections after filtering: 8
Detections after filtering: 9
Detections after filtering: 8
Video processing completed!
