In [1]:
!pip install onnxruntime-gpu
!pip install psutil

Collecting onnxruntime-gpu
  Downloading onnxruntime_gpu-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting coloredlogs (from onnxruntime-gpu)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime-gpu)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime_gpu-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (280.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.8/280.8 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hIns

In [2]:
import time
import psutil
import numpy as np
import onnxruntime as ort
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

# Initialize NVML for GPU memory tracking
nvmlInit()
device_handle = nvmlDeviceGetHandleByIndex(0)  # Assuming single GPU

# Load ONNX model with GPU support
onnx_session = ort.InferenceSession("/content/resnet50_dog_cat.onnx", providers=["CUDAExecutionProvider"])

# Define input shape
IMG_SIZE = 224  # Modify this if needed

def preprocess_dummy_image():
    """Create a dummy image and preprocess it to match model input."""
    image = np.random.rand(IMG_SIZE, IMG_SIZE, 3).astype(np.float32)  # Random image
    image = image / 255.0  # Normalize
    image = np.transpose(image, (2, 0, 1))  # Convert to (C, H, W)
    image = np.expand_dims(image, axis=0)  # Add batch dimension
    return image

def benchmark_onnx_runtime(num_iterations=100):
    """Benchmark ONNX inference speed, GPU, and memory usage."""
    latencies = []
    gpu_memory_usages = []
    input_name = onnx_session.get_inputs()[0].name

    print(f"Running {num_iterations} inferences on ONNX Runtime (GPU)...")

    for _ in range(num_iterations):
        # Preprocess input
        image = preprocess_dummy_image()

        # Measure GPU memory before inference
        gpu_memory_before = nvmlDeviceGetMemoryInfo(device_handle).used / (1024 * 1024)  # Convert bytes to MB

        # Measure inference time
        start_time = time.time()
        _ = onnx_session.run(None, {input_name: image})
        end_time = time.time()

        # Measure GPU memory after inference
        gpu_memory_after = nvmlDeviceGetMemoryInfo(device_handle).used / (1024 * 1024)

        # Store metrics
        latencies.append((end_time - start_time) * 1000)  # Convert to ms
        gpu_memory_usages.append(gpu_memory_after - gpu_memory_before)

    # Compute statistics
    avg_latency = np.mean(latencies)
    max_latency = np.max(latencies)
    min_latency = np.min(latencies)
    avg_gpu_memory_usage = np.mean(gpu_memory_usages)
    throughput = num_iterations / (sum(latencies) / 1000)  # Inferences per second

    # Print results
    print("\n==== ONNX Runtime (GPU) Benchmark Results ====")
    print(f"Average Latency: {avg_latency:.2f} ms")
    print(f"Min Latency: {min_latency:.2f} ms")
    print(f"Max Latency: {max_latency:.2f} ms")
    print(f"Average GPU Memory Usage: {avg_gpu_memory_usage:.2f} MB")
    print(f"Throughput: {throughput:.2f} inferences per second")

if __name__ == "__main__":
    benchmark_onnx_runtime(num_iterations=100)



Running 100 inferences on ONNX Runtime (GPU)...

==== ONNX Runtime (GPU) Benchmark Results ====
Average Latency: 24.12 ms
Min Latency: 4.32 ms
Max Latency: 1659.68 ms
Average GPU Memory Usage: 1.38 MB
Throughput: 41.46 inferences per second
