In [1]:
# install dependencies
!apt-get update && apt-get install -y libnvinfer-dev libnvinfer-plugin-dev
!pip install onnx onnxruntime tensorrt onnx_graphsurgeon fastapi uvicorn nest-asyncio
!pip install fastapi uvicorn pyngrok pycuda pillow numpy tensorrt python-multipart

Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:10 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,802 kB]
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,773 kB]
Get:13 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packa

In [11]:
import time
import os
import psutil
import numpy as np
import onnxruntime as ort
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
from PIL import Image

# Paths
onnx_model_path = "/content/resnet50_dog_cat.onnx"
tensorrt_engine_path = "/content/resnet50_dog_cat.trt"
image_folder = "/content/test_images"

# Load ONNX model with GPU provider
onnx_session = ort.InferenceSession(onnx_model_path, providers=["CUDAExecutionProvider"])
onnx_input_name = onnx_session.get_inputs()[0].name

# Load TensorRT engine
def load_trt_engine(engine_path):
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())

tensorrt_engine = load_trt_engine(tensorrt_engine_path)
context = tensorrt_engine.create_execution_context()

def preprocess_image(image_path):
    image = Image.open(image_path).convert("RGB")
    image = image.resize((224, 224))
    image_array = np.asarray(image).astype(np.float32) / 255.0
    image_array = np.transpose(image_array, (2, 0, 1))
    image_array = np.expand_dims(image_array, axis=0)
    return image_array

def load_batch_images(image_folder, batch_size=8):
    image_files = [os.path.join(image_folder, f) for f in os.listdir(image_folder) if f.endswith(("jpg", "png"))]
    images = [preprocess_image(img) for img in image_files[:batch_size]]
    images = np.vstack(images)
    return images, image_files[:batch_size]

def run_onnx_inference(session, images, input_name):
    return session.run(None, {input_name: images})[0]

def run_trt_inference(engine, context, images):
    # Ensure the input array is contiguous
    images = np.ascontiguousarray(images) # This line is added to make the array contiguous

    d_input = cuda.mem_alloc(images.nbytes)
    d_output = cuda.mem_alloc(images.nbytes)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()

    cuda.memcpy_htod_async(d_input, images, stream)
    # Changed from execute_async_v2 to execute_v2
    context.execute_v2(bindings=bindings)
    output = np.empty_like(images)
    cuda.memcpy_dtoh_async(output, d_output, stream)
    stream.synchronize()
    return output

def benchmark_model(session, context, images, input_name, model_name, num_runs=50):
    times, gpu_usages, memory_usages = [], [], []

    for _ in range(num_runs):
        start_time = time.time()
        start_gpu = psutil.virtual_memory().percent
        start_memory = psutil.virtual_memory().used / (1024 * 1024)

        if model_name == "ONNX":
            result = run_onnx_inference(session, images, input_name)
        else:
            result = run_trt_inference(tensorrt_engine, context, images)

        end_time = time.time()
        end_gpu = psutil.virtual_memory().percent
        end_memory = psutil.virtual_memory().used / (1024 * 1024)

        times.append((end_time - start_time) * 1000)
        gpu_usages.append(end_gpu)
        memory_usages.append(end_memory - start_memory)

    avg_latency = np.mean(times)
    avg_throughput = (len(images) * num_runs) / (np.sum(times) / 1000)
    avg_gpu = np.mean(gpu_usages)
    avg_memory = np.mean(memory_usages)

    return avg_latency, avg_throughput, avg_gpu, avg_memory

if __name__ == "__main__":
    batch_size = 8
    num_runs = 50

    images, _ = load_batch_images(image_folder, batch_size)

    print("\n🔹 Running ONNX model benchmark...")
    onnx_metrics = benchmark_model(onnx_session, None, images, onnx_input_name, "ONNX", num_runs)

    print("\n🔹 Running TensorRT model benchmark...")
    tensorrt_metrics = benchmark_model(None, context, images, None, "TensorRT", num_runs)

    print("\n✅ Benchmark Results:")
    print(f"{'Metric':<25}{'ONNX Model':<20}{'TensorRT Model'}")
    print("="*70)
    print(f"{'Batch Size':<25}{batch_size:<20}{batch_size}")
    print(f"{'Avg Latency (ms)':<25}{onnx_metrics[0]:<20.2f}{tensorrt_metrics[0]:.2f}")
    print(f"{'Throughput (img/sec)':<25}{onnx_metrics[1]:<20.2f}{tensorrt_metrics[1]:.2f}")
    print(f"{'Avg GPU Usage (%)':<25}{onnx_metrics[2]:<20.2f}{tensorrt_metrics[2]:.2f}")
    print(f"{'Avg Memory Usage (MB)':<25}{onnx_metrics[3]:<20.2f}{tensorrt_metrics[3]:.2f}")


🔹 Running ONNX model benchmark...

🔹 Running TensorRT model benchmark...

✅ Benchmark Results:
Metric                   ONNX Model          TensorRT Model
Batch Size               8                   8
Avg Latency (ms)         888.73              8.69
Throughput (img/sec)     9.00                920.63
Avg GPU Usage (%)        20.44               20.20
Avg Memory Usage (MB)    -1.48               0.00
