In [1]:
import os
import time
import numpy as np
import torch
import onnx
import onnxruntime as ort
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd

In [2]:
class CheXpertDataset(Dataset):
    """Map-style dataset built from a CSV of image paths and label columns.

    The CSV must contain an ``image_path`` column. Every remaining column is
    treated as a label and the whole label table is cast to ``float32``.

    Args:
        csv_file: CSV source handed to ``pandas.read_csv``.
        transform: Callable applied to each RGB ``PIL.Image``. When ``None``,
            images are resized to 224x224 and converted with ``ToTensor``.
    """

    def __init__(self, csv_file, transform=None):
        df = pd.read_csv(csv_file)
        self.image_paths = df['image_path'].values
        self.labels = df.drop(columns=['image_path']).values.astype('float32')
        # Compare against None explicitly: ``transform or default`` would
        # silently discard a valid but falsy callable (for example an empty
        # ``torch.nn.Sequential()``, whose length is 0).
        if transform is None:
            transform = transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor()
            ])
        self.transform = transform

    def __len__(self):
        """Return the number of rows (images) listed in the CSV."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB, apply the transform, and return ``(img, label)``.

        ``label`` is a tensor built from row ``idx`` of the float32 label table.
        """
        # ``convert`` returns a new image, so the source image can be closed
        # as soon as the conversion is done.
        with Image.open(self.image_paths[idx]) as source:
            img = source.convert("RGB")
        img = self.transform(img)
        label = torch.tensor(self.labels[idx])
        return img, label

In [3]:
# Location of the ONNX model that the cells below load and measure.
onnx_model_path = "models/demoONNX.onnx"

# Build the inference session, requesting only the CPU execution provider.
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=["CPUExecutionProvider"],
)

# Display the execution providers the session reports.
ort_session.get_providers()

['CPUExecutionProvider']

In [4]:
# Size of the model file in bytes, printed in megabytes (bytes / 1e6).
model_size = os.stat(onnx_model_path).st_size
print(f"Model Size on Disk: {model_size / 1e6 :.2f} MB")

Model Size on Disk: 0.02 MB


In [None]:
# Evaluation data: samples listed in test.csv, served in fixed order.
test_dataset = CheXpertDataset(csv_file="test.csv")
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=32,  # adjust for GPU execution
    shuffle=False,
)

In [8]:
def _timed_runs(ort_session, feed, num_runs):
    """Run one untimed warm-up inference, then return ``num_runs`` durations in seconds."""
    ort_session.run(None, feed)  # warm-up, excluded from the measurements
    durations = []
    for _ in range(num_runs):
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        ort_session.run(None, feed)
        durations.append(time.perf_counter() - start_time)
    return durations


def benchmark_session(ort_session, loader=None, model_path=None, num_trials=100, num_batches=50):
    """Print model size, accuracy, latency and throughput for an ONNX Runtime session.

    Args:
        ort_session: Session exposing ``get_providers()``, ``get_inputs()`` and
            ``run()``; the first declared input receives the image batch.
        loader: Iterable of ``(images, labels)`` tensor batches. Defaults to the
            module-level ``test_loader``.
        model_path: Path of the model file whose size is reported. Defaults to
            the module-level ``onnx_model_path``.
        num_trials: Number of timed single-sample runs (must be positive).
        num_batches: Number of timed full-batch runs (must be positive).

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    if model_path is None:
        model_path = onnx_model_path
    model_size = os.path.getsize(model_path)
    print(f"Model Size on Disk: {model_size / 1e6 :.2f} MB")

    print(f"Execution provider: {ort_session.get_providers()}")

    if loader is None:
        loader = test_loader

    # Resolve the input name once so the lookup is not part of any timed region.
    input_name = ort_session.get_inputs()[0].name

    correct = 0
    total = 0
    first_batch = None
    for images, labels in loader:
        images_np = images.numpy()
        if first_batch is None:
            # Reused below for the latency and throughput measurements.
            first_batch = images_np
        outputs = ort_session.run(None, {input_name: images_np})[0]
        # Predicted class: index of the largest output per sample.
        predicted = np.argmax(outputs, axis=1)
        # NOTE(review): argmax reduces each label row to the index of its first
        # maximal entry; for multi-hot labels this scores only one class per
        # sample -- confirm this is the intended metric.
        target = np.argmax(labels.numpy(), axis=1)
        correct += int(np.sum(predicted == target))
        total += labels.size(0)

    if total == 0:
        raise ValueError("benchmark_session: the data loader yielded no samples")

    accuracy = (correct / total) * 100
    print(f"ONNX Model Accuracy: {accuracy:.2f}%")

    # Single-sample latency: first sample of the first batch.
    single_sample = first_batch[:1]
    latencies = _timed_runs(ort_session, {input_name: single_sample}, num_trials)
    print(f"Inference Latency (single sample, median): {np.percentile(latencies, 50) * 1000:.2f} ms")
    print(f"Inference Latency (single sample, 95th percentile): {np.percentile(latencies, 95) * 1000:.2f} ms")
    print(f"Inference Latency (single sample, 99th percentile): {np.percentile(latencies, 99) * 1000:.2f} ms")
    print(f"Inference Throughput (single sample): {num_trials/np.sum(latencies):.2f} FPS")

    # Batch throughput: the whole first batch, repeated num_batches times.
    batch_input = first_batch
    batch_times = _timed_runs(ort_session, {input_name: batch_input}, num_batches)
    batch_fps = (batch_input.shape[0] * num_batches) / np.sum(batch_times)
    print(f"Batch Throughput: {batch_fps:.2f} FPS")

In [9]:
# Benchmark the session created above. When called with only the session,
# benchmark_session also reads the module-level test_loader and
# onnx_model_path, so the cells defining them must have been run first.
benchmark_session(ort_session)

Execution provider: ['CPUExecutionProvider']


NameError: name 'test_loader' is not defined

In [None]:
import neural_compressor
from neural_compressor import quantization

# Wrap the ONNX file in Intel Neural Compressor's ONNX model class.
model_path = "models/demoONNX.onnx"
fp32_model = neural_compressor.model.onnx_model.ONNXModel(model_path)

# Post-training quantization configured with the "dynamic" approach.
config_ptq = neural_compressor.PostTrainingQuantConfig(approach="dynamic")

# Produce the quantized model from the wrapped model and the config.
q_model = quantization.fit(model=fp32_model, conf=config_ptq)

# Persist the quantized model to disk.
q_model.save_model_to_file("models/demoModel_quantized_dynamic.onnx")

In [None]:
# Switch to the dynamically quantized model and rebuild the session.
# onnx_model_path is reassigned (rather than using a new name) because
# benchmark_session reads that global for its size report by default.
onnx_model_path = "models/demoModel_quantized_dynamic.onnx"
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=["CPUExecutionProvider"],
)
benchmark_session(ort_session)