In [1]:
!pip install lightning
import os
import time
import numpy as np
import torch
import onnx
import onnxruntime as ort
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import pandas as pd
from model_def import *

Collecting lightning
  Downloading lightning-2.5.1.post0-py3-none-any.whl.metadata (39 kB)
Collecting lightning-utilities<2.0,>=0.10.0 (from lightning)
  Downloading lightning_utilities-0.14.3-py3-none-any.whl.metadata (5.6 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)
  Downloading torchmetrics-1.7.1-py3-none-any.whl.metadata (21 kB)
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.5.1.post0-py3-none-any.whl.metadata (20 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<2026.0,>=2022.5.0->lightning)
  Downloading aiohttp-3.11.18-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2026.0,>=2022.5.0->lightning)
  Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.1.2 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2026.0,>=2022.5.0->lightning)
  Downloading aiosignal-1.3.2-py2.py3-none-an

In [2]:
class CheXpertDataset(torch.utils.data.Dataset):
    """Multi-label chest X-ray dataset described by a CSV manifest.

    The CSV must contain a ``corrected_path`` column holding image file paths
    and a contiguous run of label columns that starts at
    ``"Enlarged Cardiomediastinum"`` and ends at ``"No Finding"`` (inclusive,
    in the column order of the CSV).

    Args:
        csv_path: Path of the manifest CSV.
        image_size: Side length in pixels that every image is resized to.
    """

    def __init__(self, csv_path, image_size=224):
        self.df = pd.read_csv(csv_path)
        self.image_paths = self.df["corrected_path"].values

        # Label columns are picked by position: everything between the two
        # named columns, both ends included.
        start_col = "Enlarged Cardiomediastinum"
        end_col = "No Finding"
        label_columns = self.df.loc[:, start_col:end_col].columns

        # Take an explicit, writable copy of the label matrix. With pandas
        # Copy-on-Write, `.values` may hand back a read-only view, and the
        # in-place relabelling below would then raise
        # "assignment destination is read-only".
        self.labels = self.df[label_columns].astype(np.float32).to_numpy(copy=True)
        # Labels equal to -1 are folded into the positive class (1).
        # NOTE(review): NaN entries, if the CSV has blank cells, are left
        # untouched here -- confirm the filtered CSV contains none.
        self.labels[self.labels == -1.0] = 1.0

        self.transform = transforms.Compose([
            transforms.Resize((image_size, image_size)),
            transforms.Grayscale(num_output_channels=3),
            transforms.ToTensor(),
            transforms.Normalize([0.5]*3, [0.5]*3)
        ])

    def __len__(self):
        """Return the number of rows (images) in the manifest."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load, preprocess and return ``(image, label)`` for row ``idx``.

        The stored path has its ``/mnt/data/`` prefix rewritten to
        ``/mnt/dataset/`` before the file is opened.
        """
        path = self.image_paths[idx].replace("/mnt/data/", "/mnt/dataset/")
        # NOTE(review): `Image` is not imported explicitly in this notebook;
        # presumably it arrives through `from model_def import *` -- confirm.
        image = Image.open(path).convert("RGB")
        image = self.transform(image)
        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return image, label

In [3]:
from torch.utils.data import DataLoader, Subset, random_split

csv_path = r"filtered_chexpert_paths.csv"

full_dataset = CheXpertDataset(csv_path)
total_len = len(full_dataset)

# Fractions of the dataset (in CSV row order, no shuffling) that delimit the
# held-out evaluation slice. The slice covers rows from 60% up to 62% of the
# dataset, i.e. roughly 2% of all rows -- not 30% as an earlier comment said.
TEST_START_FRACTION = 0.6
TEST_END_FRACTION = 0.62

# Row indices bounding the slice; the names are kept because later cells
# reuse them.
sixty_percent = int(TEST_START_FRACTION * total_len)
next_percent = int(TEST_END_FRACTION * total_len)

# Rows before `sixty_percent` are not used in this notebook (an earlier,
# now-removed line earmarked them for training/validation).
dataset_test = Subset(full_dataset, list(range(sixty_percent, next_percent)))

# Fixed order (shuffle=False) so every benchmark sees the same batches.
test_loader = DataLoader(dataset_test, batch_size=16, shuffle=False, num_workers=4)

In [4]:
model_path = "./mlflowModel1.pt"
device = torch.device("cpu")
# SECURITY NOTE: weights_only=False unpickles arbitrary Python objects and can
# execute code embedded in the file. Only load checkpoints from a trusted source.
model = torch.load(model_path, map_location=device, weights_only=False)
# Switch to inference mode explicitly so that any layers behaving differently
# in training (e.g. dropout, batch-norm, if the model has them) are exported
# with their evaluation behaviour regardless of which exporter backend is used.
model.eval()

onnx_model_path = "./mlflowModel1.onnx"
# Example input: only its shape/dtype matter; it drives the export trace.
dummy_input = torch.randn(1, 3, 224, 224)
torch.onnx.export(model, dummy_input, onnx_model_path,
                  export_params=True, opset_version=20,
                  do_constant_folding=True, input_names=['input'],
                  output_names=['output'],
                  # Leave the batch dimension symbolic so the exported graph
                  # accepts any batch size at inference time.
                  dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}})

# Validate the exported file before announcing success.
onnx_model = onnx.load(onnx_model_path)
onnx.checker.check_model(onnx_model)

print(f"ONNX model saved to {onnx_model_path}")

ONNX model saved to ./mlflowModel1.onnx


In [5]:
# Open the exported model with ONNX Runtime, restricted to the CPU provider.
onnx_model_path = "./mlflowModel1.onnx"
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=["CPUExecutionProvider"],
)

# Cell result: the execution providers the session actually ended up with.
ort_session.get_providers()

['CPUExecutionProvider']

In [6]:
# Report the on-disk size of the current ONNX file in megabytes (1 MB = 1e6 bytes).
model_size = os.path.getsize(onnx_model_path)
size_in_mb = model_size / 1e6
print(f"Model Size on Disk: {size_in_mb:.2f} MB")

Model Size on Disk: 28.28 MB


In [7]:
def _timed_runs(ort_session, input_name, batch, repeats):
    """Run ``batch`` through the session ``repeats`` times; return per-run wall times in seconds."""
    feed = {input_name: batch}
    # One untimed warm-up run so one-off initialisation cost is not measured.
    ort_session.run(None, feed)
    durations = []
    for _ in range(repeats):
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        ort_session.run(None, feed)
        durations.append(time.perf_counter() - start)
    return durations


def benchmark_session(ort_session, model_path=None, loader=None, num_trials=100, num_batches=50):
    """Print size, accuracy, latency and throughput figures for an ONNX Runtime session.

    Args:
        ort_session: Session to benchmark. Must offer ``get_providers()``,
            ``get_inputs()`` and ``run()``; the first graph input receives the images.
        model_path: File whose size is reported. Defaults to the notebook-level
            ``onnx_model_path`` so existing calls keep working.
        loader: Iterable of ``(images, labels)`` batches whose items expose
            ``.numpy()``. Defaults to the notebook-level ``test_loader``.
        num_trials: Number of timed single-sample runs.
        num_batches: Number of timed full-batch runs.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    if model_path is None:
        model_path = onnx_model_path
    if loader is None:
        loader = test_loader

    model_size = os.path.getsize(model_path)
    print(f"Model Size on Disk: {model_size / 1e6 :.2f} MB")
    print(f"Execution provider: {ort_session.get_providers()}")

    # Resolve the input name once instead of on every run (including timed ones).
    input_name = ort_session.get_inputs()[0].name

    correct = 0
    total = 0
    first_batch = None  # reused for the timing runs below
    for images, labels in loader:
        images_np = images.numpy()
        if first_batch is None:
            first_batch = images_np
        # First model output is treated as per-label logits
        # (presumably shape [batch, num_labels] -- TODO confirm).
        outputs = ort_session.run(None, {input_name: images_np})[0]
        # Sigmoid, then a 0.5 threshold, gives one boolean decision per label.
        preds = (1 / (1 + np.exp(-outputs))) > 0.5
        labels_np = labels.numpy().astype(bool)
        # Accuracy is counted per individual label entry, not per image.
        correct += np.sum(preds == labels_np)
        total += labels_np.size

    if total == 0:
        raise ValueError("benchmark_session: the data loader produced no samples")

    accuracy = (correct / total) * 100
    print(f"ONNX Model Accuracy: {accuracy:.2f}%")

    # Single-sample latency: time the first image of the first batch.
    latencies = _timed_runs(ort_session, input_name, first_batch[:1], num_trials)
    print(f"Inference Latency (single sample, median): {np.percentile(latencies, 50) * 1000:.2f} ms")
    print(f"Inference Latency (single sample, 95th percentile): {np.percentile(latencies, 95) * 1000:.2f} ms")
    print(f"Inference Latency (single sample, 99th percentile): {np.percentile(latencies, 99) * 1000:.2f} ms")
    print(f"Inference Throughput (single sample): {num_trials/np.sum(latencies):.2f} FPS")

    # Batch throughput: time the whole first batch.
    batch_times = _timed_runs(ort_session, input_name, first_batch, num_batches)
    batch_fps = (first_batch.shape[0] * num_batches) / np.sum(batch_times)
    print(f"Batch Throughput: {batch_fps:.2f} FPS")

In [8]:
# Baseline: the unmodified FP32 export, run on the CPU execution provider.
onnx_model_path = "./mlflowModel1.onnx"
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=["CPUExecutionProvider"],
)
benchmark_session(ort_session)

Model Size on Disk: 28.28 MB
Execution provider: ['CPUExecutionProvider']
ONNX Model Accuracy: 77.96%
Inference Latency (single sample, median): 25.87 ms
Inference Latency (single sample, 95th percentile): 31.96 ms
Inference Latency (single sample, 99th percentile): 38.64 ms
Inference Throughput (single sample): 37.33 FPS
Batch Throughput: 58.78 FPS


In [9]:
# Source graph and the file that will receive the optimized graph.
onnx_model_path = "./mlflowModel1.onnx"
optimized_model_path = "./mlflowModel1_optimized.onnx"

# Ask ONNX Runtime for extended graph optimizations and tell it where to
# write the optimized model.
session_options = ort.SessionOptions()
session_options.optimized_model_filepath = optimized_model_path
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED

# Creating the session with these options is what triggers the optimization.
ort_session = ort.InferenceSession(
    onnx_model_path,
    sess_options=session_options,
    providers=["CPUExecutionProvider"],
)

In [10]:
# Graph-optimized model, run on the CPU execution provider.
onnx_model_path = "./mlflowModel1_optimized.onnx"
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=["CPUExecutionProvider"],
)
benchmark_session(ort_session)

Model Size on Disk: 28.27 MB
Execution provider: ['CPUExecutionProvider']
ONNX Model Accuracy: 77.96%
Inference Latency (single sample, median): 25.61 ms
Inference Latency (single sample, 95th percentile): 26.81 ms
Inference Latency (single sample, 99th percentile): 29.06 ms
Inference Throughput (single sample): 38.81 FPS
Batch Throughput: 60.90 FPS


In [11]:
import neural_compressor
from neural_compressor import quantization

# Wrap the FP32 ONNX export in Intel Neural Compressor's model class.
model_path = "./mlflowModel1.onnx"
fp32_model = neural_compressor.model.onnx_model.ONNXModel(model_path)

# Post-training *dynamic* quantization; every other option keeps its default.
config_ptq = neural_compressor.PostTrainingQuantConfig(approach="dynamic")

# No evaluation function or dataloader is supplied, so the model is quantized
# with the default configuration and without accuracy-driven tuning (the run
# log states this as well).
q_model = quantization.fit(model=fp32_model, conf=config_ptq)

# Persist the quantized graph for the benchmarks that follow.
q_model.save_model_to_file("./mlflowModel1_quantized_dynamic.onnx")

2025-05-10 21:18:16 [INFO] Start auto tuning.
2025-05-10 21:18:16 [INFO] Quantize model without tuning!
2025-05-10 21:18:16 [INFO] Quantize the model with default configuration without evaluating the model.                To perform the tuning process, please either provide an eval_func or provide an                    eval_dataloader an eval_metric.
2025-05-10 21:18:16 [INFO] Adaptor has 5 recipes.
2025-05-10 21:18:16 [INFO] 0 recipes specified by user.
2025-05-10 21:18:16 [INFO] 3 recipes require future tuning.
2025-05-10 21:18:16 [INFO] *** Initialize auto tuning
2025-05-10 21:18:16 [INFO] {
2025-05-10 21:18:16 [INFO]     'PostTrainingQuantConfig': {
2025-05-10 21:18:16 [INFO]         'AccuracyCriterion': {
2025-05-10 21:18:16 [INFO]             'criterion': 'relative',
2025-05-10 21:18:16 [INFO]             'higher_is_better': True,
2025-05-10 21:18:16 [INFO]             'tolerable_loss': 0.01,
2025-05-10 21:18:16 [INFO]             'absolute': None,
2025-05-10 21:18:16 [INFO]     

In [12]:
# On-disk size of the dynamically quantized model, in megabytes (1 MB = 1e6 bytes).
onnx_model_path = "./mlflowModel1_quantized_dynamic.onnx"
model_size = os.path.getsize(onnx_model_path)
size_in_mb = model_size / 1e6
print(f"Model Size on Disk: {size_in_mb:.2f} MB")

Model Size on Disk: 7.86 MB


In [13]:
# Dynamically quantized model, run on the CPU execution provider.
onnx_model_path = "./mlflowModel1_quantized_dynamic.onnx"
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=["CPUExecutionProvider"],
)
benchmark_session(ort_session)

Model Size on Disk: 7.86 MB
Execution provider: ['CPUExecutionProvider']
ONNX Model Accuracy: 76.64%
Inference Latency (single sample, median): 46.04 ms
Inference Latency (single sample, 95th percentile): 58.55 ms
Inference Latency (single sample, 99th percentile): 58.65 ms
Inference Throughput (single sample): 20.15 FPS
Batch Throughput: 24.60 FPS


In [14]:

import neural_compressor
from neural_compressor import quantization
from torchvision import datasets, transforms

# Same preprocessing pipeline the dataset class applies internally.
# NOTE(review): `test_transform` is not referenced by any other visible cell.
image_size = 224
preprocessing_steps = [
    transforms.Resize((image_size, image_size)),
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
]
test_transform = transforms.Compose(preprocessing_steps)

# Rebuild the held-out slice and its loader from the split indices computed earlier.
held_out_indices = list(range(sixty_percent, next_percent))
dataset_test = Subset(full_dataset, held_out_indices)
test_loader = DataLoader(dataset_test, batch_size=16, shuffle=False, num_workers=4)

# Neural Compressor's own DataLoader over the same slice.
eval_dataloader = neural_compressor.data.DataLoader(
    framework="onnxruntime",
    dataset=dataset_test,
)

In [15]:
import onnxruntime as ort
from neural_compressor.model.onnx_model import ONNXModel

# Two handles on the same FP32 file: an ONNX Runtime session for running
# inference, and the Neural Compressor wrapper that is handed to the quantizer.
onnx_model_path = "./mlflowModel1.onnx"
fp32_session = ort.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])
fp32_model = ONNXModel(onnx_model_path)  # model object passed to quantization.fit

# Configure static post-training quantization with an accuracy constraint.
config_ptq = neural_compressor.PostTrainingQuantConfig(
    accuracy_criterion = neural_compressor.config.AccuracyCriterion(
        criterion="absolute",  
        tolerable_loss=0.1  # Tolerate an absolute drop of up to 0.1 in the evaluation metric (an earlier comment said 0.01, which did not match this value)
    ),
    approach="static", 
    device='cpu', 
    quant_level=0,  # level 0; the run log reports this as "conservative tuning"
    quant_format="QOperator", 
    recipes={"graph_optimization_level": "ENABLE_EXTENDED"}, 
    calibration_sampling_size=128
)

def eval_func(model):
    """Return the per-label accuracy of a candidate model on ``test_loader``.

    This is the callback Neural Compressor uses to score each quantized
    candidate during tuning, so it must run the model it is *given*. The
    previous version ignored its argument and always scored the FP32 session,
    which made every candidate appear to match the baseline exactly.

    Args:
        model: Candidate model supplied by the tuner. Expected to be an ONNX
            ``ModelProto``; an object exposing the proto as ``.model`` is
            unwrapped as well (presumed wrapper shape -- TODO confirm).

    Returns:
        Fraction (0..1) of individual label entries predicted correctly.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    threshold = 0.5

    # Build a fresh CPU session from the candidate's serialized graph.
    candidate = getattr(model, "model", model)
    session = ort.InferenceSession(
        candidate.SerializeToString(),
        providers=["CPUExecutionProvider"],
    )
    input_name = session.get_inputs()[0].name

    correct = 0
    total = 0
    for images, labels in test_loader:
        images_np = images.numpy()
        labels_np = labels.numpy().astype(bool)

        outputs = session.run(None, {input_name: images_np})[0]
        # Sigmoid followed by a threshold gives one boolean decision per label.
        preds = (1 / (1 + np.exp(-outputs))) > threshold

        correct += np.sum(preds == labels_np)
        total += labels_np.size

    if total == 0:
        raise ValueError("eval_func: test_loader produced no samples")

    return correct / total

# Run the accuracy-aware search: calibrate on `eval_dataloader` and score
# candidates with `eval_func` against the criterion set in `config_ptq`.
q_model = quantization.fit(
    model=fp32_model,
    conf=config_ptq,
    calib_dataloader=eval_dataloader,
    eval_func=eval_func,
)

# Persist the selected quantized graph for the benchmarks that follow.
q_model.save_model_to_file("./mlflowModel1_quantized_conservative.onnx")

2025-05-10 21:19:14 [INFO] Start conservative tuning.
2025-05-10 21:19:14 [INFO] Execute the tuning process due to detect the evaluation function.
2025-05-10 21:19:14 [INFO] Adaptor has 5 recipes.
2025-05-10 21:19:14 [INFO] 0 recipes specified by user.
2025-05-10 21:19:14 [INFO] 3 recipes require future tuning.
2025-05-10 21:19:14 [INFO] *** Initialize conservative tuning
2025-05-10 21:19:14 [INFO] {
2025-05-10 21:19:14 [INFO]     'PostTrainingQuantConfig': {
2025-05-10 21:19:14 [INFO]         'AccuracyCriterion': {
2025-05-10 21:19:14 [INFO]             'criterion': 'absolute',
2025-05-10 21:19:14 [INFO]             'higher_is_better': True,
2025-05-10 21:19:14 [INFO]             'tolerable_loss': 0.1,
2025-05-10 21:19:14 [INFO]             'absolute': 0.1,
2025-05-10 21:19:14 [INFO]             'keys': <bound method AccuracyCriterion.keys of <neural_compressor.config.AccuracyCriterion object at 0x7a74d08867e0>>,
2025-05-10 21:19:14 [INFO]             'relative': None
2025-05-10 21:19

In [16]:
# Statically quantized ("conservative") model, run on the CPU execution provider.
onnx_model_path = "./mlflowModel1_quantized_conservative.onnx"
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=["CPUExecutionProvider"],
)
benchmark_session(ort_session)

Model Size on Disk: 19.88 MB
Execution provider: ['CPUExecutionProvider']
ONNX Model Accuracy: 78.29%
Inference Latency (single sample, median): 35.74 ms
Inference Latency (single sample, 95th percentile): 40.26 ms
Inference Latency (single sample, 99th percentile): 43.03 ms
Inference Throughput (single sample): 27.57 FPS
Batch Throughput: 36.62 FPS


In [17]:
# Repeat of the FP32 baseline on CPU, as a reference point next to the
# quantized results above.
onnx_model_path = "./mlflowModel1.onnx"
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=["CPUExecutionProvider"],
)
benchmark_session(ort_session)

Model Size on Disk: 28.28 MB
Execution provider: ['CPUExecutionProvider']
ONNX Model Accuracy: 77.96%
Inference Latency (single sample, median): 25.60 ms
Inference Latency (single sample, 95th percentile): 29.28 ms
Inference Latency (single sample, 99th percentile): 31.91 ms
Inference Throughput (single sample): 38.40 FPS
Batch Throughput: 60.75 FPS


In [18]:
# FP32 baseline with the CUDA execution provider requested.
onnx_model_path = "./mlflowModel1.onnx"
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=["CUDAExecutionProvider"],
)
benchmark_session(ort_session)

# Cell result: the device type reported by ONNX Runtime.
ort.get_device()

Model Size on Disk: 28.28 MB
Execution provider: ['CUDAExecutionProvider', 'CPUExecutionProvider']


[0;93m2025-05-10 21:21:24.730677108 [W:onnxruntime:, transformer_memcpy.cc:83 ApplyImpl] 6 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2025-05-10 21:21:24.734874952 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-10 21:21:24.734887582 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


ONNX Model Accuracy: 77.96%
Inference Latency (single sample, median): 10.12 ms
Inference Latency (single sample, 95th percentile): 10.83 ms
Inference Latency (single sample, 99th percentile): 11.32 ms
Inference Throughput (single sample): 97.00 FPS
Batch Throughput: 381.62 FPS


'GPU'

In [19]:
# P100 GPU: TensorRT does not work on this machine. ONNX Runtime then creates
# the session with a different provider instead of failing, so say so
# explicitly rather than letting the numbers pass for TensorRT results.
onnx_model_path = "./mlflowModel1.onnx"
requested_provider = "TensorrtExecutionProvider"
ort_session = ort.InferenceSession(onnx_model_path, providers=[requested_provider])

if requested_provider not in ort_session.get_providers():
    print(
        f"NOTE: {requested_provider} is not active; the figures below were "
        f"measured with {ort_session.get_providers()} instead."
    )

benchmark_session(ort_session)

# Cell result: the device type reported by ONNX Runtime.
ort.get_device()

Model Size on Disk: 28.28 MB
Execution provider: ['CPUExecutionProvider']


[0;93m2025-05-10 21:21:40.934399551 [W:onnxruntime:Default, onnxruntime_pybind_state.cc:880 CreateExecutionProviderInstance] Failed to create TensorrtExecutionProvider. Please reference https://onnxruntime.ai/docs/execution-providers/TensorRT-ExecutionProvider.html#requirements to ensure all dependencies are met.[m


ONNX Model Accuracy: 77.96%
Inference Latency (single sample, median): 26.76 ms
Inference Latency (single sample, 95th percentile): 29.92 ms
Inference Latency (single sample, 99th percentile): 30.93 ms
Inference Throughput (single sample): 36.46 FPS
Batch Throughput: 60.99 FPS


'GPU'

In [20]:
# Graph-optimized model with the CUDA execution provider requested.
onnx_model_path = "./mlflowModel1_optimized.onnx"
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=["CUDAExecutionProvider"],
)
benchmark_session(ort_session)

# Cell result: the device type reported by ONNX Runtime.
ort.get_device()

Model Size on Disk: 28.27 MB
Execution provider: ['CUDAExecutionProvider', 'CPUExecutionProvider']


[0;93m2025-05-10 21:22:33.403893605 [W:onnxruntime:, transformer_memcpy.cc:83 ApplyImpl] 6 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2025-05-10 21:22:33.407660425 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-10 21:22:33.407672881 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


ONNX Model Accuracy: 77.96%
Inference Latency (single sample, median): 12.21 ms
Inference Latency (single sample, 95th percentile): 14.04 ms
Inference Latency (single sample, 99th percentile): 17.73 ms
Inference Throughput (single sample): 79.74 FPS
Batch Throughput: 410.21 FPS


'GPU'

In [21]:
# Dynamically quantized model with the CUDA execution provider requested.
onnx_model_path = "./mlflowModel1_quantized_dynamic.onnx"
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=["CUDAExecutionProvider"],
)
benchmark_session(ort_session)

# Cell result: the device type reported by ONNX Runtime.
ort.get_device()

Model Size on Disk: 7.86 MB
Execution provider: ['CUDAExecutionProvider', 'CPUExecutionProvider']


[0;93m2025-05-10 21:22:59.611330817 [W:onnxruntime:, transformer_memcpy.cc:83 ApplyImpl] 368 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2025-05-10 21:22:59.626312842 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-10 21:22:59.626325742 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


ONNX Model Accuracy: 76.79%
Inference Latency (single sample, median): 48.72 ms
Inference Latency (single sample, 95th percentile): 50.49 ms
Inference Latency (single sample, 99th percentile): 51.90 ms
Inference Throughput (single sample): 20.42 FPS
Batch Throughput: 27.13 FPS


'GPU'

In [22]:
# Statically quantized ("conservative") model with the CUDA execution provider requested.
onnx_model_path = "./mlflowModel1_quantized_conservative.onnx"
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=["CUDAExecutionProvider"],
)
benchmark_session(ort_session)

# Cell result: the device type reported by ONNX Runtime.
ort.get_device()

Model Size on Disk: 19.88 MB
Execution provider: ['CUDAExecutionProvider', 'CPUExecutionProvider']


[0;93m2025-05-10 21:23:51.320203656 [W:onnxruntime:, transformer_memcpy.cc:83 ApplyImpl] 130 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2025-05-10 21:23:51.331849685 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-10 21:23:51.331862706 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


ONNX Model Accuracy: 78.34%
Inference Latency (single sample, median): 27.45 ms
Inference Latency (single sample, 95th percentile): 27.97 ms
Inference Latency (single sample, 99th percentile): 28.60 ms
Inference Throughput (single sample): 36.28 FPS
Batch Throughput: 82.66 FPS


'GPU'