TRT FP16

In [1]:
!pip install torch_tensorrt

Collecting torch_tensorrt
  Downloading torch_tensorrt-2.6.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_34_x86_64.whl.metadata (10 kB)
Collecting tensorrt<10.8.0,>=10.7.0.post1 (from torch_tensorrt)
  Downloading tensorrt-10.7.0.post1.tar.gz (35 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorrt-cu12<10.8.0,>=10.7.0.post1 (from torch_tensorrt)
  Downloading tensorrt_cu12-10.7.0.post1.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorrt-cu12-bindings<10.8.0,>=10.7.0 (from torch_tensorrt)
  Downloading tensorrt_cu12_bindings-10.7.0.post1-cp311-none-manylinux_2_17_x86_64.whl.metadata (628 bytes)
Collecting tensorrt-cu12-libs<10.8.0,>=10.7.0 (from torch_tensorrt)
  Downloading tensorrt_cu12_libs-10.7.0.post1.tar.gz (710 bytes)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu1

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# ResNet Model Evaluation on ImageNet Test Set Sample with TensorRT FP16 Quantization

import torch
import torch_tensorrt
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
import matplotlib.pyplot as plt
import numpy as np
import time
import random
import os
from torch.utils.data import DataLoader, Dataset
from PIL import Image

# Print TensorRT version for debugging
print(f"Torch-TensorRT version: {torch_tensorrt.__version__}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")

# Check if GPU is available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the transformation pipeline for the validation set
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


imagenet_path = "/content/drive/My Drive/assignments_dl_cs7150/project/imagenet_data/dataset_dl/"
print(f"Loading from: {imagenet_path}")

# Get all class folders
class_folders = [d for d in os.listdir(imagenet_path)
                 if os.path.isdir(os.path.join(imagenet_path, d))]
all_image_files = []

# Collect all jpg files
for folder in class_folders:
    folder_path = os.path.join(imagenet_path, folder)
    files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.jpg')]
    all_image_files.extend(files)

print(f"Found a total of {len(all_image_files)} images")

# Randomly sample 50 images
num_samples = 50
if len(all_image_files) > num_samples:
    random.shuffle(all_image_files)
    sampled_images = all_image_files[:num_samples]
else:
    sampled_images = all_image_files

print(f"Randomly sampled {len(sampled_images)} images for testing")

# Create a simple mapping from folder name to class index
folder_to_idx = {folder: idx for idx, folder in enumerate(sorted(class_folders))}





Torch-TensorRT version: 2.6.0
PyTorch version: 2.6.0+cu124
CUDA available: True
CUDA version: 12.4
Using device: cuda:0
Loading from: /content/drive/My Drive/assignments_dl_cs7150/project/imagenet_data/dataset_dl/
Found a total of 5450 images
Randomly sampled 50 images for testing


In [4]:
# Custom dataset for the sampled images
class SampledImageNetDataset(Dataset):
    def __init__(self, image_files, transform=None):
        self.image_files = image_files
        self.transform = transform

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_path = self.image_files[idx]

        # Get the class folder from the path
        folder_name = os.path.basename(os.path.dirname(img_path))
        class_idx = folder_to_idx[folder_name]

        # Load and transform the image
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        return image, class_idx

# Create the dataset and dataloader
sample_dataset = SampledImageNetDataset(sampled_images, transform=preprocess)
sample_loader = DataLoader(sample_dataset, batch_size=1, shuffle=False)

print(f"Created dataset with {len(sample_dataset)} images")
print(f"Number of classes represented: {len(set(folder_to_idx.values()))}")

# Create class mapping for visualization
idx_to_class = {v: k for k, v in folder_to_idx.items()}

# Print a few examples
print("\nSample images:")
for i in range(min(5, len(sampled_images))):
    img_path = sampled_images[i]
    folder = os.path.basename(os.path.dirname(img_path))
    print(f"{i+1}. {img_path} (Class: {folder})")

Created dataset with 50 images
Number of classes represented: 109

Sample images:
1. /content/drive/My Drive/assignments_dl_cs7150/project/imagenet_data/dataset_dl/00030/398679180989368.jpg (Class: 00030)
2. /content/drive/My Drive/assignments_dl_cs7150/project/imagenet_data/dataset_dl/00092/7475549699225013.jpg (Class: 00092)
3. /content/drive/My Drive/assignments_dl_cs7150/project/imagenet_data/dataset_dl/00048/954069468668238.jpg (Class: 00048)
4. /content/drive/My Drive/assignments_dl_cs7150/project/imagenet_data/dataset_dl/00033/5350927420497062.jpg (Class: 00033)
5. /content/drive/My Drive/assignments_dl_cs7150/project/imagenet_data/dataset_dl/00069/589123649299837.jpg (Class: 00069)


In [5]:
# Function to evaluate model on the test set
def evaluate_model(model, data_loader, num_runs=5):
    model.eval()

    all_top1 = []
    all_top5 = []

    for run in range(num_runs):
        print(f"Accuracy evaluation - Run {run+1}/{num_runs}")

        # For top-1 and top-5 accuracy
        top1_correct = 0
        top5_correct = 0
        total = 0

        with torch.no_grad():
            for images, labels in data_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)

                # Top-1 accuracy
                _, predicted = torch.max(outputs, 1)
                top1_correct += (predicted == labels).sum().item()

                # Top-5 accuracy
                _, top5_preds = torch.topk(outputs, 5, dim=1)
                for i in range(labels.size(0)):
                    if labels[i] in top5_preds[i]:
                        top5_correct += 1

                total += labels.size(0)

        top1_accuracy = 100 * top1_correct / total
        top5_accuracy = 100 * top5_correct / total

        all_top1.append(top1_accuracy)
        all_top5.append(top5_accuracy)

        print(f"  Run {run+1} - Top-1: {top1_accuracy:.2f}%, Top-5: {top5_accuracy:.2f}%")

    # Calculate average and standard deviation
    avg_top1 = np.mean(all_top1)
    std_top1 = np.std(all_top1)
    avg_top5 = np.mean(all_top5)
    std_top5 = np.std(all_top5)

    print(f"Average Top-1: {avg_top1:.2f}% ± {std_top1:.2f}%")
    print(f"Average Top-5: {avg_top5:.2f}% ± {std_top5:.2f}%")

    return avg_top1, std_top1, avg_top5, std_top5

In [9]:
def measure_fps(model, input_tensor, num_runs=20):
    """Measure FPS with improved statistical robustness."""
    input_tensor = input_tensor.to(device)
    model.eval()

    all_fps = []

    # More warm-up runs
    with torch.no_grad():
        for _ in range(10):  # Increased from 5 to 10
            model(input_tensor)

    for run in range(num_runs):
        print(f"FPS measurement - Run {run+1}/{num_runs}")

        # Actual timing
        start_time = time.time()
        test_iterations = 50  # Increased from 20 to 50 for more stability
        with torch.no_grad():
            for _ in range(test_iterations):
                model(input_tensor)

        end_time = time.time()
        time_per_image = (end_time - start_time) / test_iterations
        fps = 1.0 / time_per_image
        all_fps.append(fps)

        print(f"  Run {run+1} FPS: {fps:.2f}")

    # Calculate statistics
    all_fps = np.array(all_fps)

    # Outlier detection (remove values beyond 2 standard deviations)
    mean = np.mean(all_fps)
    std = np.std(all_fps)
    filtered_fps = all_fps[np.abs(all_fps - mean) <= 2 * std]

    # If too many points are removed, revert to using all points
    if len(filtered_fps) < 0.7 * len(all_fps):
        filtered_fps = all_fps
        print("Outlier filtering skipped (too many potential outliers)")
    else:
        print(f"Removed {len(all_fps) - len(filtered_fps)} outliers")

    # Calculate mean, median and their respective dispersion metrics
    mean_fps = np.mean(filtered_fps)
    std_fps = np.std(filtered_fps)

    # Calculate median and median absolute deviation (more robust)
    median_fps = np.median(filtered_fps)
    mad_fps = np.median(np.abs(filtered_fps - median_fps)) * 1.4826  # Factor to make MAD comparable to std dev

    cv = (std_fps / mean_fps) * 100  # Coefficient of variation

    print(f"Mean FPS: {mean_fps:.2f} ± {std_fps:.2f} (CV: {cv:.2f}%)")
    print(f"Median FPS: {median_fps:.2f} ± {mad_fps:.2f}")
    return median_fps


# Show a few sample images with predictions
def visualize_predictions(model_obj, dataset, loader, num_samples=5):
    model_obj.eval()
    fig, axes = plt.subplots(1, num_samples, figsize=(20, 4))

    with torch.no_grad():
        for i, (images, labels) in enumerate(loader):
            if i >= num_samples:
                break

            images, labels = images.to(device), labels.to(device)
            outputs = model_obj(images)

            # Get predicted class
            _, predicted = torch.max(outputs, 1)

            # Get class names (folder names)
            true_class_idx = labels.item()
            pred_class_idx = predicted.item()

            true_class = idx_to_class[true_class_idx]
            pred_class = idx_to_class.get(pred_class_idx, f"Unknown ({pred_class_idx})")

            # Get image
            img = images[0].cpu().numpy().transpose((1, 2, 0))
            # Denormalize
            img = img * np.array([0.229, 0.224, 0.225]) + np.array([0.485, 0.456, 0.406])
            img = np.clip(img, 0, 1)

            # Plot
            axes[i].imshow(img)
            axes[i].set_title(f"True: {true_class}\nPred: {pred_class}")
            axes[i].axis('off')

    plt.tight_layout()
    plt.savefig('sample_predictions_tensorrt_fp16.png')
    plt.show()



# Function to count parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [10]:
# Function to calculate model size in MB for PyTorch models
def calculate_pytorch_model_size(model):
    # Get model state_dict
    state_dict = model.state_dict()

    # Calculate total size in bytes
    total_size = 0
    for param in state_dict.values():
        # Calculate bytes for each parameter
        total_size += param.numel() * param.element_size()

    # Convert to MB
    size_mb = total_size / (1024 * 1024)

    return size_mb

# Function to calculate TensorRT model size based on parameter count and precision
def calculate_tensorrt_model_size(param_count, precision='fp16'):
    # Determine bytes per parameter based on precision
    if precision.lower() == 'fp16':
        bytes_per_param = 2  # 16 bits = 2 bytes
    elif precision.lower() == 'fp32':
        bytes_per_param = 4  # 32 bits = 4 bytes
    elif precision.lower() == 'int8':
        bytes_per_param = 1  # 8 bits = 1 byte
    else:
        bytes_per_param = 4  # Default to FP32

    # Calculate total size in MB
    size_mb = (param_count * bytes_per_param) / (1024 * 1024)

    return size_mb

In [14]:
# Create FP16 TensorRT engine
def create_fp16_trt_engine():
    print("Loading base ResNet18 model...")
    model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
    model = model.to(device)
    model.eval()

    print("Tracing model...")
    example_input = torch.randn(1, 3, 224, 224).to(device)
    traced_model = torch.jit.trace(model, example_input)
    traced_model = torch.jit.freeze(traced_model)

    print("Setting up FP16 TensorRT compilation...")

    input_specs = [torch_tensorrt.Input(
        min_shape=[1, 3, 224, 224],
        opt_shape=[1, 3, 224, 224],
        max_shape=[1, 3, 224, 224],
        dtype=torch.float32
    )]

    trt_model = torch_tensorrt.compile(
        traced_model,
        inputs=input_specs,
        enabled_precisions={torch.float16},  # Use FP16 precision
        workspace_size=1 << 22,  # 4MB workspace
        truncate_long_and_double=True
    )
    print("Successfully compiled with FP16 precision")
    return trt_model

# Store results
results = {
    'model': [],
    'top1_accuracy': [],
    'top1_std': [],
    'top5_accuracy': [],
    'top5_std': [],
    'parameters': [],
    'model_size': [],     # Added for model size in MB
    'fps': [],
    'fps_std': []
}

# Number of runs for each evaluation
NUM_RUNS = 5
print(f"Evaluating each model with {NUM_RUNS} runs for averaging")

# Create dummy input for FPS measurement
dummy_input = torch.randn(1, 3, 224, 224).to(device)

# Evaluate original PyTorch ResNet-18
print("\nEvaluating original PyTorch ResNet-18...")
resnet18_original = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1).to(device).eval()
num_params_original = count_parameters(resnet18_original)
model_size_original = calculate_pytorch_model_size(resnet18_original)

# Run multiple times and average
fps_original = measure_fps(resnet18_original, dummy_input, num_runs=NUM_RUNS)
top1_acc_original, top1_std_original, top5_acc_original, top5_std_original = evaluate_model(resnet18_original, sample_loader, num_runs=NUM_RUNS)

print(f"Number of parameters: {num_params_original:,}")
print(f"Model size: {model_size_original:.2f} MB")
print(f"FPS: {fps_original:.2f}")
print(f"Top-1 accuracy: {top1_acc_original:.2f}% ± {top1_std_original:.2f}%")
print(f"Top-5 accuracy: {top5_acc_original:.2f}% ± {top5_std_original:.2f}%")

results['model'].append('ResNet18\nPyTorch')
results['top1_accuracy'].append(top1_acc_original)
results['top1_std'].append(top1_std_original)
results['top5_accuracy'].append(top5_acc_original)
results['top5_std'].append(top5_std_original)
results['parameters'].append(num_params_original)
results['model_size'].append(model_size_original)
results['fps'].append(fps_original)


# Evaluate TensorRT FP16 model
print("\nEvaluating ResNet-18 with TensorRT FP16...")
resnet18_trt_fp16 = create_fp16_trt_engine()
num_params_trt_fp16 = num_params_original  # Parameters count should be the same

# For TensorRT models, calculate size based on parameter count and precision
model_size_trt_fp16 = calculate_tensorrt_model_size(num_params_trt_fp16, precision='fp16')
print(f"TensorRT FP16 model size (based on parameter count): {model_size_trt_fp16:.2f} MB")

# Run multiple times and average
fps_trt_fp16 = measure_fps(resnet18_trt_fp16, dummy_input, num_runs=NUM_RUNS)
top1_acc_trt_fp16, top1_std_trt_fp16, top5_acc_trt_fp16, top5_std_trt_fp16 = evaluate_model(resnet18_trt_fp16, sample_loader, num_runs=NUM_RUNS)

print(f"Number of parameters: {num_params_trt_fp16:,}")
print(f"Model size: {model_size_trt_fp16:.2f} MB")
# print(f"FPS: {fps_trt_fp16:.2f} ± {fps_trt_fp16_std:.2f}")
print(f"Top-1 accuracy: {top1_acc_trt_fp16:.2f}% ± {top1_std_trt_fp16:.2f}%")
print(f"Top-5 accuracy: {top5_acc_trt_fp16:.2f}% ± {top5_std_trt_fp16:.2f}%")

results['model'].append('ResNet18\nTRT FP16')
results['top1_accuracy'].append(top1_acc_trt_fp16)
results['top1_std'].append(top1_std_trt_fp16)
results['top5_accuracy'].append(top5_acc_trt_fp16)
results['top5_std'].append(top5_std_trt_fp16)
results['parameters'].append(num_params_trt_fp16)
results['model_size'].append(model_size_trt_fp16)
results['fps'].append(fps_trt_fp16)
# results['fps_std'].append(fps_trt_fp16_std)

# Plot the results with error bars
fig, axs = plt.subplots(1, 3, figsize=(18, 6))  # Changed to 1 row, 3 columns

# Accuracy plot (combining top-1 and top-5 with error bars)
bar_width = 0.35
x = np.arange(len(results['model']))
axs[0].bar(x - bar_width/2, results['top1_accuracy'], bar_width, color='skyblue', label='Top-1')
axs[0].bar(x + bar_width/2, results['top5_accuracy'], bar_width, color='lightgreen', label='Top-5')

# Add error bars
axs[0].errorbar(x - bar_width/2, results['top1_accuracy'], yerr=results['top1_std'],
              fmt='none', ecolor='black', capsize=5)
axs[0].errorbar(x + bar_width/2, results['top5_accuracy'], yerr=results['top5_std'],
              fmt='none', ecolor='black', capsize=5)

axs[0].set_xlabel('Model')
axs[0].set_ylabel('Accuracy (%)')
axs[0].set_title(f'Test Accuracy (Average of {NUM_RUNS} runs)')
axs[0].set_xticks(x)
axs[0].set_xticklabels(results['model'])
axs[0].set_ylim([60, 100])  # Adjusted to zoom in on relevant accuracy range
axs[0].legend()
for i, v in enumerate(results['top1_accuracy']):
    axs[0].text(i - bar_width/2, v + 2, f"{v:.2f}%", ha='center', fontsize=9)
for i, v in enumerate(results['top5_accuracy']):
    axs[0].text(i + bar_width/2, v + 2, f"{v:.2f}%", ha='center', fontsize=9)

# FPS plot with error bars
axs[1].bar(x, results['fps'], color='gold')
axs[1].errorbar(x, results['fps'],
              fmt='none', ecolor='black', capsize=5)
axs[1].set_xlabel('Model')
axs[1].set_ylabel('FPS')
axs[1].set_title(f'Frames Per Second (Average of {NUM_RUNS} runs)')
axs[1].set_xticks(x)
axs[1].set_xticklabels(results['model'])
for i, v in enumerate(results['fps']):
    axs[1].text(i, v + 0.5, f"{v:.2f}", ha='center', fontsize=9)

# Model Size plot
axs[2].bar(x, results['model_size'], color='lightcoral')
axs[2].set_xlabel('Model')
axs[2].set_ylabel('Model Size (MB)')
axs[2].set_title('Model Size Comparison')
axs[2].set_xticks(x)
axs[2].set_xticklabels(results['model'])
for i, v in enumerate(results['model_size']):
    axs[2].text(i, v + 0.5, f"{v:.2f} MB", ha='center')

plt.tight_layout()
plt.savefig('resnet18_tensorrt_fp16_comparison_avg5runs.png', dpi=300)
plt.show()
# params_in_millions = []
params_in_millions = [p / 1_000_000 for p in results['parameters']]
# Create a table with all the results
try:
    from prettytable import PrettyTable
    table = PrettyTable()
    table.field_names = ["Model", "Top-1 Acc (%)", "Top-5 Acc (%)", "Parameters (M)", "FPS"]

    for i, model in enumerate(results['model']):
        table.add_row([
            model,
            f"{results['top1_accuracy'][i]:.2f} ± {results['top1_std'][i]:.2f}",
            f"{results['top5_accuracy'][i]:.2f} ± {results['top5_std'][i]:.2f}",
            f"{params_in_millions[i]:.2f}",
            f"{results['fps'][i]:.2f}"
        ])

    print("\nPerformance Comparison (Averaged over 5 runs):")
    print(table)
except ImportError:
    print("\nPerformance Comparison (Averaged over 5 runs):")
    for i, model in enumerate(results['model']):
        print(f"{model}: Top-1={results['top1_accuracy'][i]:.2f}% ± {results['top1_std'][i]:.2f}, "
              f"Top-5={results['top5_accuracy'][i]:.2f}% ± {results['top5_std'][i]:.2f}, "
              f"Params={params_in_millions[i]:.2f}M, "
              f"FPS={results['fps'][i]:.2f}")

# Visualize using TensorRT FP16 model
print("\nVisualizing sample predictions using TensorRT FP16 ResNet-18:")
visualize_predictions(resnet18_trt_fp16, sample_dataset, sample_loader)

# Visualize the original model for comparison
print("\nVisualizing sample predictions using original ResNet-18 for comparison:")
visualize_predictions(resnet18_original, sample_dataset, sample_loader)

print("\nEvaluation complete!")
print(f"Results are saved as 'resnet18_tensorrt_fp16_comparison_avg5runs.png' and 'sample_predictions_tensorrt_fp16.png'")

Output hidden; open in https://colab.research.google.com to view.