In [1]:
import torch
print("PyTorch Version:", torch.__version__)  
print("CUDA Available:", torch.cuda.is_available())  
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")


PyTorch Version: 2.5.1+cu121
CUDA Available: True
GPU Name: NVIDIA GeForce RTX 4070 Laptop GPU


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import time

# Load GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Define input text
input_text = "The future of AI is"
input_tokens = tokenizer(input_text, return_tensors="pt")

# Ensure input tensors are on the same device as the model
input_tokens = {key: value.to(device) for key, value in input_tokens.items()}

# Function to measure inference time
def benchmark_inference(model, input_tokens, device, num_runs=10):
    model.eval()  # Set model to evaluation mode
    total_time = 0.0
    
    with torch.no_grad():  # Disable gradient computation
        for _ in range(num_runs):
            start_time = time.time()
            output = model.generate(**input_tokens, max_length=50)  # Generate text
            end_time = time.time()
            total_time += (end_time - start_time)
    
    avg_time = total_time / num_runs
    print(f"\nAverage inference time on {device}: {avg_time:.4f} seconds")

# Run benchmark on CPU
model.to("cpu")
input_tokens = {key: value.to("cpu") for key, value in input_tokens.items()}  # Move inputs to CPU
benchmark_inference(model, input_tokens, "CPU")

# Run benchmark on GPU (if available)
if torch.cuda.is_available():
    model.to("cuda")
    input_tokens = {key: value.to("cuda") for key, value in input_tokens.items()}  # Move inputs to GPU
    benchmark_inference(model, input_tokens, "GPU")
else:
    print("\nCUDA not available, skipping GPU test.")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Average inference time on CPU: 0.9786 seconds


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Average inference time on GPU: 0.3106 seconds


In [5]:
import torch
import onnx
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Set model to evaluation mode
model.eval()

# Define example input text
input_text = "The future of AI is"
input_tokens = tokenizer(input_text, return_tensors="pt")

# Convert model to ONNX format
onnx_model_path = "gpt2_model.onnx"

# Export the model to ONNX
torch.onnx.export(
    model, 
    (input_tokens["input_ids"],),  # Model inputs
    onnx_model_path,
    input_names=["input_ids"],
    output_names=["output"],
    dynamic_axes={"input_ids": {0: "batch_size", 1: "seq_length"}, "output": {0: "batch_size"}},
    opset_version=14  # ONNX Opset version (must be compatible with ONNX Runtime)
)

print(f"✅ GPT-2 model successfully converted to ONNX format: {onnx_model_path}")


✅ GPT-2 model successfully converted to ONNX format: gpt2_model.onnx


In [6]:
import onnxruntime as ort
import numpy as np
import time
from transformers import AutoTokenizer

# Load the ONNX model
onnx_model_path = "gpt2_model.onnx"
ort_session = ort.InferenceSession(onnx_model_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"])

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Prepare input tokens
input_text = "The future of AI is"
input_tokens = tokenizer(input_text, return_tensors="np")  # Convert to NumPy
input_ids = input_tokens["input_ids"].astype(np.int64)  # ONNX requires int64 inputs

# Function to run inference with ONNX Runtime
def benchmark_onnx_inference(ort_session, input_ids, device, num_runs=10):
    total_time = 0.0
    
    for _ in range(num_runs):
        start_time = time.time()
        outputs = ort_session.run(None, {"input_ids": input_ids})  # Run inference
        end_time = time.time()
        total_time += (end_time - start_time)
    
    avg_time = total_time / num_runs
    print(f"\n✅ Average inference time with ONNX Runtime on {device}: {avg_time:.4f} seconds")

# Run benchmark on CPU
benchmark_onnx_inference(ort_session, input_ids, "CPU")

# Run benchmark on GPU (if available)
if torch.cuda.is_available():
    ort_session = ort.InferenceSession(onnx_model_path, providers=["CUDAExecutionProvider"])
    benchmark_onnx_inference(ort_session, input_ids, "GPU")



✅ Average inference time with ONNX Runtime on CPU: 0.0112 seconds

✅ Average inference time with ONNX Runtime on GPU: 0.0058 seconds


In [10]:
import tensorrt as trt
import os

# Define paths
onnx_model_path = "gpt2_model.onnx"
trt_model_path = "gpt2_model.trt"

# Create a TensorRT logger
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Create TensorRT builder and network
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, TRT_LOGGER)

# Read the ONNX model
with open(onnx_model_path, "rb") as model_file:
    if not parser.parse(model_file.read()):
        print("❌ Failed to parse ONNX model!")
        for error in range(parser.num_errors):
            print(parser.get_error(error))
        exit()

# Set TensorRT builder configurations
config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # Set max workspace size

# Enable FP16 optimization (Optional: Can be removed if GPU doesn’t support FP16)
if builder.platform_has_fast_fp16:
    config.set_flag(trt.BuilderFlag.FP16)

# Create an optimization profile (for dynamic batch sizes)
profile = builder.create_optimization_profile()
profile.set_shape("input_ids", (1, 1), (1, 50), (1, 100))  # Min, Opt, Max batch sizes
config.add_optimization_profile(profile)

# Build and serialize the engine
engine = builder.build_serialized_network(network, config)

# Ensure the engine was built successfully
if engine is None:
    raise RuntimeError("❌ Failed to build TensorRT engine!")

# Save the model
with open(trt_model_path, "wb") as f:
    f.write(engine)

print(f"✅ GPT-2 model successfully converted to TensorRT format: {trt_model_path}")


✅ GPT-2 model successfully converted to TensorRT format: gpt2_model.trt


In [11]:
import tensorrt as trt
import numpy as np
import time
import pycuda.driver as cuda
import pycuda.autoinit

# Load the optimized TensorRT engine
trt_engine_path = 'gpt2_model.trt'

# Create TensorRT runtime and deserialize engine
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(TRT_LOGGER)
with open(trt_engine_path, 'rb') as f:
    engine_data = f.read()
engine = runtime.deserialize_cuda_engine(engine_data)
context = engine.create_execution_context()

def benchmark_tensorrt_inference(context, input_ids, num_runs=10):
    input_binding = engine.get_binding_index('input_ids')
    output_binding = engine.get_binding_index('output')
    context.set_binding_shape(input_binding, input_ids.shape)
    d_input = cuda.mem_alloc(input_ids.nbytes)
    output_shape = tuple(context.get_binding_shape(output_binding))
    output_dtype = trt.nptype(engine.get_binding_dtype(output_binding))
    output = np.empty(output_shape, dtype=output_dtype)
    d_output = cuda.mem_alloc(output.nbytes)
    total_time = 0.0
    for _ in range(num_runs):
        start = time.time()
        cuda.memcpy_htod(d_input, input_ids)
        context.execute_v2([int(d_input), int(d_output)])
        cuda.memcpy_dtoh(output, d_output)
        end = time.time()
        total_time += end - start
    avg_time = total_time / num_runs
    print(f'✅ Average inference time with TensorRT: {avg_time:.4f} seconds')
    return output

# Dummy input ids
input_ids = np.array([[50256]], dtype=np.int32)
benchmark_tensorrt_inference(context, input_ids)


InvalidProtobuf: [ONNXRuntimeError] : 7 : INVALID_PROTOBUF : Load model from gpt2_model.trt failed:Protobuf parsing failed.