In [1]:
#Step 0: Install required libraries:

In [2]:
#!pip install torch transformers tritonclient[http] nvidia-pyindex torch-tensorrt

In [3]:
#Step 1: Load a Pre-trained LLM
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [4]:
# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [5]:
# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Model loaded on:", device)

Model loaded on: cuda


In [6]:
#Step 2: Quantize the Model to FP16
# Convert model to FP16
model.half()  # Converts weights to FP16
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [7]:
# Test inference with a sample input
input_text = "Hello, how are you?"
inputs = tokenizer(input_text, return_tensors="pt").to(device)
outputs = model(**inputs)
print("FP16 inference successful!")

FP16 inference successful!


In [8]:
#Step 3: Profile the Model on GPU
#from torch.profiler import profile, record_shapes, ProfilerActivity

In [9]:
"""In PyTorch’s torch.profiler module (introduced in PyTorch 1.8.0 and refined in later versions), record_shapes is a boolean flag 
passed to the profile context manager, not a separate class or function you import individually.
The correct imports are profile and ProfilerActivity from torch.profiler, and record_shapes is used as an argument."""

'In PyTorch’s torch.profiler module (introduced in PyTorch 1.8.0 and refined in later versions), record_shapes is a boolean flag \npassed to the profile context manager, not a separate class or function you import individually.\nThe correct imports are profile and ProfilerActivity from torch.profiler, and record_shapes is used as an argument.'

In [10]:
from torch.profiler import profile, ProfilerActivity

In [11]:
# Define a profiling function
def profile_model(model, inputs):
    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        record_shapes=True,
        profile_memory=True
    ) as prof:
        with torch.no_grad():
            model(**inputs)
    
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

# Profile the model
#This will output a table showing where time is spent (e.g., matrix multiplications, memory transfers). 
#Look for high CUDA time or memory bottlenecks.
profile_model(model, inputs)

-------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      aten::addmm        19.09%       7.975ms        19.75%       8.251ms     171.892us       8.234ms        17.04%       9.129ms     190.188us           0 b           0 b     972.00 Kb     -47.05 Mb            48  
      

In [12]:
#Step 4: Apply Pruning to Remove 20% of Weights
import torch.nn.utils.prune as prune

# Function to prune linear layers
def prune_model(model, amount=0.2):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=amount)  # Prune 20% of weights
            prune.remove(module, 'weight')  # Make pruning permanent
    
    return model

In [13]:
# Apply pruning
model = prune_model(model)
print("Model pruned successfully!")

# Test inference after pruning
outputs = model(**inputs)
print("Pruned model inference successful!")

Model pruned successfully!
Pruned model inference successful!


In [14]:
#Step 5: Deploy with Triton Inference Server
"""Triton requires exporting the model and setting up a server. Here’s how to export the model and configure Triton:"""

'Triton requires exporting the model and setting up a server. Here’s how to export the model and configure Triton:'

In [15]:
tokenizer.pad_token = tokenizer.eos_token  # Set padding token
dummy_input = tokenizer("Test input", return_tensors="pt", padding=True, truncation=True).to(device)
input_ids = dummy_input["input_ids"]  # Keep as torch.long
attention_mask = dummy_input["attention_mask"].half()  # Convert to FP16

In [16]:
import torch.onnx

# Export to ONNX
torch.onnx.export(
    model,
    (input_ids, attention_mask),  # Pass the fixed tensors
    "gpt2_fp16.onnx",
    opset_version=12,
    input_names=["input_ids", "attention_mask"],
    output_names=["output"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"}
    }
)
print("Model exported to ONNX!")

IndexError: Dimension specified as -2 but tensor has no dimensions

In [None]:
#fix the above error step by step

In [21]:
print(input_ids.shape)
print(attention_mask.shape)

torch.Size([1, 2])
torch.Size([1, 2])


In [22]:
input_ids = input_ids.unsqueeze(0) if input_ids.dim() == 1 else input_ids
attention_mask = attention_mask.unsqueeze(0) if attention_mask.dim() == 1 else attention_mask

In [23]:
torch.onnx.export(
    model,
    {"input_ids": input_ids, "attention_mask": attention_mask},  # Use a dictionary
    "gpt2_fp16.onnx",
    opset_version=12,
    input_names=["input_ids", "attention_mask"],
    output_names=["output"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"}
    }
)


  if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
  if past_key_values_length > 0:


UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 12 is not supported. Support for this operator was added in version 14, try exporting with this version.

In [24]:
torch.onnx.export(
    model,
    (input_ids, attention_mask),  # Use a tuple
    "gpt2_fp16.onnx",
    opset_version=14,  # Update to at least 14
    input_names=["input_ids", "attention_mask"],
    output_names=["output"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"}
    }
)

IndexError: Dimension specified as -2 but tensor has no dimensions

In [25]:
print("input_ids shape:", input_ids.shape)
print("attention_mask shape:", attention_mask.shape)

input_ids shape: torch.Size([1, 2])
attention_mask shape: torch.Size([1, 2])


In [26]:
if input_ids.dim() == 1:
    input_ids = input_ids.unsqueeze(0)  # Add batch dimension
if attention_mask.dim() == 1:
    attention_mask = attention_mask.unsqueeze(0)

In [27]:
input_ids = input_ids.to(torch.int64)
attention_mask = attention_mask.to(torch.int64)

In [29]:
!pip install onnx onnxruntime onnxruntime-gpu

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting onnx
  Downloading onnx-1.17.0-cp310-cp310-win_amd64.whl.metadata (16 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.21.0-cp310-cp310-win_amd64.whl.metadata (4.9 kB)
Collecting onnxruntime-gpu
  Downloading onnxruntime_gpu-1.21.0-cp310-cp310-win_amd64.whl.metadata (5.0 kB)
Collecting protobuf>=3.20.2 (from onnx)
  Downloading protobuf-6.30.1-cp310-abi3-win_amd64.whl.metadata (593 bytes)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Collecting pyreadline3 (from humanfriendly>=9.1->coloredlogs->onnxruntime)
  Downloading pyreadline3-3.5.4-py3-none-any.whl.metadata (4.7 kB)
Downloading onnx-1.17.0-cp310-cp310-win_amd64.whl (14.5 MB)
   ---------------------------------------- 0.0/14.5 MB ? eta -:--:--

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tb-nightly 2.19.0a20250218 requires tensorboard-data-server<0.8.0,>=0.7.0, but you have tensorboard-data-server 0.6.1 which is incompatible.
tensorboard 2.10.1 requires protobuf<3.20,>=3.9.2, but you have protobuf 6.30.1 which is incompatible.
tensorflow 2.10.0 requires protobuf<3.20,>=3.9.2, but you have protobuf 6.30.1 which is incompatible.
tensorflow-intel 2.12.0 requires keras<2.13,>=2.12.0, but you have keras 2.10.0 which is incompatible.
tensorflow-intel 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 6.30.1 which is incompatible.
tensorflow-intel 2.12.0 requires tensorboard<2.13,>=2.12, but you have tensorboard 2.10.1 which is incompatible.
tensorflow-intel 2.12.0 requires tensorflow-estimator<2.13,

In [30]:
!pip install onnxruntime-gpu

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com




In [31]:
import onnx
print(onnx.__version__)

1.17.0


In [32]:
torch.onnx.export(
    model,
    {"input_ids": input_ids, "attention_mask": attention_mask},  # Dictionary input
    "gpt2_fp16.onnx",
    opset_version=14,  # Use ONNX 14 or higher
    input_names=["input_ids", "attention_mask"],
    output_names=["output"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"}
    }
)


In [34]:
#Export the Model to ONNX
torch.onnx.export(
    model,
    {"input_ids": input_ids, "attention_mask": attention_mask},  # Dictionary input
    "gpt2_fp16.onnx",
    opset_version=14,  # Use ONNX 14 or higher
    input_names=["input_ids", "attention_mask"],
    output_names=["output"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"}
    }
)