In [1]:
!nvidia-smi

Mon Nov 17 19:02:00 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.163.01             Driver Version: 550.163.01     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  |   00000000:01:00.0 Off |                  Off |
| 30%   36C    P5             34W /  450W |       2MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import torch, time, gc
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Configuration
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODELS = {
    "HQ_Llama": "meta-llama/Llama-3.1-8B-Instruct",
    "Fast_Gemma": "google/gemma-2-2b-it"
}
BATCH_SIZES = [1, 5]
NUM_RUNS = 10  # Number of times to repeat each batch

In [4]:
def load_model(model_name):
    """Loads a model and tokenizer, then clears cache to ensure fairness."""
    
    print(f"\n--- Loading {model_name} ---")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # CRITICAL FIX: Ensure padding token exists for batching
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # CRITICAL FIX: Padding side must be LEFT for generation
    tokenizer.padding_side = "left"
    
    model = AutoModelForCausalLM.from_pretrained(
        model_name, 
        device_map="auto", 
        torch_dtype=torch.float16
    )
    return model, tokenizer

In [5]:
def warmup_model(model, tokenizer, prompt="Hello world"):
    """Warm up the model to remove first-batch latency spikes."""
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(DEVICE)
    with torch.no_grad():
        _ = model.generate(**inputs, max_new_tokens=5)
    if DEVICE == "cuda":
        torch.cuda.synchronize()

In [6]:
def measure_batch_performance(model, tokenizer, prompts, max_new_tokens=150):
    """Measures latency, QPS, and token throughput for a batch."""
    # Batch tokenization
    inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(DEVICE)
    batch_size = len(prompts)
    
    if DEVICE == "cuda":
        torch.cuda.synchronize()
    start_time = time.time()

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    if DEVICE == "cuda":
        torch.cuda.synchronize()
    end_time = time.time()
    
    total_time = end_time - start_time
    
    # Count generated tokens per batch
    total_tokens = (outputs.shape[1] - inputs.input_ids.shape[1]) * batch_size
    
    qps = batch_size / total_time
    tps = total_tokens / total_time
    
    return qps, tps, total_time

In [7]:
# --- BENCHMARK ---

base_prompts = [
    "Create a podcast intro about AI ethics with a guest scientist.",
    "Generate a fantasy podcast clip where Gandalf gives life advice.",
    "Podcast clip about football news with enthusiastic commentary.",
]

# Store results
results = {model_name: {b: {"latency": [], "qps": [], "tps": []} for b in BATCH_SIZES} 
           for model_name in MODELS.keys()}

for model_name, model_path in MODELS.items():
    model, tokenizer = load_model(model_path)

    warmup_model(model, tokenizer, prompt=base_prompts[0])
    
    for b_size in BATCH_SIZES:
        # Prepare batch
        current_batch = (base_prompts * ((b_size // len(base_prompts)) + 1))[:b_size]
        print(current_batch)
        for run in range(1, NUM_RUNS + 1):
            qps, tps, latency = measure_batch_performance(model, tokenizer, current_batch)
            
            results[model_name][b_size]["latency"].append(latency)
            results[model_name][b_size]["qps"].append(qps)
            results[model_name][b_size]["tps"].append(tps)
            
            print(f"[{model_name} | Batch {b_size} | Run {run}]")
            print(f"  Latency: {latency:.2f}s | QPS: {qps:.2f} | TPS: {tps:.2f}")
    
    # Cleanup after each model
    del model
    del tokenizer
    gc.collect()
    torch.cuda.empty_cache()
    time.sleep(1)


--- Loading meta-llama/Llama-3.1-8B-Instruct ---


Loading checkpoint shards: 100%|██████████| 4/4 [00:24<00:00,  6.11s/it]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['Create a podcast intro about AI ethics with a guest scientist.']


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[HQ_Llama | Batch 1 | Run 1]
  Latency: 3.05s | QPS: 0.33 | TPS: 49.20


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[HQ_Llama | Batch 1 | Run 2]
  Latency: 2.97s | QPS: 0.34 | TPS: 50.48


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[HQ_Llama | Batch 1 | Run 3]
  Latency: 2.98s | QPS: 0.34 | TPS: 50.42


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[HQ_Llama | Batch 1 | Run 4]
  Latency: 2.97s | QPS: 0.34 | TPS: 50.43


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[HQ_Llama | Batch 1 | Run 5]
  Latency: 2.98s | QPS: 0.34 | TPS: 50.38


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[HQ_Llama | Batch 1 | Run 6]
  Latency: 2.97s | QPS: 0.34 | TPS: 50.42


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[HQ_Llama | Batch 1 | Run 7]
  Latency: 2.98s | QPS: 0.34 | TPS: 50.36


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[HQ_Llama | Batch 1 | Run 8]
  Latency: 2.98s | QPS: 0.34 | TPS: 50.40


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[HQ_Llama | Batch 1 | Run 9]
  Latency: 2.97s | QPS: 0.34 | TPS: 50.44


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[HQ_Llama | Batch 1 | Run 10]
  Latency: 2.97s | QPS: 0.34 | TPS: 50.45
['Create a podcast intro about AI ethics with a guest scientist.', 'Generate a fantasy podcast clip where Gandalf gives life advice.', 'Podcast clip about football news with enthusiastic commentary.', 'Create a podcast intro about AI ethics with a guest scientist.', 'Generate a fantasy podcast clip where Gandalf gives life advice.']


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[HQ_Llama | Batch 5 | Run 1]
  Latency: 6.33s | QPS: 0.79 | TPS: 118.56


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[HQ_Llama | Batch 5 | Run 2]
  Latency: 3.49s | QPS: 1.43 | TPS: 215.00


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[HQ_Llama | Batch 5 | Run 3]
  Latency: 3.51s | QPS: 1.43 | TPS: 213.86


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[HQ_Llama | Batch 5 | Run 4]
  Latency: 3.51s | QPS: 1.42 | TPS: 213.72


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[HQ_Llama | Batch 5 | Run 5]
  Latency: 3.51s | QPS: 1.43 | TPS: 213.98


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[HQ_Llama | Batch 5 | Run 6]
  Latency: 3.50s | QPS: 1.43 | TPS: 214.04


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[HQ_Llama | Batch 5 | Run 7]
  Latency: 3.51s | QPS: 1.43 | TPS: 213.94


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[HQ_Llama | Batch 5 | Run 8]
  Latency: 3.51s | QPS: 1.43 | TPS: 213.80


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[HQ_Llama | Batch 5 | Run 9]
  Latency: 3.50s | QPS: 1.43 | TPS: 214.11
[HQ_Llama | Batch 5 | Run 10]
  Latency: 3.70s | QPS: 1.35 | TPS: 202.56

--- Loading google/gemma-2-2b-it ---


Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.60s/it]


['Create a podcast intro about AI ethics with a guest scientist.']
[Fast_Gemma | Batch 1 | Run 1]
  Latency: 16.55s | QPS: 0.06 | TPS: 9.06
[Fast_Gemma | Batch 1 | Run 2]
  Latency: 1.91s | QPS: 0.52 | TPS: 78.46
[Fast_Gemma | Batch 1 | Run 3]
  Latency: 1.75s | QPS: 0.57 | TPS: 85.86
[Fast_Gemma | Batch 1 | Run 4]
  Latency: 1.75s | QPS: 0.57 | TPS: 85.92
[Fast_Gemma | Batch 1 | Run 5]
  Latency: 1.77s | QPS: 0.57 | TPS: 84.92
[Fast_Gemma | Batch 1 | Run 6]
  Latency: 1.81s | QPS: 0.55 | TPS: 83.00
[Fast_Gemma | Batch 1 | Run 7]
  Latency: 1.75s | QPS: 0.57 | TPS: 85.80
[Fast_Gemma | Batch 1 | Run 8]
  Latency: 1.75s | QPS: 0.57 | TPS: 85.59
[Fast_Gemma | Batch 1 | Run 9]
  Latency: 1.68s | QPS: 0.59 | TPS: 89.25
[Fast_Gemma | Batch 1 | Run 10]
  Latency: 1.73s | QPS: 0.58 | TPS: 86.71
['Create a podcast intro about AI ethics with a guest scientist.', 'Generate a fantasy podcast clip where Gandalf gives life advice.', 'Podcast clip about football news with enthusiastic commentary.', '

# Experiment settings:
- Hardware: RTX 4090
- Max Tokens 150
- Warm up yes
- Number of Runs 10
  

# HQ_Llama (8b)

### Batch 1:
- Latency: 2.98 secs
- QPS: 0.34
- TPS: 50.43

### Batch 5:
- Latency: 3.51 secs
- QPS: 1.43
- TPS: 213

# Fast_Gemma (2B)

### Batch 1:
- Latency: 1.75 secs
- QPS: 0.57
- TPS: 85

### Batch 5:
- Latency: 1.90 secs
- QPS: 2.7
- TPS: 400
