In [None]:
# Shell escape: print the NVIDIA driver's GPU status report so the hardware
# used for the benchmark is recorded in the notebook output.
!nvidia-smi

In [None]:
import torch, time, gc
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Benchmark configuration

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Display label -> model identifier handed to `load_model`.
MODELS = dict(
    HQ_Llama="meta-llama/Llama-3.1-8B-Instruct",
    Fast_Gemma="google/gemma-2-2b-it",
)

# Number of prompts submitted together in a single generate() call.
BATCH_SIZES = [1, 5]

# Number of times to repeat each batch
NUM_RUNS = 10

In [None]:
def load_model(model_name):
    """Load a causal LM and its tokenizer, set up for batched generation.

    Before loading, unreferenced Python objects are collected and PyTorch's
    cached CUDA blocks are released, so leftovers from a previously
    benchmarked model do not compete for memory with the new one.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        Tuple ``(model, tokenizer)``. The tokenizer always has a pad token and
        pads on the left; the model is loaded in float16 with
        ``device_map="auto"``.
    """
    print(f"\n--- Loading {model_name} ---")

    # Best-effort release of memory still held from an earlier model.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Batching requires a pad token; reuse EOS when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Generation continues from the last position of each row, so the padding
    # has to sit on the left of the prompt.
    tokenizer.padding_side = "left"

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16,
    )

    # Keep generate() consistent with the tokenizer: if the model ships no pad
    # id of its own, use the tokenizer's instead of letting generate() guess.
    generation_config = getattr(model, "generation_config", None)
    if generation_config is not None and getattr(generation_config, "pad_token_id", None) is None:
        generation_config.pad_token_id = tokenizer.pad_token_id

    return model, tokenizer

In [None]:
def warmup_model(model, tokenizer, prompt="Hello world", max_new_tokens=5):
    """Run one short, untimed generation so later timed runs skip start-up cost.

    Args:
        model: Model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer matching ``model``.
        prompt: Text (or list of texts) to generate from.
        max_new_tokens: Number of tokens to generate during warm-up.

    Returns:
        None. The generated tokens are discarded.
    """
    # Follow the model's own device instead of a global setting, so the inputs
    # always land where the model actually lives.
    device = model.device
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)

    gen_kwargs = {"max_new_tokens": max_new_tokens}
    # Pad with the tokenizer's pad id (when it has one) rather than leaving
    # generate() to pick a fallback.
    if tokenizer.pad_token_id is not None:
        gen_kwargs["pad_token_id"] = tokenizer.pad_token_id

    with torch.no_grad():
        model.generate(**inputs, **gen_kwargs)

    # CUDA kernels are launched asynchronously; wait until the warm-up has
    # really finished before returning.
    if device.type == "cuda":
        torch.cuda.synchronize()

In [None]:
def _count_generated_tokens(generated_ids, pad_token_id, eos_token_id=None):
    """Count generated tokens per batch, ignoring the padding after finished rows.

    Args:
        generated_ids: 2-D tensor holding only the newly generated positions
            (prompt columns already removed), one row per prompt.
        pad_token_id: Id used to fill rows that stopped early, or ``None`` when
            unknown (every position is then counted).
        eos_token_id: Tokenizer EOS id; only used to detect the case where EOS
            doubles as the pad token.

    Returns:
        Total number of real (non-padding) generated tokens as an ``int``.
    """
    if pad_token_id is None or generated_ids.numel() == 0:
        return int(generated_ids.numel())

    is_pad = (generated_ids == pad_token_id).long()
    # Scanning each row from the right, the running product stays 1 only while
    # every token seen so far is padding, so its sum is the length of the
    # trailing pad run.
    trailing_pads = is_pad.flip(dims=[1]).cumprod(dim=1).sum(dim=1)
    real_tokens = generated_ids.shape[1] - trailing_pads

    if eos_token_id == pad_token_id:
        # EOS and pad are the same id: the first token of the trailing run is
        # the EOS the model really produced, so give it back.
        real_tokens = real_tokens + (trailing_pads > 0).long()

    return int(real_tokens.sum().item())


def measure_batch_performance(model, tokenizer, prompts, max_new_tokens=150):
    """Time one batched ``generate`` call and derive throughput figures.

    Args:
        model: Model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer matching ``model``.
        prompts: Sequence of prompt strings; a single string is treated as a
            batch of one.
        max_new_tokens: Upper bound on generated tokens per prompt.

    Returns:
        Tuple ``(qps, tps, total_time)``: prompts per second, generated tokens
        per second (padding excluded) and elapsed seconds for the batch.

    Raises:
        ValueError: If ``prompts`` is empty.
    """
    # A bare string would otherwise make len() count characters, not prompts.
    prompts = [prompts] if isinstance(prompts, str) else list(prompts)
    if not prompts:
        raise ValueError("prompts must contain at least one prompt")
    batch_size = len(prompts)

    # Batch tokenization, placed on the device of the model being measured.
    device = model.device
    on_cuda = device.type == "cuda"
    inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(device)
    prompt_len = inputs.input_ids.shape[1]

    # Pass the pad id explicitly so the fill token of early-finished rows is
    # known when counting tokens below.
    pad_token_id = tokenizer.pad_token_id
    gen_kwargs = {"max_new_tokens": max_new_tokens}
    if pad_token_id is not None:
        gen_kwargs["pad_token_id"] = pad_token_id

    # CUDA work is asynchronous: synchronize on both sides of the timed region
    # so the clock covers exactly this generation.
    if on_cuda:
        torch.cuda.synchronize()
    # perf_counter is monotonic and high-resolution, unlike wall-clock time().
    start_time = time.perf_counter()

    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)

    if on_cuda:
        torch.cuda.synchronize()
    total_time = time.perf_counter() - start_time

    # Count only real tokens: rows that hit EOS early are padded up to the
    # longest row, and that padding is not generated work.
    total_tokens = _count_generated_tokens(
        outputs[:, prompt_len:],
        pad_token_id,
        getattr(tokenizer, "eos_token_id", None),
    )

    qps = batch_size / total_time
    tps = total_tokens / total_time

    return qps, tps, total_time

In [None]:
# --- BENCHMARK ---
# For every model: load it, warm it up, time NUM_RUNS generations per batch
# size, print per-run and mean figures, then free the model before the next one.

base_prompts = [
    "Create a podcast intro about AI ethics with a guest scientist.",
    "Generate a fantasy podcast clip where Gandalf gives life advice.",
    "Podcast clip about football news with enthusiastic commentary.",
]

# Store results: results[model label][batch size][metric] -> list of per-run values
results = {
    model_name: {b: {"latency": [], "qps": [], "tps": []} for b in BATCH_SIZES}
    for model_name in MODELS
}

for model_name, model_path in MODELS.items():
    model, tokenizer = load_model(model_path)

    try:
        warmup_model(model, tokenizer, prompt=base_prompts[0])

        for b_size in BATCH_SIZES:
            # Prepare batch: cycle through the base prompts until it is full.
            current_batch = [base_prompts[i % len(base_prompts)] for i in range(b_size)]
            print(current_batch)

            batch_results = results[model_name][b_size]
            for run in range(1, NUM_RUNS + 1):
                qps, tps, latency = measure_batch_performance(model, tokenizer, current_batch)

                batch_results["latency"].append(latency)
                batch_results["qps"].append(qps)
                batch_results["tps"].append(tps)

                print(f"[{model_name} | Batch {b_size} | Run {run}]")
                print(f"  Latency: {latency:.2f}s | QPS: {qps:.2f} | TPS: {tps:.2f}")

            # Summarise the batch so headline numbers come from the notebook
            # output rather than from averaging the run lines by hand.
            n_runs = len(batch_results["latency"])
            if n_runs:
                mean_latency = sum(batch_results["latency"]) / n_runs
                mean_qps = sum(batch_results["qps"]) / n_runs
                mean_tps = sum(batch_results["tps"]) / n_runs
                print(f"[{model_name} | Batch {b_size} | Mean of {n_runs} runs]")
                print(f"  Latency: {mean_latency:.2f}s | QPS: {mean_qps:.2f} | TPS: {mean_tps:.2f}")
    finally:
        # Cleanup after each model, also when a run fails: otherwise the model
        # stays referenced and keeps its memory while the next one loads.
        del model
        del tokenizer
        gc.collect()
        torch.cuda.empty_cache()
        time.sleep(1)

# Experiment settings:
- Hardware: RTX 4090
- Max new tokens: 150
- Warm-up: yes
- Number of runs: 10
  

# HQ_Llama (8B)

### Batch 1:
- Latency: 2.98 secs
- QPS: 0.34
- TPS: 50.43

### Batch 5:
- Latency: 3.51 secs
- QPS: 1.43
- TPS: 213

# Fast_Gemma (2B)

### Batch 1:
- Latency: 1.75 secs
- QPS: 0.57
- TPS: 85

### Batch 5:
- Latency: 1.90 secs
- QPS: 2.7
- TPS: 400
