# LLM-Inference-Bench

GPU benchmarking toolkit for LLM inference performance evaluation.

**Metrics measured:**
- Throughput (tokens/second)
- Time to First Token (TTFT)
- Per-token latency
- GPU memory usage

---

## 1. Setup Environment

In [None]:
# Install dependencies
!pip install -q torch transformers accelerate bitsandbytes
!pip install -q pandas matplotlib seaborn tqdm pynvml

In [None]:
# Report the CUDA device PyTorch can see, or explain how to enable one
import torch

if not torch.cuda.is_available():
    print("WARNING: No GPU detected. Go to Runtime > Change runtime type > GPU")
else:
    gpu_name = torch.cuda.get_device_name(0)
    # total_memory is reported in bytes; convert to GiB for display
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    gpu_memory = total_bytes / (1024**3)
    print(f"GPU: {gpu_name}")
    print(f"Memory: {gpu_memory:.1f} GB")
    print(f"CUDA Version: {torch.version.cuda}")

## 2. Import Benchmark Modules

In [None]:
import sys
import time
import gc
from dataclasses import dataclass, asdict
from typing import List, Dict, Any, Tuple, Optional

import torch
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [None]:
# Short alias -> model path, for the models this notebook supports on a Colab T4.
# load_model() falls back to using the given name as the path when it is not a key here.
SUPPORTED_MODELS: Dict[str, str] = {
    # TinyLlama 1.1B chat checkpoint
    "tiny": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    # Microsoft Phi-2
    "phi2": "microsoft/phi-2",
    # Google Gemma 2B
    "gemma-2b": "google/gemma-2b",
}

# Prompts used when a benchmark is run without custom inputs:
# one explanation task, one coding task, one open-ended question.
DEFAULT_PROMPTS: List[str] = [
    "Explain the concept of machine learning in simple terms.",
    "Write a Python function to calculate the factorial of a number.",
    "What are the benefits of using renewable energy sources?",
]

## 3. Model Loading Utilities

In [None]:
def get_quantization_config(quantization: str) -> Optional[BitsAndBytesConfig]:
    """Translate a precision label into a BitsAndBytes config.

    Args:
        quantization: "int8" or "int4" select a quantized load; every other
            value (for example "fp16") means no BitsAndBytes quantization.

    Returns:
        A ``BitsAndBytesConfig`` for "int8"/"int4", otherwise ``None``.
    """
    if quantization == "int4":
        # 4-bit NF4 weights with double quantization and fp16 compute
        four_bit_options = {
            "load_in_4bit": True,
            "bnb_4bit_compute_dtype": torch.float16,
            "bnb_4bit_use_double_quant": True,
            "bnb_4bit_quant_type": "nf4",
        }
        return BitsAndBytesConfig(**four_bit_options)

    if quantization == "int8":
        return BitsAndBytesConfig(load_in_8bit=True)

    return None


def load_model(model_name: str, quantization: str = "fp16"):
    """Load a causal LM and its tokenizer at the requested precision.

    Args:
        model_name: Key of ``SUPPORTED_MODELS``; any other value is used
            unchanged as the model path.
        quantization: "fp16" loads half-precision weights; "int8"/"int4" load
            through the config returned by ``get_quantization_config``; any
            other value loads with the library's default dtype.

    Returns:
        ``(model, tokenizer)``, with the model switched to eval mode.
    """
    model_path = SUPPORTED_MODELS.get(model_name, model_name)
    print(f"Loading {model_path} with {quantization} precision...")

    # NOTE(review): trust_remote_code=True lets the checkpoint run its own
    # Python code; only point this at repositories you trust.
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # generate() needs a pad token for batched input; reuse EOS
        tokenizer.pad_token = tokenizer.eos_token
    # Decoder-only models continue from the last position of each row, so
    # batched prompts must be padded on the left; right padding would make
    # the shorter prompts generate after a run of pad tokens.
    tokenizer.padding_side = "left"

    quant_config = get_quantization_config(quantization)
    model_kwargs = {"trust_remote_code": True, "device_map": "auto"}

    if quantization == "fp16":
        model_kwargs["torch_dtype"] = torch.float16
    elif quant_config is not None:
        model_kwargs["quantization_config"] = quant_config

    model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs)
    model.eval()

    print(f"Model loaded on {next(model.parameters()).device}")
    return model, tokenizer

## 4. Benchmark Functions

In [None]:
def _count_generated_tokens(generated, eos_token_id):
    """Count tokens produced per row, up to and including the first EOS.

    ``generated`` is the 2-D slice of ``generate`` output that follows the
    prompt. Positions after a row's first EOS are padding added to keep the
    batch rectangular and are not counted. With ``eos_token_id`` of ``None``
    every position counts.
    """
    if eos_token_id is None:
        return generated.numel()
    is_eos = generated == eos_token_id
    # Number of EOS tokens strictly before each position; zero means the
    # position is at or before the row's first EOS.
    eos_before = is_eos.cumsum(dim=1) - is_eos.long()
    return int((eos_before == 0).sum().item())


def benchmark_throughput(model, tokenizer, prompts, max_new_tokens=128,
                         batch_size=1, num_warmup=2, num_runs=5):
    """Benchmark generation throughput (tokens/second) for one batch.

    Args:
        model: Model exposing ``parameters()`` and ``generate()``.
        tokenizer: Tokenizer used to encode the batch; its ``pad_token_id``
            is passed to ``generate``.
        prompts: Non-empty sequence of prompt strings, cycled to fill the batch.
        max_new_tokens: Upper bound on tokens generated per sequence.
        batch_size: Number of sequences generated together (>= 1).
        num_warmup: Untimed generations executed before measuring.
        num_runs: Timed generations to average over (>= 1).

    Returns:
        Dict with ``tokens_per_second``, ``batch_size`` and
        ``total_time_seconds`` (the mean wall-clock time of one timed run).

    Raises:
        ValueError: If ``prompts`` is empty, or ``batch_size`` or
            ``num_runs`` is below 1.
    """
    if not prompts:
        raise ValueError("prompts must contain at least one prompt")
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    if num_runs < 1:
        raise ValueError("num_runs must be >= 1")

    device = next(model.parameters()).device
    # Synchronizing is only meaningful (and only possible) with CUDA present
    use_cuda = torch.cuda.is_available()

    # Prepare batch: cycle through the prompts until it is full
    batch_prompts = [prompts[i % len(prompts)] for i in range(batch_size)]
    inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True,
                       truncation=True, max_length=512).to(device)
    input_tokens = inputs.input_ids.shape[1]
    eos_token_id = getattr(tokenizer, "eos_token_id", None)

    generate_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": False,
        "pad_token_id": tokenizer.pad_token_id,
    }

    # Warmup
    for _ in range(num_warmup):
        with torch.no_grad():
            _ = model.generate(**inputs, **generate_kwargs)

    # Benchmark
    total_time = 0.0
    total_tokens = 0

    for _ in tqdm(range(num_runs), desc="Throughput"):
        if use_cuda:
            # Drain queued kernels so the timer starts from an idle GPU
            torch.cuda.synchronize()
        start = time.perf_counter()

        with torch.no_grad():
            outputs = model.generate(**inputs, **generate_kwargs)

        if use_cuda:
            # generate() can return before the GPU has finished its work
            torch.cuda.synchronize()
        total_time += time.perf_counter() - start
        # Rows that stop early are padded to the longest row; count only the
        # tokens each row really produced instead of padded_length * batch.
        total_tokens += _count_generated_tokens(outputs[:, input_tokens:],
                                                eos_token_id)

    avg_time = total_time / num_runs
    avg_tokens = total_tokens / num_runs

    return {
        "tokens_per_second": avg_tokens / avg_time if avg_time > 0 else 0.0,
        "batch_size": batch_size,
        "total_time_seconds": avg_time,
    }

In [None]:
def benchmark_latency(model, tokenizer, prompt, max_new_tokens=128,
                      num_warmup=2, num_runs=10):
    """Benchmark single-prompt latency (TTFT and per-token).

    Time to first token is approximated by timing ``generate`` with
    ``max_new_tokens=1``. Per-token latency is the remaining time of a full
    generation spread over the tokens after the first.

    Args:
        model: Model exposing ``parameters()`` and ``generate()``.
        tokenizer: Tokenizer used to encode ``prompt``; its ``pad_token_id``
            is passed to ``generate``.
        prompt: Prompt string to generate from.
        max_new_tokens: Upper bound on tokens for the full generation runs.
        num_warmup: Untimed generations executed before measuring.
        num_runs: Timed runs per measurement (>= 1).

    Returns:
        Dict with ``time_to_first_token_ms``, ``per_token_latency_ms``,
        ``total_generation_time_ms`` and ``tokens_generated`` (mean over the
        runs, truncated to int).

    Raises:
        ValueError: If ``num_runs`` is below 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be >= 1")

    device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
                       max_length=512).to(device)
    input_length = inputs.input_ids.shape[1]
    # Synchronizing is only meaningful (and only possible) with CUDA present
    use_cuda = torch.cuda.is_available()

    def timed_generate(new_tokens):
        """Run one greedy generation; return (elapsed milliseconds, output ids)."""
        if use_cuda:
            torch.cuda.synchronize()
        start = time.perf_counter()
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=new_tokens,
                                        do_sample=False,
                                        pad_token_id=tokenizer.pad_token_id)
        if use_cuda:
            # generate() can return before the GPU has finished its work
            torch.cuda.synchronize()
        return (time.perf_counter() - start) * 1000, output_ids

    # Warmup
    for _ in range(num_warmup):
        with torch.no_grad():
            _ = model.generate(**inputs, max_new_tokens=max_new_tokens,
                               do_sample=False, pad_token_id=tokenizer.pad_token_id)

    # Measure TTFT
    ttft_times = []
    for _ in tqdm(range(num_runs), desc="TTFT"):
        elapsed_ms, _ = timed_generate(1)
        ttft_times.append(elapsed_ms)

    # Measure full generation
    gen_times = []
    tokens_list = []
    for _ in tqdm(range(num_runs), desc="Generation"):
        elapsed_ms, outputs = timed_generate(max_new_tokens)
        gen_times.append(elapsed_ms)
        tokens_list.append(outputs.shape[1] - input_length)

    avg_ttft = sum(ttft_times) / len(ttft_times)
    avg_gen_time = sum(gen_times) / len(gen_times)
    avg_tokens = sum(tokens_list) / len(tokens_list)

    # Timing noise can make a very short generation look faster than the
    # TTFT run; clamp so the reported latency is never negative.
    decode_time = max(avg_gen_time - avg_ttft, 0.0)
    # The first token is already covered by TTFT, hence avg_tokens - 1
    per_token = decode_time / max(avg_tokens - 1, 1)

    return {
        "time_to_first_token_ms": avg_ttft,
        "per_token_latency_ms": per_token,
        "total_generation_time_ms": avg_gen_time,
        "tokens_generated": int(avg_tokens),
    }

In [None]:
def benchmark_memory(model, tokenizer, prompt, max_new_tokens=128, batch_size=1):
    """Benchmark GPU memory usage for one batched generation.

    Memory statistics are read from the device that holds the model's first
    parameter. The peak counter is reset just before generation, so the peak
    starts from whatever is already allocated at that point.

    Args:
        model: Model exposing ``parameters()`` and ``generate()``.
        tokenizer: Tokenizer used to encode the batch; its ``pad_token_id``
            is passed to ``generate``.
        prompt: Prompt string, repeated ``batch_size`` times.
        max_new_tokens: Upper bound on tokens generated per sequence.
        batch_size: Number of copies of ``prompt`` generated together.

    Returns:
        Dict with ``peak_memory_mb`` and ``allocated_memory_mb``. Both are
        ``0.0`` when the model is not on a CUDA device, because no CUDA
        allocator statistics exist in that case.
    """
    device = next(model.parameters()).device
    on_cuda = device.type == "cuda"

    gc.collect()
    if on_cuda:
        # Release cached blocks and restart peak tracking on the model's device
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats(device)

    prompts = [prompt] * batch_size
    inputs = tokenizer(prompts, return_tensors="pt", padding=True,
                       truncation=True, max_length=512).to(device)

    with torch.no_grad():
        _ = model.generate(**inputs, max_new_tokens=max_new_tokens,
                           do_sample=False, pad_token_id=tokenizer.pad_token_id)

    if not on_cuda:
        return {"peak_memory_mb": 0.0, "allocated_memory_mb": 0.0}

    bytes_per_mb = 1024**2
    return {
        "peak_memory_mb": torch.cuda.max_memory_allocated(device) / bytes_per_mb,
        "allocated_memory_mb": torch.cuda.memory_allocated(device) / bytes_per_mb,
    }

## 5. Run Benchmarks

In [None]:
def run_full_benchmark(model_name, quantization, batch_size=1, max_new_tokens=128, num_runs=5):
    """Run the throughput, latency and memory benchmarks for one configuration.

    Args:
        model_name: Model alias or path, forwarded to ``load_model``.
        quantization: Precision label, forwarded to ``load_model``.
        batch_size: Batch size for the throughput and memory benchmarks.
        max_new_tokens: Upper bound on generated tokens in every benchmark.
        num_runs: Timed runs for the throughput and latency benchmarks.

    Returns:
        Dict with ``model_name``, ``quantization``, ``batch_size`` and the
        nested ``throughput``, ``latency`` and ``memory`` result dicts.
    """
    print(f"\n{'='*60}")
    print(f"Benchmarking: {model_name} ({quantization})")
    print(f"{'='*60}\n")

    model, tokenizer = load_model(model_name, quantization)

    try:
        print("\n[1/3] Throughput...")
        throughput = benchmark_throughput(model, tokenizer, DEFAULT_PROMPTS,
                                          max_new_tokens, batch_size, num_runs=num_runs)

        print("\n[2/3] Latency...")
        # Latency is measured on a single prompt, independent of batch_size
        latency = benchmark_latency(model, tokenizer, DEFAULT_PROMPTS[0],
                                    max_new_tokens, num_runs=num_runs)

        print("\n[3/3] Memory...")
        memory = benchmark_memory(model, tokenizer, DEFAULT_PROMPTS[0],
                                  max_new_tokens, batch_size)

        return {
            "model_name": model_name,
            "quantization": quantization,
            "batch_size": batch_size,
            "throughput": throughput,
            "latency": latency,
            "memory": memory,
        }
    finally:
        # Cleanup runs even when a benchmark raises (for example on
        # out-of-memory), so the next configuration starts from a freed GPU.
        del model, tokenizer
        gc.collect()
        torch.cuda.empty_cache()

In [None]:
# Benchmark TinyLlama once per precision; a failing precision is reported
# and skipped so the remaining ones still run.
results = []

for quant in ("fp16", "int8", "int4"):
    try:
        result = run_full_benchmark("tiny", quant, num_runs=3)
    except Exception as e:
        print(f"Error with {quant}: {e}")
    else:
        results.append(result)

## 6. Results Summary

In [None]:
# Flatten each nested result into one display-ready row (numbers as formatted strings)
summary_data = [
    {
        "Model": r["model_name"],
        "Quantization": r["quantization"],
        "Throughput (tok/s)": f"{r['throughput']['tokens_per_second']:.2f}",
        "TTFT (ms)": f"{r['latency']['time_to_first_token_ms']:.2f}",
        "Per-Token (ms)": f"{r['latency']['per_token_latency_ms']:.2f}",
        # peak memory is recorded in MB; show it in GB
        "Peak Memory (GB)": f"{r['memory']['peak_memory_mb']/1024:.2f}",
    }
    for r in results
]
df = pd.DataFrame(summary_data)

rule = "=" * 70
print("\n" + rule)
print("BENCHMARK RESULTS")
print(rule)
display(df)

## 7. Visualizations

In [None]:
# Bar chart of throughput, one bar per benchmarked configuration
labels = []
throughputs = []
for r in results:
    labels.append(f"{r['model_name']}\n({r['quantization']})")
    throughputs.append(r['throughput']['tokens_per_second'])

plt.figure(figsize=(10, 5))
palette = sns.color_palette("husl", len(results))
bars = plt.bar(labels, throughputs, color=palette)
plt.title("Throughput Comparison", fontsize=14, fontweight='bold')
plt.ylabel("Tokens per Second", fontsize=12)

# Write each value just above the top of its bar
for bar, val in zip(bars, throughputs):
    x_center = bar.get_x() + bar.get_width()/2
    y_text = bar.get_height() + 0.5
    plt.text(x_center, y_text, f'{val:.1f}',
             ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.savefig('throughput_comparison.png', dpi=150)
plt.show()

In [None]:
# Bar chart of peak GPU memory (GB), one bar per benchmarked configuration
labels = []
memory = []
for r in results:
    labels.append(f"{r['model_name']}\n({r['quantization']})")
    # results store MB; the chart is in GB
    memory.append(r['memory']['peak_memory_mb']/1024)

plt.figure(figsize=(10, 5))
palette = sns.color_palette("husl", len(results))
bars = plt.bar(labels, memory, color=palette)
plt.title("GPU Memory Usage", fontsize=14, fontweight='bold')
plt.ylabel("Peak Memory (GB)", fontsize=12)

# Write each value just above the top of its bar
for bar, val in zip(bars, memory):
    x_center = bar.get_x() + bar.get_width()/2
    y_text = bar.get_height() + 0.02
    plt.text(x_center, y_text, f'{val:.2f}',
             ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.savefig('memory_comparison.png', dpi=150)
plt.show()

In [None]:
# Grouped bar chart: TTFT beside per-token latency for each configuration
labels = []
ttft = []
per_token = []
for r in results:
    labels.append(f"{r['model_name']}\n({r['quantization']})")
    ttft.append(r['latency']['time_to_first_token_ms'])
    per_token.append(r['latency']['per_token_latency_ms'])

width = 0.35
x = range(len(labels))
# Shift the two series half a bar width to either side of each tick
ttft_positions = [i - width/2 for i in x]
per_token_positions = [i + width/2 for i in x]

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(ttft_positions, ttft, width, label='TTFT (ms)')
ax.bar(per_token_positions, per_token, width, label='Per-Token (ms)')

ax.set_title('Latency Breakdown', fontsize=14, fontweight='bold')
ax.set_ylabel('Latency (ms)', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

plt.tight_layout()
plt.savefig('latency_comparison.png', dpi=150)
plt.show()

## 8. Save Results

In [None]:
import json
from datetime import datetime

# One timestamp shared by both files so a run's outputs stay paired
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
json_name = f'benchmark_results_{timestamp}.json'
csv_name = f'benchmark_summary_{timestamp}.csv'

# Full nested results as JSON
with open(json_name, 'w') as f:
    json.dump(results, f, indent=2)

# Formatted summary table as CSV
df.to_csv(csv_name, index=False)

print("Results saved!")
for saved_name in (json_name, csv_name):
    print(f"- {saved_name}")

---

## Next Steps

1. **Try different models**: Change `"tiny"` to `"phi2"` or add your own HuggingFace model path
2. **Test batch scaling**: Modify `batch_size` parameter to see throughput scaling
3. **Compare quantizations across models**: Rerun the FP16 vs INT8 vs INT4 comparison for each model you try
4. **Export charts**: Use for README and LinkedIn posts