# Llama Instruct 4-bit GPTQ with vLLM Compressor

This notebook quantizes a Llama Instruct model to 4-bit with GPTQ using LLM Compressor (`llm-compressor`), serves the quantized model with vLLM, runs a streaming smoke test, evaluates 1000 examples, and compares accuracy against the base model.


## Prerequisites
- GPU with compute capability >= 8.0 for fast W4A16 inference
- Set `HUGGINGFACE_HUB_TOKEN` if the model is gated
- Restart the kernel after installation if CUDA libraries change


In [None]:
# Install dependencies from requirements.txt (run once per environment).
# pip is invoked through sys.executable so the packages are installed for the
# interpreter that is running this kernel rather than whichever `pip` happens
# to be first on PATH.
import sys
!{sys.executable} -m pip install -q --upgrade -r requirements.txt


## 1. Set up Hugging Face access


In [None]:
import os

# Read the Hugging Face access token from the environment. To use a token for
# this notebook session only, assign the "hf_..." string to
# HUGGINGFACE_HUB_TOKEN right after this line instead.
HUGGINGFACE_HUB_TOKEN = os.environ.get("HUGGINGFACE_HUB_TOKEN")

# A missing (or empty) token is only a warning: public models still load,
# while gated ones are expected to fail later.
if HUGGINGFACE_HUB_TOKEN is None or HUGGINGFACE_HUB_TOKEN == "":
    print("HUGGINGFACE_HUB_TOKEN not set; gated models may fail to load.")


## 2. Configure model + paths
GPTQ needs a small calibration dataset to compute quantization parameters.


In [None]:
from pathlib import Path
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# --- Model and output locations -------------------------------------------
MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"
BASE_MODEL_PATH = None  # filled in by the snapshot download step below
QUANTIZED_DIR = Path("llama-gptq-w4a16")

# --- Calibration data -------------------------------------------------------
CALIBRATION_DATASET = "HuggingFaceH4/ultrachat_200k"
CALIBRATION_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 128  # a small calibration set is enough for GPTQ
MAX_SEQUENCE_LENGTH = 2048

# Create the output directory up front (a no-op if it already exists) and
# report its absolute location.
QUANTIZED_DIR.mkdir(exist_ok=True)
print(f"Saving quantized model to: {QUANTIZED_DIR.resolve()}")


## 2.1 Download the base model (cache snapshot)


In [None]:
import os
from huggingface_hub import snapshot_download

# Interpret HF_HUB_OFFLINE the way huggingface_hub does: only 1/ON/YES/TRUE
# (case-insensitive) switch offline mode on. Calling bool() on the raw string
# would treat HF_HUB_OFFLINE=0 as "offline", because any non-empty string is
# truthy.
hub_offline = os.getenv("HF_HUB_OFFLINE", "").strip().upper() in {
    "1",
    "ON",
    "YES",
    "TRUE",
}

# Without a token no download is attempted either; the snapshot must then
# already be present in the local cache.
local_files_only = hub_offline or not HUGGINGFACE_HUB_TOKEN

BASE_MODEL_PATH = snapshot_download(
    MODEL_ID,
    token=HUGGINGFACE_HUB_TOKEN,
    local_files_only=local_files_only,
)
print(f"Base model snapshot: {BASE_MODEL_PATH}")


## 3. Load model + tokenizer


In [None]:
# Prefer the local snapshot from the download step; fall back to the hub ID
# when that step was skipped and BASE_MODEL_PATH is still None.
MODEL_SOURCE = BASE_MODEL_PATH or MODEL_ID

model = AutoModelForCausalLM.from_pretrained(
    MODEL_SOURCE,
    torch_dtype="auto",
    device_map="auto",
    token=HUGGINGFACE_HUB_TOKEN,
)

# trust_remote_code is intentionally not enabled. The model above is loaded
# without it, so the tokenizer should not be the one place where code shipped
# inside the model repository is allowed to execute on this machine.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_SOURCE,
    token=HUGGINGFACE_HUB_TOKEN,
)


## 3.1 Build calibration set


In [None]:
# Load only the first NUM_CALIBRATION_SAMPLES rows of the split, then shuffle
# that slice with a fixed seed so the ordering is reproducible.
calibration_slice = f"{CALIBRATION_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"
raw_ds = load_dataset(
    CALIBRATION_DATASET,
    split=calibration_slice,
    token=HUGGINGFACE_HUB_TOKEN,
)
raw_ds = raw_ds.shuffle(seed=42)

def format_example(example):
    """Render one conversation into a single string.

    Applies the tokenizer's chat template to ``example["messages"]`` without
    tokenizing, and returns the rendered text under the ``"text"`` key.
    """
    rendered = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return {"text": rendered}


formatted = raw_ds.map(format_example)

def tokenize(sample):
    """Tokenize the ``"text"`` field of one calibration sample.

    Sequences are truncated to ``MAX_SEQUENCE_LENGTH`` and left unpadded.
    """
    tokenizer_options = {
        "padding": False,
        "max_length": MAX_SEQUENCE_LENGTH,
        "truncation": True,
        # NOTE(review): presumably disabled because the chat template already
        # inserted the special tokens into the text; confirm for this model.
        "add_special_tokens": False,
    }
    return tokenizer(sample["text"], **tokenizer_options)


# Keep only the tokenizer outputs: every column that existed before the map
# is removed.
original_columns = formatted.column_names
calibration_ds = formatted.map(tokenize, remove_columns=original_columns)
print(calibration_ds)


## 3.2 Quantize to 4-bit (W4A16) with GPTQ
GPTQ performs weight-only quantization with calibration data.


In [None]:
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization.gptq import GPTQModifier

# GPTQ settings: the W4A16 scheme is applied to modules matched by the
# "Linear" target, and "lm_head" is listed under `ignore`.
gptq_modifier = GPTQModifier(
    targets=["Linear"],
    scheme="W4A16",
    ignore=["lm_head"],
    block_size=128,
    dampening_frac=0.01,
)
recipe = [gptq_modifier]

# Run the one-shot pass on the in-memory model using the calibration set.
oneshot(
    model=model,
    recipe=recipe,
    dataset=calibration_ds,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    max_seq_length=MAX_SEQUENCE_LENGTH,
)

# Save the weights in compressed form, with the tokenizer alongside them, so
# QUANTIZED_DIR holds both.
model.save_pretrained(QUANTIZED_DIR, save_compressed=True)
tokenizer.save_pretrained(QUANTIZED_DIR)
print(f"Quantized model saved to {QUANTIZED_DIR}")


## 4. Serve the quantized model


Run this in a separate terminal so the notebook can continue:

```bash
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
python scripts/serve_vllm.py \
  --model llama-gptq-w4a16 --quantization none \
  --port 8000 --api-key dummy \
  --gpu-memory-utilization 0.7 --max-model-len 2048 --max-num-seqs 32 \
  --served-model-name gptq-llama
```


## 4.5 Smoke test (streaming hello world)


In [None]:
from openai import OpenAI

QUANT_BASE_URL = "http://localhost:8000/v1"
# Must match the --served-model-name the quantized server was started with.
QUANT_SERVED_MODEL = "gptq-llama"

smoke_messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Hello world in one short sentence."},
]

client = OpenAI(base_url=QUANT_BASE_URL, api_key="dummy")
stream = client.chat.completions.create(
    model=QUANT_SERVED_MODEL,
    messages=smoke_messages,
    max_tokens=32,
    stream=True,
)

# Print each streamed fragment as it arrives; chunks without text content
# contribute an empty string.
for chunk in stream:
    fragment = chunk.choices[0].delta.content
    print(fragment if fragment else "", end="", flush=True)
print()


## 5. Run 1000 jobs on the quantized model


In [None]:
import sys

# Run scripts/run_batch.py against the quantized server (port 8000, served
# model "gptq-llama") on the xsum task, capped at 1000 samples. The local
# quantized model directory is passed as --tokenizer, and --output points at
# results/gptq.jsonl, the file the Rouge-L cell loads later.
!{sys.executable} scripts/run_batch.py \
  --base-url http://localhost:8000/v1 \
  --model gptq-llama \
  --task xsum --tokenizer llama-gptq-w4a16 --max-context-tokens 2048 --context-buffer 128 \
  --output results/gptq.jsonl --max-samples 1000


## 6. Serve the non-quantized (base) model


In [None]:
import shlex

# Mirror the model-loading cell: use the local snapshot when the download step
# ran, otherwise fall back to the hub ID instead of emitting "--model None".
base_model_source = BASE_MODEL_PATH or MODEL_ID

# Each entry is one line of the shell command. Joining them with a trailing
# backslash plus newline yields a multi-line command that can be pasted into a
# terminal as-is.
base_cmd_lines = [
    "PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
    "python scripts/serve_unquantized.py",
    # Quote the path so a snapshot directory containing spaces or shell
    # metacharacters is still passed as a single argument.
    f"  --model {shlex.quote(str(base_model_source))}",
    "  --port 8001 --api-key dummy",
    "  --gpu-memory-utilization 0.7 --max-model-len 2048 --max-num-seqs 16",
    "  --served-model-name base-llama",
]
base_cmd = " \\\n".join(base_cmd_lines)
print(base_cmd)


Stop the quantized server first, then run one of the commands below:

```bash
# Use the local snapshot to avoid extra HF downloads
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
python scripts/serve_unquantized.py \
  --model /path/to/local/snapshot \
  --port 8001 --api-key dummy \
  --gpu-memory-utilization 0.7 --max-model-len 2048 --max-num-seqs 16 \
  --served-model-name base-llama

# Or pull from HF (requires access + token)
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
python scripts/serve_unquantized.py \
  --port 8001 --api-key dummy \
  --gpu-memory-utilization 0.7 --max-model-len 2048 --max-num-seqs 16 \
  --served-model-name base-llama
```


## 6.5 Smoke test (streaming hello world)


In [None]:
from openai import OpenAI

BASE_BASE_URL = "http://localhost:8001/v1"
# Must match the --served-model-name the base server was started with.
BASE_SERVED_MODEL = "base-llama"

smoke_messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Hello world in one short sentence."},
]

client = OpenAI(base_url=BASE_BASE_URL, api_key="dummy")
stream = client.chat.completions.create(
    model=BASE_SERVED_MODEL,
    messages=smoke_messages,
    max_tokens=32,
    stream=True,
)

# Print each streamed fragment as it arrives; chunks without text content
# contribute an empty string.
for chunk in stream:
    fragment = chunk.choices[0].delta.content
    print(fragment if fragment else "", end="", flush=True)
print()


## 7. Run 1000 jobs on the base model


In [None]:
import sys

# Run scripts/run_batch.py against the base server (port 8001, served model
# "base-llama") with the same task, context limits, and sample cap as the
# quantized run. The hub model ID is passed as --tokenizer, and --output
# points at results/base.jsonl, the file the Rouge-L cell loads later.
!{sys.executable} scripts/run_batch.py \
  --base-url http://localhost:8001/v1 \
  --model base-llama \
  --task xsum --tokenizer meta-llama/Llama-3.2-1B-Instruct --max-context-tokens 2048 --context-buffer 128 \
  --output results/base.jsonl --max-samples 1000


## 8. Measure accuracy (Rouge-L)


In [None]:
from pathlib import Path
import json
from statistics import mean
from rouge_score import rouge_scorer

def load_rows(path: Path):
    """Read a JSON Lines file and return its records as a list.

    Lines that are empty or contain only whitespace are skipped; every other
    line is parsed with ``json.loads``.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    if not path.exists():
        raise FileNotFoundError(f"Missing results file: {path}")

    records = []
    with path.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue
            records.append(json.loads(raw_line))
    return records

def rouge_l(rows):
    """Return the mean ROUGE-L F-measure over ``rows``.

    Each row must provide ``"reference"`` and ``"prediction"`` entries; they
    are passed to the scorer as target and prediction respectively.

    Raises:
        statistics.StatisticsError: if ``rows`` is empty. This is the same
            exception type ``mean`` would raise, but the message names the
            actual problem.
    """
    if not rows:
        # Fail before building the scorer, and say what went wrong instead of
        # surfacing "mean requires at least one data point".
        from statistics import StatisticsError

        raise StatisticsError(
            "rouge_l() received no rows; the results file contains no samples."
        )

    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    scores = [
        scorer.score(row["reference"], row["prediction"])["rougeL"].fmeasure
        for row in rows
    ]
    return mean(scores)

# Load both result files first so a missing file is reported before any
# scoring work starts.
gptq_rows = load_rows(Path("results/gptq.jsonl"))
base_rows = load_rows(Path("results/base.jsonl"))

# Score each run with the same metric.
gptq_score = rouge_l(gptq_rows)
base_score = rouge_l(base_rows)

# A positive delta means the quantized model scored higher than the base.
score_delta = gptq_score - base_score

print(f"GPTQ Rouge-L: {gptq_score:.4f} ({len(gptq_rows)} samples)")
print(f"Base Rouge-L: {base_score:.4f} ({len(base_rows)} samples)")
print(f"Delta (GPTQ - Base): {score_delta:+.4f}")
