# Llama Instruct 4-bit AWQ with vLLM Compressor

This notebook applies 4-bit Activation-aware Weight Quantization (AWQ) using LLM Compressor (`llm-compressor`, from the vLLM project), serves the quantized model with vLLM, makes a sample call, and benchmarks it on 100 samples.

## Prerequisites
- GPU with compute capability >= 8.0 for fast W4A16 inference
- Set `HF_TOKEN` (or run `huggingface-cli login`) for gated models; the default `meta-llama/Llama-3.2-1B-Instruct` is gated
- Restart the kernel after installation if CUDA libraries change

In [None]:
# Install vLLM + llm-compressor and friends (pin llmcompressor 0.6.0.1: py3.12-friendly and numpy<2/vLLM compatible)
!pip install -q --upgrade \
  'vllm>=0.5.5' \
  'llmcompressor==0.6.0.1' \
  'transformers>=4.43' \
  datasets accelerate

## Configure model + paths
AWQ needs a small calibration dataset to compute per-channel scales.

In [None]:
from pathlib import Path
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# --- Source checkpoint -------------------------------------------------------
MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"

# --- Calibration corpus ------------------------------------------------------
CALIBRATION_DATASET = "HuggingFaceH4/ultrachat_200k"
CALIBRATION_SPLIT = "train_sft"
# AWQ benefits from a bit more calibration data than the bare minimum.
NUM_CALIBRATION_SAMPLES = 256
# Upper bound (in tokens) applied when tokenizing calibration samples.
MAX_SEQUENCE_LENGTH = 1024

# --- Output location ---------------------------------------------------------
QUANTIZED_DIR = Path("llama-awq-w4a16")
# Create the output directory up front; a pre-existing directory is fine.
QUANTIZED_DIR.mkdir(exist_ok=True)
print(f"Saving quantized model to: {QUANTIZED_DIR.resolve()}")


## Load model + tokenizer

In [None]:
# Load the checkpoint to be quantized. torch_dtype="auto" keeps the dtype stored
# in the checkpoint and device_map="auto" lets accelerate place the weights.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
)
# The tokenizer is loaded the same way as the model: without trust_remote_code,
# so no code from the model repository is executed on load.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)


## Build calibration set

In [None]:
# Slice the first NUM_CALIBRATION_SAMPLES rows of the split, then shuffle them
# with a fixed seed so the calibration order is reproducible.
calibration_slice = f"{CALIBRATION_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"
raw_ds = load_dataset(CALIBRATION_DATASET, split=calibration_slice).shuffle(seed=42)


def format_example(example):
    """Render one conversation (``example["messages"]``) to text with the chat template."""
    rendered = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return {"text": rendered}


formatted = raw_ds.map(format_example)


def tokenize(sample):
    """Tokenize ``sample["text"]`` without padding, truncated to MAX_SEQUENCE_LENGTH."""
    tokenizer_options = dict(
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        # presumably the chat template already emitted the special tokens — TODO confirm
        add_special_tokens=False,
    )
    return tokenizer(sample["text"], **tokenizer_options)


# Drop every original column so only the tokenizer outputs remain.
calibration_ds = formatted.map(tokenize, remove_columns=formatted.column_names)
print(calibration_ds)


## Quantize to 4-bit (W4A16) with AWQ
AWQ uses activation statistics from the calibration data to rescale the most salient weight channels before the weights are quantized. The default mappings cover Llama-family layers.

In [None]:
from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier

# AWQ recipe: apply the W4A16_ASYM scheme to every Linear layer except lm_head.
recipe = [
    AWQModifier(
        ignore=["lm_head"],
        scheme="W4A16_ASYM",
        targets=["Linear"],
        # The pinned llmcompressor==0.6.0.1 declares duo_scaling as a bool; the
        # string value "both" is only accepted by later releases and fails
        # validation here. True is the library default.
        duo_scaling=True,
    )
]

# One-shot calibration pass: runs the calibration samples through the model and
# applies the recipe to `model` in place.
oneshot(
    model=model,
    dataset=calibration_ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Persist the compressed weights together with the tokenizer so the directory
# can be loaded directly by vLLM.
model.save_pretrained(QUANTIZED_DIR, save_compressed=True)
tokenizer.save_pretrained(QUANTIZED_DIR)
print(f"Quantized model saved to {QUANTIZED_DIR}")


## Serve with vLLM

In [None]:
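# Example: launch the server. `vllm serve` blocks until the server exits, so start it in the
# background or from a separate terminal, and wait until it is ready before running the next cell.
# Unless `--served-model-name` is passed, the server advertises the model under the path given here.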
# !CUDA_VISIBLE_DEVICES=0 vllm serve "$QUANTIZED_DIR" \
#   --max-model-len 4096 \
#   --tensor-parallel-size 1 \
#   --port 8000 \
#   --api-key dummy


## Call the served model

In [None]:
from openai import OpenAI

# The api_key must match the --api-key passed to `vllm serve`.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

# Ask the server which model id it advertises instead of hard-coding one: the id
# is whatever was passed to `vllm serve` (or to --served-model-name), so a fixed
# string here is rejected as an unknown model when it does not match.
served_models = client.models.list().data
if not served_models:
    raise RuntimeError("The vLLM server at http://localhost:8000/v1 reports no models.")
served_model_id = served_models[0].id

resp = client.chat.completions.create(
    model=served_model_id,
    messages=[
        {"role": "system", "content": "You are a concise assistant."},
        {"role": "user", "content": "List two advantages of AWQ."},
    ],
    max_tokens=64,
)
print(resp.choices[0].message.content)


## Local inference without the server (optional)

In [None]:
from vllm import LLM, SamplingParams

# Single demo prompt for an in-process smoke test of the quantized checkpoint.
demo_prompts = [
    "Explain activation-aware quantization in one sentence.",
]

sampling = SamplingParams(temperature=0.2, max_tokens=64)
# Load the saved quantized directory in-process; this engine is reused by the benchmark cell.
local_llm = LLM(model=str(QUANTIZED_DIR), tensor_parallel_size=1)

outputs = local_llm.generate(demo_prompts, sampling)
first_completion = outputs[0].outputs[0]
print(first_completion.text.strip())


## Benchmark on a small dataset (100 samples)
Uses the `tweet_eval/sentiment` dataset (first 100 rows of the `train` split) as a quick accuracy probe.

In [None]:
from collections import Counter
from datasets import load_dataset
import numpy as np

# Integer class id -> label name, as used for both prompting and scoring.
LABELS = {0: "negative", 1: "neutral", 2: "positive"}
EVAL_SPLIT = "train[:100]"

eval_ds = load_dataset("tweet_eval", "sentiment", split=EVAL_SPLIT)

# One zero-shot classification prompt per tweet; the model completes after "Label:".
prompts = [
    f"""Classify the sentiment of the tweet as negative, neutral, or positive. Respond with a single word.

Tweet: {row['text']}

Label:"""
    for row in eval_ds
]

# Greedy decoding, at most 3 tokens, stopping at the first newline. The stop
# sequence must be written as the escape "\n": a literal line break inside a
# single-quoted string is a SyntaxError.
sampling_params = SamplingParams(temperature=0.0, max_tokens=3, stop=["\n"])
# Reuse the in-process engine created in the local-inference cell.
llm_for_eval = local_llm

generations = llm_for_eval.generate(prompts, sampling_params)

def normalize(text: str) -> str:
    """Map raw model output to one of the label names in ``LABELS``.

    The text is stripped and lower-cased. The first label name (in ``LABELS``
    order) that occurs anywhere in it wins. If none occurs, a ``pos``/``neg``
    prefix is accepted as a truncated answer. Anything else is ``"neutral"``.
    """
    cleaned = text.strip().lower()

    # Full label name mentioned anywhere in the output.
    mentioned = next((name for name in LABELS.values() if name in cleaned), None)
    if mentioned is not None:
        return mentioned

    # Truncated answers such as "pos" or "neg".
    prefix_fallbacks = (("pos", "positive"), ("neg", "negative"))
    for prefix, label in prefix_fallbacks:
        if cleaned.startswith(prefix):
            return label

    return "neutral"

# Predicted label per prompt, taken from the first completion of each request.
preds = [normalize(generation.outputs[0].text) for generation in generations]
# Reference label names looked up from the dataset's integer class ids.
gold = [LABELS[int(row["label"])] for row in eval_ds]

# Accuracy is the mean of the per-sample exact-match flags.
matches = [predicted == expected for predicted, expected in zip(preds, gold)]
acc = np.mean(matches)
counts = Counter(preds)

print(f"Accuracy on {len(gold)} samples: {acc:.3f}")
print("Prediction distribution:", counts)
