In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

def quantize_hf(model_name, out_dir, bit=8):
    """Load a causal LM with bitsandbytes quantization and save it to ``out_dir``.

    Args:
        model_name: Model identifier or path, forwarded to
            ``AutoModelForCausalLM.from_pretrained``.
        out_dir: Destination directory, forwarded to ``save_pretrained``.
        bit: Quantization width; must be 8 or 4.

    Raises:
        ValueError: If ``bit`` is neither 4 nor 8.
    """
    # Validate first so an unsupported width fails fast, before any weights
    # are downloaded or loaded.
    if bit not in (4, 8):
        raise ValueError(
            f"bitsandbytes quantization supports bit=4 or bit=8, got {bit!r}"
        )

    # Keyword names cannot be built dynamically (``load_in_{bit}bit=`` is a
    # syntax error), so pick the matching flag explicitly.
    if bit == 8:
        bnb_config = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_threshold=6.0,  # outlier threshold; only meaningful for 8-bit
        )
    else:
        bnb_config = BitsAndBytesConfig(load_in_4bit=True)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
    )
    # NOTE(review): serializing quantized weights needs sufficiently recent
    # transformers/bitsandbytes releases -- confirm the pinned versions.
    model.save_pretrained(out_dir)


In [None]:
from vllm import LLM, QuantizationConfig

def quantize_vllm(model_name, out_dir, bit=4):
    """Load ``model_name`` through ``LLM`` with a quantization config and save it.

    Args:
        model_name: Passed as the first argument to ``LLM.from_pretrained``.
        out_dir: Passed to ``llm.save_pretrained``.
        bit: Forwarded as ``bits`` to ``QuantizationConfig`` (default 4).

    NOTE(review): this assumes vLLM exposes a top-level ``QuantizationConfig``
    and that ``LLM`` has ``from_pretrained`` / ``save_pretrained`` methods.
    That looks like the Hugging Face API shape rather than vLLM's (where
    ``LLM`` is normally constructed directly, e.g. with a ``quantization=``
    argument) -- verify against the installed vLLM version before relying on
    this cell.
    """
    # Quantization settings: requested bit width, bitsandbytes as the backend.
    qconfig = QuantizationConfig(bits=bit, backend="bitsandbytes")
    # TODO confirm: ``LLM.from_pretrained`` may not exist in vLLM.
    llm = LLM.from_pretrained(model_name, quantization_config=qconfig)
    # TODO confirm: ``save_pretrained`` may not exist on vLLM's ``LLM``.
    llm.save_pretrained(out_dir)

In [None]:
# assuming llama.cpp built and in PATH
# bit ∈ {4, 8}
!./quantize models/orig.gguf models/quantized_{bit}bit.gguf {bit}

In [None]:
import torch
from deepspeed import init_inference

def quantize_ds(model_name, out_dir, bit=8):
    """Initialise a DeepSpeed inference engine in int8 and save the result.

    Args:
        model_name: Forwarded as the first argument to ``init_inference``.
        out_dir: Destination passed to ``save_pretrained``.
        bit: Quantization width. Only 8 is supported: PyTorch has no
            ``torch.int4`` dtype, so other widths cannot be expressed through
            the ``dtype`` argument used here.

    Raises:
        ValueError: If ``bit`` is not 8.
    """
    # The previous fallback ``torch.int4`` does not exist and raised an opaque
    # AttributeError for every bit != 8; fail with a clear message instead.
    if bit != 8:
        raise ValueError(
            f"DeepSpeed quantization here supports only bit=8 (torch.int8), got {bit!r}"
        )
    dtype = torch.int8

    # NOTE(review): ``init_inference`` is documented as taking a model object;
    # here it receives ``model_name`` unchanged -- confirm whether the model
    # must be loaded first (e.g. via transformers).
    ds_model = init_inference(
        model_name,
        mp_size=1,
        dtype=dtype,
        replace_method="auto",
    )
    # NOTE(review): confirm the returned engine exposes ``save_pretrained``;
    # it may need to be called on the wrapped module instead.
    ds_model.save_pretrained(out_dir)


In [None]:
# Sweep every backend over every bit width and write results under quantized/.
import subprocess  # was used below without ever being imported
from pathlib import Path

# NOTE(review): confirm this repo id exists on the Hugging Face Hub.
MODEL_NAME = "facebook/llama-7b"

# llama.cpp's quantize tool takes a quantization *type*, not a bit width.
LLAMACPP_QTYPES = {16: "F16", 8: "Q8_0", 4: "Q4_0"}

# llama.cpp writes a single file and will not create the parent directory.
Path("quantized").mkdir(parents=True, exist_ok=True)

for backend in ["hf", "vllm", "llamacpp", "ds"]:
    for bit in [16, 8, 4]:
        out = f"quantized/{backend}_{bit}bit"
        try:
            if backend == "hf":
                quantize_hf(MODEL_NAME, out, bit=bit)
            elif backend == "vllm":
                quantize_vllm(MODEL_NAME, out, bit=bit)
            elif backend == "llamacpp":
                # check=True: a failed quantize run raises instead of passing silently.
                subprocess.run(
                    ["quantize", "models/llama-7b.gguf", f"{out}.gguf", LLAMACPP_QTYPES[bit]],
                    check=True,
                )
            elif backend == "ds":
                quantize_ds(MODEL_NAME, out, bit=bit)
        except ValueError as err:
            # Not every backend supports every width; report and continue so
            # one unsupported combination does not abort the whole sweep.
            print(f"skipping {backend} @ {bit}-bit: {err}")