In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

def quantize_hf(model_name, out_dir, bit=8):
    """Load a causal LM with a bitsandbytes config and save it to ``out_dir``.

    Args:
        model_name: Identifier or path handed to
            ``AutoModelForCausalLM.from_pretrained``.
        out_dir: Destination handed to ``save_pretrained``.
        bit: Quantization width; must be 4 or 8.

    Raises:
        ValueError: If ``bit`` is neither 4 nor 8.
    """
    # Validate first so an unsupported width fails before any model is loaded.
    if bit not in (4, 8):
        raise ValueError(f"bit must be 4 or 8, got {bit!r}")

    # Keyword names cannot be templated (``load_in_{bit}bit=True`` is a syntax
    # error), so each width gets its own explicit config.
    if bit == 8:
        bnb_config = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_threshold=6.0,  # only for 8-bit
        )
    else:
        bnb_config = BitsAndBytesConfig(load_in_4bit=True)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
    )
    model.save_pretrained(out_dir)

In [None]:
from vllm import LLM, QuantizationConfig

def quantize_vllm(model_name, out_dir, bit=4):
    """Build a bitsandbytes quantization config, load the model through ``LLM`` and save it.

    Args:
        model_name: Model identifier forwarded to ``LLM.from_pretrained``.
        out_dir: Destination forwarded to ``save_pretrained``.
        bit: Value forwarded as ``bits`` to ``QuantizationConfig``.

    NOTE(review): could not confirm that the installed vLLM exposes a top-level
    ``QuantizationConfig``, ``LLM.from_pretrained`` or ``save_pretrained`` --
    verify against the vLLM version in use before relying on this cell.
    """
    quant_cfg = QuantizationConfig(backend="bitsandbytes", bits=bit)
    engine = LLM.from_pretrained(
        model_name,
        quantization_config=quant_cfg,
    )
    engine.save_pretrained(out_dir)

In [None]:
# assuming llama.cpp built and in PATH
# bit ∈ {4, 8}
!./quantize models/orig.gguf models/quantized_{bit}bit.gguf {bit}

In [None]:
import torch
from deepspeed import init_inference

def quantize_ds(model_name, out_dir, bit=8):
    """Run DeepSpeed ``init_inference`` with an integer dtype and save the result.

    Args:
        model_name: Value passed as the first argument to ``init_inference``.
        out_dir: Destination passed to ``save_pretrained``.
        bit: Quantization width; must be 4 or 8.

    Raises:
        ValueError: If ``bit`` is neither 4 nor 8. Previously every value other
            than 8 (including 16) was silently treated as 4-bit.

    NOTE(review): ``init_inference`` is given ``model_name`` as-is; confirm
    whether it expects an already-loaded model object rather than a name, and
    whether the returned engine provides ``save_pretrained``.
    NOTE(review): ``torch.int4`` may not exist in every PyTorch release, and
    DeepSpeed may not accept it as an inference dtype -- verify before using
    ``bit=4``.
    """
    # Reject unsupported widths before touching torch or DeepSpeed.
    if bit not in (4, 8):
        raise ValueError(f"bit must be 4 or 8, got {bit!r}")

    # Resolved lazily so that bit=8 never touches torch.int4.
    dtype = torch.int8 if bit == 8 else torch.int4
    ds_model = init_inference(
        model_name,
        mp_size=1,
        dtype=dtype,
        replace_method="auto",
    )
    ds_model.save_pretrained(out_dir)


In [None]:
# Sweep every backend over the supported bit widths, writing results under quantized/.
import os
import subprocess  # needed by the llama.cpp branch; no earlier cell imports it

# The llama.cpp tool writes a single file; create the parent directory up front
# in case the tool does not create it itself.
os.makedirs("quantized", exist_ok=True)

for backend in ["hf", "vllm", "llamacpp", "ds"]:
    # 16 was dropped from this list: it is not a quantization width any of the
    # helpers accept, and as a llama.cpp type argument it would not mean "16-bit".
    for bit in [8, 4]:
        out = f"quantized/{backend}_{bit}bit"
        if backend == "hf":
            quantize_hf("facebook/llama-7b", out, bit=bit)
        elif backend == "vllm":
            quantize_vllm("facebook/llama-7b", out, bit=bit)
        elif backend == "llamacpp":
            # The last argument is a quantization type name (Q8_0 / Q4_0), not a bare number.
            # check=True surfaces a failed run instead of silently continuing the sweep.
            subprocess.run(
                ["quantize", "models/llama-7b.gguf", f"{out}.gguf", f"Q{bit}_0"],
                check=True,
            )
        elif backend == "ds":
            quantize_ds("facebook/llama-7b", out, bit=bit)

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import os

def quantize_model(model_path: str, save_path: str, quant_bits: int = 4, compute_dtype: str = "bfloat16"):
    """Quantize a causal LM with bitsandbytes and write model + tokenizer to disk.

    Args:
        model_path: Source passed to ``from_pretrained`` for both the model and
            the tokenizer.
        save_path: Output directory; created if it does not exist.
        quant_bits: 4 or 8. The 4-bit path uses NF4 with double quantization.
        compute_dtype: Forwarded as ``bnb_4bit_compute_dtype``; only used on the
            4-bit path.

    Raises:
        AssertionError: If ``quant_bits`` is neither 4 nor 8.
    """
    assert quant_bits in [4, 8], "Only 4-bit and 8-bit quantization supported."

    if quant_bits == 8:
        quant_cfg = BitsAndBytesConfig(load_in_8bit=True)
    else:  # quant_bits == 4
        four_bit_options = {
            "load_in_4bit": True,
            "bnb_4bit_quant_type": "nf4",
            "bnb_4bit_use_double_quant": True,
            "bnb_4bit_compute_dtype": compute_dtype,
        }
        quant_cfg = BitsAndBytesConfig(**four_bit_options)

    # Load both artifacts before creating the output directory.
    quantized = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=quant_cfg,
        device_map="auto",
        trust_remote_code=True,
    )
    tok = AutoTokenizer.from_pretrained(model_path)

    os.makedirs(save_path, exist_ok=True)
    # Model first, then tokenizer, both into the same directory.
    for artifact in (quantized, tok):
        artifact.save_pretrained(save_path)

    print(f"✅ Model quantized to {quant_bits}-bit and saved at {save_path}")

In [4]:
# Quantize the local Llama 3.1 8B Instruct checkpoint to 8-bit.
# Arguments are (model_path, save_path); both are absolute paths on this machine.
quantize_model(
    "/home/ubuntu/fast_llm_inference/llama-3.1-8B-Instruct",
    "/home/ubuntu/fast_llm_inference/llama-3.1-8B-Instruct-quantizised/llama-3.1-8B-8bit",
    quant_bits=8,
)

2025-04-22 07:04:59.209113: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745305499.228433   85462 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745305499.234600   85462 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745305499.251302   85462 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745305499.251315   85462 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745305499.251317   85462 computation_placer.cc:177] computation placer alr

[2025-04-22 07:05:01,889] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Model quantized to 8-bit and saved at /home/ubuntu/fast_llm_inference/llama-3.1-8B-Instruct-quantizised/llama-3.1-8B-8bit


In [None]:
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer

def quantize_model_gptq(model_path, save_path, bits=4, group_size=128,
                        calibration_texts=None, calibration_max_length=2048):
    """Quantize a model with AutoGPTQ and save it together with its tokenizer.

    Args:
        model_path: Source passed to ``from_pretrained`` for the model and the
            tokenizer.
        save_path: Destination passed to ``save_quantized`` and to the
            tokenizer's ``save_pretrained``.
        bits: Forwarded to ``BaseQuantizeConfig(bits=...)``.
        group_size: Forwarded to ``BaseQuantizeConfig(group_size=...)``.
        calibration_texts: A string or an iterable of strings that are
            tokenized and passed to ``model.quantize``. When ``None``, a single
            placeholder sentence is used; supply representative text for a
            real run.
        calibration_max_length: Truncation length applied when tokenizing the
            calibration texts.

    Raises:
        ValueError: If ``calibration_texts`` is given but empty.
    """
    if calibration_texts is None:
        calibration_texts = [
            "auto-gptq is an easy-to-use model quantization library with "
            "user-friendly apis, based on GPTQ algorithm."
        ]
    elif isinstance(calibration_texts, str):
        # A bare string would otherwise be iterated character by character.
        calibration_texts = [calibration_texts]
    else:
        calibration_texts = list(calibration_texts)
    # Fail before loading anything: quantization needs at least one sample.
    if not calibration_texts:
        raise ValueError("calibration_texts must contain at least one sample")

    # `model_seqlen` is not passed: it is not a BaseQuantizeConfig field. The
    # sequence limit is applied when tokenizing the calibration texts instead.
    quant_config = BaseQuantizeConfig(
        bits=bits,
        group_size=group_size,
        desc_act=False,
        sym=True,
    )

    # The tokenizer is loaded first because the calibration examples depend on it.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    examples = [
        tokenizer(text, truncation=True, max_length=calibration_max_length)
        for text in calibration_texts
    ]

    model = AutoGPTQForCausalLM.from_pretrained(
        model_path,
        quantize_config=quant_config,
        trust_remote_code=True
    )

    # quantize() needs calibration examples; calling it with no arguments fails.
    model.quantize(examples)
    model.save_quantized(save_path)

    tokenizer.save_pretrained(save_path)

    print(f"✅ Model saved in GPTQ ({bits}-bit) format at {save_path}")

2025-04-22 07:41:21.321744: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745307681.341024   97828 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745307681.346185   97828 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745307681.362320   97828 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745307681.362340   97828 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745307681.362341   97828 computation_placer.cc:177] computation placer alr

[2025-04-22 07:41:24,305] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status
  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)
CUDA extension not installed.
CUDA extension not installed.
