In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import os

def quantize_model(
    model_path: str,
    save_path: str,
    quant_bits: int = 4,
    compute_dtype: str = "bfloat16",
    trust_remote_code: bool = True,
) -> None:
    """Load a causal LM with bitsandbytes quantization and save it with its tokenizer.

    Args:
        model_path: Location passed to ``from_pretrained`` for both the model
            and the tokenizer.
        save_path: Directory the quantized model and the tokenizer are written
            to. It is created if it does not exist.
        quant_bits: Target bit-width, either 4 or 8. With 4 the model is loaded
            with the ``nf4`` quant type and double quantization enabled; with 8
            it is loaded with ``load_in_8bit=True``.
        compute_dtype: Value passed as ``bnb_4bit_compute_dtype``. Only used
            when ``quant_bits == 4``; it has no effect on the 8-bit path.
        trust_remote_code: Forwarded to both the model and the tokenizer
            loader so the two stay consistent. Defaults to ``True`` to keep the
            previous behaviour of the model load.
            NOTE(review): per the transformers docs this allows custom code
            shipped with the checkpoint to run -- pass ``False`` for
            checkpoints from untrusted sources.

    Raises:
        ValueError: If ``quant_bits`` is neither 4 nor 8.
    """
    # Explicit check instead of `assert`: assertions are removed under
    # `python -O`, which would let an unsupported value silently take the
    # 8-bit branch below while the final message reports the wrong bit-width.
    if quant_bits not in (4, 8):
        raise ValueError(
            f"Only 4-bit and 8-bit quantization supported. Got quant_bits={quant_bits!r}."
        )

    if quant_bits == 4:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=compute_dtype,
        )
    else:  # quant_bits == 8
        bnb_config = BitsAndBytesConfig(load_in_8bit=True)

    # Quantization is applied while the weights are being loaded.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=trust_remote_code,
    )

    # Same trust setting as the model so both loaders treat the checkpoint alike.
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        trust_remote_code=trust_remote_code,
    )

    # Model and tokenizer go into the same directory so it can be loaded as one checkpoint.
    os.makedirs(save_path, exist_ok=True)
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    print(f"✅ Model quantized to {quant_bits}-bit and saved at {save_path}")

In [2]:
# Quantize the local Qwen2.5-7B-Instruct checkpoint and store the result next to it.
MODELS_DIR = "/home/ubuntu/fast_llm_inference/models"
MODEL_NAME = "Qwen2.5-7B-Instruct"
QUANT_BITS = 4

quantize_model(
    model_path=f"{MODELS_DIR}/{MODEL_NAME}",
    # The output directory name embeds the bit-width; deriving it from
    # QUANT_BITS keeps the name and the actual quantization in sync.
    save_path=f"{MODELS_DIR}/{MODEL_NAME}-{QUANT_BITS}bit",
    quant_bits=QUANT_BITS,
)

2025-05-06 12:46:26.836810: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746535586.854317   86919 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746535586.860350   86919 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746535586.875228   86919 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746535586.875247   86919 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746535586.875250   86919 computation_placer.cc:177] computation placer alr

[2025-05-06 12:46:29,237] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Model quantized to 4-bit and saved at /home/ubuntu/fast_llm_inference/models/Qwen2.5-7B-Instruct-4bit
