In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch, os

def quantize_model(model_path: str,
                   save_path:  str,
                   quant_bits: int    = 4,
                   compute_dtype: str = "float16"):
    """Load a causal LM with bitsandbytes quantization and save it with its tokenizer.

    Args:
        model_path: Checkpoint location handed to ``from_pretrained`` for both
            the model and the tokenizer.
        save_path: Directory (created if missing) that receives the quantized
            model and the tokenizer files.
        quant_bits: Quantization width, 4 or 8. The 4-bit configuration uses
            the "nf4" quant type with double quantization.
        compute_dtype: Name of a ``torch`` dtype attribute, e.g. ``"float16"``.
            It is only passed to the 4-bit configuration; the 8-bit
            configuration does not receive it.

    Raises:
        ValueError: If ``quant_bits`` is not 4 or 8, or if ``compute_dtype``
            names a ``torch`` attribute that is not a dtype.
        AttributeError: If ``torch`` has no attribute named ``compute_dtype``.
    """
    # Explicit check rather than `assert`: assertions are removed under
    # `python -O`, which would let e.g. quant_bits=16 fall into the 8-bit branch.
    if quant_bits not in (4, 8):
        raise ValueError(f"quant_bits must be 4 or 8, got {quant_bits!r}")

    # Map the string to the actual torch.dtype, rejecting names such as "nn"
    # that exist on the torch module but are not dtypes.
    dtype = getattr(torch, compute_dtype)
    if not isinstance(dtype, torch.dtype):
        raise ValueError(
            f"compute_dtype must name a torch dtype, got {compute_dtype!r}"
        )

    if quant_bits == 4:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=dtype,
        )
    else:  # 8-bit
        bnb_config = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_enable_fp32_cpu_offload=False,
        )

    # Load the tokenizer first: it is the cheaper of the two loads, so a bad
    # path fails before the model is loaded. It gets the same
    # trust_remote_code setting as the model so both loads treat
    # repository-provided code alike.
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        trust_remote_code=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )

    os.makedirs(save_path, exist_ok=True)
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    # Only report the compute dtype when it was actually applied (4-bit).
    dtype_note = f" ({dtype})" if quant_bits == 4 else ""
    print(f"✅ Quantized to {quant_bits}-bit{dtype_note} → saved at {save_path}")


In [5]:
# Quantize the local Qwen2.5-0.5B-Instruct checkpoint and save it next to the
# original. The bit width is named once so the output directory suffix and the
# `quant_bits` argument cannot drift apart.
MODELS_DIR = "/home/ubuntu/fast_llm_inference/models"
MODEL_NAME = "Qwen2.5-0.5B-Instruct"
QUANT_BITS = 4

quantize_model(
    model_path=f"{MODELS_DIR}/{MODEL_NAME}",
    save_path=f"{MODELS_DIR}/{MODEL_NAME}-{QUANT_BITS}bit",
    quant_bits=QUANT_BITS,
)

✅ Quantized to 4-bit (torch.float16) → saved at /home/ubuntu/fast_llm_inference/models/Qwen2.5-0.5B-Instruct-4bit
