In [None]:
# Install libraries for 8-bit/4-bit
!pip install -q -U bitsandbytes transformers accelerate optimum auto-gptq

# Clone and build llama.cpp for GGUF conversion tools
!git clone https://github.com/ggerganov/llama.cpp
!cd llama.cpp && pip install -r requirements.txt

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import os

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" # Example base model

# The tokenizer is saved next to each checkpoint so that every output directory
# can be loaded on its own with from_pretrained().
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 1. Save 8-bit Model
# Quantization options go through BitsAndBytesConfig; passing load_in_8bit
# directly to from_pretrained() is deprecated.
bnb_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
model_8bit = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config_8bit,
    device_map="auto"
)
model_8bit.save_pretrained("./quantized/model-int8")
tokenizer.save_pretrained("./quantized/model-int8")

# 2. Save 4-bit Model (NF4)
bnb_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
model_4bit = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config_4bit,
    device_map="auto"
)
model_4bit.save_pretrained("./quantized/model-int4")
tokenizer.save_pretrained("./quantized/model-int4")

print("INT8 and INT4 weights saved successfully.")

In [None]:

!python llama.cpp/convert_hf_to_gguf.py ./quantized/model-int4 \
    --outfile ./quantized/model-f16.gguf \
    --outtype f16

# 2. Quantize the GGUF file to 4-bit (q4_0)
!cd llama.cpp && ./llama-quantize ../quantized/model-f16.gguf ../quantized/model.gguf q4_0

print("GGUF Conversion Complete: /quantized/model.gguf created.")

In [None]:
def get_size(path):
    """Return the on-disk size of ``path`` formatted as ``"<value> MB"``.

    Args:
        path: Path to a file or a directory. For a directory, the sizes of all
            files beneath it (recursively) are summed, because
            ``os.path.getsize`` on a directory reports only the directory
            entry itself, not its contents.

    Returns:
        The size in units of 1024 * 1024 bytes, formatted with two decimals
        and an ``" MB"`` suffix.

    Raises:
        OSError: If ``path`` does not exist or cannot be read.
    """
    if os.path.isdir(path):
        total_bytes = sum(
            os.path.getsize(os.path.join(root, name))
            for root, _dirs, files in os.walk(path)
            for name in files
        )
    else:
        total_bytes = os.path.getsize(path)
    size = total_bytes / (1024 * 1024) # MB
    return f"{size:.2f} MB"

# Rough FP16 baseline for the TinyLlama-1.1B base model:
# ~1.1B parameters x 2 bytes per parameter is about 2100 MB.
print("FP16 (approx): ~2100 MB")
# save_pretrained() on a full model writes its weights to model.safetensors;
# adapter_model.bin is only produced when saving PEFT adapters.
# NOTE(review): assumes the default safetensors serialization and a single,
# unsharded weights file. Confirm against the contents of the output directories.
print(f"INT8 Size: {get_size('./quantized/model-int8/model.safetensors')}")
print(f"INT4 Size: {get_size('./quantized/model-int4/model.safetensors')}")
print(f"GGUF Size: {get_size('./quantized/model.gguf')}")