# Quantization of a Hugging Face model using llama.cpp

<a target="_blank" href="https://colab.research.google.com/github/simonguest/CS-394/blob/main/src/05/notebooks/quantization.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
<a target="_blank" href="https://github.com/simonguest/CS-394/raw/refs/heads/main/src/05/notebooks/quantization.ipynb">
  <img src="https://img.shields.io/badge/Download_.ipynb-blue" alt="Download .ipynb"/>
</a>

## Install dependencies

In [1]:
# Install the Hugging Face Hub client; it provides snapshot_download, which a later cell uses to fetch the model
!uv pip install huggingface_hub

[2mUsing Python 3.13.1 environment at: /Users/simon/Dev/CS-394/.venv[0m
[2mAudited [1m1 package[0m [2min 27ms[0m[0m


## Clone and build llama.cpp

In [1]:
TEMP_FOLDER = "/content/tmp"

# Create a temporary location and clone llama.cpp
!mkdir -p {TEMP_FOLDER}
!cd {TEMP_FOLDER} && git clone https://github.com/ggml-org/llama.cpp

!cd {TEMP_FOLDER}/llama.cpp && cmake -B build
!cd {TEMP_FOLDER}/llama.cpp && cmake --build build --config Release --target llama-quantize


Cloning into 'llama.cpp'...
remote: Enumerating objects: 77965, done.[K
remote: Counting objects: 100% (228/228), done.[K
remote: Compressing objects: 100% (156/156), done.[K
remote: Total 77965 (delta 141), reused 76 (delta 72), pack-reused 77737 (from 4)[K
Receiving objects: 100% (77965/77965), 286.82 MiB | 15.73 MiB/s, done.
Resolving deltas: 100% (56335/56335), done.
-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
[0mCMAKE_BUILD_TYPE=Release[0m
-- Found Git: /usr/bin/git (found version "2.34.1")
-- The ASM compiler identificati

## Download the model, convert it to GGUF, and quantize

In [None]:
from huggingface_hub import snapshot_download

# Local folder the Hugging Face model files are downloaded into
MODEL_FOLDER = "/content/models"
MODEL_VENDOR = "Qwen"
MODEL_NAME = "Qwen3-0.6B"
# Hugging Face repo id in "<vendor>/<name>" form
MODEL_REPO = f"{MODEL_VENDOR}/{MODEL_NAME}"
# Local folder the converted and quantized GGUF files are written to
GGUF_FOLDER = "/content/ggufs"

# Create the folders for the downloaded model and the GGUF outputs

!mkdir -p {MODEL_FOLDER}/{MODEL_NAME}
!mkdir -p {GGUF_FOLDER}

# Download the model files from Hugging Face into {MODEL_FOLDER}/{MODEL_NAME}
snapshot_download(repo_id=MODEL_REPO, local_dir=f"{MODEL_FOLDER}/{MODEL_NAME}", repo_type="model")

# Convert the downloaded model to an FP16 GGUF file first.
# Relies on TEMP_FOLDER (location of the llama.cpp checkout) defined in the clone/build cell.
!cd {TEMP_FOLDER}/llama.cpp && python convert_hf_to_gguf.py {MODEL_FOLDER}/{MODEL_NAME} \
    --outfile {GGUF_FOLDER}/{MODEL_NAME}-F16.gguf \
    --outtype f16

# Quantize the FP16 GGUF to Q4_K_M using the llama-quantize binary built in the clone/build cell
!{TEMP_FOLDER}/llama.cpp/build/bin/llama-quantize {GGUF_FOLDER}/{MODEL_NAME}-F16.gguf \
    {GGUF_FOLDER}/{MODEL_NAME}-Q4_K_M.gguf \
    Q4_K_M

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:hf-to-gguf:Loading model: Qwen3-0.6B
INFO:hf-to-gguf:Model architecture: Qwen3ForCausalLM
INFO:hf-to-gguf:gguf: indexing model part 'model.safetensors'
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:output.weight,             torch.bfloat16 --> F16, shape = {1024, 151936}
INFO:hf-to-gguf:token_embd.weight,         torch.bfloat16 --> F16, shape = {1024, 151936}
INFO:hf-to-gguf:blk.0.attn_norm.weight,    torch.bfloat16 --> F32, shape = {1024}
INFO:hf-to-gguf:blk.0.ffn_down.weight,     torch.bfloat16 --> F16, shape = {3072, 1024}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,     torch.bfloat16 --> F16, shape = {1024, 3072}
INFO:hf-to-gguf:blk.0.ffn_up.weight,       torch.bfloat16 --> F16, shape = {1024, 3072}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,     torch.bfloat16 --> F32, shape = {1024}
INFO:hf-to-gguf:blk.0.attn_k_norm.weight,  torch.bfloat16 --> F32, shape = {128}
INFO:hf-to-gguf:blk.0.attn_k.weight,       torch.b