In [1]:
# Alternative GGUF Conversion Using HF-Specific Tools

import os
from huggingface_hub import login
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
from dotenv import load_dotenv


In [15]:
# Load variables from a local .env file; override=True lets .env values
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# Fail fast with a readable message when a required secret is absent.
# An explicit check is used instead of bare `assert` so that it still runs
# under `python -O` and reports which variable(s) are missing.
# NOTE(review): WANDB_API_KEY is not read by any other visible cell --
# confirm it is still required for this notebook.
REQUIRED_ENV_VARS = ('hf_token', 'WANDB_API_KEY')
missing_env_vars = [name for name in REQUIRED_ENV_VARS if not os.getenv(name)]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_env_vars)
        + ". Define them in your .env file or in the shell environment."
    )

In [17]:

# Hub repositories and local paths used by the cells below.
# Replace the repository names with your own.
BASE_MODEL_PATH = "Qwen/Qwen2.5-0.5B-Instruct"   # base checkpoint the adapter is applied to
LORA_MODEL_PATH = "rtweera/qwen_choreo_ft_lora"   # LoRA adapter repository -- update this path!
OUTPUT_MODEL_PATH = "./qwen_choreo_merged"        # local directory the merged model is saved to
HF_OUTPUT_REPO = "rtweera/qwen_choreo_merged"     # Hub repository the merged model is pushed to -- update this!

# Authenticate with the Hugging Face Hub using the token from the environment.
login(token=os.environ.get('hf_token'))


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [4]:

print("Loading and merging models...")

# Tokenizer is taken from the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)

# Base model in float16 (for compatibility with the conversion tools);
# device placement is delegated to device_map="auto".
base_model_kwargs = dict(torch_dtype=torch.float16, device_map="auto")
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_PATH, **base_model_kwargs)

# Attach the LoRA adapter to the base model, then merge it and drop the
# adapter wrapper so a plain model object is left for saving.
peft_model = PeftModel.from_pretrained(base_model, LORA_MODEL_PATH)
merged_model = peft_model.merge_and_unload()


Loading and merging models...


config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


adapter_model.safetensors:   0%|          | 0.00/4.35M [00:00<?, ?B/s]

In [18]:

print("Saving merged model...")
# Persist the merged weights (safetensors serialization) and the tokenizer
# files side by side in the local output directory.
merged_model.save_pretrained(OUTPUT_MODEL_PATH, safe_serialization=True)
tokenizer.save_pretrained(OUTPUT_MODEL_PATH)

print("Pushing merged model to Hugging Face Hub...")
# Upload the model first and the tokenizer second, both to the same Hub repo.
for hub_artifact in (merged_model, tokenizer):
    hub_artifact.push_to_hub(HF_OUTPUT_REPO)

print("Merged model successfully pushed to Hugging Face Hub!")


Saving merged model...
Pushing merged model to Hugging Face Hub...


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Merged model successfully pushed to Hugging Face Hub!


In [22]:
# Make sure you have git-lfs installed (https://git-lfs.com)
!git lfs install
# Clone your model from Huggingface
!git clone https://huggingface.co/rtweera/qwen_choreo_merged
# Clone llama.cpp's repository. They provide code to convert models into gguf.
!git clone https://github.com/ggerganov/llama.cpp.git


Updated Git hooks.
Git LFS initialized.


fatal: destination path 'qwen_choreo_merged' already exists and is not an empty directory.
fatal: destination path 'llama.cpp' already exists and is not an empty directory.


In [24]:
# Display the kernel's current working directory; the relative paths used in
# the surrounding cells (./llama.cpp, ./qwen_choreo_merged) resolve against it.
os.getcwd()

'c:\\Users\\Ravindu\\Documents\\My Projects\\slm-fine-tune\\choreo-doc-assistant\\notebooks'

In [25]:
!pip install -r ./llama.cpp/requirements.txt



Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu
Collecting numpy~=1.26.4 (from -r c:\Users\Ravindu\Documents\My Projects\slm-fine-tune\choreo-doc-assistant\notebooks\llama.cpp\requirements\requirements-convert_legacy_llama.txt (line 1))
  Using cached numpy-1.26.4-cp312-cp312-win_amd64.whl.metadata (61 kB)
Collecting sentencepiece~=0.2.0 (from -r c:\Users\Ravindu\Documents\My Projects\slm-fine-tune\choreo-doc-assistant\notebooks\llama.cpp\requirements\requirements-convert_legacy_llama.txt (line 2))
  Using cached sentencepiece-0.2.0-cp312-cp312-win_amd64.whl.metadata (8.3 kB)
Collecting gguf>=0.1.0 (from -r c:\Users\Ravindu\Documents\My Projects\slm-fine-tune\choreo-doc-assistant\notebooks\llama.cpp\requirements\requirements-convert_legacy_llama.txt (line 4))
  Downloading gguf-0.16.3-py3-none-any.whl.metadata (4.4 kB)
Collecting protobuf<5

  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 3.5.1 requires fsspec[http]<=2025.3.0,>=2023.1.0, but you have fsspec 2025.3.2 which is incompatible.
datasets 3.5.1 requires huggingface-hub>=0.24.0, but you have huggingface-hub 0.23.5 which is incompatible.
grpcio-status 1.71.0 requires protobuf<6.0dev,>=5.26.1, but you have protobuf 4.25.7 which is incompatible.
peft 0.15.2 requires huggingface_hub>=0.25.0, but you have huggingface-hub 0.23.5 which is incompatible.


In [29]:
# Convert the Hugging Face model directory ./qwen_choreo_merged to a single
# GGUF file using llama.cpp's converter script.
#   --outfile  path of the GGUF file to write (finetuned-lora.gguf, relative to the working directory)
#   --outtype  output tensor type; q8_0 is requested here
!python ./llama.cpp/convert_hf_to_gguf.py ./qwen_choreo_merged --outfile finetuned-lora.gguf --outtype q8_0


INFO:hf-to-gguf:Loading model: qwen_choreo_merged
INFO:hf-to-gguf:Model architecture: Qwen2ForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model part 'model.safetensors'
INFO:hf-to-gguf:token_embd.weight,         torch.float16 --> Q8_0, shape = {896, 151936}
INFO:hf-to-gguf:blk.0.attn_norm.weight,    torch.float16 --> F32, shape = {896}
INFO:hf-to-gguf:blk.0.ffn_down.weight,     torch.float16 --> Q8_0, shape = {4864, 896}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,     torch.float16 --> Q8_0, shape = {896, 4864}
INFO:hf-to-gguf:blk.0.ffn_up.weight,       torch.float16 --> Q8_0, shape = {896, 4864}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,     torch.float16 --> F32, shape = {896}
INFO:hf-to-gguf:blk.0.attn_k.bias,         torch.float16 --> F32, shape = {128}
INFO:hf-to-gguf:blk.0.attn_k.weight,       torch.float16 --> Q8_0, shape = {896, 128}
INFO:hf-to-gguf:blk.0.attn_output.weight,  torch.float16

In [None]:

# Build the Ollama Modelfile around the GGUF written by the
# convert_hf_to_gguf.py cell (finetuned-lora.gguf, q8_0).
# A 4-bit variant such as q4_k_m is not an --outtype of that script; it would
# need llama.cpp's separate `llama-quantize` tool. If you produce one, point
# GGUF_PATH at it.
GGUF_PATH = "./finetuned-lora.gguf"

# Verify the file really exists before claiming success, instead of printing
# a completion message unconditionally.
if not os.path.isfile(GGUF_PATH):
    raise FileNotFoundError(
        f"{GGUF_PATH} not found. Run the convert_hf_to_gguf.py cell first."
    )
print(f"\nGGUF file found: {GGUF_PATH}")

print("\nCreating Ollama modelfile...")
# Placeholder system prompt -- adjust it to the behavior you want from the model.
SYSTEM_PROMPT = "You are a helpful assistant that answers questions about Choreo."

# Create an Ollama modelfile for easy import.
# The Python literal is delimited with ''' so that the Modelfile's own
# triple-double-quoted SYSTEM block can appear inside it without terminating
# the Python string early (which previously left a dangling `SYSTEM `).
modelfile_content = f'''FROM {GGUF_PATH}
PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER stop "<|im_end|>"
PARAMETER stop "<|im_start|>"
SYSTEM """{SYSTEM_PROMPT}"""
'''


In [None]:

# Write modelfile to disk
with open("./qwen_choreo.modelfile", "w") as f:
    f.write(modelfile_content)

print("Modelfile created at: ./qwen_choreo.modelfile")
print("\nTo import the model into Ollama, run:")
print("ollama create qwen-choreo -f ./qwen_choreo.modelfile")