In [8]:
# Alternative GGUF Conversion Using HF-Specific Tools

import os
from huggingface_hub import login
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from dotenv import load_dotenv


In [9]:
# Pull variables from the .env file into the process environment
# (override=True is passed so .env values win over pre-existing ones).
load_dotenv(override=True)

# Fail fast and say WHICH variable is missing or empty, instead of raising a
# message-less AssertionError.
for required_var in ('hf_token', 'WANDB_API_KEY'):
    assert os.getenv(required_var), (
        f"Environment variable '{required_var}' is missing or empty; "
        "set it in the environment or in the .env file."
    )

In [10]:
# Hub repository of the fully fine-tuned checkpoint, and the local directory
# this notebook exports it to.
FFT_MODEL_PATH = "rtweera/qwen-choreo-full-ft-2025-May-19_14-29-25-final-k3"
OUTPUT_MODEL_PATH = "./qwen_choreo_fft"

# Authenticate against the Hugging Face Hub with the token taken from the
# environment (populated by the dotenv cell).
login(token=os.environ.get('hf_token'))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\Ravindu\.cache\huggingface\token
Login successful


In [11]:
# Test the pushed model: load the fine-tuned checkpoint and its tokenizer
# from the Hub.
# NOTE: the variable is called `merged_model` because the save cell refers to
# it by that name.
print("Testing already pushed model from Hub...")
try:
    merged_model = AutoModelForCausalLM.from_pretrained(
        FFT_MODEL_PATH,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(FFT_MODEL_PATH)
except Exception as e:
    print(f"Could not load fft model: {e}")
    # Re-raise: the following cells need `merged_model` and `tokenizer`, so
    # swallowing the failure here would only resurface later as a confusing
    # NameError that hides the real cause.
    raise
else:
    # Reached only when both the model and the tokenizer loaded.
    print("Successfully loaded fft model.")


Testing already pushed model from Hub...


config.json:   0%|          | 0.00/683 [00:01<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

Successfully loaded fft model.


In [12]:
# Export the loaded model (safetensors format) and the tokenizer files into
# OUTPUT_MODEL_PATH, the directory the GGUF conversion step reads from.
print("Saving merged model...")
merged_model.save_pretrained(save_directory=OUTPUT_MODEL_PATH, safe_serialization=True)
# Kept as the cell's last expression so the written tokenizer file paths are displayed.
tokenizer.save_pretrained(save_directory=OUTPUT_MODEL_PATH)

Saving merged model...


('./qwen_choreo_fft\\tokenizer_config.json',
 './qwen_choreo_fft\\special_tokens_map.json',
 './qwen_choreo_fft\\vocab.json',
 './qwen_choreo_fft\\merges.txt',
 './qwen_choreo_fft\\added_tokens.json',
 './qwen_choreo_fft\\tokenizer.json')

In [13]:
!python ./llama.cpp/convert_hf_to_gguf.py ./qwen_choreo_fft --outfile finetuned-fft.gguf --outtype q8_0


INFO:hf-to-gguf:Loading model: qwen_choreo_fft
INFO:hf-to-gguf:Model architecture: Qwen2ForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model part 'model.safetensors'
INFO:hf-to-gguf:token_embd.weight,         torch.bfloat16 --> Q8_0, shape = {896, 151936}
INFO:hf-to-gguf:blk.0.attn_norm.weight,    torch.bfloat16 --> F32, shape = {896}
INFO:hf-to-gguf:blk.0.ffn_down.weight,     torch.bfloat16 --> Q8_0, shape = {4864, 896}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,     torch.bfloat16 --> Q8_0, shape = {896, 4864}
INFO:hf-to-gguf:blk.0.ffn_up.weight,       torch.bfloat16 --> Q8_0, shape = {896, 4864}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,     torch.bfloat16 --> F32, shape = {896}
INFO:hf-to-gguf:blk.0.attn_k.bias,         torch.bfloat16 --> F32, shape = {128}
INFO:hf-to-gguf:blk.0.attn_k.weight,       torch.bfloat16 --> Q8_0, shape = {896, 128}
INFO:hf-to-gguf:blk.0.attn_output.weight,  torch.bf

In [None]:
# Minimal Ollama Modelfile that points at the GGUF written by the conversion
# cell (--outfile finetuned-fft.gguf).
MODELFILE_PATH = "./qwen_choreo.modelfile"
GGUF_PATH = "./finetuned-fft.gguf"

# Write the Modelfile only when it is absent, so a hand-edited one is never
# overwritten; the message then reflects what actually happened.
if os.path.exists(MODELFILE_PATH):
    print(f"Modelfile already exists at: {MODELFILE_PATH}")
else:
    with open(MODELFILE_PATH, "w", encoding="utf-8") as modelfile:
        modelfile.write(f"FROM {GGUF_PATH}\n")
    print(f"Modelfile created at: {MODELFILE_PATH}")

print("\nTo import the model into Ollama, run:")
print(f"ollama create qwen-choreo -f {MODELFILE_PATH}")