In [3]:
# 1. Check GPU
import torch
if not torch.cuda.is_available():
    raise SystemError("‚ùå STOP! You are on CPU. Go to Runtime > Change runtime type > T4 GPU")
print(f"‚úÖ GPU Detected: {torch.cuda.get_device_name(0)}")

# 2. Install Unsloth (Clean Install)
# We allow it to install the latest xformers automatically
!pip uninstall unsloth xformers -y
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps trl peft accelerate bitsandbytes

print("‚úÖ Installation Complete.")

‚úÖ GPU Detected: Tesla T4
Found existing installation: unsloth 2025.12.5
Uninstalling unsloth-2025.12.5:
  Successfully uninstalled unsloth-2025.12.5
[0mCollecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-ljim_ij3/unsloth_be1096e591a14587a63f2df45a338717
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-ljim_ij3/unsloth_be1096e591a14587a63f2df45a338717
  Resolved https://github.com/unslothai/unsloth.git to commit 8490f6efc407f409c42081988e93973df8e11f2d
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
  Created wheel for unsloth: filena

In [5]:
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
import torch

# 1. Load the Model (Qwen 2.5 Coder 1.5B)
max_seq_length = 2048
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-Coder-1.5B-Instruct",
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
)

# 2. Add LoRA Adapters (The "Fine-Tuning" Layer)
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
)

# 3. Load Your Data
dataset = load_dataset("json", data_files="training_data.jsonl", split="train")

# Format data into the Chat style Qwen expects
def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts }

dataset = dataset.map(formatting_prompts_func, batched = True)

# 4. Start Training
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 30, # 30 steps is perfect for a small dataset
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        output_dir = "outputs",
    ),
)

print("--- üöÄ STARTING TRAINING ---")
trainer.train()
print("‚úÖ TRAINING FINISHED")


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.12.5: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Unsloth 2025.12.5 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/20 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


--- üöÄ STARTING TRAINING ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 20 | Num Epochs = 10 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Step,Training Loss
1,2.0914
2,2.355
3,2.4743
4,2.2431
5,1.8175
6,1.8595
7,1.5237
8,1.5418
9,1.2094
10,1.049




0,1
train/epoch,‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà
train/global_step,‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà
train/grad_norm,‚ñÅ‚ñÇ‚ñÉ‚ñÇ‚ñÉ‚ñÖ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÜ‚ñÜ‚ñÖ‚ñà‚ñà‚ñÜ‚ñá‚ñá‚ñá‚ñÜ‚ñÖ‚ñÉ‚ñÖ‚ñÉ‚ñÇ‚ñÉ‚ñÇ‚ñÅ‚ñÇ
train/learning_rate,‚ñÅ‚ñÇ‚ñÑ‚ñÖ‚ñá‚ñà‚ñà‚ñá‚ñá‚ñá‚ñá‚ñÜ‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ
train/loss,‚ñá‚ñà‚ñà‚ñá‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ

0,1
total_flos,258925700345856.0
train/epoch,10.0
train/global_step,30.0
train/grad_norm,0.89496
train/learning_rate,1e-05
train/loss,0.4833
train_loss,1.02212
train_runtime,156.677
train_samples_per_second,1.532
train_steps_per_second,0.191


‚úÖ TRAINING FINISHED


In [None]:
import gc
import torch

# 1. Clean Memory to prevent crashes
print("--- üßπ Cleaning Memory... ---")
torch.cuda.empty_cache()
gc.collect()

# 2. Convert and Save
print("--- üíæ SAVING TO GGUF (Please wait...) ---")
# This converts the adapters + base model into a single file
model.save_pretrained_gguf("code_review_model", tokenizer, quantization_method = "q4_k_m")

print("‚úÖ DONE! Refresh the Files panel on the left.")
print("‚¨áÔ∏è Download 'code_review_model-unsloth.Q4_K_M.gguf' to your D: drive.")

--- üßπ Cleaning Memory... ---
--- üíæ SAVING TO GGUF (Please wait...) ---
Unsloth: Merging model weights to 16-bit format...


config.json:   0%|          | 0.00/765 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [01:12<00:00, 72.09s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [01:16<00:00, 76.63s/it]


Unsloth: Merge process complete. Saved to `/content/code_review_model`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF f16 might take 3 minutes.
\        /    [2] Converting GGUF f16 to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: Updating system package directories
Unsloth: All required system packages already installed!
Unsloth: Install llama.cpp and building - please wait 1 to 3 minutes
Unsloth: Cloning llama.cpp repository
Unsloth: Install GGUF and other packages


In [1]:
import torch
import os
from google.colab import drive

# 1. Mount Google Drive (The Safe Zone)
print("--- üìÇ MOUNTING GOOGLE DRIVE ---")
drive.mount('/content/drive')
save_path = "/content/drive/My Drive/veritas_adapters"

# 2. Install
print("--- ‚öôÔ∏è INSTALLING ---")
from IPython.utils import io
with io.capture_output() as captured:
    !pip uninstall unsloth -y
    !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
    !pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

# 3. Train
print("--- üß† TRAINING START ---")
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments

max_seq_length = 2048
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-Coder-1.5B-Instruct",
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
)

dataset = load_dataset("json", data_files="training_data.jsonl", split="train")
def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts }
dataset = dataset.map(formatting_prompts_func, batched = True)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 30,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        output_dir = "outputs",
    ),
)
trainer.train()

# 4. Save ADAPTERS ONLY to Drive (This is fast and won't crash)
print(f"--- üíæ SAVING TO GOOGLE DRIVE: {save_path} ---")
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print("‚úÖ PHASE 1 COMPLETE! Your model is safe on Google Drive.")

--- üìÇ MOUNTING GOOGLE DRIVE ---
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
--- ‚öôÔ∏è INSTALLING ---
--- üß† TRAINING START ---
ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.12.5: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Unsloth 2025.12.5 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/20 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 20 | Num Epochs = 10 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Step,Training Loss
1,2.0914
2,2.355
3,2.4743
4,2.2431
5,1.8175
6,1.8595
7,1.5237
8,1.5418
9,1.2094
10,1.049




0,1
train/epoch,‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà
train/global_step,‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà
train/grad_norm,‚ñÅ‚ñÇ‚ñÉ‚ñÇ‚ñÉ‚ñÖ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÜ‚ñÜ‚ñÖ‚ñà‚ñà‚ñÜ‚ñá‚ñá‚ñá‚ñÜ‚ñÖ‚ñÉ‚ñÖ‚ñÉ‚ñÇ‚ñÉ‚ñÇ‚ñÅ‚ñÇ
train/learning_rate,‚ñÅ‚ñÇ‚ñÑ‚ñÖ‚ñá‚ñà‚ñà‚ñá‚ñá‚ñá‚ñá‚ñÜ‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ
train/loss,‚ñá‚ñà‚ñà‚ñá‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ

0,1
total_flos,258925700345856.0
train/epoch,10.0
train/global_step,30.0
train/grad_norm,0.89496
train/learning_rate,1e-05
train/loss,0.4833
train_loss,1.02212
train_runtime,86.2521
train_samples_per_second,2.783
train_steps_per_second,0.348


--- üíæ SAVING TO GOOGLE DRIVE: /content/drive/My Drive/veritas_adapters ---
‚úÖ PHASE 1 COMPLETE! Your model is safe on Google Drive.


In [None]:
import torch
from google.colab import drive
import gc

# 1. Mount Drive (Where your Phase 1 model lives)
print("--- üìÇ MOUNTING GOOGLE DRIVE ---")
drive.mount('/content/drive')
adapter_path = "/content/drive/My Drive/veritas_adapters"

# 2. Re-Install Unsloth (CRITICAL: Must happen BEFORE importing)
print("--- ‚öôÔ∏è INSTALLING TOOLS ---")
from IPython.utils import io
with io.capture_output() as captured:
    !pip uninstall unsloth -y
    !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
    !pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

# 3. NOW we can import Unsloth (After installation)
from unsloth import FastLanguageModel

# 4. Load the Model & Adapters from Drive
print("--- üîÑ LOADING FROM DRIVE ---")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = adapter_path,
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
)

# 5. Save to GGUF
print("--- üíæ EXPORTING GGUF (This may take 5-10 mins) ---")
model.save_pretrained_gguf("veritas_final", tokenizer, quantization_method = "q4_k_m")

print("‚úÖ DONE! Refresh the files panel on the left.")
print("‚¨áÔ∏è Download 'veritas_final-unsloth.Q4_K_M.gguf' to your D: drive.")

--- üìÇ MOUNTING GOOGLE DRIVE ---
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
--- ‚öôÔ∏è INSTALLING TOOLS ---
ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
--- üîÑ LOADING FROM DRIVE ---
==((====))==  Unsloth 2025.12.5: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

Unsloth 2025.12.5 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


--- üíæ EXPORTING GGUF (This may take 5-10 mins) ---
Unsloth: Merging model weights to 16-bit format...


config.json:   0%|          | 0.00/765 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:25<00:00, 25.44s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:20<00:00, 20.02s/it]


Unsloth: Merge process complete. Saved to `/content/veritas_final`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF f16 might take 3 minutes.
\        /    [2] Converting GGUF f16 to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: Updating system package directories
Unsloth: All required system packages already installed!
Unsloth: Install llama.cpp and building - please wait 1 to 3 minutes
Unsloth: Cloning llama.cpp repository
Unsloth: Install GGUF and other packages


In [1]:
import os
import torch
from google.colab import drive
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# 1. Mount Drive (Where your adapters are)
print("--- üìÇ MOUNTING DRIVE ---")
drive.mount('/content/drive')
adapter_path = "/content/drive/My Drive/veritas_adapters"
output_merged = "/content/veritas_merged_hf"

# 2. Install Lightweight Tools (No Unsloth, just standard HuggingFace)
print("--- ‚öôÔ∏è INSTALLING TOOLS (Takes ~2 mins) ---")
!pip install peft transformers sentencepiece protobuf

# 3. Load Base Model & Adapters (CPU Mode)
print("--- üß† LOADING MODEL (This will fit in CPU RAM) ---")
base_model_id = "Qwen/Qwen2.5-Coder-1.5B-Instruct"

# Load Base Model in Float16 (Small enough for RAM: ~3GB)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    device_map="cpu",
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)

# Load Adapters
print(f"--- üîó ATTACHING ADAPTERS FROM: {adapter_path} ---")
model = PeftModel.from_pretrained(base_model, adapter_path)

# 4. Merge & Save
print("--- üß¨ MERGING WEIGHTS ---")
model = model.merge_and_unload() # Fuses the LoRA into the base model

print("--- üíæ SAVING MERGED MODEL ---")
model.save_pretrained(output_merged)
tokenizer.save_pretrained(output_merged)
print("‚úÖ Merged model saved to local disk.")

# 5. Clean up RAM
import gc
del model
del base_model
gc.collect()

# 6. Convert to GGUF using Llama.cpp (Official Method)
print("--- üõ†Ô∏è SETTING UP LLAMA.CPP ---")
!git clone https://github.com/ggerganov/llama.cpp
!cd llama.cpp && make # Compile the conversion tools

print("--- üîÑ CONVERTING TO GGUF (Quantization: q4_k_m) ---")
# Step A: Convert to GGUF format
!python llama.cpp/convert_hf_to_gguf.py {output_merged} --outfile veritas_f16.gguf

# Step B: Quantize to 4-bit (q4_k_m)
!llama.cpp/llama-quantize veritas_f16.gguf veritas_final_q4km.gguf q4_k_m

print("‚úÖ DONE! PREPARING DOWNLOAD...")
from google.colab import files
files.download('veritas_final_q4km.gguf')



--- üìÇ MOUNTING DRIVE ---
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
--- ‚öôÔ∏è INSTALLING TOOLS (Takes ~2 mins) ---
--- üß† LOADING MODEL (This will fit in CPU RAM) ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

--- üîó ATTACHING ADAPTERS FROM: /content/drive/My Drive/veritas_adapters ---
--- üß¨ MERGING WEIGHTS ---
--- üíæ SAVING MERGED MODEL ---
‚úÖ Merged model saved to local disk.
--- üõ†Ô∏è SETTING UP LLAMA.CPP ---
Cloning into 'llama.cpp'...
remote: Enumerating objects: 71768, done.[K
remote: Counting objects: 100% (233/233), done.[K
remote: Compressing objects: 100% (192/192), done.[K
remote: Total 71768 (delta 133), reused 44 (delta 40), pack-reused 71535 (from 3)[K
Receiving objects: 100% (71768/71768), 234.35 MiB | 27.24 MiB/s, done.
Resolving deltas: 100% (51821/51821), done.
Updating files: 100% (2051/2051), done.
Makefile:6: *** Build system changed:
 The Makefile build has been replaced by CMake.

 For build instructions see:
 https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md

.  Stop.
--- üîÑ CONVERTING TO GGUF (Quantization: q4_k_m) ---
INFO:hf-to-gguf:Loading model: veritas_merged_hf
INFO:hf-to-gguf:Model architecture: Qwen2ForCausalLM
INFO:hf-to-gguf:gg

FileNotFoundError: Cannot find file: veritas_final_q4km.gguf

In [2]:
import os
from google.colab import files

# 1. Verify the uncompressed file is there
if not os.path.exists("veritas_f16.gguf"):
    print("‚ùå Critical Error: 'veritas_f16.gguf' is missing. You might need to re-run the previous merge step.")
else:
    print("‚úÖ Found 'veritas_f16.gguf' (3.1 GB). Proceeding...")

    # 2. Compile llama.cpp correctly (using CMake)
    print("--- üõ†Ô∏è COMPILING TOOLS (Takes ~2-3 mins) ---")
    !cd llama.cpp && cmake -B build && cmake --build build --config Release

    # 3. Run Quantization (Shrink to ~1GB)
    print("--- üîÑ QUANTIZING to q4_k_m ---")
    # The tool is located in build/bin/ now
    !llama.cpp/build/bin/llama-quantize veritas_f16.gguf veritas_final_q4km.gguf q4_k_m

    # 4. Download
    print("‚úÖ DONE! Starting Download...")
    if os.path.exists('veritas_final_q4km.gguf'):
        files.download('veritas_final_q4km.gguf')
    else:
        print("‚ùå Error: Quantization failed.")

‚úÖ Found 'veritas_f16.gguf' (3.1 GB). Proceeding...
--- üõ†Ô∏è COMPILING TOOLS (Takes ~2-3 mins) ---
-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
[0mCMAKE_BUILD_TYPE=Release[0m
-- Found Git: /usr/bin/git (found version "2.34.1")
-- The ASM compiler identification is GNU
-- Found assembler: /usr/bin/cc
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE
-- CMAKE_SYSTEM_PROCESSOR: x86_64
-- GGML_SYSTEM_ARCH: x86
-- Including CPU backend
-- Found OpenMP_C: -fopenmp (f

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>