In [None]:
# ============================================================
# 0. (Optional) Install Unsloth - run this ONCE in Colab
# ============================================================
# If you already have unsloth installed in your runtime, you can skip this cell.

%%capture
!pip install unsloth
!pip uninstall unsloth -y
!pip install --upgrade --no-cache-dir --no-deps \
    git+https://github.com/unslothai/unsloth.git@nightly \
    git+https://github.com/unslothai/unsloth-zoo.git

!pip install -U "transformers" "huggingface_hub"


In [None]:
# ============================================================
# HF Hub config for GGUF upload
# ============================================================

from google.colab import userdata

# Credentials are read from the Colab secrets store, never hard-coded.
HF_TOKEN = userdata.get("HF_TOKEN")
HF_USERNAME = userdata.get("HF_USERNAME")

# Target repository for the exported GGUF model: "<username>/<model name>".
HF_GGUF_REPO = f"{HF_USERNAME}/llama3.2-3b-finetome-25k"

# ---- Global training parameters ----
BATCH_SIZE = 50   # passed as per_device_train_batch_size
NUM_EPOCHS = 1    # passed as num_train_epochs
MAX_STEPS = 3     # only referenced from a commented-out TrainingArguments line

# ---- GGUF quantization ----
# Either a single method name or a list of names, for example:
#   "q8_0"   - high quality, larger, uses more RAM
#   "q4_k_m" - very popular tradeoff (recommended)
#   "f16"    - full-precision GGUF (largest)
GGUF_QUANT_METHODS = "q8_0"  # or ["q4_k_m", "q8_0", "f16"]


In [None]:
# ============================================================
# 1. Imports + basic configuration
# ============================================================
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import (
    get_chat_template,
    standardize_sharegpt,
    train_on_responses_only,
)
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
import torch

# --- Model / training configuration ---
model_name = "unsloth/Llama-3.2-3B-Instruct"
max_seq_length = 2048   # context length used for training
load_in_4bit = True     # QLoRA-style 4-bit loading
dtype = None            # None = auto-detect (fp16 on T4/V100, bf16 on A100/L4 etc.)

# --- Checkpoint configuration ---
OUTPUT_DIR = "outputs"   # directory the HuggingFace Trainer writes checkpoints to
SAVE_EVERY_STEPS = 2     # checkpoint interval, in steps
SAVE_TOTAL_LIMIT = 3     # keep only the 3 most recent checkpoints

# --- Start fresh or resume ---
# None   -> train from scratch.
# True or a path such as "outputs/checkpoint-40" -> resume from a checkpoint.
RESUME_FROM_CHECKPOINT = None


In [None]:
# ============================================================
# 2. Load base model + tokenizer with Unsloth (4-bit)
# ============================================================
# Keyword arguments are collected first so the loader settings can be
# inspected (or printed) before the call is made.
load_kwargs = dict(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # only needed for gated models (e.g. meta-llama/*)
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

# ============================================================
# 3. Add LoRA adapters (PEFT)
#    -> we only train a small set of extra parameters
# ============================================================
# Attention projections followed by the MLP projections.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
lora_kwargs = dict(
    r=16,                                  # any number > 0; suggested 8, 16, 32, 64, 128
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,                        # any value is supported, but 0 is optimized
    bias="none",                           # any value is supported, but "none" is optimized
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,                      # rank stabilized LoRA is available but off
    loftq_config=None,                     # LoftQ is available but off
)
model = FastLanguageModel.get_peft_model(model, **lora_kwargs)


In [None]:
# ============================================================
# 4. Data preparation: FineTome-100k + chat template
# ============================================================

# 4.1 Rebind `tokenizer` to the one returned by get_chat_template for the
#     "llama-3.2" template (Llama-3.x style: <|start_header_id|>user ... etc.).
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.2")

# 4.2 Batched helper: render every conversation to plain text with the chat template
def formatting_prompts_func(examples):
    """Render a batch of conversations into chat-template strings.

    Args:
        examples: mapping with a "conversations" entry holding one
            conversation per example in the batch.

    Returns:
        A dict with a single "text" key: a list with one rendered string per
        conversation, in input order. Rendering uses the module-level
        `tokenizer`, untokenized and without a generation prompt.
    """
    rendered = []
    for conversation in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# 4.3 Load the first NUM_TRAIN_EXAMPLES rows of FineTome-100k (ShareGPT style).
#     The subset size is a named constant so it can be tuned in one place.
NUM_TRAIN_EXAMPLES = 25_000
raw_dataset = load_dataset(
    "mlabonne/FineTome-100k",
    split=f"train[:{NUM_TRAIN_EXAMPLES}]",
)

# 4.4 Convert ShareGPT format ("from", "value") -> HF generic ("role", "content")
chat_dataset = standardize_sharegpt(raw_dataset)

# 4.5 Apply our formatting function to create a "text" field.
#     Each stage gets its own name so re-running a later step never feeds on
#     already-transformed data; `dataset` is the final result used for training.
dataset = chat_dataset.map(formatting_prompts_func, batched=True)

# (Optional) Inspect one example to sanity-check
print("Example conversations[5]:")
print(dataset[5]["conversations"])
print("\nExample text[5] after applying chat template:")
print(dataset[5]["text"][:400], "...")


In [None]:
# ============================================================
# 5. Create SFTTrainer (Supervised Fine-Tuning)
# ============================================================

# Collator handed to the trainer below.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

# Number of micro-batches accumulated per optimizer step.
# Effective batch size per device = BATCH_SIZE * GRAD_ACCUM_STEPS.
GRAD_ACCUM_STEPS = 4

# Probe bf16 support once so the fp16 / bf16 flags are always exact opposites.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size = BATCH_SIZE,
    gradient_accumulation_steps = GRAD_ACCUM_STEPS,
    warmup_steps               = 5,
    # max_steps                  = MAX_STEPS,  # disabled: run length is set by num_train_epochs
    num_train_epochs           = NUM_EPOCHS,
    learning_rate              = 2e-4,
    fp16                       = not use_bf16,
    bf16                       = use_bf16,
    logging_steps              = 1,
    optim                      = "adamw_8bit",
    weight_decay               = 0.01,
    lr_scheduler_type          = "linear",
    seed                       = 3407,
    output_dir                 = OUTPUT_DIR,  # where checkpoints & logs go
    report_to                  = "none",     # or "wandb", "tensorboard", etc.

    # ------- periodic checkpoint saving --------
    save_strategy              = "steps",    # save checkpoints by number of steps
    save_steps                 = SAVE_EVERY_STEPS,
    save_total_limit           = SAVE_TOTAL_LIMIT,  # keep only last N checkpoints
)

trainer = SFTTrainer(
    model              = model,
    tokenizer          = tokenizer,
    train_dataset      = dataset,
    dataset_text_field = "text",           # column produced by formatting_prompts_func
    max_seq_length     = max_seq_length,
    data_collator      = data_collator,
    dataset_num_proc   = 2,
    packing            = False,  # can be True for big speedup if sequences are short
    args               = training_args,
)

# ============================================================
# 6. Train on ASSISTANT responses only (mask user/system prompts)
# ============================================================
# Header markers that open a user turn and an assistant turn in the template.
user_header = "<|start_header_id|>user<|end_header_id|>\n\n"
assistant_header = "<|start_header_id|>assistant<|end_header_id|>\n\n"

trainer = train_on_responses_only(
    trainer,
    instruction_part=user_header,
    response_part=assistant_header,
)

# Sanity check: decode one processed example with and without the label mask.
masked_example = trainer.train_dataset[5]

print("\n--- Verify masking on example 5 ---")
print("Input text:")
print(tokenizer.decode(masked_example["input_ids"])[:400], "...\n")

# Labels equal to -100 are swapped for a space token so the sequence can be
# decoded and the masked region shows up as blank space.
space_token_id = tokenizer(" ", add_special_tokens=False).input_ids[0]
visible_labels = []
for label_id in masked_example["labels"]:
    visible_labels.append(space_token_id if label_id == -100 else label_id)

print("Labels (masked non-assistant tokens replaced by spaces for visualization):")
print(tokenizer.decode(visible_labels)[:400], "...")


In [None]:
# ============================================================
# 7. Train the model (from scratch or resume from checkpoint)
# ============================================================

def _bytes_to_gib(num_bytes):
    """Convert a byte count to GiB, rounded to 3 decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)


# Show starting GPU memory usage
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = _bytes_to_gib(torch.cuda.max_memory_reserved())
max_memory = _bytes_to_gib(gpu_stats.total_memory)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved before training.\n")

# Any falsy setting (None, False, "") means "train from scratch"; only True or
# a checkpoint path takes the resume branch, so the log line always matches
# what the trainer is actually asked to do.
if not RESUME_FROM_CHECKPOINT:
    print(">>> Training from scratch (no checkpoint)...")
    trainer_stats = trainer.train()
else:
    print(f">>> Resuming training from checkpoint: {RESUME_FROM_CHECKPOINT}")
    trainer_stats = trainer.train(resume_from_checkpoint=RESUME_FROM_CHECKPOINT)

# Show memory + runtime stats.
# The "for training" figure is the peak minus what was already reserved
# before trainer.train() was called.
used_memory = _bytes_to_gib(torch.cuda.max_memory_reserved())
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print("\n==== Training finished ====")
print(f"Train runtime (seconds): {trainer_stats.metrics['train_runtime']}")
print(f"Train runtime (minutes): {round(trainer_stats.metrics['train_runtime']/60, 2)}")
print(f"Peak reserved memory: {used_memory} GB.")
print(f"Peak reserved memory for training: {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory: {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory: {lora_percentage} %.")


In [None]:
# ============================================================
# 9. Save model locally
# ============================================================
# Writes to the "merged_model" directory using the "merged_16bit" save method;
# the GGUF conversion step reads from that same directory.
model.save_pretrained_merged(
    "merged_model",
    tokenizer,
    save_method="merged_16bit",
)

In [None]:
# ============================================================
# 10. Convert to GGUF
# ============================================================
import os
if not os.path.isdir("llama.cpp"):
    !git clone https://github.com/ggerganov/llama.cpp.git

!python llama.cpp/convert_hf_to_gguf.py merged_model \
    --outfile llama-3.2-3b-finetuned.gguf \
    --outtype {GGUF_QUANT_METHODS}

In [None]:
# ============================================================
# 11. Push GGUF model(s) to Hugging Face Hub
# ============================================================
# Uploads GGUF export(s) of the model to the HF repo named by `HF_GGUF_REPO`,
# from where a Gradio/Streamlit app can download them.
# NOTE(review): this call is given the in-memory model and tokenizer, not the
# .gguf file written in step 10 - presumably Unsloth runs its own conversion
# here; confirm whether both steps are needed.

print(f"Pushing GGUF to HF repo: {HF_GGUF_REPO}")
print(f"Quantization methods: {GGUF_QUANT_METHODS}")

push_options = dict(
    quantization_method=GGUF_QUANT_METHODS,  # list or single string
    token=HF_TOKEN,
)
model.push_to_hub_gguf(HF_GGUF_REPO, tokenizer, **push_options)

print("✅ GGUF upload complete!")
print("Visit your repo at: https://huggingface.co/" + HF_GGUF_REPO)
