In [None]:
%%capture

# Environment setup. %%capture hides the (long) pip output of this cell.
# NOTE(review): none of these installs are version-pinned, so a re-run may pull
# different library versions than the ones shown in the recorded outputs.

# Hugging Face stack used for 4-bit loading and LoRA fine-tuning.
!pip install bitsandbytes accelerate transformers datasets peft
# Released-package alternative, kept for reference:
# !pip install unsloth
# Replace any installed Unsloth with the current build from GitHub.
# --no-deps means pip does not install Unsloth's dependencies in this step;
# they have to come from the other install lines in this cell.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
!pip install unsloth_zoo

In [None]:
# Seed set of plain-English -> Yoda-style sentence pairs.
train_data = [
    dict(input=plain, output=yoda)
    for plain, yoda in (
        ("The Force is strong in you.", "Strong in you, the Force is."),
        ("You must learn the ways of the Force.", "Learn the ways of the Force, you must."),
        ("I will help you.", "Help you, I will."),
    )
]


In [None]:
from datasets import Dataset

# Yoda-style training pairs: four distinct (plain, yoda) sentences, with the
# resulting list repeated 20 times so the dataset ends up with 80 rows.
examples = 20 * [
    {"input": plain, "output": yoda}
    for plain, yoda in (
        ("The Force is strong in you.", "Strong in you, the Force is."),
        ("I will help you.", "Help you, I will."),
        ("This is your destiny.", "Your destiny, this is."),
        ("You must learn the ways of the Force.", "Learn the ways of the Force, you must."),
    )
]
def format_sample(e):
    """Render one example as a Llama 3 Instruct chat transcript.

    Args:
        e: Mapping with an ``"input"`` entry (the user sentence) and an
            ``"output"`` entry (the assistant reply).

    Returns:
        A dict with a single ``"text"`` key. The text starts with
        ``<|begin_of_text|>`` and contains a system, a user and an assistant
        turn; each turn has a header, a blank line, its content and a
        closing ``<|eot_id|>``.
    """
    system_turn = (
        "<|start_header_id|>system<|end_header_id|>\n\n"
        "Speak like Yoda, you must.<|eot_id|>"
    )
    user_turn = f"<|start_header_id|>user<|end_header_id|>\n\n{e['input']}<|eot_id|>"
    assistant_turn = f"<|start_header_id|>assistant<|end_header_id|>\n\n{e['output']}<|eot_id|>"
    transcript = "".join(("<|begin_of_text|>", system_turn, user_turn, assistant_turn))
    return {"text": transcript}

# Apply the chat layout to every example and wrap the rows in a Hugging Face Dataset.
dataset = Dataset.from_list(list(map(format_sample, examples)))
dataset

Dataset({
    features: ['text'],
    num_rows: 80
})

In [None]:
from unsloth import FastLanguageModel
from transformers import TrainingArguments
from trl import SFTTrainer
from datasets import Dataset

# Load the 4-bit Llama 3.1 8B Instruct checkpoint together with its tokenizer.
# Alternative checkpoints noted by the author:
#   "unsloth/llama-3-8b-bnb-4bit", "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "unsloth/Llama-3.2-3B"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    load_in_4bit=True,
    dtype=None,  # None = automatic dtype selection
    max_seq_length=512,
)

# Attach LoRA adapters to the loaded model: rank 8, alpha 16, dropout 0.05,
# bias terms left untouched.
model = FastLanguageModel.get_peft_model(
    model,
    bias="none",
    lora_dropout=0.05,
    lora_alpha=16,
    r=8,
    # task_type = "CAUSAL_LM",
)



# 4. Set training arguments
# training_args = TrainingArguments(
#     per_device_train_batch_size = 1,
#     gradient_accumulation_steps = 1,
#     num_train_epochs = 3,
#     learning_rate = 2e-4,
#     fp16 = True,
#     logging_steps = 1,
#     output_dir = "yoda-lora",
#     save_strategy = "no",
# )


# Trainer configuration for a short 18-step run.
training_args = TrainingArguments(
    # Output and reporting
    output_dir="yoda-lora",
    report_to=[],  # disables wandb
    disable_tqdm=False,
    remove_unused_columns=False,
    # Schedule: max_steps overrides num_train_epochs
    max_steps=18,
    num_train_epochs=2,
    # Optimisation
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    fp16=False,
)



==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
def tokenize_function(example):
    """Tokenize a batch of pre-formatted chat transcripts.

    Args:
        example: Batch passed by ``dataset.map(..., batched=True)``; its
            ``"text"`` entry holds the formatted transcripts.

    Returns:
        The tokenizer output for the batch, with each row truncated to at
        most 512 tokens and left unpadded.
    """
    return tokenizer(
        example["text"],
        # Every transcript already begins with <|begin_of_text|>; letting the
        # tokenizer add its own special tokens prepends a second BOS token.
        add_special_tokens=False,
        # No padding="max_length": the samples are far shorter than 512 tokens
        # and the training batch size is 1, so static padding only adds pad
        # tokens the model has to process on every step.
        truncation=True,
        max_length=512,
    )

# Replace the raw "text" column with the tokenizer's output columns.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

In [None]:
# Build the supervised fine-tuning trainer on the pre-tokenized dataset, then run it.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_dataset,
    dataset_text_field="text",
    max_seq_length=512,
    logging_steps=2,
)
print("training  reached")
trainer.train()

training  reached


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 80 | Num Epochs = 1 | Total steps = 18
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 1 x 1) = 1
 "-____-"     Trainable parameters = 20,971,520/8,000,000,000 (0.26% trained)


Step,Training Loss
2,10.7123
4,9.2152
6,8.1871
8,6.5721
10,6.0118
12,4.4802
14,3.5392
16,2.558
18,1.8343


Unsloth: Will smartly offload gradients to save VRAM!


TrainOutput(global_step=18, training_loss=5.901128464274937, metrics={'train_runtime': 22.0066, 'train_samples_per_second': 0.818, 'train_steps_per_second': 0.818, 'total_flos': 416151955832832.0, 'train_loss': 5.901128464274937})

In [None]:
# Inference prompt in the same Llama 3 Instruct layout as the training text:
# every turn is closed by <|eot_id|> and every header is followed by a blank
# line. The prompt ends with an open assistant turn for the model to complete.
prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Speak like Yoda, you must.<|eot_id|><|start_header_id|>user<|end_header_id|>

Who killed Jabba the Hutt?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

# The prompt string already starts with <|begin_of_text|>, so the tokenizer's
# automatic special tokens are disabled; otherwise the input begins with two
# BOS tokens.
inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(model.device)

# Sample up to 100 new tokens, stopping once the assistant turn is closed.
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    eos_token_id=tokenizer.convert_tokens_to_ids("<|eot_id|>"),
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


system

Speak like Yoda, you must.user

Who killed Jabba the Hutt?|eot_id|>assistant
"A bounty hunter, Boba Fett, killed Jabba the Hutt. A bounty, he was, on the Hutt's head. The sarlacc, a victim, Jabba became. In the Great Pit of Carkoon, he met his end."


In [None]:
# Debug aid: tokenize the prompt on its own and print the token strings it maps to.
tokens = tokenizer(prompt, return_tensors="pt")
prompt_token_ids = tokens["input_ids"][0]
print(tokenizer.convert_ids_to_tokens(prompt_token_ids))

['<|begin_of_text|>', '<|begin_of_text|>', '<|start_header_id|>', 'system', '<|end_header_id|>', 'ĊĊ', 'Speak', 'Ġlike', 'ĠY', 'oda', ',', 'Ġyou', 'Ġmust', '.', '<|eot_id|>', '<|start_header_id|>', 'user', '<|end_header_id|>', 'ĊĊ', 'Who', 'Ġkilled', 'ĠJab', 'ba', 'Ġthe', 'ĠH', 'utt', '?', '|', 'e', 'ot', '_id', '|', '>', '<|start_header_id|>', 'assistant', '<|end_header_id|>', 'Ċ']


In [None]:
# base_model, _ = FastLanguageModel.from_pretrained(
#     model_name = "unsloth/Llama-3.2-3B",
#     max_seq_length = 512,
#     dtype = None,
#     load_in_4bit = True,
# )
# inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# # outputs = model.generate(**inputs, max_new_tokens=50)
# outputs = base_model.generate(
#     **inputs,
#     max_new_tokens=100,
#     do_sample=True,
#     temperature=0.7,
#     top_p=0.9,
#     eos_token_id=tokenizer.convert_tokens_to_ids("<|eot_id|>"),
# )
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [None]:

# Persist the LoRA adapter and the tokenizer side by side so they can be
# reloaded later for merging/export.
ADAPTER_DIR = "yoda-lora"
model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)


('yoda-lora/tokenizer_config.json',
 'yoda-lora/special_tokens_map.json',
 'yoda-lora/tokenizer.json')

In [None]:
import torch

In [None]:
# from peft import AutoPeftModelForCausalLM

# # Load base + merge LoRA
# merged_model = AutoPeftModelForCausalLM.from_pretrained(
#     "yoda-lora",  # your LoRA output dir
#     low_cpu_mem_usage=True,
#     torch_dtype=torch.float16,
# ).merge_and_unload()

# Save merged model (standard Hugging Face format)
# merged_model.save_pretrained("merged_yoda_model", safe_serialization=True, max_shard_size="10GB")
# tokenizer.save_pretrained("merged_yoda_model")


In [None]:
# !tar -czvf goog_merged_yoda_model3.tar.gz merged_yoda_model

In [None]:
from unsloth import FastLanguageModel
from peft import PeftModel
from transformers import AutoTokenizer

# Export pipeline: reload the base checkpoint, attach the saved LoRA adapter,
# merge it into the base weights and write the result as a GGUF file.

# 1. Load base model (same checkpoint name used for training)
# NOTE(review): max_seq_length is 4096 here but 512 in the training cell —
# confirm the difference is intentional.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name= "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = 4096,
    dtype = None,
    load_in_4bit = True,
)

# 2. Load the LoRA adapter saved earlier on top of the base model
model = PeftModel.from_pretrained(
    model,
    "yoda-lora",  # folder containing adapter_model.safetensors
)

# 3. Merge LoRA into base model weights and drop the adapter wrapper
# NOTE(review): the base model was loaded with load_in_4bit=True, so this merge
# presumably happens on quantised weights and may lose precision — confirm the
# exported model's quality, or merge against a 16-bit base instead.
model = model.merge_and_unload()

# 4. Export to GGUF (you can choose quantization method here)
# The recorded output of this cell shows Unsloth installing llama.cpp and
# converting to a bf16 GGUF before quantising, so expect this to take a while.
model.save_pretrained_gguf(
    "merged_yoda_gguf",     # output directory
    tokenizer,
    quantization_method="q4_k_m"  # or "f16", "q8_0"
)


==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 5.7G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 35.15 out of 52.96 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


  0%|          | 0/32 [00:00<?, ?it/s]
We will save to Disk and not RAM now.
100%|██████████| 32/32 [02:35<00:00,  4.86s/it]


Unsloth: Saving tokenizer... Done.
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at merged_yoda_gguf into bf16 GGUF format.
The output location will be /content/merged_yoda_gguf/unsloth.BF16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: merged_yoda_gguf
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model par

In [None]:
from google.colab import files

# Zip the Q4_K_M GGUF file produced by the export step, then hand the archive
# to the Colab file helper so the browser downloads it (Colab-only API).
!zip yoda_gguf.zip merged_yoda_gguf/unsloth.Q4_K_M.gguf
files.download("yoda_gguf.zip")

  adding: merged_yoda_gguf/unsloth.Q4_K_M.gguf (deflated 18%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>