<a href="https://colab.research.google.com/github/shubham5027/LLM-Finetuning/blob/main/Llama_3_2_3B_Instruct_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Experiment v1: SFT + LoRA (Unsloth)
# Base model: unsloth/Llama-3.2-3B-Instruct

In [1]:
# Install Unsloth (includes a patched, stable HF + PEFT stack)
!pip install -U unsloth


Collecting unsloth
  Downloading unsloth-2025.12.10-py3-none-any.whl.metadata (66 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.6/66.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.12.8 (from unsloth)
  Downloading unsloth_zoo-2025.12.8-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-1.0.3-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.33.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting bitsandbytes!=0.46.0,!=0.48.0,>=0.45.5 (from unsloth)
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets!=4.0.*,!=4.1.0,<4.4.0,>=3.4.1 (from unsloth)
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting trl!=0.19.0,<=0.24.0,>=0.18.2 (from 

In [2]:

import os
import json
import csv
import torch
from datasets import load_dataset
from unsloth import FastLanguageModel
from transformers import TrainingArguments
from trl import SFTTrainer


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


# Global Configuration

In [3]:

MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct"
ADAPTER_DIR = "unsloth-lora-adapter"
OUTPUT_DIR = "./unsloth-output"

In [4]:
EVAL_PROMPTS = [
    "Explain LoRA in simple terms.",
    "Write a Python function to reverse a string.",
    "What is gradient accumulation and why is it used?",
    "Explain attention in transformers like I'm 12.",
    "Difference between LoRA and QLoRA.",
]

GEN_KWARGS = dict(
    max_new_tokens=200,
    temperature=0.7,
    top_p=0.9,
)

# Evaluation Utilities


In [5]:
def generate_outputs(model, tokenizer, prompts):
    FastLanguageModel.for_inference(model)
    outputs = []

    for prompt in prompts:
        full_prompt = f"""### Instruction:
{prompt}

### Response:
"""
        inputs = tokenizer(full_prompt, return_tensors="pt").to("cuda")
        with torch.no_grad():
            out = model.generate(**inputs, **GEN_KWARGS)
        outputs.append(tokenizer.decode(out[0], skip_special_tokens=True))

    return outputs


def export_eval_report(prompts, before, after, prefix="eval_report"):
    rows = []
    for p, b, a in zip(prompts, before, after):
        rows.append({
            "prompt": p,
            "before": b,
            "after": a,
            "before_len": len(b.split()),
            "after_len": len(a.split()),
        })

    # JSON
    with open(f"{prefix}.json", "w") as f:
        json.dump(rows, f, indent=2)

    # CSV
    with open(f"{prefix}.csv", "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=rows[0].keys())
        writer.writeheader()
        writer.writerows(rows)


# BEFORE Evaluation

In [6]:

# -------------------------
base_model, base_tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=2048,
    dtype=torch.float16,
    load_in_4bit=True,
)

before_outputs = generate_outputs(
    base_model,
    base_tokenizer,
    EVAL_PROMPTS,
)

==((====))==  Unsloth 2025.12.10: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

# Training (SFT + LoRA)

In [7]:


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=2048,
    dtype=torch.float16,
    load_in_4bit=True,
)

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    bias="none",
    use_gradient_checkpointing=True,
    random_state=42,
)

dataset = load_dataset("tatsu-lab/alpaca", split="train[:2000]")

def format_prompt(ex):
    return f"""### Instruction:
{ex['instruction']}

### Response:
{ex['output']}"""

dataset = dataset.map(
    lambda x: {"text": format_prompt(x)},
    remove_columns=dataset.column_names,
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=1,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=512,
    args=training_args,
)

trainer.train()

model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)

==((====))==  Unsloth 2025.12.10: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.12.10 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-a09b74b3ef9c3b(…):   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/2000 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,000 | Num Epochs = 1 | Total steps = 250
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 4,587,520 of 3,217,337,344 (0.14% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.1412
20,1.7271
30,1.7947
40,1.6419
50,1.5714
60,1.6583
70,1.558
80,1.6048
90,1.6555
100,1.6078


('unsloth-lora-adapter/tokenizer_config.json',
 'unsloth-lora-adapter/special_tokens_map.json',
 'unsloth-lora-adapter/chat_template.jinja',
 'unsloth-lora-adapter/tokenizer.json')

# AFTER Evaluation

In [8]:

# -------------------------
ft_model, ft_tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=2048,
    dtype=torch.float16,
    load_in_4bit=True,
)

ft_model.load_adapter(ADAPTER_DIR)

after_outputs = generate_outputs(
    ft_model,
    ft_tokenizer,
    EVAL_PROMPTS,
)


==((====))==  Unsloth 2025.12.10: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


# Export Report

In [9]:

# -------------------------
export_eval_report(
    EVAL_PROMPTS,
    before_outputs,
    after_outputs,
    prefix="sft_v1_eval",
)

print("✅ Experiment complete. Reports saved (CSV + JSON).")

✅ Experiment complete. Reports saved (CSV + JSON).
