Install Unsloth

In [None]:
# Install Unsloth (includes a patched, stable HF + PEFT stack)
!pip install -U unsloth


# Imports and Environment

In [None]:


import os
import torch

from datasets import load_dataset
from unsloth import FastLanguageModel



# (Optional) Hugging Face Token

In [None]:

# Access token for gated checkpoints (LLaMA, some Qwen, etc.).
# Read from the HF_TOKEN environment variable; stays None when it is not set.
# In Colab, export the variable instead of pasting the token into the notebook.
HF_TOKEN = os.getenv("HF_TOKEN")



# Load Model and Tokenizer

In [None]:


# Instruct checkpoint that the LoRA adapters will be trained on top of
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

model, tokenizer = FastLanguageModel.from_pretrained(
    # Auth token for gated repositories; None when HF_TOKEN is unset
    token=HF_TOKEN,
    model_name=MODEL_NAME,
    # QLoRA: load the base weights with 4-bit quantization
    load_in_4bit=True,
    # Compute dtype (fp16 is the safest choice on Colab GPUs)
    dtype=torch.float16,
    # Longest context supported during training; larger values use more memory
    max_seq_length=2048,
)



# Add LoRA Adapters

In [None]:


model = FastLanguageModel.get_peft_model(
    model,
    # Layers that receive adapters: the query and value attention projections
    # only, which is the standard and most stable setup
    target_modules=["q_proj", "v_proj"],
    # LoRA rank — how much the model is allowed to adapt
    #   8  → small change
    #   16 → balanced (recommended)
    #   32 → strong, but higher overfitting risk
    r=16,
    # Scaling factor for the LoRA updates; usually 2 × r
    lora_alpha=32,
    # Regularization: dropout applied to the LoRA layers
    lora_dropout=0.05,
    # Bias terms are not trained
    bias="none",
    # Recompute activations during the backward pass to save memory
    use_gradient_checkpointing=True,
    # Fixed seed for reproducibility
    random_state=42,
)


Load Dataset

In [None]:
# Instruction-tuning data: only the first 2000 training rows are loaded,
# which keeps the run small enough for Colab.
dataset = load_dataset("tatsu-lab/alpaca", split="train[:2000]")


Prompt Formatting

In [None]:
# Standard instruction → response format
# The model learns to predict the response given the instruction
def format_prompt(example):
    """Render one instruction-tuning record as a single prompt string.

    Args:
        example: Mapping with "instruction" and "output" keys. An optional
            "input" key may hold extra context the instruction refers to.

    Returns:
        str: The instruction section, an "### Input:" section only when the
        record carries non-blank input, and finally the response section.
        Records without input produce exactly the two-section layout
        "### Instruction: ... ### Response: ...".
    """
    sections = ["### Instruction:\n" f"{example['instruction']}\n\n"]

    # Without this section an instruction such as "Summarize the following"
    # would be paired with a response but not with the text it refers to.
    context = example.get("input") or ""
    if str(context).strip():
        sections.append("### Input:\n" f"{context}\n\n")

    sections.append("### Response:\n" f"{example['output']}")
    return "".join(sections)


Prepare Dataset

In [None]:
# Convert dataset into a single "text" field.
# The tokenizer's EOS token is appended to every example so the model sees an
# explicit end-of-response marker during fine-tuning; without it, generation
# tends to run on until max_new_tokens is reached.
EOS_TOKEN = tokenizer.eos_token or ""  # plain string; empty if the tokenizer defines none

dataset = dataset.map(
    lambda x: {"text": format_prompt(x) + EOS_TOKEN},
    remove_columns = dataset.column_names,
)


Training Arguments

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # --- Output and bookkeeping ---
    # Directory that receives checkpoints and logs
    output_dir="./unsloth-output",
    # Write a checkpoint at the end of each epoch
    save_strategy="epoch",
    # Report the loss every 10 steps
    logging_steps=10,
    # No external loggers
    report_to="none",

    # --- Batch shape ---
    # Effective batch size = per-device batch × accumulation steps (2 × 4 = 8)
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,

    # --- Optimization ---
    # LoRA typically uses a higher learning rate than full fine-tuning
    learning_rate=2e-4,
    # Number of passes over the dataset
    num_train_epochs=1,
    # Mixed precision for speed and memory savings
    fp16=True,
)


Trainer

In [None]:
from trl import SFTTrainer

trainer = SFTTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    # Training examples: the prepared dataset, read from its "text" column
    # (the field that holds the formatted prompt)
    train_dataset=dataset,
    dataset_text_field="text",
    # Upper bound on sequence length during training
    max_seq_length=512,
)


Train

In [None]:
# Start LoRA fine-tuning.
# Runs training on the `trainer` built in the previous cell. As the last
# expression of the cell, the value returned by train() is displayed as output.
trainer.train()


Save LoRA Adapters

In [None]:
# Persist the trained LoRA adapters (small on disk) and the tokenizer files
# side by side in one directory.
adapter_dir = "unsloth-lora-adapter"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)


Inference Test

In [None]:
# Switch model to inference-optimized mode
FastLanguageModel.for_inference(model)

# Same "### Instruction / ### Response" layout the model was fine-tuned on;
# the response section is left open for the model to complete.
prompt = """### Instruction:
Explain LoRA in simple terms.

### Response:
"""

# Place the inputs on the model's own device instead of assuming "cuda"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens = 150,
        # temperature only takes effect when sampling is enabled; without
        # do_sample=True decoding is greedy and the setting is ignored
        do_sample = True,
        temperature = 0.7,
    )

# output[0] holds the prompt tokens followed by the generated continuation
print(tokenizer.decode(output[0], skip_special_tokens=True))


LoRA Parameters




In [1]:
# | `r`    | When to use                  |
# | ------ | ---------------------------- |
# | 4–8    | Style / formatting           |
# | **16** | ✅ General instruction tuning |
# | 32     | Domain-heavy adaptation      |
# | 64+    | ❌ Overkill on Colab          |


In [2]:
# 🔹 lora_alpha — Scaling

# Controls strength of LoRA updates
# Rule:
# lora_alpha ≈ 2 × r


# Examples:

# r=8 → alpha=16
# r=16 → alpha=32 ✅
# r=32 → alpha=64

In [None]:
# 🔹 lora_dropout

# Regularization for LoRA layers

# Value	        Use case
# 0.0	         Very clean data
# 0.05	       ✅ Default
# 0.1	         Noisy data

In [None]:
# 🔹 target_modules

# Where LoRA is injected

# Modules	                   Use
# q_proj, v_proj	       ✅ Standard
# + k_proj, o_proj	     More capacity
# All linear layers	     ❌ Risky

# 👉 Don’t change this early

In [None]:
# 2️⃣ Learning Rate (SECOND MOST IMPORTANT)

# LoRA tolerates higher LR than full fine-tuning
# LR	       Result
# 1e-4	     Conservative
# 2e-4	     ✅ Best default
# 3e-4	     Fast, risky
# 5e-4+	     Often unstable

In [None]:
# 3️⃣ Batch Size & Gradient Accumulation

# Effective batch size matters more than raw batch

# Formula: effective_batch = per_device_batch × grad_accum


# Typical safe values (Colab):

# 2 × 4 = 8 ✅
# 1 × 8 = 8
# 2 × 8 = 16 (if VRAM allows)

# 👉 Keep effective batch 8–32

In [None]:
# 4️⃣ Epochs (OVERFITTING RISK)

# Epochs	     When
# 1	           ✅ Default
# 2            Large / diverse data
# 3+	         ❌ Usually overfits

# 🧠 If loss ↓ but answers get worse → stop.

In [None]:
# 5️⃣ Sequence Length

# Length	       Use
# 256	          Short Q&A
# 512	          ✅ Most instruction data
# 1024+	        Only if needed

# ⚠️ Longer seq = more VRAM + slower training.

In [None]:
# 6️⃣ Temperature (INFERENCE ONLY)

# Value	       Behavior
# 0.2	         Deterministic
# 0.7	         ✅ Balanced
# 1.0+	       Creative, risky

In [3]:
# 7️⃣ Common Tuning Recipes


# ✅ General Instruction Tuning (RECOMMENDED)
# r = 16
# alpha = 32
# lr = 2e-4
# epochs = 1
# batch = 2
# grad_accum = 4
# seq_len = 512


# Style / Personality Tuning
# r = 8
# alpha = 16
# lr = 1e-4
# epochs = 1



# Domain Adaptation
# r = 32
# alpha = 64
# lr = 1e-4
# epochs = 2
