<a href="https://colab.research.google.com/github/samipn/unsloth.ai_demo/blob/main/colab2_lora_smollm2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Colab 2 — LoRA / QLoRA on SmolLM2‑135M (same dataset)
PEFT (LoRA) vs full FT comparison using `yahma/alpaca-cleaned`. This runs fast and saves a tiny adapter (~100MB).

In [1]:
#@title Install Unsloth + deps (Colab-safe)
%pip -q install --upgrade pip
%pip -q install unsloth datasets trl transformers accelerate bitsandbytes peft --no-cache-dir
import torch, platform
print("PyTorch:", torch.__version__, "CUDA:", torch.version.cuda, "Python:", platform.python_version())


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m93.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
pylibcudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.[0m[31m
[0mPyTorch: 2.8.0+cu126 CUDA: 12.6 Python: 3.12.12


In [6]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch

MODEL_NAME = "HuggingFaceTB/SmolLM2-135M"
max_seq_length = 2048
dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16

# Set load_in_4bit=True to demonstrate QLoRA; switch to False for FP16 LoRA.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,   # QLoRA path
)

# Set the chat template for the tokenizer
# The model uses <|endoftext|> as pad_token, so it's a good candidate for message separation
eos_token_val = tokenizer.eos_token
tokenizer.chat_template = (
    "{% for message in messages %}"
        "{% if message['role'] == 'system' %}"
            f"{{{{ '<|system|>\n' + message['content'] + '{eos_token_val}' + '\n' }}}}"
        "{% elif message['role'] == 'user' %}"
            f"{{{{ '<|user|>\n' + message['content'] + '{eos_token_val}' + '\n' }}}}"
        "{% elif message['role'] == 'assistant' %}"
            f"{{{{ '<|assistant|>\n' + message['content'] + '{eos_token_val}' + '\n' }}}}"
        "{% endif %}"
    "{% endfor %}"
    "{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}" # Add this for inference
)


# Attach LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0.0,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    max_seq_length=max_seq_length,
)
print("Model ready with LoRA adapters.")

==((====))==  Unsloth 2025.11.2: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.318 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
HuggingFaceTB/SmolLM2-135M does not have a padding token! Will use pad_token = <|endoftext|>.
Model ready with LoRA adapters.


In [7]:
from datasets import load_dataset

ds = load_dataset("yahma/alpaca-cleaned")
train = ds["train"].select(range(2000))

def to_chat(ex):
    instr = ex["instruction"]
    inp = ex.get("input", "")
    out = ex["output"]
    user = instr if not inp else f"{instr}\n\nInput: {inp}"
    messages = [
        {"role":"system","content":"You are a helpful assistant."},
        {"role":"user","content":user},
        {"role":"assistant","content":out},
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False)
    return {"text": text}

train = train.map(to_chat, remove_columns=train.column_names)


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [8]:
from trl import SFTTrainer, SFTConfig
import torch

cfg = SFTConfig(
    output_dir="outputs_lora_smolm2",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    num_train_epochs=1,
    max_seq_length=1024,
    logging_steps=10,
    save_steps=100,
    bf16=torch.cuda.is_bf16_supported(),
    fp16=not torch.cuda.is_bf16_supported(),
    optim="adamw_8bit",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train,
    dataset_text_field="text",
    args=cfg,
)
trainer.train()

# Save LoRA adapter
model.save_pretrained("smollm2_lora_adapter")
tokenizer.save_pretrained("smollm2_lora_adapter")
print("Saved LoRA adapter to ./smollm2_lora_adapter")


Unsloth: We found double BOS tokens - we shall remove one automatically.


Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/2000 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,000 | Num Epochs = 1 | Total steps = 125
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 2 x 1) = 16
 "-____-"     Trainable parameters = 4,884,480 of 139,399,488 (3.50% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msamipn[0m ([33msamipn-san-jose-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.712
20,2.5654
30,2.1833
40,1.8821
50,1.7999
60,1.6924
70,1.6268
80,1.6151
90,1.5812
100,1.5749


Saved LoRA adapter to ./smollm2_lora_adapter


In [11]:
# Quick inference helper
from unsloth import FastLanguageModel
import torch

FastLanguageModel.for_inference(model)  # enables 2x faster kernels (no change to outputs)

def chat(prompt, history=None, max_new_tokens=128):
    if history is None: history = []
    messages = [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=max_new_tokens)
    print(tokenizer.decode(out[0], skip_special_tokens=True))

system_prompt = "You are a helpful assistant."
chat("Say hello in one sentence.")


<|system|>
You are a helpful assistant.
<|user|>
Say hello in one sentence.
<|assistant|>
Hello,


In [12]:
# (Optional) Merge LoRA -> 16-bit for serving or export to GGUF/Ollama
merged = False
try:
    merged = True
    merged_model = FastLanguageModel.merge_and_unload(model) # returns a plain HF model
    merged_model.save_pretrained("smollm2_merged_16bit")
    tokenizer.save_pretrained("smollm2_merged_16bit")
    print("Merged to ./smollm2_merged_16bit")
except Exception as e:
    print("Merge skipped:", e)


Merge skipped: type object 'FastLanguageModel' has no attribute 'merge_and_unload'
