Install Required Libraries

In [1]:
!pip install -q transformers datasets peft accelerate bitsandbytes trl

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.7/60.7 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m528.8/528.8 kB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[?25h

Import Libraries

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
import bitsandbytes as bnb
import os

Choose a 1B LLaMA-Based Instruction Model

In [2]:

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"



Load a Small Instruction Dataset

In [3]:
dataset = load_dataset("tatsu-lab/alpaca", split="train[:2000]")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]



data/train-00000-of-00001-a09b74b3ef9c3b(…):   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

Tokenization Function

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(example):
    prompt = example["instruction"] + "\n" + example["input"]
    response = example["output"]
    text = prompt + "\n" + response

    tokens = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=256,
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_dataset = dataset.map(tokenize_function, batched=False)

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

PART 1 — LoRA Fine-Tuning (Full Precision)

Load Base Model (FP16)

In [5]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Apply LoRA (ONLY q_proj, v_proj)

In [6]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


Training Arguments

In [7]:
training_args = TrainingArguments(
    output_dir="./lora-output",
    per_device_train_batch_size=4,
    num_train_epochs=2,
    learning_rate=2e-4,
    logging_steps=50,
    fp16=True,
    save_strategy="no"
)

Train (LoRA)

In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

torch.cuda.reset_peak_memory_stats()
trainer.train()

print("Peak GPU Memory (LoRA):",
      torch.cuda.max_memory_allocated() / 1024**3, "GB")

Step,Training Loss
50,0.478309
100,0.413677
150,0.464019
200,0.442921
250,0.442679
300,0.442109
350,0.441857
400,0.465045
450,0.422363
500,0.436193


Peak GPU Memory (LoRA): 4.183407783508301 GB


PART 2 — QLoRA (4-bit Quantization)

Load Model in 4-Bit (NF4)

In [10]:
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model_qlora = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Apply SAME LoRA Config

In [11]:
model_qlora = get_peft_model(model_qlora, lora_config)
model_qlora.print_trainable_parameters()

trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


Train (QLoRA)

In [12]:
training_args_qlora = TrainingArguments(
    output_dir="./qlora-output",
    per_device_train_batch_size=4,
    num_train_epochs=2,
    learning_rate=2e-4,
    logging_steps=50,
    fp16=True,
    save_strategy="no"
)

trainer_qlora = Trainer(
    model=model_qlora,
    args=training_args_qlora,
    train_dataset=tokenized_dataset,
)

torch.cuda.reset_peak_memory_stats()
trainer_qlora.train()

print("Peak GPU Memory (QLoRA):",
      torch.cuda.max_memory_allocated() / 1024**3, "GB")

Step,Training Loss
50,4.724262
100,0.505887
150,0.527843
200,0.494963
250,0.483448
300,0.482291
350,0.477456
400,0.498105
450,0.45134
500,0.464745


Peak GPU Memory (QLoRA): 5.752193450927734 GB


Generate Outputs (Comparison)

In [13]:
test_prompts = [
    "Explain what overfitting is in machine learning.",
    "Write a short motivational paragraph for students.",
    "What are the benefits of exercise?"
]

def generate(model, prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    output = model.generate(**inputs, max_new_tokens=100)
    return tokenizer.decode(output[0], skip_special_tokens=True)

print("---- LoRA Outputs ----")
for p in test_prompts:
    print(generate(model, p))
    print("\n")

print("---- QLoRA Outputs ----")
for p in test_prompts:
    print(generate(model_qlora, p))
    print("\n")

---- LoRA Outputs ----
Explain what overfitting is in machine learning.

Overfitting is a type of machine learning problem where the model is trained on a small portion of the data and then used to make predictions on new data. This is because the model has learned the patterns in the training data, but it may not generalize well to new data. Overfitting can lead to poor performance on new data, as the model may not be able to generalize to new data.


Write a short motivational paragraph for students.

Students, the world is full of opportunities and challenges. It's up to you to decide which path to take. The key to success is to be open-minded, persistent, and willing to take risks. Don't be afraid to try new things and take on new challenges. With hard work and dedication, you can achieve great things.


What are the benefits of exercise?

Exercise is a great way to improve your physical health, reduce stress, and improve your mood. It can also help you lose weight, improve your sl

Observed Memory Results:


LoRA	     4.18 GB
QLoRA      5.75 GB

Although QLoRA is designed to reduce memory usage, in this experiment the 1.1B model was already small enough to fit efficiently in FP16. The additional quantization metadata and optimizer overhead slightly increased memory usage in QLoRA.

This suggests that QLoRA provides greater benefits for larger models rather than small 1B-scale models.