# QLoRA Fine-tuning (Gemma) on fine_tune_data.json

This basic notebook fine-tunes a small Gemma Instruct model with QLoRA on pairs from `fine_tuning_data/fine_tune_data.json`.

- Model: adjustable (defaults to `google/gemma-2-2b-it`).
- Method: QLoRA (4-bit quantization via bitsandbytes + PEFT LoRA) using TRL's SFTTrainer.
- Data: list of objects with keys `llm_message` (input/prompt) and `user_message` (target/response).

Run top-to-bottom on a GPU runtime (e.g., Runpod).


In [1]:
pip install -q transformers datasets accelerate peft trl bitsandbytes einops hf_transfer datasets evaluate scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset_builder
ds_builder = load_dataset_builder("MJ141592/message_style_data")
ds_builder.info


README.md:   0%|          | 0.00/492 [00:00<?, ?B/s]

DatasetInfo(description='', citation='', homepage='', license='', features={'llm_message': Value('string'), 'user_message': Value('string')}, post_processed=None, supervised_keys=None, builder_name='parquet', dataset_name='message_style_data', config_name='default', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=3310, num_examples=18, shard_lengths=None, dataset_name=None), 'val': SplitInfo(name='val', num_bytes=367, num_examples=2, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=551, num_examples=3, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=9233, post_processing_size=None, dataset_size=4228, size_in_bytes=None)

In [6]:
from transformers import AutoTokenizer
from datasets import load_dataset

ds = load_dataset("MJ141592/message_style_data")

tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-270m-it")

def tokenize_function(example):
    # Construct Gemma chat format
    user_message = example["llm_message"]
    model_message = example["user_message"]

    text = (
        f"<start_of_turn>user\n{user_message}\n<end_of_turn>\n"
        f"<start_of_turn>model\n{model_message}\n<end_of_turn>\n"
    )

    # Tokenize the full sequence
    tokenized = tokenizer(
        text,
        truncation=True,
        max_length=512,
        padding="max_length",
    )

    # Now mask out everything *before* the model’s message
    # Find where the model turn starts in tokenized IDs
    model_start = text.find("<start_of_turn>model")
    model_start_tokens = len(
        tokenizer(text[:model_start], truncation=True, max_length=512)["input_ids"]
    )

    labels = tokenized["input_ids"].copy()
    labels[:model_start_tokens] = [-100] * model_start_tokens  # ignore loss before model turn
    tokenized["labels"] = labels

    return tokenized

tokenized_dataset = ds.map(tokenize_function, batched=False)

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [14]:
example = tokenized_dataset["train"][0]
decoded = tokenizer.decode(example["input_ids"])
print(decoded)
print(example["labels"][:50])

<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

In [20]:
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("google/gemma-3-270m-it")

training_args = SFTConfig(
    output_dir="./gemma-finetuned",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    max_steps=50,
    logging_steps=10,
    bf16=True,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    args=training_args,
)
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 2, 'pad_token_id': 0}.


Step,Training Loss
10,3.3364
20,0.21
30,0.0924
40,0.0329
50,0.0237


TrainOutput(global_step=50, training_loss=0.7390912637114525, metrics={'train_runtime': 39.0842, 'train_samples_per_second': 10.234, 'train_steps_per_second': 1.279, 'total_flos': 93693262430208.0, 'train_loss': 0.7390912637114525, 'epoch': 16.88888888888889})

In [22]:
trainer.save_model("./gemma-finetuned")
tokenizer.save_pretrained("./gemma-finetuned")

('./gemma-finetuned/tokenizer_config.json',
 './gemma-finetuned/special_tokens_map.json',
 './gemma-finetuned/chat_template.jinja',
 './gemma-finetuned/tokenizer.json')

In [23]:
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="./gemma-finetuned",
    tokenizer=tokenizer,
    device=0
)

prompt = "<start_of_turn>user\nHey, how are you?\n<end_of_turn>\n<start_of_turn>model\n"
response = pipe(prompt, max_new_tokens=100)[0]["generated_text"]
print(response)

Device set to use cuda:0


<start_of_turn>user
Hey, how are you?
<end_of_turn>
<start_of_turn>model
Hey there! I'm doing well, thanks :)



In [25]:
model.push_to_hub("MJ141592/first_ft_try_on_message_style")
tokenizer.push_to_hub("MJ141592/first_ft_try_on_message_style")

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/MJ141592/first_ft_try_on_message_style/commit/ec110bd8c03fd81ba27e9fed72e7d270367cd3be', commit_message='Upload Gemma3ForCausalLM', commit_description='', oid='ec110bd8c03fd81ba27e9fed72e7d270367cd3be', pr_url=None, repo_url=RepoUrl('https://huggingface.co/MJ141592/first_ft_try_on_message_style', endpoint='https://huggingface.co', repo_type='model', repo_id='MJ141592/first_ft_try_on_message_style'), pr_revision=None, pr_num=None)

In [29]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import torch

# Load the fine-tuned model and tokenizer
model_path = "./gemma-finetuned"  # or "./gemma-finetuned-merged" if you merged a LoRA
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda")

# Evaluate on the test set
test_set = dataset["test"]

# Define an evaluation function
def evaluate_model(model, tokenizer, test_data, max_new_tokens=50, num_samples=100):
    model.eval()
    results = []

    # Sample a subset to keep things fast
    samples = test_data.select(range(min(num_samples, len(test_data))))

    for sample in tqdm(samples, desc="Evaluating"):
        prompt = sample["llm_message"]
        target = sample["user_message"]

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)

        results.append({
            "prompt": prompt,
            "target": target,
            "generated": generated
        })

    return results

# Run the evaluation
results = evaluate_model(model, tokenizer, test_set, num_samples=100)

# Display a few examples
for r in results[:5]:
    print("🗣️ Prompt:", r["prompt"])
    print("🎯 Target:", r["target"])
    print("🤖 Generated:", r["generated"])
    print("-" * 80)

Evaluating: 100%|██████████| 3/3 [00:02<00:00,  1.12it/s]

🗣️ Prompt: I'll fill your bin with bricks back
🎯 Target: I'll fill your bin with bricks
🤖 Generated: I'll fill your bin with bricks back in my bag (or make one of my own if you want to keep it tidy)

--------------------------------------------------------------------------------
🗣️ Prompt: I could call you in about 10 minutes if you're around
🎯 Target: Calling in maybe 10 minutes would be good if you're free
🤖 Generated: I could call you in about 10 minutes if you're around, but in a few minutes would be best for me to focus on some kind of urgent task right now. Happy you're free!

--------------------------------------------------------------------------------
🗣️ Prompt: You've overlooked a section on my diagram that says basically the same thing except for those two crucial lines
🎯 Target: You've missed the part on my diagram saying approximately the same except for a crucial two lines:
🤖 Generated: You've overlooked a section on my diagram that says basically the same thing except




In [27]:
my_model = AutoModelForCausalLM.from_pretrained("MJ141592/first_ft_try_on_message_style")

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.07G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

In [None]:
from huggingface_hub import login

login(token="put token here")

In [None]:
# CUDA_LAUNCH_BLOCKING=1

In [None]:
from dataclasses import dataclass
from typing import Optional
from pathlib import Path
import json

import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, TaskType

@dataclass
class TrainCfg:
    base_model: str = "google/gemma-3-270m-it"  # small instruct model; adjust for VRAM
    out_dir: str = "./outputs/gemma-qlora"
    bf16: bool = False
    per_device_train_batch_size: int = 2
    gradient_accumulation_steps: int = 4
    max_steps: int = 10
    learning_rate: float = 1e-4
    warmup_ratio: float = 0.03
    logging_steps: int = 10
    max_seq_len: int = 1024

cfg = TrainCfg()

# 4-bit quantization
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if cfg.bf16 else torch.float16,
    bnb_4bit_use_double_quant=True,
)

device_map = "auto"

tokenizer = AutoTokenizer.from_pretrained(cfg.base_model, use_fast=True)

model = AutoModelForCausalLM.from_pretrained(
    cfg.base_model,
    device_map=device_map,
    quantization_config=bnb_cfg,
    trust_remote_code=True,
)

# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = ""  # optional, forces CPU

# device_map = {"": "cpu"}
# cfg.bf16 = False  # CPUs don’t support bf16

# model = AutoModelForCausalLM.from_pretrained(
#     cfg.base_model,
#     device_map=device_map,
#     dtype=torch.float32,   # CPU-friendly
#     trust_remote_code=True,
# )

peft_cfg = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)



In [None]:
# Load fine_tune_data.json
data_path = Path("fine_tune_data.json")
with open(data_path, "r") as f:
    pairs = json.load(f)

# Expect list of {"llm_message": str, "user_message": str}
assert isinstance(pairs, list) and all("llm_message" in r and "user_message" in r for r in pairs)

# Format into supervised text field (prompt + target)
BOS = tokenizer.bos_token
EOS = tokenizer.eos_token
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

def format_row(r):
    prompt = r["llm_message"].strip()
    target = r["user_message"].strip()
    return {"text": f"{BOS}{prompt}\n{target}{EOS}"}

train_ds = Dataset.from_list([format_row(r) for r in pairs])
print(train_ds[0]["text"][:200])


In [None]:
training_args = SFTConfig(
    output_dir=cfg.out_dir,
    bf16=cfg.bf16,
    per_device_train_batch_size=cfg.per_device_train_batch_size,
    gradient_accumulation_steps=cfg.gradient_accumulation_steps,
    max_steps=cfg.max_steps,
    learning_rate=cfg.learning_rate,
    warmup_ratio=cfg.warmup_ratio,
    logging_steps=cfg.logging_steps,
    save_steps=0,
    dataset_text_field="text",
    packing=True,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_ds,
    peft_config=peft_cfg,
    args=training_args,
)
trainer.train()


In [None]:
# Save adapter and tokenizer
Path(cfg.out_dir).mkdir(parents=True, exist_ok=True)
trainer.model.save_pretrained(cfg.out_dir)
tokenizer.save_pretrained(cfg.out_dir)
print(f"Saved adapter to {cfg.out_dir}")

# Quick inference: merge adapter on the fly
from peft import PeftModel
base = AutoModelForCausalLM.from_pretrained(
    cfg.base_model,
    device_map=device_map,
    quantization_config=bnb_cfg,
    trust_remote_code=True,
)
adapted = PeftModel.from_pretrained(base, cfg.out_dir)
adapted.eval()

prompt = "hey how are you?"
inputs = tokenizer(prompt, return_tensors="pt").to(adapted.device)
with torch.no_grad():
    out = adapted.generate(**inputs, max_new_tokens=40, do_sample=True, temperature=0.7)
print(tokenizer.decode(out[0], skip_special_tokens=True))
