<a href="https://colab.research.google.com/github/samipn/unsloth.ai_demo/blob/main/colab3_rl_dpo_gemma1b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Colab 3 ‚Äî Reinforcement Learning via **DPO** (preferences)
We use a dataset with `chosen`/`rejected` columns (`HuggingFaceH4/ultrafeedback_binarized`).
Model: `unsloth/gemma-3-1b-it-bnb-4bit` (lightweight; runs on free GPUs).

In [1]:
#@title Install Unsloth + deps (Colab-safe)
%pip -q install --upgrade pip
%pip -q install unsloth datasets trl transformers accelerate bitsandbytes peft --no-cache-dir
import torch, platform
print("PyTorch:", torch.__version__, "CUDA:", torch.version.cuda, "Python:", platform.python_version())


[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.8/1.8 MB[0m [31m86.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
pylibcudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.[0m[31m
[0mPyTorch: 2.8.0+cu126 CUDA: 12.6 Python: 3.12.12


In [2]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch

MODEL_NAME = "unsloth/gemma-3-1b-it-bnb-4bit"
max_seq_length = 2048
dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,   # QLoRA
)

# Add LoRA for RL preference tuning
model = FastLanguageModel.get_peft_model(
    model,
    r=16, lora_alpha=16, lora_dropout=0.0,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    use_gradient_checkpointing="unsloth",
    random_state=3407, max_seq_length=max_seq_length,
)


ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.11.2: Fast Gemma3 patching. Transformers: 4.57.1.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.318 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Gemma3 does not support SDPA - switching to fast eager.


model.safetensors:   0%|          | 0.00/965M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

Unsloth: Making `model.base_model.model.model` require gradients


In [3]:
# Load UltraFeedback-binarized (has chosen/rejected as message lists)
from datasets import load_dataset

raw = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split="train_prefs").select(range(5000))

def to_strings(ex):
    # Each of 'chosen' and 'rejected' is a list of chat messages with roles.
    def render(messages):
        return tokenizer.apply_chat_template(messages, tokenize=False)
    return {
        "prompt": render(ex["chosen"][:-1]),  # everything before assistant reply
        "chosen": render(ex["chosen"]),
        "rejected": render(ex["rejected"]),
    }

data = raw.map(to_strings, remove_columns=raw.column_names)
data = data.train_test_split(test_size=0.02, seed=42)
train_prefs, eval_prefs = data["train"], data["test"]
len(train_prefs), len(eval_prefs)


README.md: 0.00B [00:00, ?B/s]

data/train_prefs-00000-of-00001.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

data/test_prefs-00000-of-00001.parquet:   0%|          | 0.00/7.29M [00:00<?, ?B/s]

data/test_sft-00000-of-00001.parquet:   0%|          | 0.00/3.72M [00:00<?, ?B/s]

data/train_gen-00000-of-00001.parquet:   0%|          | 0.00/184M [00:00<?, ?B/s]

data/test_gen-00000-of-00001.parquet:   0%|          | 0.00/3.02M [00:00<?, ?B/s]

Generating train_prefs split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating train_sft split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating test_prefs split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test_sft split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train_gen split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating test_gen split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

(4900, 100)

In [4]:
# DPO training with TRL
from trl import DPOTrainer, DPOConfig

cfg = DPOConfig(
    output_dir="outputs_dpo_gemma1b",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    num_train_epochs=1,
    beta=0.1,
    max_seq_length=1024,
    logging_steps=10,
    save_steps=200,
    bf16=torch.cuda.is_bf16_supported(),
    fp16=not torch.cuda.is_bf16_supported(),
    optim="adamw_8bit",
)

trainer = DPOTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_prefs,
    eval_dataset=eval_prefs,
    args=cfg,
    data_collator=None,
    loss_type="sigmoid",
    precompute_ref_log_probs=True,
    dataset_num_proc=1,
    dataset_text_field=("prompt","chosen","rejected"),
)
trainer.train()
trainer.save_model("gemma1b_dpo_adapter")
tokenizer.save_pretrained("gemma1b_dpo_adapter")


Extracting prompt in train dataset (num_proc=16):   0%|          | 0/4900 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=16):   0%|          | 0/4900 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=16):   0%|          | 0/4900 [00:00<?, ? examples/s]

Extracting prompt in eval dataset (num_proc=16):   0%|          | 0/100 [00:00<?, ? examples/s]

Applying chat template to eval dataset (num_proc=16):   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenizing eval dataset (num_proc=16):   0%|          | 0/100 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,900 | Num Epochs = 1 | Total steps = 613
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 13,045,760 of 1,012,931,712 (1.29% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msamipn[0m ([33msamipn-san-jose-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
10,0.7049,0.011117,0.008036,0.3125,0.003081,-1017.80127,-976.008484,-7.0751,-7.013534,0,0,0
20,0.694,0.047105,0.033364,0.6125,0.013741,-928.074402,-865.93866,-7.580657,-7.549309,No Log,No Log,No Log
30,0.663,0.141583,0.049758,0.5875,0.091825,-1040.950684,-909.232117,-7.412745,-7.476302,No Log,No Log,No Log
40,0.7056,0.261128,0.259237,0.5375,0.00189,-993.98645,-1001.972473,-7.310129,-7.181903,No Log,No Log,No Log
50,0.7051,0.310489,0.299079,0.4375,0.01141,-937.673218,-896.474243,-7.341068,-7.249033,No Log,No Log,No Log
60,0.6704,0.235464,0.135513,0.6375,0.099951,-1041.644531,-945.421204,-7.555921,-7.55239,No Log,No Log,No Log
70,0.662,0.092923,-0.015555,0.625,0.108478,-1011.595581,-990.095215,-7.264603,-7.097335,No Log,No Log,No Log
80,0.6136,-0.229587,-0.472512,0.675,0.242925,-1001.003052,-900.093384,-7.146501,-7.14763,No Log,No Log,No Log
90,0.692,0.028538,-0.08217,0.575,0.110709,-1090.819092,-1004.513855,-7.449244,-7.483138,No Log,No Log,No Log
100,0.621,0.274387,-0.036376,0.5625,0.310763,-1010.246399,-962.769165,-6.97163,-6.730422,No Log,No Log,No Log


('gemma1b_dpo_adapter/tokenizer_config.json',
 'gemma1b_dpo_adapter/special_tokens_map.json',
 'gemma1b_dpo_adapter/chat_template.jinja',
 'gemma1b_dpo_adapter/tokenizer.model',
 'gemma1b_dpo_adapter/added_tokens.json',
 'gemma1b_dpo_adapter/tokenizer.json')

In [5]:
# Quick inference helper
from unsloth import FastLanguageModel
import torch

FastLanguageModel.for_inference(model)  # enables 2x faster kernels (no change to outputs)

def chat(prompt, history=None, max_new_tokens=128):
    if history is None: history = []
    messages = [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=max_new_tokens)
    print(tokenizer.decode(out[0], skip_special_tokens=True))

system_prompt = "You are a helpful assistant."
chat("Say hi in one sentence.")


user
You are a helpful assistant.

Say hi in one sentence.
model
Hello there! How can I help you today?
