In [2]:
# Install Unsloth
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install xformers trl peft accelerate bitsandbytes

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-grqen75c/unsloth_a2c48d436df34fc69ee83e4999835035
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-grqen75c/unsloth_a2c48d436df34fc69ee83e4999835035
  Resolved https://github.com/unslothai/unsloth.git to commit 1c0ad844f170f67c7cdf6f7a9465bafb0f9627df
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.11.3 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.11.3-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.gi

In [3]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048
dtype = None
load_in_4bit = True

# Load SmolLM2-135M for DPO training
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/SmolLM2-135M-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

print("✅ Model loaded for DPO training!")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.11.2: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/423 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

✅ Model loaded for DPO training!


In [5]:
# Configure LoRA for DPO (using r=16 for efficiency)
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
)

print("✅ LoRA configured for DPO!")
model.print_trainable_parameters()

Unsloth: Already have LoRA adapters! We shall skip this step.


✅ LoRA configured for DPO!
trainable params: 4,884,480 || all params: 139,400,064 || trainable%: 3.5039


In [13]:
from datasets import load_dataset

# Load DPO dataset with preference pairs
dataset = load_dataset("argilla/distilabel-capybara-dpo-7k-binarized", split="train")

# Select subset for training
dataset = dataset.select(range(1000))

print(f"Dataset loaded: {len(dataset)} examples")
print(f"Columns: {dataset.column_names}\n")

print("="*70)
print("EXAMPLE 1")
print("="*70)

example = dataset[0]

print("\nPROMPT:")
print(example['prompt'])

print("\nCHOSEN:")
print(example['chosen'][:400] + "..." if len(example['chosen']) > 400 else example['chosen'])

print("\nREJECTED:")
print(example['rejected'][:400] + "..." if len(example['rejected']) > 400 else example['rejected'])

print("\n" + "="*70)
print("EXAMPLE 2")
print("="*70)

example2 = dataset[1]

print("\nPROMPT:")
print(example2['prompt'])

print("\nCHOSEN:")
print(example2['chosen'][:300] + "..." if len(example2['chosen']) > 300 else example2['chosen'])

print("\nREJECTED:")
print(example2['rejected'][:300] + "..." if len(example2['rejected']) > 300 else example2['rejected'])

Dataset loaded: 1000 examples
Columns: ['source', 'conversation', 'original_response', 'generation_prompt', 'raw_generation_responses', 'new_generations', 'prompt', 'chosen', 'rejected', 'rating_chosen', 'rating_rejected', 'chosen_model', 'rejected_model']

EXAMPLE 1

PROMPT:
<|user|>
The setting is an otherworldly, yet eerily familiar, metropolis known as "Zephyria." It's a city suspended in the ether, floating amidst nebulous clouds of cosmic dust. The architecture here is surreal and alien, with buildings that twist and spiral like strands of DNA, reaching towards the cosmos. Streets are paved with luminescent cobblestones, casting soft hues of blues and purples, reflecting off iridescent structures. Strange vegetation, vibrant and bioluminescent, thrives everywhere, creating a stark contrast against the deep indigo sky.

Now, immerse me in this ethereal journey through Zephyria.</s>
<|assistant|>
As you step onto the teleportation platform, there's a momentary sense of disorientati

In [14]:
from trl import DPOTrainer, DPOConfig

# DPO-specific configuration
training_args = DPOConfig(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    max_steps = 60,
    learning_rate = 5e-5,  # Lower learning rate for DPO
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none",
    beta = 0.1,  # DPO beta parameter (controls strength of preference)
)

# Create DPO trainer
dpo_trainer = DPOTrainer(
    model = model,
    ref_model = None,  # Unsloth handles reference model automatically
    args = training_args,
    train_dataset = dataset,
    tokenizer = tokenizer,
    max_length = max_seq_length,
    max_prompt_length = 1024,
)

print("✅ DPO Trainer configured!")
print(f"📊 Beta parameter: {training_args.beta} (controls preference strength)")

✅ DPO Trainer configured!
📊 Beta parameter: 0.1 (controls preference strength)


In [15]:
# Start DPO training
import time
start_time = time.time()

trainer_stats = dpo_trainer.train()

training_time = time.time() - start_time
print(f"\n⏱️ DPO training completed in {training_time/60:.2f} minutes")

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 4,884,480 of 139,400,064 (3.50% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
1,0.6931,0.0,0.0,0.0,0.0,-692.14563,-761.049805,5.8145,5.6955,0,0,0
2,0.6931,0.0,0.0,0.0,0.0,-761.250488,-840.366638,4.588463,4.920459,No Log,No Log,No Log
3,0.6939,-0.000159,0.001173,0.375,-0.001332,-692.37085,-593.498413,6.393243,5.106608,No Log,No Log,No Log
4,0.6963,-0.000992,0.005168,0.5,-0.00616,-858.845215,-1068.730713,3.773338,5.220665,No Log,No Log,No Log
5,0.6863,0.008544,-0.005322,0.625,0.013866,-879.024536,-1159.073364,4.393763,4.521501,No Log,No Log,No Log
6,0.6894,0.002934,-0.004961,0.625,0.007894,-858.710388,-1445.01062,5.494203,6.138864,No Log,No Log,No Log
7,0.6919,0.000659,-0.002095,0.5,0.002754,-758.801758,-1060.530396,4.710511,4.70378,No Log,No Log,No Log
8,0.6894,-0.002007,-0.009587,0.625,0.007579,-789.600891,-1004.224548,3.841358,3.987925,No Log,No Log,No Log
9,0.6905,-0.005062,-0.010597,0.5,0.005535,-931.38623,-1059.737793,5.166306,6.67407,No Log,No Log,No Log
10,0.6833,-0.002102,-0.022501,0.625,0.020399,-630.47168,-849.630127,5.142216,5.672153,No Log,No Log,No Log



⏱️ DPO training completed in 2.91 minutes


In [16]:
# Enable inference mode
FastLanguageModel.for_inference(model)

# Test prompts
test_prompts = [
    "Explain what is machine learning in simple terms.",
    "Write a polite email declining a job offer.",
    "How do I make a simple web server in Python?",
]

for prompt in test_prompts:
    print(f"\n{'='*60}")
    print(f"📝 Prompt: {prompt}")
    print('='*60)

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=150,
        use_cache=True,
        temperature=0.7,
        top_p=0.9,
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Remove the prompt from output
    response = response[len(prompt):].strip()
    print(f"🤖 Response: {response}")


📝 Prompt: Explain what is machine learning in simple terms.
🤖 Response: Machine learning is a type of artificial intelligence that uses algorithms to learn from data and make predictions or decisions based on that data. It's a way of doing things that's based on the idea that computers can learn and improve over time.

So, in simpler terms, machine learning is a way of using computers to learn from data and make predictions or decisions. It's a way of using computers to learn from data and make predictions or decisions based on that data.

Think of it like a teacher who's teaching a class of students. The teacher is using computers to teach the students, and the computer is using the computer to teach the students. The computer is using the computer to teach the students, and the computer is using the computer

📝 Prompt: Write a polite email declining a job offer.
🤖 Response: Here's a sample response:

Hi [Jill's Name],

I hope you're doing well. I've been thinking about the job offer

In [17]:
# Show how the model prefers better responses
print("="*60)
print("🎯 DPO PREFERENCE DEMONSTRATION")
print("="*60)

test_case = """
The model was trained to prefer:
- Helpful, detailed responses (CHOSEN)
- Over unhelpful, brief responses (REJECTED)

Let's test this preference learning:
"""

print(test_case)

# Test with a tricky prompt
tricky_prompt = "What should I do if my code has a bug?"

inputs = tokenizer(tricky_prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=200, use_cache=True)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"📝 Prompt: {tricky_prompt}")
print(f"\n🤖 DPO-trained response:")
print(response[len(tricky_prompt):].strip())

print("\n✅ The model should give helpful, detailed advice!")
print("✅ DPO training helps align the model with human preferences!")

🎯 DPO PREFERENCE DEMONSTRATION

The model was trained to prefer:
- Helpful, detailed responses (CHOSEN)
- Over unhelpful, brief responses (REJECTED)

Let's test this preference learning:

📝 Prompt: What should I do if my code has a bug?

🤖 DPO-trained response:


✅ The model should give helpful, detailed advice!
✅ DPO training helps align the model with human preferences!


In [18]:
# Save the DPO fine-tuned model
save_dir = "smollm2_dpo_aligned"

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"✅ DPO model saved to: {save_dir}")

# Check size
!du -sh {save_dir}

✅ DPO model saved to: smollm2_dpo_aligned
24M	smollm2_dpo_aligned
