# DPO (Direct Preference Optimization) Training Demo

This notebook demonstrates Direct Preference Optimization (DPO), a simpler alternative to RLHF that fine-tunes a language model directly on pairs of preferred and rejected responses.

**Key Features:**
- Direct optimization without a separate reward model
- Preference-based training
- Stable training process
- Memory efficient

In [None]:
# Install required packages
!pip install -q transformers datasets torch accelerate
!pip install -q trl peft bitsandbytes

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import DPOTrainer
from datasets import Dataset
import pandas as pd

In [None]:
# Load the pretrained causal language model and its matching tokenizer
model_name = "microsoft/DialoGPT-medium"

model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Padding needs a pad token; when the tokenizer defines none, reuse the EOS token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Total number of elements across all parameter tensors of the model
total_params = sum(weight.numel() for weight in model.parameters())

print(f"Model loaded: {model_name}")
print(f"Parameters: {total_params:,}")

In [None]:
# Build the preference dataset.
# Every record carries three text fields:
#   prompt   - the input question
#   chosen   - the preferred response
#   rejected - the response that is not preferred
preference_data = [
    dict(
        prompt="Explain quantum computing",
        chosen="Quantum computing uses quantum mechanical phenomena like superposition and entanglement to perform computations.",
        rejected="Quantum computing is just faster computers.",
    ),
    dict(
        prompt="What is machine learning?",
        chosen="Machine learning is a subset of AI that enables computers to learn patterns from data without explicit programming.",
        rejected="Machine learning is robots thinking.",
    ),
    dict(
        prompt="How do neural networks work?",
        chosen="Neural networks consist of interconnected nodes that process information through weighted connections, mimicking brain structure.",
        rejected="Neural networks are just computer brains.",
    ),
]

# Go through a pandas DataFrame to obtain a `datasets.Dataset`
preference_frame = pd.DataFrame(preference_data)
dataset = Dataset.from_pandas(preference_frame)

print(f"Dataset size: {len(dataset)}")
print(f"Sample: {dataset[0]}")

In [None]:
# Training arguments for the DPO run.
#
# The demo dataset holds only 3 examples. With a per-device batch size of 1 and
# 4 gradient-accumulation steps, one epoch on a single device is a single
# optimizer step, so the scheduler/logging settings are sized for that.

# Enable bf16 only when a CUDA device that supports it is present;
# TrainingArguments rejects bf16=True on hardware without bfloat16 support.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="./dpo-model",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=5e-7,
    # Log every optimizer step; an interval of 10 would never fire in a run this short.
    logging_steps=1,
    save_strategy="no",
    # Evaluation stays disabled (the default). The explicit keyword is omitted
    # because its name differs between transformers releases
    # (`evaluation_strategy` vs. `eval_strategy`).
    # No warmup: a warmup longer than the run would keep the learning rate at 0
    # for the only optimizer step, turning training into a no-op.
    warmup_steps=0,
    bf16=use_bf16,
    # Keep the raw prompt/chosen/rejected columns available to the trainer.
    remove_unused_columns=False,
)

print("Training configuration:")
print(f"Epochs: {training_args.num_train_epochs}")
print(f"Learning rate: {training_args.learning_rate}")
print(f"Batch size: {training_args.per_device_train_batch_size}")

In [None]:
# Initialize DPO trainer
dpo_trainer = DPOTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset,
    beta=0.1,  # DPO temperature parameter
    max_length=512,
    max_prompt_length=128,
)

print("Starting DPO training...")

# Train the model
dpo_trainer.train()

print("DPO training completed!")

# Test the trained model
test_prompt = "What is artificial intelligence?"

# Training may have moved the model to an accelerator, while the tokenizer
# returns CPU tensors; place the inputs on the model's device so generate()
# does not fail with a device mismatch.
inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)

# The model is still in training mode after train(); switch to eval mode so
# dropout is disabled during generation.
model.eval()

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,  # same budget as prompt length + 50 total tokens
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )

# The decoded sequence contains the prompt followed by the sampled continuation
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Prompt: {test_prompt}")
print(f"Generated: {generated_text}")