# ORPO (Odds Ratio Preference Optimization) Training Demo

This notebook demonstrates ORPO, a reference-model-free alignment method.

**Key Features:**
- No reference model required
- Memory efficient (no separate frozen reference model has to be kept in memory)
- Combines SFT and preference optimization
- Single-stage training process

In [None]:
# Install required packages
!pip install -q transformers datasets torch accelerate
!pip install -q trl peft bitsandbytes

import inspect

import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import ORPOTrainer, ORPOConfig

In [None]:
# ORPO Configuration
# The checkpoint name is kept outside ORPOConfig: ORPOConfig has no
# `model_name` field, so passing it to the constructor raises
# "TypeError: ... unexpected keyword argument 'model_name'".
MODEL_NAME = "microsoft/DialoGPT-medium"

orpo_config = ORPOConfig(
    learning_rate=8e-6,
    lr_scheduler_type="linear",
    warmup_steps=10,
    max_length=512,
    max_prompt_length=128,
    beta=0.1,  # ORPO penalty coefficient (weight of the odds-ratio loss term)
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    logging_steps=10,
    save_steps=1000,
    output_dir="./orpo-model",
)

# Other cells read the checkpoint via `orpo_config.model_name`, so expose it as
# a plain instance attribute. It is not a config field and is therefore not
# part of the serialized training arguments.
orpo_config.model_name = MODEL_NAME

print("ORPO Configuration:")
print(f"Model: {orpo_config.model_name}")
print(f"Learning rate: {orpo_config.learning_rate}")
print(f"Beta (penalty): {orpo_config.beta}")
print(f"Max length: {orpo_config.max_length}")

In [None]:
# Load the causal LM and its tokenizer from the checkpoint named in the config
checkpoint = orpo_config.model_name
model = AutoModelForCausalLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# If the tokenizer defines no padding token, reuse the end-of-sequence token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Total number of elements across all model parameter tensors
param_count = sum(weights.numel() for weights in model.parameters())

print(f"Model loaded: {checkpoint}")
print(f"Parameters: {param_count:,}")
print(f"Tokenizer vocab size: {len(tokenizer)}")

In [None]:
# Build the preference dataset for ORPO.
# Every record carries three text fields: the prompt, the preferred ("chosen")
# response, and the dispreferred ("rejected") response.
preference_triples = [
    (
        "Explain the importance of renewable energy",
        "Renewable energy is crucial for combating climate change, reducing pollution, and ensuring sustainable development for future generations.",
        "Renewable energy is just expensive and unreliable.",
    ),
    (
        "What are the benefits of exercise?",
        "Regular exercise improves cardiovascular health, strengthens muscles, enhances mental well-being, and increases longevity.",
        "Exercise is just for losing weight.",
    ),
    (
        "Describe the role of education in society",
        "Education empowers individuals, promotes social mobility, drives innovation, and creates informed citizens who contribute to democratic society.",
        "Education is just about memorizing facts.",
    ),
]

preference_data = [
    {"prompt": prompt_text, "chosen": chosen_text, "rejected": rejected_text}
    for prompt_text, chosen_text, rejected_text in preference_triples
]

# Go through a DataFrame to obtain a `datasets.Dataset`
preference_frame = pd.DataFrame(preference_data)
dataset = Dataset.from_pandas(preference_frame)

print(f"Dataset size: {len(dataset)}")
print(f"Sample example: {dataset[0]}")

In [None]:
# Initialize ORPO trainer
# TRL renamed the trainer's `tokenizer` argument to `processing_class` in newer
# releases. Pick whichever name the installed ORPOTrainer accepts so the cell
# runs on both old and new versions.
trainer_params = inspect.signature(ORPOTrainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

trainer = ORPOTrainer(
    model=model,
    args=orpo_config,
    train_dataset=dataset,
    **{tokenizer_kwarg: tokenizer},
)

print("Starting ORPO training...")

# Train the model
trainer.train()

print("ORPO training completed!")

# Test the trained model
test_prompts = [
    "What is the future of technology?",
    "How can we protect the environment?",
    "What makes a good leader?"
]

# Training leaves the model in train mode; switch to eval mode for inference.
model.eval()

print("\nTesting the trained model:")
for prompt in test_prompts:
    # The tokenizer returns CPU tensors, but the trainer may have moved the
    # model to an accelerator. Move the inputs to the model's device to avoid
    # a device-mismatch error in generate().
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            # Generate up to 50 tokens beyond the prompt (same limit as
            # max_length = prompt length + 50).
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The decoded sequence contains the prompt followed by the continuation.
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"\nPrompt: {prompt}")
    print(f"Generated: {generated_text}")