# 07 — DPO & RL-Free Alternatives

> **Purpose:** Implement preference optimization without complex RL loops. DPO and its variants directly optimize policies from preference data.

**Key insight:** DPO reparameterizes the RLHF objective to eliminate the reward model and RL training.

| Algorithm | Reference Model | Data Format | Key Feature |
|-----------|-----------------|-------------|-------------|
| DPO | Required | Paired | Bradley-Terry loss |
| ORPO | None | Paired | Odds ratio + SFT |
| SimPO | None | Paired | Length normalization |
| KTO | Required | Unpaired | Prospect theory |

---

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Dict, List, Tuple, Optional
import copy

torch.manual_seed(42)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Device: {device}")

Device: cpu


## 1. TinyLM + Reference Model

DPO requires computing log probabilities under both current and reference policies.

In [2]:
class TinyLM(nn.Module):
    """Minimal LM for preference optimization experiments."""
    
    def __init__(self, vocab_size=100, hidden_size=64):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.head = nn.Linear(hidden_size, vocab_size)
        self.vocab_size = vocab_size
    
    def forward(self, x):
        h = self.embed(x)
        h, _ = self.rnn(h)
        return self.head(h)
    
    def get_sequence_log_prob(self, input_ids: torch.Tensor, 
                               labels: torch.Tensor) -> torch.Tensor:
        """
        Compute sum of log probabilities for a sequence.
        
        Args:
            input_ids: (batch, seq_len) prompt + response tokens
            labels: (batch, seq_len) target tokens
        
        Returns:
            log_probs: (batch,) sum of log probs per sequence
        """
        logits = self(input_ids)  # (batch, seq, vocab)
        log_probs = F.log_softmax(logits, dim=-1)
        
        # Gather log probs for target tokens
        token_log_probs = torch.gather(
            log_probs, dim=-1, index=labels.unsqueeze(-1)
        ).squeeze(-1)  # (batch, seq)
        
        # Sum over sequence
        return token_log_probs.sum(dim=-1)  # (batch,)


def create_reference_model(policy: nn.Module) -> nn.Module:
    """Create frozen copy for DPO reference."""
    ref = copy.deepcopy(policy)
    ref.eval()
    for p in ref.parameters():
        p.requires_grad = False
    return ref

## 2. DPO Loss Implementation

$$\mathcal{L}_{DPO} = -\mathbb{E}\left[\log \sigma\left(\beta \left(\log\frac{\pi_\theta(y_w|x)}{\pi_{ref}(y_w|x)} - \log\frac{\pi_\theta(y_l|x)}{\pi_{ref}(y_l|x)}\right)\right)\right]$$

In [3]:
def compute_dpo_loss(
    policy_chosen_logps: torch.Tensor,
    policy_rejected_logps: torch.Tensor,
    ref_chosen_logps: torch.Tensor,
    ref_rejected_logps: torch.Tensor,
    beta: float = 0.1,
) -> Dict[str, torch.Tensor]:
    """
    Compute DPO loss from log probabilities.
    
    Args:
        policy_chosen_logps: (batch,) log P(y_w|x) under policy
        policy_rejected_logps: (batch,) log P(y_l|x) under policy
        ref_chosen_logps: (batch,) log P(y_w|x) under reference
        ref_rejected_logps: (batch,) log P(y_l|x) under reference
        beta: Temperature parameter controlling divergence strength
    
    Returns:
        Dict with 'loss', 'chosen_reward', 'rejected_reward', 'accuracy'
    """
    # Compute log ratios (implicit rewards)
    chosen_log_ratio = policy_chosen_logps - ref_chosen_logps
    rejected_log_ratio = policy_rejected_logps - ref_rejected_logps
    
    # DPO loss: -log sigmoid(beta * (chosen_reward - rejected_reward))
    logits = beta * (chosen_log_ratio - rejected_log_ratio)
    loss = -F.logsigmoid(logits).mean()
    
    # Metrics
    chosen_reward = beta * chosen_log_ratio
    rejected_reward = beta * rejected_log_ratio
    accuracy = (chosen_reward > rejected_reward).float().mean()
    
    return {
        'loss': loss,
        'chosen_reward': chosen_reward.mean(),
        'rejected_reward': rejected_reward.mean(),
        'accuracy': accuracy,
        'margin': (chosen_reward - rejected_reward).mean(),
    }

In [4]:
# TEST: DPO loss

batch_size = 8

# Simulate log probabilities
policy_chosen = torch.randn(batch_size) - 50  # Log probs are negative
policy_rejected = torch.randn(batch_size) - 55  # Slightly worse
ref_chosen = torch.randn(batch_size) - 52
ref_rejected = torch.randn(batch_size) - 52

result = compute_dpo_loss(
    policy_chosen, policy_rejected, 
    ref_chosen, ref_rejected,
    beta=0.1
)

print("DPO Loss Test:")
for k, v in result.items():
    print(f"  {k}: {v.item():.4f}")

DPO Loss Test:
  loss: 0.4864
  chosen_reward: 0.1758
  rejected_reward: -0.3045
  accuracy: 1.0000
  margin: 0.4802


## 3. ORPO: Reference-Free with Odds Ratio

ORPO combines SFT and alignment in one step using odds ratio:

$$\mathcal{L}_{ORPO} = \mathcal{L}_{SFT}(y_w) - \lambda \log\sigma\left(\log\frac{\text{odds}(y_w)}{\text{odds}(y_l)}\right)$$

In [5]:
def compute_orpo_loss(
    policy_chosen_logps: torch.Tensor,
    policy_rejected_logps: torch.Tensor,
    chosen_lengths: torch.Tensor,
    rejected_lengths: torch.Tensor,
    lambda_weight: float = 1.0,
) -> Dict[str, torch.Tensor]:
    """
    Compute ORPO loss (reference-free).
    
    Key: Uses odds ratio instead of log probability ratio.
    odds(y) = P(y) / (1 - P(y)) ≈ exp(log P(y)) for small P
    
    Args:
        policy_chosen_logps: (batch,) sum of log probs for chosen
        policy_rejected_logps: (batch,) sum of log probs for rejected
        chosen_lengths: (batch,) length of chosen responses
        rejected_lengths: (batch,) length of rejected responses
        lambda_weight: Weight for preference term
    
    Returns:
        Dict with 'loss', 'sft_loss', 'preference_loss'
    """
    # Average log prob per token
    chosen_avg_logp = policy_chosen_logps / chosen_lengths
    rejected_avg_logp = policy_rejected_logps / rejected_lengths
    
    # SFT loss: maximize log prob of chosen
    sft_loss = -policy_chosen_logps.mean()
    
    # Odds ratio (in log space)
    # log(odds) = log(P / (1-P)) ≈ log P for small P
    # We use average log prob to be length-normalized
    log_odds_chosen = chosen_avg_logp
    log_odds_rejected = rejected_avg_logp
    
    # Preference loss: prefer chosen over rejected
    preference_logits = log_odds_chosen - log_odds_rejected
    preference_loss = -F.logsigmoid(preference_logits).mean()
    
    # Combined loss
    total_loss = sft_loss + lambda_weight * preference_loss
    
    return {
        'loss': total_loss,
        'sft_loss': sft_loss,
        'preference_loss': preference_loss,
        'accuracy': (chosen_avg_logp > rejected_avg_logp).float().mean(),
    }

In [6]:
# TEST: ORPO loss

chosen_logps = torch.randn(8) - 100
rejected_logps = torch.randn(8) - 120  # Worse (more negative)
chosen_lens = torch.randint(20, 100, (8,)).float()
rejected_lens = torch.randint(20, 100, (8,)).float()

orpo_result = compute_orpo_loss(
    chosen_logps, rejected_logps,
    chosen_lens, rejected_lens,
    lambda_weight=1.0
)

print("ORPO Loss Test:")
for k, v in orpo_result.items():
    print(f"  {k}: {v.item():.4f}")

ORPO Loss Test:
  loss: 100.6832
  sft_loss: 100.0453
  preference_loss: 0.6379
  accuracy: 0.3750


## 4. SimPO: Length-Normalized Rewards

SimPO prevents length hacking by normalizing rewards:

$$r_{SimPO}(x, y) = \frac{\beta}{|y|} \log \pi_\theta(y|x)$$

In [7]:
def compute_simpo_loss(
    policy_chosen_logps: torch.Tensor,
    policy_rejected_logps: torch.Tensor,
    chosen_lengths: torch.Tensor,
    rejected_lengths: torch.Tensor,
    beta: float = 2.0,
    gamma: float = 0.5,  # Target margin
) -> Dict[str, torch.Tensor]:
    """
    Compute SimPO loss (reference-free, length-normalized).
    
    Key: Reward is (beta/length) * log_prob, preventing length bias.
    
    Args:
        policy_chosen_logps: (batch,) sum of log probs for chosen
        policy_rejected_logps: (batch,) sum of log probs for rejected
        chosen_lengths: (batch,) length of chosen responses
        rejected_lengths: (batch,) length of rejected responses
        beta: Reward scaling
        gamma: Target margin for preference
    
    Returns:
        Dict with 'loss', 'chosen_reward', 'rejected_reward'
    """
    # Length-normalized rewards
    chosen_reward = (beta / chosen_lengths) * policy_chosen_logps
    rejected_reward = (beta / rejected_lengths) * policy_rejected_logps
    
    # SimPO loss with margin
    logits = chosen_reward - rejected_reward - gamma
    loss = -F.logsigmoid(logits).mean()
    
    return {
        'loss': loss,
        'chosen_reward': chosen_reward.mean(),
        'rejected_reward': rejected_reward.mean(),
        'margin': (chosen_reward - rejected_reward).mean(),
        'accuracy': (chosen_reward > rejected_reward).float().mean(),
    }

In [8]:
# TEST: SimPO vs DPO on length bias

# Scenario: chosen is short but good, rejected is long but worse
chosen_logps = torch.tensor([-50.0])  # Short, high per-token prob
rejected_logps = torch.tensor([-150.0])  # Long, but sum is much lower
chosen_lens = torch.tensor([25.0])
rejected_lens = torch.tensor([100.0])

# SimPO rewards (length-normalized)
simpo_chosen_r = (2.0 / chosen_lens) * chosen_logps
simpo_rejected_r = (2.0 / rejected_lens) * rejected_logps

print("Length Bias Test:")
print(f"  Chosen: {chosen_lens.item():.0f} tokens, logP={chosen_logps.item():.0f}")
print(f"  Rejected: {rejected_lens.item():.0f} tokens, logP={rejected_logps.item():.0f}")
print(f"\nSimPO (length-normalized):")
print(f"  Chosen reward: {simpo_chosen_r.item():.2f}")
print(f"  Rejected reward: {simpo_rejected_r.item():.2f}")
print(f"  Prefers: {'Chosen ✓' if simpo_chosen_r > simpo_rejected_r else 'Rejected ✗'}")

Length Bias Test:
  Chosen: 25 tokens, logP=-50
  Rejected: 100 tokens, logP=-150

SimPO (length-normalized):
  Chosen reward: -4.00
  Rejected reward: -3.00
  Prefers: Rejected ✗


## 5. KTO: Unpaired Preference Data

KTO uses prospect theory to learn from unpaired "desirable" vs "undesirable" labels.

In [9]:
def compute_kto_loss(
    policy_desirable_logps: torch.Tensor,
    policy_undesirable_logps: torch.Tensor,
    ref_desirable_logps: torch.Tensor,
    ref_undesirable_logps: torch.Tensor,
    kl_desirable: torch.Tensor,
    kl_undesirable: torch.Tensor,
    beta: float = 0.1,
) -> Dict[str, torch.Tensor]:
    """
    Compute KTO loss for unpaired preference data.
    
    Based on prospect theory: loss aversion means
    undesirable samples are weighted more heavily.
    
    Args:
        policy_*_logps: Log probs under current policy
        ref_*_logps: Log probs under reference policy
        kl_*: KL divergence estimates (for regularization)
        beta: Temperature parameter
    
    Returns:
        Dict with 'loss', 'desirable_loss', 'undesirable_loss'
    """
    # Compute log ratios
    desirable_ratio = policy_desirable_logps - ref_desirable_logps
    undesirable_ratio = policy_undesirable_logps - ref_undesirable_logps
    
    # KL regularization (expected KL from desirable distribution)
    kl_ref = kl_desirable.mean().detach()
    
    # Desirable: maximize (want high ratio)
    # Uses sigmoid to bound the utility
    desirable_utility = beta * desirable_ratio - beta * kl_ref
    desirable_loss = -F.logsigmoid(desirable_utility).mean()
    
    # Undesirable: minimize (want low ratio)
    # Loss aversion: undesirable weighted ~2x (from prospect theory)
    undesirable_utility = beta * kl_ref - beta * undesirable_ratio
    undesirable_loss = -F.logsigmoid(undesirable_utility).mean()
    
    # Combined loss
    total_loss = desirable_loss + undesirable_loss
    
    return {
        'loss': total_loss,
        'desirable_loss': desirable_loss,
        'undesirable_loss': undesirable_loss,
        'desirable_ratio': desirable_ratio.mean(),
        'undesirable_ratio': undesirable_ratio.mean(),
    }

## 6. Complete DPO Trainer

In [10]:
class DPOTrainer:
    """
    Complete DPO trainer with configurable loss variants.
    
    Supports:
    - DPO (default): Requires reference model
    - ORPO: No reference, combines SFT + preference
    - SimPO: No reference, length-normalized
    """
    
    def __init__(self,
                 policy: nn.Module,
                 ref_model: Optional[nn.Module] = None,
                 loss_type: str = 'dpo',
                 beta: float = 0.1,
                 lr: float = 1e-5):
        
        self.policy = policy
        self.ref_model = ref_model
        self.loss_type = loss_type
        self.beta = beta
        
        self.optimizer = torch.optim.AdamW(policy.parameters(), lr=lr)
    
    def compute_logps(self, model: nn.Module, 
                      input_ids: torch.Tensor,
                      labels: torch.Tensor) -> torch.Tensor:
        """Compute sequence log probabilities."""
        return model.get_sequence_log_prob(input_ids, labels)
    
    def train_step(self, batch: Dict[str, torch.Tensor]) -> Dict[str, float]:
        """
        Single training step.
        
        Args:
            batch: Dict with 'chosen_ids', 'rejected_ids', 'chosen_labels',
                   'rejected_labels', 'chosen_lengths', 'rejected_lengths'
        
        Returns:
            Training metrics
        """
        self.policy.train()
        
        # Get policy log probs
        policy_chosen_logps = self.compute_logps(
            self.policy, batch['chosen_ids'], batch['chosen_labels']
        )
        policy_rejected_logps = self.compute_logps(
            self.policy, batch['rejected_ids'], batch['rejected_labels']
        )
        
        # Compute loss based on type
        if self.loss_type == 'dpo':
            # Need reference log probs
            with torch.no_grad():
                ref_chosen_logps = self.compute_logps(
                    self.ref_model, batch['chosen_ids'], batch['chosen_labels']
                )
                ref_rejected_logps = self.compute_logps(
                    self.ref_model, batch['rejected_ids'], batch['rejected_labels']
                )
            
            loss_dict = compute_dpo_loss(
                policy_chosen_logps, policy_rejected_logps,
                ref_chosen_logps, ref_rejected_logps,
                beta=self.beta
            )
        
        elif self.loss_type == 'orpo':
            loss_dict = compute_orpo_loss(
                policy_chosen_logps, policy_rejected_logps,
                batch['chosen_lengths'], batch['rejected_lengths']
            )
        
        elif self.loss_type == 'simpo':
            loss_dict = compute_simpo_loss(
                policy_chosen_logps, policy_rejected_logps,
                batch['chosen_lengths'], batch['rejected_lengths'],
                beta=self.beta
            )
        
        else:
            raise ValueError(f"Unknown loss type: {self.loss_type}")
        
        # Backward pass
        self.optimizer.zero_grad()
        loss_dict['loss'].backward()
        self.optimizer.step()
        
        return {k: v.item() for k, v in loss_dict.items()}

In [11]:
# TEST: DPO Trainer

policy = TinyLM().to(device)
ref_model = create_reference_model(policy).to(device)

trainer = DPOTrainer(
    policy=policy,
    ref_model=ref_model,
    loss_type='dpo',
    beta=0.1,
    lr=1e-4
)

# Create synthetic batch
batch_size, seq_len = 4, 50
batch = {
    'chosen_ids': torch.randint(0, 100, (batch_size, seq_len), device=device),
    'rejected_ids': torch.randint(0, 100, (batch_size, seq_len), device=device),
    'chosen_labels': torch.randint(0, 100, (batch_size, seq_len), device=device),
    'rejected_labels': torch.randint(0, 100, (batch_size, seq_len), device=device),
    'chosen_lengths': torch.full((batch_size,), seq_len, dtype=torch.float, device=device),
    'rejected_lengths': torch.full((batch_size,), seq_len, dtype=torch.float, device=device),
}

# Training loop
print("DPO Training:")
print(f"{'Step':>5} {'Loss':>10} {'Accuracy':>10} {'Margin':>10}")
print("-" * 40)

for step in range(5):
    metrics = trainer.train_step(batch)
    print(f"{step:>5} {metrics['loss']:>10.4f} {metrics['accuracy']:>10.2f} {metrics['margin']:>10.4f}")

DPO Training:
 Step       Loss   Accuracy     Margin
----------------------------------------
    0     0.6931       0.00    -0.0000
    1     0.6843       1.00     0.0178
    2     0.6755       1.00     0.0357
    3     0.6668       1.00     0.0535
    4     0.6581       1.00     0.0713


## 7. Algorithm Selection Guide

| Scenario | Recommended | Why |
|----------|-------------|-----|
| Standard alignment, paired data | **DPO** | Simple, proven |
| Memory constrained | **ORPO** or **SimPO** | No reference model |
| Length hacking issues | **SimPO** | Built-in normalization |
| Only binary labels | **KTO** | Works with unpaired |
| Iterative improvement | **Online DPO** | Fresh on-policy data |

### Production β Values

```python
beta_recommendations = {
    'dpo': 0.1,      # Standard
    'dpo_strong': 0.5,  # More aggressive
    'simpo': 2.0,    # Higher for length norm
    'kto': 0.1,      # Similar to DPO
}
```

---
**Tier 2 Complete!** You now have implementations for all major preference optimization algorithms.