# RLHF Assignment 3: Final Analysis

Comprehensive analysis and visualization of RLHF training results.

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook-wide plotting defaults: seaborn's 'whitegrid' style and a
# default figure size of (12, 6) for every figure that does not override it.
sns.set_style(style='whitegrid')
plt.rc('figure', figsize=(12, 6))

## 1. Load Results

In [None]:
# Directory containing one '<model>_samples.json' file per evaluated model,
# resolved relative to the notebook's working directory.
output_dir = Path('../outputs/full/evaluation')
models = ['reference', 'ppo', 'grpo', 'dpo']

# Maps model name -> parsed JSON content of its samples file. Later cells
# iterate each value as a sequence of per-sample dicts.
results = {}

for model in models:
    samples_path = output_dir / f'{model}_samples.json'
    try:
        # Decode explicitly as UTF-8: the platform default encoding is
        # locale-dependent and can fail on non-ASCII text in the samples.
        with open(samples_path, 'r', encoding='utf-8') as f:
            results[model] = json.load(f)
    except FileNotFoundError:
        # Best-effort load: a model without an evaluation dump is reported
        # and skipped so the remaining models can still be analysed.
        print(f"Warning: {model} not found")

print(f"Loaded {len(results)} models")

## 2. Reward and KL Distributions

In [None]:
# Side-by-side overlaid histograms: per-sample rewards for every loaded model
# on the left, per-sample KL values for every non-reference model on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
reward_ax, kl_ax = axes

for name, model_samples in results.items():
    reward_values = [entry['reward'] for entry in model_samples]
    reward_ax.hist(reward_values, alpha=0.5, label=name, bins=20)

    # The reference model is left out of the KL panel.
    if name == 'reference':
        continue
    kl_values = [entry['kl'] for entry in model_samples]
    kl_ax.hist(kl_values, alpha=0.5, label=name, bins=20)

# Both panels share the same y-label and differ only in x-label and title.
panel_text = (
    (reward_ax, 'Reward', 'Reward Distribution'),
    (kl_ax, 'KL Divergence', 'KL Distribution'),
)
for panel, x_text, title_text in panel_text:
    panel.set_xlabel(x_text)
    panel.set_ylabel('Count')
    panel.set_title(title_text)
    panel.legend()

plt.tight_layout()
plt.show()

## 3. Reward vs. KL Trade-off (Pareto View)

In [None]:
# One point per non-reference model: mean KL on the x-axis against mean
# reward on the y-axis.
plt.figure(figsize=(10, 6))

for name, model_samples in results.items():
    if name == 'reference':
        continue
    reward_values = [entry['reward'] for entry in model_samples]
    kl_values = [entry['kl'] for entry in model_samples]
    mean_kl = np.mean(kl_values)
    mean_reward = np.mean(reward_values)
    plt.scatter(mean_kl, mean_reward, label=name, s=100)

# Decorate the axes the scatter calls drew on.
tradeoff_ax = plt.gca()
tradeoff_ax.set_xlabel('Mean KL Divergence')
tradeoff_ax.set_ylabel('Mean Reward')
tradeoff_ax.set_title('Reward vs KL Trade-off')
tradeoff_ax.legend()
tradeoff_ax.grid(True)
plt.show()

## 4. Summary Statistics

In [None]:
def _summary_row(name, model_samples):
    """Build one summary-table row for a single model.

    Args:
        name: Model name used for the 'Model' column.
        model_samples: Sequence of per-sample dicts; each must have a
            'reward' key, and a missing 'kl' key is counted as 0.

    Returns:
        Dict with the model name, mean and standard deviation of the
        rewards, mean KL, and the number of samples.
    """
    reward_values = [entry['reward'] for entry in model_samples]
    kl_values = [entry.get('kl', 0) for entry in model_samples]
    return {
        'Model': name,
        'Mean Reward': np.mean(reward_values),
        'Std Reward': np.std(reward_values),
        'Mean KL': np.mean(kl_values),
        'Samples': len(model_samples)
    }


# One row per loaded model, printed as a plain-text table without the index.
summary_data = [
    _summary_row(name, model_samples)
    for name, model_samples in results.items()
]

df = pd.DataFrame(summary_data)
print(df.to_string(index=False))

## 5. Sample Outputs

In [None]:
# Print up to three prompts (taken from the reference samples) followed by
# each loaded model's response, truncated to 150 characters, and its reward.
# NOTE(review): index i is assumed to refer to the same prompt in every
# model's sample list -- confirm against the evaluation script.

# The load cell tolerates missing files, so the reference entry may be absent;
# fall back to an empty list instead of raising KeyError.
reference_samples = results.get('reference', [])
if not reference_samples:
    print("Warning: no reference samples loaded; skipping sample outputs")

for i, reference_sample in enumerate(reference_samples[:3]):
    print(f"\n{'='*80}")
    print(f"Example {i+1}")
    print(f"{'='*80}")
    print(f"\nPrompt: {reference_sample['prompt']}\n")
    for model in models:
        if model not in results:
            # Already reported as missing when the results were loaded.
            continue
        model_samples = results[model]
        if i >= len(model_samples):
            # A model with fewer samples than the reference would otherwise
            # raise IndexError; report the gap and move on.
            print(f"{model.upper()}: <no sample at index {i}>\n")
            continue
        sample = model_samples[i]
        print(f"{model.upper()}: {sample['response'][:150]}...")
        print(f"  Reward: {sample['reward']:.4f}\n")