# DPO vs SFT Interactive Comparison
## Ranking Adherence in 4-bit Quantized Movie Recommendations

**Thesis**: "Does DPO provide better ranking adherence than SFT in 4-bit quantized recommendation scenarios?"

This notebook provides interactive exploration of both models.

## Setup & Imports

In [None]:
import torch
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
from collections import defaultdict

# Report the runtime environment so results can be traced to a torch/CUDA setup.
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

# Configuration
if torch.cuda.is_available():
    DEVICE = "cuda:0"
    # Only select a CUDA device when one exists; calling this on a CPU-only
    # machine raises instead of falling back to the CPU path below.
    torch.cuda.set_device(0)
else:
    DEVICE = "cpu"
MAX_SEQ_LENGTH = 2048

print(f"\nUsing device: {DEVICE}")

## Step 1: Load Dataset

In [None]:
# Read the contrastive recommendation pairs from the local JSONL file
dataset = load_dataset('json', data_files='contrastive_rec_train.jsonl', split='train')
total_examples = len(dataset)
print(f"Total dataset size: {total_examples} examples")

section_rule = "=" * 70

# Show the first record, field by field
sample = dataset[0]
print("\n" + section_rule)
print("SAMPLE TRAINING EXAMPLE")
print(section_rule)
for field_label, field_key in (
    ("Instruction", "instruction"),
    ("Input", "input"),
    ("Output (Ground Truth)", "output"),
):
    print(f"\n{field_label}:\n{sample[field_key]}")

# Count how often each option string appears in the reference outputs
print("\n" + section_rule)
print("DATASET ANALYSIS")
print(section_rule)

option_a_count = sum("Option A" in record['output'] for record in dataset)
option_b_count = sum("Option B" in record['output'] for record in dataset)

share_a = option_a_count / total_examples * 100
share_b = option_b_count / total_examples * 100

print("\nPreference Distribution:")
print(f"  Option A preferred: {option_a_count} ({share_a:.1f}%)")
print(f"  Option B preferred: {option_b_count} ({share_b:.1f}%)")
print("\nPerfect for DPO: Clear preference signal with contrastive pairs!")

## Step 2: Load SFT Baseline Model

In [None]:
from unsloth import FastLanguageModel

print("Loading SFT Baseline Model (llama3.2-lora-final)...")

# Bind both names up front so later cells can safely test `if sft_model:`
# (and never hit an unbound `sft_tokenizer`) when loading fails.
sft_model, sft_tokenizer = None, None

try:
    sft_model, sft_tokenizer = FastLanguageModel.from_pretrained(
        model_name="llama3.2-lora-final",
        max_seq_length=MAX_SEQ_LENGTH,
        dtype=None,
        load_in_4bit=True,
    )
    # Put the baseline into Unsloth's inference mode, exactly as the DPO model
    # is, so both models are compared under the same generation path.
    # Called for its side effect only; the return value is deliberately ignored.
    FastLanguageModel.for_inference(sft_model)
    print("✓ SFT model loaded successfully")
except Exception as e:
    # Best-effort: the notebook keeps running and downstream cells skip SFT.
    print(f"✗ Could not load SFT model: {e}")
    sft_model, sft_tokenizer = None, None

if torch.cuda.is_available():
    print(f"Memory used: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")

## Step 3: Load DPO Model (After Training)

In [None]:
print("Loading DPO Optimized Model (llama3_dpo_4bit_final)...")
print("Note: This model needs to be trained first using train_dpo.py")

# Bind both names up front so later cells can safely test `if dpo_model:`
# (and never hit an unbound `dpo_tokenizer`) when loading fails.
dpo_model, dpo_tokenizer = None, None

try:
    dpo_model, dpo_tokenizer = FastLanguageModel.from_pretrained(
        model_name="llama3_dpo_4bit_final",
        max_seq_length=MAX_SEQ_LENGTH,
        dtype=None,
        load_in_4bit=True,
    )
    print("✓ DPO model loaded successfully")
    # Enable Unsloth's inference mode for its side effect only. Do NOT rebind
    # `dpo_model` to the return value: the documented usage ignores it, and if
    # the installed release returns None the model would silently be treated
    # as "not loaded" by every later cell. TODO confirm against installed version.
    FastLanguageModel.for_inference(dpo_model)
except Exception as e:
    # Best-effort: the notebook keeps running and downstream cells skip DPO.
    print(f"✗ Could not load DPO model: {e}")
    print("   Run: python train_dpo.py")
    dpo_model, dpo_tokenizer = None, None

if torch.cuda.is_available():
    print(f"Memory used: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")

## Step 4: Interactive Model Comparison

In [None]:
def run_inference(model, tokenizer, sample):
    """
    Run greedy generation for one sample and extract the stated preference.

    Args:
        model: Object exposing ``generate(**inputs, ...)``; its ``device``
            attribute is used when present, otherwise the notebook-level
            ``DEVICE`` is used.
        tokenizer: Callable that tokenizes the prompt (``return_tensors="pt"``)
            and provides ``decode``.
        sample: Mapping with ``'instruction'`` and ``'input'`` entries.

    Returns:
        ``(response_text, preference)`` where ``preference`` is ``"A"``,
        ``"B"``, or ``"Unknown"``. If anything raises, returns
        ``("Error: <message>", "Error")`` so a batch loop can continue.
    """
    instruction = sample['instruction']
    input_text = sample['input']
    
    prompt = f"""### Instruction:
{instruction}

### Input:
{input_text}

### Response:"""
    
    try:
        # Prefer the model's own device so inputs always land where the weights are.
        device = getattr(model, "device", None)
        if device is None:
            device = DEVICE
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_length = inputs["input_ids"].shape[1]
        
        with torch.no_grad():
            # Greedy decoding for reproducibility. Sampling knobs (temperature,
            # top_p) are intentionally not passed: they are unused when
            # do_sample=False and only produce configuration warnings.
            outputs = model.generate(
                **inputs,
                max_new_tokens=150,
                do_sample=False,
                use_cache=True,
            )
        
        # Decode only the newly generated tokens. Splitting the full decoded
        # text on "### Response:" would discard the real answer whenever the
        # model emits that marker again inside its continuation.
        generated_ids = outputs[0][prompt_length:]
        response_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
        
        # Extract preference ("Option A" takes precedence when both appear,
        # matching how the ground-truth label is parsed elsewhere in the notebook)
        if "Option A" in response_text:
            preference = "A"
        elif "Option B" in response_text:
            preference = "B"
        else:
            preference = "Unknown"
        
        return response_text, preference
    except Exception as e:
        # Deliberate best-effort: report the failure in-band instead of aborting.
        return f"Error: {e}", "Error"

print("✓ Inference function ready")
print("  Use: response, pref = run_inference(model, tokenizer, sample)")

## Step 5: Compare Models on Sample

In [None]:
# Choose which dataset row to inspect
sample_idx = 0
sample = dataset[sample_idx]

heavy_rule = '=' * 70
light_rule = '-' * 70

print(f"\n{heavy_rule}")
print(f"SAMPLE #{sample_idx}: Side-by-side Comparison")
print(heavy_rule)

print("\nUser History + Movie Options:")
print(sample['input'])

# Reference answer: labelled "A" when it mentions Option A, otherwise "B"
ground_truth = sample['output']
expected_pref = "B"
if "Option A" in ground_truth:
    expected_pref = "A"
print(f"\n{'Ground Truth (Expected):':<25} {ground_truth}")

# SFT baseline (skipped when the model did not load)
if not sft_model:
    print("\nSFT model not loaded")
else:
    print("\n" + light_rule)
    print("SFT Model Response:")
    print(light_rule)
    sft_response, sft_pref = run_inference(sft_model, sft_tokenizer, sample)
    print(f"Response: {sft_response}")
    sft_mark = '✓' if sft_pref == expected_pref else '✗'
    print(f"\nPreference: {sft_pref} | Correct: {sft_mark}")

# DPO model (skipped when the model did not load)
if not dpo_model:
    print("\nDPO model not loaded. Train it first: python train_dpo.py")
else:
    print("\n" + light_rule)
    print("DPO Model Response:")
    print(light_rule)
    dpo_response, dpo_pref = run_inference(dpo_model, dpo_tokenizer, sample)
    print(f"Response: {dpo_response}")
    dpo_mark = '✓' if dpo_pref == expected_pref else '✗'
    print(f"\nPreference: {dpo_pref} | Correct: {dpo_mark}")

## Step 6: Batch Evaluation (Optional - Small Sample)
⚠️ **Note**: Even this small sample can take a while, because every example is generated by each loaded model. For a full evaluation, run `evaluate_dpo.py` instead.

In [None]:
# Evaluate on small subset for quick results
test_size = 10  # Change to larger number for full evaluation
test_samples = dataset.select(range(min(test_size, len(dataset))))

# Per-model tallies; 'preferences' keeps one correctness flag per evaluated sample.
results = {
    'SFT': {'correct': 0, 'total': 0, 'preferences': []},
    'DPO': {'correct': 0, 'total': 0, 'preferences': []}
}

# Collect only the models that actually loaded. Tokenizers are referenced
# inside the guards so a failed load never touches an unbound name.
loaded_models = []
if sft_model:
    loaded_models.append(('SFT', sft_model, sft_tokenizer))
if dpo_model:
    loaded_models.append(('DPO', dpo_model, dpo_tokenizer))

print(f"\nEvaluating on {len(test_samples)} samples...\n")

# A dedicated loop variable is used so the `sample` chosen in the
# side-by-side comparison cell is not silently overwritten.
for eval_sample in tqdm(test_samples, desc="Evaluation"):
    expected_pref = "A" if "Option A" in eval_sample['output'] else "B"
    
    for eval_name, eval_model, eval_tokenizer in loaded_models:
        _, predicted_pref = run_inference(eval_model, eval_tokenizer, eval_sample)
        is_correct = predicted_pref == expected_pref
        tally = results[eval_name]
        tally['total'] += 1
        tally['correct'] += int(is_correct)
        tally['preferences'].append(is_correct)

# Display results
print("\n" + "="*70)
print("EVALUATION RESULTS")
print("="*70)

for model_name, result in results.items():
    if result['total'] > 0:
        accuracy = (result['correct'] / result['total']) * 100
        print(f"\n{model_name}:")
        print(f"  Accuracy: {accuracy:.1f}% ({result['correct']}/{result['total']})")

# Comparison (only meaningful when both models were evaluated)
if results['SFT']['total'] > 0 and results['DPO']['total'] > 0:
    sft_acc = (results['SFT']['correct'] / results['SFT']['total']) * 100
    dpo_acc = (results['DPO']['correct'] / results['DPO']['total']) * 100
    improvement = dpo_acc - sft_acc  # difference in accuracy, DPO minus SFT
    
    print(f"\n{'='*70}")
    print(f"DPO Improvement: {improvement:+.1f}%")
    print(f"{'='*70}")
    
    if improvement > 2:
        print("✓ DPO shows meaningful improvement!")
    elif improvement > 0:
        print("◐ DPO shows modest improvement")
    else:
        print("✗ SFT performs better or similar to DPO")

## Step 7: Thesis Visualization

In [None]:
# Create comparison visualization (requires evaluation results for both models)
if results['SFT']['total'] > 0 and results['DPO']['total'] > 0:
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    
    # Accuracy comparison
    sft_acc = (results['SFT']['correct'] / results['SFT']['total']) * 100
    dpo_acc = (results['DPO']['correct'] / results['DPO']['total']) * 100
    
    models = ['SFT Baseline', 'DPO Optimized']
    accuracies = [sft_acc, dpo_acc]
    colors = ['#3498db', '#e74c3c']
    
    axes[0].bar(models, accuracies, color=colors, alpha=0.7, edgecolor='black', linewidth=2)
    axes[0].set_ylabel('Ranking Accuracy (%)', fontsize=12, fontweight='bold')
    axes[0].set_title('DPO vs SFT: Ranking Accuracy', fontsize=13, fontweight='bold')
    axes[0].set_ylim(0, 105)
    
    # Annotate each bar with its value just above the bar top
    for i, acc in enumerate(accuracies):
        axes[0].text(i, acc + 2, f'{acc:.1f}%', ha='center', fontsize=11, fontweight='bold')
    
    # Preference adherence over samples: cumulative accuracy after each sample.
    # The x values are 1-based sample counts so the axis matches its
    # "Sample Number" label (the first point is sample 1, not 0).
    sft_counts = np.arange(1, len(results['SFT']['preferences']) + 1)
    dpo_counts = np.arange(1, len(results['DPO']['preferences']) + 1)
    sft_running = np.cumsum(results['SFT']['preferences']) / sft_counts * 100
    dpo_running = np.cumsum(results['DPO']['preferences']) / dpo_counts * 100
    
    axes[1].plot(sft_counts, sft_running, marker='o', label='SFT', linewidth=2, markersize=6, color='#3498db')
    axes[1].plot(dpo_counts, dpo_running, marker='s', label='DPO', linewidth=2, markersize=6, color='#e74c3c')
    axes[1].set_xlabel('Sample Number', fontsize=12, fontweight='bold')
    axes[1].set_ylabel('Cumulative Accuracy (%)', fontsize=12, fontweight='bold')
    axes[1].set_title('Preference Adherence: Convergence Curve', fontsize=13, fontweight='bold')
    axes[1].legend(fontsize=11)
    axes[1].grid(True, alpha=0.3)
    axes[1].set_ylim(0, 105)
    
    plt.tight_layout()
    plt.savefig('dpo_vs_sft_comparison.png', dpi=150, bbox_inches='tight')
    plt.show()
    
    print("✓ Comparison plot saved as dpo_vs_sft_comparison.png")
else:
    print("Cannot create visualization without both models trained")

## Step 8: Thesis Conclusion

In [None]:
# Summarise the evaluation as a verdict on the thesis question.
print("\n" + "="*70)
print("THESIS HYPOTHESIS TEST")
print("="*70)
print("\nQuestion: Does DPO provide better ranking adherence than SFT")
print("in 4-bit quantized recommendation scenarios?")

if results['SFT']['total'] > 0 and results['DPO']['total'] > 0:
    sft_acc = (results['SFT']['correct'] / results['SFT']['total']) * 100
    dpo_acc = (results['DPO']['correct'] / results['DPO']['total']) * 100
    improvement = dpo_acc - sft_acc  # difference in accuracy, DPO minus SFT
    
    print(f"\nResults:")
    print(f"  SFT Baseline: {sft_acc:.1f}%")
    print(f"  DPO Optimized: {dpo_acc:.1f}%")
    print(f"  Improvement: {improvement:+.1f}%")
    
    print(f"\nConclusion:")
    if improvement > 2:
        print("  ✓ HYPOTHESIS SUPPORTED")
        print(f"    DPO shows {improvement:.1f}% better ranking adherence")
        print("    Preference optimization effective for 4-bit quantization")
    elif improvement > 0:
        print("  ◐ HYPOTHESIS PARTIALLY SUPPORTED")
        print(f"    DPO shows modest {improvement:.1f}% improvement")
        print("    May require larger evaluation set or hyperparameter tuning")
    elif improvement < 0:
        print("  ✗ HYPOTHESIS NOT SUPPORTED")
        print(f"    SFT performs {abs(improvement):.1f}% better than DPO")
        print("    Consider: different beta, more training, larger model")
    else:
        # Exact tie: neither model is better, so do not claim SFT "performs
        # 0.0% better"; the hypothesis is still not supported.
        print("  ✗ HYPOTHESIS NOT SUPPORTED")
        print("    SFT and DPO achieve the same accuracy on this sample")
        print("    Consider: different beta, more training, larger model")
else:
    print("\nWaiting for model evaluation...")
    print("Execute previous cells to generate results")

print("\n" + "="*70)

## Next Steps

1. **Train DPO Model**
   ```bash
   python train_dpo.py
   ```
   Expected: 2-4 hours on RTX 3090

2. **Run Full Evaluation**
   ```bash
   python evaluate_dpo.py
   ```
   Expected: 30-60 minutes for 100-1000 samples

3. **Analyze Results**
   - Compare accuracy metrics
   - Check confidence scores
   - Evaluate reasoning quality

4. **Write Thesis**
   - Summarize findings
   - Discuss implications for quantized LLMs
   - Propose future work