# 03 - Direct Preference Optimization (DPO) Training

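**Two trials:**
- Trial 1: Conservative (beta=0.1)
- Trial 2: Aggressive (beta=0.5)

Note: in DPO, a larger `beta` penalizes divergence from the reference model more strongly, so `beta=0.5` on its own keeps the policy *closer* to the reference than `beta=0.1`. Read the labels as names for the two configurations rather than as a statement about `beta` alone (Trial 2 also trains for more epochs, at a lower learning rate).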

**Output:** JSON result files for each trial

In [None]:
!pip install -q datasets transformers peft trl bitsandbytes accelerate

In [None]:
import gc
import inspect
import json
import os
import time
from datetime import datetime

import torch
from datasets import load_from_disk
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import DPOConfig, DPOTrainer

# Path to the SFT checkpoint that both DPO trials load as their starting point.
# Update based on evaluation: switch to sft_trial2 if that run scored better.
BEST_SFT_MODEL = "./outputs/sft_trial1/final"

# Single seed reused for torch, the train/val split and both DPOConfig objects.
SEED = 42
torch.manual_seed(SEED)

# Folder that receives the per-trial JSON result files.
os.makedirs('results', exist_ok=True)

## 1. Load DPO Dataset

In [None]:
# Read the preference dataset from its on-disk location.
dpo_dataset = load_from_disk('data/dpo_dataset')
print(f"DPO samples: {len(dpo_dataset)}")

# Hold out 10% of the samples for evaluation; the split is seeded so that
# both trials train and evaluate on the same partitions.
dpo_split = dpo_dataset.train_test_split(seed=SEED, test_size=0.1)
print(f"Train: {len(dpo_split['train'])}, Val: {len(dpo_split['test'])}")

## 2. Load Tokenizer and Quantization Config

In [None]:
# Base checkpoint; the trial cells load it and attach the SFT adapter on top.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Pad on the left and reuse the EOS token as the padding token.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Quantization settings shared by both trials: 4-bit NF4 weights with
# double quantization and float16 compute.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

## 3. Trial 1: Conservative DPO (beta=0.1)

In [None]:
# Trial 1 hyperparameters. Kept as a plain dict so the same values can be fed
# to DPOConfig and written verbatim into the results JSON.
DPO_TRIAL1_CONFIG = dict(
    beta=0.1,
    num_train_epochs=2,
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_length=512,
    max_prompt_length=256,
)

# NOTE(review): this dict is only recorded in the results JSON; nothing in this
# notebook passes it to the model or trainer. The adapter actually trained is
# the one loaded from BEST_SFT_MODEL below — confirm the values match it.
LORA_T1_CONFIG = dict(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
)

# Quantized base model -> k-bit training preparation -> SFT adapter attached
# in trainable mode.
model_t1 = PeftModel.from_pretrained(
    prepare_model_for_kbit_training(
        AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=bnb_config,
            device_map="auto",
        )
    ),
    BEST_SFT_MODEL,
    is_trainable=True,
)
print("Loaded SFT model for DPO Trial 1")

In [None]:
# DPO Training config
# Trial-specific values are read from DPO_TRIAL1_CONFIG, which is also what the
# results JSON records; the remaining arguments are shared with trial 2.
dpo_config_t1 = DPOConfig(
    output_dir="./outputs/dpo_trial1",
    beta=DPO_TRIAL1_CONFIG["beta"],
    num_train_epochs=DPO_TRIAL1_CONFIG["num_train_epochs"],
    per_device_train_batch_size=DPO_TRIAL1_CONFIG["per_device_train_batch_size"],
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=DPO_TRIAL1_CONFIG["gradient_accumulation_steps"],
    learning_rate=DPO_TRIAL1_CONFIG["learning_rate"],
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_steps=25,
    save_steps=100,
    eval_strategy="steps",
    eval_steps=100,
    fp16=True,
    report_to="none",
    seed=SEED,
    max_length=DPO_TRIAL1_CONFIG["max_length"],
    max_prompt_length=DPO_TRIAL1_CONFIG["max_prompt_length"],
)

# TRL renamed DPOTrainer's `tokenizer` argument to `processing_class`. The
# package install in this notebook is unpinned, so detect which keyword the
# installed release accepts instead of hard-coding one of them.
_dpo_init_params_t1 = inspect.signature(DPOTrainer.__init__).parameters
_tokenizer_kwarg_t1 = (
    "processing_class" if "processing_class" in _dpo_init_params_t1 else "tokenizer"
)

# NOTE(review): ref_model=None on a PEFT model — per the TRL docs the reference
# log-probs then come from the model with its adapter disabled, i.e. the base
# checkpoint rather than the SFT adapter. Confirm that is the intended reference.
trainer_t1 = DPOTrainer(
    model=model_t1,
    ref_model=None,
    args=dpo_config_t1,
    train_dataset=dpo_split["train"],
    eval_dataset=dpo_split["test"],
    **{_tokenizer_kwarg_t1: tokenizer},
)

In [None]:
# Train trial 1, save the resulting adapter, then write a JSON summary of the run.
print("Starting DPO Trial 1...")
start_time = time.time()
trainer_t1.train()
training_time_t1 = time.time() - start_time

trainer_t1.save_model("./outputs/dpo_trial1/final")

final_metrics_t1 = trainer_t1.state.log_history

# Most recent logged value of each metric, or None when the run never logged it
# (e.g. it finished before the first logging/eval step). Using next() with a
# default avoids an IndexError here, after the expensive training has finished.
final_train_loss_t1 = next(
    (entry["loss"] for entry in reversed(final_metrics_t1)
     if "loss" in entry and "eval_loss" not in entry),
    None,
)
final_eval_loss_t1 = next(
    (entry["eval_loss"] for entry in reversed(final_metrics_t1) if "eval_loss" in entry),
    None,
)

dpo_trial1_results = {
    "trial_name": "dpo_trial1",
    "timestamp": datetime.now().isoformat(),
    "base_sft_model": BEST_SFT_MODEL,
    "dataset": "argilla/distilabel-intel-orca-dpo-pairs",
    "dataset_size": len(dpo_split["train"]),
    "dpo_config": DPO_TRIAL1_CONFIG,
    # NOTE(review): LORA_T1_CONFIG is not passed to the model or trainer in this
    # notebook; verify it describes the adapter loaded from BEST_SFT_MODEL.
    "lora_config": LORA_T1_CONFIG,
    "training_time_seconds": training_time_t1,
    "training_time_minutes": training_time_t1 / 60,
    "final_train_loss": final_train_loss_t1,
    "final_eval_loss": final_eval_loss_t1,
    "training_log": final_metrics_t1,
    "output_dir": "./outputs/dpo_trial1/final"
}

with open('results/dpo_trial1_results.json', 'w') as f:
    json.dump(dpo_trial1_results, f, indent=2)
print(f"DPO Trial 1 complete! Saved to results/dpo_trial1_results.json")
print(f"Training time: {training_time_t1/60:.2f} minutes")

## 4. Trial 2: Aggressive DPO (beta=0.5)

In [None]:
# Clear memory
# Drop the trial 1 objects if they are still defined (a plain `del` raises
# NameError when this cell is re-run), collect reference cycles so the tensors
# are really released, then return the cached CUDA blocks to the driver.
for _stale_name in ("model_t1", "trainer_t1"):
    globals().pop(_stale_name, None)
gc.collect()
torch.cuda.empty_cache()

# Trial 2 Configuration
DPO_TRIAL2_CONFIG = {
    "beta": 0.5,
    "num_train_epochs": 3,
    "learning_rate": 1e-5,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 8,
    "max_length": 512,
    "max_prompt_length": 256,
}

# NOTE(review): this dict is only recorded in the results JSON; nothing in this
# notebook passes it to the model or trainer. The adapter actually trained is
# the one loaded from BEST_SFT_MODEL below — confirm the values match it.
LORA_T2_CONFIG = {
    "r": 16,
    "lora_alpha": 32,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
    "lora_dropout": 0.1,
}

# Reload model: fresh quantized base, prepared for k-bit training, with the SFT
# adapter attached in trainable mode (same starting point as trial 1).
model_t2 = AutoModelForCausalLM.from_pretrained(MODEL_NAME, quantization_config=bnb_config, device_map="auto")
model_t2 = prepare_model_for_kbit_training(model_t2)
model_t2 = PeftModel.from_pretrained(model_t2, BEST_SFT_MODEL, is_trainable=True)

In [None]:
# DPO training config for trial 2.
# Trial-specific values are read from DPO_TRIAL2_CONFIG, which is also what the
# results JSON records; the remaining arguments are shared with trial 1.
dpo_config_t2 = DPOConfig(
    output_dir="./outputs/dpo_trial2",
    beta=DPO_TRIAL2_CONFIG["beta"],
    num_train_epochs=DPO_TRIAL2_CONFIG["num_train_epochs"],
    per_device_train_batch_size=DPO_TRIAL2_CONFIG["per_device_train_batch_size"],
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=DPO_TRIAL2_CONFIG["gradient_accumulation_steps"],
    learning_rate=DPO_TRIAL2_CONFIG["learning_rate"],
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_steps=25,
    save_steps=100,
    eval_strategy="steps",
    eval_steps=100,
    fp16=True,
    report_to="none",
    seed=SEED,
    max_length=DPO_TRIAL2_CONFIG["max_length"],
    max_prompt_length=DPO_TRIAL2_CONFIG["max_prompt_length"],
)

# TRL renamed DPOTrainer's `tokenizer` argument to `processing_class`. The
# package install in this notebook is unpinned, so detect which keyword the
# installed release accepts instead of hard-coding one of them.
_dpo_init_params_t2 = inspect.signature(DPOTrainer.__init__).parameters
_tokenizer_kwarg_t2 = (
    "processing_class" if "processing_class" in _dpo_init_params_t2 else "tokenizer"
)

# NOTE(review): ref_model=None on a PEFT model — per the TRL docs the reference
# log-probs then come from the model with its adapter disabled, i.e. the base
# checkpoint rather than the SFT adapter. Confirm that is the intended reference.
trainer_t2 = DPOTrainer(
    model=model_t2,
    ref_model=None,
    args=dpo_config_t2,
    train_dataset=dpo_split["train"],
    eval_dataset=dpo_split["test"],
    **{_tokenizer_kwarg_t2: tokenizer},
)

In [None]:
# Train trial 2, save the resulting adapter, then write a JSON summary of the run.
print("Starting DPO Trial 2...")
start_time = time.time()
trainer_t2.train()
training_time_t2 = time.time() - start_time

trainer_t2.save_model("./outputs/dpo_trial2/final")

final_metrics_t2 = trainer_t2.state.log_history

# Most recent logged value of each metric, or None when the run never logged it
# (e.g. it finished before the first logging/eval step). Using next() with a
# default avoids an IndexError here, after the expensive training has finished.
final_train_loss_t2 = next(
    (entry["loss"] for entry in reversed(final_metrics_t2)
     if "loss" in entry and "eval_loss" not in entry),
    None,
)
final_eval_loss_t2 = next(
    (entry["eval_loss"] for entry in reversed(final_metrics_t2) if "eval_loss" in entry),
    None,
)

dpo_trial2_results = {
    "trial_name": "dpo_trial2",
    "timestamp": datetime.now().isoformat(),
    "base_sft_model": BEST_SFT_MODEL,
    "dataset": "argilla/distilabel-intel-orca-dpo-pairs",
    "dataset_size": len(dpo_split["train"]),
    "dpo_config": DPO_TRIAL2_CONFIG,
    # NOTE(review): LORA_T2_CONFIG is not passed to the model or trainer in this
    # notebook; verify it describes the adapter loaded from BEST_SFT_MODEL.
    "lora_config": LORA_T2_CONFIG,
    "training_time_seconds": training_time_t2,
    "training_time_minutes": training_time_t2 / 60,
    "final_train_loss": final_train_loss_t2,
    "final_eval_loss": final_eval_loss_t2,
    "training_log": final_metrics_t2,
    "output_dir": "./outputs/dpo_trial2/final"
}

with open('results/dpo_trial2_results.json', 'w') as f:
    json.dump(dpo_trial2_results, f, indent=2)
print(f"DPO Trial 2 complete! Saved to results/dpo_trial2_results.json")
print(f"Training time: {training_time_t2/60:.2f} minutes")

## 5. Compare DPO Trials

In [None]:
# Load and compare
# The summaries are re-read from disk so this cell reports exactly what was saved.
with open('results/dpo_trial1_results.json') as f:
    t1 = json.load(f)
with open('results/dpo_trial2_results.json') as f:
    t2 = json.load(f)


def _fmt_metric(value, spec):
    """Render a numeric metric with format `spec`, or 'N/A' when it is missing.

    The training cells store None for a loss that was never logged; applying a
    float format spec to None would raise TypeError and abort the comparison.
    """
    return "N/A" if value is None else format(value, spec)


print("="*60)
print("DPO TRIALS COMPARISON")
print("="*60)
print(f"{'Metric':<25} {'Trial 1':<15} {'Trial 2':<15}")
print("-"*60)
print(f"{'Beta':<25} {t1['dpo_config']['beta']:<15} {t2['dpo_config']['beta']:<15}")
print(f"{'Learning Rate':<25} {t1['dpo_config']['learning_rate']:<15} {t2['dpo_config']['learning_rate']:<15}")
print(f"{'Epochs':<25} {t1['dpo_config']['num_train_epochs']:<15} {t2['dpo_config']['num_train_epochs']:<15}")
print(f"{'Final Train Loss':<25} {_fmt_metric(t1['final_train_loss'], '.4f'):<15} {_fmt_metric(t2['final_train_loss'], '.4f'):<15}")
print(f"{'Final Eval Loss':<25} {_fmt_metric(t1['final_eval_loss'], '.4f'):<15} {_fmt_metric(t2['final_eval_loss'], '.4f'):<15}")
print(f"{'Training Time (min)':<25} {_fmt_metric(t1['training_time_minutes'], '.1f'):<15} {_fmt_metric(t2['training_time_minutes'], '.1f'):<15}")
print("\nBoth models ready for manual evaluation in notebook 04!")