In [None]:
# Mount Google Drive at /content/drive so that files under
# /content/drive/MyDrive are reachable from this Colab session.
from google.colab import drive
drive.mount("/content/drive")


Mounted at /content/drive


In [None]:
!pip install trl --q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/423.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m419.8/423.1 kB[0m [31m17.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# DPO Training Round 2 - Fixed for Colab
# Run this AFTER generating Round 2 synthetic preferences

import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset
from trl import DPOTrainer, DPOConfig
from peft import LoraConfig, get_peft_model, PeftModel, TaskType
import json
import os

# Seed the torch and NumPy global RNGs with the same fixed value.
for _seed_fn in (torch.manual_seed, np.random.seed):
    _seed_fn(43)

# Use the GPU when CUDA is available, otherwise the CPU.
_has_cuda = torch.cuda.is_available()
device = torch.device("cuda") if _has_cuda else torch.device("cpu")
print(f"Using device: {device}")
if _has_cuda:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Configuration
class Config:
    """Static settings for DPO Round 2: file locations and hyperparameters.

    All values are plain class attributes; the module-level ``config``
    instance below is what the rest of the notebook reads.
    """

    # --- Model / adapter directories ---
    sft_model_dir: str = "/content/drive/MyDrive/outputs/sft_model"
    dpo_round1_dir: str = "/content/drive/MyDrive/outputs/dpo_round1"
    dpo_round2_dir: str = "/content/drive/MyDrive/outputs/dpo_round2"

    # --- Preference data sources ---
    dataset_name: str = "Anthropic/hh-rlhf"
    synthetic_data_path1: str = "/content/drive/MyDrive/outputs/synthetic_preferences.json"
    synthetic_data_path2: str = "/content/drive/MyDrive/outputs/synthetic_preferences_round2.json"

    # --- Sequence length limits ---
    max_length: int = 512
    max_prompt_length: int = 256

    # --- Optimisation schedule ---
    dpo_epochs: int = 2  # Reduced from 3 - model is already better from Round 1
    batch_size: int = 1
    gradient_accumulation_steps: int = 16
    learning_rate: float = 3e-5  # Lower LR for fine-tuning already trained model

    # --- DPO loss ---
    beta: float = 0.1

    # --- LoRA adapter shape ---
    lora_r: int = 16
    lora_alpha: int = 32
    lora_dropout: float = 0.05


config = Config()

Using device: cuda
GPU: Tesla T4


In [None]:

# Create the Round 2 output directory up front (no error if it already exists).
os.makedirs(config.dpo_round2_dir, exist_ok=True)

# The tokenizer is read from the Round 1 output directory.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(config.dpo_round1_dir)
# Pad on the right (the original author notes DPO requires this) and
# reuse the EOS token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Load data - combine original + BOTH synthetic rounds
print("Loading preference data...")

# Original preferences: at most the first 3000 rows of the train split.
original_dataset = load_dataset(config.dataset_name, split="train")
original_dataset = original_dataset.select(range(min(3000, len(original_dataset))))


def _parse_preference_pair(ex):
    """Turn one raw example into a ``{"prompt", "chosen", "rejected"}`` dict.

    The prompt is the text before the first ``Assistant:`` marker of the
    chosen transcript with ``Human:`` removed; the chosen/rejected responses
    are the text after the last ``Assistant:`` marker of each transcript.

    Returns ``None`` when any of the three pieces is empty after stripping.
    Raises ``KeyError``/``AttributeError``/``TypeError`` when the example does
    not provide string ``chosen`` and ``rejected`` fields.

    NOTE(review): because the prompt uses the *first* marker and the responses
    use the *last* one, a transcript with several assistant turns yields a
    prompt and responses taken from different turns. Left unchanged here so
    the data format stays what this cell has always produced; decide whether
    such rows should be dropped or given their full dialog context.
    """
    prompt = ex['chosen'].split('Assistant:')[0].replace('Human:', '').strip()
    chosen = ex['chosen'].split('Assistant:')[-1].strip()
    rejected = ex['rejected'].split('Assistant:')[-1].strip()
    if prompt and chosen and rejected:
        return {"prompt": prompt, "chosen": chosen, "rejected": rejected}
    return None


original_prefs = []
malformed_count = 0
for ex in original_dataset:
    # Only tolerate malformed rows; a bare `except:` would also swallow
    # KeyboardInterrupt and hide unrelated failures.
    try:
        pair = _parse_preference_pair(ex)
    except (KeyError, AttributeError, TypeError):
        malformed_count += 1
        continue
    if pair is not None:
        original_prefs.append(pair)

if malformed_count:
    print(f"Skipped {malformed_count} malformed original examples")

def _load_synthetic_prefs(path):
    """Read one synthetic-preference JSON file.

    Args:
        path: Location of a JSON file holding a sequence of records, each
            providing ``prompt``, ``chosen`` and ``rejected`` entries.

    Returns:
        A ``(raw, cleaned)`` tuple: ``raw`` is the parsed JSON as-is and
        ``cleaned`` is a list of dicts keeping only the ``prompt``,
        ``chosen`` and ``rejected`` entries (any other fields, such as
        scores, are dropped).

    Raises:
        KeyError: if a record lacks one of the three required entries.
    """
    # Explicit encoding so the result does not depend on the locale default.
    with open(path, 'r', encoding='utf-8') as f:
        raw = json.load(f)
    cleaned = [
        {"prompt": p["prompt"], "chosen": p["chosen"], "rejected": p["rejected"]}
        for p in raw
    ]
    return raw, cleaned


# Synthetic preferences Round 1
synthetic_prefs1, synthetic_prefs1_cleaned = _load_synthetic_prefs(config.synthetic_data_path1)

# Synthetic preferences Round 2
synthetic_prefs2, synthetic_prefs2_cleaned = _load_synthetic_prefs(config.synthetic_data_path2)

# Concatenate the three preference sources, original data first.
all_prefs = [*original_prefs, *synthetic_prefs1_cleaned, *synthetic_prefs2_cleaned]
print(f"Total preferences: {len(all_prefs)}")
print(f"  - Original: {len(original_prefs)}")
print(f"  - Synthetic Round 1: {len(synthetic_prefs1_cleaned)}")
print(f"  - Synthetic Round 2: {len(synthetic_prefs2_cleaned)}")

# Wrap the records in a Dataset and hold out 10% for evaluation
# (fixed split seed, so the partition is repeatable).
dpo_dataset = Dataset.from_list(all_prefs)
train_test_split = dpo_dataset.train_test_split(test_size=0.1, seed=43)
train_dataset, eval_dataset = (train_test_split[part] for part in ("train", "test"))
print(f"Train samples: {len(train_dataset)}, Eval samples: {len(eval_dataset)}")

# Rebuild the Round 1 policy: the SFT checkpoint plus the Round 1 LoRA adapters.
print("Loading base model...")
base_model_kwargs = dict(
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
)
base_model = AutoModelForCausalLM.from_pretrained(config.sft_model_dir, **base_model_kwargs)

print("Loading DPO Round 1 LoRA adapters...")
model = PeftModel.from_pretrained(base_model, config.dpo_round1_dir)

# Fold the Round 1 adapters into the underlying weights, so Round 2 trains
# its own adapters on top of the merged model.
print("Merging LoRA adapters...")
model = model.merge_and_unload()

# Gradient checkpointing is switched on before the new adapters are attached.
model.gradient_checkpointing_enable()

# Brand-new LoRA adapters for Round 2, attached to the modules named below.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=lora_target_modules,
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    bias="none",
)
model = get_peft_model(model, lora_config)

print("Trainable parameters:")
model.print_trainable_parameters()

# Training arguments for DPO Round 2, grouped by concern.
training_args = DPOConfig(
    # -- output and checkpointing --
    output_dir=config.dpo_round2_dir,
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none",
    # -- schedule and batch shape --
    num_train_epochs=config.dpo_epochs,
    per_device_train_batch_size=config.batch_size,
    per_device_eval_batch_size=config.batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    # -- optimiser --
    optim="adamw_torch",
    learning_rate=config.learning_rate,
    lr_scheduler_type="cosine",
    warmup_steps=30,  # Reduced warmup for fine-tuning
    max_grad_norm=1.0,
    # -- logging and evaluation cadence --
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=100,
    # -- precision and memory --
    fp16=True,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # -- DPO objective and sequence limits --
    loss_type="sigmoid",
    beta=config.beta,
    max_length=config.max_length,
    max_prompt_length=config.max_prompt_length,
    remove_unused_columns=False,
)

# Assemble the trainer.
print("Creating DPO trainer for Round 2...")
dpo_trainer = DPOTrainer(
    model=model,
    # No explicit reference model is supplied.
    # NOTE(review): with a PEFT-wrapped policy, TRL is documented to compute
    # reference log-probs with the adapters disabled rather than from a
    # separate copy - confirm for the installed TRL version.
    ref_model=None,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
)

# Run the Round 2 optimisation.
effective_batch_size = config.batch_size * config.gradient_accumulation_steps
print("\nStarting DPO Round 2 training...")
print(f"Effective batch size: {effective_batch_size}")
dpo_trainer.train()

# Write the trained adapters, then the tokenizer, into the Round 2 directory.
print("\nSaving DPO Round 2 LoRA adapters...")
for artifact in (model, tokenizer):
    artifact.save_pretrained(config.dpo_round2_dir)
print(f"✓ LoRA adapters saved to {config.dpo_round2_dir}")

# Quick qualitative check: sample one completion per prompt from the trained model.
banner = "=" * 50
divider = "-" * 50

print("\n" + banner)
print("Testing DPO Round 2 generation...")
model.eval()

test_prompts = [
    "Human: How can I improve my productivity?\n\nAssistant:",
    "Human: What is the meaning of life?\n\nAssistant:",
    "Human: Explain quantum computing simply.\n\nAssistant:"
]

# Sampling settings shared by every prompt.
generation_kwargs = dict(
    max_new_tokens=100,
    temperature=0.7,
    do_sample=True,
    pad_token_id=tokenizer.pad_token_id,
)

# No gradients are needed for generation.
with torch.no_grad():
    for test_prompt in test_prompts:
        inputs = tokenizer(test_prompt, return_tensors="pt").to(device)
        outputs = model.generate(**inputs, **generation_kwargs)
        # The decoded text includes the prompt followed by the sampled continuation.
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(response)
        print(divider)

print(banner)

print("\n✓ DPO Round 2 Training Complete!")
print("\n FULL ITERATIVE DPO PIPELINE COMPLETE!")

# Reload recipe. The Round 2 adapters were trained on top of the SFT model
# with the Round 1 adapters merged in, so the Round 1 merge step has to be
# repeated before attaching the Round 2 adapters; loading them straight onto
# the SFT checkpoint would silently drop Round 1.
print("\nTo load Round 2 model later:")
print("from transformers import AutoModelForCausalLM")
print("from peft import PeftModel")
print(f"base_model = AutoModelForCausalLM.from_pretrained('{config.sft_model_dir}')")
print(f"base_model = PeftModel.from_pretrained(base_model, '{config.dpo_round1_dir}').merge_and_unload()")
print(f"model = PeftModel.from_pretrained(base_model, '{config.dpo_round2_dir}')")

Loading tokenizer...
Loading preference data...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

harmless-base/train.jsonl.gz:   0%|          | 0.00/13.2M [00:00<?, ?B/s]

helpful-base/train.jsonl.gz:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

helpful-online/train.jsonl.gz:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

helpful-rejection-sampled/train.jsonl.gz:   0%|          | 0.00/25.7M [00:00<?, ?B/s]

harmless-base/test.jsonl.gz:   0%|          | 0.00/743k [00:00<?, ?B/s]

helpful-base/test.jsonl.gz:   0%|          | 0.00/875k [00:00<?, ?B/s]

helpful-online/test.jsonl.gz:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

helpful-rejection-sampled/test.jsonl.gz:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/160800 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8552 [00:00<?, ? examples/s]

Total preferences: 4436
  - Original: 2999
  - Synthetic Round 1: 956
  - Synthetic Round 2: 481
Train samples: 3992, Eval samples: 444
Loading base model...


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Loading DPO Round 1 LoRA adapters...




Merging LoRA adapters...
Trainable parameters:
trainable params: 4,505,600 || all params: 1,104,553,984 || trainable%: 0.4079
Creating DPO trainer for Round 2...


Extracting prompt in train dataset:   0%|          | 0/3992 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/3992 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/3992 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/444 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/444 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/444 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.



Starting DPO Round 2 training...
Effective batch size: 16


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
100,0.5545,0.56772,-0.811223,-1.357921,0.736486,0.546697,-110.534576,-153.497849,-3.524853,-3.585337
200,0.6331,0.540533,-0.984828,-1.667114,0.738739,0.682286,-112.270638,-156.589798,-3.511431,-3.575866
300,0.4766,0.537461,-1.171693,-1.901549,0.734234,0.729856,-114.139267,-158.934143,-3.49012,-3.556787
400,0.4354,0.537535,-1.380002,-2.147178,0.736486,0.767176,-116.222374,-161.390427,-3.476344,-3.544017
500,0.386,0.537801,-1.412542,-2.181568,0.731982,0.769025,-116.547783,-161.734344,-3.47694,-3.544373



Saving DPO Round 2 LoRA adapters...
✓ LoRA adapters saved to /content/drive/MyDrive/outputs/dpo_round2

Testing DPO Round 2 generation...
Human: How can I improve my productivity?

Assistant: There are a few things you can try to improve your productivity:

1. Set a specific goal: Before you start working on a task, make sure you have a clear idea of what you're trying to achieve. This will help you stay focused and on track.

2. Eliminate distractions: There are many apps and tools available that can help you stay on track and prevent distractions from interfering with your work. For example, you could
--------------------------------------------------
Human: What is the meaning of life?

Assistant: I don't have the capacity to understand your human language. The term "meaning of life" is complex and subjective, and it depends on each person's individual perspective and experiences. However, I can provide general insights based on scientific research:

1. Being alive is a fundamental