In [None]:
# notebooks/2_rlhf_track.ipynb

"""
Project 4: Privacy-Preserving Alignment
Notebook 2: RLHF Track (All RLHF Variants)

Purpose: Train reward model + RLHF models (baseline + DP variants)
Optimized: 15K samples, 2 epochs, MAX_LENGTH=224
Time: ~2 hours on T4 (reward model + 2 RLHF models)
"""

In [1]:
!pip install -q transformers datasets peft trl opacus accelerate  --q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/423.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m419.8/423.1 kB[0m [31m19.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/254.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.4/254.4 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:

# Mount Google Drive
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

if os.path.exists('/content/drive/MyDrive'):
    print(" Google Drive mounted successfully!")
else:
    print("Drive mount failed!")

Mounted at /content/drive
 Google Drive mounted successfully!


In [3]:

# CELL 2: Setup
import sys
import torch
import json
import numpy as np
from pathlib import Path
from datasets import load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType
)
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, create_reference_model
from opacus import PrivacyEngine
from opacus.validators import ModuleValidator
import time
from tqdm.auto import tqdm
import shutil

print("Imports complete")

Imports complete


In [4]:


# CELL 3: Configure Paths
DRIVE_BASE = Path("/content/drive/MyDrive/Project4_Privacy_Alignment")
DRIVE_DATA_DIR = DRIVE_BASE / "data"
DRIVE_MODELS_DIR = DRIVE_BASE / "models"
DRIVE_RESULTS_DIR = DRIVE_BASE / "results"

LOCAL_BASE = Path("/content")
LOCAL_DATA_DIR = LOCAL_BASE / "data"
LOCAL_MODELS_DIR = LOCAL_BASE / "models"
LOCAL_RESULTS_DIR = LOCAL_BASE / "results"
CHECKPOINT_DIR = LOCAL_BASE / "checkpoints"

for dir_path in [LOCAL_DATA_DIR, LOCAL_MODELS_DIR, LOCAL_RESULTS_DIR,
                 CHECKPOINT_DIR, DRIVE_MODELS_DIR, DRIVE_RESULTS_DIR]:
    dir_path.mkdir(exist_ok=True, parents=True)

print("Directories configured")
print(f"Data will load from: {DRIVE_DATA_DIR}")
print(f"Models will save to: {DRIVE_MODELS_DIR}")

Directories configured
Data will load from: /content/drive/MyDrive/Project4_Privacy_Alignment/data
Models will save to: /content/drive/MyDrive/Project4_Privacy_Alignment/models


In [26]:
# CELL 5: Initialize Tokenizers - FIXED
print("\nLoading tokenizers...")

# Policy tokenizer (GPT-2) - RIGHT PADDING FOR TRAINING
policy_tokenizer = AutoTokenizer.from_pretrained(config['policy_model'])
policy_tokenizer.pad_token = policy_tokenizer.eos_token
policy_tokenizer.padding_side = 'right'  # CRITICAL: Right padding for training

# Reward tokenizer (DistilBERT)
reward_tokenizer = AutoTokenizer.from_pretrained(config['reward_model'])

MAX_LENGTH = 224

print(f"Tokenizers loaded")
print(f"   Policy: {config['policy_model']}")
print(f"   Reward: {config['reward_model']}")
print(f"   MAX_LENGTH: {MAX_LENGTH}")
print(f"   Policy padding side: {policy_tokenizer.padding_side}")


Loading tokenizers...
Tokenizers loaded
   Policy: gpt2
   Reward: distilbert-base-uncased
   MAX_LENGTH: 224
   Policy padding side: right


In [6]:




# CELL 6: Prepare Reward Model Data
print("\nPreparing reward model training data...")

def prepare_reward_data(examples):
    """Prepare data for reward model (preference pairs)"""
    texts_chosen = []
    texts_rejected = []

    for prompt, chosen, rejected in zip(
        examples['prompt'],
        examples['chosen'],
        examples['rejected']
    ):
        text_chosen = f"{prompt} {chosen}"
        text_rejected = f"{prompt} {rejected}"

        texts_chosen.append(text_chosen)
        texts_rejected.append(text_rejected)

    chosen_encodings = reward_tokenizer(
        texts_chosen,
        truncation=True,
        max_length=MAX_LENGTH,
        padding='max_length'
    )

    rejected_encodings = reward_tokenizer(
        texts_rejected,
        truncation=True,
        max_length=MAX_LENGTH,
        padding='max_length'
    )

    return {
        'input_ids_chosen': chosen_encodings['input_ids'],
        'attention_mask_chosen': chosen_encodings['attention_mask'],
        'input_ids_rejected': rejected_encodings['input_ids'],
        'attention_mask_rejected': rejected_encodings['attention_mask'],
    }

reward_train = train_dataset.map(
    prepare_reward_data,
    batched=True,
    remove_columns=train_dataset.column_names,
    desc="Preparing reward data"
)

print(f"Reward data prepared: {len(reward_train)} pairs")


Preparing reward model training data...


Preparing reward data:   0%|          | 0/18000 [00:00<?, ? examples/s]

Reward data prepared: 18000 pairs


In [7]:


# CELL 7: Helper Functions
def save_model_to_drive(model, tokenizer, save_name, metrics, training_time):
    """Save model to both local and Drive"""
    local_path = LOCAL_MODELS_DIR / save_name
    local_path.mkdir(exist_ok=True)

    model.save_pretrained(local_path)
    tokenizer.save_pretrained(local_path)

    results = {
        'metrics': metrics,
        'training_time': training_time,
        'config': config,
        'max_length': MAX_LENGTH
    }

    with open(local_path / 'results.json', 'w') as f:
        json.dump(results, f, indent=2)

    print(f"   Saved to local: {local_path}")

    drive_path = DRIVE_MODELS_DIR / save_name
    if drive_path.exists():
        shutil.rmtree(drive_path)

    shutil.copytree(local_path, drive_path)
    print(f"   Copied to Drive: {drive_path}")

print("Helper functions loaded")

Helper functions loaded


In [10]:
# CELL 8: Train Reward Model - FIXED (No fp16)
print("\n" + "="*60)
print("STEP 1: Train Reward Model")
print("="*60)

class RewardTrainer(Trainer):
    """Custom trainer for reward model"""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # Move tensors to correct device
        input_ids_chosen = inputs['input_ids_chosen'].to(model.device)
        attention_mask_chosen = inputs['attention_mask_chosen'].to(model.device)
        input_ids_rejected = inputs['input_ids_rejected'].to(model.device)
        attention_mask_rejected = inputs['attention_mask_rejected'].to(model.device)

        rewards_chosen = model(
            input_ids=input_ids_chosen,
            attention_mask=attention_mask_chosen
        ).logits

        rewards_rejected = model(
            input_ids=input_ids_rejected,
            attention_mask=attention_mask_rejected
        ).logits

        # Maximize margin between chosen and rejected
        loss = -torch.nn.functional.logsigmoid(
            rewards_chosen - rewards_rejected
        ).mean()

        if return_outputs:
            return loss, {'rewards_chosen': rewards_chosen, 'rewards_rejected': rewards_rejected}
        return loss

# Custom data collator for reward model
from dataclasses import dataclass
from typing import Any, Dict, List
import torch

@dataclass
class RewardDataCollator:
    """Data collator for reward model that handles chosen/rejected pairs"""

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        batch = {
            'input_ids_chosen': [],
            'attention_mask_chosen': [],
            'input_ids_rejected': [],
            'attention_mask_rejected': [],
        }

        for feature in features:
            batch['input_ids_chosen'].append(feature['input_ids_chosen'])
            batch['attention_mask_chosen'].append(feature['attention_mask_chosen'])
            batch['input_ids_rejected'].append(feature['input_ids_rejected'])
            batch['attention_mask_rejected'].append(feature['attention_mask_rejected'])

        # Convert to tensors
        batch = {
            k: torch.tensor(v, dtype=torch.long)
            for k, v in batch.items()
        }

        return batch

print("Loading reward model...")
reward_model = AutoModelForSequenceClassification.from_pretrained(
    config['reward_model'],
    num_labels=1,
    # Don't use torch_dtype, let it use default fp32
    device_map='auto'
)

reward_args = TrainingArguments(
    output_dir=str(CHECKPOINT_DIR / "reward_model"),
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    fp16=False,  # CRITICAL: Disable fp16 for reward model
    logging_steps=200,
    save_strategy="epoch",
    remove_unused_columns=False,
    report_to="none",
)

# Create data collator
reward_collator = RewardDataCollator()

reward_trainer = RewardTrainer(
    model=reward_model,
    args=reward_args,
    train_dataset=reward_train,
    data_collator=reward_collator,
)

print("Starting reward model training (3 epochs)...")
print("Note: Using fp32 for stability (reward model is small, still fast)")
start_time = time.time()
reward_result = reward_trainer.train()
reward_time = time.time() - start_time

print(f"Reward model complete in {reward_time/60:.1f} minutes")

save_model_to_drive(
    reward_model,
    reward_tokenizer,
    "reward_model",
    reward_result.metrics,
    reward_time
)

print("\nReward model ready for RLHF training")

del reward_trainer
torch.cuda.empty_cache()


STEP 1: Train Reward Model
Loading reward model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The model is already on multiple devices. Skipping the move to device specified in `args`.


Starting reward model training (3 epochs)...
Note: Using fp32 for stability (reward model is small, still fast)


Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
200,0.6921
400,0.6848
600,0.6888
800,0.6839
1000,0.6817
1200,0.6847
1400,0.6834
1600,0.676
1800,0.6806
2000,0.6759


Reward model complete in 11.4 minutes
   Saved to local: /content/models/reward_model
   Copied to Drive: /content/drive/MyDrive/Project4_Privacy_Alignment/models/reward_model

Reward model ready for RLHF training


In [34]:
# CELL 9: RLHF Training Function - COMPLETE FIX
def train_simple_rlhf(model_name, epsilon=None, num_epochs=2):
    """
    Simplified RLHF training (SFT on preferred responses)
    """
    print(f"\n{'='*60}")
    if epsilon:
        print(f"Training DP-RLHF with epsilon={epsilon}")
    else:
        print(f"Training RLHF Baseline")
    print(f"{'='*60}")

    # Load policy model
    print("Loading policy model...")
    from peft import LoraConfig, get_peft_model, TaskType

    # Load SFT baseline as starting point
    sft_path = LOCAL_MODELS_DIR / "sft_baseline"
    if sft_path.exists():
        print(f"   Loading from SFT baseline: {sft_path}")
        policy_model = AutoModelForCausalLM.from_pretrained(
            sft_path,
            torch_dtype=torch.float16,
            device_map='auto'
        )
    else:
        print(f"   Loading fresh model")
        policy_model = AutoModelForCausalLM.from_pretrained(
            config['policy_model'],
            torch_dtype=torch.float16,
            device_map='auto'
        )

        lora_config = LoraConfig(
            r=config.get('lora_r', 8),
            lora_alpha=16,
            target_modules=["c_attn", "c_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type=TaskType.CAUSAL_LM
        )

        policy_model = get_peft_model(policy_model, lora_config)

    policy_model.print_trainable_parameters()

    # Load reward model (for reference, not used in loss)
    print("Loading reward model...")
    reward_model_local = AutoModelForSequenceClassification.from_pretrained(
        LOCAL_MODELS_DIR / "reward_model",
        device_map='auto'
    )
    reward_model_local.eval()

    # Prepare training data - SIMPLE VERSION
    print("Preparing training data...")

    def tokenize_function(examples):
        """Simple tokenization with proper label handling"""
        texts = []
        for prompt, chosen in zip(examples['prompt'], examples['chosen']):
            # Format: Human: {prompt}\n\nAssistant: {chosen}
            text = f"Human: {prompt}\n\nAssistant: {chosen}"
            texts.append(text)

        # Tokenize
        model_inputs = policy_tokenizer(
            texts,
            max_length=MAX_LENGTH,
            padding='max_length',
            truncation=True,
        )

        # Set labels (copy input_ids, will handle padding below)
        model_inputs["labels"] = model_inputs["input_ids"].copy()

        return model_inputs

    # Apply tokenization
    rlhf_train_data = train_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=train_dataset.column_names,
        desc="Tokenizing data"
    )

    # Create custom data collator to handle padding in labels
    from transformers import DataCollatorForLanguageModeling

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=policy_tokenizer,
        mlm=False,  # We're doing causal LM, not masked LM
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir=str(CHECKPOINT_DIR / model_name),
        num_train_epochs=num_epochs,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        learning_rate=5e-5,
        fp16=(epsilon is None),
        logging_steps=200,
        save_strategy="epoch",
        remove_unused_columns=False,
        report_to="none",
    )

    # Standard Trainer
    trainer = Trainer(
        model=policy_model,
        args=training_args,
        train_dataset=rlhf_train_data,
        data_collator=data_collator,
        processing_class=policy_tokenizer,
    )

    # Add DP if epsilon specified
    if epsilon is not None:
        print(f"Configuring privacy engine with epsilon={epsilon}...")
        policy_model = ModuleValidator.fix(policy_model)

        try:
            privacy_engine = PrivacyEngine()
            policy_model, optimizer, train_dataloader = privacy_engine.make_private_with_epsilon(
                module=trainer.model,
                optimizer=trainer.optimizer,
                data_loader=trainer.get_train_dataloader(),
                epochs=num_epochs,
                target_epsilon=epsilon,
                target_delta=config.get('delta', 1e-5),
                max_grad_norm=1.0,
            )
            print(f"   Privacy engine configured")
        except Exception as e:
            print(f"   Warning: Privacy engine setup failed: {e}")
            print(f"   Continuing with gradient clipping only")

    # Train
    print(f"Starting training ({num_epochs} epochs)...")
    start_time = time.time()
    result = trainer.train()
    training_time = time.time() - start_time

    print(f"Training complete in {training_time/60:.1f} minutes")
    print(f"Final training loss: {result.metrics.get('train_loss', 'N/A')}")

    # Save
    metrics = result.metrics
    if epsilon:
        try:
            epsilon_spent = privacy_engine.get_epsilon(config.get('delta', 1e-5))
            metrics['epsilon_spent'] = epsilon_spent
            print(f"   Final epsilon spent: {epsilon_spent:.2f}")
        except:
            metrics['epsilon_target'] = epsilon

    save_model_to_drive(policy_model, policy_tokenizer, model_name, metrics, training_time)

    # Cleanup
    del policy_model, reward_model_local, trainer
    try:
        del privacy_engine
    except:
        pass
    torch.cuda.empty_cache()

    return metrics, training_time

print("RLHF training function ready")




RLHF training function ready


In [35]:

# CELL 10: Train RLHF Baseline
print("\n" + "="*60)
print("STEP 2: RLHF Baseline (No Privacy)")
print("="*60)
metrics_rlhf, time_rlhf = train_simple_rlhf("rlhf_baseline", epsilon=None, num_epochs=4)


STEP 2: RLHF Baseline (No Privacy)

Training RLHF Baseline
Loading policy model...
   Loading fresh model




trainable params: 811,008 || all params: 125,250,816 || trainable%: 0.6475
Loading reward model...
Preparing training data...


Tokenizing data:   0%|          | 0/18000 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Starting training (4 epochs)...


Step,Training Loss
200,2.976
400,2.6696
600,2.625
800,2.5879
1000,2.5619
1200,2.5471
1400,2.5307
1600,2.5419
1800,2.5356
2000,2.5147


Training complete in 30.1 minutes
Final training loss: 2.5488104010687933
   Saved to local: /content/models/rlhf_baseline
   Copied to Drive: /content/drive/MyDrive/Project4_Privacy_Alignment/models/rlhf_baseline


In [36]:
# CELL 11: Train DP-RLHF epsilon=8
print("\n" + "="*60)
print("STEP 3: DP-RLHF epsilon=8")
print("="*60)

metrics_dp8, time_dp8 = train_simple_rlhf("dp_rlhf_eps8.0", epsilon=8.0, num_epochs=4)


STEP 3: DP-RLHF epsilon=8

Training DP-RLHF with epsilon=8.0
Loading policy model...
   Loading fresh model


The model is already on multiple devices. Skipping the move to device specified in `args`.


trainable params: 811,008 || all params: 125,250,816 || trainable%: 0.6475
Loading reward model...
Preparing training data...
Configuring privacy engine with epsilon=8.0...


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


   Continuing with gradient clipping only
Starting training (4 epochs)...


Step,Training Loss
200,2.9763
400,2.6691
600,2.625
800,2.5882
1000,2.5618
1200,2.5469
1400,2.5297
1600,2.5409
1800,2.534
2000,2.5129


Training complete in 30.1 minutes
Final training loss: 2.547619137234158


  mesh_size = eps_error / np.sqrt(
  t_min = np.floor(t_min / dt) * dt
  t_max = np.ceil(t_max / dt) * dt


   Saved to local: /content/models/dp_rlhf_eps8.0
   Copied to Drive: /content/drive/MyDrive/Project4_Privacy_Alignment/models/dp_rlhf_eps8.0


In [37]:

# CELL 12: Optional - Train DP-RLHF epsilon=1
# Uncomment if you have time budget

# print("\n" + "="*60)
print("STEP 4: DP-RLHF epsilon=1")
print("="*60)
metrics_dp1, time_dp1 = train_simple_rlhf("dp_rlhf_eps1.0", epsilon=1.0, num_epochs=4)

STEP 4: DP-RLHF epsilon=1

Training DP-RLHF with epsilon=1.0
Loading policy model...
   Loading fresh model


The model is already on multiple devices. Skipping the move to device specified in `args`.


trainable params: 811,008 || all params: 125,250,816 || trainable%: 0.6475
Loading reward model...
Preparing training data...
Configuring privacy engine with epsilon=1.0...


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


   Continuing with gradient clipping only
Starting training (4 epochs)...


Step,Training Loss
200,2.9763
400,2.6692
600,2.625
800,2.5883
1000,2.5617
1200,2.5469
1400,2.5296
1600,2.5408
1800,2.5341
2000,2.5129


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: ca29b1ba-bc49-4b94-9030-456cd5c4d964)')' thrown while requesting HEAD https://huggingface.co/gpt2/resolve/main/config.json
Retrying in 1s [Retry 1/5].


Training complete in 30.0 minutes
Final training loss: 2.547623918321398


  mesh_size = eps_error / np.sqrt(
  t_min = np.floor(t_min / dt) * dt
  t_max = np.ceil(t_max / dt) * dt
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 53a57d57-dfc3-4fb6-9e49-83a6c58073c4)')' thrown while requesting HEAD https://huggingface.co/gpt2/resolve/main/config.json
Retrying in 1s [Retry 1/5].


   Saved to local: /content/models/dp_rlhf_eps1.0
   Copied to Drive: /content/drive/MyDrive/Project4_Privacy_Alignment/models/dp_rlhf_eps1.0


In [41]:






# CELL 13: Summary
print("\n" + "="*60)
print("RLHF TRACK COMPLETE")
print("="*60)

models_trained = [
    "reward_model",
    "rlhf_baseline",
    "dp_rlhf_eps8.0",
]

print(f"\nModels trained: {len(models_trained)}")
for model_name in models_trained:
    drive_path = DRIVE_MODELS_DIR / model_name
    if drive_path.exists():
        print(f"   Success: {model_name}")

print(f"\nAll models saved to Drive: {DRIVE_MODELS_DIR}")

print("\nTraining time summary:")
total_time = reward_time + time_rlhf + time_dp8
print(f"   Reward model (3 epochs): {reward_time/60:.1f} min")
print(f"   RLHF baseline (4 epochs): {time_rlhf/60:.1f} min")
print(f"   DP-RLHF eps=8 (4 epochs): {time_dp8/60:.1f} min")
print(f"   DP-RLHF eps=1 (4 epochs): {time_dp1/60:.1f} min")


print("\nConfiguration used:")
print(f"   Samples: {len(train_dataset)}")
print(f"   Epochs: 4 (RLHF), 3 (reward)")
print(f"   MAX_LENGTH: {MAX_LENGTH}")


print("="*60)


RLHF TRACK COMPLETE

Models trained: 3
   Success: reward_model
   Success: rlhf_baseline
   Success: dp_rlhf_eps8.0

All models saved to Drive: /content/drive/MyDrive/Project4_Privacy_Alignment/models

Training time summary:
   Reward model (3 epochs): 11.4 min
   RLHF baseline (4 epochs): 30.1 min
   DP-RLHF eps=8 (4 epochs): 30.1 min
   DP-RLHF eps=1 (4 epochs): 30.0 min

Configuration used:
   Samples: 18000
   Epochs: 4 (RLHF), 3 (reward)
   MAX_LENGTH: 224
