In [4]:
# Reward Modeling with Hugging Face - Complete Google Colab Tutorial
# This notebook demonstrates how to train a reward model using Hugging Face

# ============================
# 1. INSTALLATION AND SETUP
# ============================

# Install required packages
!pip install transformers datasets accelerate peft trl torch matplotlib numpy

# Import necessary libraries
import torch
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig, get_peft_model
from trl import RewardTrainer
import warnings
warnings.filterwarnings('ignore')

print("✅ All packages installed successfully!")
print(f"🔥 PyTorch version: {torch.__version__}")
print(f"🤗 Using device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

# ============================
# 2. DATASET LOADING AND EXPLORATION
# ============================

print("\n" + "="*50)
print("📊 LOADING DATASET")
print("="*50)

# Load the synthetic instruction dataset from Hugging Face
dataset_name = "Dahoas/synthetic-instruct-gptj-pairwise"
dataset = load_dataset(dataset_name, split="train")

print(f"Dataset: {dataset_name}")
print(f"Total samples: {len(dataset)}")
print(f"Dataset features: {dataset.features}")

# Display a sample data point to understand the structure
print("\n🔍 Sample data point:")
sample = dataset[0]
print(f"Prompt: {sample['prompt'][:200]}...")
print(f"Chosen response: {sample['chosen'][:150]}...")
print(f"Rejected response: {sample['rejected'][:150]}...")

# ============================
# 3. MODEL AND TOKENIZER SETUP
# ============================

print("\n" + "="*50)
print("🤖 MODEL AND TOKENIZER SETUP")
print("="*50)

# Initialize model and tokenizer
model_name = "gpt2"  # Using GPT-2 as base model for sequence classification
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load model for sequence classification (binary classification for reward modeling)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,  # Single output for reward score
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)

# Resize token embeddings to account for the new pad token
model.resize_token_embeddings(len(tokenizer))

print(f"✅ Model loaded: {model_name}")
print(f"📝 Tokenizer vocabulary size: {len(tokenizer)}")
print(f"🎯 Model configuration: {model.config.num_labels} label(s)")

# ============================
# 4. DATA PREPROCESSING
# ============================

print("\n" + "="*50)
print("🔄 DATA PREPROCESSING")
print("="*50)

# Maximum sequence length for tokenization
MAX_LENGTH = 512

def format_prompt_response(prompt, response):
    """Format prompt and response with clear delimiters"""
    return f"Human: {prompt}\n\nAssistant: {response}"

def preprocess_function(examples):
    """
    Preprocess the dataset for reward modeling training.
    Creates tokenized inputs for both chosen and rejected responses.
    """
    # Format chosen and rejected responses
    chosen_texts = [
        format_prompt_response(prompt, chosen)
        for prompt, chosen in zip(examples["prompt"], examples["chosen"])
    ]
    rejected_texts = [
        format_prompt_response(prompt, rejected)
        for prompt, rejected in zip(examples["prompt"], examples["rejected"])
    ]

    # Tokenize chosen responses
    chosen_encodings = tokenizer(
        chosen_texts,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )

    # Tokenize rejected responses
    rejected_encodings = tokenizer(
        rejected_texts,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )

    return {
        "input_ids_chosen": chosen_encodings["input_ids"],
        "attention_mask_chosen": chosen_encodings["attention_mask"],
        "input_ids_rejected": rejected_encodings["input_ids"],
        "attention_mask_rejected": rejected_encodings["attention_mask"],
    }

# Apply preprocessing to the dataset
print("🔄 Preprocessing dataset...")

# Use a smaller subset for faster training in this demo
dataset_subset = dataset.select(range(1000))  # Use first 1000 samples
processed_dataset = dataset_subset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset_subset.column_names
)

print(f"✅ Preprocessed {len(processed_dataset)} samples")
print(f"📋 Processed features: {processed_dataset.features}")

# Split dataset into train and eval
train_dataset = processed_dataset.select(range(800))  # 80% for training
eval_dataset = processed_dataset.select(range(800, 1000))  # 20% for evaluation

print(f"🎯 Train samples: {len(train_dataset)}")
print(f"📊 Eval samples: {len(eval_dataset)}")

# ============================
# 5. LORA CONFIGURATION
# ============================

print("\n" + "="*50)
print("⚙️ LORA CONFIGURATION")
print("="*50)

# Configure LoRA for parameter-efficient training
lora_config = LoraConfig(
    r=16,                    # Rank of adaptation
    lora_alpha=32,           # LoRA scaling parameter
    target_modules=["c_attn", "c_proj"],  # Target modules for GPT-2
    lora_dropout=0.1,        # LoRA dropout
    bias="none",             # Bias type
    task_type="SEQ_CLS",     # Task type: Sequence Classification
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

print("✅ LoRA configuration applied")
print(f"📊 Trainable parameters: {model.num_parameters()}")
print(f"🎯 LoRA parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

# ============================
# 6. TRAINING CONFIGURATION
# ============================

print("\n" + "="*50)
print("🏋️ TRAINING CONFIGURATION")
print("="*50)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./reward_model_output",
    per_device_train_batch_size=2,      # Batch size per device
    per_device_eval_batch_size=2,       # Evaluation batch size
    num_train_epochs=2,                 # Number of training epochs
    gradient_accumulation_steps=4,       # Gradient accumulation steps
    learning_rate=1.41e-5,              # Learning rate
    logging_steps=50,                   # Log every 50 steps
    eval_strategy="steps",              # Fixed: was evaluation_strategy
    eval_steps=100,                     # Evaluate every 100 steps
    save_steps=200,                     # Save checkpoint every 200 steps
    warmup_steps=100,                   # Warmup steps
    remove_unused_columns=False,        # Keep all columns
    dataloader_drop_last=False,
    report_to="none",                   # Disable wandb logging for this demo
)

# Add missing attributes that RewardTrainer expects
training_args.disable_dropout = True
training_args.max_length = MAX_LENGTH

print("✅ Training arguments configured")
print(f"📊 Batch size: {training_args.per_device_train_batch_size}")
print(f"🔄 Epochs: {training_args.num_train_epochs}")
print(f"📈 Learning rate: {training_args.learning_rate}")

# ============================
# 7. REWARD TRAINER SETUP AND TRAINING
# ============================

print("\n" + "="*50)
print("🚀 REWARD TRAINER SETUP")
print("="*50)

# Try RewardTrainer with fallback to standard Trainer
try:
    # Initialize the RewardTrainer
    trainer = RewardTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        peft_config=lora_config,
    )
    print("✅ RewardTrainer initialized successfully")
    training_approach = "reward_trainer"

except Exception as e:
    print(f"⚠️ RewardTrainer error: {e}")
    print("🔄 Using standard Trainer with custom loss function...")

    # Fallback to standard Trainer with custom reward modeling loss
    from transformers import Trainer
    import torch.nn.functional as F

    class RewardModelTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False):
            """Custom loss function for reward modeling"""
            # Get chosen and rejected inputs
            chosen_input_ids = inputs["input_ids_chosen"]
            chosen_attention_mask = inputs["attention_mask_chosen"]
            rejected_input_ids = inputs["input_ids_rejected"]
            rejected_attention_mask = inputs["attention_mask_rejected"]

            # Forward pass for chosen responses
            chosen_outputs = model(
                input_ids=chosen_input_ids,
                attention_mask=chosen_attention_mask
            )
            chosen_rewards = chosen_outputs.logits.squeeze(-1)

            # Forward pass for rejected responses
            rejected_outputs = model(
                input_ids=rejected_input_ids,
                attention_mask=rejected_attention_mask
            )
            rejected_rewards = rejected_outputs.logits.squeeze(-1)

            # Reward modeling loss: chosen should have higher reward than rejected
            loss = -F.logsigmoid(chosen_rewards - rejected_rewards).mean()

            if return_outputs:
                return loss, {"chosen_rewards": chosen_rewards, "rejected_rewards": rejected_rewards}
            return loss

    def reward_data_collator(features):
        """Custom data collator for reward modeling"""
        batch = {}
        for key in features[0].keys():
            if key.startswith('input_ids') or key.startswith('attention_mask'):
                batch[key] = torch.stack([f[key] for f in features])
        return batch

    trainer = RewardModelTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=reward_data_collator,
    )
    print("✅ Custom RewardModelTrainer initialized successfully")
    training_approach = "custom_trainer"

# Start training
print("\n🚀 Starting training...")
print("This may take several minutes depending on your hardware...")

try:
    # Train the model
    trainer.train()
    print("✅ Training completed successfully!")

    # Save the final model
    trainer.save_model("./final_reward_model")
    print("💾 Model saved to ./final_reward_model")

except Exception as e:
    print(f"❌ Training error: {e}")
    print("💡 Try reducing batch size or number of epochs if you encounter memory issues")

# ============================
# 8. MODEL EVALUATION
# ============================

print("\n" + "="*50)
print("📊 MODEL EVALUATION")
print("="*50)

def get_reward_score(text):
    """Get reward score for a given text"""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)

    with torch.no_grad():
        outputs = model(**inputs)
        score = outputs.logits.squeeze().item()

    return score

def evaluate_model_performance(num_samples=50):
    """Evaluate model performance by comparing chosen vs rejected responses"""
    correct_predictions = 0
    total_comparisons = 0

    print(f"🔍 Evaluating on {num_samples} samples...")

    for i in range(min(num_samples, len(eval_dataset))):
        # Get original data
        original_sample = dataset[i + 800]  # Offset for eval set

        # Format texts
        chosen_text = format_prompt_response(original_sample["prompt"], original_sample["chosen"])
        rejected_text = format_prompt_response(original_sample["prompt"], original_sample["rejected"])

        # Get scores
        chosen_score = get_reward_score(chosen_text)
        rejected_score = get_reward_score(rejected_text)

        # Check if model prefers chosen over rejected
        if chosen_score > rejected_score:
            correct_predictions += 1

        total_comparisons += 1

        # Print some examples
        if i < 3:
            print(f"\n📝 Example {i+1}:")
            print(f"Prompt: {original_sample['prompt'][:100]}...")
            print(f"Chosen score: {chosen_score:.4f}")
            print(f"Rejected score: {rejected_score:.4f}")
            print(f"Correct prediction: {chosen_score > rejected_score}")

    win_rate = correct_predictions / total_comparisons * 100
    return win_rate, correct_predictions, total_comparisons

# Run evaluation
try:
    win_rate, correct, total = evaluate_model_performance(50)

    print(f"\n🎯 EVALUATION RESULTS:")
    print(f"Win Rate: {win_rate:.2f}%")
    print(f"Correct Predictions: {correct}/{total}")

    if win_rate > 60:
        print("🎉 Great! Your model shows good performance!")
    elif win_rate > 50:
        print("👍 Decent performance! Consider more training for improvement.")
    else:
        print("🔄 Model needs more training. Try increasing epochs or adjusting hyperparameters.")

except Exception as e:
    print(f"❌ Evaluation error: {e}")

# ============================
# 9. INTERACTIVE TESTING
# ============================

print("\n" + "="*50)
print("🎮 INTERACTIVE TESTING")
print("="*50)

def compare_responses(prompt, response1, response2):
    """Compare two responses to a given prompt"""
    text1 = format_prompt_response(prompt, response1)
    text2 = format_prompt_response(prompt, response2)

    score1 = get_reward_score(text1)
    score2 = get_reward_score(text2)

    print(f"📝 Prompt: {prompt}")
    print(f"\n🅰️ Response 1: {response1}")
    print(f"Score: {score1:.4f}")
    print(f"\n🅱️ Response 2: {response2}")
    print(f"Score: {score2:.4f}")

    if score1 > score2:
        print(f"\n🏆 Winner: Response 1 (higher by {score1 - score2:.4f})")
    elif score2 > score1:
        print(f"\n🏆 Winner: Response 2 (higher by {score2 - score1:.4f})")
    else:
        print(f"\n🤝 Tie! Both responses have similar scores")

# Test with sample responses
test_prompt = "How do I learn programming?"
good_response = "Start with the basics: choose a beginner-friendly language like Python, practice regularly with small projects, and use online resources like tutorials and coding exercises. Don't be afraid to make mistakes - they're part of the learning process!"
bad_response = "Just figure it out yourself. Programming is hard and not for everyone."

print("🧪 Testing with sample responses:")
compare_responses(test_prompt, good_response, bad_response)

# ============================
# 10. VISUALIZATION AND SUMMARY
# ============================

print("\n" + "="*50)
print("📈 TRAINING SUMMARY")
print("="*50)

# Plot training loss if available
try:
    if hasattr(trainer.state, 'log_history') and trainer.state.log_history:
        train_losses = [log['train_loss'] for log in trainer.state.log_history if 'train_loss' in log]

        if train_losses:
            plt.figure(figsize=(10, 6))
            plt.plot(train_losses, label='Training Loss', color='blue', linewidth=2)
            plt.title('🏋️ Reward Model Training Loss')
            plt.xlabel('Training Steps')
            plt.ylabel('Loss')
            plt.legend()
            plt.grid(True, alpha=0.3)
            plt.show()

            print(f"📉 Final training loss: {train_losses[-1]:.4f}")
        else:
            print("📊 No training loss data available for plotting")
    else:
        print("📊 Training history not available")

except Exception as e:
    print(f"📊 Could not plot training loss: {e}")

print("\n" + "="*60)
print("🎉 REWARD MODELING TUTORIAL COMPLETE!")
print("="*60)
print("""
📚 What we accomplished:
✅ Loaded synthetic instruction dataset from Hugging Face
✅ Preprocessed data for reward modeling
✅ Configured GPT-2 with LoRA for efficient training
✅ Trained a reward model using RewardTrainer
✅ Evaluated model performance with win rate metrics
✅ Created interactive testing capabilities

🚀 Next steps you can try:
- Experiment with different base models (GPT-3.5, LLaMA, etc.)
- Adjust LoRA configuration parameters
- Try different learning rates and batch sizes
- Use larger datasets for better performance
- Implement more sophisticated evaluation metrics

💡 Tips for better results:
- Use more training data (we used only 1000 samples for demo)
- Train for more epochs with a larger dataset
- Fine-tune hyperparameters based on your specific use case
- Consider using models pre-trained on instruction-following tasks
""")

print("\n🔗 Useful resources:")
print("- Hugging Face Transformers: https://huggingface.co/transformers/")
print("- TRL (Transformer Reinforcement Learning): https://github.com/lvwerra/trl")
print("- PEFT (Parameter-Efficient Fine-Tuning): https://github.com/huggingface/peft")

✅ All packages installed successfully!
🔥 PyTorch version: 2.6.0+cu124
🤗 Using device: CPU

📊 LOADING DATASET
Dataset: Dahoas/synthetic-instruct-gptj-pairwise
Total samples: 33143
Dataset features: {'prompt': Value(dtype='string', id=None), 'chosen': Value(dtype='string', id=None), 'rejected': Value(dtype='string', id=None)}

🔍 Sample data point:
Prompt: I was wondering if you could walk me through the process of setting up a hydroponic garden for herbs....
Chosen response: Sure! The process for setting up a hydroponic garden for herbs is relatively simple. First, you'll want to choose a space where you will set up your h...
Rejected response: How do I store a bagels for eating at a later date?


 You can place the bagels in an airtight container and reheat them in the microwave.  Alternatel...

🤖 MODEL AND TOKENIZER SETUP


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model loaded: gpt2
📝 Tokenizer vocabulary size: 50257
🎯 Model configuration: 1 label(s)

🔄 DATA PREPROCESSING
🔄 Preprocessing dataset...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


✅ Preprocessed 1000 samples
📋 Processed features: {'input_ids_chosen': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'attention_mask_chosen': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'input_ids_rejected': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'attention_mask_rejected': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}
🎯 Train samples: 800
📊 Eval samples: 200

⚙️ LORA CONFIGURATION
✅ LoRA configuration applied
📊 Trainable parameters: 126063360
🎯 LoRA parameters: 1622784

🏋️ TRAINING CONFIGURATION
✅ Training arguments configured
📊 Batch size: 2
🔄 Epochs: 2
📈 Learning rate: 1.41e-05

🚀 REWARD TRAINER SETUP
⚠️ RewardTrainer error: A processing_class must be specified when using the default RewardDataCollatorWithPadding
🔄 Using standard Trainer with custom loss function...
✅ Custom RewardModelTrainer initialized successfully

🚀 Starting training...
This may take several minutes depending on 