In [2]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
# Reward Model Training - Complete Code
# Run this entire notebook AFTER SFT training

import torch
import torch.nn as nn
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import os

torch.manual_seed(42)
np.random.seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [8]:
# Configuration - FIXED for Reward Model Training
class Config:
    # Base model name (needed for tokenizer if SFT model doesn't exist)
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  #  ADD THIS

    # Paths
    sft_model_dir = "/content/drive/MyDrive/outputs/sft_model"  # Fixed typo: Mydrive → MyDrive
    reward_model_dir = "/content/drive/MyDrive/outputs/reward_model"

    # Data
    dataset_name = "Anthropic/hh-rlhf"
    num_preference_samples = 5000
    max_length = 256  #  Changed from 512 to match your T4 setup

    # Training
    reward_epochs = 3
    batch_size = 4
    learning_rate = 1e-4

config = Config()
os.makedirs(config.reward_model_dir, exist_ok=True)

print(f"✓Config loaded!")
print(f"   Base model: {config.model_name}")
print(f"   SFT model path: {config.sft_model_dir}")
print(f"   Reward model path: {config.reward_model_dir}")

✓Config loaded!
   Base model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
   SFT model path: /content/drive/MyDrive/outputs/sft_model
   Reward model path: /content/drive/MyDrive/outputs/reward_model


In [9]:
# CELL: Load Tokenizer and Preference Data (COMPLETE)
# ..........................................

from transformers import AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm
import os

print("Loading tokenizer...")

# Check if we have a trained model saved
if os.path.exists(config.sft_model_dir) and os.path.exists(os.path.join(config.sft_model_dir, "tokenizer_config.json")):
    # Load from saved model
    print(f"   Loading from saved model: {config.sft_model_dir}")
    tokenizer = AutoTokenizer.from_pretrained(config.sft_model_dir, trust_remote_code=True)
else:
    # Load from base model
    print(f"   Loading from base model: {config.model_name}")
    tokenizer = AutoTokenizer.from_pretrained(config.model_name, trust_remote_code=True)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print(f"  Tokenizer loaded successfully!")
print(f"   Vocab size: {len(tokenizer)}")
print(f"   Pad token: {tokenizer.pad_token}")

# Load preference data
print("\nLoading preference data...")
dataset = load_dataset(config.dataset_name, split="train")
pref_data = dataset.select(range(min(config.num_preference_samples, len(dataset))))

def format_preference(example):
    try:
        prompt = example['chosen'].split('Assistant:')[0].replace('Human:', '').strip()
        chosen = example['chosen'].split('Assistant:')[-1].strip()
        rejected = example['rejected'].split('Assistant:')[-1].strip()
        return {"prompt": prompt, "chosen": chosen, "rejected": rejected}
    except:
        return None

preference_data = []
for ex in tqdm(pref_data, desc="Formatting preferences"):
    formatted = format_preference(ex)
    if formatted and formatted['prompt'] and formatted['chosen'] and formatted['rejected']:
        preference_data.append(formatted)

print(f"\nPreference dataset size: {len(preference_data)}")


Loading tokenizer...
   Loading from saved model: /content/drive/MyDrive/outputs/sft_model
  Tokenizer loaded successfully!
   Vocab size: 32000
   Pad token: </s>

Loading preference data...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

harmless-base/train.jsonl.gz:   0%|          | 0.00/13.2M [00:00<?, ?B/s]

helpful-base/train.jsonl.gz:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

helpful-online/train.jsonl.gz:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

helpful-rejection-sampled/train.jsonl.gz:   0%|          | 0.00/25.7M [00:00<?, ?B/s]

harmless-base/test.jsonl.gz:   0%|          | 0.00/743k [00:00<?, ?B/s]

helpful-base/test.jsonl.gz:   0%|          | 0.00/875k [00:00<?, ?B/s]

helpful-online/test.jsonl.gz:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

helpful-rejection-sampled/test.jsonl.gz:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/160800 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8552 [00:00<?, ? examples/s]

Formatting preferences: 100%|██████████| 5000/5000 [00:00<00:00, 32594.99it/s]


Preference dataset size: 4999





In [10]:
# Custom Dataset
class RewardDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]

        chosen_text = f"Human: {item['prompt']}\n\nAssistant: {item['chosen']}"
        rejected_text = f"Human: {item['prompt']}\n\nAssistant: {item['rejected']}"

        chosen_tokens = self.tokenizer(
            chosen_text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )

        rejected_tokens = self.tokenizer(
            rejected_text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )

        return {
            "chosen_input_ids": chosen_tokens["input_ids"].squeeze(0),
            "chosen_attention_mask": chosen_tokens["attention_mask"].squeeze(0),
            "rejected_input_ids": rejected_tokens["input_ids"].squeeze(0),
            "rejected_attention_mask": rejected_tokens["attention_mask"].squeeze(0)
        }

train_dataset = RewardDataset(preference_data, tokenizer, config.max_length)
train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)

In [11]:




# Reward Model Architecture
class RewardModel(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model

        for param in self.base_model.parameters():
            param.requires_grad = False

        hidden_size = base_model.config.hidden_size
        self.reward_head = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, 1)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )

        hidden_states = outputs.hidden_states[-1]
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = hidden_states.shape[0]
        last_hidden = hidden_states[torch.arange(batch_size, device=hidden_states.device), sequence_lengths]

        reward = self.reward_head(last_hidden)
        return reward


In [12]:

# Load base model
print("Loading SFT model...")
base_model = AutoModelForCausalLM.from_pretrained(
    config.sft_model_dir,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Create reward model
print("Creating reward model...")
reward_model = RewardModel(base_model)
reward_model = reward_model.to(device)

# Optimizer
optimizer = torch.optim.AdamW(reward_model.reward_head.parameters(), lr=config.learning_rate)

# Training loop
print("Starting reward model training...")
reward_model.train()

for epoch in range(config.reward_epochs):
    total_loss = 0
    total_acc = 0

    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.reward_epochs}")

    for batch in pbar:
        chosen_ids = batch["chosen_input_ids"].to(device)
        chosen_mask = batch["chosen_attention_mask"].to(device)
        rejected_ids = batch["rejected_input_ids"].to(device)
        rejected_mask = batch["rejected_attention_mask"].to(device)

        optimizer.zero_grad()

        with torch.cuda.amp.autocast(dtype=torch.float16):
            chosen_reward = reward_model(chosen_ids, chosen_mask)
            rejected_reward = reward_model(rejected_ids, rejected_mask)

            loss = -torch.log(torch.sigmoid(chosen_reward - rejected_reward)).mean()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(reward_model.reward_head.parameters(), 1.0)
        optimizer.step()

        total_loss += loss.item()
        acc = (chosen_reward > rejected_reward).float().mean().item()
        total_acc += acc

        pbar.set_postfix({"loss": loss.item(), "acc": acc})

    avg_loss = total_loss / len(train_loader)
    avg_acc = total_acc / len(train_loader)
    print(f"Epoch {epoch+1} - Loss: {avg_loss:.4f}, Accuracy: {avg_acc:.4f}")

# Save reward model
print("Saving reward model...")
torch.save({
    'reward_head_state_dict': reward_model.reward_head.state_dict(),
    'config': config.__dict__
}, os.path.join(config.reward_model_dir, "reward_model.pt"))

base_model.save_pretrained(config.reward_model_dir)
tokenizer.save_pretrained(config.reward_model_dir)

print(f" Reward model saved to {config.reward_model_dir}")
print("\n Reward Model Training Complete!")

Loading SFT model...


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Creating reward model...
Starting reward model training...


  with torch.cuda.amp.autocast(dtype=torch.float16):
Epoch 1/3: 100%|██████████| 1250/1250 [06:02<00:00,  3.44it/s, loss=0.498, acc=0.667]


Epoch 1 - Loss: 0.6697, Accuracy: 0.6309


Epoch 2/3: 100%|██████████| 1250/1250 [05:59<00:00,  3.48it/s, loss=0.439, acc=0.667]


Epoch 2 - Loss: 0.6035, Accuracy: 0.6829


Epoch 3/3: 100%|██████████| 1250/1250 [05:55<00:00,  3.52it/s, loss=0.595, acc=0.333]


Epoch 3 - Loss: 0.5691, Accuracy: 0.7077
Saving reward model...
 Reward model saved to /content/drive/MyDrive/outputs/reward_model

 Reward Model Training Complete!
