In [None]:
# Reward Model Training and Analysis

This notebook implements a reward model to learn preferences from ranked responses.


In [None]:
# Required imports
import bitsandbytes as bnb
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import RewardTrainer

# Pick the compute device: use the GPU when PyTorch reports one, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Model configuration
# Base checkpoint identifier handed to the from_pretrained loaders
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
# NOTE(review): presumably the maximum sequence length in tokens; it is not
# referenced in the cells visible here — confirm where it is consumed.
MAX_LENGTH = 512


In [None]:
# Load and prepare data
df = pd.read_csv('answers.csv')
print(f"Loaded {len(df)} examples")

# Fail fast with a readable message when the columns the pairing step reads
# ('prompt', 'answer', 'rank') are absent, instead of a bare KeyError deep
# inside the pairing loop.
required_columns = {'prompt', 'answer', 'rank'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(
        f"answers.csv is missing required columns: {sorted(missing_columns)}"
    )

# Create pairs of responses for comparison
def create_pairs(group):
    """Build pairwise preference records from a frame of ranked answers.

    Every pair of rows (i, j) with i < j (by position) is compared on
    ``rank``; the row with the smaller rank is the better answer. Rows whose
    ranks are equal or incomparable (e.g. NaN) produce no pair.

    Args:
        group: DataFrame with ``prompt``, ``answer`` and ``rank`` columns.
            Callers in this notebook pass the rows of a single prompt.

    Returns:
        A list of dicts with keys ``prompt``, ``better`` and ``worse``, in
        (i, j) iteration order. ``prompt`` is taken from row i of each pair.

    Raises:
        KeyError: if ``group`` has two or more rows and lacks one of the
            required columns.
    """
    # Fewer than two rows cannot form a pair.
    if len(group) < 2:
        return []

    # Extract each column once: calling group.iloc[i][...] inside the nested
    # loop rebuilds a row Series on every access.
    prompts = group['prompt'].tolist()
    answers = group['answer'].tolist()
    ranks = group['rank'].tolist()

    pairs = []
    row_count = len(ranks)
    for i in range(row_count):
        for j in range(i + 1, row_count):
            if ranks[i] < ranks[j]:  # i is better than j
                better_idx, worse_idx = i, j
            elif ranks[i] > ranks[j]:  # j is better than i
                better_idx, worse_idx = j, i
            else:
                # Tie or incomparable ranks carry no preference signal.
                continue
            pairs.append({
                'prompt': prompts[i],
                'better': answers[better_idx],
                'worse': answers[worse_idx]
            })
    return pairs

# Create training pairs
# groupby(sort=False) yields one sub-frame per distinct prompt in order of
# first appearance, so the pair order matches a loop over df['prompt'].unique()
# without rescanning the whole frame with a boolean mask for every prompt.
all_pairs = []
for prompt, prompt_group in df.groupby('prompt', sort=False):
    pairs = create_pairs(prompt_group)
    all_pairs.extend(pairs)

train_dataset = Dataset.from_pandas(pd.DataFrame(all_pairs))
print(f"Created {len(train_dataset)} training pairs")


In [None]:
# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Llama-family tokenizers may ship without a pad token; padding a batch needs
# one, so fall back to EOS only when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Pass the 4-bit setting through BitsAndBytesConfig; the bare `load_in_4bit`
# keyword on from_pretrained is deprecated in recent transformers releases.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=1,  # single scalar output used as the reward score
    torch_dtype=torch.float16,
    quantization_config=quantization_config
)
# The sequence-classification head uses config.pad_token_id to find the last
# non-padding token and rejects batches larger than 1 when it is unset.
model.config.pad_token_id = tokenizer.pad_token_id

# Configure LoRA
# The base model is loaded in 4-bit, so run PEFT's documented preparation step
# for k-bit training before attaching adapters. Gradient checkpointing is left
# off so the training-time compute/memory trade-off stays as it was.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)

lora_config = LoraConfig(
    r=16,                 # adapter rank
    lora_alpha=32,        # LoRA scaling factor
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # attention projections
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"
)
model = get_peft_model(model, lora_config)

# Training arguments
training_args = TrainingArguments(
    # Where checkpoints are written, and how often
    output_dir="reward_model",
    save_strategy="epoch",
    logging_steps=10,
    # Schedule: 3 epochs; 4 samples per device step, accumulated over 4 steps
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Optimizer settings
    learning_rate=1e-4,
    weight_decay=0.01
)


In [None]:
# Required imports
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from trl import RewardTrainer
import numpy as np
from datasets import Dataset
