In [23]:
import os
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import (
    TrainingArguments,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    pipeline,
    logging,
    Trainer
)
from trl import RewardTrainer
import pandas as pd

In [22]:
# --- Run configuration -------------------------------------------------------
batch_size = 16  # per-device batch size, used for both training and evaluation
# os.cpu_count() returns None when the CPU count cannot be determined; fall
# back to 0 so the value handed to the data loader is always an int.
num_workers = os.cpu_count() or 0
# max_steps = 3000
bf16 = True    # forwarded to TrainingArguments(bf16=...)
fp16 = False   # forwarded to TrainingArguments(fp16=...)
# gradient_accumulation_steps = 2
context_length = 1024  # max_length used when tokenizing each sequence
logging_steps = 500
save_steps = 500
learning_rate = 2e-4
model_name = 'outputs/gpt2_sft_instruction/final_model/'  # checkpoint the tokenizer and model are loaded from
out_dir = 'outputs/gpt2_reward_model'  # root directory for logs and the final model

In [2]:
# Load the Anthropic/hh-rlhf preference dataset; the result holds 'train' and
# 'test' splits, each with a 'chosen' and a 'rejected' text column.
hh_rlhf = load_dataset('Anthropic/hh-rlhf')

In [3]:
# Display the splits with their column names and row counts.
hh_rlhf

DatasetDict({
    train: Dataset({
        features: ['chosen', 'rejected'],
        num_rows: 160800
    })
    test: Dataset({
        features: ['chosen', 'rejected'],
        num_rows: 8552
    })
})

In [6]:
# Keep a random half of the original training split, then hold out 5% of that
# half for validation. Both shuffles use seed 42 so the subsets are repeatable.
half_dataset_hh_rlhf = (
    hh_rlhf['train']
    .train_test_split(test_size=0.5, shuffle=True, seed=42)['test']
    .train_test_split(test_size=0.05, shuffle=True, seed=42)
)
half_dataset_hh_rlhf_train, half_dataset_hh_rlhf_val = (
    half_dataset_hh_rlhf['train'],
    half_dataset_hh_rlhf['test'],
)
# The dataset's own test split is used as-is.
half_dataset_hh_rlhf_test = hh_rlhf['test']

In [7]:
# Print the train / validation / test splits to confirm their sizes.
print(
    half_dataset_hh_rlhf_train,
    half_dataset_hh_rlhf_val,
    half_dataset_hh_rlhf_test,
    sep='\n',
)

Dataset({
    features: ['chosen', 'rejected'],
    num_rows: 76380
})
Dataset({
    features: ['chosen', 'rejected'],
    num_rows: 4020
})
Dataset({
    features: ['chosen', 'rejected'],
    num_rows: 8552
})


In [8]:
# Three independent, empty containers (a text list and a label list each),
# one per split.
train_dataset_dict, val_dataset_dict, test_dataset_dict = (
    {"input_text": [], "label": []} for _ in range(3)
)

In [16]:
# Load the tokenizer stored with the starting checkpoint, explicitly asking
# for the non-fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
# Pad with the EOS token, and pad on the left side of each sequence.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

In [17]:
def preprocess_function(examples):
    """Tokenize a batch of preference pairs into the paired layout RewardTrainer consumes.

    Intended for ``Dataset.map(..., batched=True)``. Every input row carries one
    ``chosen`` and one ``rejected`` text; every output row carries the token ids
    and attention mask of both, so the number of rows is unchanged.

    Args:
        examples: Batch mapping with parallel ``"chosen"`` and ``"rejected"``
            lists of strings.

    Returns:
        dict with the list-valued keys ``input_ids_chosen``,
        ``attention_mask_chosen``, ``input_ids_rejected`` and
        ``attention_mask_rejected`` (one entry per input pair).
    """
    new_examples = {
        "input_ids_chosen": [],
        "attention_mask_chosen": [],
        "input_ids_rejected": [],
        "attention_mask_rejected": [],
    }
    for chosen, rejected in zip(examples["chosen"], examples["rejected"]):
        # Truncate only. Padding is left to the trainer's collator, which pads
        # per batch; padding every sequence to context_length here would store
        # mostly pad tokens.
        tokenized_c = tokenizer(chosen, truncation=True, max_length=context_length)
        tokenized_r = tokenizer(rejected, truncation=True, max_length=context_length)

        # Keep the pair together in one row so the trainer can compare the
        # score of the chosen text against the score of the rejected text.
        new_examples["input_ids_chosen"].append(tokenized_c["input_ids"])
        new_examples["attention_mask_chosen"].append(tokenized_c["attention_mask"])
        new_examples["input_ids_rejected"].append(tokenized_r["input_ids"])
        new_examples["attention_mask_rejected"].append(tokenized_r["attention_mask"])

    return new_examples

In [18]:
def _tokenize_split(split):
    """Run ``preprocess_function`` over ``split`` and keep only the columns it returns."""
    return split.map(
        preprocess_function,
        batched=True,
        num_proc=4,
        # Drop the raw text columns: they are not needed once tokenized, and a
        # batched map whose output row count differs from its input row count
        # fails unless the input columns are removed.
        remove_columns=split.column_names,
    )


# NOTE: these assignments rebind the split variables to their tokenized
# versions, so re-run the split cell before re-running this one.
half_dataset_hh_rlhf_train = _tokenize_split(half_dataset_hh_rlhf_train)
half_dataset_hh_rlhf_val = _tokenize_split(half_dataset_hh_rlhf_val)
half_dataset_hh_rlhf_test = _tokenize_split(half_dataset_hh_rlhf_test)

Map (num_proc=4):   0%|          | 0/76380 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4020 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/8552 [00:00<?, ? examples/s]

In [None]:
# Load the starting checkpoint with a sequence-classification head that has a
# single output (num_labels=1), i.e. one score per input sequence.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

In [None]:
# Tell the model which token id is padding; it is set to the EOS id, in line
# with the tokenizer cell where pad_token is assigned eos_token.
model.config.pad_token_id = model.config.eos_token_id
# Resize the embedding matrix to the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

In [None]:
training_args = TrainingArguments(
    # Output location.
    output_dir=f"{out_dir}/logs",
    # Optimisation.
    num_train_epochs=3,
    learning_rate=learning_rate,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # gradient_accumulation_steps=gradient_accumulation_steps,
    # lr_scheduler_type='constant',
    # Mixed precision.
    bf16=bf16,
    fp16=fp16,
    # Step-based logging, evaluation and checkpointing.
    logging_strategy='steps',
    logging_steps=logging_steps,
    evaluation_strategy='steps',
    save_strategy='steps',
    save_steps=save_steps,
    save_total_limit=3,
    load_best_model_at_end=True,
    # Data loading.
    dataloader_num_workers=num_workers,
    # report_to='tensorboard',
)

In [None]:
# Build the reward trainer from the model, tokenizer, training arguments and
# the train / validation splits.
trainer = RewardTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=half_dataset_hh_rlhf_val,
    train_dataset=half_dataset_hh_rlhf_train,
)

In [None]:
# Run training with the trainer built in the previous cell.
trainer.train()

In [None]:
# Write the trained model and the tokenizer into the same directory.
final_model_dir = f"{out_dir}/final_model"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)