In [1]:
import random
import pandas as pd
from operator import itemgetter
import torch
import warnings
warnings.filterwarnings('ignore')

from datasets import Dataset, load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer,TrainingArguments
from trl import RewardTrainer

In [2]:
# dataset = load_dataset('Anthropic/hh-rlhf')
dataset = load_dataset('Dahoas/full-hh-rlhf')

In [3]:
test = dataset['test']
dataset = dataset['train'].train_test_split(test_size=0.2)
dataset['valid'] = test
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response', 'chosen', 'rejected'],
        num_rows: 89641
    })
    test: Dataset({
        features: ['prompt', 'response', 'chosen', 'rejected'],
        num_rows: 22411
    })
    valid: Dataset({
        features: ['prompt', 'response', 'chosen', 'rejected'],
        num_rows: 12451
    })
})

In [4]:
#Select a base model whch we need to train for reward modeling.
model_name = "FacebookAI/roberta-base"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def formatting_func(examples):
    kwargs = {"padding": "max_length", "truncation": True, "max_length": 512, "return_tensors": "pt"}
    prompt_plus_chosen_response = examples["prompt"] + "\n" + examples["chosen"]
    prompt_plus_rejected_response = examples["prompt"] + "\n" + examples["rejected"]
    tokens_chosen = tokenizer.encode_plus(prompt_plus_chosen_response, **kwargs)
    tokens_rejected = tokenizer.encode_plus(prompt_plus_rejected_response, **kwargs)
    return {
        "input_ids_chosen": tokens_chosen["input_ids"][0], "attention_mask_chosen": tokens_chosen["attention_mask"][0],
        "input_ids_rejected": tokens_rejected["input_ids"][0], "attention_mask_rejected": tokens_rejected["attention_mask"][0]
    }
formatted_dataset = dataset.map(formatting_func)

Map:   0%|          | 0/89641 [00:00<?, ? examples/s]

Map:   0%|          | 0/22411 [00:00<?, ? examples/s]

Map:   0%|          | 0/12451 [00:00<?, ? examples/s]

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [8]:
# formatted_dataset = formatted_dataset.train_test_split()
# Configuring the training arguments
training_args = TrainingArguments(
    output_dir="./reward_model_0420",
    per_device_train_batch_size=4,
    evaluation_strategy="steps",
    logging_steps=10000,
    num_train_epochs = 5,
    report_to=None,
)
# Loading the RewardTrainer from TRL
trainer = RewardTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=formatted_dataset["train"],
    eval_dataset=formatted_dataset["test"],
)
model.config.use_cache = False
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss,Accuracy
10000,0.7004,0.693147,0.780733
20000,0.6983,0.693147,0.932399
30000,0.6957,0.693147,0.956584
40000,0.6952,0.693147,0.978225
50000,0.6942,0.693147,0.994244
60000,0.6949,0.693147,0.999732
70000,0.6949,0.693147,0.999777
80000,0.6943,0.693147,0.994913
90000,0.6945,0.693147,0.99942
100000,0.6943,0.693147,0.978225


TrainOutput(global_step=112055, training_loss=0.6954710895493563, metrics={'train_runtime': 27916.4642, 'train_samples_per_second': 16.055, 'train_steps_per_second': 4.014, 'total_flos': 0.0, 'train_loss': 0.6954710895493563, 'epoch': 5.0})

In [12]:
trainer.evaluate(formatted_dataset['valid'])

{'eval_loss': 0.6931473016738892,
 'eval_accuracy': 0.8625813187695768,
 'eval_runtime': 208.6066,
 'eval_samples_per_second': 59.687,
 'eval_steps_per_second': 7.464,
 'epoch': 5.0}