In [None]:
import os
import numpy as np
import pandas as pd
import torch
from torch import nn
from datasets import Dataset
from sklearn.metrics import mean_squared_error
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def compute_metrics_for_regression(eval_pred):
    """Report the mean squared error between predictions and targets.

    Serves as a sanity check that the quantity being minimised during
    training is indeed MSE.

    Args:
        eval_pred: A pair ``(logits, labels)``; ``labels`` must support
            ``reshape``.

    Returns:
        A dict with the single key ``'mse'``.
    """
    predictions, targets = eval_pred
    # Targets are forced into a single column before being compared.
    targets_column = targets.reshape(-1, 1)
    return {'mse': mean_squared_error(targets_column, predictions)}

In [None]:
# Read the CSV of scalar rewards for GPT2 responses to philosophy prompts,
# renaming the columns to 'text' / 'labels' and dropping everything else.
rewards_file = os.path.join('...', '...')
df_rewards = (
    pd.read_csv(rewards_file)
    .rename(columns={'response': 'text', 'reward': 'labels'})
    .loc[:, ['text', 'labels']]
)
display(df_rewards.tail())

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Fixed seed so the train/test partition is identical on every Restart & Run All;
# without it each run would evaluate on a different held-out set.
SPLIT_SEED = 42

# Create dataset object and split into train (80%) and test (20%) sets
dataset = Dataset.from_pandas(df_rewards).train_test_split(test_size=0.2, seed=SPLIT_SEED)

# Define a function to tokenize the text and attach regression targets
def tokenize_function(example):
  """Tokenize a batch of texts and attach float regression labels.

  Args:
    example: Batch dict with a 'text' list and a parallel 'labels' list
      of numeric rewards.

  Returns:
    The tokenizer's encoding with an added 'labels' entry: a float32
    tensor with one column and one row per input label.
  """
  encoding = tokenizer(example['text'], padding='max_length', truncation=True)
  # Force float32: integer-valued rewards would otherwise be inferred as int64,
  # which a regression (MSE) loss cannot consume.
  encoding['labels'] = torch.tensor(example['labels'], dtype=torch.float32).unsqueeze(1)
  return encoding

# Run tokenize_function over the dataset, handing it examples in batches
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)
tokenized_dataset

In [None]:
# Load BERT model for sequence classification and set num_labels=1 to make it do regression.
# Use the GPU when one is available, otherwise fall back to CPU instead of
# crashing on a hard-coded "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1).to(device)

In [None]:
# Training configuration and fine-tuning run

batch_size = 8

args = TrainingArguments(
    output_dir='trained_model',
    # Evaluate and checkpoint once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Optimisation hyperparameters
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    # Same per-device batch size for training and evaluation
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_for_regression,
)

trainer.train()

In [None]:
# Test some sentences to see if the reward model is doing something sensible
test_sentence = "..." # reward = 0.62
# Truncate so an over-long sentence cannot exceed the model's maximum input
# length, and put the tensor on whatever device the model is on rather than
# assuming a GPU.
input_ids = tokenizer.encode(test_sentence, return_tensors='pt', truncation=True).to(model.device)
# Inference only: switch off dropout and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    outputs = model(input_ids).logits.item()
outputs