In [None]:
import os

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, GPT2LMHeadModel, GPT2Tokenizer
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, create_reference_model
from trl.core import respond_to_batch

In [None]:
# Path handed to from_pretrained() for BOTH the policy model and the reward model below.
# NOTE(review): '.' looks like a placeholder -- the policy uses the GPT-2 tokenizer and the
# reward model uses the BERT tokenizer, so they presumably come from different checkpoints;
# confirm the intended paths before running.
model_id ='.'

# Get model to train and reference model for KL divergence
gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')
# Reuse the EOS token as the padding token for the GPT-2 tokenizer
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
model = AutoModelForCausalLMWithValueHead.from_pretrained(model_id).to("cuda")
model_ref = create_reference_model(model)

# Get reward model (single output logit, used directly as the scalar reward)
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
reward_model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=1).to("cuda")

# Load in prompts, one per line
prompts_file = os.path.join('...', '...')
# Explicit encoding so the prompts are read the same way on every platform
with open(prompts_file, encoding='utf-8') as file:
    # Skip blank / whitespace-only lines: an empty prompt would encode to zero tokens
    # and cannot be used as a generation query
    prompts = [line.rstrip() for line in file if line.strip()]
if not prompts:
    # Fail here with a clear message instead of later inside np.random.choice
    raise ValueError(f"No prompts found in {prompts_file}")

# Define and initialise Trainer
# batch_size=1: each PPO step is fed a single (query, response, reward) triple
ppo_config = PPOConfig(batch_size=1)
ppo_trainer = PPOTrainer(ppo_config, model, model_ref, gpt2_tokenizer)

In [None]:
# PPO fine-tuning loop: one randomly chosen prompt per optimisation step
num_steps = 1000
for step in range(num_steps):
  # Select random prompt from list as query
  query_text = np.random.choice(prompts)

  # Encode query
  query_ids = gpt2_tokenizer.encode(query_text, return_tensors='pt').to('cuda')

  # Sampling the response and scoring it are inference only: the trainer is handed
  # token ids and a reward value, so nothing is back-propagated through these calls.
  # Running them under no_grad avoids building (and holding on the GPU) unused
  # autograd graphs and keeps the reward tensor detached from the reward model.
  with torch.no_grad():
    # Get response from model
    response_ids  = respond_to_batch(model, query_ids)
    response_text = gpt2_tokenizer.batch_decode(response_ids)[0]

    # Encode query + response for the reward model and get reward value for it
    response_bert = bert_tokenizer.encode(query_text + response_text, return_tensors='pt').to('cuda')
    reward = [reward_model(response_bert).logits[0]]
  # Uncomment to inspect samples while debugging:
  # print(query_text + response_text, '\n')
  # print(reward[0].item(), '\n')

  # train model for one step with ppo (lists of length 1 match batch_size=1)
  train_stats = ppo_trainer.step([query_ids[0]], [response_ids[0]], reward)

In [None]:
# Test model vs reference model on a load of prompts:
# for each randomly drawn prompt, sample one response from each model, score
# query + response with the reward model, and compare the mean rewards.

ref_rewards = []
model_rewards = []

test_steps = 25
for step in range(test_steps):
  # Select random prompt from list as query
  query_text = np.random.choice(prompts)

  # Encode query
  query_ids = gpt2_tokenizer.encode(query_text, return_tensors='pt').to('cuda')

  # Evaluation is inference only, so disable autograd: otherwise every forward
  # pass below builds a graph that is never used and only consumes GPU memory.
  with torch.no_grad():
    # Get response from ref model and model
    ref_response_ids  = respond_to_batch(model_ref, query_ids)
    ref_response_text = gpt2_tokenizer.batch_decode(ref_response_ids)[0]
    model_response_ids  = respond_to_batch(model, query_ids)
    model_response_text = gpt2_tokenizer.batch_decode(model_response_ids)[0]

    # Encode reference response and get reward value for it
    ref_response_bert = bert_tokenizer.encode(query_text + ref_response_text, return_tensors='pt').to('cuda')
    ref_reward = reward_model(ref_response_bert).logits.item()

    # Encode model response and get reward value for it
    model_response_bert = bert_tokenizer.encode(query_text + model_response_text, return_tensors='pt').to('cuda')
    model_reward = reward_model(model_response_bert).logits.item()

  ref_rewards.append(ref_reward)
  model_rewards.append(model_reward)

print(f"Mean reference model reward = {np.mean(ref_rewards)}")
print(f"Mean fine-tuned model reward = {np.mean(model_rewards)}")