In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from tqdm import tqdm

In [2]:
# Hyperparams
batch_size=4
max_length=256+128

In [3]:
class CustomDataset(Dataset):
    def __init__(self, examples):
        self.examples=examples
        # self.tokenizer = tokenizer

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return self.examples[idx]["prompt"]

In [4]:
# GPU Config
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
print(torch.cuda.get_device_name(device))

cuda:0
NVIDIA A100-SXM4-80GB


In [5]:
# Load HF PPO Model
ppo_model_id = "smadala2/gpt2_ppo"
ppo_model = GPT2LMHeadModel.from_pretrained(ppo_model_id)
ppo_config = GPT2Config.from_pretrained(ppo_model_id)
print(ppo_model.config._name_or_path)

# Load another ppo trained model from HF
ppo_model_id2 = "jtatman/gpt2-open-instruct-v1-Anthropic-hh-rlhf"
ppo_model2 = GPT2LMHeadModel.from_pretrained(ppo_model_id2)

# Load HF raw GPT2 Model
raw_model_id = "gpt2"
raw_model = GPT2LMHeadModel.from_pretrained(raw_model_id)
raw_config = GPT2Config.from_pretrained(raw_model_id)
print(raw_model.config._name_or_path)



smadala2/gpt2_ppo
gpt2


In [6]:
# params1 = list(ppo_model.parameters())
# params2 = list(raw_model.parameters())

# # Calculate differences
# differences = []
# for param1, param2 in zip(params1, params2):
#     diff = torch.norm(param1 - param2, p=2)  # L2 norm
#     differences.append(diff.item())

# # Aggregate differences
# total_difference = sum(differences)
# print(total_difference)

In [7]:
# Load Reward Model
def freeze_model(model):
    for name, param in model.named_parameters():
        param.requires_grad = False
        
reward_model = AutoModelForSequenceClassification.from_pretrained(
                'Ray2333/gpt2-large-harmless-reward_model',
                num_labels=1).to(device)

freeze_model(reward_model)
freeze_model(ppo_model)
freeze_model(ppo_model2)

In [8]:
# Load tokenizer
tokenizer_id = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_id, padding="max_length", direction="left", padding_side="left", max_length=256, length=256)

# if tokenizer.pad_token is None:
#     print("Setting pad token")
#     tokenizer.pad_token = tokenizer.eos_token
    
# if model.config.pad_token_id is None:
#     print("Setting pad token id")
#     model.config.pad_token_id = model.config.eos_token_id


if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("Setting pad token id")
    ppo_model.config.pad_token_id = ppo_model.config.eos_token_id
    ppo_model2.config.pad_token_id = ppo_model2.config.eos_token_id
    raw_model.config.pad_token_id = raw_model.config.eos_token_id
    
if reward_model.config.pad_token_id is None:
    reward_model.config.pad_token_id = ppo_model.config.pad_token_id
    print("Setting pad token id")

print(len(tokenizer))
print(tokenizer.pad_token)
print(ppo_model.config.pad_token_id)
print(raw_model.config.pad_token_id)
print(reward_model.config.pad_token_id)

Setting pad token id
Setting pad token id
50257
<|endoftext|>
50256
50256
50256


In [9]:
# Load test dataset
dataset_id = "Dahoas/full-hh-rlhf"
dataset = load_dataset(dataset_id)
test_dataset = CustomDataset(dataset['test'])

print(len(test_dataset))
print(test_dataset[0])

DEBUG = False
if DEBUG == True:
    # Get 10 samples for testing
    test_dataset = test_dataset[:10]    
    test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)
else:
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
print(len(test_dataloader))
# # Print some samples
# for i, sample in enumerate(test_dataloader):
#     print(i, sample)
#     if i == 10:
#         break

12451


Human: I've been seeing a lot of slugs outside recently, even crawling up trees. Should I do something about them, or just let them be?

Assistant:
3113


In [10]:
def forward_pass(model, inputs, device):
    outputs_test = model.generate(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, \
        max_length=max_length, num_return_sequences=1, return_dict_in_generate=True, \
            output_scores=True)
    
    logits_test = torch.stack(outputs_test.scores, dim=1).to(device)
    outputs_ids_test = logits_test.argmax(-1) 
    outputs_attn_mask = (outputs_ids_test != ppo_model.config.pad_token_id).float().to(device)

    concatenated_input = {'input_ids': torch.cat([inputs["input_ids"], outputs_ids_test], dim=1).to(device),
                          'attention_mask': torch.cat([inputs["attention_mask"], outputs_attn_mask], dim=1).to(device)
                            }
    
    reward = torch.mean(reward_model(input_ids=concatenated_input["input_ids"], attention_mask= concatenated_input["attention_mask"]).logits.cpu()).item() 
    return reward

In [None]:
# Perform evaluation on dataset w/ PPO GPT2
raw_model.eval()
ppo_model.eval()
ppo_model2.eval()

raw_model.to(device)
ppo_model.to(device)
ppo_model2.to(device)

raw_rewards, raw_test_losses = [], []
ppo_rewards, ppo_test_losses = [], []
ppo_rewards2, ppo_test_losses2 = [], []

with torch.no_grad():
    
    for i, batch in tqdm(enumerate(test_dataloader)):
        inputs_test = tokenizer(batch, return_tensors="pt", padding="max_length", truncation=True, max_length=256).to(device)
        
        # Forward pass for GPT2-PPO
        ppo_reward = forward_pass(ppo_model, inputs_test, device)
        ppo_reward2 = forward_pass(ppo_model2, inputs_test, device)
        
        # Forward pass for GPT2-Raw
        raw_reward = forward_pass(raw_model, inputs_test, device)
    
        # Collect rewards and losses 
        ppo_rewards = ppo_rewards + [ppo_reward]*batch_size
        ppo_test_losses.append(1-ppo_reward)

        ppo_rewards2 = ppo_rewards2 + [ppo_reward2]*batch_size
        ppo_test_losses2.append(1-ppo_reward2)
        
        raw_rewards = raw_rewards + [raw_reward]*batch_size
        raw_test_losses.append(1-raw_reward)

0it [00:00, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
1it [00:12, 12.93s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
2it [00:26, 13.10s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
3it [00:39, 13.16s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
4it [00:49, 11.79s/it]Setting `pad_token_id` to `eos

In [None]:
# Print and save PPO results
print("PPO Test Loss: ", sum(ppo_test_losses)/len(ppo_test_losses))
print("PPO Test Reward: ", sum(ppo_rewards)/len(ppo_rewards))

# Print and save Raw results
print("Raw Test Loss: ", sum(raw_test_losses)/len(raw_test_losses))
print("Raw Test Reward: ", sum(raw_rewards)/len(raw_rewards))

In [None]:
print(len(ppo_rewards))
print(ppo_rewards[0:50])