
## PPO

* Written against the old TRL PPO API (`PPOConfig` / `PPOTrainer` with `generate`, `step` and `log_stats`).


In [1]:

import torch
from tqdm import tqdm
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead, AutoModelForCausalLMWithValueHead, set_seed
tqdm.pandas()

## Uncomment if using sentence similarity as reward function
#from sentence_transformers import SentenceTransformer, util


In [2]:

## Define parameters for PPO training
ppo_config = PPOConfig(
    ## Policy checkpoint. Previously tried alternative: model_name="gpt2",
    model_name="distilbert/distilgpt2",
    ## Dataset id; read by `build_dataset` and passed to `load_dataset`
    query_dataset="gsm8k",
    ## Only read by the optional sentence-similarity reward cell (commented out in this notebook)
    reward_model="sentence-transformers/all-MiniLM-L6-v2",
    learning_rate=1.41e-05,    ## original note: "32" (meaning unclear -- TODO confirm)
    mini_batch_size=16,
    batch_size=16,
    ## Also passed to `set_seed` right after this call
    seed=0,
    ## NOTE(review): the training loop also uses this value as its number of outer passes
    ppo_epochs=5,
    ## presumably keeps the text columns ("question", "answer") that the training loop reads -- verify
    remove_unused_columns=False
)

## Seed the run from the config so it can be repeated
set_seed(ppo_config.seed)




In [3]:

## Build dataset for PPO training. Inputs are pre-encoded for use during training

def build_dataset(config):
    """Load the query dataset and attach pre-tokenized prompts to every row.

    Loads the ``'main'`` configuration, ``"train"`` split, of
    ``config.query_dataset`` and adds an ``input_ids`` column containing the
    token ids of each row's ``question`` text, encoded with the tokenizer of
    ``config.model_name``.

    Args:
        config: Object exposing ``model_name`` and ``query_dataset`` attributes.

    Returns:
        The mapped dataset, with its output format set to ``"torch"``.
    """
    prompt_tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    # Reuse the end-of-sequence token as the padding token.
    prompt_tokenizer.pad_token = prompt_tokenizer.eos_token

    def add_input_ids(row):
        # Encode one question at a time (the map below is not batched).
        row["input_ids"] = prompt_tokenizer.encode(row["question"])
        return row

    ## load gsm8k with datasets
    train_split = load_dataset(config.query_dataset, name='main', split="train")
    ## Optional subsampling for quick experiments:
    #train_split = train_split.shuffle(seed=42)
    #train_split = train_split.select(range(500))
    encoded_split = train_split.map(add_input_ids, batched=False)
    encoded_split.set_format(type="torch")

    return encoded_split


In [4]:


def collator(data):
    """Turn a list of per-sample dicts into one dict of per-field lists.

    The field names are taken from the first sample; every sample must
    provide each of those fields. Values are collected as plain lists
    (no stacking or padding), in the order the samples were given.
    """
    batch = {}
    for field in data[0]:
        batch[field] = [sample[field] for sample in data]
    return batch



In [5]:

## Create model. Change to AutoModelForSeq2SeqLMWithValueHead for seq2seq models such as t5
## Both the policy and the reference model are loaded directly in bfloat16 so that
## they start from identical weights. Loading in float16 and then recasting only the
## policy rounds its weights twice and leaves the reference model in a different dtype.

model     = AutoModelForCausalLMWithValueHead.from_pretrained(ppo_config.model_name, torch_dtype=torch.bfloat16)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(ppo_config.model_name, torch_dtype=torch.bfloat16)


## Cast every remaining floating-point parameter of the policy wrapper to bfloat16
## (presumably this includes the value head -- verify). As the last expression of
## the cell, this also displays the model architecture.
model.bfloat16()


AutoModelForCausalLMWithValueHead(
  (pretrained_model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-5): 6 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2SdpaAttention(
            (c_attn): Conv1D(nf=2304, nx=768)
            (c_proj): Conv1D(nf=768, nx=768)
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D(nf=3072, nx=768)
            (c_proj): Conv1D(nf=768, nx=3072)
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Li

In [6]:

## Create tokenizer
## Loaded from the same checkpoint as the policy model, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(ppo_config.model_name, use_fast=True)
## Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token



In [7]:

## Create dataset: training rows with an added `input_ids` column (see `build_dataset`)
dataset = build_dataset(ppo_config)



In [8]:

## Create PPO trainer
## Combines the policy, the reference model, the tokenizer and the pre-tokenized
## dataset; `collator` batches samples as one list per column.

ppo_trainer = PPOTrainer(
    config=ppo_config,
    model=model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=collator,
)


Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:


## Set device for training
## Starts from the accelerator's device; in a single-process run it is replaced by
## GPU index 0 when CUDA is available, otherwise by "cpu".
## NOTE(review): `device` is not referenced by any later cell visible in this notebook.
device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug



In [10]:

## Uncomment if using sentence similarity as reward function
# reward_model = SentenceTransformer(ppo_config.reward_model)

# def get_reward(answer, response, question=None):
#     answer = reward_model.encode(answer)
#     response = reward_model.encode(response)
#     reward = util.cos_sim(answer, response)[0][0]
#     if reward > 0.9:
#         return torch.tensor(2)
#     if question is not None:
#         question = reward_model.encode(question)
#         punish = util.cos_sim(question, response)[0][0]
#         if punish > 0.97:
#             return torch.tensor(-2)
#         else:
#             return reward - punish
    
#     return reward


In [11]:


## Define heuristic reward function
def get_reward(answer, response, question=None):
    """Heuristic reward: count reference calculation steps and the final answer found in a response.

    The reference ``answer`` is expected to carry calculator annotations of the
    form ``<<expression=result>>`` and a final answer after a ``####`` marker.

    * +1 for every annotated ``expression`` (the text before the first ``=``)
      that occurs in ``response`` once all spaces are removed from the response.
    * +1 if the text following the first ``####`` marker occurs verbatim in
      ``response``. The text is matched exactly as written in the reference,
      including any leading whitespace.

    Empty expressions and an empty final answer are ignored: the empty string is
    a substring of every response and would otherwise always earn a point. A
    reference without a ``####`` marker gets no final-answer point instead of
    raising ``IndexError``.

    Args:
        answer: Reference solution text.
        response: Generated text to score.
        question: Unused; kept so the signature matches the alternative
            sentence-similarity reward function.

    Returns:
        A scalar ``torch.float32`` tensor holding the total reward.
    """
    no_spaces_response = response.replace(' ', '')
    # Left-hand side of every <<lhs=result>> annotation in the reference.
    correct_steps = [substr.split('>>')[0].split('=')[0] for substr in answer.split('<<')[1:]]

    reward = torch.tensor(0, dtype=torch.float32)
    for step in correct_steps:
        # Skip empty steps: '' in any_string is always True.
        if step and step in no_spaces_response:
            reward += 1

    answer_parts = answer.split('####')
    if len(answer_parts) > 1:
        final_answer = answer_parts[1]
        # Only score a final answer that has visible content.
        if final_answer.strip() and final_answer in response:
            reward += 1

    return reward


In [12]:

## Keyword arguments forwarded to `ppo_trainer.generate` in the training loop.
## Sampling is enabled, with at most 128 new tokens per response; the EOS token
## id is used for padding.
generation_kwargs = dict(
    min_length=-1,
    max_new_tokens=128,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)


In [13]:

## Training loop
## NOTE(review): `ppo_epochs` from the config is used here as the number of passes
## over the dataloader -- confirm this is the intended meaning of that setting.
for epoch in tqdm(range(ppo_trainer.config.ppo_epochs), 'epoch: '):
    ## Iterate through dataset batches
    for batch in tqdm(ppo_trainer.dataloader):
        ## Pre-tokenized prompts produced by `build_dataset`
        query_tensors = batch["input_ids"]
    
        ## Get response from LLM (the prompt is excluded via return_prompt=False)
        response_tensors = ppo_trainer.generate(query_tensors, return_prompt=False, generate_ref_response=False, **generation_kwargs)
        ## Decode to text so the string-based reward function can inspect the responses
        batch['response'] = tokenizer.batch_decode(response_tensors)
    
        # Compute reward score: one scalar reward per (answer, response, question) triple
        rewards = [get_reward(a, r, q) for a, r, q in zip(batch['answer'], batch['response'], batch['question'])]

        # Run PPO step on this batch, then log its statistics
        stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
        ppo_trainer.log_stats(stats, batch, rewards)



epoch:   0%|                                                                                                                                                                          | 0/5 [00:00<?, ?it/s]
[AYou're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[A%|▋                                                                                                                                                                      | 2/467 [00:08<31:22,  4.05s/it]
[A%|█                                                                                                                                                                      | 3/467 [00:11<30:03,  3.89s/it]
[A%|█▍                                                                                                                                                   

In [14]:


## Save the trained policy and its tokenizer to a local directory
model.save_pretrained('ppo_gpt2distil_heuristic_original')
tokenizer.save_pretrained('ppo_gpt2distil_heuristic_original')



('ppo_gpt2distil_heuristic_original/tokenizer_config.json',
 'ppo_gpt2distil_heuristic_original/special_tokens_map.json',
 'ppo_gpt2distil_heuristic_original/vocab.json',
 'ppo_gpt2distil_heuristic_original/merges.txt',
 'ppo_gpt2distil_heuristic_original/added_tokens.json',
 'ppo_gpt2distil_heuristic_original/tokenizer.json')