In [None]:
!pip install trl
!pip install transformers
!pip install argilla

In [None]:
import torch
from transformers import AutoTokenizer, pipeline
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer
from trl.core import LengthSampler

In [None]:
# Hugging Face Hub id of the reward model; the same id is used to load its tokenizer.
reward_model = "argilla/roberta-base-reward-model-falcon-dolly"
reward_tokenizer = "argilla/roberta-base-reward-model-falcon-dolly"

# `mini_batch_size` is set explicitly so the config does not depend on the default of the
# installed TRL release: `batch_size` has to be a multiple of
# `mini_batch_size * gradient_accumulation_steps`, and some releases default
# `mini_batch_size` to a value larger than the tiny batch used here.
config = PPOConfig(model_name="gpt2", batch_size=2, mini_batch_size=1)

In [None]:
# Policy to optimise and a second copy of the same checkpoint used as the reference model.
model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)

tokenizer = AutoTokenizer.from_pretrained(config.model_name)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Text-classification pipeline around the reward model; its score is the PPO reward.
reward_pipe = pipeline(
    "text-classification",
    model=reward_model,
    tokenizer=reward_tokenizer,
)

In [None]:
import argilla as rg
from datasets import Dataset

feedback_dataset = rg.FeedbackDataset.from_huggingface("argilla/databricks-dolly-15k-curated-en")

# Output column -> name of the annotation question that holds its curated text.
question_for_column = {
    "instruction": "new-instruction",
    "context": "new-context",
    "response": "new-response",
}

data = {column: [] for column in question_for_column}
for record in feedback_dataset:
    # Records without any annotator response contribute nothing.
    if not record.responses:
        continue
    # Only the first response of each record is used.
    answers = record.responses[0].values
    for column, question in question_for_column.items():
        data[column].append(answers[question].value)

dataset = Dataset.from_dict(data)
dataset

In [None]:
def formatting_func(examples):
    """Turn one dataset row into a short PPO query.

    The instruction, context and response texts are concatenated, tokenized,
    and cut down to a random prefix whose length is drawn from
    ``LengthSampler(min_value=2, max_value=8)``.

    Args:
        examples: A single row (the function is mapped with ``batched=False``)
            with the keys ``"instruction"``, ``"context"`` and ``"response"``.

    Returns:
        The same row with two added keys: ``"input_ids"`` (the truncated token
        ids) and ``"query"`` (the text decoded from exactly those ids).
    """
    kwargs = {
        "padding": "max_length", "truncation": True,
        "max_length": 512, "return_tensors": "pt"
    }
    input_size = LengthSampler(min_value=2, max_value=8)
    # Treat a missing field as empty text so the concatenation cannot fail on None.
    input_text = "".join(
        examples[key] or "" for key in ("instruction", "context", "response")
    )
    # `encode` is asked for tensors, so `[0]` drops the leading batch dimension
    # before the prefix is sliced off.
    examples["input_ids"] = tokenizer.encode(input_text, **kwargs)[0][: input_size()]
    # Decode the whole prefix (not just its first token) so that "query" is the
    # text form of exactly the ids stored in "input_ids".
    examples["query"] = tokenizer.decode(examples["input_ids"])
    return examples

In [None]:
# Tokenize row by row, then have the dataset hand back torch tensors on access.
formatted_dataset = dataset.map(formatting_func, batched=False).with_format(type="torch")

In [None]:
def collator(data):
    """Regroup a list of per-sample dicts into one dict of per-key lists.

    The keys are taken from the first sample; every sample is expected to
    provide all of them. Values are collected as plain lists, not stacked.
    """
    batch = {}
    for key in data[0]:
        batch[key] = [sample[key] for sample in data]
    return batch

In [None]:
# PPO trainer over the tokenized queries; the collator keeps each batch as plain
# lists, so the queries do not have to share a length.
ppo_trainer = PPOTrainer(
    config,
    model,
    ref_model,
    tokenizer,
    dataset=formatted_dataset,
    data_collator=collator,
)

# Length of each generated response is re-sampled per query inside the training loop.
output_min_length = 4
output_max_length = 16
output_length_sampler = LengthSampler(output_min_length, output_max_length)

# Sampling settings passed to every generate() call.
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}

In [None]:
# One PPO optimisation step per batch delivered by the trainer's dataloader.
for epoch, batch in enumerate(ppo_trainer.dataloader):
    query_tensors = batch["input_ids"]

    #### Get response from gpt2
    response_tensors = []
    for query in query_tensors:
        gen_len = output_length_sampler()
        generation_kwargs["max_new_tokens"] = gen_len
        response = ppo_trainer.generate(query, **generation_kwargs)
        # Keep only the newly generated tokens. Slicing by the query length (rather
        # than by `gen_len`) stays correct when generation stops early and returns
        # fewer than `gen_len` new tokens.
        response_tensors.append(response.squeeze()[len(query):])
    batch["response"] = [tokenizer.decode(r) for r in response_tensors]

    #### Compute reward score
    # The reward model scores each query together with its generated continuation.
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = reward_pipe(texts, return_all_scores=True)
    rewards = [torch.tensor(output[0]["score"]) for output in pipe_outputs]

    #### Run PPO step
    # Update the policy from the queries, the responses and their rewards.
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)