![RLHF PPO](rlhf_ppo.png)

[PPO Trainer (Hugging Face TRL documentation)](https://huggingface.co/docs/trl/en/ppo_trainer)

**Install dependencies**

In [None]:
!pip install datasets
!pip install peft
!pip install trl

**Import Necessary Libraries**

In [2]:
from dataclasses import dataclass, field
from typing import Optional

import torch
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig
from tqdm import tqdm
from transformers import AutoTokenizer, pipeline, HfArgumentParser

from trl import (AutoModelForCausalLMWithValueHead,
                 AutoModelForSeq2SeqLMWithValueHead,
                 PPOConfig,
                 PPOTrainer,
                 set_seed)
import accelerate
import numpy as np


from dataclasses import dataclass
from transformers import TrainingArguments
from typing import List, Literal, Optional
from trl.trainer.utils import OnPolicyConfig
from trl.trainer.ppo_config import PPOConfig


def is_xpu_available() -> bool:
    """Return whatever `accelerate.utils.is_xpu_available()` reports.

    Thin local wrapper so the rest of the notebook can call a plain function
    instead of reaching into `accelerate.utils` directly.
    """
    xpu_present = accelerate.utils.is_xpu_available()
    return xpu_present
# Enable tqdm's pandas integration for this kernel session.
tqdm.pandas()

**Framework parameters**

In [3]:
@dataclass
class ModelConfig:
    """Options describing how the policy model is loaded and adapted.

    The fields fall into three groups: generic loading options, LoRA
    hyper-parameters (`lora_*`, `use_peft`, `use_rslora`) and quantisation
    switches (`load_in_*`, `bnb_*`, `use_bnb_nested_quant`).

    Raises:
        ValueError: if both `load_in_8bit` and `load_in_4bit` are set.
    """

    # --- generic loading options ---
    model_revision: str = "main"
    torch_dtype: Optional[Literal["auto", "bfloat16", "float16", "float32"]] = None
    trust_remote_code: bool = False
    attn_implementation: Optional[str] = None

    # --- LoRA / PEFT options ---
    use_peft: bool = False
    lora_r: int = 16
    lora_alpha: int = 32
    lora_dropout: float = 0.05
    lora_target_modules: Optional[List[str]] = None
    lora_modules_to_save: Optional[List[str]] = None
    lora_task_type: str = "CAUSAL_LM"
    use_rslora: bool = False

    # --- quantisation options ---
    load_in_8bit: bool = False
    load_in_4bit: bool = False
    bnb_4bit_quant_type: Literal["fp4", "nf4"] = "nf4"
    use_bnb_nested_quant: bool = False

    def __post_init__(self):
        """Validate the quantisation flags and normalise `lora_target_modules`."""
        both_precisions_requested = self.load_in_8bit and self.load_in_4bit
        if both_precisions_requested:
            raise ValueError("You can't use 8 bit and 4 bit precision at the same time")

        # A one-element list is collapsed to the bare element it contains.
        targets = self.lora_target_modules
        if isinstance(targets, list) and len(targets) == 1:
            (self.lora_target_modules,) = targets


@dataclass
class ScriptArguments:
    """Notebook-level settings parsed with `HfArgumentParser`.

    Fix: several defaults used to end with a trailing comma
    (e.g. `batch_size: int=64,`), which made the default a one-element tuple
    such as `(64,)` instead of the scalar `64`. The commas are removed so each
    default matches its annotation, and `log_with` is annotated as optional
    because its default is `None`.
    """

    # Policy model checkpoint, prompt dataset and reward model.
    # `reward_model` has the form "<pipeline task>:<model name>"; the notebook
    # splits it on ":".
    model_name: str = "facebook/opt-350m"
    query_dataset: str = "Anthropic/hh-rlhf"
    reward_model: str = "text-classification:facebook/opt-350m"

    # PPO hyper-parameters.
    # NOTE(review): the visible notebook code never copies these onto the
    # parsed `PPOConfig`; confirm whether that hand-off is intended.
    learning_rate: float = 1.41e-5
    log_with: Optional[str] = None
    mini_batch_size: int = 64
    batch_size: int = 64
    gradient_accumulation_steps: int = 1
    early_stopping: bool = False
    target_kl: float = 6.0
    kl_penalty: str = "kl"

    # Selects the seq2seq value-head model class instead of the causal-LM one.
    use_seq2seq: bool = False
    # Whether to use PEFT.
    use_peft: bool = False
    peft_config: Optional[dict] = None
    trust_remote_code: bool = False

    dataset_train_split: str = "train"
    dataset_test_split: str = "test"
    config: Optional[str] = None
    gradient_checkpointing_use_reentrant: bool = False
    ignore_bias_buffers: bool = False
    seed: int = 2024




# Parse each configuration dataclass from the command line in turn. Unknown
# arguments are returned as remaining strings and ignored here, so only the
# first element (the populated dataclass) of each result is kept.
_parsed_configs = []
for _config_cls in (ScriptArguments, PPOConfig, ModelConfig):
    parser = HfArgumentParser(_config_cls)
    _parse_result = parser.parse_args_into_dataclasses(return_remaining_strings=True)
    _parsed_configs.append(_parse_result[0])

script_args, training_args, model_config = _parsed_configs

training_args.gradient_checkpointing_kwargs = {"use_reentrant": False}



**Load data and train a model by RLHF & PPO**

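**Known issue:** as saved, this cell fails with `CUDA out of memory` (see the error at the end of the output below, raised on a GPU with about 15 GB of memory). Running it to completion needs more GPU memory.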

In [4]:
# Keyword arguments forwarded to the reward (text-classification) pipeline.
# `return_all_scores=True` asks for a score for every label rather than only
# the top one, and `function_to_apply="none"` leaves the scores un-normalised.
sent_kwargs = dict(return_all_scores=True, function_to_apply="none", batch_size=16)

# Pick the value-head wrapper that matches the policy architecture.
if script_args.use_seq2seq:
    trl_model_class = AutoModelForSeq2SeqLMWithValueHead
else:
    trl_model_class = AutoModelForCausalLMWithValueHead


# Below is an example function to build the prompt dataset. It loads whichever
# dataset `query_dataset` names (this notebook passes "Anthropic/hh-rlhf") and
# relies on that dataset having a "chosen" text column. One should customize
# this function to train the model on its own dataset.
def build_dataset(config, query_dataset, tokenizer, split="train[:5%]", max_chars=256):
    """Load a dataset and turn each example into a tokenized PPO query.

    Args:
        config: Accepted for interface compatibility; not used by this function.
        query_dataset: Dataset identifier passed to `load_dataset`.
        tokenizer: Tokenizer used to encode/decode the prompts. Its `pad_token`
            is set to its `eos_token` as a side effect.
        split: Split expression passed to `load_dataset`. The default keeps the
            previously hard-coded first 5% of the train split.
        max_chars: Examples whose "chosen" text has this many characters or
            more are dropped (the check uses `len()` on the string, i.e.
            characters, not tokens). The default keeps the previous limit.

    Returns:
        The filtered dataset with added "input_ids" and "query" columns,
        formatted as torch tensors.
    """
    tokenizer.pad_token = tokenizer.eos_token
    ds = load_dataset(query_dataset, split=split)
    ds = ds.rename_columns({"chosen": "review"})
    ds = ds.filter(lambda x: len(x["review"]) < max_chars, batched=False)

    def tokenize(sample):
        # Keep only the text before the first "Assistant:" marker as the prompt.
        prompt_text = sample["review"].split("Assistant:")[0]
        sample["input_ids"] = tokenizer.encode(prompt_text)
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds



def collator(data):
    """Transpose a list of sample dicts into a dict of per-key lists.

    The keys are taken from the first sample; every sample must provide them.
    """
    first_sample = data[0]
    return {key: [sample[key] for sample in data] for key in first_sample}


# set seed before initializing value head for deterministic eval
set_seed(training_args.seed)

# Now let's build the model and the reference model.
if script_args.use_peft:
    # Fix: previously the local `ModelConfig` dataclass itself was passed as
    # `peft_config`. It is not a PEFT configuration object, so build a real
    # `LoraConfig` from its LoRA fields instead.
    peft_config = LoraConfig(
        r=model_config.lora_r,
        lora_alpha=model_config.lora_alpha,
        lora_dropout=model_config.lora_dropout,
        bias="none",
        task_type=model_config.lora_task_type,
        target_modules=model_config.lora_target_modules,
        modules_to_save=model_config.lora_modules_to_save,
        use_rslora=model_config.use_rslora,
    )
    # No separate reference model is loaded on the PEFT path.
    ref_model = None
    # Copy the model to each device
    device_map = {"": Accelerator().local_process_index}
else:
    # Without PEFT, a second copy of the checkpoint is loaded as the reference model.
    ref_model = trl_model_class.from_pretrained(
        script_args.model_name,
        trust_remote_code=script_args.trust_remote_code,
    )
    device_map = None
    peft_config = None

model = trl_model_class.from_pretrained(
    script_args.model_name,
    trust_remote_code=script_args.trust_remote_code,
    device_map=device_map,
    peft_config=peft_config,
)

# Load the tokenizer that belongs to the policy checkpoint.
tokenizer = AutoTokenizer.from_pretrained(script_args.model_name)

# Build the tokenized prompt dataset with `build_dataset`. `training_args` is
# passed as its first argument, which that function does not use.
dataset = build_dataset(training_args, script_args.query_dataset,tokenizer)

# Some tokenizers like GPT-2's don't have a padding token by default, so we set one here.
tokenizer.pad_token_id = tokenizer.eos_token_id

# We then build the PPOTrainer, passing the config, the model, the reference
# model (None when PEFT is enabled), the tokenizer, the dataset and the collator.
ppo_trainer = PPOTrainer(training_args, model, ref_model,
                         tokenizer, dataset=dataset,
                         data_collator=collator)

# Build the reward pipeline on the same device as the PPOTrainer. The task and
# the model name both come from `script_args.reward_model` ("<task>:<model>").
device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    # Single-process run: hand `pipeline` an explicit device to avoid a `pipeline` bug.
    if is_xpu_available():
        device = "xpu:0"
    elif torch.cuda.is_available():
        device = 0
    else:
        device = "cpu"

ds_plugin = ppo_trainer.accelerator.state.deepspeed_plugin
task, model_name = script_args.reward_model.split(":")

# When DeepSpeed ZeRO-3 init is enabled, it is switched off while the reward
# model is being created.
_zero3_init_active = ds_plugin is not None and ds_plugin.is_zero3_init_enabled()
if not _zero3_init_active:
    sentiment_pipe = pipeline(task, model=model_name, device=device)
else:
    with ds_plugin.zero3_init_context_manager(enable=False):
        sentiment_pipe = pipeline(task, model=model_name, device=device)

# Some tokenizers like GPT-2's don't have a padding token by default, so the
# policy tokenizer's pad id is used for the reward tokenizer and for the
# reward model config wherever one is missing.
for _pad_holder in (sentiment_pipe.tokenizer, sentiment_pipe.model.config):
    if _pad_holder.pad_token_id is None:
        _pad_holder.pad_token_id = tokenizer.pad_token_id

# Arguments forwarded to `ppo_trainer.generate` on every batch: pure sampling
# with at most 64 new tokens per response.
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": 64,
}

# Fix: wrap the dataloader itself in tqdm rather than `enumerate(...)`.
# `enumerate` has no length, so the bar used to render as "0it [..., ?it/s]"
# with no total.
# NOTE: `epoch` is the batch index within this single pass over the dataloader.
for epoch, batch in enumerate(tqdm(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    # Sample a response from the policy for each query; only the generated
    # continuation is returned (no prompt, no reference-model response).
    response_tensors = ppo_trainer.generate(
        query_tensors, return_prompt=False, generate_ref_response=False, **generation_kwargs
    )
    batch["response"] = tokenizer.batch_decode(response_tensors)

    # Score query + response with the reward pipeline. With `return_all_scores`
    # set, each output is a list of per-label dicts; the score of the first
    # entry is used as the reward.
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    rewards = [torch.tensor(output[0]["score"]) for output in pipe_outputs]

    # Run one PPO optimisation step on this batch.
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)

#### Save model
ppo_trainer.save_pretrained("save_new_ppo")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]



README.md:   0%|          | 0.00/5.77k [00:00<?, ?B/s]

train.jsonl.gz:   0%|          | 0.00/13.2M [00:00<?, ?B/s]

train.jsonl.gz:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

train.jsonl.gz:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

train.jsonl.gz:   0%|          | 0.00/25.7M [00:00<?, ?B/s]

test.jsonl.gz:   0%|          | 0.00/743k [00:00<?, ?B/s]

test.jsonl.gz:   0%|          | 0.00/875k [00:00<?, ?B/s]

test.jsonl.gz:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

test.jsonl.gz:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/160800 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8552 [00:00<?, ? examples/s]

Filter:   0%|          | 0/8040 [00:00<?, ? examples/s]

Map:   0%|          | 0/2077 [00:00<?, ? examples/s]

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
0it [00:00, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
0it [01:04, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 52.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 9.06 MiB is free. Process 7219 has 14.74 GiB memory in use. Of the allocated memory 14.12 GiB is allocated by PyTorch, and 507.22 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

**Test final model**

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer,pipeline

# Directory written by `ppo_trainer.save_pretrained` in the training cell.
model_id = './save_new_ppo'

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Fix: bind the pipeline instance to its own name. Assigning it to `pipeline`
# replaced the imported `pipeline` factory in the kernel namespace, so
# re-running the training cell afterwards would call a pipeline object
# instead of the factory.
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

# Sample three continuations for a single prompt and print each one.
sequences = text_generator(
    "Human: What are next steps in developing large language models?\n\n",
    do_sample=True,
    top_k=10,
    num_return_sequences=3,
    eos_token_id=tokenizer.eos_token_id,
    max_length=128,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")
    print("***"*10)
