# LLM을 위한 RLHF - 2. RM 모델 만들기


효율적인 RLHF를 위해서는 적절한 RM 모델, 즉 보상모델이 있는 것이 좋습니다. 하지만 이를 만드는 것 역시 생각처럼 쉽지가 않습니다. 어떠한 과정을 통해서 RM 모델이 만들어지는지 확인해봅시다.

## 필요 라이브러리 설치

RM 모델은 모델이 생성한 결과값 또는 레이블러들이 작성한 답안들 중에서 어떤 내용이 더 적절하고 좋은 답변인지 판단해 줄 수 있는 모델입니다. 이는 생각보다 쉽지 않은 작업으로, 작은 모델로는 잘 학습되지 않습니다. 아래의 실습은 gpt2와 llama3 모델을 각각 사용하여 결과를 비교해 볼 수 있습니다.

In [None]:
!pip install -q transformers accelerate sentencepiece
!huggingface-cli login --token # your own token

In [None]:
!pip install evaluate sklearn
!pip install trl[peft]
!pip install bitsandbytes loralib
# !pip install git+https://github.com/huggingface/transformers.git@main
# optional: wandb
!pip install wandb

이전에는 아래와 같은 코드로 모델을 불러와 학습시켰지만, 이번에는 코드의 유지 보수와 관리가 더 쉽도록 작성된 코드로 실습을 진행합니다.

In [None]:
# from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
# from peft import LoraConfig, get_peft_model
# from trl import AutoModelForCausalLMWithValueHead

# model_name = 'meta-llama/Meta-Llama-3-8B'

# # Add LoRA adapters
# lora_config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM"
# )

# model = AutoModelForCausalLMWithValueHead.from_pretrained(
#     model_name,
#     load_in_8bit=True,
#     device_map={"": 0},
#     peft_config=lora_config
#     )


# model

필요 모듈을 불러옵니다.

In [None]:
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import evaluate
import numpy as np
import torch
import torch.nn as nn
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    HfArgumentParser,
    PreTrainedTokenizerBase,
    Trainer,
    TrainerCallback,
    TrainingArguments,
    set_seed,
)
from transformers.utils import PaddingStrategy

## 모델 학습을 위한 클래스 준비

dataclasses 모듈의 dataclass를 사용하면 클래스를 보다 손쉽고 빠르게 정의할 수 있습니다. 추후 파이썬 실행 파일(.py)로 관리하기 쉽도록 아래와 같은 클래스를 정의합니다. 이와 같은 클래스는 HfArgumentParser와 함께 사용하면 터미널에서 사용자 정의 입력을 손쉽게 받을 수 있습니다.

In [None]:

# Data class that gathers every tunable setting in one place so the code can be reused.
@dataclass
class ScriptArguments:
    """
    Settings for the reward-model training run.

    Sensible values depend on how many GPUs are available, what they support,
    and how large the model to train is. Fields that carry a ``help`` entry in
    their metadata expose that text as their description.
    """

    # ---- distributed execution / resuming ----
    local_rank: Optional[int] = field(default=-1, metadata=dict(help="Used for multi-gpu"))
    resume_from_checkpoint: Optional[bool] = field(
        default=False, metadata=dict(help="If you want to resume training where it left off.")
    )
    deepspeed: Optional[str] = field(
        default=None,
        metadata=dict(
            help="Path to deepspeed config if using deepspeed. You may need this if the model that you want to train doesn't fit on a single GPU."
        ),
    )

    # ---- batch sizes and optimisation hyper-parameters ----
    per_device_train_batch_size: Optional[int] = 8
    per_device_eval_batch_size: Optional[int] = 8
    gradient_accumulation_steps: Optional[int] = 1
    learning_rate: Optional[float] = 2e-5
    weight_decay: Optional[float] = 0.001

    # ---- model / tokenizer selection ----
    model_name: Optional[str] = field(
        default="meta-llama/Meta-Llama-3-8B",
        metadata=dict(
            help="The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc."
        ),
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata=dict(help="The tokenizer for your model, if left empty will use the default for your model"),
    )

    # ---- precision ----
    bf16: Optional[bool] = field(
        default=True,
        metadata=dict(
            help="This essentially cuts the training time in half if you want to sacrifice a little precision and have a supported GPU."
        ),
    )

    # ---- amount of training / data ----
    num_train_epochs: Optional[int] = field(
        default=1, metadata=dict(help="The number of training epochs for the reward model.")
    )
    train_subset: Optional[int] = field(
        default=100000, metadata=dict(help="The size of the subset of the training data to use")
    )
    eval_subset: Optional[int] = field(
        default=50000, metadata=dict(help="The size of the subset of the eval data to use")
    )

    # ---- memory / optimizer / schedule ----
    gradient_checkpointing: Optional[bool] = field(
        default=False, metadata=dict(help="Enables gradient checkpointing.")
    )
    optim: Optional[str] = field(default="adamw_hf", metadata=dict(help="The optimizer to use."))
    lr_scheduler_type: Optional[str] = field(default="linear", metadata=dict(help="The lr scheduler"))

    # ---- sequence length, evaluation, reproducibility ----
    max_length: Optional[int] = 512
    eval_first_step: Optional[bool] = field(
        default=False, metadata=dict(help="Whether to run eval after the first step")
    )
    seed: Optional[int] = field(
        default=0, metadata=dict(help="Random seed that will be set at the beginning of training.")
    )




In [None]:
# When this code is run as a real script (.py file), uncomment the two `parser` lines below
# so that the parameters can be supplied on the command line.
# Example invocation:
"""
torchrun --nnodes 1  \
         --nproc_per_node 8 ./reward_modeling.py \
         --model_name=<LLAMA_SE_MODEL>
"""
# parser = HfArgumentParser(ScriptArguments)
# script_args = parser.parse_args_into_dataclasses()[0]
script_args = ScriptArguments() # in real use, replace this line with the two commented lines above
# Seed the run with the configured seed before anything else happens.
set_seed(script_args.seed)



[StackExchange](https://stackexchange.com/)의 데이터를 불러옵니다. 학습에 바로 사용할 수 있도록 가공된 데이터셋입니다.   
[기존 모습 확인](https://huggingface.co/datasets/HuggingFaceH4/stack-exchange-preferences)

In [None]:

# Load the human stack-exchange-paired dataset for tuning the reward model.
train_dataset = load_dataset(
    "lvwerra/stack-exchange-paired", data_dir="data/reward", split="train", verification_mode="no_checks"
)
if script_args.train_subset > 0:
    # Clamp to the dataset size: selecting indices past the last row raises an error.
    train_dataset = train_dataset.select(range(min(script_args.train_subset, len(train_dataset))))

# The evaluation data lives in its own directory; its only split is also named "train".
eval_dataset = load_dataset(
    "lvwerra/stack-exchange-paired", data_dir="data/evaluation", split="train", verification_mode="no_checks"
)
if script_args.eval_subset > 0:
    # Same clamping as for the training subset.
    eval_dataset = eval_dataset.select(range(min(script_args.eval_subset, len(eval_dataset))))

# Build the output directory name from the model name (without the hub namespace),
# the requested training-subset size and the learning rate.
# The training args that use it need to be defined before the model is loaded if you are using deepspeed.
model_name_split = script_args.model_name.split("/")[-1]
output_name = (
    f"{model_name_split}_peft_stack-exchange-paired_rmts__{script_args.train_subset}_{script_args.learning_rate}"
)



학습을 위한 파라미터 설정을 진행합니다.   
이전에 정의한 ScriptArguments 객체에 기본값이 담겨 있습니다. 수정해봐도 좋습니다.

In [None]:
# Assemble the Trainer configuration; most values are forwarded from `script_args`.
training_args = TrainingArguments(
    # --- where checkpoints go / reproducibility ---
    output_dir=output_name,
    seed=script_args.seed,
    # --- optimisation ---
    learning_rate=script_args.learning_rate,
    weight_decay=script_args.weight_decay,
    optim=script_args.optim,
    lr_scheduler_type=script_args.lr_scheduler_type,
    num_train_epochs=script_args.num_train_epochs,
    # --- batching ---
    per_device_train_batch_size=script_args.per_device_train_batch_size,
    per_device_eval_batch_size=script_args.per_device_eval_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    # --- memory / precision / distributed ---
    gradient_checkpointing=script_args.gradient_checkpointing,
    bf16=script_args.bf16,
    deepspeed=script_args.deepspeed,
    local_rank=script_args.local_rank,
    # --- evaluation, checkpointing and logging cadence (all counted in steps) ---
    eval_strategy="steps",
    eval_steps=2000,
    save_strategy="steps",
    save_steps=500,
    logging_strategy="steps",
    logging_steps=10,
    # --- dataset handling ---
    # The collator reads the custom `*_j` / `*_k` columns, so they must not be dropped.
    remove_unused_columns=False,
    # The pairwise batches contain no label column.
    label_names=[],
)


In [None]:
# Load the tokenizer. When no tokenizer name is configured, the model name is used instead.
tokenizer_name = script_args.tokenizer_name if script_args.tokenizer_name is not None else script_args.model_name
print(f"{tokenizer_name=}")
# `use_auth_token` is deprecated in transformers; `token=True` reuses the credential
# stored by `huggingface-cli login`.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, token=True)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token


In [None]:
# PEFT (LoRA) configuration for the sequence-classification model
peft_config = LoraConfig(
    r=8,  # LoRA rank
    lora_alpha=32,
    lora_dropout=0.1,
    inference_mode=False,
    task_type=TaskType.SEQ_CLS,
)



In [None]:
# Prepare the model: a sequence-classification model with a single output (num_labels=1),
# loaded in bfloat16.
model = AutoModelForSequenceClassification.from_pretrained(
    script_args.model_name, num_labels=1, torch_dtype=torch.bfloat16
)
# Bare expression so the notebook displays the model.
model


In [None]:
# Apply LoRA: wrap the model with the PEFT configuration and report the trainable parameters.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# Bare expression so the notebook displays the wrapped model.
model


In [None]:
# Need to do this for gpt2, because it doesn't have an official pad token.
# The exercise starts with GPT-2, so a suitable padding token is set here
# (this may need to change depending on the model).
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
# `use_cache` is switched off exactly when gradient checkpointing is enabled.
model.config.use_cache = not script_args.gradient_checkpointing
num_proc = 24  # Can adjust to be higher if you have more processors.
# Remember the raw column names of the training set.
original_columns = train_dataset.column_names



In [None]:
# Turn each row into two tokenized texts built from the same question:
# the `_j` text uses `response_j` and the `_k` text uses `response_k`.
def preprocess_function(examples):
    """Tokenize a batch of (question, response_j, response_k) rows.

    Args:
        examples: Batch dict with the parallel lists ``"question"``,
            ``"response_j"`` and ``"response_k"``.

    Returns:
        Dict with the lists ``input_ids_j``, ``attention_mask_j``,
        ``input_ids_k`` and ``attention_mask_k``, one entry per input row.
    """

    def _encode(question, response):
        # Both answers share the same prompt template.
        return tokenizer("Question: " + question + "\n\nAnswer: " + response, truncation=True)

    ids_j, mask_j, ids_k, mask_k = [], [], [], []
    rows = zip(examples["question"], examples["response_j"], examples["response_k"])
    for question, answer_j, answer_k in rows:
        encoded_j = _encode(question, answer_j)
        encoded_k = _encode(question, answer_k)

        ids_j.append(encoded_j["input_ids"])
        mask_j.append(encoded_j["attention_mask"])
        ids_k.append(encoded_k["input_ids"])
        mask_k.append(encoded_k["attention_mask"])

    return {
        "input_ids_j": ids_j,
        "attention_mask_j": mask_j,
        "input_ids_k": ids_k,
        "attention_mask_k": mask_k,
    }




In [None]:
# Tokenize both splits, then keep only the rows in which both tokenized texts
# are at most script_args.max_length tokens long.
def _within_max_length(example):
    """Return True when neither tokenized text of `example` exceeds script_args.max_length."""
    limit = script_args.max_length
    return len(example["input_ids_j"]) <= limit and len(example["input_ids_k"]) <= limit


# Shared settings for both `map` calls; the raw columns are dropped after tokenization.
tokenize_kwargs = dict(batched=True, num_proc=num_proc, remove_columns=original_columns)

train_dataset = train_dataset.map(preprocess_function, **tokenize_kwargs).filter(_within_max_length)

eval_dataset = eval_dataset.map(preprocess_function, **tokenize_kwargs).filter(_within_max_length)




In [None]:
# Custom data collator: every example carries two inputs (`_j` and `_k`) that are scored against each other.
@dataclass
class RewardDataCollatorWithPadding:
    """Pad the `_j` and `_k` sequences of a batch separately and merge them into one dict.

    Attributes are forwarded unchanged to ``tokenizer.pad``.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"

    def _pad_side(self, features, suffix):
        """Collect the columns ending in `suffix` from every feature and pad them as one batch."""
        side = [
            {
                "input_ids": item["input_ids_" + suffix],
                "attention_mask": item["attention_mask_" + suffix],
            }
            for item in features
        ]
        return self.tokenizer.pad(
            side,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Build one batch from `features`.

        Args:
            features: Rows holding ``input_ids_j``, ``attention_mask_j``,
                ``input_ids_k`` and ``attention_mask_k``.

        Returns:
            Dict with the padded `_j` and `_k` inputs plus ``"return_loss": True``.
        """
        padded_j = self._pad_side(features, "j")
        padded_k = self._pad_side(features, "k")
        return {
            "input_ids_j": padded_j["input_ids"],
            "attention_mask_j": padded_j["attention_mask"],
            "input_ids_k": padded_k["input_ids"],
            "attention_mask_k": padded_k["attention_mask"],
            "return_loss": True,
        }




In [None]:
# Load the metric object used to evaluate the model
accuracy = evaluate.load("accuracy")
# Bare expression so the notebook displays the metric.
accuracy


In [None]:
def compute_metrics(eval_pred):
    """Compute how often the `_j` reward is the larger of the two.

    Args:
        eval_pred: Pair whose first element holds rewards_j and rewards_k
            (in that order along axis 0); the second element is ignored.

    Returns:
        The result of ``accuracy.compute`` against an all-zero reference.
    """
    stacked_rewards, _ignored = eval_pred
    # argmax over axis 0 yields index 0 wherever rewards_j is the maximum.
    winner_index = np.argmax(stacked_rewards, axis=0)
    # The expected winner is always index 0, i.e. rewards_j.
    expected_index = np.zeros(winner_index.shape)
    return accuracy.compute(predictions=winner_index, references=expected_index)


## train model

In [None]:
# Trainer subclass that defines the loss used for training
class RewardTrainer(Trainer):
    """Trainer using the InstructGPT pairwise log-loss (https://arxiv.org/abs/2203.02155)."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Score both sequences of every pair and return the pairwise loss.

        Args:
            model: Model called once on the `_j` inputs and once on the `_k` inputs.
            inputs: Batch dict with ``input_ids_j``, ``attention_mask_j``,
                ``input_ids_k`` and ``attention_mask_k``.
            return_outputs: When true, also return both reward tensors.
            num_items_in_batch: Unused. Accepted because newer ``Trainer``
                versions pass it as a keyword argument; without it the call
                fails with a ``TypeError``.

        Returns:
            The loss, or ``(loss, {"rewards_j": ..., "rewards_k": ...})`` when
            `return_outputs` is true.
        """
        # Element 0 of the model output is used as the reward of each sequence.
        rewards_j = model(input_ids=inputs["input_ids_j"], attention_mask=inputs["attention_mask_j"])[0]
        rewards_k = model(input_ids=inputs["input_ids_k"], attention_mask=inputs["attention_mask_k"])[0]
        # The loss shrinks as rewards_j grows relative to rewards_k.
        loss = -nn.functional.logsigmoid(rewards_j - rewards_k).mean()
        if return_outputs:
            return loss, {"rewards_j": rewards_j, "rewards_k": rewards_k}
        return loss



In [None]:
# Build the trainer from the model, the training arguments, both datasets,
# the metric function and the pairwise collator.
# Training itself only starts when `trainer.train(...)` is called.
trainer = RewardTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=RewardDataCollatorWithPadding(tokenizer=tokenizer, max_length=script_args.max_length),
)




In [None]:
# Optionally register a callback that requests an evaluation after the first step.
if script_args.eval_first_step:

    class EvaluateFirstStepCallback(TrainerCallback):
        """Set `control.should_evaluate` once the global step counter reaches 1."""

        def on_step_end(self, args, state, control, **kwargs):
            if state.global_step == 1:
                control.should_evaluate = True

    trainer.add_callback(EvaluateFirstStepCallback())

# Run training; `resume_from_checkpoint` is passed straight through from the script arguments.
trainer.train(script_args.resume_from_checkpoint)

# Save the tokenizer and the model side by side in the "<output_name>_peft_last_checkpoint" directory.
print("Saving last checkpoint of the model")
tokenizer.save_pretrained(output_name + "_peft_last_checkpoint")
model.save_pretrained(output_name + "_peft_last_checkpoint")

In [None]:
!pip install huggingface_hub

In [None]:
# Save the tokenizer files into the "<output_name>_peft_last_checkpoint" directory.
tokenizer.save_pretrained(output_name + "_peft_last_checkpoint")

In [None]:
# Upload the saved checkpoint folder through the Hugging Face Hub API.
# Hard to use for Llama because of the size of the files.
from huggingface_hub import HfApi

# Create the API client
api = HfApi()
repo_name = "reward_modeling"
user_name = None # your own user name; when left as None, the logged-in account is used
if user_name is None:
    # Avoid building the literal repo id "None/reward_modeling".
    user_name = api.whoami()["name"]
# Use one fully qualified repo id for both creation and upload so they always agree.
repo_id = f"{user_name}/{repo_name}"

# Create the repository (no error if it already exists)
api.create_repo(repo_id=repo_id, exist_ok=True)

# Upload the model
api.upload_folder(
    folder_path=output_name + "_peft_last_checkpoint",  # folder the model was saved to
    repo_id=repo_id,  # Hugging Face repository name
    repo_type="model",  # upload type (model, dataset, space, ...)
)


In [None]:
# For Llama: upload only the adapter parameters instead of the whole checkpoint.
import os

from huggingface_hub import HfApi

# The client picks up the token stored by `huggingface-cli login`.
api = HfApi()

# Resolve the namespace from the logged-in account; the previous literal
# "{your_username}/lora_for_RM" was a plain string, not an f-string.
repo_name = f"{api.whoami()['name']}/lora_for_RM"

# Create the model repository (no error if it already exists)
api.create_repo(repo_id=repo_name, repo_type="model", exist_ok=True)

# Collect the trained parameters after LoRA was applied.
# Besides the LoRA matrices this keeps the parameters PEFT stores under
# `modules_to_save` (for SEQ_CLS that is the classification head); without them
# the uploaded file would not contain the trained reward head.
lora_parameters = {
    k: v for k, v in model.state_dict().items() if "lora" in k or "modules_to_save" in k
}

# Save the parameters locally
local_dir = "lora_for_RM"
os.makedirs(local_dir, exist_ok=True)
lora_path = os.path.join(local_dir, "lora_parameters.pth")
torch.save(lora_parameters, lora_path)

# Push the file to the Hub
api.upload_file(
    path_or_fileobj=lora_path,
    path_in_repo="lora_parameters.pth",
    repo_id=repo_name,
    repo_type="model",
    commit_message="Upload LoRA parameters",
)

In [None]:
# Explicitly finish the wandb run
# import wandb
# wandb.finish()