# Reinforcement Learning from Verifiable Rewards (RLVR)

In [1]:
pip install transformers accelerate datasets trl




# Load Model

In [2]:
# Delete the /content/Qwen/ directory (and everything in it) before the run.
# NOTE(review): presumably this clears checkpoints written by an earlier run, since the
# trainers in this notebook use output_dir=model_name ("Qwen/...") — confirm that the
# working directory is /content.
!rm -rf /content/Qwen/ #!rm -rf /content/EleutherAI/

In [3]:
import os

import torch

# Turn off Weights & Biases reporting through the environment.
os.environ["WANDB_DISABLED"] = "True"

# Use the GPU when PyTorch reports one as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import AutoModelForCausalLMWithValueHead, create_reference_model
from peft import LoraConfig, get_peft_model

# Hugging Face Hub id of the base policy model.
model_name = "Qwen/Qwen2-0.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pad with the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

# Policy model; the reference model is derived from it before any adapters are attached.
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
ref_model = create_reference_model(model).to(device)

# LoRA settings: rank-4 adapters on the q_proj and v_proj modules only.
lora_config = LoraConfig(
    r=4,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"]
)

# Value model handed to PPOTrainer: wraps `model` and is switched to eval mode.
# NOTE(review): this wraps the same `model` object that is passed to get_peft_model below —
# confirm that sharing the underlying weights with the policy is intended.
value_model = AutoModelForCausalLMWithValueHead(model).to(device)
value_model.eval()
# Expose `base_model_prefix` and a `.model` alias on the wrapper.
# NOTE(review): presumably these are the attributes PPOTrainer looks up on a value model — confirm.
value_model.base_model_prefix = model.base_model_prefix
value_model.model = value_model.pretrained_model

# Wrap the policy with the LoRA adapters configured above.
model = get_peft_model(model, lora_config)

In [5]:
# print_trainable_parameters() writes the summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the cell output.
model.print_trainable_parameters()

trainable params: 270,336 || all params: 494,303,104 || trainable%: 0.0547
None


# Dataset

In [6]:
from datasets import Dataset

# Eight copies of the same prompt, kept under a "prompt" key.
prompts = {"prompt": ["Say something friendly:"] * 8}

# Tokenise each prompt on its own and keep the raw text next to its token ids.
list_dataset = []
for text in prompts["prompt"]:
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # The tokenizer output is batched; take the single row of each field.
    row = {field: encoded[field][0] for field in ("input_ids", "attention_mask")}
    row["prompt"] = text
    list_dataset.append(row)

dataset = Dataset.from_list(list_dataset)

dataset[0]

{'input_ids': [45764, 2494, 11657, 25],
 'attention_mask': [1, 1, 1, 1],
 'prompt': 'Say something friendly:'}

# GRPO

In [7]:
import torch
import torch.nn.functional as F

def score(self, hidden_states):
    """Project hidden states through the LM head and return log-probabilities.

    The head is taken from `self.lm_head` when that attribute exists, otherwise from
    `self.model.lm_head`. The last position along dimension 1 is dropped before the
    log-softmax is taken over the final dimension.
    """
    # Pick whichever object actually carries the language-model head.
    head = self.lm_head if hasattr(self, "lm_head") else self.model.lm_head
    vocab_logits = head(hidden_states)

    # Drop the final position, then normalise over the last dimension.
    return F.log_softmax(vocab_logits[:, :-1, :], dim=-1)

# Bind `score` as a method on the policy, value and reference models.
from types import MethodType

for patched_model in (model, value_model, ref_model):
    patched_model.score = MethodType(score, patched_model)

# Mark the value model as not being a PEFT model.
value_model.is_peft_model = False

In [8]:
# from trl.rewards import think_format_reward
import re

def think_format_reward(completions, **kwargs):
    """
    Reward completions that follow the format "<think>...</think>".

    A completion earns 1.0 when it starts with "<think>", contains no further "<think>"
    tag after that opening one, and contains a closing "</think>"; otherwise it earns 0.0.

    Args:
        completions: One entry per sampled completion. Each entry is either the completion
            text itself (a `str`) or a conversational completion, i.e. a list of message
            dicts whose first element carries the text under the "content" key.
        **kwargs: Extra columns supplied by the caller; ignored.

    Returns:
        A list of floats (1.0 or 0.0), one per completion, in the same order.
    """
    pattern = r"^<think>(?!.*<think>)(.*?)</think>.*$"

    def _completion_text(completion):
        # Plain-text completions are used as-is; conversational ones keep their text in
        # the first message's "content" field.
        if isinstance(completion, str):
            return completion
        return completion[0]["content"]

    return [
        1.0 if re.match(pattern, _completion_text(completion), re.DOTALL | re.MULTILINE) else 0.0
        for completion in completions
    ]


from trl import GRPOConfig, GRPOTrainer
import torch
import random

# NOTE(review): output_dir reuses the Hub model id, so checkpoints are written to a local
# "Qwen/Qwen2-0.5B-Instruct" directory — confirm this cannot shadow the Hub id on a later
# from_pretrained(model_name) call.
grpo_config = GRPOConfig(
    output_dir=model_name,
    # Disable all logging integrations explicitly instead of relying only on the
    # deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

# GRPO trainer for the LoRA-wrapped policy, rewarded by the <think> format check.
grpo_trainer = GRPOTrainer(
    args=grpo_config,
    model=model,
    processing_class=tokenizer,
    train_dataset=dataset,
    reward_funcs=think_format_reward,
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [9]:
# Run GRPO training with the trainer configured above.
# NOTE(review): the recorded output reports a training loss of 0.0 at every logged step;
# presumably no sampled completion matched the <think>...</think> format, so every reward
# was identical — confirm by inspecting the logged rewards.
grpo_trainer.train()

Step,Training Loss
10,0.0
20,0.0


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TrainOutput(global_step=24, training_loss=0.0, metrics={'train_runtime': 362.7033, 'train_samples_per_second': 0.066, 'train_steps_per_second': 0.066, 'total_flos': 0.0, 'train_loss': 0.0})

# PPO

In [6]:
import torch
import torch.nn.functional as F


def score(self, hidden_states):
    """Project hidden states through the LM head and return log-probabilities.

    The head is taken from `self.lm_head` when that attribute exists, otherwise from
    `self.model.lm_head`. The last position along dimension 1 is dropped before the
    log-softmax is taken over the final dimension.
    """
    # Pick whichever object actually carries the language-model head.
    head = self.lm_head if hasattr(self, "lm_head") else self.model.lm_head
    vocab_logits = head(hidden_states)

    # Drop the final position, then normalise over the last dimension.
    return F.log_softmax(vocab_logits[:, :-1, :], dim=-1)

# Bind `score` as a method on the policy, value and reference models.
from types import MethodType

for patched_model in (model, value_model, ref_model):
    patched_model.score = MethodType(score, patched_model)

# Mark the value model as not being a PEFT model.
value_model.is_peft_model = False

In [14]:
from trl.rewards import think_format_reward

class RewardModelModel(torch.nn.Module):
    """Rule-based stand-in for a reward backbone; delegates to `think_format_reward`."""

    def forward(self, completitions):
        """Return whatever `think_format_reward` yields for `completitions`."""
        return think_format_reward(completitions)


class RewardModel(torch.nn.Module):
    """Wrapper exposing a `RewardModelModel` under the attribute named by `base_model_prefix`.

    NOTE(review): the recorded PPO run in this notebook fails with
    "RewardModelModel.forward() got an unexpected keyword argument 'input_ids'", so the
    backbone is being called with keyword arguments that `forward(completitions)` does not
    accept. This wrapper still needs an adapter before PPO training can run with it.
    """

    # Name of the attribute that holds the backbone module.
    base_model_prefix = "model"

    def __init__(self):
        super().__init__()
        # Create the backbone per instance so it is registered as a submodule instead of
        # being a single class-level object shared by every RewardModel.
        self.model = RewardModelModel()

    def forward(self, completitions):
        """Score `completitions` with the wrapped backbone."""
        return self.model(completitions)

reward_model = RewardModel()


In [9]:
from trl import PPOTrainer, PPOConfig
import torch
import random

ppo_config = PPOConfig(
    output_dir=model_name,
    learning_rate=1e-5,
    batch_size=1,
    # Optional settings left disabled:
    # num_train_epochs=3,
    # logging_steps=1,
    # mini_batch_size=1,
    # The string "none" is what turns reporting off; in Transformers 4.x, passing None
    # falls back to the default of reporting to all installed integrations.
    report_to="none",  # set to "wandb" if you want logging
)

# PPO trainer wiring together the policy, the reference model, the rule-based reward
# model and the value model.
ppo_trainer = PPOTrainer(
    args=ppo_config,
    model=model,
    ref_model=ref_model,
    processing_class=tokenizer,
    train_dataset=dataset,
    reward_model=reward_model,
    value_model=value_model,
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [10]:
# Run PPO training.
# NOTE(review): the recorded run of this cell stops with
# "TypeError: RewardModelModel.forward() got an unexpected keyword argument 'input_ids'",
# i.e. the reward backbone is called with input_ids= but only accepts `completitions`.
ppo_trainer.train()

===training policy===


TypeError: RewardModelModel.forward() got an unexpected keyword argument 'input_ids'

In [None]:
# import torch.nn.functional as F
# from trl.trainer.utils import first_true_indices

# def score(self, hidden_states, input_ids):
#     # Get the logits from the hidden states
#     if hasattr(self, "lm_head"):
#         logits = self.lm_head(hidden_states)
#     else:
#         # For models that don't have a separate lm_head
#         logits = self.model.lm_head(hidden_states)

#     # Calculate the log probabilities
#     logits = logits[:, :-1, :]
#     labels = input_ids[:, 1:]
#     log_probs = F.log_softmax(logits, dim=-1)

#     # Gather log probs of the actual next tokens
#     log_probs = torch.gather(log_probs, dim=2, index=labels.unsqueeze(-1)).squeeze(-1)
#     return log_probs

# def get_reward(model, query_responses, pad_token_id, context_length):
#     """
#     Computes the reward for a given query and response.

#     Args:
#         model (`nn.Module`): The model to use for reward computation.
#         query_responses (`torch.LongTensor`): The query and response tensors.
#         pad_token_id (`int`): The pad token id.
#         context_length (`int`): The length of the query.

#     Returns:
#         `torch.FloatTensor`: The reward for the given query and response.
#     """
#     with torch.no_grad():
#         output = model(query_responses, output_hidden_states=True)
#         # We need to pass both the hidden states and the input_ids to the score function.
#         reward_logits = model.score(output.hidden_states[-1], query_responses)
#         sequence_lengths = first_true_indices(query_responses[:, context_length:] == pad_token_id) - 1 + context_length
#         # https://github.com/huggingface/transformers/blob/dc68a39c8111217683bf49a4912d0c9018bab33d/src/transformers/models/gpt2/modeling_gpt2.py#L1454
#         # gpt2's causal lm score is shifted by one layer
#         if model.config.model_type == "gpt2":
#             reward_logits = reward_logits[:, context_length - 1 : -1]
#         else:
#             reward_logits = reward_logits[:, context_length:]
#         return torch.gather(reward_logits, 1, sequence_lengths.unsqueeze(-1)).squeeze(-1)

# # Patch the method into your model
# from types import MethodType
# model.score = MethodType(score, model)
# value_model.score = MethodType(score, value_model)
# ref_model.score = MethodType(score, ref_model)

# ppo_trainer.get_reward = get_reward

In [8]:
# import torch.nn.functional as F
# from trl.trainer.utils import first_true_indices

# def score(self, hidden_states, input_ids):
#     # Get the logits from the hidden states
#     if hasattr(self, "lm_head"):
#         logits = self.lm_head(hidden_states)
#     else:
#         # For models that don't have a separate lm_head
#         logits = self.model.lm_head(hidden_states)

#     # Calculate the log probabilities
#     logits = logits[:, :-1, :]
#     labels = input_ids[:, 1:]
#     log_probs = F.log_softmax(logits, dim=-1)

#     # Gather log probs of the actual next tokens
#     log_probs = torch.gather(log_probs, dim=2, index=labels.unsqueeze(-1)).squeeze(-1)
#     return log_probs


# def get_reward(model, query_responses, pad_token_id, context_length):
#     """
#     Computes the reward for a given query and response.

#     Args:
#         model (`nn.Module`): The model to use for reward computation.
#         query_responses (`torch.LongTensor`): The query and response tensors.
#         pad_token_id (`int`): The pad token id.
#         context_length (`int`): The length of the query.

#     Returns:
#         `torch.FloatTensor`: The reward for the given query and response.
#     """
#     attention_mask = query_responses != pad_token_id
#     position_ids = attention_mask.cumsum(1) - attention_mask.long()  # exclusive cumsum
#     lm_backbone = getattr(model, model.base_model_prefix)
#     input_ids = torch.masked_fill(query_responses, ~attention_mask, 0)

#     with torch.no_grad():
#         output = lm_backbone(
#             input_ids=input_ids,
#             attention_mask=attention_mask,
#             position_ids=position_ids,
#             return_dict=True,
#             output_hidden_states=True,
#             use_cache=False,  # otherwise mistral-based RM would error out
#         )
#         # We need to pass both the hidden states and the input_ids to the score function.
#         reward_logits = model.score(output.hidden_states[-1], query_responses)
#         sequence_lengths = first_true_indices(query_responses[:, context_length:] == pad_token_id) - 1 + context_length
#         # https://github.com/huggingface/transformers/blob/dc68a39c8111217683bf49a4912d0c9018bab33d/src/transformers/models/gpt2/modeling_gpt2.py#L1454
#         # gpt2's causal lm score is shifted by one layer
#         if model.config.model_type == "gpt2":
#             reward_logits = reward_logits[:, context_length - 1 : -1]
#         else:
#             reward_logits = reward_logits[:, context_length:]
#         return torch.gather(reward_logits, 1, sequence_lengths.unsqueeze(-1)).squeeze(-1)

# # Patch the method into your model
# from types import MethodType
# model.score = MethodType(score, model)
# value_model.score = MethodType(score, value_model)
# ref_model.score = MethodType(score, ref_model)