# Post Training: Reinforcement Learning from Human Feedback (RLHF)

In [1]:
# Warning control: silence every Python warning for the rest of the session so
# that cell outputs stay readable.
# NOTE(review): the blanket 'ignore' filter also hides deprecation warnings
# raised by the libraries used below — consider narrowing it when debugging.
import warnings
warnings.filterwarnings('ignore')

## Import libraries

In [2]:
import torch
import transformers
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
transformers.logging.set_verbosity_error()
from trl import GRPOTrainer, GRPOConfig
from datasets import load_dataset, Dataset
import re
import pandas as pd
from tqdm import tqdm

## Helper Functions:

In [3]:
def generate_responses(model, tokenizer, user_message=None, system_message=None, max_new_tokens=300, full_message=None):
    """Generate a single greedy-decoded assistant reply for a chat prompt.

    Args:
        model: Causal LM exposing ``device`` and ``generate``.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``__call__``,
            ``decode`` and ``eos_token_id``.
        user_message: Text of the user turn; used only when ``full_message``
            is empty or ``None``.
        system_message: Optional system turn placed before the user turn;
            used only when ``full_message`` is empty or ``None``.
        max_new_tokens: Upper bound on the number of generated tokens. Replies
            longer than this are cut off.
        full_message: Optional ready-made list of ``{"role", "content"}``
            dicts. When non-empty it is used as-is and the two arguments
            above are ignored.

    Returns:
        The decoded reply (prompt tokens removed, special tokens skipped,
        surrounding whitespace stripped).

    Raises:
        ValueError: If neither ``full_message`` nor ``user_message`` is given.
    """
    #Preparing a list of chat messages (structured format):
    if full_message:
        messages = full_message
    else:
        #Fail early with a clear message instead of sending `content=None` to the chat template:
        if user_message is None:
            raise ValueError("Provide either `user_message` or a non-empty `full_message`.")

        messages = []

        #If a system message is provided, adding it first:
        #System messages define assistant behavior (e.g., tone, personality):
        if system_message:
            messages.append({"role": "system", "content": system_message})

        #Add the user message as the next entry (it's a single-turn chat setup):
        messages.append({"role": "user", "content": user_message})

    #Formatting the chat into one prompt string using the tokenizer's chat template:
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False, #Return raw text prompt, not tokenized output.
        add_generation_prompt=True, #Add assistant's cue to prompt generation.
        enable_thinking=False, #Optional setting (used in some chat-aware models).
    )

    #Tokenizing the prompt into input IDs and moving them to the model's device (CPU or GPU):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    #Disabling gradient calculation to save memory (inference-only).
    #For faster inference, engines such as vLLM, SGLang or TensorRT are worth trying:
    with torch.no_grad():
        #Generating output tokens from the model:
        outputs = model.generate(
            **inputs, #Unpacks the tokenizer output into keyword arguments (e.g. input_ids=..., attention_mask=...).
            max_new_tokens=max_new_tokens, #Limit the number of tokens generated.
            do_sample=False, #Disabling randomness (greedy decoding).
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    #The output contains prompt + continuation, so slice off the prompt tokens:
    input_len = inputs["input_ids"].shape[1]
    generated_ids = outputs[0][input_len:]

    #Decoding the generated token IDs back into text:
    #`skip_special_tokens=True` removes tokens like <|endoftext|>, and strip()
    #removes leading/trailing whitespace so the reply is clean to display.
    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return response

In [4]:
def test_model_with_questions(model, tokenizer, questions, 
                              system_message=None, title="Model Output"):
    #Printing section title for clarity
    print(f"\n=== {title} ===")
    
    #Looping through each question in the list, starting index at 1:
    for i, question in enumerate(questions, 1):
        #Generating a model response for the current question:
        #Passing in the question as user input and optional system message
        response = generate_responses(model, tokenizer, question, 
                                      system_message)
        #Print both the input question and the model's output response:
        print(f"\nModel Input {i}:\n{question}\nModel Output {i}:\n{response}\n")

In [5]:
def load_model_and_tokenizer(model_name, use_gpu = False):
    """Load a causal language model and its tokenizer, ready for chat use.

    Args:
        model_name: Local path or Hugging Face Hub identifier.
        use_gpu: When True, move the model to CUDA if a CUDA device is
            available; otherwise the model stays on CPU and a notice is
            printed.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer is guaranteed to have a
        chat template and a pad token.
    """
    #Loading tokenizer from the given model path or HuggingFace Hub name:
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    #Loading causal language model (this is a GPT-style decoder-only model):
    model = AutoModelForCausalLM.from_pretrained(model_name)

    #Move the model to CUDA only if a GPU is requested AND actually available;
    #an unconditional `.to("cuda")` would crash on CPU-only machines:
    if use_gpu:
        if torch.cuda.is_available():
            model.to("cuda")
        else:
            print("CUDA was requested but is not available; keeping the model on CPU.")

    #If the tokenizer does not already have a chat template, define a custom one.
    #This template is used to format multi-turn conversations into a prompt string.
    #NOTE(review): the template below keeps its source indentation/newlines in the
    #rendered prompt and ignores `add_generation_prompt` — confirm this is intended.
    if not tokenizer.chat_template:
        tokenizer.chat_template = """{% for message in messages %}
                {% if message['role'] == 'system' %}System: {{ message['content'] }}\n
                {% elif message['role'] == 'user' %}User: {{ message['content'] }}\n
                {% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }} <|endoftext|>
                {% endif %}
                {% endfor %}"""

    #Ensuring tokenizer has a pad token — fallback to eos token if missing:
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    #Returning the ready-to-use model and tokenizer:
    return model, tokenizer

## Prepare the evaluation dataset for math: GSM8K

In [6]:
#Device switch: False keeps both evaluation and training entirely on CPU.
USE_GPU = False

#System instruction shared by every prompt. It asks for step-by-step reasoning and
#for the final numeric answer inside \boxed{}, which is what the reward function parses.
SYSTEM_PROMPT = " ".join([
    "You are a helpful assistant that solves problems step-by-step.",
    "Always include the final numeric answer inside \\boxed{}.",
])

In [7]:
#Reward function used both for online RL training and for GSM8K evaluation.
#Pattern for a plain integer/decimal once separators have been stripped:
_NUMERIC_ANSWER = re.compile(r"-?\d+(?:\.\d+)?")


def _boxed_answer(completion):
    """Return the content of the first \\boxed{...} in a completion, or "" if absent."""
    #Accept both a plain string and a conversational completion ([{"role", "content"}]):
    text = completion if isinstance(completion, str) else completion[0]['content']
    match = re.search(r"\\boxed\{(.*?)\}", text)
    return match.group(1) if match else ""


def _normalize_answer(answer):
    """Strip whitespace, thousands separators and a leading dollar sign from an answer."""
    cleaned = str(answer).strip().replace(",", "")
    if cleaned.startswith("\\$"):
        cleaned = cleaned[2:]
    elif cleaned.startswith("$"):
        cleaned = cleaned[1:]
    return cleaned.strip()


def _answers_match(predicted, expected):
    """Compare a predicted answer with the ground truth, numerically when both are numbers."""
    from decimal import Decimal

    #An example without a usable ground truth can never be rewarded:
    if expected is None:
        return False
    pred, gold = _normalize_answer(predicted), _normalize_answer(expected)
    #"18.0" and "18" are the same answer; Decimal avoids float rounding surprises:
    if _NUMERIC_ANSWER.fullmatch(pred) and _NUMERIC_ANSWER.fullmatch(gold):
        return Decimal(pred) == Decimal(gold)
    return pred == gold


def reward_func(completions, ground_truth, **kwargs):
    """Score each completion 1.0 if its \\boxed{} answer matches the ground truth, else 0.0.

    Args:
        completions: One entry per sample, either a list of message dicts whose
            first element holds the reply under ``'content'``, or a plain string.
        ground_truth: Expected answers, aligned with ``completions``.
        **kwargs: Extra columns passed by the caller; ignored.

    Returns:
        A list of floats (1.0 or 0.0), one per completion.
    """
    return [
        1.0 if _answers_match(_boxed_answer(completion), gt) else 0.0
        for completion, gt in zip(completions, ground_truth)
    ]

In [8]:
#Sanity check (matching case): a simulated assistant reply whose \boxed{} answer equals the label.
ground_truth = ["72"]
sample_pred = [[dict(role="assistant", content=r"...Calculating the answer. \boxed{72}")]]
#The reward function should score this prediction (72) against the label (72) as correct.
reward = reward_func(sample_pred, ground_truth)
print(f"Positive Sample Reward: {reward}")

Positive Sample Reward: [1.0]


In [9]:
#Sanity check (mismatching case): a simulated assistant reply whose \boxed{} answer differs from the label.
ground_truth = ["72"]
sample_pred = [[dict(role="assistant", content=r"...Calculating the answer \boxed{71}")]]
#The reward function should score this prediction (71) against the label (72) as incorrect.
reward = reward_func(sample_pred, ground_truth)
print(f"Negative Sample Reward: {reward}")

Negative Sample Reward: [0.0]


## Load the Evaluation Dataset

In [10]:
#Number of GSM8K problems used for the quick evaluation demo.
data_num = 5
#Fetch the GSM8K "main" configuration from Hugging Face and take its test split...
gsm8k_test = load_dataset("openai/gsm8k", "main")["test"]
#...then keep only the first `data_num` examples so evaluation stays fast.
eval_dataset = gsm8k_test.select(range(data_num))
#A pandas view of the subset is easier to inspect in the notebook.
sample_df = eval_dataset.to_pandas()
display(sample_df)

README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Unnamed: 0,question,answer
0,Janet’s ducks lay 16 eggs per day. She eats th...,Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eg...
1,A robe takes 2 bolts of blue fiber and half th...,It takes 2/2=<<2/2=1>>1 bolt of white fiber\nS...
2,Josh decides to try flipping a house. He buys...,The cost of the house and repairs came out to ...
3,James decides to run 3 sprints 3 times a week....,He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*...
4,"Every day, Wendi feeds each of her chickens th...","If each chicken eats 3 cups of feed per day, t..."


In [11]:
#Defining a function to extract clean numeric ground truth values and create structured prompts from each dataset example.
def post_processing(example):
    """Add ``ground_truth`` and ``prompt`` fields to one GSM8K example.

    Args:
        example: Mapping with at least ``"question"`` and ``"answer"`` keys;
            the answer is expected to end with ``#### <number>``.

    Returns:
        The same mapping, updated in place with:
        ``ground_truth`` — the final number as a string without thousands
        separators (``None`` when no ``####`` marker with a number is found);
        ``prompt`` — a system + user message list ready for the chat template.
    """
    #Locate the final answer after the "####" marker. The pattern also accepts
    #thousands separators and decimals (e.g. "#### 1,080"), which a bare \d+
    #would silently truncate to "1".
    match = re.search(r"####\s*(-?\d[\d,]*(?:\.\d+)?)", example["answer"])
    #Drop the separators so the label compares cleanly with model answers:
    example["ground_truth"] = match.group(1).replace(",", "") if match else None
    #Build the chat-formatted prompt: system instruction followed by the math question.
    example["prompt"] = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": example["question"]}
    ]
    #Return the example with the new ground truth and prompt fields.
    return example
#Run post_processing over every evaluation sample to add the prompt and ground truth fields...
eval_dataset = eval_dataset.map(post_processing)
#...then drop the raw text columns, leaving only the structured prompt and ground truth.
eval_dataset = eval_dataset.remove_columns(["question", "answer"])

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [12]:
#After the post-processing, the dataset has only two columns:
#- `ground_truth`: the number extracted from the original reference answer.
#- `prompt`: the system prompt followed by the question.
#Preview at most five rows; clamping to the dataset size avoids an IndexError
#when `data_num` is set below five.
preview_rows = min(5, len(eval_dataset))
sample_df = eval_dataset.select(range(preview_rows)).to_pandas()
display(sample_df)

Unnamed: 0,ground_truth,prompt
0,18,[{'content': 'You are a helpful assistant that...
1,3,[{'content': 'You are a helpful assistant that...
2,70000,[{'content': 'You are a helpful assistant that...
3,540,[{'content': 'You are a helpful assistant that...
4,20,[{'content': 'You are a helpful assistant that...


## Load the model and evaluate

In [13]:
#Load the Qwen2.5-0.5B-Instruct checkpoint from the local models directory; it is
#evaluated below on the GSM8K test prompts loaded earlier.
model, tokenizer = load_model_and_tokenizer(
    "./models/Qwen/Qwen2.5-0.5B-Instruct",
    use_gpu=USE_GPU,
)

In [14]:
#Parallel lists: model predictions and the matching reference answers.
all_preds, all_labels = [], []

#Walk through the post-processed evaluation set one example at a time:
for example in tqdm(eval_dataset):
    #`prompt` holds the system instruction plus the math question;
    #`ground_truth` is the numeric label used for scoring later.
    input_prompt, ground_truth = example["prompt"], example["ground_truth"]
    #Generate the model's answer without tracking gradients:
    with torch.no_grad():
        response = generate_responses(model, tokenizer, full_message=input_prompt)
    #Keep the label and the prediction aligned by appending both in the same iteration.
    all_labels.append(ground_truth)
    #Predictions are wrapped in the message structure that reward_func expects.
    all_preds.append([{"role": "assistant", "content": response}])
    print(response)
    print("Ground truth: ", ground_truth)

#Score every response against its ground truth with the reward function:
rewards = reward_func(all_preds, all_labels)

#Accuracy is the share of responses that earned a reward of 1:
accuracy = sum(rewards) / len(rewards)
print(f"Evaluation Accuracy: {accuracy:.2%}")
#Release the baseline model and tokenizer before the training section loads another model.
del model, tokenizer

 20%|██        | 1/5 [00:15<01:00, 15.24s/it]

To determine how much Janet makes at the farmers' market each day, we need to follow these steps:

1. Calculate the total number of eggs laid by the ducks in one day.
2. Determine how many eggs are eaten in one day.
3. Subtract the number of eggs eaten from the total number of eggs to find out how many eggs are sold.
4. Calculate the revenue from selling the eggs.

Let's start with the first step:

1. The ducks lay 16 eggs per day.
2. Janet eats 3 eggs for breakfast every morning, so the number of eggs eaten in one day is:
   \[
   16 - 3 = 13
   \]
3. Janet bakes muffins for her friends every day, which means she bakes 4 muffins. So, the number of eggs baked in one day is:
   \[
   13 + 4 = 17
   \]
4. Janet sells the remaining eggs at the farmers' market. Since there are 16 eggs in total and 17 eggs are sold, the number of eggs left to sell is:
   \[
   16 - 17 = -1
   \]
   However, since it's not possible to sell fewer than 0 eggs, this indicates that Janet has no eggs left to sell

 40%|████      | 2/5 [00:25<00:36, 12.31s/it]

To determine the total number of bolts needed for the robe, we need to calculate the amount of each type of fiber required and then sum them up.

1. **Blue Fiber:**
   - The problem states that it takes 2 bolts of blue fiber.
   - Therefore, the number of bolts of blue fiber is \(2\).

2. **White Fiber:**
   - It takes half as much white fiber as blue fiber.
   - Since 2 bolts of blue fiber require 2 bolts of white fiber, the number of bolts of white fiber is:
     \[
     \frac{2}{2} = 1
     \]

3. **Total Number of Bolts:**
   - To find the total number of bolts needed, we add the number of bolts of blue fiber and the number of bolts of white fiber:
     \[
     2 + 1 = 3
     \]

Thus, the total number of bolts required for the robe is \(\boxed{3}\).
Ground truth:  3


 60%|██████    | 3/5 [00:40<00:27, 13.57s/it]

To determine Josh's profit from flipping his house, we need to follow these steps:

1. **Calculate the total cost of the house:**
   - The house costs $80,000.
   - Josh also spends an additional $50,000 on repairs.

2. **Determine the net cost after repairs:**
   - Net cost = Total cost - Cost of repairs
   - Net cost = $80,000 - $50,000 = $30,000

3. **Calculate the increase in value due to repairs:**
   - The value of the house increased by 150%.
   - Increase in value = Percentage increase × Original value
   - Increase in value = 150% × $80,000
   - Increase in value = 1.5 × $80,000 = $120,000

4. **Determine the new value of the house:**
   - New value = Original value + Increase in value
   - New value = $80,000 + $120,000 = $200,000

5. **Calculate the profit:**
   - Profit = New value - Net cost
   - Profit = $200,000 - $30,000 = $170,
Ground truth:  70000


 80%|████████  | 4/5 [00:49<00:11, 11.88s/it]

To determine how many total meters James runs in a week, we need to follow these steps:

1. Calculate the distance James runs in one sprint.
2. Multiply the distance of one sprint by the number of sprints he runs per week.

First, let's find out how far James runs in one sprint:
\[ \text{Distance per sprint} = 60 \text{ meters} \]

Next, since James runs 3 sprints per week, we multiply the distance of one sprint by 3:
\[ \text{Total distance per week} = 60 \text{ meters/sprint} \times 3 \text{ sprints/week} \]
\[ \text{Total distance per week} = 180 \text{ meters} \]

So, the total distance James runs in a week is:
\[
\boxed{180}
\]
Ground truth:  540


100%|██████████| 5/5 [01:05<00:00, 13.02s/it]

To determine how many cups of feed Wendi needs for the final meal of the day, we can follow these steps:

1. Calculate the total amount of feed needed for all the chickens.
2. Determine how much feed is given away in the morning and the afternoon.
3. Subtract the amounts given away from the total required to find out how much is left for the final meal.

First, let's calculate the total amount of feed needed for all the chickens:
- Each chicken gets 3 cups of feed per day.
- There are 20 chickens in total.

So, the total amount of feed needed is:
\[ 20 \text{ chickens} \times 3 \text{ cups/chicken} = 60 \text{ cups} \]

Next, we calculate the amount of feed given away in the morning and the afternoon:
- In the morning: \( 15 \text{ cups} \)
- In the afternoon: \( 25 \text{ cups} \)

Now, we subtract the amounts given away from the total required:
\[ 60 \text{ cups} - (15 \text{ cups} + 25 \text{ cups}) = 60 \text{ cups} - 40 \text{ cups} = 20 \text{ cups} \]

Therefore, the number of c




#### The evaluation pipeline is now complete; the next section covers training

## Loading the training dataset

In [15]:
#Downloading the full GSM8K dataset from Hugging Face, which contains grade school math problems and their step-by-step solutions.
dataset = load_dataset("openai/gsm8k", "main")
#Selecting the training split of the GSM8K dataset to be used for model training with online reinforcement learning.
train_dataset = dataset["train"]

#On CPU, keep only the first 10 samples so training stays fast and lightweight.
#Subsetting BEFORE mapping avoids post-processing all 7,473 examples just to discard most of them.
if not USE_GPU:
    train_dataset = train_dataset.select(range(min(10, len(train_dataset))))

#Applying the previously defined post_processing function to each sample to extract numeric ground truths and format prompts for training input.
train_dataset = train_dataset.map(post_processing)
#Removing the original question and answer columns to keep only the structured prompt and ground truth fields needed for training.
train_dataset = train_dataset.remove_columns(["question", "answer"])
print(train_dataset[0])

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

{'ground_truth': '72', 'prompt': [{'content': 'You are a helpful assistant that solves problems step-by-step. Always include the final numeric answer inside \\boxed{}.', 'role': 'system'}, {'content': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'role': 'user'}]}


## GRPO Training

In [16]:
#Creating a configuration object for GRPO training containing all major hyperparameters and device setup instructions.
config = GRPOConfig(
    #Accumulating gradients over 8 mini-batches before performing a single weight update to simulate a larger effective batch size and stabilize training.
    gradient_accumulation_steps=8,
    #Setting the number of samples processed per device in one forward and backward pass to one for low-memory environments like CPU.
    per_device_train_batch_size=1,
    #Specifying how many responses the model should generate per prompt to form a comparison group in GRPO where rewards are computed relatively among responses.
    num_generations=4, # Can set as high as 64 or 128
    #Running one full pass over the training dataset since this demonstration focuses on showing the pipeline rather than long convergence.
    num_train_epochs=1,
    #Defining a small step size for model weight updates to ensure stable learning during reinforcement optimization.
    learning_rate=5e-6,
    #Instructing the trainer to log intermediate metrics every two steps to monitor progress frequently during small-scale runs.
    logging_steps=2,
    #Keeps the entire training on CPU when USE_GPU is False.
    #`use_cpu` is the supported replacement for the deprecated `no_cuda` argument.
    use_cpu=not USE_GPU,
)

In [17]:
#A small instruction-tuned model (and its tokenizer) keeps GRPO feasible on CPU-only setups.
model, tokenizer = load_model_and_tokenizer(
    "./models/HuggingFaceTB/SmolLM2-135M-Instruct",
    use_gpu=USE_GPU,
)

#Assemble the trainer from its four ingredients:
#- model: the policy whose weights GRPO updates;
#- reward_funcs: the verifiable reward that parses \boxed{} answers and feeds the group-relative advantages;
#- args: the GRPO configuration (batch size, generations per prompt, learning rate, logging, device);
#- train_dataset: preprocessed samples, each with a formatted prompt and a numeric ground truth.
grpo_trainer = GRPOTrainer(
    model=model,
    reward_funcs=reward_func,
    args=config,
    train_dataset=train_dataset,
)

#Run the GRPO loop: sample several responses per prompt, score them with the reward
#function, and update the policy from their relative advantages.
grpo_trainer.train()

{'train_runtime': 98.1195, 'train_samples_per_second': 0.102, 'train_steps_per_second': 0.01, 'train_loss': 0.0, 'completion_length': 168.4375, 'rewards/reward_func': 0.0, 'reward': 0.0, 'reward_std': 0.0, 'kl': 0.0, 'epoch': 0.8}


TrainOutput(global_step=1, training_loss=0.0, metrics={'train_runtime': 98.1195, 'train_samples_per_second': 0.102, 'train_steps_per_second': 0.01, 'total_flos': 0.0, 'train_loss': 0.0})

## Results on the small trained model

In [18]:
#Evaluate the policy that GRPO just trained.
model = grpo_trainer.model
#Training leaves the module in train mode; switch to eval mode so layers such as
#dropout behave deterministically during generation.
model.eval()

# Store predictions and ground truths
all_preds = []
all_labels = []

for example in tqdm(eval_dataset):
    input_prompt = example["prompt"]
    ground_truth = example["ground_truth"]
    # Run the model to generate an answer (generate_responses already disables gradients)
    response = generate_responses(model, tokenizer,
                                  full_message = input_prompt)
    # Wrap the reply in the message structure expected by reward_func
    all_preds.append([{"role": "assistant", "content": response}])
    all_labels.append(ground_truth)
    print(response)
    print("Ground truth: ", ground_truth)

# Evaluate using reward_func
rewards = reward_func(all_preds, all_labels)

# Report accuracy (guarded so an empty evaluation set does not divide by zero)
accuracy = sum(rewards) / len(rewards) if rewards else 0.0
print(f"Evaluation Accuracy: {accuracy:.2%}")

 20%|██        | 1/5 [00:07<00:29,  7.35s/it]

Janet's ducks lay 16 eggs per day.
She eats three for breakfast every morning and bakes muffins for her friends every day with four.
She sells the remainder at the farmers' market daily for $2 per fresh duck egg.
She makes 16 eggs / 3 for breakfast = 6 eggs per day.
She sells the remainder at the farmers' market daily for $2 per fresh duck egg.
She makes 6 eggs / 4 for breakfast = 1.5 eggs per day.
She makes 6 eggs / 4 for breakfast = 1.5 eggs per day.
She makes 1.5 eggs / 2 for breakfast = 1.25 eggs per day.
She makes 1.25 eggs / 4 for breakfast = 3.25 eggs per day.
She makes 3.25 eggs / 2 for breakfast = 1.5 eggs per day.
She makes 1.5 eggs / 3 for breakfast = 0.5 eggs per day.
She makes 0.5 eggs / 4 for breakfast = 0.125 eggs per day.
She makes 0.125 eggs / 2 for breakfast = 0.0625 eggs per day.
She makes 0.0625 eggs / 3 for breakfast = 0.025 eggs per day.
Ground truth:  18


 40%|████      | 2/5 [00:12<00:18,  6.33s/it]

To solve this problem, we need to consider the total amount of fiber required and the amount of blue fiber needed.

First, we need to find the total amount of blue fiber required. Since we have 2 bolts of blue fiber and half the amount of white fiber, we can multiply the number of bolts by the amount of blue fiber needed.

The total amount of blue fiber required is 2 * 0.5 = 0.10 blue fiber.

Now, we need to find the total amount of white fiber required. Since we have 1 bolt of white fiber and half the amount of blue fiber, we can multiply the number of bolts by the amount of white fiber needed.

The total amount of white fiber required is 1 * 0.5 = 0.5 bolts.

Finally, we can find the total amount of bolts by adding the blue fiber and white fiber together: 0.10 blue fiber + 0.5 bolts = 0.65 bolts.

So, the robe takes 0.65 bolts of blue fiber and 0.5 bolts of white fiber.
Ground truth:  3


 60%|██████    | 3/5 [00:15<00:09,  4.79s/it]

To find the profit, we need to calculate the difference between the original price of the house ($80,000) and the new price of the house ($85,000).

The original price was $80,000.

The new price is $85,000.

The difference is $85,000 - $80,000 = $5,000.

The profit is $5,000.

So, Josh made a profit of $5,000.
Ground truth:  70000


 80%|████████  | 4/5 [00:17<00:03,  3.61s/it]

James runs 60 meters each sprint.
He runs 3 sprints x 60 meters = 180 meters.
He runs 180 meters x 3 sprints = 540 meters.
So, James runs 540 meters a week.
#### 540
The answer is: 540
Ground truth:  540


100%|██████████| 5/5 [00:25<00:00,  5.03s/it]

Step 1: Determine the total number of cups of feed given to the chickens in the morning.

Wendi feeds her chickens 15 cups of feed in the morning.

Step 2: Determine the total number of cups of feed given to the chickens in the afternoon.

Wendi gives her chickens another 25 cups of feed in the afternoon.

Step 3: Determine the total number of cups of feed given to the chickens in the final meal of the day.

Wendi gives her chickens 15 cups of feed in the morning, 25 cups in the afternoon, and 15 cups in the evening.

Step 4: Calculate the total number of cups of feed given to the chickens in the final meal of the day.

Total number of cups of feed given to the chickens in the morning = 15 cups
Total number of cups of feed given to the chickens in the afternoon = 25 cups
Total number of cups of feed given to the chickens in the evening = 15 cups

Step 5: Calculate the total number of cups of feed given to the chickens in the final meal of the day.

Total number of cups of feed given to


