In [1]:
!pip install trl transformers

Collecting trl
  Downloading trl-0.23.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.23.0-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.23.0


In [2]:
import torch
from transformers import TrainingArguments , AutoTokenizer ,AutoModelForCausalLM
from trl import GRPOTrainer, GRPOConfig
from datasets import load_dataset, Dataset
import re
import pandas as pd
from tqdm import tqdm

# Helper functions

In [3]:
def generate_responses(model, tokenizer, user_message=None, system_message=None, max_new_tokens=300, full_message=None):
    # Format chat using tokenizer's chat template
    if full_message:
        messages = full_message
    else:
        messages = []
        if system_message:
            messages.append({"role": "system", "content": system_message})
        messages.append({"role": "user", "content": user_message})

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    input_len = inputs["input_ids"].shape[1]
    generated_ids = outputs[0][input_len:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return response

In [4]:
def test_model_with_questions(model, tokenizer, questions, system_message=None, title="Model Output"):
    print(f"\n=== {title} ===")
    for i, question in enumerate(questions, 1):
        response = generate_responses(model, tokenizer, question, system_message)
        print(f"\nModel Input {i}:\n{question}\nModel Output {i}:\n{response}\n")

In [5]:
def load_model_and_tokenizer(model_name, use_gpu = False):

    # Load base model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    if use_gpu:
        model.to("cuda")

    if not tokenizer.chat_template:
        tokenizer.chat_template = """{% for message in messages %}
                {% if message['role'] == 'system' %}System: {{ message['content'] }}\n
                {% elif message['role'] == 'user' %}User: {{ message['content'] }}\n
                {% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }} <|endoftext|>
                {% endif %}
                {% endfor %}"""

    # Tokenizer config
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

In [6]:
def display_dataset(dataset):
    # Visualize the dataset
    rows = []
    for i in range(3):
        example = dataset[i]
        user_msg = next(m['content'] for m in example['messages'] if m['role'] == 'user')
        assistant_msg = next(m['content'] for m in example['messages'] if m['role'] == 'assistant')
        rows.append({
            'User Prompt': user_msg,
            'Assistant Response': assistant_msg
        })

    # Display as table
    df = pd.DataFrame(rows)
    pd.set_option('display.max_colwidth', None)  # Avoid truncating long strings
    display(df)

# Preparation for eval dataset for math : GSM8K

In [7]:
USE_GPU=False
SYSTEM_PROMPT=(
    "You are a helpful assistant that solves problems step by step."
    "Always include the final numeric answer inside \\boxed{}."
)

In [10]:
def reward_func(completions, ground_truth, **kwargs):
    # Regular expression to capture content inside \boxed{}
    matches = [re.search(r"\\boxed\{(.*?)\}", completion[0]['content']) for completion in completions]
    contents = [match.group(1) if match else "" for match in matches]
    # Reward 1 if the content is the same as the ground truth, 0 otherwise
    return [1.0 if c == gt else 0.0 for c, gt in zip(contents, ground_truth)]

In [11]:
sample_pred = [[{"role": "assistant",
                 "content": r"...Calculating the answer. \boxed{72}"}]]
ground_truth = ["72"]
reward = reward_func(sample_pred, ground_truth)
print(f"Positive Sample Reward: {reward}")

Positive Sample Reward: [1.0]


In [12]:
sample_pred = [[{"role": "assistant",
                 "content": r"...Calculating the answer \boxed{71}"}]]
ground_truth = ["72"]
reward = reward_func(sample_pred, ground_truth)
print(f"Negative Sample Reward: {reward}")

Negative Sample Reward: [0.0]


# Load the eval dataset

In [13]:
data_num = 5
eval_dataset = load_dataset("openai/gsm8k", "main")["test"].select(range(data_num))
sample_df = eval_dataset.to_pandas()
display(sample_df)

README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Unnamed: 0,question,answer
0,Janet’s ducks lay 16 eggs per day. She eats th...,Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eg...
1,A robe takes 2 bolts of blue fiber and half th...,It takes 2/2=<<2/2=1>>1 bolt of white fiber\nS...
2,Josh decides to try flipping a house. He buys...,The cost of the house and repairs came out to ...
3,James decides to run 3 sprints 3 times a week....,He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*...
4,"Every day, Wendi feeds each of her chickens th...","If each chicken eats 3 cups of feed per day, t..."


In [14]:
def post_processing(example):
    match = re.search(r"####\s*(-?\d+)", example["answer"])
    example["ground_truth"] = match.group(1) if match else None
    example["prompt"] = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": example["question"]}
    ]
    return example
eval_dataset = eval_dataset.map(post_processing).remove_columns(["question", "answer"])


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [15]:
sample_df = eval_dataset.select(range(5)).to_pandas()
display(sample_df)

Unnamed: 0,ground_truth,prompt
0,18,[{'content': 'You are a helpful assistant that...
1,3,[{'content': 'You are a helpful assistant that...
2,70000,[{'content': 'You are a helpful assistant that...
3,540,[{'content': 'You are a helpful assistant that...
4,20,[{'content': 'You are a helpful assistant that...


# load the model and eval

In [18]:
model, tokenizer = load_model_and_tokenizer("Qwen/Qwen2.5-0.5B-Instruct", USE_GPU)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [19]:
# Store predictions and ground truths
all_preds = []
all_labels = []

for example in tqdm(eval_dataset):
    input_prompt = example["prompt"]
    ground_truth = example["ground_truth"]
    # Run the model to generate an answer
    with torch.no_grad():
        response = generate_responses(model, tokenizer,
                                      full_message = input_prompt)
    all_preds.append([{"role": "assistant", "content": response}])
    all_labels.append(ground_truth)
    print(response)
    print("Ground truth: ", ground_truth)

# 3. Evaluate using reward_func
rewards = reward_func(all_preds, all_labels)

# 4. Report accuracy
accuracy = sum(rewards) / len(rewards)
print(f"Evaluation Accuracy: {accuracy:.2%}")
del model, tokenizer

  0%|          | 0/5 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 20%|██        | 1/5 [01:28<05:55, 88.93s/it]

To determine how much Janet makes at the farmers' market each day, we need to follow these steps:

1. Calculate the total number of eggs laid by the ducks in one day.
2. Determine how many eggs are eaten in one day.
3. Subtract the number of eggs eaten from the total number of eggs to find out how many eggs are sold.
4. Calculate the revenue from selling the eggs.

First, let's calculate the total number of eggs laid by the ducks in one day:
\[
16 \text{ (eggs per day)} + 3 \text{ (breakfast) + 4 (baking)} = 23 \text{ (total eggs)}
\]

Next, we know that Janet eats 3 eggs for breakfast and 4 eggs baked during the day, so the total number of eggs eaten is:
\[
3 + 4 = 7 \text{ (eggs eaten)}
\]

Now, we subtract the number of eggs eaten from the total number of eggs to find out how many eggs are sold:
\[
23 - 7 = 16 \text{ (eggs sold)}
\]

Finally, we calculate the revenue from selling the eggs. Since each egg is sold for $2, the total revenue is:
\[
16 \times 2 = 32 \text{ (dollars)}
\]


 40%|████      | 2/5 [02:28<03:34, 71.58s/it]

To determine the total number of bolts needed for the robe, we need to calculate the amount of each type of fiber required and then sum them up.

1. **Blue Fiber:**
   - The problem states that it takes 2 bolts of blue fiber.
   - Therefore, the number of bolts of blue fiber is \(2\).

2. **White Fiber:**
   - It takes half as much white fiber as blue fiber.
   - Since 2 bolts of blue fiber require 2 bolts of white fiber, the number of bolts of white fiber is:
     \[
     \frac{2}{2} = 1
     \]

3. **Total Number of Bolts:**
   - To find the total number of bolts needed, we add the number of bolts of blue fiber and the number of bolts of white fiber:
     \[
     2 + 1 = 3
     \]

Thus, the total number of bolts required for the robe is \(\boxed{3}\).
Ground truth:  3


 60%|██████    | 3/5 [03:55<02:37, 78.53s/it]

To determine Josh's profit from flipping his house, we need to follow these steps:

1. **Calculate the total cost of the house:**
   The house costs $80,000.

2. **Determine the increase in value due to repairs:**
   The repairs increased the value of the house by 150%.

3. **Calculate the new value of the house after repairs:**
   Increase in value = 150% of original value.
   Increase in value = \( 150\% \times \$80,000 \).
   Convert 150% to a decimal: \( 1.50 \).
   Increase in value = \( 1.50 \times \$80,000 \).
   Calculate:
   \[
   1.50 \times \$80,000 = \$120,000
   \]

4. **Find the net gain or loss on the sale:**
   Net gain = New value - Original value.
   Net gain = \$120,000 - \$80,000.
   Calculate:
   \[
   \$120,000 - \$80,000 = \$40,000
   \]

Therefore, Josh made a profit of \(\boxed{40000}\).
Ground truth:  70000


 80%|████████  | 4/5 [05:03<01:14, 74.37s/it]

To determine how many total meters James runs in a week, we need to follow these steps:

1. Calculate the distance James runs in one sprint.
2. Multiply the distance of one sprint by the number of sprints he runs per week.

First, let's find out how far James runs in one sprint. Since he runs 60 meters and he runs 3 times a week, we can calculate the distance for one sprint as follows:
\[
\text{Distance per sprint} = \frac{\text{Distance per sprint}}{\text{Number of sprints per week}} = \frac{60 \text{ meters}}{3} = 20 \text{ meters}
\]

Next, we need to find out how much distance he runs in a week. Since he runs 3 sprints per week, we multiply the distance of one sprint by 3:
\[
\text{Total distance per week} = \text{Distance per sprint} \times \text{Number of sprints per week} = 20 \text{ meters} \times 3 = 60 \text{ meters}
\]

Therefore, the total distance James runs in a week is \(\boxed{60}\) meters.
Ground truth:  540


100%|██████████| 5/5 [06:29<00:00, 77.88s/it]

To determine how many cups of feed Wendi needs for the final meal of the day, we can follow these steps:

1. Calculate the total amount of feed needed for all the chickens.
2. Determine how much feed is given away in the morning and the afternoon.
3. Subtract the amounts given away from the total required to find out how much is left for the final meal.

First, let's calculate the total amount of feed needed for all the chickens:
- Each chicken gets 3 cups of feed per day.
- There are 20 chickens in total.

So, the total amount of feed needed is:
\[ 20 \text{ chickens} \times 3 \text{ cups/chicken} = 60 \text{ cups} \]

Next, we calculate the amount of feed given away in the morning and the afternoon:
- In the morning, Wendi gives 15 cups of feed.
- In the afternoon, Wendi gives another 25 cups of feed.

Adding these together gives the total amount of feed given away:
\[ 15 \text{ cups} + 25 \text{ cups} = 40 \text{ cups} \]

Now, we subtract the amount given away from the total amount




# Loading the training dataset

In [20]:
dataset = load_dataset("openai/gsm8k", "main")
train_dataset = dataset["train"]

# Apply to dataset
train_dataset = train_dataset.map(post_processing)
train_dataset = train_dataset.remove_columns(["question", "answer"])
if not USE_GPU:
    train_dataset = train_dataset.select(range(10))
print(train_dataset[0])

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

{'ground_truth': '72', 'prompt': [{'content': 'You are a helpful assistant that solves problems step by step.Always include the final numeric answer inside \\boxed{}.', 'role': 'system'}, {'content': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'role': 'user'}]}


# GRPO Training

In [21]:
config = GRPOConfig(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_generations=4, # Can set as high as 64 or 128
    num_train_epochs=1,
    learning_rate=5e-6,
    logging_steps=2,
    no_cuda= not USE_GPU     # keeps the whole run on CPU, incl. MPS
)



In [22]:

model, tokenizer = load_model_and_tokenizer("HuggingFaceTB/SmolLM2-135M-Instruct", USE_GPU)

grpo_trainer = GRPOTrainer(
    model=model,
    args=config,
    reward_funcs=reward_func,
    train_dataset=train_dataset
)

grpo_trainer.train()

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshikhar_dave[0m ([33mshikhar_dave-iit-jodhpur[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/
  ctx_manager = torch.cpu.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


Step,Training Loss
2,0.0
4,0.0


TrainOutput(global_step=5, training_loss=0.0, metrics={'train_runtime': 6550.9344, 'train_samples_per_second': 0.002, 'train_steps_per_second': 0.001, 'total_flos': 0.0, 'train_loss': 0.0})

In [24]:
fully_trained_qwen = True
if fully_trained_qwen:
    model, tokenizer = load_model_and_tokenizer("banghua/Qwen2.5-0.5B-GRPO",
                                            USE_GPU)
else:
    model = grpo_trainer.model

# Store predictions and ground truths
all_preds = []
all_labels = []

for example in tqdm(eval_dataset):
    input_prompt = example["prompt"]
    ground_truth = example["ground_truth"]
    # Run the model to generate an answer
    with torch.no_grad():
        response = generate_responses(model, tokenizer,
                                      full_message = input_prompt)
    all_preds.append([{"role": "assistant", "content": response}])
    all_labels.append(ground_truth)
    print(response)
    print("Ground truth: ", ground_truth)

# 3. Evaluate using reward_func
rewards = reward_func(all_preds, all_labels)

# 4. Report accuracy
accuracy = sum(rewards) / len(rewards)
print(f"Evaluation Accuracy: {accuracy:.2%}")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/682 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

 20%|██        | 1/5 [01:12<04:49, 72.26s/it]

To determine how much Janet makes at the farmers' market each day, we need to follow these steps:

1. Calculate the total number of eggs laid per day.
   - Janet's ducks lay 16 eggs per day.

2. Determine the number of eggs eaten in a day.
   - Janet eats 3 eggs in the morning and bakes muffins for her friends, so she eats \(3 + 4 = 7\) eggs in a day.

3. Subtract the number of eggs eaten from the total number of eggs laid to find the remaining eggs.
   - Remaining eggs = Total eggs - Eggs eaten = 16 - 7 = 9 eggs.

4. Since she sells all the remaining eggs at the farmers' market, the amount she makes is:
   - Amount made = Number of eggs sold × Price per egg = 9 × $2 = $18.

Therefore, the amount Janet makes at the farmers' market each day is \(\boxed{18}$.
Ground truth:  18


 40%|████      | 2/5 [01:58<02:51, 57.14s/it]

To determine the total number of bolts of fabric, we need to calculate the amount of blue and white fibers required for each type of robe and then sum them up.

1. **Blue Fiber:**
   - It takes 2 bolts of blue fiber.
   
2. **White Fiber:**
   - It takes half as much white fiber as blue fiber, so:
     \[
     \frac{2}{2} = 1 \text{ bolt of white fiber}
     \]

Now, let's add the number of bolts of each type:

- Total blue bolts: \(2\)
- Total white bolts: \(1\)

Therefore, the total number of bolts is:
\[
2 + 1 = 3
\]

The total number of bolts needed is \(\boxed{3}.
Ground truth:  3


 60%|██████    | 3/5 [03:33<02:28, 74.41s/it]

To determine the profit Josh made, we need to follow these steps:

1. Calculate the new value of the house after the repairs.
2. Determine the increase in value due to the repairs.
3. Find out what the increase in value represents as a percentage of the original value.
4. Subtract this percentage from 100% to find the actual profit.

First, let's calculate the new value of the house after the repairs:
\[ \text{New Value} = \text{Original Value} + (\text{Value Increase} \times \frac{\text{Percentage Increase}}{100}) \]
The value increase is $50,000, and the percentage increase is 150%.

So,
\[ \text{Increase in Value} = 50,000 \times \frac{150}{100} = 50,000 \times 0.15 = 7,500 \]

Now, add this increase to the original value to get the new value:
\[ \text{New Value} = 80,000 + 7,500 = 87,500 \]

Next, subtract the original cost of the house from the new value to find the profit:
\[ \text{Profit} = \text{New Value} - \text{Original Cost} = 87,500 - 8
Ground truth:  70000


 80%|████████  | 4/5 [04:24<01:04, 64.94s/it]

To determine the total distance James runs in a week, we need to follow these steps:

1. Calculate the distance James runs in one sprint.
   - Each sprint is 60 meters.

2. Determine the distance James runs in three sprints.
   - Since he runs 3 times per week and each sprint is 60 meters, the total distance for three sprints is \(3 \times 60 = 180\) meters.

3. Multiply the weekly distance by the number of sprints.
   - The total distance James runs in a week is \(180 \text{ meters/sprint} \times 3 \text{ sprints/week} = 540\) meters.

Therefore, the total distance James runs in a week is \(\boxed{540}$.
Ground truth:  540


100%|██████████| 5/5 [05:48<00:00, 69.68s/it]

To determine how much feed Wendi needs for the final meal of the day, we first calculate the total amount of feed she provides to all her chickens.

In the morning, she gives each chicken 15 cups of feed.
Since there are 20 chickens, the total amount of feed given in the morning is:
\[ 15 \text{ cups/Chicken} \times 20 \text{ chickens} = 300 \text{ cups} \]

In the afternoon, she gives each chicken an additional 25 cups of feed.
So, the total amount of feed given in the afternoon is:
\[ 25 \text{ cups/Chicken} \times 20 \text{ chickens} = 500 \text{ cups} \]

The total amount of feed needed for all chickens in the final meal of the day is the sum of the amounts given in the morning and the afternoon:
\[ 300 \text{ cups} + 500 \text{ cups} = 800 \text{ cups} \]

Therefore, the total number of cups of feed Wendi needs to give her chickens in the final meal of the day is:
\[
\boxed{800}
Ground truth:  20
Evaluation Accuracy: 60.00%



