# Gemma-3 GRPO Fine-Tuning

## Framework Installation

In [None]:
%%capture
# Install Unsloth and vLLM, silencing pip output via %%capture.
# A "COLAB_" environment variable is used to detect Google Colab; there the
# packages are installed with --no-deps so pip does not pull their dependencies.
# NOTE(review): the next cell repeats this exact install and then adds the
# Colab-specific dependency setup, so this cell looks redundant — confirm before removing.
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    !pip install --no-deps unsloth vllm

In [None]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    !pip install --no-deps unsloth vllm
    import sys, re, requests; modules = list(sys.modules.keys())
    for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft "trl==0.15.2" triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    f = requests.get("https://raw.githubusercontent.com/vllm-project/vllm/refs/heads/main/requirements/common.txt").content
    with open("vllm_requirements.txt", "wb") as file:
        file.write(re.sub(rb"(transformers|numpy|xformers)[^\n]{1,}\n", b"", f))
    !pip install -r vllm_requirements.txt

In [None]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Mon May  5 14:04:53 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   34C    P0             49W /  400W |    6401MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

## Import Libraries and Set parameters

In [None]:
import unsloth
from unsloth import FastModel
import os
import torch
import json
import re
from datasets import load_dataset, Dataset
from transformers import TrainingArguments, TextStreamer, TrainerCallback

from peft import LoraConfig
from trl import GRPOTrainer, GRPOConfig
from tqdm.notebook import tqdm
from math import isclose
import random, math

# --- model and sequence budget ---
model_name = "unsloth/gemma-3-1b-it-unsloth-bnb-4bit"
load_in_4bit = True
max_seq_length, max_prompt_length = 1024, 256

# --- LoRA adapter settings ---
lora_r, lora_alpha = 8, 8  # rank and alpha
lora_dropout = 0
lora_bias = "none"
lora_random_state = 3407

# --- training schedule ---
max_training_steps = 100
eval_steps = 10
save_steps = 10
logging_steps = 20
output_dir = "outputs_grpo_finetune_steps"

# --- validation ---
evaluation_batch_size = 128
results_filename = "step_accuracies_grpo.json"

# --- dataset ---
dataset_name, dataset_subset = "openai/gsm8k", "main"

# --- tags the model is asked to wrap its reasoning and final answer in ---
reasoning_start, reasoning_end = "<start_working_out>", "<end_working_out>"
solution_start, solution_end = "<SOLUTION>", "</SOLUTION>"

# System prompt: four lines joined by newlines, with no trailing newline.
system_prompt = "\n".join([
    "You are given a problem.",
    "Think about the problem and provide your working out.",
    f"Place it between {reasoning_start} and {reasoning_end}.",
    f"Then, provide your solution between {solution_start}{solution_end}",
])

# Echo the effective configuration so it is recorded with the cell output.
for summary_line in (
    "Configuration set:",
    f"  Model: {model_name}",
    f"  Max Seq Length: {max_seq_length}",
    f"  LoRA r: {lora_r}, alpha: {lora_alpha}",
    f"  Target Steps: {max_training_steps}",
    f"  Evaluate Every: {eval_steps} steps",
    f"  Save Every: {save_steps} steps",
    f"  Evaluation Batch Size: {evaluation_batch_size}",
    f"  Results File: {results_filename}",
    f"  Dataset: {dataset_name} ({dataset_subset})",
):
    print(summary_line)


Configuration set:
  Model: unsloth/gemma-3-1b-it-unsloth-bnb-4bit
  Max Seq Length: 1024
  LoRA r: 8, alpha: 8
  Target Steps: 100
  Evaluate Every: 10 steps
  Save Every: 10 steps
  Evaluation Batch Size: 128
  Results File: step_accuracies_grpo.json
  Dataset: openai/gsm8k (main)


## Load model and tokenizer

Loads the base model and tokenizer using unsloth and applies LoRA

In [None]:
# Load the base checkpoint and its tokenizer through Unsloth's FastModel,
# using the model name, sequence length and 4-bit flag from the config cell.
print(f"Loading base model '{model_name}' and tokenizer...")
base_load_options = dict(
    model_name=model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
    load_in_8bit=False,
)
model, tokenizer = FastModel.from_pretrained(**base_load_options)
print("Base model and tokenizer loaded.")

Loading base model 'unsloth/gemma-3-1b-it-unsloth-bnb-4bit' and tokenizer...
==((====))==  Unsloth 2025.4.7: Fast Gemma3 patching. Transformers: 4.51.3. vLLM: 0.8.5.post1.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Base model and tokenizer loaded.


In [None]:
# Wrap the loaded model with LoRA adapters and report how many parameters
# end up trainable.
print("Let's add LoRA adapters...")
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
lora_options = dict(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias=lora_bias,
    random_state=lora_random_state,
    target_modules=lora_target_modules,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    finetune_vision_layers=False,
)
model = FastModel.get_peft_model(model, **lora_options)
print("LoRA adapters added.")
print("----Trainable Parameters----")
model.print_trainable_parameters()

Let's add LoRA adapters...
Unsloth: Making `model.base_model.model.model` require gradients
LoRA adapters added.
----Trainable Parameters----
trainable params: 6,522,880 || all params: 1,006,408,832 || trainable%: 0.6481


## Helper Functions

Functions for data processing and regex patterns used in reward functions and evaluation

- `extract_gsm8k_answer`: extracts the final answer from a GSM8K answer string
- `match_format_regex`: regex for matching the full output format
- `match_numbers_regex`: regex for matching the numbers within the solution
- `format_dataset_entry`: formats a single sample into the required structure

In [None]:
def extract_gsm8k_answer(text):
    """
    Return the final answer that follows the last '####' marker of a GSM8K answer string.

    Args:
        text (str): The answer string from the dataset (may be None).

    Returns:
        str or None: The text after the last '####', stripped of surrounding
        whitespace; None when `text` is None or contains no '####'.
    """
    marker = "####"
    has_marker = text is not None and marker in text
    if not has_marker:
        return None
    # Only the segment after the final marker is the answer.
    *_, tail = text.split(marker)
    return tail.strip()

# Smoke-test the extractor on a well-formed answer, a string without the
# marker, and None.
for sample in ('Answer is 123 #### 123', 'No hash here', None):
    print(f"Test extract_gsm8k_answer({sample!r}): {extract_gsm8k_answer(sample)}")

Test extract_gsm8k_answer('Answer is 123 #### 123'): 123
Test extract_gsm8k_answer('No hash here'): None
Test extract_gsm8k_answer(None): None


In [None]:
"""
below regex is used for matching the full output format and is
used in reward functions and evaluation to extract the answer
"""
# Full-format matcher: optional whitespace, a reasoning section wrapped in the
# reasoning tags, then a solution wrapped in the solution tags, then optional
# whitespace. Group 1 captures the text between the solution tags.
# DOTALL lets '.' span newlines; MULTILINE makes '^' / '$' match at line boundaries.
# The pattern pieces are joined by implicit string concatenation.
match_format_regex = re.compile(
    (
        rf"^[\s]{{0,}}"
        rf"{reasoning_start}.+?{reasoning_end}.*?"
        rf"{solution_start}(.+?){solution_end}"
        rf"[\s]{{0,}}$"
    ),
    flags = re.DOTALL | re.MULTILINE,
)
print("Regex 'match_format_regex' for extracting solution compiled.")
print("-----Verifying match_format_regex-----")
format_probe = (
    "<start_working_out>Let me think!<end_working_out>"
    "<SOLUTION>2</SOLUTION>"
)
print(match_format_regex.search(format_probe))

# Number matcher used by the check_numbers reward function: group 1 captures
# the first run of digits and/or dots found after the opening solution tag.
match_numbers_regex = re.compile(
    rf"{solution_start}.*?([\d\.]{{1,}})",
    flags = re.DOTALL | re.MULTILINE,
)
print("Regex 'match_numbers_regex' for extracting numbers from solution compiled.")
numbers_probe = "<SOLUTION>  0.34  </SOLUTION>"
print(match_numbers_regex.findall(numbers_probe))

# Testing regex
# test_string = f"{reasoning_start} 1+1=2 {reasoning_end} {solution_start}2{solution_end}"
# match = match_format_regex.search(test_string)
# print(f"Testing match_format_regex: Match found = {match is not None}, Extracted = {match.group(1) if match else 'N/A'}")
# match_num = match_numbers_regex.search(test_string)
# print(f"Testing match_numbers_regex: Match found = {match_num is not None}, Extracted = {match_num.group(1) if match_num else 'N/A'}")

Regex 'match_format_regex' for extracting solution compiled.
-----Verifying match_format_regex-----
<re.Match object; span=(0, 71), match='<start_working_out>Let me think!<end_working_out>>
Regex 'match_numbers_regex' for extracting numbers from solution compiled.
['0.34']


In [None]:
def format_dataset_entry(example):
    """
    Turn one raw dataset row into the prompt/answer record used for training and evaluation.

    Args:
        example (dict): One dataset row; the 'question' and 'answer' keys are read.

    Returns:
        dict: 'prompt' (system + user chat messages), 'answer' (the string
              extracted by extract_gsm8k_answer, or None if extraction fails)
              and 'question' (the original question, kept for reference).
              When the question or the answer is missing or empty, 'prompt'
              and 'answer' are both None.
    """
    question, raw_answer = example.get("question"), example.get("answer")

    # Incomplete rows are returned with None placeholders.
    if not (question and raw_answer):
        return {"prompt": None, "answer": None, "question": question}

    return {
        "prompt": [
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": question},
        ],
        "answer": extract_gsm8k_answer(raw_answer),
        "question": question,
    }

# example testing
# test_example = {'question': 'What is 2+2?', 'answer': '2+2=4 #### 4'}
# formatted = format_dataset_entry(test_example)
# print("Testing format_dataset_entry:")
# print(json.dumps(formatted, indent=2))

## Data Preparation

Loads the GSM8K dataset and applies the formatting defined earlier to both the train and test splits.

In [None]:
# Download the training split and convert every row to the prompt/answer layout.
print(f"Loading training dataset: {dataset_name} ({dataset_subset}), split='train'")
raw_train_dataset = load_dataset(dataset_name, dataset_subset, split="train")

print("Applying formatting and filtering to training data...")
formatted_train_dataset = raw_train_dataset.map(format_dataset_entry, batched=False)
original_train_size = len(formatted_train_dataset)

# Keep only rows for which both the prompt and the extracted answer exist.
train_dataset = formatted_train_dataset.filter(
    lambda row: not (row["prompt"] is None or row["answer"] is None)
)
filtered_train_size = len(train_dataset)

print(f"Filtered training dataset from {original_train_size} to {filtered_train_size} examples (removed {original_train_size - filtered_train_size}).")
print("Training dataset ready.")
print("\nExample training data point:")
print(train_dataset[0])

Loading training dataset: openai/gsm8k (main), split='train'
Applying formatting and filtering to training data...
Filtered training dataset from 7473 to 7473 examples (removed 0).
Training dataset ready.

Example training data point:
{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'answer': '72', 'prompt': [{'content': 'You are given a problem.\nThink about the problem and provide your working out.\nPlace it between <start_working_out> and <end_working_out>.\nThen, provide your solution between <SOLUTION></SOLUTION>', 'role': 'system'}, {'content': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'role': 'user'}]}


In [None]:
# Download the test split and convert every row to the prompt/answer layout.
print(f"\nLoading test dataset: {dataset_name} ({dataset_subset}), split='test'")
raw_test_dataset = load_dataset(dataset_name, dataset_subset, split="test")

print("Applying formatting and filtering to test data...")
formatted_test_dataset = raw_test_dataset.map(format_dataset_entry, batched=False)
original_test_size = len(formatted_test_dataset)

# Keep only rows for which both the prompt and the extracted answer exist.
test_dataset = formatted_test_dataset.filter(
    lambda row: not (row["prompt"] is None or row["answer"] is None)
)
filtered_test_size = len(test_dataset)

print(f"Filtered test dataset from {original_test_size} to {filtered_test_size} examples (removed {original_test_size - filtered_test_size}).")
print("Test dataset ready.")
print("\nExample test data point:")
print(test_dataset[0])  # Optional: view example


Loading test dataset: openai/gsm8k (main), split='test'
Applying formatting and filtering to test data...
Filtered test dataset from 1319 to 1319 examples (removed 0).
Test dataset ready.

Example test data point:
{'question': "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?", 'answer': '18', 'prompt': [{'content': 'You are given a problem.\nThink about the problem and provide your working out.\nPlace it between <start_working_out> and <end_working_out>.\nThen, provide your solution between <SOLUTION></SOLUTION>', 'role': 'system'}, {'content': "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. H

## Reward Functions

Reward functions used in the GRPO trainer are defined here

In [None]:
# reward function for exact format match
def match_format_exactly(completions, **kwargs):
    """
    Score each completion on whether it follows the full output format.

    A completion whose first message content is matched by
    `match_format_regex` scores 3.0; any other completion scores 0.0.

    Returns:
        list[float]: One score per completion, in input order.
    """
    return [
        3.0 if match_format_regex.search(completion[0]["content"]) is not None else 0.0
        for completion in completions
    ]


In [None]:
# reward function for approximate/partial match
def match_format_approximately(completions, **kwargs):
    """
    Score each completion by how many format tags appear exactly once.

    Each of the four tags (reasoning start/end, solution start/end) adds 0.5
    when it occurs exactly once in the response and subtracts 0.5 otherwise
    (absent or repeated), so a score lies between -2.0 and 2.0.

    Returns:
        list[float]: One score per completion, in input order.
    """
    def _tag_score(response):
        # Sum the per-tag contributions in the same order as the tags are listed.
        return sum(
            (
                0.5 if response.count(tag) == 1 else -0.5
                for tag in (reasoning_start, reasoning_end, solution_start, solution_end)
            ),
            0.0,
        )

    return [_tag_score(completion[0]["content"]) for completion in completions]

In [None]:
# reward function to check answer correctness
def check_answer(prompts, completions, answer, **kwargs):
    """
    Score the answer extracted from each completion against the reference answer.

    The answer is the text captured by `match_format_regex`. Scoring:
      * no format match ........................... 0.0
      * exact string match ........................ 3.0
      * match after stripping whitespace .......... 1.5
      * both parse as floats, reference non-zero:
          ratio within [0.9, 1.1] -> 0.5, within [0.8, 1.2] -> 0.25, else -1.0
      * both parse as floats, reference is zero:
          guess is zero -> 1.5, else -1.0
      * either value is not parseable as a float .. -0.5

    Returns:
        list[float]: One score per (completion, answer) pair.
    """
    def _grade(guess, true_answer):
        if guess is None:
            return 0.0
        if guess == true_answer:
            return 3.0
        if guess.strip() == true_answer.strip():
            return 1.5
        try:
            guess_value = float(guess)
            truth_value = float(true_answer)
        except (ValueError, TypeError):
            return -0.5
        if truth_value == 0:
            return 1.5 if guess_value == 0 else -1.0
        ratio = guess_value / truth_value
        if 0.9 <= ratio <= 1.1:
            return 0.5
        if 0.8 <= ratio <= 1.2:
            return 0.25
        return -1.0

    # Extract every guess first, then grade the guesses pairwise with the answers.
    found = [match_format_regex.search(comp[0]["content"]) for comp in completions]
    guesses = [m.group(1) if m is not None else None for m in found]
    return [_grade(guess, true_answer) for guess, true_answer in zip(guesses, answer)]

In [None]:
# reward function for numerical correctness
def check_numbers(prompts, completions, answer, **kwargs):
    """
    Reward completions whose first number after the solution tag equals the reference.

    The number is the text captured by `match_numbers_regex`. A completion
    scores 1.5 when that text and the stripped reference answer parse to equal
    floats, and 0.0 otherwise (no match, unparseable value, or different number).

    Returns:
        list[float]: One score per (completion, answer) pair.
    """
    def _grade(guess_str, true_answer_str):
        if guess_str is None:
            return 0.0
        try:
            numbers_agree = float(guess_str) == float(true_answer_str.strip())
        except (ValueError, TypeError):
            return 0.0
        return 1.5 if numbers_agree else 0.0

    # Extract every guess first, then grade the guesses pairwise with the answers.
    found = [match_numbers_regex.search(comp[0]["content"]) for comp in completions]
    guesses = [m.group(1) if m is not None else None for m in found]
    return [_grade(guess_str, true_answer_str) for guess_str, true_answer_str in zip(guesses, answer)]


## Evaluation

Function that evaluates the model on the test set; the training callback calls it every `eval_steps` training steps.

In [None]:
def run_evaluation(model_to_eval, tokenizer_to_use, eval_dataset, batch_size, global_step, max_gen_tokens=128):
    """
    Evaluates the model's exact-match accuracy on the provided dataset.

    Prompts are rendered with the tokenizer's chat template and generated in
    batches. The text captured by ``match_format_regex`` is stripped and
    compared with the reference answer. Per-example details are written to
    ``evaluation_details_step_{global_step}.json`` in the working directory.

    Batches are LEFT-padded and generated with an explicit attention mask.
    A decoder-only model continues from the last position of each row, so
    right-padding would make it continue from pad tokens. The tokenizer's
    original ``padding_side`` is restored before returning.

    Notes:
        * Sampling is enabled (``do_sample=True``), so repeated evaluations of
          the same weights can give different accuracies.
        * A completion that is cut off at ``max_gen_tokens`` before the closing
          solution tag cannot match ``match_format_regex`` and counts as wrong.
        * Batches that raise during tokenization/generation are skipped; their
          examples still count in the denominator.

    Args:
        model_to_eval: The model instance to evaluate.
        tokenizer_to_use: The tokenizer associated with the model.
        eval_dataset: The preprocessed dataset (e.g., test split) to evaluate on;
            each item provides 'prompt', 'answer' and 'question'.
        batch_size (int): How many examples to process in parallel.
        global_step (int): The current training step number (for filename).
        max_gen_tokens (int): Max tokens to generate for the answer part.

    Returns:
        float: The accuracy percentage (0.0 to 100.0).
    """
    model_to_eval.eval()
    correct_predictions = 0
    total_predictions = len(eval_dataset)
    results_details = []

    if tokenizer_to_use.pad_token_id is None:
        print("Debug: Setting pad_token_id to eos_token_id.")
        tokenizer_to_use.pad_token_id = tokenizer_to_use.eos_token_id

    # Pad on the left for generation; remember the previous setting so other
    # users of this tokenizer are unaffected.
    original_padding_side = tokenizer_to_use.padding_side
    tokenizer_to_use.padding_side = "left"

    print(f"Starting evaluation for step {global_step} on {total_predictions} examples with batch size {batch_size}...")

    try:
        with torch.no_grad():
            for i in tqdm(range(0, total_predictions, batch_size), desc=f"Evaluating Batches (Step {global_step})"):
                batch_indices = range(i, min(i + batch_size, total_predictions))
                batch_data = [eval_dataset[j] for j in batch_indices]

                prompts_batch = [item['prompt'] for item in batch_data]
                true_answers_batch = [item['answer'] for item in batch_data]
                questions_batch = [item['question'] for item in batch_data]

                # Bound before the try so the error handler can always report it.
                inputs = None
                try:
                    # return_dict=True also yields the attention mask that marks
                    # the padded positions.
                    inputs = tokenizer_to_use.apply_chat_template(
                        prompts_batch,
                        add_generation_prompt=True,
                        tokenize=True,
                        return_tensors="pt",
                        padding=True,
                        return_dict=True,
                    ).to(model_to_eval.device)

                    outputs = model_to_eval.generate(
                        input_ids=inputs["input_ids"],
                        attention_mask=inputs["attention_mask"],
                        max_new_tokens=max_gen_tokens,
                        eos_token_id=tokenizer_to_use.eos_token_id,
                        pad_token_id=tokenizer_to_use.pad_token_id,
                        temperature=1.0, top_p=0.95, top_k=64,
                        do_sample=True,
                    )

                except Exception as e:
                    print(f"\nCaught unexpected error during generate for batch starting at index {i} (Step {global_step}): {e}")
                    print(f"Type of inputs was: {type(inputs)}")
                    continue

                # Every row shares the same padded prompt length; everything
                # after it was generated.
                prompt_length = inputs["input_ids"].shape[1]

                for idx, output_ids in enumerate(outputs):
                    generated_sequence = output_ids[prompt_length:]
                    decoded_text = tokenizer_to_use.decode(generated_sequence, skip_special_tokens=True).strip()

                    # Ensure match_format_regex is defined and accessible
                    try:
                        match = match_format_regex.search(decoded_text)
                        predicted_answer = match.group(1).strip() if match else None
                    except NameError:
                        print("Error: 'match_format_regex' not defined. Cannot extract predicted answer.")
                        predicted_answer = None
                    except Exception as e:
                        print(f"Error extracting answer with regex: {e}")
                        predicted_answer = None

                    true_answer = true_answers_batch[idx]
                    is_correct = (predicted_answer is not None and predicted_answer == true_answer)
                    if is_correct:
                        correct_predictions += 1

                    results_details.append({
                        "question": questions_batch[idx],
                        "generated": decoded_text,
                        "predicted": predicted_answer,
                        "expected": true_answer,
                        "correct": is_correct
                    })
    finally:
        tokenizer_to_use.padding_side = original_padding_side

    accuracy = (correct_predictions / total_predictions) * 100 if total_predictions > 0 else 0.0
    print(f"Evaluation finished for step {global_step}. Correct: {correct_predictions}/{total_predictions}. Accuracy: {accuracy:.2f}%")
    eval_details_filename = f"evaluation_details_step_{global_step}.json"
    try:
        eval_details_filepath = eval_details_filename
        with open(eval_details_filepath, "w") as f:
            json.dump(results_details, f, indent=2)
        print(f"Detailed evaluation results saved to {eval_details_filepath}")
    except Exception as e:
        # Best effort: failing to save details must not abort training.
        print(f"Could not save detailed eval results for step {global_step}: {e}")

    # Put the model back into training mode for the caller.
    model_to_eval.train()
    return accuracy

print("Evaluation function 'run_evaluation' updated to accept global_step and save step-specific JSON.")

Evaluation function 'run_evaluation' updated to accept global_step and save step-specific JSON.


In [None]:
class StepEvaluationCallback(TrainerCallback):
    """
    Trainer callback that evaluates on the test set at a fixed step interval
    and keeps the accuracy of every evaluation in `self.results`.
    """
    def __init__(self, model, tokenizer, test_dataset, eval_batch_size, eval_steps):
        self.model, self.tokenizer = model, tokenizer
        self.test_dataset = test_dataset
        self.eval_batch_size, self.eval_steps = eval_batch_size, eval_steps
        # One {"step": ..., "accuracy": ...} dict per completed evaluation.
        self.results = []

    def on_step_end(self, args, state, control, **kwargs):
        """
        Called at the end of each training step; runs an evaluation when the
        global step is a positive multiple of `eval_steps`.
        """
        step = state.global_step
        evaluation_due = step > 0 and step % self.eval_steps == 0
        if not evaluation_due:
            return

        print(f"\n--- Callback: Evaluating at Step {step} ---")
        accuracy = run_evaluation(
            model_to_eval=self.model,
            tokenizer_to_use=self.tokenizer,
            eval_dataset=self.test_dataset,
            batch_size=self.eval_batch_size,
            global_step=step,
        )
        self.results.append({"step": step, "accuracy": accuracy})

        print(f"--- Callback: Step {step} Test Accuracy: {accuracy:.2f}% ---")
        print(f"--- Callback: Stored results: {self.results} ---")

        # Make sure training resumes in train mode.
        self.model.train()


## Configure Training Parameters

- setting up GRPO parameters

In [None]:
# # --- TEMPORARY SETTINGS FOR 1-STEP TEST ---
# max_training_steps = 1
# eval_steps = 1
# save_steps = 1 # Needs to be >= 1 if save_strategy is 'steps'
# logging_steps = 1
# # ------------------------

In [None]:
print("Defining GRPO training arguments (GRPOConfig)...")

# Number of completions sampled per prompt. The per-device batch size must be a
# multiple of it; the previous value of 1 was silently raised to 2 by Unsloth,
# so the effective batch size is unchanged.
num_generations = 2

training_args = GRPOConfig(
    output_dir = output_dir,
    max_steps = max_training_steps,
    per_device_train_batch_size = num_generations,
    gradient_accumulation_steps = 1,
    learning_rate = 5e-6,
    # Logs every step; the `logging_steps` value from the config cell is not used here.
    logging_steps = 1,
    save_strategy = "steps",
    save_steps = save_steps,
    seed = lora_random_state,

    # --- Optimizer params ---
    optim = "adamw_torch_fused",
    lr_scheduler_type = "cosine",
    warmup_ratio = 0.1,
    weight_decay = 0.1,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    max_grad_norm = 0.1,

    # --- GRPO specific params ---
    num_generations = num_generations,
    max_prompt_length = max_prompt_length,
    # Whatever remains of the sequence budget after the prompt goes to the completion.
    max_completion_length = max_seq_length - max_prompt_length,

    # --- Technical ---
    report_to = "none",
    remove_unused_columns=False,
    # Use bf16 when the GPU supports it, otherwise fall back to fp16.
    bf16 = torch.cuda.is_bf16_supported(),
    fp16 = not torch.cuda.is_bf16_supported(),
    gradient_checkpointing = True,
)

print("GRPOConfig defined.")
print(training_args)

Defining GRPO training arguments (GRPOConfig)...
Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 2
GRPOConfig defined.
UnslothGRPOConfig(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.99,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
beta=0.04,
bf16=True,
bf16_full_eval=False,
data_seed=3407,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tq

## Trainer Definition

In [None]:
# Build the periodic-evaluation callback first so it can be handed to the trainer.
print("Instantiating evaluation callback...")
callback_settings = dict(
    model=model,
    tokenizer=tokenizer,
    test_dataset=test_dataset,
    eval_batch_size=evaluation_batch_size,
    eval_steps=eval_steps,
)
step_eval_callback = StepEvaluationCallback(**callback_settings)
print("Callback instantiated.")

print("\nInstantiating GRPOTrainer...")
# Reward functions passed to the trainer, in this order.
reward_functions = [
    match_format_exactly,
    match_format_approximately,
    check_answer,
    check_numbers,
]
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    reward_funcs=reward_functions,
    callbacks=[step_eval_callback],
)
print("GRPOTrainer instantiated.")

Instantiating evaluation callback...
Callback instantiated.

Instantiating GRPOTrainer...
GRPOTrainer instantiated.


In [None]:
# Announce the run using the configured values: the training length is a number
# of steps (it is passed as max_steps), and the evaluation interval is eval_steps.
print(f"\nStarting GRPO fine-tuning for {max_training_steps} steps...")
print(f"Checkpoints will be saved to '{output_dir}'.")
print(f"Evaluation on the test set will run after every {eval_steps} steps.")

# Start the training process
training_results = trainer.train()

print("\nTraining finished.")
print(f"Training Metrics: {training_results.metrics}")


Starting GRPO fine-tuning for 100 epochs...
Checkpoints will be saved to 'outputs_grpo_finetune_steps'.
Evaluation on the test set will run after every 50 steps.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2
 "-____-"     Trainable parameters = 6,522,880/1,000,000,000 (0.65% trained)


Step,Training Loss,reward,reward_std,completion_length,kl,rewards / match_format_exactly,rewards / match_format_approximately,rewards / check_answer,rewards / check_numbers
1,0.0,-2.0,0.0,650.0,0.0,0.0,-2.0,0.0,0.0
2,0.0,-1.0,0.0,695.0,0.0,0.0,-1.0,0.0,0.0
3,0.0,-1.0,1.414214,476.5,0.0,0.0,-1.0,0.0,0.0
4,0.0,-1.0,0.0,768.0,0.000481,0.0,-1.0,0.0,0.0
5,0.0,-1.0,0.0,18.0,0.000767,0.0,-1.0,0.0,0.0
6,0.0,-1.0,0.0,768.0,0.000616,0.0,-1.0,0.0,0.0
7,0.0001,-1.5,0.707107,432.5,0.001868,0.0,-1.5,0.0,0.0
8,0.0,-1.0,0.0,768.0,0.001064,0.0,-1.0,0.0,0.0
9,0.0,-2.0,0.0,528.5,0.000682,0.0,-2.0,0.0,0.0
10,0.0,-1.0,0.0,768.0,0.000305,0.0,-1.0,0.0,0.0



--- Callback: Evaluating at Step 10 ---
Starting evaluation for step 10 on 1319 examples with batch size 128...


Evaluating Batches (Step 10):   0%|          | 0/11 [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

Evaluation finished for step 10. Correct: 0/1319. Accuracy: 0.00%
Detailed evaluation results saved to evaluation_details_step_10.json
--- Callback: Step 10 Test Accuracy: 0.00% ---
--- Callback: Stored results: [{'step': 10, 'accuracy': 0.0}] ---

--- Callback: Evaluating at Step 20 ---
Starting evaluation for step 20 on 1319 examples with batch size 128...


Evaluating Batches (Step 20):   0%|          | 0/11 [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

Evaluation finished for step 20. Correct: 0/1319. Accuracy: 0.00%
Detailed evaluation results saved to evaluation_details_step_20.json
--- Callback: Step 20 Test Accuracy: 0.00% ---
--- Callback: Stored results: [{'step': 10, 'accuracy': 0.0}, {'step': 20, 'accuracy': 0.0}] ---

--- Callback: Evaluating at Step 30 ---
Starting evaluation for step 30 on 1319 examples with batch size 128...


Evaluating Batches (Step 30):   0%|          | 0/11 [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

Evaluation finished for step 30. Correct: 0/1319. Accuracy: 0.00%
Detailed evaluation results saved to evaluation_details_step_30.json
--- Callback: Step 30 Test Accuracy: 0.00% ---
--- Callback: Stored results: [{'step': 10, 'accuracy': 0.0}, {'step': 20, 'accuracy': 0.0}, {'step': 30, 'accuracy': 0.0}] ---

--- Callback: Evaluating at Step 40 ---
Starting evaluation for step 40 on 1319 examples with batch size 128...


Evaluating Batches (Step 40):   0%|          | 0/11 [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

Evaluation finished for step 40. Correct: 0/1319. Accuracy: 0.00%
Detailed evaluation results saved to evaluation_details_step_40.json
--- Callback: Step 40 Test Accuracy: 0.00% ---
--- Callback: Stored results: [{'step': 10, 'accuracy': 0.0}, {'step': 20, 'accuracy': 0.0}, {'step': 30, 'accuracy': 0.0}, {'step': 40, 'accuracy': 0.0}] ---

--- Callback: Evaluating at Step 50 ---
Starting evaluation for step 50 on 1319 examples with batch size 128...


Evaluating Batches (Step 50):   0%|          | 0/11 [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

Evaluation finished for step 50. Correct: 0/1319. Accuracy: 0.00%
Detailed evaluation results saved to evaluation_details_step_50.json
--- Callback: Step 50 Test Accuracy: 0.00% ---
--- Callback: Stored results: [{'step': 10, 'accuracy': 0.0}, {'step': 20, 'accuracy': 0.0}, {'step': 30, 'accuracy': 0.0}, {'step': 40, 'accuracy': 0.0}, {'step': 50, 'accuracy': 0.0}] ---

--- Callback: Evaluating at Step 60 ---
Starting evaluation for step 60 on 1319 examples with batch size 128...


Evaluating Batches (Step 60):   0%|          | 0/11 [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

Evaluation finished for step 60. Correct: 0/1319. Accuracy: 0.00%
Detailed evaluation results saved to evaluation_details_step_60.json
--- Callback: Step 60 Test Accuracy: 0.00% ---
--- Callback: Stored results: [{'step': 10, 'accuracy': 0.0}, {'step': 20, 'accuracy': 0.0}, {'step': 30, 'accuracy': 0.0}, {'step': 40, 'accuracy': 0.0}, {'step': 50, 'accuracy': 0.0}, {'step': 60, 'accuracy': 0.0}] ---

--- Callback: Evaluating at Step 70 ---
Starting evaluation for step 70 on 1319 examples with batch size 128...


Evaluating Batches (Step 70):   0%|          | 0/11 [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

Evaluation finished for step 70. Correct: 0/1319. Accuracy: 0.00%
Detailed evaluation results saved to evaluation_details_step_70.json
--- Callback: Step 70 Test Accuracy: 0.00% ---
--- Callback: Stored results: [{'step': 10, 'accuracy': 0.0}, {'step': 20, 'accuracy': 0.0}, {'step': 30, 'accuracy': 0.0}, {'step': 40, 'accuracy': 0.0}, {'step': 50, 'accuracy': 0.0}, {'step': 60, 'accuracy': 0.0}, {'step': 70, 'accuracy': 0.0}] ---


Step,Training Loss,reward,reward_std,completion_length,kl,rewards / match_format_exactly,rewards / match_format_approximately,rewards / check_answer,rewards / check_numbers
1,0.0,-2.0,0.0,650.0,0.0,0.0,-2.0,0.0,0.0
2,0.0,-1.0,0.0,695.0,0.0,0.0,-1.0,0.0,0.0
3,0.0,-1.0,1.414214,476.5,0.0,0.0,-1.0,0.0,0.0
4,0.0,-1.0,0.0,768.0,0.000481,0.0,-1.0,0.0,0.0
5,0.0,-1.0,0.0,18.0,0.000767,0.0,-1.0,0.0,0.0
6,0.0,-1.0,0.0,768.0,0.000616,0.0,-1.0,0.0,0.0
7,0.0001,-1.5,0.707107,432.5,0.001868,0.0,-1.5,0.0,0.0
8,0.0,-1.0,0.0,768.0,0.001064,0.0,-1.0,0.0,0.0
9,0.0,-2.0,0.0,528.5,0.000682,0.0,-2.0,0.0,0.0
10,0.0,-1.0,0.0,768.0,0.000305,0.0,-1.0,0.0,0.0



--- Callback: Evaluating at Step 80 ---
Starting evaluation for step 80 on 1319 examples with batch size 128...


Evaluating Batches (Step 80):   0%|          | 0/11 [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

Evaluation finished for step 80. Correct: 0/1319. Accuracy: 0.00%
Detailed evaluation results saved to evaluation_details_step_80.json
--- Callback: Step 80 Test Accuracy: 0.00% ---
--- Callback: Stored results: [{'step': 10, 'accuracy': 0.0}, {'step': 20, 'accuracy': 0.0}, {'step': 30, 'accuracy': 0.0}, {'step': 40, 'accuracy': 0.0}, {'step': 50, 'accuracy': 0.0}, {'step': 60, 'accuracy': 0.0}, {'step': 70, 'accuracy': 0.0}, {'step': 80, 'accuracy': 0.0}] ---

--- Callback: Evaluating at Step 90 ---
Starting evaluation for step 90 on 1319 examples with batch size 128...


Evaluating Batches (Step 90):   0%|          | 0/11 [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

Evaluation finished for step 90. Correct: 0/1319. Accuracy: 0.00%
Detailed evaluation results saved to evaluation_details_step_90.json
--- Callback: Step 90 Test Accuracy: 0.00% ---
--- Callback: Stored results: [{'step': 10, 'accuracy': 0.0}, {'step': 20, 'accuracy': 0.0}, {'step': 30, 'accuracy': 0.0}, {'step': 40, 'accuracy': 0.0}, {'step': 50, 'accuracy': 0.0}, {'step': 60, 'accuracy': 0.0}, {'step': 70, 'accuracy': 0.0}, {'step': 80, 'accuracy': 0.0}, {'step': 90, 'accuracy': 0.0}] ---

--- Callback: Evaluating at Step 100 ---
Starting evaluation for step 100 on 1319 examples with batch size 128...


Evaluating Batches (Step 100):   0%|          | 0/11 [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

Evaluation finished for step 100. Correct: 0/1319. Accuracy: 0.00%
Detailed evaluation results saved to evaluation_details_step_100.json
--- Callback: Step 100 Test Accuracy: 0.00% ---
--- Callback: Stored results: [{'step': 10, 'accuracy': 0.0}, {'step': 20, 'accuracy': 0.0}, {'step': 30, 'accuracy': 0.0}, {'step': 40, 'accuracy': 0.0}, {'step': 50, 'accuracy': 0.0}, {'step': 60, 'accuracy': 0.0}, {'step': 70, 'accuracy': 0.0}, {'step': 80, 'accuracy': 0.0}, {'step': 90, 'accuracy': 0.0}, {'step': 100, 'accuracy': 0.0}] ---

Training finished.
Training Metrics: {'train_runtime': 9540.4468, 'train_samples_per_second': 0.021, 'train_steps_per_second': 0.01, 'total_flos': 0.0, 'train_loss': 0.00014547548022505907}


## Saving Evaluation Results

In [None]:
# Persist the accuracy history gathered by the step-evaluation callback to
# `results_filename` as JSON, then echo it to the cell output.
# Failures are reported rather than raised so that the cells after this one
# can still run after a long training job.
print(f"\nSaving step evaluation results to '{results_filename}'...")
try:
    # Pull the collected results off step_eval_callback
    # (per the training log: a list of {'step': ..., 'accuracy': ...} dicts).
    final_eval_results = step_eval_callback.results
    if final_eval_results:
        with open(results_filename, "w") as f:
            json.dump(final_eval_results, f, indent=4)
        print("Results saved successfully.")
        print("\nFinal evaluation results per evaluation step:")
        print(json.dumps(final_eval_results, indent=4))
    else:
        # Empty or None: nothing to write, so no file is created.
        print("Warning: No evaluation results were found in the callback.")
except Exception as e:
    print(f"Error saving evaluation results: {e}")
    # Best-effort dump of whatever the callback holds, to help debugging.
    try:
        print("Callback results object:", step_eval_callback.results)
    except Exception:
        # Catch Exception only: a bare `except:` would also swallow
        # KeyboardInterrupt / SystemExit and make the cell hard to interrupt.
        print("Could not access callback results.")

## Inference Example

In [None]:
# Smoke-test the trained model: build one chat prompt and stream a sampled reply.
print("\n--- Running Inference Example on Final Model ---")
model.eval()  # put the model in evaluation mode before generating

# One system turn (the shared system prompt) followed by a single user question.
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user",   "content": "What is the square root of 101?"},
]

# Render the conversation through the chat template as a plain string
# (tokenize=False), ending with the generation prompt for the model's turn.
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)
print(f"Input text:\n{text}")

print("\nGenerated Response:")
# Tokenize the rendered prompt and move the tensors to the model's device.
inputs = tokenizer(text, return_tensors="pt").to(model.device)

# The streamer prints the completion as it is produced, omitting the prompt.
response_streamer = TextStreamer(tokenizer, skip_prompt=True)
sampling_options = dict(
    max_new_tokens=128,
    do_sample=True,
    temperature=1.0,
    top_p=0.95,
    top_k=64,
)
# The return value is discarded; the streamer already printed the text.
_ = model.generate(**inputs, streamer=response_streamer, **sampling_options)
print("\n--- Inference Example Done ---")


--- Running Inference Example on Final Model ---
Input text:
<bos><start_of_turn>user
You are given a problem.
Think about the problem and provide your working out.
Place it between <start_working_out> and <end_working_out>.
Then, provide your solution between <SOLUTION></SOLUTION>

What is the square root of 101?<end_of_turn>
<start_of_turn>model


Generated Response:
<SOLUTION>The square root of 101 is 10.0997...
<end_of_turn>

--- Inference Example Done ---


## Model Saving

In [None]:
# Optionally export the model with its LoRA weights merged, in 16-bit precision.
# Set the flag to False to skip the export.
save_merged_float16 = True
merged_float16_path = f"{output_dir}/final_merged_float16"

if not save_merged_float16:
    print("\nSkipping saving merged float16 model.")
else:
    print(f"\nSaving merged float16 model to '{merged_float16_path}'...")
    # Create the target directory first; an existing one is reused.
    os.makedirs(merged_float16_path, exist_ok=True)
    # save_method="merged_16bit" merges the LoRA weights and writes the
    # result, along with the tokenizer, to the target directory.
    model.save_pretrained_merged(
        merged_float16_path,
        tokenizer,
        save_method="merged_16bit",
    )
    print("Merged float16 model saved.")


Saving merged float16 model to 'outputs_grpo_finetune_steps/final_merged_float16'...


Unsloth: Merging weights into 16bit:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [00:15<00:00, 15.59s/it]


Merged float16 model saved.
