In [1]:
# Install Unsloth and GRPO dependencies
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install trl peft accelerate bitsandbytes

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-p9dcv1aq/unsloth_1b5158992ecc48a68736d50158e24836
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-p9dcv1aq/unsloth_1b5158992ecc48a68736d50158e24836
  Resolved https://github.com/unslothai/unsloth.git to commit 1c0ad844f170f67c7cdf6f7a9465bafb0f9627df
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.11.3 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.11.3-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.gi

In [2]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048
dtype = None
load_in_4bit = True

# Load SmolLM2-135M for GRPO reasoning training
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/SmolLM2-135M-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

print("Model loaded for GRPO training!")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.11.2: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/423 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Model loaded for GRPO training!


In [3]:
# Configure LoRA for GRPO
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
)

print("LoRA configured for GRPO!")
model.print_trainable_parameters()

Unsloth 2025.11.2 patched 30 layers with 30 QKV layers, 30 O layers and 30 MLP layers.


LoRA configured for GRPO!
trainable params: 4,884,480 || all params: 139,400,064 || trainable%: 3.5039


In [15]:
from datasets import load_dataset

# Load GSM8K math reasoning dataset
dataset = load_dataset("openai/gsm8k", "main", split="train")

# Use only 50 examples for quick demo
dataset = dataset.select(range(50))

print(f"Dataset loaded: {len(dataset)} examples (reduced for speed)")
print(f"Columns: {dataset.column_names}\n")

# Show examples
print("="*70)
print("EXAMPLE 1: Math Problem")
print("="*70)
print(f"\nQuestion: {dataset[0]['question']}")
print(f"\nAnswer: {dataset[0]['answer']}")

print("\n" + "="*70)
print("EXAMPLE 2: Math Problem")
print("="*70)
print(f"\nQuestion: {dataset[1]['question']}")
print(f"\nAnswer: {dataset[1]['answer']}")

Dataset loaded: 50 examples (reduced for speed)
Columns: ['question', 'answer']

EXAMPLE 1: Math Problem

Question: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?

Answer: Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72

EXAMPLE 2: Math Problem

Question: Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?

Answer: Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.
Working 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.
#### 10


In [17]:
# GRPO needs prompts for generation
def format_for_grpo(examples):
    prompts = []
    for question in examples['question']:
        prompt = f"Solve this math problem step by step:\n\nQuestion: {question}\n\nAnswer:"
        prompts.append(prompt)
    return {"prompt": prompts}

dataset = dataset.map(format_for_grpo, batched=True, remove_columns=['question', 'answer'])

print("Dataset formatted for GRPO!")
print(f"Total examples: {len(dataset)}")
print(f"\nSample prompt:\n{dataset[0]['prompt']}")

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Dataset formatted for GRPO!
Total examples: 50

Sample prompt:
Solve this math problem step by step:

Question: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?

Answer:


In [18]:
import re

def extract_answer(text):
    """Extract numerical answer from response"""
    # Look for #### ANSWER pattern (GSM8K format)
    match = re.search(r'####\s*(\d+)', text)
    if match:
        return int(match.group(1))

    # Look for any number at the end
    numbers = re.findall(r'\d+', text)
    if numbers:
        return int(numbers[-1])

    return None

def reward_function(prompts, completions, **kwargs):
    """
    GRPO reward function
    Args:
        prompts: List of input prompts
        completions: List of generated completions
    Returns:
        List of rewards (1.0 for correct, 0.0 for incorrect)
    """
    rewards = []

    for prompt, completion in zip(prompts, completions):
        # Extract answer from completion
        pred_answer = extract_answer(completion)

        # Reward if answer is reasonable
        if pred_answer is not None and pred_answer > 0:
            rewards.append(1.0)
        else:
            rewards.append(0.0)

    return rewards

# Test reward function
test_prompts = ["Solve: 2 + 2"]
test_completions = ["The answer is 4. #### 4"]
test_rewards = reward_function(test_prompts, test_completions)
print(f"\nTest reward: {test_rewards[0]} (should be 1.0)")


Test reward: 1.0 (should be 1.0)


In [19]:
from trl import GRPOTrainer, GRPOConfig
from transformers import GenerationConfig

# GRPO configuration - optimized for speed
training_args = GRPOConfig(
    output_dir="outputs",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=5e-6,
    logging_steps=1,
    max_steps=10,  # Only 10 steps for quick demo
    num_generations=2,  # 2 generations per prompt
    temperature=0.7,
    report_to="none",
)

# Generation config - shorter outputs for speed
generation_config = GenerationConfig(
    max_new_tokens=128,
    temperature=0.7,
    do_sample=True,
    top_p=0.9,
)

# Create GRPO trainer
grpo_trainer = GRPOTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset,
    reward_funcs=[reward_function],
    generation_config=generation_config,
)

print("GRPO Trainer configured!")
print(f"Max steps: {training_args.max_steps}")
print(f"Generations per prompt: {training_args.num_generations}")
print(f"Max tokens per generation: {generation_config.max_new_tokens}")

GRPO Trainer configured!
Max steps: 10
Generations per prompt: 2
Max tokens per generation: 128


In [20]:
# Start GRPO training
import time
start_time = time.time()

trainer_stats = grpo_trainer.train()

training_time = time.time() - start_time
print(f"\nGRPO training completed in {training_time/60:.2f} minutes")

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 50 | Num Epochs = 1 | Total steps = 10
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 1 x 1) = 4
 "-____-"     Trainable parameters = 4,884,480 of 139,400,064 (3.50% trained)


Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,sampling / sampling_logp_difference / mean,sampling / sampling_logp_difference / max,sampling / importance_sampling_ratio / min,sampling / importance_sampling_ratio / mean,sampling / importance_sampling_ratio / max,kl,rewards / reward_function / mean,rewards / reward_function / std
1,0.0,0.5,0.0,76.75,1.0,256.0,0.25,17.0,1.0,49.0,0,0,0,0,0,8.8e-05,0.5,0.57735
2,0.0,0.75,0.353553,113.5,1.0,256.0,0.25,66.0,1.0,104.0,No Log,No Log,No Log,No Log,No Log,0.000475,0.75,0.5
3,0.0,1.0,0.0,195.5,16.0,256.0,0.5,135.0,16.0,254.0,No Log,No Log,No Log,No Log,No Log,2.2e-05,1.0,0.0
4,0.0,0.75,0.353553,192.25,1.0,256.0,0.75,1.0,1.0,1.0,No Log,No Log,No Log,No Log,No Log,0.000499,0.75,0.5
5,0.0,0.75,0.353553,143.75,1.0,256.0,0.25,106.333336,1.0,239.0,No Log,No Log,No Log,No Log,No Log,0.000274,0.75,0.5
6,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,No Log,No Log,No Log,No Log,No Log,0.001015,0.0,0.0
7,0.0,0.75,0.353553,256.0,256.0,256.0,1.0,0.0,0.0,0.0,No Log,No Log,No Log,No Log,No Log,3e-05,0.75,0.5
8,0.0,1.0,0.0,256.0,256.0,256.0,1.0,0.0,0.0,0.0,No Log,No Log,No Log,No Log,No Log,2.6e-05,1.0,0.0
9,0.0,1.0,0.0,256.0,256.0,256.0,1.0,0.0,0.0,0.0,No Log,No Log,No Log,No Log,No Log,2.4e-05,1.0,0.0
10,0.0,0.75,0.353553,173.25,1.0,256.0,0.5,90.5,1.0,180.0,No Log,No Log,No Log,No Log,No Log,0.00013,0.75,0.5



GRPO training completed in 3.59 minutes


In [21]:
# Enable inference
FastLanguageModel.for_inference(model)

# Test math problems
test_problems = [
    "If John has 5 apples and gives 2 to Mary, how many does he have left?",
    "A store has 24 items. If they sell half, how many remain?",
    "Calculate: 15 + 27",
]

print("="*70)
print("TESTING GRPO-TRAINED REASONING MODEL")
print("="*70)

for i, problem in enumerate(test_problems, 1):
    prompt = f"Solve this math problem step by step:\n\nQuestion: {problem}\n\nAnswer:"

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.3,
        do_sample=True,
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = response[len(prompt):].strip()

    print(f"\nProblem {i}: {problem}")
    print(f"Response: {answer[:250]}")
    print("-"*70)

TESTING GRPO-TRAINED REASONING MODEL

Problem 1: If John has 5 apples and gives 2 to Mary, how many does he have left?
Response: John has 5 apples + 2 apples = 7 apples.

Step 2: Mary has 7 apples and gives 2 apples to John.

Step 3: Mary has 7 apples + 2 apples = 8 apples.

Step 4: Since John has 7 apples and Mary has 8 apples, and both have the same number of apples, they ar
----------------------------------------------------------------------

Problem 2: A store has 24 items. If they sell half, how many remain?
Response: 24 / 2 = 12

Step 2: 12 - 24 = 8

So, 8 - 12 = 6

Step 3: 6 - 12 = 4

So, 4 - 12 = 2

Step 4: 2 - 12 = 1

So, 1 - 12 = 0

Step 5: 0 - 12 = 0

So, 0 - 12 = 0

So, 0 - 12 = 0

Step 6: 0 - 12 =
----------------------------------------------------------------------

Problem 3: Calculate: 15 + 27
Response: 32

Step 1:
First, I add 27 to 15 to get 31.

Step 2:
Next, I subtract 27 from 15 to get 9.

Step 3:
Next, I subtract 9 from 31 to get 6.

Step 4:
Next, I subtract 6 

In [22]:
# Show GRPO's multi-generation capability
print("="*70)
print("GRPO MULTI-GENERATION DEMONSTRATION")
print("="*70)
print("\nGRPO generates multiple attempts and learns from rewards")
print("Generating 3 different solutions to the same problem:\n")

problem = "A baker makes 48 cookies and packs them equally into 6 boxes. How many cookies per box?"
prompt = f"Solve this math problem step by step:\n\nQuestion: {problem}\n\nAnswer:"

for attempt in range(3):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=120,
        temperature=0.8,
        do_sample=True,
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = response[len(prompt):].strip()

    print(f"\nAttempt {attempt + 1}:")
    print(answer[:200])
    print("-"*70)

print("\nGRPO learns to prefer correct reasoning paths!")

GRPO MULTI-GENERATION DEMONSTRATION

GRPO generates multiple attempts and learns from rewards
Generating 3 different solutions to the same problem:


Attempt 1:
If a box is packed with cookies how many boxes will there be -

Answer: The baker packs 12 cookies into 4 boxes.
----------------------------------------------------------------------

Attempt 2:

----------------------------------------------------------------------

Attempt 3:

----------------------------------------------------------------------

GRPO learns to prefer correct reasoning paths!


In [23]:
# Save GRPO-trained reasoning model
save_dir = "smollm2_grpo_reasoning"

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"GRPO model saved to: {save_dir}")

# Check size
!du -sh {save_dir}

GRPO model saved to: smollm2_grpo_reasoning
24M	smollm2_grpo_reasoning
