In [1]:
!pip install -q torch transformers trl peft bitsandbytes accelerate datasets wandb

In [2]:
pip install flash-attn --no-build-isolation



In [3]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig
from trl import GRPOConfig, GRPOTrainer
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import wandb
import re
from peft import prepare_model_for_kbit_training, get_peft_model

In [4]:
# Authenticate with Weights & Biases; the training config below reports to "wandb".
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33myiboc[0m ([33myiboc-massachusetts-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
# Set the W&B project name through the environment before the trainer is created.
%env WANDB_PROJECT=grpo-qwen-gsm8k

env: WANDB_PROJECT=grpo-qwen-gsm8k


In [11]:
# Hugging Face checkpoint that will be fine-tuned.
model_id = "Qwen/Qwen2.5-1.5B-Instruct"

# Tokenizer for the checkpoint; padding reuses the end-of-sequence token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Load the weights in bfloat16 with the FlashAttention-2 attention backend.
# No device map is requested, so the model is moved to the GPU explicitly.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=None,
    attn_implementation="flash_attention_2",
    dtype=torch.bfloat16,
)
model = model.to("cuda")


In [7]:
# System message sent with every problem. It describes the <think>/<answer>
# layout that the reward functions in this notebook look for.
SYSTEM_PROMPT = """
Respond to the user's math problem.
You must format your output as follows:
<think>
{reasoning}
</think>
<answer>
{final_answer}
</answer>
"""

def format_data(example):
    """Build the chat-style prompt for one dataset row.

    Args:
        example: Mapping with a "question" entry holding the problem text.

    Returns:
        dict: ``{"prompt": [system_message, user_message]}`` where the system
        message carries ``SYSTEM_PROMPT`` and the user message carries the
        question.
    """
    system_turn = dict(role="system", content=SYSTEM_PROMPT)
    user_turn = dict(role="user", content=example["question"])
    return dict(prompt=[system_turn, user_turn])

# Load the GSM8K "main" training split and run format_data over every row,
# which supplies the "prompt" entry used for training.
dataset = load_dataset("openai/gsm8k", "main", split="train").map(format_data)

In [8]:
def extract_xml_answer(text: str) -> str:
    """Return the stripped text found inside the last ``<answer>`` element.

    Everything after the final ``<answer>`` tag is kept (the whole input when
    the tag is absent), then cut at the first ``</answer>`` if one follows.
    """
    after_open = text.rpartition("<answer>")[2]
    before_close = after_open.partition("</answer>")[0]
    return before_close.strip()

# Reward 1: Format (Did they use the tags?)
def format_reward_func(completions, **kwargs):
    """Score each completion on whether it follows the think/answer layout.

    Args:
        completions: One list of chat messages per sample; only the "content"
            of the first message is inspected.
        **kwargs: Accepted and ignored.

    Returns:
        list[float]: 0.5 when a ``<think>...</think>`` element followed
        (optionally after whitespace) by an ``<answer>...</answer>`` element
        occurs anywhere in the content, otherwise 0.0.
    """
    # DOTALL lets ".*?" span the newlines inside the reasoning and the answer.
    layout = re.compile(r"<think>.*?</think>\s*<answer>.*?</answer>", re.DOTALL)
    scores = []
    for completion in completions:
        body = completion[0]["content"]
        scores.append(0.5 if layout.search(body) is not None else 0.0)
    return scores

# Reward 2: Correctness (Does the number match?)
def correctness_reward_func(prompts, completions, answer, **kwargs):
    """Reward completions whose ``<answer>`` content equals the reference answer.

    Both sides are normalized before comparison, so formatting differences
    that do not change the value ("1,000" vs "1000", "$18" vs "18",
    "18.0" vs "18") still earn the reward. Strings that matched exactly
    before this normalization continue to match.

    Args:
        prompts: Prompts of the batch. Unused; kept for the call signature.
        completions: One list of chat messages per sample; the "content" of
            the first message is scored.
        answer: Reference solution strings. The final answer is taken as the
            text after the last "#### " marker.
        **kwargs: Accepted and ignored.

    Returns:
        list[float]: 2.0 for each completion whose extracted answer matches
        its reference, 0.0 otherwise.
    """
    def _canonical(value):
        # Drop thousands separators, a currency sign and a trailing period,
        # then compare numerically when the remainder parses as a number.
        cleaned = value.strip().replace(",", "").replace("$", "").rstrip(".").strip()
        try:
            return float(cleaned)
        except ValueError:
            # Non-numeric text falls back to plain string comparison.
            return cleaned

    rewards = []
    for completion, reference in zip(completions, answer):
        predicted = extract_xml_answer(completion[0]["content"])
        gold = reference.split("#### ")[-1]
        matched = _canonical(predicted) == _canonical(gold)
        rewards.append(2.0 if matched else 0.0)  # High reward for a correct answer
    return rewards

In [12]:
# GRPO hyperparameters, grouped by concern.
training_args = GRPOConfig(
    # Output and logging
    output_dir="qwen-grpo-gsm8k",
    report_to="wandb",             # send metrics to Weights & Biases
    logging_steps=1,
    # Batch shape (reduce these first when running out of GPU memory)
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    num_generations=8,             # decrease to 2 if OOM
    # Sequence length limits (short limits save memory)
    max_prompt_length=512,
    max_completion_length=512,
    # Optimization
    learning_rate=5e-6,
    max_steps=200,
    # Mixed precision: bfloat16 on, float16 off.
    # NOTE: bf16=True needs a GPU with bfloat16 support.
    bf16=True,
    fp16=False,
)

# LoRA adapter settings: rank 16, alpha 32, applied to the listed attention
# and MLP projection modules of a causal language model.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    target_modules=[
        "q_proj",
        "v_proj",
        "k_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# Assemble the trainer from the model, tokenizer, prompts, LoRA settings and
# the two reward functions (format first, correctness second).
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=dataset,
    args=training_args,
    peft_config=peft_config,
    reward_funcs=[format_reward_func, correctness_reward_func],
)

In [13]:
# Start GRPO training with the trainer configured above (max_steps=200 in the config).
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151645}.


Step,Training Loss
1,0.1811
2,-0.0928
3,0.0197
4,-0.209
5,0.086
6,-0.0327
7,0.2666
8,0.0069
9,-0.0757
10,0.1207


KeyboardInterrupt: 