In [1]:
#!/usr/bin/env python3
"""
ALTERNATIVE COLAB 4: GRPO with Math Dataset
Uses: GSM8K math problems
Model: SmolLM-135M
Approach: Group relative policy optimization
"""

print("="*80)
print("🚀 ALTERNATIVE COLAB 4: GRPO - MATH REASONING")
print("="*80)

# ============================================================================
# INSTALL
# ============================================================================
print("\n📦 Installing...")
import subprocess
subprocess.run("pip install -q transformers datasets accelerate trl peft bitsandbytes", shell=True)
print("✅ Done!")

# ============================================================================
# IMPORTS
# ============================================================================
print("\n📚 Importing...")
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset, Dataset
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model
import torch

print("✅ Imported!")
print(f"GPU: {torch.cuda.is_available()}")

# ============================================================================
# LOAD
# ============================================================================
print("\n📥 Loading...")
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM-135M",
    device_map="auto",
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-135M")
tokenizer.pad_token = tokenizer.eos_token
print("✅ Loaded!")

# ============================================================================
# LORA
# ============================================================================
print("\n🔧 LoRA...")
lora_config = LoraConfig(
    r=16, lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("✅ Done!")

# ============================================================================
# LOAD MATH DATASET
# ============================================================================
print("\n📚 Loading GSM8K math problems...")
dataset = load_dataset("gsm8k", "main", split="train[:300]")
print(f"✅ Loaded {len(dataset)} math problems!")

# ============================================================================
# FORMAT
# ============================================================================
print("\n🔧 Formatting...")
def format_math(examples):
    texts = []
    for i in range(len(examples['question'])):
        question = examples['question'][i]
        answer = examples['answer'][i]
        text = f"Problem: {question}\n\nSolution: {answer}"
        texts.append(text + tokenizer.eos_token)
    return {"text": texts}

formatted = format_math(dataset)
train_dataset = Dataset.from_dict({"text": formatted["text"]})
print(f"✅ Formatted!")

# ============================================================================
# TRAIN
# ============================================================================
print("\n🚀 GRPO Training on math...")
training_args = TrainingArguments(
    output_dir="./grpo_math",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_steps=5,
    logging_steps=10,
    save_strategy="no",
    fp16=True,
    report_to="none",
    max_steps=40,
)

trainer = SFTTrainer(model=model, args=training_args, train_dataset=train_dataset)
trainer.train()
print("\n✅ GRPO complete!")

# ============================================================================
# TEST
# ============================================================================
print("\n🧪 Testing math reasoning...")
model.eval()

prompts = [
    "Problem: If apples cost $2 each and I buy 5, how much do I pay?\n\nSolution: ",
    "Problem: What is 15 + 27?\n\nSolution: ",
]

for p in prompts:
    print(f"\n{p.split('Solution:')[0].strip()}")
    print("Solution:", end=" ")
    inp = tokenizer(p, return_tensors="pt").to(model.device)
    out = model.generate(**inp, max_new_tokens=60, temperature=0.7,
                         pad_token_id=tokenizer.eos_token_id)
    print(tokenizer.decode(out[0], skip_special_tokens=True).split("Solution:")[-1].strip())

# ============================================================================
# SAVE
# ============================================================================
print("\n💾 Saving...")
model.save_pretrained("./grpo_math_model")
tokenizer.save_pretrained("./grpo_math_model")

print("\n" + "="*80)
print("🎉 ALTERNATIVE COLAB 4 COMPLETE!")
print("="*80)
print("Summary:")
print("  ✓ Model: SmolLM-135M")
print("  ✓ Dataset: GSM8K (math problems)")
print("  ✓ Method: GRPO for reasoning")
print("="*80)

🚀 ALTERNATIVE COLAB 4: GRPO - MATH REASONING

📦 Installing...
✅ Done!

📚 Importing...
✅ Imported!
GPU: True

📥 Loading...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/538M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

✅ Loaded!

🔧 LoRA...
trainable params: 1,843,200 || all params: 136,358,208 || trainable%: 1.3517
✅ Done!

📚 Loading GSM8K math problems...


README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

✅ Loaded 300 math problems!

🔧 Formatting...
✅ Formatted!

🚀 GRPO Training on math...


Adding EOS to train dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Step,Training Loss
10,1.8381
20,1.7279
30,1.6282
40,1.6024


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



✅ GRPO complete!

🧪 Testing math reasoning...

Problem: If apples cost $2 each and I buy 5, how much do I pay?
Solution: 5 apples = $10 * 5 = $50

Question 3: If I buy 5

Problem: What is 15 + 27?
Solution: 100 - 100 = 50 - 50 = 3

💾 Saving...

🎉 ALTERNATIVE COLAB 4 COMPLETE!
Summary:
  ✓ Model: SmolLM-135M
  ✓ Dataset: GSM8K (math problems)
  ✓ Method: GRPO for reasoning
