In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth

In [2]:
from unsloth import FastLanguageModel
import torch

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-1.7B-unsloth-bnb-4bit",
    max_seq_length = 2048,   # Context length - can be longer, but uses more memory
    load_in_4bit = True,     # 4bit uses much less memory
    load_in_8bit = False,    # A bit more accurate, uses 2x memory
    full_finetuning = False, # We have full finetuning now!
    # token = "hf_...",      # use one if using gated models
)

==((====))==  Unsloth 2025.8.4: Fast Qwen3 patching. Transformers: 4.55.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




model.safetensors:   0%|          | 0.00/1.41G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)

Unsloth 2025.8.4 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [18]:
import re
from datasets import load_dataset, Dataset


SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""


def extract_xml_answer(text: str) -> str:
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return answer.strip()


def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()


def get_gsm8k_questions(split="train") -> Dataset:
    data = load_dataset("openai/gsm8k", "main")[split]
    data = data.map(
        lambda x: {
            "prompt": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": x["question"]},
            ],
            "answer": extract_hash_answer(x["answer"]),
        }
    )
    return data


dataset = get_gsm8k_questions()

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

In [19]:
dataset[0]

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'answer': '72',
 'prompt': [{'content': '\nRespond in the following format:\n<reasoning>\n...\n</reasoning>\n<answer>\n...\n</answer>\n',
   'role': 'system'},
  {'content': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
   'role': 'user'}]}

In [27]:
def reward_function(prompts, completions, completion_ids, tokenizer=None, **kwargs):
    """
    Handles chat message style completions, extracting 'content' field.
    """
    rewards = []
    for prompt, comp in zip(prompts, completions):
        # If comp is a list with dict(s), extract the 'content' field
        if isinstance(comp, list) and len(comp) > 0 and isinstance(comp[0], dict) and "content" in comp[0]:
            text = comp[0]["content"]
        elif isinstance(comp, list) and all(isinstance(t, int) for t in comp):
            # Token ids, decode them
            text = tokenizer.decode(comp, skip_special_tokens=True)
        elif isinstance(comp, str):
            text = comp
        else:
            raise ValueError(f"Unexpected completion format: {type(comp)} — {comp}")

        score = 0.0
        # Example reward: presence of "correct"
        if "correct" in text.lower():
            score += 1.0
        length_penalty = max(0, (len(text.split()) - 50) * 0.01)
        score -= length_penalty
        rewards.append(score)
    return rewards


In [None]:
from trl import GRPOTrainer, GRPOConfig

trainer = GRPOTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    reward_funcs=[reward_function],  # You can pass multiple reward functions
    args=GRPOConfig(
        # RL-specific args
        per_device_train_batch_size=1,  # 1 prompt per device per step
        gradient_accumulation_steps=4,  # effective batch size = group_size * GA
        learning_rate=5e-6,             # Lower LR than SFT to stabilize RL
        max_steps=200,                  # Adjust for more training
        num_generations=4,              # Completions per prompt (group size)
        beta=0.05,                       # KL penalty strength to keep close to reference model
        remove_unused_columns=False,     # Keep raw dataset columns
        logging_steps=1,
        save_steps=50,
        save_total_limit=2,
        optim="adamw_8bit",              # Memory-efficient optimizer
        weight_decay=0.01,
        warmup_steps=5,
        lr_scheduler_type="cosine",
        seed=3407,
        report_to="none"
    ),
)

trainer.train()

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 4


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 1 | Total steps = 200
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 34,865,152 of 1,755,440,128 (1.99% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,kl,entropy,rewards / reward_function / mean,rewards / reward_function / std
1,0.0,-1.335625,0.039065,256.0,256.0,256.0,1.0,0.0,0.0,0.0,0.0,0,-1.335625,0.078143
2,-0.0,-0.835,0.196264,252.625,202.0,256.0,0.9375,202.0,202.0,202.0,0.0,No Log,-0.835,0.393345
3,-0.0,-0.759375,0.297882,217.5,113.0,256.0,0.5625,168.0,113.0,233.0,0.0,No Log,-0.759375,0.434488
4,0.0,-1.075,0.158125,256.0,256.0,256.0,1.0,0.0,0.0,0.0,0.0,No Log,-1.075,0.24988
5,-0.0,-1.291875,0.160141,256.0,256.0,256.0,1.0,0.0,0.0,0.0,0.0,No Log,-1.291875,0.293058
6,0.0,-1.198125,0.281949,256.0,256.0,256.0,1.0,0.0,0.0,0.0,0.0,No Log,-1.198125,0.448066
7,0.0,-1.2025,0.170135,256.0,256.0,256.0,1.0,0.0,0.0,0.0,0.0,No Log,-1.2025,0.278364
