In [1]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-24 16:17:53 [__init__.py:239] Automatically detected platform cuda.


In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 1024 # Can increase for longer reasoning traces
lora_rank = 32 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-3B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = False, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.6, # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-24 16:28:40 [__init__.py:239] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.51.3. vLLM: 0.8.4.
   \\   /|    NVIDIA GeForce RTX 4070 Laptop GPU. Num GPUs = 1. Max memory: 7.747 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.3.19 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [5]:
prompt = """Based on given instruction and context, generate an appropriate response

### Instruction:
{}

### Context:
{}

### Response:
{}
"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    contexts = examples["context"]
    responses = examples["response"]
    texts = []

    for i,j,k  in zip(instructions, contexts,responses):
        text = prompt.format(i,j,k) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }

In [6]:
inputs = tokenizer(
    [
        prompt.format(
            "What is the GSM8k benchmark dataset?",  # instruction
            " ",  # context
            " ",  # response
        )
    ] * 1,
    return_tensors="pt",
).to("cuda")

# Generate response
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
output = model.generate(**inputs, streamer=text_streamer, max_new_tokens=1028)


Based on given instruction and context, generate an appropriate response

### Instruction:
What is the GSM8k benchmark dataset?

### Context:
 

### Response:
 
The GSM8K benchmark dataset is a collection of 8,059 multiple-choice questions that are designed to test a reader's understanding of English language passages. Each question is associated with a short text passage, and the goal is to select the most appropriate answer from a set of four choices. This dataset is used to evaluate natural language understanding (NLU) models and assess their ability to comprehend and extract information from textual content. The questions cover a wide range of topics, including science, history, literature, and general knowledge, making it a versatile tool for testing various aspects of reading comprehension.

### Revised Response:
The GSM8K benchmark dataset consists of 8,059 multiple-choice questions paired with short text passages. Its primary purpose is to evaluate the performance of natural la

In [None]:
import re
import pandas as pd
from datasets import load_dataset, Dataset

# Load and prep dataset
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

def extract_xml_answer(text: str) -> str:
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return answer.strip()

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

def extract_answer_from_output(completion):
    ANS_RE = re.compile(r"(\-?[0-9\.\,]+)")
    match = ANS_RE.search(completion)
    if match:
        match_str = match.group(1).strip()
        match_str = match_str.replace(",", "")
        return match_str
    else:
        return None

def check_answer(output_text, answer):
    # Extract the answer from the output text
    extracted_answer = extract_answer_from_output(extract_xml_answer(output_text))
    print(f"Extracted answer: {extracted_answer}")
    # Compare with the expected answer
    return extracted_answer == answer


# uncomment middle messages for 1-shot prompting
def get_gsm8k_french_questions():
    data = pd.read_csv("gsm8k-french2-translated.csv")
    # print(data['french_question'])
    data = data.apply(lambda x: { # type: ignore
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x['french_question']}
        ],
        'answer': extract_hash_answer(x['answer'])
    }, axis=1) # type: ignore
    return data # type: ignore

def get_gsm8k_questions():
    data = pd.read_csv("gsm8k-french.csv")
    # print(data['french_question'])
    data = data.apply(lambda x: { # type: ignore
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x['question']}
        ],
        'answer': extract_hash_answer(x['answer'])
    }, axis=1) # type: ignore
    return data # type: ignore

dataset = get_gsm8k_questions()

# Reward functions
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    q = prompts[0][-1]['content']
    extracted_responses = [extract_answer_from_output(r) for r in responses]
    print('-'*20, f"Question:\n{q}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
    return [2.0 if r == a else 0.0 for r, a in zip(extracted_responses, answer)]

def int_reward_func(completions, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    extracted_responses = [extract_answer_from_output(r) for r in responses]
    return [0.5 if r.isdigit() else 0.0 for r in extracted_responses]

def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward function that checks if the completion has a specific format."""
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward function that checks if the completion has a specific format."""
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def count_xml(text) -> float:
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count += 0.125
    if text.count("\n</reasoning>\n") == 1:
        count += 0.125
    if text.count("\n<answer>\n") == 1:
        count += 0.125
        count -= len(text.split("\n</answer>\n")[-1])*0.001
    if text.count("\n</answer>") == 1:
        count += 0.125
        count -= (len(text.split("\n</answer>")[-1]) - 1)*0.001
    return count

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    contents = [completion[0]["content"] for completion in completions]
    return [count_xml(c) for c in contents]

In [15]:
from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    use_vllm = True, # use vLLM for fast inference!
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_8bit",
    logging_steps = 1,
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training
    num_generations = 8, # Decrease if out of memory
    max_prompt_length = 256,
    max_completion_length = 200,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 250,
    save_steps = 250,
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    output_dir = "outputs",
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 8


In [16]:
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func,
    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train()

AttributeError: 'Qwen2ForCausalLM' object has no attribute 'vllm_engine'

In [12]:
model.save_pretrained_merged("merged_model", tokenizer, save_method="merged_16bit")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 0.66 out of 14.84 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 61%|██████    | 22/36 [00:00<00:00, 45.61it/s]
We will save to Disk and not RAM now.
100%|██████████| 36/36 [00:03<00:00,  9.50it/s]


Unsloth: Saving tokenizer... Done.
Done.


In [11]:
model.save_pretrained_merged("lora_model", tokenizer, save_method = "lora",)

Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... Done.


In [33]:
from vllm import LLM, SamplingParams


# text = tokenizer.apply_chat_template([
#     {"role" : "system", "content" : SYSTEM_PROMPT},
#     {"role" : "user", "content" : "Calculate pi."},
# ], tokenize = False, add_generation_prompt = True)

prompt = tokenizer.apply_chat_template(
    dataset[0]['prompt'],
    tokenize = False,
    add_generation_prompt = True,
)
# print(prompt)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(
    **inputs,
    do_sample=True,
    temperature=0.8,
    top_p=0.95,
    max_new_tokens=1024,
)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [34]:
print(output_text)

system

Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>

user
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
assistant
<reasoning>
To find out how many clips Natalia sold altogether in April and May, we first need to determine how many clips she sold in May. Since she sold half as many clips in May as she did in April, we can calculate the number of clips sold in May by dividing the number of clips sold in April (48) by 2. After finding the number of clips sold in May, we add it to the number of clips sold in April to get the total.

1. Clips sold in April = 48.
2. Clips sold in May = 48 / 2 = 24.
3. Total clips sold in April and May = Clips in April + Clips in May = 48 + 24 = 72.

So, Natalia sold a total of 72 clips in April and May.
</reasoning>
<answer>
Natalia sold a total of 72 clips in April and May.
</answer>


In [None]:
# # check if output is correct

# # Example usage 
correct = check_answer(output_text, dataset[0]['answer'])

Extracted answer: 72
