### Libraries, random seed, wandb installation

In [1]:
%%capture
# Environment setup cell. `%%capture` (which has to stay on the first line) hides this cell's output.
!pip install uv
# unsloth and the pinned vLLM build are installed without their dependencies (--no-deps).
!uv pip install --no-deps unsloth vllm==0.8.5.post1
import sys, re, requests; modules = list(sys.modules.keys())
# Drop every already-imported module whose name contains "PIL" or "google" from sys.modules
# (presumably so the packages installed below are imported fresh -- TODO confirm).
for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None
!uv pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!uv pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" huggingface_hub hf_transfer

# Download vLLM's common requirements list, strip the transformers/numpy/xformers entries,
# and install what is left. The pattern is not anchored to the start of a line: it removes
# everything from the first occurrence of one of those names to the end of that line.
# NOTE(review): the list is fetched from vLLM's `main` branch while vLLM itself is pinned
# above -- confirm the two stay compatible. The HTTP status of the download is not checked.
f = requests.get("https://raw.githubusercontent.com/vllm-project/vllm/refs/heads/main/requirements/common.txt").content
with open("vllm_requirements.txt", "wb") as file:
    file.write(re.sub(rb"(transformers|numpy|xformers)[^\n]{1,}\n", b"", f))
!uv pip install -r vllm_requirements.txt
!uv pip install pympler

In [2]:
import json
import os
import random
import wandb
from collections import defaultdict
import re
from scipy.stats import pearsonr
from pympler import asizeof
import matplotlib.pyplot as plt
import time
import numpy as np
import torch

In [3]:
# Experiment identifier; later cells build the LoRA directory name and the results file name from it.
run_name = "Group_method_GRPO"

In [4]:
# Passed as `random_state` when the LoRA adapters are attached to the model.
seed = 42

### Unsloth

Load `Qwen2.5-0.5B-Instruct` with Unsloth and set the sequence-length and LoRA parameters.

In [5]:
# Load the base model through Unsloth and attach LoRA adapters to it.
# `seed` comes from the configuration cell above.
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
max_seq_length = 1024 # Can increase for longer reasoning traces
lora_rank = 64 # Larger rank = smarter, but slower

# Base model: 4-bit weights, vLLM-backed generation enabled.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen2.5-0.5B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.8, # Reduce if out of memory
)

# Wrap the model with LoRA adapters on the attention and MLP projections;
# lora_alpha is set equal to the rank.
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = seed,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-22 12:15:39 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-22 12:15:39 [__init__.py:239] Automatically detected platform cuda.
Unsloth: Patching vLLM v1 graph capture
Unsloth: Patching vLLM v0 graph capture
==((====))==  Unsloth 2025.7.7: Fast Qwen2 patching. Transformers: 4.53.2. vLLM: 0.8.5.post1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-0.5b-instruct-unsloth-bnb-4bit with actual GPU utilization = 79.24%
Unsloth: Your GPU has CUDA compute 

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

INFO 07-22 12:16:18 [cuda.py:240] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 07-22 12:16:18 [cuda.py:289] Using XFormers backend.
INFO 07-22 12:16:19 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 07-22 12:16:19 [model_runner.py:1108] Starting to load model unsloth/qwen2.5-0.5b-instruct-unsloth-bnb-4bit...
INFO 07-22 12:16:19 [loader.py:1187] Loading weights with BitsAndBytes quantization. May take a while ...
INFO 07-22 12:16:19 [weight_utils.py:265] Using model weights format ['*.safetensors']


model.safetensors:   0%|          | 0.00/538M [00:00<?, ?B/s]

INFO 07-22 12:16:28 [weight_utils.py:281] Time spent downloading weights for unsloth/qwen2.5-0.5b-instruct-unsloth-bnb-4bit: 8.239109 seconds
INFO 07-22 12:16:28 [weight_utils.py:315] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 07-22 12:16:29 [punica_selector.py:18] Using PunicaWrapperGPU.
INFO 07-22 12:16:29 [model_runner.py:1140] Model loading took 0.5747 GiB and 9.950732 seconds
INFO 07-22 12:16:37 [worker.py:287] Memory profiling takes 7.13 seconds
INFO 07-22 12:16:37 [worker.py:287] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.79) = 11.68GiB
INFO 07-22 12:16:37 [worker.py:287] model weights take 0.57GiB; non_torch_memory takes 0.03GiB; PyTorch activation peak memory takes 1.23GiB; the rest of the memory reserved for KV Cache is 9.85GiB.
INFO 07-22 12:16:37 [executor_base.py:112] # cuda blocks: 53816, # CPU blocks: 0
INFO 07-22 12:16:37 [executor_base.py:117] Maximum concurrency for 1024 tokens per request: 840.88x
INFO 07-22 12:16:37 [vllm_utils.py:669] Unsloth: Running patched vLLM v0 `capture_model`.
INFO 07-22 12:16:37 [model_runner.py:1450] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run t

Capturing CUDA graph shapes:   0%|          | 0/31 [00:00<?, ?it/s]

INFO 07-22 12:16:59 [model_runner.py:1592] Graph capturing finished in 21 secs, took 0.43 GiB
INFO 07-22 12:16:59 [vllm_utils.py:676] Unsloth: Patched vLLM v0 graph capture finished in 21 secs.
INFO 07-22 12:17:00 [llm_engine.py:437] init engine (profile, create kv cache, warmup model) took 30.53 seconds
Unsloth: Just some info: will skip parsing ['k_norm', 'q_norm', 'post_feedforward_layernorm', 'pre_feedforward_layernorm']
Unsloth: Just some info: will skip parsing ['k_norm', 'q_norm', 'post_feedforward_layernorm', 'pre_feedforward_layernorm']


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Unsloth 2025.7.7 patched 24 layers with 24 QKV layers, 24 O layers and 24 MLP layers.


### Get eval dataset

In [6]:
from datasets import load_dataset

In [7]:
def extract_xml_answer(text: str) -> str:
    """Return the stripped text of the last ``<answer>...</answer>`` section.

    Args:
        text: Raw model response.

    Returns:
        The text after the last ``<answer>`` tag up to the next ``</answer>``
        tag, with surrounding whitespace removed. A missing tag is not an
        error: without ``<answer>`` the search starts at the beginning of
        ``text``, and without ``</answer>`` it runs to the end, so a response
        with no tags at all is returned whole (stripped).
    """
    # rpartition/partition never raise for a str, which is why the former
    # `except IndexError` fallback could not trigger and has been removed.
    after_open_tag = text.rpartition("<answer>")[2]
    return after_open_tag.partition("</answer>")[0].strip()

def extract_xml_reasoning(text: str) -> str:
    """Return the stripped text of the last ``<reasoning>...</reasoning>`` section.

    Args:
        text: Raw model response.

    Returns:
        The text after the last ``<reasoning>`` tag up to the next
        ``</reasoning>`` tag, with surrounding whitespace removed. A missing
        tag is not an error: without the opening tag the search starts at the
        beginning of ``text``, and without the closing tag it runs to the end.
    """
    # rpartition/partition never raise for a str, which is why the former
    # `except IndexError` fallback could not trigger and has been removed.
    after_open_tag = text.rpartition("<reasoning>")[2]
    return after_open_tag.partition("</reasoning>")[0].strip()

In [8]:
# Load only the test split of the grade-school-math JSONL data straight from GitHub.
# NOTE(review): the URL points at the `master` branch, so the file is not pinned to a revision.
dataset = load_dataset('json', data_files={
    # 'train': 'https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/train.jsonl',
    'test': 'https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl'
})['test']

Downloading data:   0%|          | 0.00/750k [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [9]:
# System prompt describing the <reasoning>/<answer> response layout the model should follow.
R1_STYLE_SYSTEM_PROMPT = (
    "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. "
    "The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. "
    "The reasoning process and answer are enclosed within <reasoning> </reasoning> and "
    "<answer> </answer> tags, respectively, i.e., <reasoning> reasoning process here </reasoning>\n"
    "<answer> answer here </answer>."
)

# Appended to the system prompt on its own line when prompts are built.
TASK_SPECIFIC_INSTRUCTIONS = "The answer must be a single integer."

# Assistant reply used as the one-shot example for the question "What is 2+2?".
CONTENT_EXAMPLE = (
    "<reasoning>To calculate 2+2, we simply add the numbers together: 2 + 2 = 4.</reasoning>\n"
    "<answer>4</answer>"
)

def preprocess_dataset(dataset, chunk_size=1000):
    """Add a one-shot chat prompt and the parsed final answer to every sample.

    Args:
        dataset: Object supporting ``len()`` and a batched ``map`` whose
            batches expose ``'question'`` and ``'answer'`` lists.
        chunk_size: Batch size forwarded to ``dataset.map``.

    Returns:
        The result of ``dataset.map`` with the columns ``'prompt'`` (list of
        chat messages), ``'answer'`` (unchanged) and ``'num_answer'`` (text
        following the ``####`` marker, or ``None`` when the marker is absent).
    """
    total_samples = len(dataset)
    print(f"Loaded {total_samples} samples")

    def _final_answer(solution: str) -> str | None:
        # The final answer is whatever follows the first "####" marker.
        pieces = solution.split("####")
        if len(pieces) < 2:
            return None
        return pieces[1].strip()

    def _conversation(question: str) -> list:
        # System instructions, one worked example, then the real question.
        return [
            {'role': 'system', 'content': R1_STYLE_SYSTEM_PROMPT + "\n" + TASK_SPECIFIC_INSTRUCTIONS},
            {'role': 'user', 'content': "What is 2+2?"},
            {'role': 'assistant', 'content': CONTENT_EXAMPLE},
            {'role': 'user', 'content': question.strip()},
        ]

    def _add_columns(batch):
        solutions = batch['answer']
        return {
            'prompt': list(map(_conversation, batch['question'])),
            'answer': solutions,
            'num_answer': list(map(_final_answer, solutions)),
        }

    return dataset.map(_add_columns, batched=True, batch_size=chunk_size)

# Rebinds `dataset` to the mapped version (prompt and num_answer columns added), using 500-row batches.
dataset = preprocess_dataset(dataset, chunk_size=500)

Loaded 1319 samples


Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [10]:
# Pretty-print the first processed sample to check the added columns.
print(json.dumps(dataset[0], indent=4))

{
    "question": "Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?",
    "answer": "Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer\u2019s market.\n#### 18",
    "prompt": [
        {
            "content": "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <reasoning> </reasoning> and <answer> </answer> tags, respectively, i.e., <reasoning> reasoning process here </reasoning>\n<answer> answer here </answer>.\nThe answer must be a single integer.",
            "role": "system"
   

### Eval after training

In [11]:
from vllm import LLM, SamplingParams
from tqdm.notebook import tqdm
from datetime import datetime
from pathlib import Path
import gdown

In [12]:
batch_size=16  # number of prompts passed to the generator per call
lora_path = run_name + "_grpo_saved_lora"  # adapter directory; also the base name of the downloaded zip

In [13]:
# Maps each download URL to the local file name it is saved under.
URLS = {
    "https://drive.google.com/uc?id=1Bl-qc_WdWmE_qtVH2llZuH_Xwqah_Vzd": f"{lora_path}.zip"
}

In [14]:
# Display the adapter directory name derived from run_name.
print(lora_path)

Group_method_GRPO_grpo_saved_lora


In [15]:
# Download every archive listed in URLS to its local file name.
for url, path in URLS.items():
    gdown.download(url, path)

Downloading...
From (original): https://drive.google.com/uc?id=1Bl-qc_WdWmE_qtVH2llZuH_Xwqah_Vzd
From (redirected): https://drive.google.com/uc?id=1Bl-qc_WdWmE_qtVH2llZuH_Xwqah_Vzd&confirm=t&uuid=03e2128f-bd1b-4463-b7ed-5e12dc886b5b
To: /content/Group_method_GRPO_grpo_saved_lora.zip
100%|██████████| 64.1M/64.1M [00:00<00:00, 86.9MB/s]


In [16]:
!unzip "Group_method_GRPO_grpo_saved_lora.zip" -d .

Archive:  Group_method_GRPO_grpo_saved_lora.zip
   creating: ./Group_method_GRPO_grpo_saved_lora/
  inflating: ./Group_method_GRPO_grpo_saved_lora/README.md  
  inflating: ./Group_method_GRPO_grpo_saved_lora/adapter_config.json  
  inflating: ./Group_method_GRPO_grpo_saved_lora/adapter_model.safetensors  


In [17]:
# Generation settings: temperature 0.0, stop on the tokenizer's EOS token.
# NOTE(review): max_tokens (4096) is larger than the max_seq_length (1024) the model
# was loaded with -- presumably the engine's context length is the effective cap; confirm.
sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=4096,
    stop_token_ids=[tokenizer.eos_token_id],
)

# Running totals updated by the evaluation loop.
total_samples = len(dataset)
results = []        # one record per evaluated example
correct = 0         # exact string matches between generated and reference answer
total = 0           # examples evaluated so far
len_sum = 0         # sum of response lengths in tokens
len_min = 4096      # starts at the max_tokens value so any shorter response lowers it
len_max = 0
correct_format = 0  # responses accepted by the format check

def format_reward_func(response: str) -> int:
    """Return 1 if ``response`` has the expected tag layout, otherwise 0.

    The response must start with ``<reasoning>...</reasoning>``, optionally
    followed by whitespace, then ``<answer>`` wrapping a single signed integer
    or decimal number, and nothing after ``</answer>`` (a single trailing
    newline is tolerated by ``$``).

    Note: the pattern is applied without ``re.DOTALL``, so ``.`` does not match
    newlines and a reasoning section spanning several lines is scored 0.

    Args:
        response: Raw model response text.

    Returns:
        ``1`` when the layout matches, ``0`` otherwise. The value is an ``int``
        so it can be summed into a count (the previous ``-> float`` annotation
        did not match what was returned).
    """
    pattern = r"^<reasoning>.*?</reasoning>\s*<answer>\s*[+-]?\d*\.?\d+\s*</answer>$"
    return int(re.match(pattern, response) is not None)

# Progress bar counted in examples; it is advanced once per batch below.
progress_bar = tqdm(
    total=total_samples,
    desc="Processing samples",
    unit="examples",
    dynamic_ncols=True,
)

progress_bar.set_postfix({
    'acc': '0.00%',
    'correct': '0',
})

# Load the saved LoRA adapter once. The request does not depend on the batch,
# so it is created here instead of being reloaded on every loop iteration.
lora_request = model.load_lora(lora_path)

for i in range(0, total_samples, batch_size):
    # Slicing the dataset yields a dict of column lists for this batch.
    batch_data = dataset[i:i + batch_size]
    current_batch_size = len(batch_data['prompt'])

    # Render each chat conversation to a prompt string ending with the generation prompt.
    formatted_prompts = [
        tokenizer.apply_chat_template(
            p,
            tokenize=False,
            add_generation_prompt=True
        )
        for p in batch_data['prompt']
    ]

    outputs = model.fast_generate(
        formatted_prompts,
        sampling_params = sampling_params,
        lora_request = lora_request,
    )

    # Assumes outputs come back in the same order as the prompts -- the index j
    # is used to look up the matching question and reference answer.
    for j, output in enumerate(outputs):
        response = output.outputs[0].text
        # Length is measured by re-encoding the generated text with the tokenizer.
        response_len = len(tokenizer.encode(response))

        generated_answer = extract_xml_answer(response)
        true_answer = batch_data['num_answer'][j]
        # Exact string comparison: "18" matches "18", but "18.0" or "$18" do not.
        is_correct = generated_answer == true_answer

        result = {
            'question': batch_data['question'][j],
            'true_answer': true_answer,
            'generated_answer': generated_answer,
            'full_response': response,
            'correct': is_correct,
            "response_len": response_len
        }
        results.append(result)

        if is_correct:
            correct += 1
        total += 1
        len_sum += response_len
        len_min = min(len_min, response_len)
        len_max = max(len_max, response_len)
        correct_format += format_reward_func(response)

    progress_bar.update(current_batch_size)
    progress_bar.set_postfix({
        'acc': f'{(correct/total)*100:.2f}%',
        'correct': f'{correct}/{total}',
    })

progress_bar.close()

Processing samples:   0%|          | 0/1319 [00:00<?, ?examples/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/7 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

In [18]:
# Summarise the evaluation counters, write them with the per-example results to a
# JSON file, and print the headline numbers.

# Every ratio is guarded the same way so an empty evaluation (total == 0) reports 0
# instead of raising ZeroDivisionError; previously only `accuracy` was protected.
has_samples = total > 0
# One clock reading is shared by the stored timestamp and the file name.
finished_at = datetime.now()

accuracy = correct / total if has_samples else 0
metrics = {
    'accuracy': accuracy,
    'correct': correct,
    'format_accuracy': correct_format / total if has_samples else 0,
    'correct_format': correct_format,
    'total': total,
    'avg_len': len_sum / total if has_samples else 0,
    'len_min': len_min,
    'len_max': len_max,
    'timestamp': finished_at.isoformat()
}

save_path = f"{run_name}_eval_results_{finished_at.strftime('day_%d_time_%H_%M')}.json"
with open(save_path, 'w') as f:
    json.dump({
        'metrics': metrics,
        'results': results
    }, f, indent=4)
print(f"\nResults saved to {save_path}")

print("\nFinal Evaluation Results:")
print(f"Accuracy: {metrics['accuracy']:.2%}")
print(f"Correct: {metrics['correct']}/{metrics['total']}")

print(f"Format accuracy: {metrics['format_accuracy']:.2%}")
print(f"Correct format: {metrics['correct_format']}/{metrics['total']}")


Results saved to Group_method_GRPO_eval_results_day_22_time_12_26.json

Final Evaluation Results:
Accuracy: 34.80%
Correct: 459/1319
Format accuracy: 99.32%
Correct format: 1310/1319
