In [None]:
import re, argparse
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
)
import torch
from tqdm import tqdm
from pathlib import Path
from accelerate.utils import fsdp_utils
from vllm import LLM, SamplingParams


INFO 07-24 04:35:22 [__init__.py:244] Automatically detected platform cuda.


In [None]:
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

In [None]:
def extract_inter_answer(text: str) -> int:
    numbers = re.findall(r'\d+', text)
    if numbers:
        return int(numbers[0])
    return 0

def extract_xml_answer(text: str) -> str:
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return extract_inter_answer(answer)

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip().replace(",", "").replace("$", "")

def extract_numerical_answer(answer_text):
    # GSM8K answers end with #### followed by the numerical answer
    match = re.search(r"#### ([-\d,]+)", answer_text)
    if match:
        # Remove commas and convert to int
        return int(match.group(1).replace(",", ""))
    return None

In [1]:
model_name = '/home/ubuntu/alex/verifiers/outputs/Qwen/Qwen2.5-7B-Instruct-gsm8k-discount0.99999-seed42capacityblock1/checkpoint-1870'

In [None]:
llm = LLM(
    model=model_name,
    tensor_parallel_size=1,
    dtype="auto",
    trust_remote_code=True,
    max_model_len=256,
    gpu_memory_utilization=0.95,
    enforce_eager=False,  # Use Flash Attention 2
)

INFO 07-24 04:35:28 [config.py:841] This model supports multiple tasks: {'generate', 'classify', 'embed', 'reward'}. Defaulting to 'generate'.
INFO 07-24 04:35:28 [config.py:1472] Using max model len 256
INFO 07-24 04:35:29 [config.py:2285] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 07-24 04:35:29 [core.py:526] Waiting for init message from front-end.
INFO 07-24 04:35:29 [core.py:69] Initializing a V1 LLM engine (v0.9.2) with config: model='/home/ubuntu/alex/verifiers/outputs/Qwen/Qwen2.5-7B-Instruct-gsm8k-discount0.99999-seed42capacityblock1/checkpoint-1870', speculative_config=None, tokenizer='/home/ubuntu/alex/verifiers/outputs/Qwen/Qwen2.5-7B-Instruct-gsm8k-discount0.99999-seed42capacityblock1/checkpoint-1870', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=256, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, p

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 07-24 04:35:37 [default_loader.py:272] Loading weights took 3.16 seconds
INFO 07-24 04:35:37 [gpu_model_runner.py:1801] Model loading took 14.2488 GiB and 3.362404 seconds
INFO 07-24 04:35:42 [backends.py:508] Using cache directory: /home/ubuntu/.cache/vllm/torch_compile_cache/8304f5f314/rank_0_0/backbone for vLLM's torch.compile
INFO 07-24 04:35:42 [backends.py:519] Dynamo bytecode transform time: 5.24 s
INFO 07-24 04:35:45 [backends.py:181] Cache the graph of shape None for later use
INFO 07-24 04:36:03 [backends.py:193] Compiling a graph for general shape takes 19.89 s
INFO 07-24 04:36:10 [monitor.py:34] torch.compile takes 25.14 s in total
INFO 07-24 04:36:11 [gpu_worker.py:232] Available KV cache memory: 112.75 GiB
INFO 07-24 04:36:11 [kv_cache_utils.py:716] GPU KV cache size: 2,111,200 tokens
INFO 07-24 04:36:11 [kv_cache_utils.py:720] Maximum concurrency for 256 tokens per request: 8246.88x


Capturing CUDA graph shapes: 100%|██████████| 67/67 [00:18<00:00,  3.62it/s]


INFO 07-24 04:36:30 [gpu_model_runner.py:2326] Graph capturing finished in 19 secs, took 0.65 GiB
INFO 07-24 04:36:30 [core.py:172] init engine (profile, create kv cache, warmup model) took 52.75 seconds


In [None]:
data = load_dataset("openai/gsm8k", "main")["test"]

In [None]:
eval_data = []
for i, item in enumerate(data):
    proccessed = {
        "question": item["question"],
        "prompt": SYSTEM_PROMPT + " " + item["question"],
        "answer": item["answer"],
        "numerical_answer": extract_numerical_answer(item["answer"]),
        "other_answer": extract_hash_answer(item["answer"]),
    }
    eval_data.append(proccessed)

NameError: name 'data' is not defined

: 

In [None]:
prompts = [item["prompt"] for item in eval_data]

In [None]:
sampling_params = SamplingParams(temperature=0.0, max_tokens=786, stop=None)
outputs = llm.generate(prompts, sampling_params)


Generating:   0%|          | 0/11 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating:   0%|          | 0/11 [00:06<?, ?it/s]


In [None]:
correct = 0
for i, output in enumerate(outputs):
    generated_text = output.outputs[0].text
    predicted_answer = extract_xml_answer(generated_text)
    ground_truth = eval_data[i]["numerical_answer"]
    #print(ground_truth, predicted_answer)
    if int(predicted_answer) == int(ground_truth):
        correct += 1
    print(correct / (i+1))

{'input_ids': tensor([[  198, 65354,   304,   279,  2701,  3561,   510,    27, 19895,   287,
           397,  9338,   522, 19895,   287,   397,    27,  9217,   397,  9338,
           522,  9217,   397, 17599,   323,   220,    18,   315,   806,  4780,
          1973,   220,    22, 87770,   369, 15786,    13,  8886, 22502,   374,
          3931,  1119,   220,    23, 34254,    13,  1416, 17599,   323,   806,
          4780,  1366,   311,  4332,   279, 87770, 18308,    11,  1246,  1657,
         34254,   646,  1817,   315,  1105,   614,    30]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
response_lengths = []
for output in outputs:
  # Get the first completion for the prompt
  first_completion = output.outputs[0]
  
  # Get the number of tokens in this completion
  num_tokens = len(first_completion.token_ids)
  
  # Add it to our list
  response_lengths.append(num_tokens)

In [None]:
sum(response_lengths) / len(response_lengths)