In [1]:
import re, argparse
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
)
import torch
from tqdm import tqdm
from pathlib import Path
from accelerate.utils import fsdp_utils
from vllm import LLM, SamplingParams


INFO 07-24 04:30:48 [__init__.py:244] Automatically detected platform cuda.


In [2]:
# Instruction text that is prepended to every question when the evaluation
# prompts are built. It asks the model to put its working inside
# <reasoning>...</reasoning> and its final result inside <answer>...</answer>.
# Note that the literal starts and ends with a newline.
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

In [None]:
def extract_inter_answer(text: str) -> int:
    """Return the first integer that appears in ``text``.

    Handles the number formats a model typically emits for a final answer:

    * plain digits (``"18"``),
    * well-formed thousands separators (``"1,000"`` -> ``1000``),
    * a leading minus sign (``"-5"`` -> ``-5``).

    A ``-`` is only treated as a sign when it is not glued to a preceding
    word character or dot, so ``"10-5"`` still yields ``10`` and ``"x-3"``
    yields ``3``.

    Args:
        text: Free-form text, usually the contents of an ``<answer>`` block.

    Returns:
        The parsed integer, or ``0`` when ``text`` contains no digits.
        NOTE: ``0`` is kept as the "nothing found" value for backward
        compatibility; callers cannot distinguish it from a literal ``0``.
    """
    match = re.search(
        r"(?:(?<![\w.])-)?"                # optional sign, not part of a word/range
        r"(?:\d{1,3}(?:,\d{3})+(?!\d)"     # 1,234,567 style grouping
        r"|\d+)",                          # or a plain run of digits
        text,
    )
    if match is None:
        return 0
    return int(match.group(0).replace(",", ""))

def extract_xml_answer(text: str) -> int:
    """Parse the integer found inside the ``<answer>`` block of a completion.

    The text after the *last* ``<answer>`` tag and before the first following
    ``</answer>`` tag is handed to ``extract_inter_answer``.

    If ``<answer>`` is absent, ``str.split`` returns the whole string, so the
    entire completion is scanned for a number instead. Likewise a missing
    closing tag means everything after ``<answer>`` is scanned.

    Args:
        text: Raw model completion.

    Returns:
        Whatever ``extract_inter_answer`` returns for the selected span
        (an ``int``; the previous ``-> str`` annotation was incorrect).
    """
    after_open_tag = text.split("<answer>")[-1]
    inside_tags = after_open_tag.split("</answer>")[0]
    return extract_inter_answer(inside_tags)

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip().replace(",", "").replace("$", "")

def extract_numerical_answer(answer_text):
    # GSM8K answers end with #### followed by the numerical answer
    match = re.search(r"#### ([-\d,]+)", answer_text)
    if match:
        # Remove commas and convert to int
        return int(match.group(1).replace(",", ""))
    return None

In [None]:
# Checkpoint directory handed to both the vLLM engine and the tokenizer below.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere unless it is edited or made configurable.
model_name = '/home/ubuntu/alex/verifiers/outputs/Qwen/Qwen2.5-7B-Instruct-gsm8k-discount0.99999-seed42capacityblock1/checkpoint-1870'

In [None]:
# Build the vLLM engine used for batched generation.
llm = LLM(
    model=model_name,
    tensor_parallel_size=1,
    dtype="auto",
    trust_remote_code=True,
    # max_model_len bounds prompt tokens + generated tokens together. It has
    # to be larger than the 786-token generation budget requested through
    # SamplingParams plus the prompt; the previous value of 256 cut
    # completions off before the <answer> block could be produced.
    max_model_len=2048,
    gpu_memory_utilization=0.95,
    # False leaves CUDA-graph capture enabled (the faster default). This flag
    # does not choose the attention backend.
    enforce_eager=False,
)

In [48]:
# Load the tokenizer that ships with the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Decoder-only models are padded on the left.
tokenizer.padding_side = "left"

# Reuse the end-of-sequence token for padding when the checkpoint defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [49]:
# GSM8K "main" configuration, test split only.
data = load_dataset("openai/gsm8k", "main", split="test")

In [50]:
# One record per test example: the raw question, the full prompt sent to the
# model, the reference answer text, and two parsed forms of the final answer.
eval_data = [
    {
        "question": example["question"],
        "prompt": SYSTEM_PROMPT + " " + example["question"],
        "answer": example["answer"],
        "numerical_answer": extract_numerical_answer(example["answer"]),
        "other_answer": extract_hash_answer(example["answer"]),
    }
    for example in data
]

In [54]:
# Prompt strings in the same order as eval_data, ready for batched generation.
prompts = [
    record["prompt"]
    for record in eval_data
]

In [None]:
# temperature=0.0 selects greedy decoding; no custom stop strings are set.
sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=786,
    stop=None,
)
outputs = llm.generate(prompts, sampling_params)


Generating:   0%|          | 0/11 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating:   0%|          | 0/11 [00:06<?, ?it/s]


In [None]:
# Score every completion against its reference answer and report accuracy once
# at the end, instead of printing a running value on every iteration.
correct = 0
evaluated = 0
for output, example in zip(outputs, eval_data):
    generated_text = output.outputs[0].text
    predicted_answer = extract_xml_answer(generated_text)
    ground_truth = example["numerical_answer"]
    evaluated += 1
    # ground_truth is None when the reference answer could not be parsed;
    # count that example as a miss rather than crashing in int(None).
    if ground_truth is not None and int(predicted_answer) == int(ground_truth):
        correct += 1

# Guard against an empty run so the cell does not raise ZeroDivisionError.
accuracy = correct / evaluated if evaluated else 0.0
print(accuracy)

{'input_ids': tensor([[  198, 65354,   304,   279,  2701,  3561,   510,    27, 19895,   287,
           397,  9338,   522, 19895,   287,   397,    27,  9217,   397,  9338,
           522,  9217,   397, 17599,   323,   220,    18,   315,   806,  4780,
          1973,   220,    22, 87770,   369, 15786,    13,  8886, 22502,   374,
          3931,  1119,   220,    23, 34254,    13,  1416, 17599,   323,   806,
          4780,  1366,   311,  4332,   279, 87770, 18308,    11,  1246,  1657,
         34254,   646,  1817,   315,  1105,   614,    30]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
# Number of generated tokens in the first completion of each prompt.
response_lengths = [
    len(result.outputs[0].token_ids)
    for result in outputs
]

In [None]:
# Mean number of generated tokens per completion.
# Raises ZeroDivisionError if response_lengths is empty.
sum(response_lengths) / len(response_lengths)