In [None]:
from datasets import load_dataset

# Load the 'test' split of HuggingFaceH4/MATH-500.
# Later cells read its 'problem', 'solution' and 'answer' columns.
ds = load_dataset(
    "HuggingFaceH4/MATH-500",
    split='test',
)

In [None]:
# Bare expression as the cell's last line: the notebook renders the dataset's repr.
ds

In [None]:
from batch_utils import create_batch # a little library I implemented, basically just interacts with OpenAI API.

In [None]:
# Run every MATH-500 problem statement through the Qwen2.5 14B Instruct model.
# A later cell reads the replies back from 'responses/qwen.jsonl'.
create_batch(
    model_name='unsloth/Qwen2.5-14B-Instruct',  # unsloth's version since it isn't gated
    prompts=ds['problem'],
    output_file_name='responses/qwen.jsonl',
    api_key='API-KEY',  # placeholder value; supply a real key without committing it
)

In [None]:
# Run the same MATH-500 problem statements through the QwQ 14B Math model.
# A later cell reads the replies back from 'responses/qwq.jsonl'.
create_batch(
    model_name='qingy2024/QwQ-14B-Math-v0.2',
    prompts=ds['problem'],
    output_file_name='responses/qwq.jsonl',
    api_key='API-KEY',  # placeholder value; supply a real key without committing it
)

In [None]:
from iv2_utils import iv2
# Load the raw batch records written for each model.
qwq_outputs = iv2.jsonl_read('responses/qwq.jsonl')
qwen25_outputs = iv2.jsonl_read('responses/qwen.jsonl')

# Each record nests the reply text under
# response -> body -> choices[0] -> message -> content.
qwq_responses = [
    record['response']['body']['choices'][0]['message']['content']
    for record in qwq_outputs
]
qwen25_responses = [
    record['response']['body']['choices'][0]['message']['content']
    for record in qwen25_outputs
]

In [None]:
from iv2_utils import iv2

def verify_prompt(problem,qwq,answer):
    """Build the answer-extraction prompt for one problem/solution pair.

    Args:
        problem: The problem statement.
        qwq: The model's full response text (used for either model, despite
            the parameter name); only its last 500 characters are embedded.
        answer: Unused; kept so existing call sites keep working.

    Returns:
        The prompt string asking for just the final answer in LaTeX form.
    """
    return f"""Here's a problem:\n{problem}\n\nSolution:\n{qwq[-500:]}\n\nExtract just the final answer in LaTeX form (without wrappers like \\boxed), and return it and say nothing else."""


# For each model, ask the 32B model to pull the final answer out of that
# model's own responses and store the result under answers/<model>.jsonl.
for model in ['qwen', 'qwq']:
    outputs = iv2.jsonl_read(f'responses/{model}.jsonl')
    responses = [x['response']['body']['choices'][0]['message']['content'] for x in outputs]

    create_batch(
        api_key='API-KEY',
        # Pair each problem with the responses read for *this* model. Using a
        # fixed list here would make both output files describe the same model.
        # NOTE(review): zip() assumes the records are in dataset order and
        # silently truncates on a length mismatch — confirm create_batch
        # preserves prompt order.
        prompts=[verify_prompt(x,y,z) for x,y,z in zip(ds['problem'],responses,ds['solution'])],
        model_name='unsloth/Qwen2.5-32B-Instruct',
        output_file_name=f'answers/{model}.jsonl'
    )

### Checking whether the extracted answer matches the reference answer

In [None]:
from iv2_utils import iv2 # Useful utility library you can get on PyPI: pip install iv2_utils
from batch_utils import create_batch

# NOTE(review): these are .jsonl files but are read with json_read, unlike the
# jsonl_read calls used elsewhere in this notebook — confirm this is intended.
qwq_answers = iv2.json_read('answers/qwq.jsonl')
qwen_answers = iv2.json_read('answers/qwen.jsonl')

# Peek at the first entry for each model as a sanity check.
for _model_answers in (qwq_answers, qwen_answers):
    print("Example answer:", _model_answers[0])

In [None]:
def create_answer_check_prompt(problem, attempt, answer):
    """Build the yes/no prompt that compares a candidate answer to the reference.

    Args:
        problem: Unused; accepted so callers can pass (problem, attempt, answer).
        attempt: The answer being judged; inserted after "First answer:".
        answer: The reference answer; inserted after "Correct answer:".

    Returns:
        The formatted prompt string.
    """
    template = """Do these two answers represent the same thing?

First answer: {answer1}
Correct answer: {answer2}

Disregarding any units if there are any, like percentages or ^\\circ, degrees, etc., are these two answers equivalent? Simply reply 'Yes.' or 'No.'
"""
    # Only the template is parsed for placeholders, so braces inside the
    # inserted answers (common in LaTeX) are passed through untouched.
    return template.format(answer1=attempt, answer2=answer)

# Ask the 32B model whether each QwQ answer matches the dataset's reference
# answer. A later cell reads the verdicts from 'qwq-final-math500.jsonl'.
create_batch(
    output_file_name='qwq-final-math500.jsonl',
    model_name='unsloth/Qwen2.5-32B-Instruct',
    prompts=[
        create_answer_check_prompt(problem, attempt, reference)
        for problem, attempt, reference in zip(ds['problem'], qwq_answers, ds['answer'])
    ],
    api_key='API-KEY',  # placeholder value; supply a real key without committing it
)

In [None]:
# Ask the 32B model whether each Qwen2.5 answer matches the dataset's reference
# answer. A later cell reads the verdicts from 'qwen2.5-final-math500.jsonl'.
create_batch(
    output_file_name='qwen2.5-final-math500.jsonl',
    model_name='unsloth/Qwen2.5-32B-Instruct',
    prompts=[
        create_answer_check_prompt(problem, attempt, reference)
        for problem, attempt, reference in zip(ds['problem'], qwen_answers, ds['answer'])
    ],
    api_key='API-KEY',  # placeholder value; supply a real key without committing it
)

In [None]:
# Replace each raw batch record in the verdict files with just the reply text.
# The files are rewritten in place, so after the first run they already hold
# plain strings; those are passed through unchanged so that re-running this
# cell does not fail when indexing into a str.
for verdict_path in ('qwq-final-math500.jsonl', 'qwen2.5-final-math500.jsonl'):
    data = iv2.jsonl_read(verdict_path)
    data = [
        x if isinstance(x, str) else x['response']['body']['choices'][0]['message']['content']
        for x in data
    ]
    iv2.jsonl_write(data, file_name=verdict_path)

In [None]:
# Load the judge's verdict text for each model.
qwq_final = iv2.jsonl_read('qwq-final-math500.jsonl')
qwen_final = iv2.jsonl_read('qwen2.5-final-math500.jsonl')

# I did a manual check of the outputs and found this special case where both
# QwQ 14B and Qwen2.5 14B got 10% and the answer was supposedly 10. In the
# context of that problem, they meant the same thing.
#
# The prompt could probably be improved though!
#
# Caveat: the override below keys only on the answer string, so it marks every
# '10\%' answer as correct, not just the one problem that was checked by hand.

# A verdict counts as correct when the judge's reply contains "yes"
# (case-insensitive). Each model's override is looked up in that model's own
# answer list.
qwq_final = [
    True if qwq_answers[idx] == '10\\%' else 'yes' in verdict.lower()
    for idx, verdict in enumerate(qwq_final)
]
qwen_final = [
    True if qwen_answers[idx] == '10\\%' else 'yes' in verdict.lower()
    for idx, verdict in enumerate(qwen_final)
]

In [None]:
# Only load_dataset is imported at the top of the notebook, so bring Dataset
# into scope here; otherwise this cell raises NameError on a fresh kernel.
from datasets import Dataset

# One evaluation table per model: the problem, the reference solution and
# answer, the model's full response, its extracted answer, and the verdict.
validated_qwq = Dataset.from_dict({
    "problem": ds['problem'],
    "correct-solution":ds['solution'],
    'correct-answer':ds['answer'],
    'qwq': qwq_responses,
    'qwq-answer':qwq_answers,
    'qwq-correct':qwq_final
})
validated_qwen = Dataset.from_dict({
    "problem": ds['problem'],
    "correct-solution":ds['solution'],
    'correct-answer':ds['answer'],
    'qwen': qwen25_responses,
    'qwen-answer':qwen_answers,
    'qwen-correct':qwen_final
})

# Publish both tables to the same Hub repository.
# NOTE(review): the second positional argument is presumably the config name
# in current `datasets` releases (it was the split in older ones) — confirm
# against the installed version.
validated_qwq.push_to_hub('QwQ-14B-v0.2-MATH500-Eval','qwq')
validated_qwen.push_to_hub('QwQ-14B-v0.2-MATH500-Eval','qwen2.5')