In [None]:
import sys
import os
import json
from glob import glob
import shutil

In [2]:
from dynamic_cheatsheet.utils.evaluation import eval_for_GameOf24, eval_for_exact_matching_with_no_punctuation, eval_for_multiple_choice, eval_equation_balancer

In [3]:
# Open JSONL file
def read_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default locale encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        # Skip whitespace-only lines (e.g. a trailing empty line), which
        # json.loads would otherwise reject.
        return [json.loads(line) for line in file if line.strip()]

In [None]:
# Score every result file found under results/<TASK>/ and print one accuracy
# line per file, grouped by task.
for TASK in ["AIME_2024", "AIME_2025", "AIME_2020_2024", "GameOf24", "GPQA_Diamond", "MMLU_Pro_Engineering", "MMLU_Pro_Physics", "MathEquationBalancer"]:
    # De-duplicate before sorting in case the two overlapping `**` segments
    # report the same path more than once.
    files = sorted(set(glob(f"results/{TASK}/**/**/*.jsonl", recursive=True)))
    print(TASK)

    for file in files:
        data = read_jsonl(file)

        correct_n = 0
        total_n = len(data)

        for example in data:
            # Named `input_text` rather than `input` so the builtin is not shadowed.
            input_text = example["input"].strip()
            target = example["target"].strip()
            # Only the output of the last recorded step is scored.
            model_output = example['steps'][-1]['generator_output'].strip()
            if "<answer>" in model_output:
                # Take the text after the first <answer> tag, up to </answer> if present.
                final_answer = model_output.split("<answer>")[1].split("</answer>")[0].strip()
            else:
                # No <answer> tag: fall back to scoring the whole output.
                final_answer = model_output

            result = False
            if TASK == "GameOf24":
                # The last line of the prompt is passed to the evaluator as the puzzle numbers.
                numbers = input_text.split("\n")[-1].strip()
                result = eval_for_GameOf24(numbers, final_answer)
            elif TASK in ["AIME_2025", "AIME_2024", "AIME_2020_2024"]:
                # Keep only the contents of the last "boxed{...}" when one is present.
                final_answer = final_answer.split("boxed{")[-1].split("}")[0].strip()
                result = eval_for_exact_matching_with_no_punctuation(final_answer.lower(), target.lower())
            elif TASK in ["GPQA_Diamond", "MMLU_Pro_Engineering", "MMLU_Pro_Physics"]:
                result = eval_for_multiple_choice(input_text, final_answer, target)
            elif TASK == "MathEquationBalancer":
                result = eval_equation_balancer(None, final_answer, target)
            # A False result (including the unmatched-task default) adds 0.
            correct_n += result

        if total_n:
            print(f"{file}:\n{correct_n}/{total_n} ({correct_n/total_n:.2%})\n")
        else:
            # An empty result file has no accuracy; report it instead of
            # raising ZeroDivisionError and aborting the remaining files.
            print(f"{file}:\n0/0 (no examples)\n")

    print('--------------------------------')