In [1]:
from mr_eval.utils.utils import *
import os
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import font_manager

def list_jsonl_files(folder_path):
    """
    List the .jsonl entries found directly inside a folder (no recursion).

    Args:
        folder_path (str): Directory to scan.
    Returns:
        List[str]: Names (not full paths) of the entries ending in ".jsonl",
            sorted alphabetically. Join them with ``folder_path`` to get paths.
    """
    # os.listdir() yields entries in arbitrary order; sort so that anything
    # built from this list is reproducible across runs and machines.
    return sorted(f for f in os.listdir(folder_path) if f.endswith(".jsonl"))
## Model names
# Each mapping goes from an internal model identifier to the name shown in tables.

# Process reward models (PRMs).
prm_model_name_dict = {
    "skyworkprm_1_5B": "Skywork-1.5B",
    "skyworkprm_7B": "Skywork-PRM-7B",
    "llemma7b_prm_prm800k": "Llemma-PRM800k-7B",
    "llemma7b_prm_metamath": "Llemma-MetaMath-7B",
    "llemma7b_oprm_prm800k": "Llemma-oprm-7B",
    "mathminos_mistral": "MATHMinos-7B",
    "mathshepherd": "MathShepherd-7B",
    "reasoneval7b": "ReasonEval-7B",
    "llama3_1_8b_prm_mistral": "RLHFlow-PRM-Mistral-8B",
    "llama3_1_8b_prm_deepseek": "RLHFlow-PRM-Deepseek-8B",
    "reasoneval34b": "ReasonEval-34B",
}

# "close" group — presumably closed-source / API-only models.
close_model_name_dict = {
    "gpt4o": "GPT-4o",
    "o1mini": "o1-mini",
    "o1preview": "o1-preview",
    "gemini_2_flash": "Gemini-2.0-flash-exp",
    "gemini_2_thinking": "Gemini-thinking",
}

# "open" group — presumably open-weight general-purpose models.
open_model_name_dict = {
    "qwen_qwq": "QwQ-Preview-32B",
}

# Union of the three groups, in the order PRM -> close -> open.
all_model_name_dict = {
    model_key: shown_name
    for group in (prm_model_name_dict, close_model_name_dict, open_model_name_dict)
    for model_key, shown_name in group.items()
}


# Flat mapping: classification key -> abbreviation used in tables.
classification_name_dict = {
    "domain_inconsistency": "DC.",
    "redundency": "NR.",
    "multi_solutions": "MS.",
    "deception": "DR.",
    "confidence": "CI.",
    "step_contradiction": "SC.",
    "circular": "NCL.",
    "missing_condition": "PS.",
    "counterfactual": "ES.",
}

# The same abbreviations, grouped under three top-level categories.
classification_parallel_dict = {
    "simplicity": {
        "redundency": "NR.",
        "circular": "NCL.",
    },
    "soundness": {
        "counterfactual": "ES.",
        "step_contradiction": "SC.",
        "domain_inconsistency": "DC.",
        "confidence": "CI.",
    },
    "sensitivity": {
        "missing_condition": "PS.",
        "deception": "DR.",
        "multi_solutions": "MS.",
    },
}

# Classification keys listed in the grouped order above (simplicity, soundness, sensitivity).
classifications = [
    "redundency",
    "circular",
    "counterfactual",
    "step_contradiction",
    "domain_inconsistency",
    "confidence",
    "missing_condition",
    "deception",
    "multi_solutions",
]

# Names of the metrics handled by this notebook.
metrics = [
    "f1",
    "negative_f1",
    "total_step_acc",
    "correct_step_acc",
    "wrong_step_acc",
    "first_error_acc",
    "similarity",
]

## File paths
# NOTE(review): absolute, machine-specific path; the notebook only runs where it exists.
res_dir = "/mnt/petrelfs/songmingyang/code/reasoning/MR_Hallucination/mr_eval/scripts/logs/prmtest_classified"
res_files = list_jsonl_files(res_dir)
# Result name = file name up to (not including) its first "."
res_names = [file_name.partition(".")[0] for file_name in res_files]
res_paths = [os.path.join(res_dir, file_name) for file_name in res_files]
# result name -> path of its .jsonl file
file_dict = {res_name: res_path for res_name, res_path in zip(res_names, res_paths)}
# result name -> last record of its .jsonl file
res_dict = {
    res_name: process_jsonl(res_path)[-1]
    for res_name, res_path in file_dict.items()
}
# Models shown in the case-study table (keys of all_model_name_dict).
display_models = ["reasoneval34b", "mathshepherd", "gpt4o", "gemini_2_thinking"]

In [32]:
# Load test data: concatenate the records of every .jsonl file found in data_dir.
# NOTE(review): absolute, machine-specific path; the notebook only runs where it exists.
data_dir = "/mnt/petrelfs/songmingyang/code/reasoning/MR_Hallucination/mr_eval/tasks/prmtest_classified/data"
dataset_type = "dir_of_jsonl"
# Reuse the notebook's helper instead of re-implementing the ".jsonl" filter, and sort
# explicitly: os.listdir() order is arbitrary, and the order of raw_data decides how
# ties are broken when samples are later ranked and picked by position.
data_files = sorted(list_jsonl_files(data_dir))
raw_data = []
for data_file in data_files:
    raw_data.extend(process_jsonl(os.path.join(data_dir, data_file)))

In [35]:
# Build model_log_dict: model name -> {sample idx -> {"log": ..., "error_num": ...}}

model_log_dict = {}
for model_name, model_res in res_dict.items():
    detailed_log = model_res["detailed_logs"]
    per_sample_logs = {}
    model_log_dict[model_name] = per_sample_logs
    for log in detailed_log:
        # Skip only logs that carry the "validitiy" flag with a falsy value;
        # logs without the flag are kept.
        if not log.get("validitiy", True):
            continue
        idx = log["idx"]
        # error_num = number of entries in wrong_step_acc_list that are below 1
        error_num = sum(
            1 for step_acc in log["results"]["wrong_step_acc_list"] if step_acc < 1
        )
        per_sample_logs[idx] = {"log": log, "error_num": error_num}


# Build meta_data_dict: "<classification>_<idx>" -> item plus error_num summed over models
meta_data_dict = {}
for item in raw_data:
    idx = item["idx"]
    classification = item["classification"]
    new_idx = f"{classification}_{idx}"
    # Models that have no log for this sample contribute nothing to the total.
    total_error_cnt = sum(
        sample_logs[new_idx]["error_num"]
        for sample_logs in model_log_dict.values()
        if new_idx in sample_logs
    )
    meta_data_dict[new_idx] = {
        "item": item,
        "error_cnt": total_error_cnt,
        "new_idx": new_idx,
    }

# Samples ranked by total error count, highest first (stable: ties keep insertion order).
raw_data_w_error = sorted(
    meta_data_dict.values(),
    key=lambda entry: entry["error_cnt"],
    reverse=True,
)


def get_split_str(string, max_len=12):
    r"""
    Insert a LaTeX line break (``\\``) into a string after every ``max_len`` words.

    Words are whitespace-separated tokens. The split is purely word-based, so a
    break can land inside a LaTeX math expression.

    Args:
        string (str): Text to wrap.
        max_len (int): Maximum number of words per line; must be >= 1.
    Returns:
        str: ``string`` unchanged (original whitespace kept) when it has at most
            ``max_len`` words; otherwise the words regrouped into chunks of
            ``max_len`` joined by single spaces, with the chunks joined by
            ``" \\ "``.
    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    # Without this guard a negative max_len silently returns '' (the text is
    # lost) and max_len == 0 fails with an unrelated range() error.
    if max_len < 1:
        raise ValueError(f"max_len must be a positive integer, got {max_len!r}")

    words = string.split()
    if len(words) <= max_len:
        return string

    # Group the words max_len at a time, one group per output line.
    lines = [" ".join(words[i:i + max_len]) for i in range(0, len(words), max_len)]
    return " \\\\ ".join(lines)





In [47]:
## Form Latex str
# Print one sample as the rows of a LaTeX table:
#   header row : question text and the displayed model names
#   step rows  : step text, ground-truth mark, one validity score per model
#   last row   : each model's total step accuracy
select_idx = 6
# NOTE(review): max_single_line_Words is never passed to get_split_str, so wrapping uses
# that function's own default; pass max_len=max_single_line_Words if 16 is intended.
max_single_line_Words = 16
item = raw_data_w_error[select_idx]["item"]
item_idx = raw_data_w_error[select_idx]["new_idx"]
error_steps = item["error_steps"]

# A displayed model may have no log for this sample (model_log_dict omits logs flagged
# invalid). Look each log up once and use None for a missing one, so the table can show
# a "-" placeholder instead of the cell failing with KeyError.
display_model_logs = {}
for model_name in display_models:
    model_entry = model_log_dict.get(model_name, {}).get(item_idx)
    display_model_logs[model_name] = None if model_entry is None else model_entry["log"]

res_str = ""
question = get_split_str(item["modified_question"])
# The trailing "& " opens an empty header cell above the ground-truth mark column, so the
# header has as many cells as the step rows and each model name sits above its own scores.
question_str = f"\\textbf{{Question}} & \\tabincell{{c}}{{{question}}} & "
for model_name in display_models:
    model_display_name = all_model_name_dict[model_name]
    question_str += f"& \\textbf{{{model_display_name}}} "
question_str += "\\\\\n"
res_str += question_str

for step_idx, step in enumerate(item["modified_process"]):
    adjusted_step_str = get_split_str(step)
    # error_steps holds 1-based step numbers: cross mark for erroneous steps, check otherwise.
    if step_idx + 1 in error_steps:
        label = "\\textcolor{myred}{\\ding{55}}"
    else:
        label = "\\textcolor{mygreen}{\\ding{51}}"

    step_str = f"\\textbf{{Step {step_idx + 1}}} & \\tabincell{{c}}{{{adjusted_step_str}}} & {label}"

    for model_name in display_models:
        model_log = display_model_logs[model_name]
        if model_log is None:
            validity_score = []
        else:
            validity_score = model_log["prediction"]["scores"]["step_level_validity_scores"]
        if step_idx >= len(validity_score):
            # No score for this step (missing log, or fewer scores than steps).
            step_str += "& -"
            continue
        current_step_validity_score = validity_score[step_idx]
        # Scores above 0.5 are printed green, the rest red; shown multiplied by 100
        # (presumably scores lie in [0, 1] — TODO confirm).
        score_color = "mygreen" if current_step_validity_score > 0.5 else "myred"
        step_str += f"& \\textcolor{{{score_color}}}{{{current_step_validity_score*100:.1f}}}"
    step_str += "\\\\\n"
    res_str += step_str

final_acc_str = "\\textbf{Final Acc.} & - & -   "
for model_name in display_models:
    model_log = display_model_logs[model_name]
    if model_log is None:
        final_acc_str += "& -"
        continue
    total_acc = model_log["results"]["total_step_acc"]
    final_acc_str += f"& {total_acc*100:.1f}"
final_acc_str += "\\\\\n"
res_str += final_acc_str


print(res_str)
        


\textbf{Question} & \tabincell{c}{Compute \[\left( 1 + \cos \frac {\pi}{8} \right) \left( 1 + \cos \\ \frac {3 \pi}{8} \right) \left( 1 + \cos \frac {5 \pi}{8} \right) \\ \left( 1 + \cos \frac {7 \pi}{8} \right).\]}& \textbf{ReasonEval-34B} & \textbf{MathShepherd-7B} & \textbf{GPT-4o} & \textbf{Gemini-thinking} \\
\textbf{Step 1} & \tabincell{c}{Let's call the expression we're trying to compute $x$.} & \textcolor{mygreen}{\ding{51}}& \textcolor{mygreen}{89.1}& \textcolor{mygreen}{87.5}& \textcolor{mygreen}{100.0}& \textcolor{mygreen}{100.0}\\
\textbf{Step 2} & \tabincell{c}{Use the fact that $\cos(\pi-a)=-\cos a$.} & \textcolor{mygreen}{\ding{51}}& \textcolor{mygreen}{77.5}& \textcolor{mygreen}{82.8}& \textcolor{mygreen}{100.0}& \textcolor{mygreen}{100.0}\\
\textbf{Step 3} & \tabincell{c}{Then we have $x=\left( 1 + \cos \frac {\pi}{8} \right) \left( 1 \\ + \cos \frac {3 \pi}{8} \right) \left( 1 + \cos \frac {5 \\ \pi}{8} \right) \left( 1 + \cos \frac {7 \pi}{8} \right)$.} & \textcolor{

In [45]:
# Inspect the full record of the selected sample: the raw item ("item"), its error
# count summed over all models ("error_cnt"), and its composite index ("new_idx").
raw_data_w_error[select_idx]

{'item': {'original_question': 'Compute \\[\\left( 1 + \\cos \\frac {\\pi}{8} \\right) \\left( 1 + \\cos \\frac {3 \\pi}{8} \\right) \\left( 1 + \\cos \\frac {5 \\pi}{8} \\right) \\left( 1 + \\cos \\frac {7 \\pi}{8} \\right).\\]',
  'modified_question': 'Compute \\[\\left( 1 + \\cos \\frac {\\pi}{8} \\right) \\left( 1 + \\cos \\frac {3 \\pi}{8} \\right) \\left( 1 + \\cos \\frac {5 \\pi}{8} \\right) \\left( 1 + \\cos \\frac {7 \\pi}{8} \\right).\\]',
  'original_process': ["Let's call the expression we're trying to compute $x$.",
   'Use the fact that $\\cos(\\pi-a)=-\\cos a$.',
   'Then we have $x=\\left( 1 + \\cos \\frac {\\pi}{8} \\right) \\left( 1 + \\cos \\frac {3 \\pi}{8} \\right) \\left( 1 + \\cos \\frac {5 \\pi}{8} \\right) \\left( 1 + \\cos \\frac {7 \\pi}{8} \\right)$.',
   'This equals $\\left( 1 + \\cos \\frac {\\pi}{8} \\right) \\left( 1 + \\cos \\frac {3 \\pi}{8} \\right) \\left( 1 - \\cos \\frac {\\pi}{8} \\right) \\left( 1 - \\cos \\frac {3 \\pi}{8} \\right)$.',
   'By t