In [1]:
from mr_eval.utils.utils import *
import os

def list_jsonl_files(folder_path):
    """
    List the ``.jsonl`` entries found directly inside a folder.

    Args:
        folder_path (str): Path of the folder to scan (not recursive).
    Returns:
        List[str]: Names of the entries ending in ".jsonl", in the order
            ``os.listdir`` reports them. These are bare file names, not
            full paths; join them with ``folder_path`` to open them.
    """
    jsonl_names = []
    for entry in os.listdir(folder_path):
        if entry.endswith(".jsonl"):
            jsonl_names.append(entry)
    return jsonl_names



In [44]:
## Model names
# Result-file stem -> LaTeX display name (an \href to the model page) for the
# process reward models. Insertion order is the row order of the tables.
prm_model_name_dict = {
    "skyworkprm_1_5B": r"\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-1.5B}{Skywork-PRM-1.5B}",
    "skyworkprm_7B": r"\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-7B}{Skywork-PRM-7B}",
    "llemma7b_prm_prm800k": r"\href{https://huggingface.co/ScalableMath/llemma-7b-prm-prm800k-level-1to3-hf}{Llemma-PRM800k-7B}",
    "llemma7b_prm_metamath": r"\href{https://huggingface.co/ScalableMath/llemma-7b-prm-metamath-level-1to3-hf}{Llemma-MetaMath-7B}",
    "llemma7b_oprm_prm800k": r"\href{https://huggingface.co/ScalableMath/llemma-7b-oprm-prm800k-level-1to3-hf}{Llemma-oprm-7B}",
    "mathminos_mistral": r"\href{https://github.com/KbsdJames/MATH-Minos}{MATHMinos-Mistral-7B}",
    "mathshepherd": r"\href{https://huggingface.co/peiyi9979/math-shepherd-mistral-7b-prm}{MathShepherd-Mistral-7B}",
    "reasoneval7b": r"\href{https://huggingface.co/GAIR/ReasonEval-7B}{ReasonEval-7B}",
    "llama3_1_8b_prm_mistral": r"\href{https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Mistral-Data}{RLHFlow-PRM-Mistral-8B}",
    "llama3_1_8b_prm_deepseek": r"\href{https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Deepseek-Data}{RLHFlow-PRM-Deepseek-8B}",
    "reasoneval34b": r"\href{https://huggingface.co/GAIR/ReasonEval-34B}{ReasonEval-34B}",
}
# Result-file stem -> LaTeX display name for the proprietary models.
# Every backslash is escaped explicitly: the former "$^\dagger$" relied on the
# invalid escape sequence "\d", which newer Python versions warn about.
# The resulting string values are unchanged.
close_model_name_dict = dict(
    gpt4o="\\href{https://openai.com/index/hello-gpt-4o/}{GPT-4o}",
    o1mini="\\href{https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/}{o1-mini}$^\\dagger$",
    gemini_2_flash="\\href{https://deepmind.google/technologies/gemini/flash/}{Gemini-2.0-flash-exp}",
    gemini_2_thinking="\\href{https://ai.google.dev/gemini-api/docs/thinking-mode}{Gemini-2.0-thinking-exp-1219}",
)
    
# Result-file stem -> LaTeX display name for the remaining models.
# Backslashes are escaped explicitly ("\\dagger" instead of the invalid
# escape "\d"); the resulting string values are unchanged.
# NOTE(review): o1-preview is listed here rather than in
# close_model_name_dict - confirm this grouping is intentional.
open_model_name_dict = dict(
    o1preview="\\href{https://openai.com/index/introducing-openai-o1-preview/}{o1-preview}$^\\dagger$",
    qwen_qwq="\\href{https://huggingface.co/Qwen/QwQ-32B-Preview}{QwQ-Preview-32B}",
)



# Classification key (as it appears in the result files) -> short column
# label used in the tables. The keys are looked up at runtime, so their
# spelling (e.g. "redundency") must stay exactly as is.
classification_name_dict = {
    "domain_inconsistency": "DC.",
    "redundency": "NR.",
    "multi_solutions": "MS.",
    "deception": "DR.",
    "confidence": "CI.",
    "step_contradiction": "SC.",
    "circular": "NCL.",
    "missing_condition": "PS.",
    "counterfactual": "ES.",
}
# Classifications grouped into the three top-level categories of the main
# table. Group order and the order inside each group define the column order.
classification_parallel_dict = {
    "simplicity": {
        "redundency": "NR.",
        "circular": "NCL.",
    },
    "soundness": {
        "counterfactual": "ES.",
        "step_contradiction": "SC.",
        "domain_inconsistency": "DC.",
        "confidence": "CI.",
    },
    "sensitivity": {
        "missing_condition": "PS.",
        "deception": "DR.",
        "multi_solutions": "MS.",
    },
}
# Flat list of classification keys, in the same order as the groups of
# classification_parallel_dict (simplicity, soundness, sensitivity).
classifications = [
    "redundency",
    "circular",
    "counterfactual",
    "step_contradiction",
    "domain_inconsistency",
    "confidence",
    "missing_condition",
    "deception",
    "multi_solutions",
]
# Metric keys looked up in the result dictionaries.
metrics = [
    "f1",
    "negative_f1",
    "total_step_acc",
    "correct_step_acc",
    "wrong_step_acc",
    "first_error_acc",
    "similarity",
]

## File paths
# Directory that holds one ".jsonl" result file per evaluated model.
res_dir = "/mnt/petrelfs/songmingyang/code/reasoning/MR_Hallucination/mr_eval/scripts/logs/prmtest_classified"
res_files = list_jsonl_files(res_dir)
# The model key is the file name up to its first "." (so "x.jsonl" -> "x").
res_names = [jsonl_name.split(".")[0] for jsonl_name in res_files]
res_paths = [os.path.join(res_dir, jsonl_name) for jsonl_name in res_files]
# Model key -> full path of its result file.
file_dict = {model_key: jsonl_path for model_key, jsonl_path in zip(res_names, res_paths)}

In [45]:
def get_res_dict(file_dict,model_lists=None):
    '''
    Load the final record of each model's result file.

    Args:
        file_dict: Mapping of model name -> path of its result file.
        model_lists: Optional list of model names to load. When it is empty
            or None, every model in ``file_dict`` is loaded. A name missing
            from ``file_dict`` raises KeyError.
    Returns:
        dict: model name -> last element of ``process_jsonl(path)``.
            (Presumably the summary record written at the end of the file;
            verify against process_jsonl and the result-file format.)
    '''
    selected_models = model_lists if model_lists else file_dict.keys()
    return {
        model_name: process_jsonl(file_dict[model_name])[-1]
        for model_name in selected_models
    }


def get_prmscore_from_current_res_dict(res_dict,classification=None):
    '''
    Compute the PRM score from one model's result dict.

    Args:
        res_dict: Result dict of a single model. Must contain
            "total_hallucination_results" (metric -> value) and/or
            "hallucination_type_results" (metric -> classification -> value),
            depending on which branch is taken.
        classification: Optional classification key. When omitted (or
            falsy), the overall score is returned.
    Returns:
        The mean of "f1" and "negative_f1" (each weighted 0.5), either
        overall or for the given classification. For "multi_solutions"
        only the "f1" value is returned and "negative_f1" is not read.
    '''
    if not classification:
        overall = res_dict["total_hallucination_results"]
        return overall['f1'] * 0.5 + overall['negative_f1'] * 0.5

    per_type = res_dict["hallucination_type_results"]
    positive_f1 = per_type['f1'][classification]
    if classification in ["multi_solutions"]:
        # This classification is scored on f1 alone.
        return positive_f1
    return positive_f1 * 0.5 + per_type['negative_f1'][classification] * 0.5


def get_avg_prmscore_from_current_res_dict(res_dict,classifications):
    '''
    Average the per-classification PRM scores of one model.

    Args:
        res_dict: Result dict of a single model.
        classifications: Non-empty collection of classification keys; an
            empty one trips the assertion.
    Returns:
        Arithmetic mean of get_prmscore_from_current_res_dict over the
        given classifications, taken in the given order.
    '''
    assert classifications
    per_class_scores = []
    for classification in classifications:
        per_class_scores.append(get_prmscore_from_current_res_dict(res_dict, classification))
    # sum() is kept (rather than a manual accumulator) so the floating-point
    # result is exactly the one callers compare against.
    return sum(per_class_scores) / len(per_class_scores)

def get_metric_from_current_res_dict(res_dict,metric,classification=None):
    '''
    Read a single metric from one model's result dict.

    Args:
        res_dict: Result dict of a single model.
        metric: Metric key, e.g. "f1" or "similarity".
        classification: Optional classification key. When omitted (or
            falsy) the value is read from "total_hallucination_results",
            otherwise from "hallucination_type_results"[metric][classification].
    Returns:
        The stored value, except for "similarity", which is returned as
        ``1 - value``.
    '''
    if not classification:
        stored_value = res_dict["total_hallucination_results"][metric]
    else:
        stored_value = res_dict["hallucination_type_results"][metric][classification]
    return 1 - stored_value if metric == "similarity" else stored_value
    
# def get_avg_metric_from_current_res_dict(res_dict,metric,classifications):
#     '''
#     Get AVG metric from model level dict
#     '''
#     assert classifications
#     res = [get_metric_from_current_res_dict(res_dict,metric,classification) for classification in classifications]
#     return sum(res) / len(res)
    

def get_res_str(model_dict,classification_dict,res_dict):
    '''
    Build the LaTeX rows of the main results table.

    One row is produced per model in ``model_dict`` (in its order), followed
    by a grey "Avg." row. Each model row contains the overall PRM score and
    then, for every group in ``classification_dict``, one cell per
    classification plus the group average. Within each column the best value
    over all models in ``res_dict`` is bold and the second entry of the
    descending ranking is underlined.

    Args:
        model_dict: model name -> LaTeX display name.
        classification_dict: group name -> {classification key -> label}.
            Only the keys are used; the labels are not emitted here.
        res_dict: model name -> result dict of that model. Rankings and the
            "Avg." row are computed over *all* entries of ``res_dict``.
    Returns:
        str: The table rows, each terminated by "\\\\" and a newline.
    Raises:
        KeyError: if a model of ``model_dict`` is missing from ``res_dict``.
    '''
    def _ranked_cell(score, ranking):
        # Scores and rankings come from the same function calls, so exact
        # float equality identifies a model's position in the ranking.
        shown = f"{score * 100:.1f}"
        if score == ranking[0]:
            return f" & \\textbf{{{shown}}}"
        if len(ranking) > 1 and score == ranking[1]:
            return f" & \\underline{{{shown}}}"
        return f" & {shown}"

    avg_res_list = []
    overall_ranking = []
    rankings_by_group = {}
    if model_dict:
        for model_name in model_dict:
            if model_name not in res_dict:
                raise KeyError(model_name)

        # The cross-model rankings do not depend on the row being rendered,
        # so they are computed once here instead of once per model.
        all_results = list(res_dict.values())
        overall_ranking = sorted(
            [get_prmscore_from_current_res_dict(res) for res in all_results],
            reverse=True,
        )
        avg_res_list.append(sum(overall_ranking) / len(overall_ranking))
        for big_classification, current_classification_dict in classification_dict.items():
            group_ranking = sorted(
                [
                    get_avg_prmscore_from_current_res_dict(res, list(current_classification_dict.keys()))
                    for res in all_results
                ],
                reverse=True,
            )
            class_rankings = {}
            for classification in current_classification_dict:
                class_ranking = sorted(
                    [get_prmscore_from_current_res_dict(res, classification) for res in all_results],
                    reverse=True,
                )
                class_rankings[classification] = class_ranking
                avg_res_list.append(sum(class_ranking) / len(class_ranking))
            # The group average column comes after the group's classifications.
            avg_res_list.append(sum(group_ranking) / len(group_ranking))
            rankings_by_group[big_classification] = (group_ranking, class_rankings)

    rows = []
    for model_name, model_display_name in model_dict.items():
        current_res_dict = res_dict[model_name]
        row = f"{model_display_name}"
        row += _ranked_cell(get_prmscore_from_current_res_dict(current_res_dict), overall_ranking)
        for big_classification, current_classification_dict in classification_dict.items():
            group_ranking, class_rankings = rankings_by_group[big_classification]
            group_scores = []
            for classification in current_classification_dict:
                prm_score = get_prmscore_from_current_res_dict(current_res_dict, classification)
                group_scores.append(prm_score)
                row += _ranked_cell(prm_score, class_rankings[classification])
            row += _ranked_cell(sum(group_scores) / len(group_scores), group_ranking)
        rows.append(row + "\\\\\n")

    avg_res_str = "\\cellcolor{gray!10} \\textbf{Avg.} "
    for res in avg_res_list:
        avg_res_str += f"& \\cellcolor{{gray!10}} {res * 100:.1f} "
    avg_res_str += "\\\\\n"

    return "".join(rows) + avg_res_str



In [36]:
res_str = ""

## PRMs
# Panel header row spanning the 14 table columns. Every backslash is escaped
# explicitly: "\h" and "\m" are invalid escape sequences that newer Python
# versions warn about. The emitted text is unchanged.
model_type_panel="\\hline \\multicolumn{14}{c}{\\textit{\\textbf{Open-source Process Level Reward Models}}} \\\\   \\hline \n"
res_dict = get_res_dict(file_dict,model_lists=list(prm_model_name_dict.keys()))
prm_str = get_res_str(prm_model_name_dict, classification_parallel_dict, res_dict,)
res_str += model_type_panel + prm_str

## Close Models
# Rankings and averages are computed per panel, since res_dict is reloaded
# with only this panel's models.
model_type_panel= "\\hline \\multicolumn{14}{c}{\\textit{\\textbf{Proprietary LLMs, Prompted as Critic Models}}} \\\\   \\hline \n"
res_dict = get_res_dict(file_dict,model_lists=list(close_model_name_dict.keys()))
close_str = get_res_str(close_model_name_dict, classification_parallel_dict, res_dict,)
res_str += model_type_panel + close_str


print(res_str)

\hline \multicolumn{14}{c}{\textit{\textbf{Open-source Process Level Reward Models}}} \\   \hline 
\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-1.5B}{Skywork-PRM-1.5B} & 31.7 & 31.4 & 35.8 & 33.6 & 32.4 & 25.7 & 26.0 & 30.2 & 28.6 & 33.1 & 32.3 & 81.1 & 48.8\\
\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-7B}{Skywork-PRM-7B} & 36.2 & 35.7 & 41.2 & 38.4 & 36.7 & 29.1 & 30.6 & 34.4 & 32.7 & 36.8 & 37.4 & 88.8 & 54.3\\
\href{https://huggingface.co/ScalableMath/llemma-7b-prm-prm800k-level-1to3-hf}{Llemma-PRM800k-7B} & 52.0 & 49.3 & \underline{53.4} & 51.4 & 56.4 & 47.1 & 46.7 & 53.3 & 50.9 & 51.0 & 53.5 & 93.6 & 66.0\\
\href{https://huggingface.co/ScalableMath/llemma-7b-prm-metamath-level-1to3-hf}{Llemma-MetaMath-7B} & 50.5 & 50.2 & 50.5 & 50.3 & 51.9 & 47.6 & 44.4 & 52.1 & 49.0 & 50.5 & 51.3 & 96.0 & 66.0\\
\href{https://huggingface.co/ScalableMath/llemma-7b-oprm-prm800k-level-1to3-hf}{Llemma-oprm-7B} & 50.3 & 48.7 & 49.3 & 49.0 & 54.2 & 46.8 & 44.

## Appendix LaTeX table string

In [46]:
# Metric columns of the appendix tables, in display order.
display_metrics = [
    "f1",
    "negative_f1",
    "total_step_acc",
    "correct_step_acc",
    "wrong_step_acc",
    "first_error_acc",
    "similarity",
]
def get_appendix_res_str(model_dict,res_dict, classification):
    '''
    Build the LaTeX rows of an appendix table for one classification.

    One row is produced per model in ``model_dict`` (in its order), followed
    by a grey "Avg." row. Each model row contains the PRM score for
    ``classification`` and then one cell per entry of the module-level
    ``display_metrics``. Within each column the best value over all models
    in ``res_dict`` is bold and the second entry of the descending ranking
    is underlined; the "similarity" column is never highlighted.

    Args:
        model_dict: model name -> LaTeX display name.
        res_dict: model name -> result dict of that model. Rankings and the
            "Avg." row are computed over *all* entries of ``res_dict``.
        classification: Classification key the table is about.
    Returns:
        str: The table rows, each terminated by "\\\\" and a newline.
    Raises:
        KeyError: if a model of ``model_dict`` is missing from ``res_dict``.
    '''
    def _ranked_cell(score, ranking, highlight=True):
        # Scores and rankings come from the same function calls, so exact
        # float equality identifies a model's position in the ranking.
        shown = f"{score * 100:.1f}"
        if highlight and score == ranking[0]:
            return f" & \\textbf{{{shown}}}"
        if highlight and len(ranking) > 1 and score == ranking[1]:
            return f" & \\underline{{{shown}}}"
        return f" & {shown}"

    avg_res_list = []
    score_ranking = []
    metric_rankings = {}
    if model_dict:
        for model_name in model_dict:
            if model_name not in res_dict:
                raise KeyError(model_name)

        # The cross-model rankings do not depend on the row being rendered,
        # so they are computed once here instead of once per model.
        all_results = list(res_dict.values())
        score_ranking = sorted(
            [get_prmscore_from_current_res_dict(res, classification) for res in all_results],
            reverse=True,
        )
        avg_res_list.append(sum(score_ranking) / len(score_ranking))
        for display_metric in display_metrics:
            metric_ranking = sorted(
                [get_metric_from_current_res_dict(res, display_metric, classification) for res in all_results],
                reverse=True,
            )
            metric_rankings[display_metric] = metric_ranking
            avg_res_list.append(sum(metric_ranking) / len(metric_ranking))

    rows = []
    for model_name, model_display_name in model_dict.items():
        current_res_dict = res_dict[model_name]
        row = f"{model_display_name}"
        row += _ranked_cell(
            get_prmscore_from_current_res_dict(current_res_dict, classification),
            score_ranking,
        )

        # detailed metrics
        for display_metric in display_metrics:
            metric_score = get_metric_from_current_res_dict(current_res_dict, display_metric, classification)
            row += _ranked_cell(
                metric_score,
                metric_rankings[display_metric],
                highlight=display_metric != "similarity",
            )
        rows.append(row + "\\\\\n")

    avg_res_str = "\\cellcolor{gray!10} \\textbf{Avg.} "
    for res in avg_res_list:
        avg_res_str += f"& \\cellcolor{{gray!10}} {res * 100:.1f} "
    avg_res_str += "\\\\\n"

    return "".join(rows) + avg_res_str

In [None]:
# Re-declares the classification grouping (group -> {classification key ->
# short label}) with the same content as the configuration cell earlier in
# this notebook.
classification_parallel_dict = {
    "simplicity": {
        "redundency": "NR.",
        "circular": "NCL.",
    },
    "soundness": {
        "counterfactual": "ES.",
        "step_contradiction": "SC.",
        "domain_inconsistency": "DC.",
        "confidence": "CI.",
    },
    "sensitivity": {
        "missing_condition": "PS.",
        "deception": "DR.",
        "multi_solutions": "MS.",
    },
}

In [57]:
# Classification whose appendix table is generated by this cell.
classification = "deception"
res_str = ""

## PRMs
# Panel header row spanning the 9 table columns. Every backslash is escaped
# explicitly: "\h" and "\m" are invalid escape sequences that newer Python
# versions warn about. The emitted text is unchanged.
model_type_panel="\\hline \\multicolumn{9}{c}{\\textit{\\textbf{Open-source Process Level Reward Models}}} \\\\   \\hline \n"
res_dict = get_res_dict(file_dict,model_lists=list(prm_model_name_dict.keys()))
prm_str = get_appendix_res_str(prm_model_name_dict,res_dict, classification)
res_str += model_type_panel + prm_str

## Close Models
# Rankings and averages are computed per panel, since res_dict is reloaded
# with only this panel's models.
model_type_panel= "\\hline \\multicolumn{9}{c}{\\textit{\\textbf{Proprietary LLMs, Prompted as Critic Models}}} \\\\   \\hline \n"
res_dict = get_res_dict(file_dict,model_lists=list(close_model_name_dict.keys()))
close_str = get_appendix_res_str(close_model_name_dict, res_dict, classification)
res_str += model_type_panel + close_str


print(res_str)


\hline \multicolumn{9}{c}{\textit{\textbf{Open-source Process Level Reward Models}}} \\   \hline 
\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-1.5B}{Skywork-PRM-1.5B} & 32.3 & 36.3 & 28.2 & 32.5 & 22.8 & \textbf{84.0} & \textbf{79.1} & 74.5\\
\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-7B}{Skywork-PRM-7B} & 37.4 & 44.9 & 29.9 & 38.3 & 29.8 & \underline{83.3} & \underline{77.8} & 73.3\\
\href{https://huggingface.co/ScalableMath/llemma-7b-prm-prm800k-level-1to3-hf}{Llemma-PRM800k-7B} & 53.5 & 77.8 & 29.1 & 66.2 & 70.4 & 43.8 & 23.1 & 85.1\\
\href{https://huggingface.co/ScalableMath/llemma-7b-prm-metamath-level-1to3-hf}{Llemma-MetaMath-7B} & 51.3 & 80.6 & 22.1 & 68.9 & 76.6 & 27.9 & 15.1 & 85.6\\
\href{https://huggingface.co/ScalableMath/llemma-7b-oprm-prm800k-level-1to3-hf}{Llemma-oprm-7B} & 51.3 & 78.4 & 24.1 & 66.4 & 72.5 & 33.8 & 17.2 & 86.6\\
\href{https://github.com/KbsdJames/MATH-Minos}{MATHMinos-Mistral-7B} & 55.8 & 79.1 & \underline{32.4