In [1]:
from mr_eval.utils.utils import *
import os

def list_jsonl_files(folder_path):
    """
    列举文件夹中的所有 .jsonl 文件
    Args:
        folder_path (str): 文件夹路径
    Returns:
        List[str]: 所有 .jsonl 文件的路径
    """
    return [f for f in os.listdir(folder_path) if f.endswith(".jsonl")]



In [16]:
## Model names
prm_model_name_dict = dict(
    skyworkprm_1_5B="\\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-1.5B}{Skywork-PRM-1.5B}",
    skyworkprm_7B="\\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-7B}{Skywork-PRM-7B}",
    llemma7b_prm_prm800k="\\href{https://huggingface.co/ScalableMath/llemma-7b-prm-prm800k-level-1to3-hf}{Llemma-PRM800k-7B}",
    llemma7b_prm_metamath="\\href{https://huggingface.co/ScalableMath/llemma-7b-prm-metamath-level-1to3-hf}{Llemma-MetaMath-7B}",
    llemma7b_oprm_prm800k="\\href{https://huggingface.co/ScalableMath/llemma-7b-oprm-prm800k-level-1to3-hf}{Llemma-oprm-7B}",
    mathminos_mistral="\\href{https://github.com/KbsdJames/MATH-Minos}{MATHMinos-Mistral-7B}",
    mathshepherd="\\href{https://huggingface.co/peiyi9979/math-shepherd-mistral-7b-prm}{MathShepherd-Mistral-7B}",
    reasoneval7b="\\href{https://huggingface.co/GAIR/ReasonEval-7B}{ReasonEval-7B}",
    llama3_1_8b_prm_mistral="\\href{https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Mistral-Data}{RLHFlow-PRM-Mistral-8B}",
    llama3_1_8b_prm_deepseek="\\href{https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Deepseek-Data}{RLHFlow-PRM-Deepseek-8B}",
    reasoneval34b="\\href{https://huggingface.co/GAIR/ReasonEval-34B}{ReasonEval-34B}",
)
close_model_name_dict = dict(
    gpt4o="\\href{https://openai.com/index/hello-gpt-4o/}{GPT-4o}",
    o1mini="\\href{https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/}{o1-mini}$^\dagger$",
    
    gemini_2_flash="\\href{https://deepmind.google/technologies/gemini/flash/}{Gemini-2.0-flash-exp}",
    gemini_2_thinking="\\href{https://ai.google.dev/gemini-api/docs/thinking-mode}{Gemini-2.0-thinking-exp-1219}",
)
    
open_model_name_dict = dict(
    o1preview="\\href{https://openai.com/index/introducing-openai-o1-preview/}{o1-preview}$^\dagger$",
    qwen_qwq="\\href{https://huggingface.co/Qwen/QwQ-32B-Preview}{QwQ-Preview-32B}",
)

all_model_name_dict = {**prm_model_name_dict, **close_model_name_dict, **open_model_name_dict}
datasets = ["gsm8k", "math", "olympiadbench", "mmlu"]
datasets_dict = dict(
    gsm8k="GSM8k",
    math="Math",
    olympiadbench="OlympiadBench",
    mmlu="MMLU",
)


## File paths
res_dir = "/mnt/petrelfs/songmingyang/code/reasoning/MR_Hallucination/mr_eval/scripts/logs/prmbench_bon"
res_files = list_jsonl_files(res_dir)
res_names = [f.split(".")[0] for f in res_files]
res_paths = [os.path.join(res_dir, f) for f in res_files]
file_dict = dict(zip(res_names, res_paths))

In [29]:
def get_res_dict(file_dict,model_lists=None):
    res_dict = {}
    if not model_lists:
        for model_name, file_path in file_dict.items():
            res_dict[model_name] = process_jsonl(file_path)[-1]
    else:
        for model_name in model_lists:
            file_path = file_dict[model_name]
            res_dict[model_name] = process_jsonl(file_path)[-1]
    return res_dict


    

def get_res_str(model_dict, datasets_dict, res_dict):
    res = ""
    example_model_name, example_res = list(res_dict.items())[0]
    res += f"pass@8"
    for dataset_name, dataset in datasets_dict.items():
        pass_at_n = example_res["pass_at_n"].get(dataset_name, "N/A")
        res += f" & {pass_at_n*100 :.2f}" if pass_at_n != "N/A" else "& N/A"
    res += "\\\\\n"
    res += f"maj@8"
    for dataset_name, dataset in datasets_dict.items():
        maj_at_n = example_res["maj_of_n"].get(dataset_name, "N/A")
        res += f" & {maj_at_n*100 :.2f}" if maj_at_n != "N/A" else "& N/A"
    res += "\\\\\n \midrule \n"
    
    for model_name, res_item in res_dict.items():
        model = model_dict[model_name]
        res += f"{model}"
        for dataset_name, dataset in datasets_dict.items():
            if dataset_name in res_item['prm_bon']:
                res += f" & {res_item['prm_bon'][dataset_name]*100:.2f}"
            else:
                res += "& N/A "
        res += "\\\\\n"
    return res
    


In [30]:
res_str = ""
res_dict = get_res_dict(file_dict,)
prm_str = get_res_str(all_model_name_dict, datasets_dict, res_dict,)
res_str += prm_str
print(res_str)

pass@8 & 94.77& N/A& N/A& N/A\\
maj@8 & 91.66& N/A& N/A& N/A\\
 \midrule 
\href{https://huggingface.co/peiyi9979/math-shepherd-mistral-7b-prm}{MathShepherd-Mistral-7B} & 90.45& N/A & N/A & N/A \\
\href{https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Deepseek-Data}{RLHFlow-PRM-Deepseek-8B} & 90.60& N/A & N/A & N/A \\
\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-1.5B}{Skywork-PRM-1.5B} & 90.75& N/A & N/A & N/A \\
\href{https://huggingface.co/ScalableMath/llemma-7b-oprm-prm800k-level-1to3-hf}{Llemma-oprm-7B} & 89.92& N/A & N/A & N/A \\
\href{https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Mistral-Data}{RLHFlow-PRM-Mistral-8B} & 90.45& N/A & N/A & N/A \\
\href{https://huggingface.co/GAIR/ReasonEval-7B}{ReasonEval-7B} & 90.45& N/A & N/A & N/A \\
\href{https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-7B}{Skywork-PRM-7B} & 91.74& N/A & N/A & N/A \\
\href{https://huggingface.co/GAIR/ReasonEval-34B}{ReasonEval-34B} & 90.30& N/A & N/A & N/A \\
\href{https://github.c

## Markdown CHART

In [28]:
## Model names
prm_model_name_dict = dict(
    skyworkprm_1_5B="[Skywork-PRM-1.5B](https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-1.5B)",
    skyworkprm_7B="[Skywork-PRM-7B](https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-7B)",
    llemma7b_prm_prm800k="[Llemma-PRM800k-7B](https://huggingface.co/ScalableMath/llemma-7b-prm-prm800k-level-1to3-hf)",
    llemma7b_prm_metamath="[Llemma-MetaMath-7B](https://huggingface.co/ScalableMath/llemma-7b-prm-metamath-level-1to3-hf)",
    llemma7b_oprm_prm800k="[Llemma-oprm-7B](https://huggingface.co/ScalableMath/llemma-7b-oprm-prm800k-level-1to3-hf)",
    mathminos_mistral="[MATHMinos-Mistral-7B](https://github.com/KbsdJames/MATH-Minos)",
    mathshepherd="[MathShepherd-Mistral-7B](https://huggingface.co/peiyi9979/math-shepherd-mistral-7b-prm)",
    reasoneval7b="[ReasonEval-7B](https://huggingface.co/GAIR/ReasonEval-7B)",
    llama3_1_8b_prm_mistral="[RLHFlow-PRM-Mistral-8B](https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Mistral-Data)",
    llama3_1_8b_prm_deepseek="[RLHFlow-PRM-Deepseek-8B](https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Deepseek-Data)",
    reasoneval34b="[ReasonEval-34B](https://huggingface.co/GAIR/ReasonEval-34B)",
    qwen_prm7b="[Qwen2.5-Math-PRM-7B](https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B)",
    qwen_prm72b="[Qwen2.5-Math-PRM-72B](https://huggingface.co/Qwen/Qwen2.5-Math-PRM-72B)",
)

close_model_name_dict = dict(
    gpt4o="[GPT-4o](https://openai.com/index/hello-gpt-4o/)",
    o1mini="[o1-mini](https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/)\\$^\dagger$",
    
    gemini_2_flash="[Gemini-2.0-flash-exp](https://deepmind.google/technologies/gemini/flash/)",
    gemini_2_thinking="[Gemini-2.0-thinking-exp-1219](https://ai.google.dev/gemini-api/docs/thinking-mode)",
)

open_model_name_dict = dict(
    o1preview="[o1-preview](https://openai.com/index/introducing-openai-o1-preview/)\\$^\dagger$",
    qwen_qwq="[QwQ-Preview-32B](https://huggingface.co/Qwen/QwQ-32B-Preview)",
)


In [29]:
def get_res_str(model_dict, classification_dict, res_dict):
    res_str = ""
    avg_res_list = []

    # 表头部分
    header_row = "| Model | Overall"
    separator_row = "|-------|-------"
    for big_classification, current_classification_dict in classification_dict.items():
        for classification, display_classification_name in current_classification_dict.items():
            header_row += f"| {display_classification_name} "
            separator_row += "|-------"
        header_row += f"| Avg ({big_classification}) "
        separator_row += "|-------"
    header_row += " |\n"
    separator_row += " |\n"
    res_str += header_row + separator_row

    # 数据部分
    for idx, (model_name, model_display_name) in enumerate(model_dict.items()):
        temp_str = f"| {model_display_name} "
        current_res_dict = res_dict[model_name]

        # 计算 PRM Score
        prm_score = get_prmscore_from_current_res_dict(current_res_dict)
        all_model_scores = sorted([get_prmscore_from_current_res_dict(res) for res in res_dict.values()], reverse=True)
        if idx == 0:
            avg_res_list.append(sum(all_model_scores) / len(all_model_scores))
        if prm_score == max(all_model_scores):
            temp_str += f"| **{prm_score * 100:.1f}** "
        elif prm_score == all_model_scores[1]:
            temp_str += f"| _{prm_score * 100:.1f}_ "
        else:
            temp_str += f"| {prm_score * 100:.1f} "

        # 分类指标部分
        for big_classification, current_classification_dict in classification_dict.items():
            all_avt = sorted([get_avg_prmscore_from_current_res_dict(res, list(current_classification_dict.keys())) for res in res_dict.values()], reverse=True)
            avg = []
            for classification, display_classification_name in current_classification_dict.items():
                prm_score = get_prmscore_from_current_res_dict(current_res_dict, classification)
                all_prm_scores = sorted([get_prmscore_from_current_res_dict(res, classification) for res in res_dict.values()], reverse=True)
                if idx == 0:
                    avg_res_list.append(sum(all_prm_scores) / len(all_prm_scores))
                avg.append(prm_score)
                if prm_score == max(all_prm_scores):
                    temp_str += f"| **{prm_score * 100:.1f}** "
                elif prm_score == all_prm_scores[1]:
                    temp_str += f"| _{prm_score * 100:.1f}_ "
                else:
                    temp_str += f"| {prm_score * 100:.1f} "

            # 分类指标的平均分
            avg_score = sum(avg) / len(avg)
            if avg_score == max(all_avt):
                temp_str += f"| **{avg_score * 100:.1f}** "
            elif avg_score == all_avt[1]:
                temp_str += f"| _{avg_score * 100:.1f}_ "
            else:
                temp_str += f"| {avg_score * 100:.1f} "
            if idx == 0:
                avg_res_list.append(sum(all_avt) / len(all_avt))

        # 行结束
        temp_str += "\n"
        res_str += temp_str

    # 平均行
    avg_res_str = "| **Avg.** "
    for res in avg_res_list:
        avg_res_str += f"| **{res * 100:.1f}** "
    avg_res_str += "|\n"
    res_str += avg_res_str

    return res_str

In [30]:
res_str = ""

## PRMs
res_dict = get_res_dict(file_dict,model_lists=list(prm_model_name_dict.keys()))
prm_str = get_res_str(prm_model_name_dict, classification_parallel_dict, res_dict,)
res_str += prm_str

## Close Models
res_dict = get_res_dict(file_dict,model_lists=list(close_model_name_dict.keys()))
close_str = get_res_str(close_model_name_dict, classification_parallel_dict, res_dict,)
res_str += close_str


print(res_str)

| Model | Overall| NR. | NCL. | Avg (simplicity) | ES. | SC. | DC. | CI. | Avg (soundness) | PS. | DR. | MS. | Avg (sensitivity)  |
|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|------- |
| [Skywork-PRM-1.5B](https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-1.5B) | 31.5 | 31.2 | 35.6 | 33.4 | 32.3 | 25.6 | 25.7 | 29.9 | 28.4 | 32.8 | 32.0 | 80.9 | 48.6 
| [Skywork-PRM-7B](https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-7B) | 36.2 | 35.7 | 41.2 | 38.4 | 36.7 | 29.1 | 30.6 | 34.4 | 32.7 | 36.8 | 37.4 | 88.8 | 54.3 
| [Llemma-PRM800k-7B](https://huggingface.co/ScalableMath/llemma-7b-prm-prm800k-level-1to3-hf) | 52.0 | 49.3 | 53.4 | 51.4 | 56.4 | 47.1 | 46.7 | 53.3 | 50.9 | 51.0 | 53.5 | 93.6 | 66.0 
| [Llemma-MetaMath-7B](https://huggingface.co/ScalableMath/llemma-7b-prm-metamath-level-1to3-hf) | 50.5 | 50.2 | 50.5 | 50.3 | 51.9 | 47.6 | 44.4 | 52.1 | 49.0 | 50.5 | 51.3 | 96.0 | 66.0 
| [Llemma-oprm-7B](http

## Form HTML str

In [31]:
prm_model_dict = {
    "skyworkprm_1_5B": {"Name": "Skywork-PRM-1.5B", "Source": "https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-1.5B", "Class": "PRM"},
    "skyworkprm_7B": {"Name": "Skywork-PRM-7B", "Source": "https://huggingface.co/Skywork/Skywork-o1-Open-PRM-Qwen-2.5-7B", "Class": "PRM"},
    "llemma7b_prm_prm800k": {"Name": "Llemma-PRM800k-7B", "Source": "https://huggingface.co/ScalableMath/llemma-7b-prm-prm800k-level-1to3-hf", "Class": "PRM"},
    "llemma7b_prm_metamath": {"Name": "Llemma-MetaMath-7B", "Source": "https://huggingface.co/ScalableMath/llemma-7b-prm-metamath-level-1to3-hf", "Class": "PRM"},
    "llemma7b_oprm_prm800k": {"Name": "Llemma-oprm-7B", "Source": "https://huggingface.co/ScalableMath/llemma-7b-oprm-prm800k-level-1to3-hf", "Class": "PRM"},
    "mathminos_mistral": {"Name": "MATHMinos-Mistral-7B", "Source": "https://github.com/KbsdJames/MATH-Minos", "Class": "PRM"},
    "mathshepherd": {"Name": "MathShepherd-Mistral-7B", "Source": "https://huggingface.co/peiyi9979/math-shepherd-mistral-7b-prm", "Class": "PRM"},
    "reasoneval7b": {"Name": "ReasonEval-7B", "Source": "https://huggingface.co/GAIR/ReasonEval-7B", "Class": "PRM"},
    "llama3_1_8b_prm_mistral": {"Name": "RLHFlow-PRM-Mistral-8B", "Source": "https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Mistral-Data", "Class": "PRM"},
    "llama3_1_8b_prm_deepseek": {"Name": "RLHFlow-PRM-Deepseek-8B", "Source": "https://huggingface.co/RLHFlow/Llama3.1-8B-PRM-Deepseek-Data", "Class": "PRM"},
    "reasoneval34b": {"Name": "ReasonEval-34B", "Source": "https://huggingface.co/GAIR/ReasonEval-34B", "Class": "PRM"},
    "gpt4o": {"Name": "GPT-4o", "Source": "https://openai.com/index/hello-gpt-4o/", "Class": "LM-C"},
    "o1mini": {"Name": "o1-mini", "Source": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/", "Class": "LM-C"},
    "gemini_2_flash": {"Name": "Gemini-2.0-flash-exp", "Source": "https://deepmind.google/technologies/gemini/flash/", "Class": "LM-C"},
    "gemini_2_thinking": {"Name": "Gemini-2.0-thinking-exp-1219", "Source": "https://ai.google.dev/gemini-api/docs/thinking-mode", "Class": "LM-C"},
    # "o1preview": {"Name": "o1-preview", "Source": "https://openai.com/index/introducing-openai-o1-preview/", "Class": "LM-C"},
    "qwen_qwq": {"Name": "QwQ-Preview-32B", "Source": "https://huggingface.co/Qwen/QwQ-32B-Preview", "Class": "LM-O"},
    "qwen_prm7b": {"Name": "Qwen2.5-Math-PRM-7B", "Source": "https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B", "Class": "PRM"},
    "qwen_prm72b": {"Name": "Qwen2.5-Math-PRM-72B", "Source": "https://huggingface.co/Qwen/Qwen2.5-Math-PRM-72B", "Class": "PRM"},
}

In [32]:
def get_html_table(model_dict, classification_dict, res_dict):
    res_str = ""
    html_str = '<table class="js-sort-table" id="results">\n'
    
    # 表头部分
    html_str += '  <tr>\n'
    html_str += '    <td class="js-sort-number"><strong>#</strong></td>\n'
    html_str += '    <td class="js-sort-number"><strong>Model</strong></td>\n'
    html_str += '    <td class="js-sort-number"><strong>Class</strong></td>\n'
    html_str += '    <td class="js-sort-number"><strong>Source</strong></td>\n'
    html_str += '    <td class="js-sort-number"><strong>Overall</strong></td>\n'
    
    # 动态生成分类列标题
    for big_classification_idx, (big_classification, current_classification_dict) in enumerate(classification_dict.items()):
        for classification, display_classification_name in current_classification_dict.items():
            html_str += f'    <td class="js-sort-number"><strong>{display_classification_name}</strong></td>\n'
        html_str += f'    <td class="js-sort-number"><strong>S{big_classification_idx+1}</strong></td>\n'  # 添加大类 Avg 列
    html_str += '  </tr>\n'
    res_str += html_str
    sort_list = []
    # 数据部分
    for idx, (model_k, model) in enumerate(model_dict.items()):
        

        # 计算 PRM Score
        all_model_scores = sorted([get_prmscore_from_current_res_dict(res) for res in res_dict.values()], reverse=True)
        current_res_dict = res_dict.get(model_k, {})
        prm_score = get_prmscore_from_current_res_dict(current_res_dict)
        if prm_score == all_model_scores[0]:
            current_total_res_str= f'    <td><b class="best-score-text">{prm_score * 100:.1f}</b></td>\n'
            current_model_name_str = f'     <td><b class="best-score-text">{model["Name"]} 🥇</b></td>\n'
        elif prm_score == all_model_scores[1]:
            current_total_res_str= f'    <td><b class="best-score-text">{prm_score * 100:.1f}</b></td>\n'
            current_model_name_str = f'     <td><b class="best-score-text">{model["Name"]} 🥈</b></td>\n'
        elif prm_score == all_model_scores[2]:
            current_total_res_str= f'    <td><b class="best-score-text">{prm_score * 100:.1f}</b></td>\n'
            current_model_name_str = f'     <td><b class="best-score-text">{model["Name"]} 🥉</b></td>\n'
        else:
            current_total_res_str= f'    <td><b class="">{prm_score * 100:.1f}</b></td>\n'
            current_model_name_str = f'     <td><b class="">{model["Name"]}</b></td>\n'
        html_str = ''
        html_str += '  <tr>\n'
        html_str += "    <td>{CURRENT_RANK}</td>\n"
        html_str += current_model_name_str
        html_str += f'    <td>{model["Class"]}</td>\n'
        html_str += f'    <td><a href="{model["Source"]}" class="ext-link" target="_blank">Link</a></td>\n'
        html_str += current_total_res_str
        currunt_total_prm_score = prm_score
        # 分类指标部分
        for big_classification, current_classification_dict in classification_dict.items():
            avg = []  # 保存当前大类的分类指标分数
            for classification, display_classification_name in current_classification_dict.items():
                prm_score = get_prmscore_from_current_res_dict(current_res_dict, classification)
                avg.append(prm_score)
                html_str += f'    <td>{prm_score * 100:.1f}</td>\n'
            
            #大类平均值
            avg_score = sum(avg) / len(avg) if avg else 0
            html_str += f'    <td><b class="">{avg_score * 100:.1f}</b></td>\n'

        html_str += '  </tr>\n'
        sort_list.append((currunt_total_prm_score, html_str))
    sort_list.sort(key=lambda x: x[0], reverse=True)
    for idx,(_, html_str) in enumerate(sort_list):
        res_str += html_str.format(CURRENT_RANK=idx+1)
    res_str += '</table>\n'
    return res_str

In [33]:
res_str = ""

## PRMs
res_dict = get_res_dict(file_dict,model_lists=list(prm_model_dict.keys()))
prm_str = get_html_table(prm_model_dict, classification_parallel_dict, res_dict,)
res_str += prm_str

print(res_str)

<table class="js-sort-table" id="results">
  <tr>
    <td class="js-sort-number"><strong>#</strong></td>
    <td class="js-sort-number"><strong>Model</strong></td>
    <td class="js-sort-number"><strong>Class</strong></td>
    <td class="js-sort-number"><strong>Source</strong></td>
    <td class="js-sort-number"><strong>Overall</strong></td>
    <td class="js-sort-number"><strong>NR.</strong></td>
    <td class="js-sort-number"><strong>NCL.</strong></td>
    <td class="js-sort-number"><strong>S1</strong></td>
    <td class="js-sort-number"><strong>ES.</strong></td>
    <td class="js-sort-number"><strong>SC.</strong></td>
    <td class="js-sort-number"><strong>DC.</strong></td>
    <td class="js-sort-number"><strong>CI.</strong></td>
    <td class="js-sort-number"><strong>S2</strong></td>
    <td class="js-sort-number"><strong>PS.</strong></td>
    <td class="js-sort-number"><strong>DR.</strong></td>
    <td class="js-sort-number"><strong>MS.</strong></td>
    <td class="js-sort-number"