In [1]:
from mr_eval.utils.utils import *
import os

def list_jsonl_files(folder_path):
    """
    List the .jsonl entries found directly inside a folder.

    Args:
        folder_path (str): Path of the folder to scan (not recursive).
    Returns:
        List[str]: Names (not full paths) of every entry whose name ends
            with ".jsonl"; join them with ``folder_path`` to open them.
            The names are sorted so the result is deterministic, since
            os.listdir returns entries in arbitrary order.
    """
    return sorted(f for f in os.listdir(folder_path) if f.endswith(".jsonl"))



In [2]:
# Folder scanned for result files; each *.jsonl file becomes one row of the table.
res_dir = "/mnt/petrelfs/songmingyang/code/reasoning/MR_Hallucination/mr_eval/scripts/logs/prmtest_classified"
res_files = list_jsonl_files(res_dir)

# Short labels for each classification key (not referenced in this cell;
# presumably used as table column headers elsewhere -- TODO confirm).
classification_name_dict = dict(
    domain_inconsistency="DC.",
    redundency="NR.",
    multi_solutions="MS.",
    deception="DR.",
    confidence="CI.",
    step_contradiction="SC.",
    circular="NCL.",
    missing_condition="PS.",
    counterfactual="ES."
)
# Column-group order of the per-classification part of each row.
classifications = ["redundency", "circular", "counterfactual", "step_contradiction", "domain_inconsistency",  "confidence", "missing_condition", "deception", "multi_solutions", ]
# Metrics printed (in this order) after every aggregated score.
metrics = ["f1", "negative_f1", "total_step_acc", "correct_step_acc", "wrong_step_acc", "first_error_acc", "similarity",]

# Strip only the final extension so names containing dots
# (e.g. "qwen2.5_prm.jsonl" -> "qwen2.5_prm") are not truncated.
res_names = [os.path.splitext(f)[0] for f in res_files]
res_paths = [os.path.join(res_dir, f) for f in res_files]

# Build one tab-separated line per result file:
#   name, overall score, overall metrics, then for each classification
#   its score followed by its metrics. All values are scaled by 100 and
#   formatted with one decimal place.
res_lines = []
for name, path in zip(res_names, res_paths):
    # Only the last element returned by process_jsonl is used
    # (presumably the summary record of the run -- TODO confirm).
    temp_res = process_jsonl(path)[-1]
    total_results = temp_res["total_hallucination_results"]
    type_results = temp_res["hallucination_type_results"]

    fields = [f"{name}"]

    # Aggregated score: equal-weight average of f1 and negative_f1.
    prm_score = total_results['f1'] * 0.5 + total_results['negative_f1'] * 0.5
    fields.append(f"{prm_score * 100:.1f}")
    fields.extend(f"{total_results[metric] * 100:.1f}" for metric in metrics)

    for classification in classifications:
        # Same aggregation, restricted to one classification;
        # type_results is indexed as [metric][classification].
        prm_score = type_results['f1'][classification] * 0.5 + type_results['negative_f1'][classification] * 0.5
        fields.append(f"{prm_score * 100:.1f}")
        fields.extend(
            f"{type_results[metric][classification]*100:.1f}" for metric in metrics
        )

    res_lines.append("\t".join(fields) + "\n")

res_str = "".join(res_lines)
print(res_str)

llemma7b_prm_prm800k	52.0	75.7	28.3	63.7	66.4	48.5	22.2	20.5	49.3	77.7	20.9	65.2	69.2	36.9	20.5	13.6	53.4	65.5	41.3	56.6	54.9	61.4	15.7	34.4	56.4	76.8	36.1	65.9	69.2	51.7	20.0	18.8	47.1	75.2	18.9	62.1	64.3	42.5	24.1	19.5	46.7	71.6	21.9	58.3	59.9	47.1	20.5	25.7	53.3	79.9	26.6	68.5	70.9	50.0	29.4	13.8	51.0	71.7	30.3	59.7	62.3	47.9	24.4	25.0	53.5	77.8	29.1	66.2	70.4	43.8	23.1	14.9	46.8	93.6	0.0	88.0	88.0	-100.0	-100.0	10.2
gpt4o	66.8	86.9	46.7	79.0	82.9	58.2	64.4	23.4	57.0	77.8	36.3	67.0	66.5	70.4	77.1	31.1	62.4	73.5	51.3	65.6	65.4	66.2	80.6	40.1	72.0	88.9	55.2	82.2	88.8	55.4	63.2	21.3	69.7	89.9	49.6	83.1	84.3	74.3	76.9	23.7	70.7	88.2	53.3	81.2	81.3	80.4	81.4	28.6	71.1	92.1	50.2	86.3	90.4	56.5	60.4	14.8	62.5	86.8	38.3	78.2	88.8	34.7	33.3	16.2	65.7	89.2	42.2	81.8	90.5	39.3	41.3	15.2	49.6	99.2	0.0	98.4	98.4	-100.0	-100.0	3.2
gemini_2_thinking	68.8	89.7	47.8	82.8	89.0	49.8	57.0	18.0	68.5	91.4	45.6	85.1	90.9	47.1	56.5	15.3	63.8	81.2	46.4	72.2	82.8	44.0	54.8	25.4	72.9	89.4	56.4	83.0	89.8	55.5	

In [3]:
# Tab-separated header: "all" first, then the classification keys in order.
classifications_str = "\t".join(["all", *classifications])
print(classifications_str)

all	redundency	circular	counterfactual	step_contradiction	domain_inconsistency	confidence	missing_condition	deception	multi_solutions
