In [1]:
import json
import os

In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, precision_recall_curve, auc, confusion_matrix, f1_score, accuracy_score
import seaborn as sns
import pandas as pd

# Load the predictions JSON file
def load_predictions(json_path):
    """Open ``json_path`` as UTF-8 text and return the parsed JSON object."""
    with open(json_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# Extract labels and per-class probabilities from the loaded predictions
def extract_data(predictions_data):
    """
    Split the ``"predictions"`` records into labels and per-class probabilities.

    Args:
        predictions_data: Mapping with a ``"predictions"`` list; each record is
            read for ``"true_label"``, ``"predicted_label"`` and a
            ``"probabilities"`` mapping keyed by class name.

    Returns:
        ``(true_labels, pred_labels, probs)``: two NumPy arrays and a dict that
        maps ``"Yes"``, ``"To some extent"`` and ``"No"`` to lists of
        per-record probabilities, in record order.
    """
    records = predictions_data["predictions"]
    class_order = ("Yes", "To some extent", "No")

    truth = np.array([record["true_label"] for record in records])
    predicted = np.array([record["predicted_label"] for record in records])
    probs = {
        name: [record["probabilities"][name] for record in records]
        for name in class_order
    }

    return truth, predicted, probs

# 1. Threshold analysis for one class, treated as a binary (one-vs-rest) problem
def threshold_analysis(true_labels, probabilities, positive_class=0, target_class_name="Yes"):
    """
    Evaluate one class as a one-vs-rest binary problem across decision thresholds.

    Saves a two-panel figure (ROC curve and precision-recall curve) to
    ``threshold_analysis_{target_class_name}.png`` and prints the candidate
    threshold from ``np.arange(0.1, 1.0, 0.05)`` with the highest binary F1.
    When several candidates tie, the smallest one is reported.

    Args:
        true_labels: NumPy array of class labels.
        probabilities: Mapping from class name to a sequence of per-sample scores.
        positive_class: Label value treated as the positive class.
        target_class_name: Key in ``probabilities`` whose scores are thresholded.
            It is expected to name the same class as ``positive_class``.

    Returns:
        ``(best_threshold, best_f1, thresholds, fpr, tpr)``; the last three are
        the arrays returned by ``roc_curve``.
    """
    # One-vs-rest targets and the scores of the class under study
    is_positive = (true_labels == positive_class).astype(int)
    scores = np.array(probabilities[target_class_name])

    # ROC curve and its area
    fpr, tpr, thresholds = roc_curve(is_positive, scores)
    roc_auc = auc(fpr, tpr)

    # Precision-recall curve and its area
    precision, recall, _ = precision_recall_curve(is_positive, scores)
    pr_auc = auc(recall, precision)

    fig, (roc_ax, pr_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: ROC curve with the chance diagonal
    roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    roc_ax.set_xlim([0.0, 1.0])
    roc_ax.set_ylim([0.0, 1.05])
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title(f'ROC Curve for Class: {target_class_name}')
    roc_ax.legend(loc="lower right")

    # Right panel: precision-recall curve
    pr_ax.plot(recall, precision, color='green', lw=2, label=f'PR curve (area = {pr_auc:.2f})')
    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.set_title(f'Precision-Recall Curve for Class: {target_class_name}')
    pr_ax.legend(loc="lower left")

    fig.tight_layout()
    fig.savefig(f'threshold_analysis_{target_class_name}.png')
    plt.close(fig)

    # Score every candidate cutoff by binary F1; max() keeps the first
    # (smallest) cutoff among ties.
    candidates = [
        (cutoff, f1_score(is_positive, (scores >= cutoff).astype(int), average='binary'))
        for cutoff in np.arange(0.1, 1.0, 0.05)
    ]
    best_threshold, best_f1 = max(candidates, key=lambda pair: pair[1])

    print(f"类别 {target_class_name} 的最佳阈值为 {best_threshold:.2f}，F1分数为 {best_f1:.4f}")

    return best_threshold, best_f1, thresholds, fpr, tpr

# 2. Confusion matrix analysis
def confusion_matrix_analysis(true_labels, pred_labels, class_names=['Yes', 'To some extent', 'No'],
                              save_path='confusion_matrix.png'):
    """
    Compute the confusion matrix, save it as a heatmap and return it.

    Args:
        true_labels: Ground-truth labels, encoded as indices into ``class_names``.
        pred_labels: Predicted labels, using the same encoding.
        class_names: Display names; position ``i`` labels class index ``i``.
        save_path: Where the heatmap image is written.

    Returns:
        The ``len(class_names) x len(class_names)`` confusion matrix
        (rows = true class, columns = predicted class).
    """
    class_names = list(class_names)
    # Pin the label set so the matrix always has one row/column per class name,
    # even when a class never occurs in either array; otherwise the matrix
    # shrinks and the tick labels no longer line up with it.
    label_ids = list(range(len(class_names)))
    cm = confusion_matrix(true_labels, pred_labels, labels=label_ids)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names,
                ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title('Confusion Matrix')
    fig.savefig(save_path)
    plt.close(fig)

    return cm

# 3. Threshold optimization for the multi-class case
def optimize_thresholds_multiclass(true_labels, probabilities, class_names=['Yes', 'To some extent', 'No']):
    """
    Pick a per-class threshold and evaluate the predictions they produce.

    For each class, ``threshold_analysis`` is run one-vs-rest (class index =
    position in ``class_names``) and its best-F1 threshold is kept. The
    thresholds are then applied jointly with ``apply_optimized_thresholds`` and
    the resulting predictions are scored.

    Args:
        true_labels: NumPy array of labels, encoded as indices into ``class_names``.
        probabilities: Mapping from class name to per-sample probabilities.
        class_names: Class names in label-index order.

    Returns:
        ``(best_thresholds, overall_accuracy, overall_f1)`` where
        ``best_thresholds`` maps class name to threshold and ``overall_f1`` is
        the macro-averaged F1.
    """
    best_thresholds = {}

    # Find the best one-vs-rest threshold for every class
    for class_index, class_name in enumerate(class_names):
        best_threshold, _, _, _, _ = threshold_analysis(
            true_labels, probabilities, positive_class=class_index, target_class_name=class_name
        )
        best_thresholds[class_name] = best_threshold

    # Forward class_names so a caller-supplied class list is honoured instead of
    # silently falling back to the helper's default names.
    optimized_preds = apply_optimized_thresholds(probabilities, best_thresholds, class_names)

    # Overall performance with the optimized thresholds
    overall_accuracy = accuracy_score(true_labels, optimized_preds)
    overall_f1 = f1_score(true_labels, optimized_preds, average='macro')

    print(f"优化阈值后的准确率: {overall_accuracy:.4f}")
    print(f"优化阈值后的F1分数: {overall_f1:.4f}")

    # Confusion matrix after optimization.
    # NOTE(review): this writes to the helper's default file
    # ('confusion_matrix.png'), the same file used when the helper is called on
    # the raw predictions, so an earlier image is overwritten.
    confusion_matrix_analysis(true_labels, optimized_preds, class_names)

    return best_thresholds, overall_accuracy, overall_f1

# Apply per-class thresholds to obtain class predictions
def apply_optimized_thresholds(probabilities, thresholds, class_names=['Yes', 'To some extent', 'No']):
    """
    Classify each sample as the class with the largest probability/threshold ratio.

    Args:
        probabilities: Mapping from class name to a sequence of per-sample
            probabilities (all sequences of equal length).
        thresholds: Mapping from class name to a strictly positive threshold.
        class_names: Class names; the returned index ``i`` refers to
            ``class_names[i]``.

    Returns:
        1-D NumPy integer array with one class index per sample.

    Raises:
        ValueError: If any threshold is zero, negative or NaN, since the ratio
            would then be infinite, sign-flipped or undefined.
    """
    # Force a float matrix: with integer inputs the ratios would otherwise be
    # truncated when stored in an integer array.
    probs_array = np.column_stack(
        [np.asarray(probabilities[class_name], dtype=float) for class_name in class_names]
    )
    threshold_row = np.array([thresholds[class_name] for class_name in class_names], dtype=float)

    # `not (x > 0).all()` also rejects NaN thresholds.
    if not (threshold_row > 0).all():
        raise ValueError(f"thresholds must be strictly positive, got {thresholds}")

    # Broadcasting divides every column by its own class threshold.
    adjusted_probs = probs_array / threshold_row

    # Pick the class whose adjusted score is highest
    return np.argmax(adjusted_probs, axis=1)

# 4. Error analysis
def error_analysis(data, true_labels, pred_labels):
    """
    Collect the misclassified samples and group them by (true, predicted) pair.

    Args:
        data: Loaded predictions mapping; ``data["predictions"][i]["probabilities"]``
            is read for every misclassified index ``i``.
        true_labels: Array of ground-truth labels.
        pred_labels: Array of predicted labels, aligned with ``true_labels``.

    Returns:
        ``(error_cases, error_types)``: a list with one dict per misclassified
        sample (``index``, ``true_label``, ``pred_label``, ``probabilities``)
        and a dict mapping an error-type string to the cases of that type.
        The distribution of error types is also printed.
    """
    wrong_positions = np.nonzero(true_labels != pred_labels)[0]

    error_cases = [
        {
            'index': pos,
            'true_label': true_labels[pos],
            'pred_label': pred_labels[pos],
            'probabilities': {
                name: data["predictions"][pos]["probabilities"][name]
                for name in ("Yes", "To some extent", "No")
            },
        }
        for pos in wrong_positions
    ]

    # Group the cases by their "true -> predicted" signature
    error_types = {}
    for case in error_cases:
        signature = f"真实:{case['true_label']} -> 预测:{case['pred_label']}"
        error_types.setdefault(signature, []).append(case)

    # Print the share of each error type among all errors
    print("\n错误类型分布:")
    total_errors = len(wrong_positions)
    for signature, group in error_types.items():
        print(f"{signature}: {len(group)} 个案例 ({len(group)/total_errors*100:.1f}%)")

    return error_cases, error_types

# 5. Probability distribution visualization
def visualize_probability_distributions(probabilities, true_labels, class_names=['Yes', 'To some extent', 'No']):
    """
    Plot, for every class, histograms of its predicted probability split by
    whether the sample truly belongs to that class.

    One subplot per class is stacked vertically and the figure is saved to
    ``probability_distributions.png``.

    Args:
        probabilities: Mapping from class name to per-sample probabilities.
        true_labels: Ground-truth labels, encoded as indices into ``class_names``.
        class_names: Class names in label-index order.
    """
    # Boolean masking below needs an array; accept plain lists as well.
    true_labels = np.asarray(true_labels)
    fig = plt.figure(figsize=(15, 5 * len(class_names)))

    for i, class_name in enumerate(class_names):
        class_probs = np.asarray(probabilities[class_name])

        # Split the scores into samples of this class and all other samples
        is_class = true_labels == i
        positive_probs = class_probs[is_class]
        negative_probs = class_probs[~is_class]

        ax = fig.add_subplot(len(class_names), 1, i + 1)

        # Legend text is kept ASCII so it renders with matplotlib's default
        # font; CJK glyphs may be missing there and draw as empty boxes.
        if len(positive_probs) > 0:
            ax.hist(positive_probs, alpha=0.5, bins=20, range=(0, 1),
                    label=f'True class is {class_name}')
        if len(negative_probs) > 0:
            ax.hist(negative_probs, alpha=0.5, bins=20, range=(0, 1),
                    label=f'True class is not {class_name}')

        ax.set_title(f'{class_name} probability distribution')
        ax.set_xlabel('probability')
        ax.set_ylabel('count')
        ax.legend()
        ax.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig('probability_distributions.png')
    plt.close(fig)

# 6. Performance evaluation under a given threshold setting
def evaluate_with_thresholds(probabilities, true_labels, thresholds=None, class_names=['Yes', 'To some extent', 'No']):
    """
    Score the predictions obtained with a given set of per-class thresholds.

    Prints accuracy, macro F1 and per-class F1, and saves the confusion matrix
    heatmap to ``confusion_matrix_custom_thresholds.png``.

    Args:
        probabilities: Mapping from class name to per-sample probabilities.
        true_labels: Ground-truth labels, encoded as indices into ``class_names``.
        thresholds: Mapping from class name to threshold. When ``None``, every
            class gets 0.33.
        class_names: Class names in label-index order.

    Returns:
        ``(accuracy, f1, predictions)`` where ``f1`` is the macro average and
        ``predictions`` is the array of predicted class indices.
    """
    if thresholds is None:
        # Default: the same threshold for every class
        thresholds = {class_name: 0.33 for class_name in class_names}

    predictions = apply_optimized_thresholds(probabilities, thresholds, class_names)

    # Pin the label set for per-class outputs so entry i always refers to
    # class_names[i], even if a class is absent from both arrays.
    label_ids = list(range(len(class_names)))

    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='macro')
    class_f1 = f1_score(true_labels, predictions, labels=label_ids, average=None)

    print(f"阈值设置: {thresholds}")
    print(f"整体准确率: {accuracy:.4f}")
    print(f"整体F1分数: {f1:.4f}")
    for i, class_name in enumerate(class_names):
        print(f"类别 {class_name} 的F1分数: {class_f1[i]:.4f}")

    # Confusion matrix heatmap
    cm = confusion_matrix(true_labels, predictions, labels=label_ids)
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names,
                ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(f'Confusion Matrix with Custom Thresholds\nAcc: {accuracy:.4f}, F1: {f1:.4f}')
    fig.savefig('confusion_matrix_custom_thresholds.png')
    plt.close(fig)

    return accuracy, f1, predictions

# Main routine - analyse one predictions JSON file
def analyze_prediction_json(json_path):
    """
    Run the full analysis pipeline on one predictions file.

    Steps, in order: load the file and print its recorded metrics, plot the
    original confusion matrix, plot the probability distributions, optimise
    per-class thresholds, print the error breakdown, and evaluate one
    hand-picked threshold combination.

    Args:
        json_path: Path of the predictions JSON file. Its ``epoch``, ``loss``,
            ``accuracy``, ``f1_score`` and ``predictions`` entries are read.

    Returns:
        Dict with keys ``best_thresholds``, ``optimized_accuracy``,
        ``optimized_f1``, ``custom_thresholds``, ``custom_accuracy`` and
        ``custom_f1``.
    """
    data = load_predictions(json_path)
    print(f"分析Epoch {data['epoch']} 的预测结果")
    print(f"原始性能 - 损失: {data['loss']:.4f}, 准确率: {data['accuracy']:.4f}, F1: {data['f1_score']:.4f}")

    true_labels, pred_labels, probabilities = extract_data(data)

    # 1. Confusion matrix of the original predictions
    print("\n原始混淆矩阵分析:")
    confusion_matrix_analysis(true_labels, pred_labels)

    # 2. Per-class probability distributions
    print("\n绘制概率分布...")
    visualize_probability_distributions(probabilities, true_labels)

    # 3. Per-class threshold analysis and joint evaluation
    print("\n各类别阈值分析:")
    summary = {}
    (summary["best_thresholds"],
     summary["optimized_accuracy"],
     summary["optimized_f1"]) = optimize_thresholds_multiclass(true_labels, probabilities)

    # 4. Breakdown of the original misclassifications
    print("\n错误案例分析:")
    error_analysis(data, true_labels, pred_labels)

    # 5. A hand-picked combination: a higher threshold for "Yes" and lower
    #    thresholds for the other two classes
    print("\n尝试自定义阈值组合:")
    custom_thresholds = {"Yes": 0.4, "To some extent": 0.3, "No": 0.3}
    custom_accuracy, custom_f1, _ = evaluate_with_thresholds(
        probabilities, true_labels, custom_thresholds
    )

    summary.update(
        custom_thresholds=custom_thresholds,
        custom_accuracy=custom_accuracy,
        custom_f1=custom_f1,
    )
    return summary

# Usage example
if __name__ == "__main__":
    # Replace with the path of your own predictions JSON file
    json_path = "/mnt/cfs/huangzhiwei/BAE2025/projects/checkpoints_3/bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_3.json"
    results = analyze_prediction_json(json_path)

    # Metrics are converted with float() before being written; the threshold
    # dicts are written as returned.
    scalar_keys = ("optimized_accuracy", "optimized_f1", "custom_accuracy", "custom_f1")
    ordered_keys = (
        "best_thresholds",
        "optimized_accuracy",
        "optimized_f1",
        "custom_thresholds",
        "custom_accuracy",
        "custom_f1",
    )
    report = {
        key: float(results[key]) if key in scalar_keys else results[key]
        for key in ordered_keys
    }

    # Save the threshold settings and their scores
    with open("optimized_thresholds.json", "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)

    print("\n分析完成，结果已保存到图表和阈值文件中。")

分析Epoch 3 的预测结果
原始性能 - 损失: 0.7987, 准确率: 0.7126, F1: 0.5541

原始混淆矩阵分析:

绘制概率分布...


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig('probability_distributions.png')
  plt.savefig('probability_distributions.png')
  plt.savefig('probability_distributions.png')
  plt.savefig('probability_distributions.png')



各类别阈值分析:
类别 Yes 的最佳阈值为 0.40，F1分数为 0.8143
类别 To some extent 的最佳阈值为 0.25，F1分数为 0.3522
类别 No 的最佳阈值为 0.40，F1分数为 0.6396
优化阈值后的准确率: 0.6487
优化阈值后的F1分数: 0.5442

错误案例分析:

错误类型分布:
真实:2 -> 预测:0: 42 个案例 (29.2%)
真实:0 -> 预测:2: 11 个案例 (7.6%)
真实:1 -> 预测:0: 67 个案例 (46.5%)
真实:2 -> 预测:1: 8 个案例 (5.6%)
真实:1 -> 预测:2: 8 个案例 (5.6%)
真实:0 -> 预测:1: 8 个案例 (5.6%)

尝试自定义阈值组合:
阈值设置: {'Yes': 0.4, 'To some extent': 0.3, 'No': 0.3}
整体准确率: 0.6747
整体F1分数: 0.5488
类别 Yes 的F1分数: 0.7823
类别 To some extent 的F1分数: 0.2270
类别 No 的F1分数: 0.6373

分析完成，结果已保存到图表和阈值文件中。


In [9]:
with open("/mnt/cfs/huangzhiwei/BAE2025/projects/checkpoints_3/bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_3.json", "r", encoding="utf-8") as f:
    datas = json.load(f)

# Integer label used for each class in the prediction records
labels = {
    "Yes": 0,
    "To some extent": 1,
    "No": 2
}

# P(Yes) of every sample whose true label and predicted label are both "Yes"
yes_probs = []
for data in datas["predictions"]:
    if data["true_label"] == labels["Yes"] and data["predicted_label"] == labels["Yes"]:
        print(data)
        yes_probs.append(data["probabilities"]["Yes"])

cnt_yes = len(yes_probs)
# With no matching sample: min/max keep their sentinels (1.0 / 0.0) and the
# mean is NaN instead of raising ZeroDivisionError.
min_yes = min(yes_probs, default=1.0)
max_yes = max(yes_probs, default=0.0)
avg_yes = sum(yes_probs) / cnt_yes if cnt_yes else float("nan")

print("Yes的最小值: ", min_yes)
print("Yes的最大值: ", max_yes)
print("Yes的平均值: ", avg_yes)
print("Yes的数量: ", cnt_yes)


{'true_label': 0, 'predicted_label': 0, 'probabilities': {'Yes': 0.7180535197257996, 'To some extent': 0.16217131912708282, 'No': 0.11977510154247284}}
{'true_label': 0, 'predicted_label': 0, 'probabilities': {'Yes': 0.5850518941879272, 'To some extent': 0.16684912145137787, 'No': 0.24809901416301727}}
{'true_label': 0, 'predicted_label': 0, 'probabilities': {'Yes': 0.7025795578956604, 'To some extent': 0.16257573664188385, 'No': 0.13484467566013336}}
{'true_label': 0, 'predicted_label': 0, 'probabilities': {'Yes': 0.700442373752594, 'To some extent': 0.13803955912590027, 'No': 0.16151806712150574}}
{'true_label': 0, 'predicted_label': 0, 'probabilities': {'Yes': 0.684078574180603, 'To some extent': 0.13243776559829712, 'No': 0.18348364531993866}}
{'true_label': 0, 'predicted_label': 0, 'probabilities': {'Yes': 0.6637284159660339, 'To some extent': 0.129995658993721, 'No': 0.20627591013908386}}
{'true_label': 0, 'predicted_label': 0, 'probabilities': {'Yes': 0.5205386281013489, 'To som

In [12]:
with open("/mnt/cfs/huangzhiwei/BAE2025/projects/checkpoints_3/bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_3.json", "r", encoding="utf-8") as f:
    datas = json.load(f)

# Integer label used for each class in the prediction records
labels = {
    "Yes": 0,
    "To some extent": 1,
    "No": 2
}

# Probability dicts of samples that are truly "Yes" and were predicted "Yes"
selected_probs = [
    record["probabilities"]
    for record in datas["predictions"]
    if record["true_label"] == labels["Yes"] and record["predicted_label"] == labels["Yes"]
]
cnt_yes = len(selected_probs)

to_some_extent_values = [probs["To some extent"] for probs in selected_probs]
no_values = [probs["No"] for probs in selected_probs]

# With no matching sample: min/max keep their sentinels (1.0 / 0.0) and the
# means are NaN instead of raising ZeroDivisionError.
min_to_some_extent = min(to_some_extent_values, default=1.0)
max_to_some_extent = max(to_some_extent_values, default=0.0)
avg_to_some_extent = sum(to_some_extent_values) / cnt_yes if cnt_yes else float("nan")

min_no = min(no_values, default=1.0)
max_no = max(no_values, default=0.0)
avg_no = sum(no_values) / cnt_yes if cnt_yes else float("nan")

print("数量：", cnt_yes)

print("To some extent的最小值: ", min_to_some_extent)
print("To some extent的最大值: ", max_to_some_extent)
print("To some extent的平均值: ", avg_to_some_extent)

print("No的最小值: ", min_no)
print("No的最大值: ", max_no)
print("No的平均值: ", avg_no)


数量： 281
To some extent的最小值:  0.10238726437091827
To some extent的最大值:  0.431673526763916
To some extent的平均值:  0.23424549125055402
No的最小值:  0.04535079002380371
No的最大值:  0.3973469138145447
No的平均值:  0.11475024451882813


In [13]:
with open("/mnt/cfs/huangzhiwei/BAE2025/projects/checkpoints_3/bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_3.json", "r", encoding="utf-8") as f:
    datas = json.load(f)

# Integer label used for each class in the prediction records
labels = {
    "Yes": 0,
    "To some extent": 1,
    "No": 2
}

# Probability dicts of samples that are truly "To some extent" but were
# predicted "Yes"
selected_probs = [
    record["probabilities"]
    for record in datas["predictions"]
    if record["true_label"] == labels["To some extent"] and record["predicted_label"] == labels["Yes"]
]
# Named cnt_yes because it counts samples predicted as "Yes"
cnt_yes = len(selected_probs)

to_some_extent_values = [probs["To some extent"] for probs in selected_probs]
no_values = [probs["No"] for probs in selected_probs]

# With no matching sample: min/max keep their sentinels (1.0 / 0.0) and the
# means are NaN instead of raising ZeroDivisionError.
min_to_some_extent = min(to_some_extent_values, default=1.0)
max_to_some_extent = max(to_some_extent_values, default=0.0)
avg_to_some_extent = sum(to_some_extent_values) / cnt_yes if cnt_yes else float("nan")

min_no = min(no_values, default=1.0)
max_no = max(no_values, default=0.0)
avg_no = sum(no_values) / cnt_yes if cnt_yes else float("nan")

print("数量：", cnt_yes)

print("To some extent的最小值: ", min_to_some_extent)
print("To some extent的最大值: ", max_to_some_extent)
print("To some extent的平均值: ", avg_to_some_extent)

print("No的最小值: ", min_no)
print("No的最大值: ", max_no)
print("No的平均值: ", avg_no)


数量： 67
To some extent的最小值:  0.11125070601701736
To some extent的最大值:  0.44628775119781494
To some extent的平均值:  0.26487843414295964
No的最小值:  0.048949457705020905
No的最大值:  0.42154496908187866
No的平均值:  0.11452662138574159


In [14]:
with open("/mnt/cfs/huangzhiwei/BAE2025/projects/checkpoints_3/bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_3.json", "r", encoding="utf-8") as f:
    datas = json.load(f)

# Integer label used for each class in the prediction records
labels = {
    "Yes": 0,
    "To some extent": 1,
    "No": 2
}

# Probability dicts of samples that are truly "No" but were predicted "Yes"
selected_probs = [
    record["probabilities"]
    for record in datas["predictions"]
    if record["true_label"] == labels["No"] and record["predicted_label"] == labels["Yes"]
]
# Named cnt_yes because it counts samples predicted as "Yes"
cnt_yes = len(selected_probs)

to_some_extent_values = [probs["To some extent"] for probs in selected_probs]
no_values = [probs["No"] for probs in selected_probs]

# With no matching sample: min/max keep their sentinels (1.0 / 0.0) and the
# means are NaN instead of raising ZeroDivisionError.
min_to_some_extent = min(to_some_extent_values, default=1.0)
max_to_some_extent = max(to_some_extent_values, default=0.0)
avg_to_some_extent = sum(to_some_extent_values) / cnt_yes if cnt_yes else float("nan")

min_no = min(no_values, default=1.0)
max_no = max(no_values, default=0.0)
avg_no = sum(no_values) / cnt_yes if cnt_yes else float("nan")

print("数量：", cnt_yes)

print("To some extent的最小值: ", min_to_some_extent)
print("To some extent的最大值: ", max_to_some_extent)
print("To some extent的平均值: ", avg_to_some_extent)

print("No的最小值: ", min_no)
print("No的最大值: ", max_no)
print("No的平均值: ", avg_no)


数量： 42
To some extent的最小值:  0.10769475996494293
To some extent的最大值:  0.4309940040111542
To some extent的平均值:  0.24430051108910925
No的最小值:  0.04302017390727997
No的最大值:  0.29060858488082886
No的平均值:  0.11206147739929813


In [28]:
import json

with open("/mnt/cfs/huangzhiwei/BAE2025/checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_6.json", "r", encoding="utf-8") as f:
    datas = json.load(f)

# Analyse the stage-1 data.
# Three-class encoding, kept for reference; the stage-1 analysis below only
# compares preds/labels against 0 and 1.
labels = {
    "Yes": 0,
    "To some extent": 1,
    "No": 2
}

data = datas['stage1']


def _stage1_rows(pred, label):
    """Return the probability rows of stage-1 samples with this prediction and label."""
    return [
        data["probs"][i]
        for i in range(len(data["probs"]))
        if data["preds"][i] == pred and data["labels"][i] == label
    ]


def _column_stats(rows, column):
    """Return (min, max, mean) of one column of ``rows``.

    For an empty group min/max fall back to the sentinels 1.0 / 0.0 and the
    mean is NaN, so an empty group no longer raises ZeroDivisionError.
    """
    values = [row[column] for row in rows]
    mean = sum(values) / len(values) if values else float("nan")
    return min(values, default=1.0), max(values, default=0.0), mean


# Per the printed headings below, class 0 is "yes" and class 1 is "no-yes";
# column 0 of each probability row is reported as the "yes" score and column 1
# (the *_x variables) as the "no-yes" score.

# Correct: label 0, predicted 0
rows_yes = _stage1_rows(pred=0, label=0)
cnt_yes = len(rows_yes)
min_yes, max_yes, avg_yes = _column_stats(rows_yes, 0)
min_yes_x, max_yes_x, avg_yes_x = _column_stats(rows_yes, 1)

# Correct: label 1, predicted 1
rows_noyes = _stage1_rows(pred=1, label=1)
cnt_noyes = len(rows_noyes)
min_noyes, max_noyes, avg_noyes = _column_stats(rows_noyes, 0)
min_noyes_x, max_noyes_x, avg_noyes_x = _column_stats(rows_noyes, 1)

# Wrong: label 0, predicted 1
rows_wrong_yes_to_non_yes = _stage1_rows(pred=1, label=0)
cnt_wrong_yes_to_non_yes = len(rows_wrong_yes_to_non_yes)
(min_wrong_yes_to_non_yes,
 max_wrong_yes_to_non_yes,
 avg_wrong_yes_to_non_yes) = _column_stats(rows_wrong_yes_to_non_yes, 0)
(min_wrong_yes_to_non_yes_x,
 max_wrong_yes_to_non_yes_x,
 avg_wrong_yes_to_non_yes_x) = _column_stats(rows_wrong_yes_to_non_yes, 1)

# Wrong: label 1, predicted 0
rows_wrong = _stage1_rows(pred=0, label=1)
cnt_wrong = len(rows_wrong)
min_wrong, max_wrong, avg_wrong = _column_stats(rows_wrong, 0)
min_wrong_x, max_wrong_x, avg_wrong_x = _column_stats(rows_wrong, 1)

# Two-column report. Left column: correct(yes); right column: correct(no-yes).
print("correct(yes)", end="")
print("\t\t\t\t\t\tcorrect(no-yes)")
print("correct(yes)的数量: ", cnt_yes, end="")
print("\t\t\t\t\tcorrect(no-yes)的数量: ", cnt_noyes)

print("correct(预测yes)的最小值: ", min_yes, end="")
print("\t\tcorrect(预测yes)的最小值: ", min_noyes)

print("correct(预测yes)的最大值: ", max_yes, end="")
print("\t\tcorrect(预测yes)的最大值: ", max_noyes)

print("correct(预测yes)的平均值: ", avg_yes, end="")
print("\t\tcorrect(预测yes)的平均值: ", avg_noyes)

print("\ncorrect(预测no-yes)的最小值: ", min_yes_x, end="")
print("\tcorrect(预测no-yes)的最小值: ", min_noyes_x)

print("correct(预测no-yes)的最大值: ", max_yes_x, end="")
print("\t\tcorrect(预测no-yes)的最大值: ", max_noyes_x)

print("correct(预测no-yes)的平均值: ", avg_yes_x, end="")
print("\t\tcorrect(预测no-yes)的平均值: ", avg_noyes_x)

print("\n" + "-"*80 + "\n")

# Left column: wrong (no-yes predicted as yes);
# right column: wrong (yes predicted as no-yes).
print("Wrong(no-yes但被预测成yes)", end="")
print("\t\t\t\tWrong(yes但被预测成no-yes)")

print("Wrong的数量: ", cnt_wrong, end="")
print("\t\t\t\t\tWrong的数量: ", cnt_wrong_yes_to_non_yes)

print("Wrong(预测yes)的最小值: ", min_wrong, end="")
print("\t\tWrong(预测yes)的最小值: ", min_wrong_yes_to_non_yes)

print("Wrong(预测yes)的最大值: ", max_wrong, end="")
print("\t\tWrong(预测yes)的最大值: ", max_wrong_yes_to_non_yes)

print("Wrong(预测yes)的平均值: ", avg_wrong, end="")
print("\t\tWrong(预测yes)的平均值: ", avg_wrong_yes_to_non_yes)

print("\nWrong(预测no-yes)的最小值: ", min_wrong_x, end="")
print("\t\tWrong(预测no-yes)的最小值: ", min_wrong_yes_to_non_yes_x)

print("Wrong(预测no-yes)的最大值: ", max_wrong_x, end="")
print("\t\tWrong(预测no-yes)的最大值: ", max_wrong_yes_to_non_yes_x)

print("Wrong(预测no-yes)的平均值: ", avg_wrong_x, end="")
print("\t\tWrong(预测no-yes)的平均值: ", avg_wrong_yes_to_non_yes_x)

correct(yes)						correct(no-yes)
correct(yes)的数量:  266					correct(no-yes)的数量:  91
correct(预测yes)的最小值:  0.5117904543876648		correct(预测yes)的最小值:  0.003536677686497569
correct(预测yes)的最大值:  0.9780089855194092		correct(预测yes)的最大值:  0.499489963054657
correct(预测yes)的平均值:  0.8516360496667991		correct(预测yes)的平均值:  0.09398987988818559

correct(预测no-yes)的最小值:  0.021990962326526642	correct(预测no-yes)的最小值:  0.500510036945343
correct(预测no-yes)的最大值:  0.4882095456123352		correct(预测no-yes)的最大值:  0.9964633584022522
correct(预测no-yes)的平均值:  0.14836394529145464		correct(预测no-yes)的平均值:  0.9060101260195722

--------------------------------------------------------------------------------

Wrong(no-yes但被预测成yes)				Wrong(yes但被预测成no-yes)
Wrong的数量:  110					Wrong的数量:  34
Wrong(预测yes)的最小值:  0.5223598480224609		Wrong(预测yes)的最小值:  0.006224358920007944
Wrong(预测yes)的最大值:  0.9759572148323059		Wrong(预测yes)的最大值:  0.4944572150707245
Wrong(预测yes)的平均值:  0.8163959871638905		Wrong(预测yes)的平均值:  0.26022673859808815

Wrong(预测no

In [17]:
with open("/mnt/cfs/huangzhiwei/BAE2025/checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_6.json", "r", encoding="utf-8") as f:
    datas = json.load(f)

# Analyse the stage-1 data.
# Three-class encoding, kept for reference; the stage-1 analysis below only
# compares preds/labels against 0 and 1.
labels = {
    "Yes": 0,
    "To some extent": 1,
    "No": 2
}

data = datas['stage1']
sample_ids = range(len(data["probs"]))

# Every statistic below is taken over column 1 of the probability rows.
# Correct: label 0, predicted 0
yes_values = [
    data["probs"][i][1] for i in sample_ids
    if data["preds"][i] == 0 and data["labels"][i] == 0
]
# Correct: label 1, predicted 1
noyes_values = [
    data["probs"][i][1] for i in sample_ids
    if data["preds"][i] == 1 and data["labels"][i] == 1
]
# Wrong: label 1, predicted 0
wrong_values = [
    data["probs"][i][1] for i in sample_ids
    if data["preds"][i] == 0 and data["labels"][i] == 1
]

cnt_yes = len(yes_values)
cnt_noyes = len(noyes_values)
cnt_wrong = len(wrong_values)

# With an empty group: min/max keep their sentinels (1.0 / 0.0) and the mean
# is NaN instead of raising ZeroDivisionError.
min_yes = min(yes_values, default=1.0)
max_yes = max(yes_values, default=0.0)
avg_yes = sum(yes_values) / cnt_yes if cnt_yes else float("nan")

min_noyes = min(noyes_values, default=1.0)
max_noyes = max(noyes_values, default=0.0)
avg_noyes = sum(noyes_values) / cnt_noyes if cnt_noyes else float("nan")

min_wrong = min(wrong_values, default=1.0)
max_wrong = max(wrong_values, default=0.0)
avg_wrong = sum(wrong_values) / cnt_wrong if cnt_wrong else float("nan")

print("Yes的最小值: ", min_yes)
print("Yes的最大值: ", max_yes)
print("Yes的平均值: ", avg_yes)
print("Yes的数量: ", cnt_yes)

print("No的最小值: ", min_noyes)
print("No的最大值: ", max_noyes)
print("No的平均值: ", avg_noyes)
print("No的数量: ", cnt_noyes)

print("Wrong的最小值: ", min_wrong)
print("Wrong的最大值: ", max_wrong)
print("Wrong的平均值: ", avg_wrong)
print("Wrong的数量: ", cnt_wrong)


Yes的最小值:  0.021990962326526642
Yes的最大值:  0.4882095456123352
Yes的平均值:  0.14836394529145464
Yes的数量:  266
No的最小值:  0.500510036945343
No的最大值:  0.9964633584022522
No的平均值:  0.9060101260195722
No的数量:  91
Wrong的最小值:  0.024042798206210136
Wrong的最大值:  0.4776401221752167
Wrong的平均值:  0.18360401667993176
Wrong的数量:  110


In [2]:
import json

with open('/mnt/cfs/huangzhiwei/BAE2025/projects/checkpoints_error/bert-base-uncased_fp_b16_e50_len512_lr1e-05/error_analysis/error_examples_epoch_3.json', mode='r', encoding='utf-8') as f:
    datas = json.load(f)

# Count the error examples whose 'first_sentence' has len() above 512.
# NOTE(review): len() counts characters if the field is a string, which is not
# the same as a 512-token model limit - confirm which one is intended.
cnt = sum(1 for example in datas if len(example['first_sentence']) > 512)

print(len(datas))
print("句子长度超过512的数量: ", cnt)

144
句子长度超过512的数量:  107
