In [1]:
import os
# 设置环境变量，只让程序看到 GPU 2
os.environ['CUDA_VISIBLE_DEVICES'] = '1'


import torch
import torch.nn as nn
import wandb
import random
import argparse
import numpy as np
from tqdm import tqdm
from transformers import BertModel, AutoModel
from transformers import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from torch.utils.data import Dataset
import json

class BAE2025Dataset(Dataset):
    def __init__(
            self,
            data_path,
            labels={
                "Yes": 0,
                "To some extent": 1, 
                "No": 2,
            }
    ):
        self.data_path = data_path
        self.labels = labels
        self._get_data()
    
    def _get_data(self):
        with open(self.data_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        self.data = []
        for item in data:
            tutor_responses = item['tutor_responses']
            for response in tutor_responses.values():
                sent1 = item['conversation_history']
                sent2 = response['response']
                label = response['annotation']["Providing_Guidance"]
                if label in self.labels:
                    self.data.append(((sent1, sent2), self.labels[label]))
    
    def __len__(self):
        return len(self.data)
    
    def get_labels(self):
        return self.labels

    def __getitem__(self, idx):
        return self.data[idx]

In [3]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

class BAE2025DataLoader:
    def __init__(
        self,
        dataset,
        batch_size=16,
        max_length=512,
        shuffle=True,
        drop_last=True,
        device=None,
        # tokenizer_name='chinese-bert-wwm-ext'
        # tokenizer_name='chinese-roberta-wwm-ext'
        # tokenizer_name='chinese-roberta-wwm-ext-large'
        # tokenizer_name='/mnt/cfs/huangzhiwei/pykt-moekt/SBM/bge-large-en-v1.5'
        # tokenizer_name='/mnt/cfs/huangzhiwei/BAE2025/models/bge-base-en-v1.5'
        tokenizer_name='/mnt/cfs/huangzhiwei/BAE2025/models/bert-base-uncased'
        # tokenizer_name='/mnt/cfs/huangzhiwei/BAE2025/models/deberta-v3-base'
    ):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.dataset = dataset
        self.batch_size = batch_size
        self.max_length = max_length
        self.shuffle = shuffle
        self.drop_last = drop_last

        if device is None:
            self.device = torch.device(
                'cuda' if torch.cuda.is_available() else 'cpu'
            )
        else:
            self.device = device

        self.loader = DataLoader(
            dataset=self.dataset,
            batch_size=self.batch_size,
            collate_fn=self.collate_fn,
            shuffle=self.shuffle,
            drop_last=self.drop_last
        )

    def collate_fn(self, data):
        sents = [i[0] for i in data]
        labels = [i[1] for i in data]

        # 修改这里，处理两个句子的情况
        data = self.tokenizer.batch_encode_plus(
            batch_text_or_text_pairs=[(sent[0], sent[1]) for sent in sents],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
            return_length=True
        )
        input_ids = data['input_ids'].to(self.device)
        attention_mask = data['attention_mask'].to(self.device)
        token_type_ids = data['token_type_ids'].to(self.device)
        labels = torch.LongTensor(labels).to(self.device)

        return input_ids, attention_mask, token_type_ids, labels
        # return input_ids, attention_mask, labels


    def __iter__(self):
        for data in self.loader:
            yield data

    def __len__(self):
        return len(self.loader)



In [4]:
import torch
import torch.nn as nn
from transformers import BertModel

# 修改模型类以支持分层分类
class HierarchicalBertClassifier(nn.Module):
    def __init__(self, pretrained_model_name, freeze_pooler=0, dropout=0.3):
        super().__init__()
        
        # 第一阶段分类器：Yes vs 非Yes
        self.bert_stage1 = BertModel.from_pretrained(pretrained_model_name, output_hidden_states=True)
        
        # 第二阶段分类器：To some extent vs No
        self.bert_stage2 = BertModel.from_pretrained(pretrained_model_name, output_hidden_states=True)
        
        # 冻结BERT底层，保留顶层微调
        if freeze_pooler > 0:
            # 冻结第一阶段模型的底层
            modules1 = [self.bert_stage1.embeddings, *self.bert_stage1.encoder.layer[:freeze_pooler]]
            for module in modules1:
                for param in module.parameters():
                    param.requires_grad = False
            
            # 冻结第二阶段模型的底层
            modules2 = [self.bert_stage2.embeddings, *self.bert_stage2.encoder.layer[:freeze_pooler]]
            for module in modules2:
                for param in module.parameters():
                    param.requires_grad = False
        
        # 获取bert隐藏层大小
        bert_hidden_size = self.bert_stage1.config.hidden_size
        
        # 第一阶段的分类头（二分类：Yes vs 非Yes）
        self.stage1_classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(bert_hidden_size, bert_hidden_size),
            nn.Tanh(),
            nn.Dropout(dropout),
            nn.Linear(bert_hidden_size, 2)  # 二分类
        )
        
        # 第二阶段的分类头（二分类：To some extent vs No）
        self.stage2_classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(bert_hidden_size, bert_hidden_size),
            nn.Tanh(),
            nn.Dropout(dropout),
            nn.Linear(bert_hidden_size, 2)  # 二分类
        )
    
    def forward_stage1(self, input_ids, attention_mask, token_type_ids=None):
        """第一阶段：预测是Yes还是非Yes"""
        outputs = self.bert_stage1(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        
        # 使用[CLS]表示的序列表示
        cls_output = outputs.last_hidden_state[:, 0, :]
        logits = self.stage1_classifier(cls_output)
        return logits
    
    def forward_stage2(self, input_ids, attention_mask, token_type_ids=None):
        """第二阶段：预测是To some extent还是No"""
        outputs = self.bert_stage2(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        
        # 使用[CLS]表示的序列表示
        cls_output = outputs.last_hidden_state[:, 0, :]
        logits = self.stage2_classifier(cls_output)
        return logits
    
    def forward(self, input_ids, attention_mask, token_type_ids=None, stage=None):
        """根据stage参数进行相应阶段的前向传播"""
        if stage == 1:
            return self.forward_stage1(input_ids, attention_mask, token_type_ids)
        elif stage == 2:
            return self.forward_stage2(input_ids, attention_mask, token_type_ids)
        else:
            # 默认行为：完整的两阶段预测
            # 第一阶段：预测是Yes还是非Yes
            stage1_logits = self.forward_stage1(input_ids, attention_mask, token_type_ids)
            stage1_preds = torch.argmax(stage1_logits, dim=1)
            
            # 第二阶段：对预测为"非Yes"的样本进行To some extent vs No预测
            # 创建一个全为0的三分类输出张量（Yes=0, To some extent=1, No=2）
            batch_size = input_ids.size(0)
            final_logits = torch.zeros(batch_size, 3, device=input_ids.device)
            
            # 设置Yes的logits值（从stage1获取）
            final_logits[:, 0] = stage1_logits[:, 0]  # Yes的logits
            
            # 获取预测为非Yes(1)的样本索引
            non_yes_indices = (stage1_preds == 1).nonzero(as_tuple=True)[0]
            
            if len(non_yes_indices) > 0:
                # 只对预测为"非Yes"的样本进行第二阶段预测
                non_yes_input_ids = input_ids[non_yes_indices]
                non_yes_attention_mask = attention_mask[non_yes_indices]
                non_yes_token_type_ids = None if token_type_ids is None else token_type_ids[non_yes_indices]
                
                stage2_logits = self.forward_stage2(non_yes_input_ids, non_yes_attention_mask, non_yes_token_type_ids)
                
                # 将第二阶段的预测结果（To some extent vs No）放入最终结果中
                final_logits[non_yes_indices, 1] = stage2_logits[:, 0]  # To some extent的logits
                final_logits[non_yes_indices, 2] = stage2_logits[:, 1]  # No的logits
            
            return final_logits

In [5]:
import os
import wandb
import random
import argparse
from tqdm import tqdm

import torch
import torch.nn as nn
import numpy as np
from transformers import AdamW
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# 如果在Jupyter Notebook中运行，可以使用这个自定义参数函数替代argparser
def get_default_configs():
    """在Jupyter环境中使用的默认配置，避免argparse解析错误"""
    class Args:
        def __init__(self):
            # self.model_name = '/mnt/cfs/huangzhiwei/pykt-moekt/SBM/bge-large-en-v1.5'
            # self.model_name = "/mnt/cfs/huangzhiwei/BAE2025/models/ModernBERT-large"
            # self.model_name = '/mnt/cfs/huangzhiwei/pykt-moekt/SBM/xlm-roberta-large'
            # self.model_name = '/mnt/cfs/huangzhiwei/BAE2025/models/bge-base-en-v1.5'
            self.model_name = '/mnt/cfs/huangzhiwei/BAE2025/models/bert-base-uncased'
            # self.model_name = '/mnt/cfs/huangzhiwei/BAE2025/models/deberta-v3-base'
            self.num_classes = 3
            self.dropout = 0.3
            self.freeze_pooler = 8
            self.batch_size = 16
            self.max_length = 512
            self.lr = 1e-5
            self.epochs = 50
            self.device = device
            self.name = None
            self.seed = 42
            self.data_path = './data/train.json'
            self.val_data_path = './data/valid.json'
            self.checkpoint_dir = 'checkpoints_2to2_adjust'
            self.patience = 8
            self.exp_name = 'BAE2025_track4_bert'
    return Args()


In [6]:
def train_hierarchical(configs):
    # 设置随机种子
    random.seed(configs.seed)
    np.random.seed(configs.seed)
    torch.manual_seed(configs.seed)
    
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
    # 创建检查点目录
    checkpoint_dir = os.path.join(configs.checkpoint_dir, configs.exp_name)
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    # 为保存混淆矩阵创建目录 - 分别为训练集和验证集创建
    train_plot_dir = os.path.join(checkpoint_dir, 'plots', 'train')
    val_plot_dir = os.path.join(checkpoint_dir, 'plots', 'val')
    os.makedirs(train_plot_dir, exist_ok=True)
    os.makedirs(val_plot_dir, exist_ok=True)
    
    # 创建保存预测结果的目录
    predictions_dir = os.path.join(checkpoint_dir, 'predictions')
    os.makedirs(predictions_dir, exist_ok=True)
    
    # 加载数据集
    train_dataset = BAE2025Dataset(configs.data_path)
    val_dataset = BAE2025Dataset(configs.val_data_path)    

    # 创建数据加载器
    train_dataloader = BAE2025DataLoader(
        dataset=train_dataset,
        batch_size=configs.batch_size,
        max_length=configs.max_length,
        shuffle=True,
        drop_last=True,
        device=configs.device,
        tokenizer_name=configs.model_name
    )

    val_dataloader = BAE2025DataLoader(
        dataset=val_dataset,
        batch_size=configs.batch_size,
        max_length=configs.max_length,
        shuffle=False,
        drop_last=False,
        device=configs.device,
        tokenizer_name=configs.model_name
    )
    
    # 创建分层分类模型
    model = HierarchicalBertClassifier(
        pretrained_model_name=configs.model_name,
        freeze_pooler=configs.freeze_pooler,
        dropout=configs.dropout
    ).to(configs.device)

    # 定义两个阶段的损失函数
    criterion = nn.CrossEntropyLoss()

    # 定义优化器
    optimizer = AdamW(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=configs.lr
    )

    # 初始化最佳验证指标
    best_val_acc = 0.0
    best_val_f1 = 0.0
    patience_counter = 0
    
    # 定义原始类别名称和阶段类别名称
    class_names = ['Yes', 'To some extent', 'No']
    stage1_names = ['Yes', 'Non-Yes']
    stage2_names = ['To some extent', 'No']
    
    # 添加计算所需的库
    from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
    import matplotlib.pyplot as plt
    import seaborn as sns
    import json
    import torch.nn.functional as F  # 用于获取概率值
    
    # 训练循环
    for epoch in range(configs.epochs):
        print(f"\n===== Epoch {epoch + 1}/{configs.epochs} =====")
        
        # 用于保存训练和验证结果的字典
        train_results = {
            "stage1": {"probs": [], "preds": [], "labels": []},
            "stage2": {"probs": [], "preds": [], "labels": []},
            "full": {"probs": [], "preds": [], "labels": []}
        }
        
        val_results = {
            "stage1": {"probs": [], "preds": [], "labels": []},
            "stage2": {"probs": [], "preds": [], "labels": []},
            "full": {"probs": [], "preds": [], "labels": []}
        }
        
        # ======== 训练第一阶段模型：Yes vs 非Yes ========
        model.train()
        stage1_train_loss = 0.0
        stage1_train_preds = []
        stage1_train_labels = []
        
        print("Training Stage 1 (Yes vs Non-Yes)...")
        with tqdm(train_dataloader, total=len(train_dataloader), desc="Stage 1", unit="batch", ncols=100) as pbar:
            for input_ids, attention_mask, token_type_ids, labels in pbar:
                optimizer.zero_grad()
                
                # 将原始标签转换为二分类标签：0(Yes) 或 1(非Yes)
                stage1_labels = (labels > 0).long()  # Yes=0, 其他=1
                
                # 前向传播第一阶段
                stage1_logits = model(input_ids, attention_mask, token_type_ids, stage=1)
                
                # 计算损失
                loss = criterion(stage1_logits, stage1_labels)
                
                # 反向传播
                loss.backward()
                optimizer.step()
                
                # 获取预测概率
                stage1_probs = F.softmax(stage1_logits, dim=1).detach().cpu().numpy().tolist()
                
                # 收集预测和标签
                preds = torch.argmax(stage1_logits, dim=1)
                stage1_train_preds.extend(preds.cpu().numpy())
                stage1_train_labels.extend(stage1_labels.cpu().numpy())
                
                # 保存预测概率和标签
                train_results["stage1"]["probs"].extend(stage1_probs)
                train_results["stage1"]["preds"].extend(preds.cpu().numpy().tolist())
                train_results["stage1"]["labels"].extend(stage1_labels.cpu().numpy().tolist())
                
                stage1_train_loss += loss.item()
                
                # 更新进度条
                pbar.set_postfix(loss=f'{loss.item():.3f}')
        
        # 计算第一阶段训练指标
        stage1_train_loss /= len(train_dataloader)
        stage1_train_acc = accuracy_score(stage1_train_labels, stage1_train_preds)
        stage1_train_f1 = f1_score(stage1_train_labels, stage1_train_preds, average='macro')
        
        print(f"Stage 1 Training - Loss: {stage1_train_loss:.4f}, Acc: {stage1_train_acc:.4f}, F1: {stage1_train_f1:.4f}")
        
        # 创建并保存第一阶段训练混淆矩阵
        cm = confusion_matrix(stage1_train_labels, stage1_train_preds)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=stage1_names, yticklabels=stage1_names)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title(f'Train: Stage 1 (Yes vs Non-Yes)\nAcc: {stage1_train_acc:.4f}, F1: {stage1_train_f1:.4f}')
        matrix_path = os.path.join(train_plot_dir, f'stage1_cm_epoch_{epoch+1}.png')
        plt.savefig(matrix_path)
        plt.close()
        
        # ======== 训练第二阶段模型：To some extent vs No ========
        # 筛选出标签为To some extent或No的样本索引
        stage2_train_loss = 0.0
        stage2_train_preds = []
        stage2_train_labels = []
        stage2_sample_count = 0
        
        print("Training Stage 2 (To some extent vs No)...")
        with tqdm(train_dataloader, total=len(train_dataloader), desc="Stage 2", unit="batch", ncols=100) as pbar:
            for input_ids, attention_mask, token_type_ids, labels in pbar:
                # 筛选非Yes样本的索引
                non_yes_indices = (labels > 0).nonzero(as_tuple=True)[0]
                
                if len(non_yes_indices) == 0:
                    continue  # 如果批次中没有非Yes样本，跳过
                
                # 提取非Yes样本的数据
                non_yes_input_ids = input_ids[non_yes_indices]
                non_yes_attention_mask = attention_mask[non_yes_indices]
                non_yes_token_type_ids = token_type_ids[non_yes_indices]
                non_yes_labels = labels[non_yes_indices]
                
                # 将原始标签转换为二分类标签：0(To some extent) 或 1(No)
                # 原始：0=Yes, 1=To some extent, 2=No
                # 现在：0=To some extent, 1=No
                stage2_labels = (non_yes_labels == 2).long()  # To some extent=0, No=1
                
                optimizer.zero_grad()
                
                # 前向传播第二阶段
                stage2_logits = model(non_yes_input_ids, non_yes_attention_mask, non_yes_token_type_ids, stage=2)
                
                # 计算损失
                loss = criterion(stage2_logits, stage2_labels)
                
                # 反向传播
                loss.backward()
                optimizer.step()
                
                # 获取预测概率
                stage2_probs = F.softmax(stage2_logits, dim=1).detach().cpu().numpy().tolist()
                
                # 收集预测和标签
                preds = torch.argmax(stage2_logits, dim=1)
                stage2_train_preds.extend(preds.cpu().numpy())
                stage2_train_labels.extend(stage2_labels.cpu().numpy())
                
                # 保存预测概率和标签
                train_results["stage2"]["probs"].extend(stage2_probs)
                train_results["stage2"]["preds"].extend(preds.cpu().numpy().tolist())
                train_results["stage2"]["labels"].extend(stage2_labels.cpu().numpy().tolist())
                
                stage2_train_loss += loss.item()
                stage2_sample_count += 1
                
                # 更新进度条
                pbar.set_postfix(loss=f'{loss.item():.3f}')
        
        # 计算第二阶段训练指标
        if stage2_sample_count > 0:
            stage2_train_loss /= stage2_sample_count
            stage2_train_acc = accuracy_score(stage2_train_labels, stage2_train_preds)
            stage2_train_f1 = f1_score(stage2_train_labels, stage2_train_preds, average='macro')
            
            print(f"Stage 2 Training - Loss: {stage2_train_loss:.4f}, Acc: {stage2_train_acc:.4f}, F1: {stage2_train_f1:.4f}")
            
            # 创建并保存第二阶段训练混淆矩阵
            cm = confusion_matrix(stage2_train_labels, stage2_train_preds)
            plt.figure(figsize=(8, 6))
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=stage2_names, yticklabels=stage2_names)
            plt.xlabel('Predicted')
            plt.ylabel('True')
            plt.title(f'Train: Stage 2 (To some extent vs No)\nAcc: {stage2_train_acc:.4f}, F1: {stage2_train_f1:.4f}')
            matrix_path = os.path.join(train_plot_dir, f'stage2_cm_epoch_{epoch+1}.png')
            plt.savefig(matrix_path)
            plt.close()
        
        # ======== 训练集整体评估 ========
        model.eval()
        train_preds = []
        train_labels = []
        
        with torch.no_grad():
            for input_ids, attention_mask, token_type_ids, labels in train_dataloader:
                # 完整两阶段预测
                logits = model(input_ids, attention_mask, token_type_ids)
                
                # 获取预测概率
                probs = F.softmax(logits, dim=1).cpu().numpy().tolist()
                
                preds = torch.argmax(logits, dim=1)
                
                train_preds.extend(preds.cpu().numpy())
                train_labels.extend(labels.cpu().numpy())
                
                # 保存预测概率和标签
                train_results["full"]["probs"].extend(probs)
                train_results["full"]["preds"].extend(preds.cpu().numpy().tolist())
                train_results["full"]["labels"].extend(labels.cpu().numpy().tolist())
        
        # 计算整体训练集指标
        train_acc = accuracy_score(train_labels, train_preds)
        train_f1 = f1_score(train_labels, train_preds, average='macro')
        
        print(f"Overall Training - Acc: {train_acc:.4f}, F1: {train_f1:.4f}")
        
        # 创建完整的训练集混淆矩阵
        cm_full = confusion_matrix(train_labels, train_preds, labels=[0, 1, 2])
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm_full, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title(f'Train: Full Hierarchical Confusion Matrix\nAcc: {train_acc:.4f}, F1: {train_f1:.4f}')
        matrix_path = os.path.join(train_plot_dir, f'full_cm_epoch_{epoch+1}.png')
        plt.savefig(matrix_path)
        plt.close()
        
        # ======== 验证集评估 ========
        model.eval()
        val_preds = []
        val_labels = []
        stage1_val_preds = []
        stage1_val_labels = []
        stage2_val_preds = []
        stage2_val_labels = []
        
        print("Evaluating on validation set...")
        with torch.no_grad():
            for input_ids, attention_mask, token_type_ids, labels in val_dataloader:
                # 第一阶段评估
                stage1_logits = model(input_ids, attention_mask, token_type_ids, stage=1)
                stage1_probs = F.softmax(stage1_logits, dim=1).cpu().numpy().tolist()
                stage1_preds = torch.argmax(stage1_logits, dim=1)
                stage1_labels_binary = (labels > 0).long()
                
                stage1_val_preds.extend(stage1_preds.cpu().numpy())
                stage1_val_labels.extend(stage1_labels_binary.cpu().numpy())
                
                # 保存第一阶段预测概率和标签
                val_results["stage1"]["probs"].extend(stage1_probs)
                val_results["stage1"]["preds"].extend(stage1_preds.cpu().numpy().tolist())
                val_results["stage1"]["labels"].extend(stage1_labels_binary.cpu().numpy().tolist())
                
                # 找出非Yes样本
                non_yes_indices = (labels > 0).nonzero(as_tuple=True)[0]
                
                if len(non_yes_indices) > 0:
                    # 第二阶段评估
                    non_yes_input_ids = input_ids[non_yes_indices]
                    non_yes_attention_mask = attention_mask[non_yes_indices]
                    non_yes_token_type_ids = token_type_ids[non_yes_indices]
                    non_yes_labels = labels[non_yes_indices]
                    
                    stage2_logits = model(non_yes_input_ids, non_yes_attention_mask, non_yes_token_type_ids, stage=2)
                    stage2_probs = F.softmax(stage2_logits, dim=1).cpu().numpy().tolist()
                    stage2_preds = torch.argmax(stage2_logits, dim=1)
                    
                    # 转换为二分类标签：0=To some extent, 1=No
                    stage2_labels_binary = (non_yes_labels == 2).long()
                    
                    stage2_val_preds.extend(stage2_preds.cpu().numpy())
                    stage2_val_labels.extend(stage2_labels_binary.cpu().numpy())
                    
                    # 保存第二阶段预测概率和标签
                    val_results["stage2"]["probs"].extend(stage2_probs)
                    val_results["stage2"]["preds"].extend(stage2_preds.cpu().numpy().tolist())
                    val_results["stage2"]["labels"].extend(stage2_labels_binary.cpu().numpy().tolist())
                
                # 完整两阶段预测
                logits = model(input_ids, attention_mask, token_type_ids)
                probs = F.softmax(logits, dim=1).cpu().numpy().tolist()
                preds = torch.argmax(logits, dim=1)
                
                val_preds.extend(preds.cpu().numpy())
                val_labels.extend(labels.cpu().numpy())
                
                # 保存整体预测概率和标签
                val_results["full"]["probs"].extend(probs)
                val_results["full"]["preds"].extend(preds.cpu().numpy().tolist())
                val_results["full"]["labels"].extend(labels.cpu().numpy().tolist())
        
        # 计算验证集指标
        # 阶段1
        stage1_val_acc = accuracy_score(stage1_val_labels, stage1_val_preds)
        stage1_val_f1 = f1_score(stage1_val_labels, stage1_val_preds, average='macro')
        
        print(f"Stage 1 Validation - Acc: {stage1_val_acc:.4f}, F1: {stage1_val_f1:.4f}")
        
        # 创建阶段1验证混淆矩阵
        cm = confusion_matrix(stage1_val_labels, stage1_val_preds)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=stage1_names, yticklabels=stage1_names)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title(f'Val: Stage 1 (Yes vs Non-Yes)\nAcc: {stage1_val_acc:.4f}, F1: {stage1_val_f1:.4f}')
        matrix_path = os.path.join(val_plot_dir, f'stage1_cm_epoch_{epoch+1}.png')
        plt.savefig(matrix_path)
        plt.close()
        
        # 阶段2
        if len(stage2_val_labels) > 0:
            stage2_val_acc = accuracy_score(stage2_val_labels, stage2_val_preds)
            stage2_val_f1 = f1_score(stage2_val_labels, stage2_val_preds, average='macro')
            
            print(f"Stage 2 Validation - Acc: {stage2_val_acc:.4f}, F1: {stage2_val_f1:.4f}")
            
            # 创建阶段2验证混淆矩阵
            cm = confusion_matrix(stage2_val_labels, stage2_val_preds)
            plt.figure(figsize=(8, 6))
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=stage2_names, yticklabels=stage2_names)
            plt.xlabel('Predicted')
            plt.ylabel('True')
            plt.title(f'Val: Stage 2 (To some extent vs No)\nAcc: {stage2_val_acc:.4f}, F1: {stage2_val_f1:.4f}')
            matrix_path = os.path.join(val_plot_dir, f'stage2_cm_epoch_{epoch+1}.png')
            plt.savefig(matrix_path)
            plt.close()
        
        # 整体验证集评估
        val_acc = accuracy_score(val_labels, val_preds)
        val_f1 = f1_score(val_labels, val_preds, average='macro')
        
        print(f"Overall Validation - Acc: {val_acc:.4f}, F1: {val_f1:.4f}")
        
        # 创建完整验证集混淆矩阵
        cm_full = confusion_matrix(val_labels, val_preds, labels=[0, 1, 2])
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm_full, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title(f'Val: Full Hierarchical Confusion Matrix\nAcc: {val_acc:.4f}, F1: {val_f1:.4f}')
        matrix_path = os.path.join(val_plot_dir, f'full_cm_epoch_{epoch+1}.png')
        plt.savefig(matrix_path)
        plt.close()
        
        # 绘制两两类别的验证集混淆矩阵
        class_pairs = [
            ([0, 1], ['Yes', 'To some extent']),  # Yes vs To some extent
            ([0, 2], ['Yes', 'No']),              # Yes vs No
            ([1, 2], ['To some extent', 'No'])    # To some extent vs No
        ]
        
        for classes_idx, classes_names in class_pairs:
            # 筛选出对应两个类别的预测和标签
            mask = np.isin(np.array(val_labels), classes_idx)
            filtered_preds = np.array(val_preds)[mask]
            filtered_labels = np.array(val_labels)[mask]
            
            # 计算此对类别的准确率和F1分数
            if len(filtered_labels) > 0:
                pair_acc = accuracy_score(filtered_labels, filtered_preds)
                # 计算二分类F1分数
                pair_f1 = f1_score(filtered_labels, filtered_preds, average='macro')
                
                # 创建混淆矩阵
                cm = confusion_matrix(filtered_labels, filtered_preds, labels=classes_idx)
                
                # 绘制混淆矩阵
                plt.figure(figsize=(8, 6))
                sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                            xticklabels=[classes_names[i == classes_idx[1]] for i in classes_idx],
                            yticklabels=[classes_names[i == classes_idx[1]] for i in classes_idx])
                plt.xlabel('Predicted')
                plt.ylabel('True')
                plt.title(f'Val: {classes_names[0]} vs {classes_names[1]}\nAcc: {pair_acc:.4f}, F1: {pair_f1:.4f}')
                
                # 保存图表
                matrix_path = os.path.join(val_plot_dir, f'cm_{classes_names[0].replace(" ", "_")}_{classes_names[1].replace(" ", "_")}_epoch_{epoch+1}.png')
                plt.savefig(matrix_path)
                plt.close()
        
        # 保存训练和验证结果到JSON文件
        train_json_path = os.path.join(predictions_dir, f'train_predictions_epoch_{epoch+1}.json')
        val_json_path = os.path.join(predictions_dir, f'val_predictions_epoch_{epoch+1}.json')
        
        # 添加一些元数据到结果字典
        train_metadata = {
            "epoch": epoch + 1,
            "stage1_acc": stage1_train_acc,
            "stage1_f1": stage1_train_f1,
            "full_acc": train_acc,
            "full_f1": train_f1
        }
        
        val_metadata = {
            "epoch": epoch + 1,
            "stage1_acc": stage1_val_acc,
            "stage1_f1": stage1_val_f1,
            "full_acc": val_acc,
            "full_f1": val_f1
        }
        
        if stage2_sample_count > 0:
            train_metadata["stage2_acc"] = stage2_train_acc
            train_metadata["stage2_f1"] = stage2_train_f1
        
        if len(stage2_val_labels) > 0:
            val_metadata["stage2_acc"] = stage2_val_acc
            val_metadata["stage2_f1"] = stage2_val_f1
        
        # 将元数据添加到结果字典
        train_results["metadata"] = train_metadata
        val_results["metadata"] = val_metadata
        
        # 保存JSON文件
        with open(train_json_path, 'w') as f:
            json.dump(train_results, f, indent=2)
        
        with open(val_json_path, 'w') as f:
            json.dump(val_results, f, indent=2)
        
        print(f"Saved prediction results to {train_json_path} and {val_json_path}")
        
        # 检查是否保存模型并判断是否需要早停
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_val_acc = val_acc
            
            # 保存模型
            # torch.save(model.state_dict(), os.path.join(checkpoint_dir, 'best_hierarchical_model.pt'))
            print(f'New best model saved with F1: {best_val_f1:.4f}, Acc: {best_val_acc:.4f}')
            
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= configs.patience:
                print(f'Early stopping triggered after {epoch+1} epochs.')
                break
        
        # 返回训练状态
        model.train()
    
    print("\nTraining complete!")
    print(f"Best validation accuracy: {best_val_acc:.4f}")
    print(f"Best validation F1 score: {best_val_f1:.4f}")
    
    return model

# 修改主函数
if __name__ == '__main__':
    # 判断是否在Jupyter环境中运行
    try:
        # 检查是否在Jupyter中运行
        get_ipython = globals().get('get_ipython', None)
        if get_ipython and 'IPKernelApp' in get_ipython().config:
            # 在Jupyter环境中运行，使用默认配置
            print("Running in Jupyter environment, using default configs")
            configs = get_default_configs()
        else:
            # 在命令行环境中运行，使用argparse
            configs = argparser()
    except:
        # 任何异常都使用argparse处理
        configs = argparser()
    
    # 设置实验名称
    if configs.name is None:
        configs.exp_name = \
            f'hierarchical_{os.path.basename(configs.model_name)}' + \
            f'{"_fp" if configs.freeze_pooler else ""}' + \
            f'_b{configs.batch_size}_e{configs.epochs}' + \
            f'_len{configs.max_length}_lr{configs.lr}'
    else:
        configs.exp_name = configs.name
    
    # 设置设备
    if configs.device is None:
        configs.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu'
        )
    
    # 调用分层训练函数
    trained_model = train_hierarchical(configs)

Running in Jupyter environment, using default configs





===== Epoch 1/50 =====
Training Stage 1 (Yes vs Non-Yes)...


Stage 1: 100%|█████████████████████████████████████| 123/123 [00:10<00:00, 11.25batch/s, loss=0.766]


Stage 1 Training - Loss: 0.6432, Acc: 0.6336, F1: 0.5957
Training Stage 2 (To some extent vs No)...


Stage 2: 100%|█████████████████████████████████████| 123/123 [00:05<00:00, 22.62batch/s, loss=0.658]


Stage 2 Training - Loss: 0.6357, Acc: 0.6347, F1: 0.6345
Overall Training - Acc: 0.6494, F1: 0.4962
Evaluating on validation set...
Stage 1 Validation - Acc: 0.7046, F1: 0.6266
Stage 2 Validation - Acc: 0.6517, F1: 0.6513
Overall Validation - Acc: 0.6806, F1: 0.4879
Saved prediction results to checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/train_predictions_epoch_1.json and checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_1.json
New best model saved with F1: 0.4879, Acc: 0.6806

===== Epoch 2/50 =====
Training Stage 1 (Yes vs Non-Yes)...


Stage 1: 100%|█████████████████████████████████████| 123/123 [00:10<00:00, 11.47batch/s, loss=0.715]


Stage 1 Training - Loss: 0.5916, Acc: 0.6814, F1: 0.6425
Training Stage 2 (To some extent vs No)...


Stage 2: 100%|█████████████████████████████████████| 123/123 [00:05<00:00, 22.58batch/s, loss=0.588]


Stage 2 Training - Loss: 0.5463, Acc: 0.7159, F1: 0.7156
Overall Training - Acc: 0.5097, F1: 0.5246
Evaluating on validation set...
Stage 1 Validation - Acc: 0.6966, F1: 0.6815
Stage 2 Validation - Acc: 0.6816, F1: 0.6793
Overall Validation - Acc: 0.5110, F1: 0.5060
Saved prediction results to checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/train_predictions_epoch_2.json and checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_2.json
New best model saved with F1: 0.5060, Acc: 0.5110

===== Epoch 3/50 =====
Training Stage 1 (Yes vs Non-Yes)...


Stage 1: 100%|█████████████████████████████████████| 123/123 [00:10<00:00, 11.62batch/s, loss=0.504]


Stage 1 Training - Loss: 0.5572, Acc: 0.7058, F1: 0.6753
Training Stage 2 (To some extent vs No)...


Stage 2: 100%|█████████████████████████████████████| 123/123 [00:05<00:00, 22.63batch/s, loss=0.746]


Stage 2 Training - Loss: 0.5287, Acc: 0.7341, F1: 0.7339
Overall Training - Acc: 0.7022, F1: 0.6616
Evaluating on validation set...
Stage 1 Validation - Acc: 0.7186, F1: 0.6789
Stage 2 Validation - Acc: 0.6617, F1: 0.6607
Overall Validation - Acc: 0.6048, F1: 0.5322
Saved prediction results to checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/train_predictions_epoch_3.json and checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_3.json
New best model saved with F1: 0.5322, Acc: 0.6048

===== Epoch 4/50 =====
Training Stage 1 (Yes vs Non-Yes)...


Stage 1: 100%|█████████████████████████████████████| 123/123 [00:10<00:00, 11.61batch/s, loss=0.637]


Stage 1 Training - Loss: 0.5203, Acc: 0.7337, F1: 0.7170
Training Stage 2 (To some extent vs No)...


Stage 2: 100%|█████████████████████████████████████| 123/123 [00:05<00:00, 22.52batch/s, loss=0.406]


Stage 2 Training - Loss: 0.4554, Acc: 0.7954, F1: 0.7953
Overall Training - Acc: 0.7744, F1: 0.7091
Evaluating on validation set...
Stage 1 Validation - Acc: 0.7345, F1: 0.6921
Stage 2 Validation - Acc: 0.6667, F1: 0.6626
Overall Validation - Acc: 0.6747, F1: 0.5327
Saved prediction results to checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/train_predictions_epoch_4.json and checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_4.json
New best model saved with F1: 0.5327, Acc: 0.6747

===== Epoch 5/50 =====
Training Stage 1 (Yes vs Non-Yes)...


Stage 1: 100%|█████████████████████████████████████| 123/123 [00:10<00:00, 11.59batch/s, loss=0.466]


Stage 1 Training - Loss: 0.4510, Acc: 0.7891, F1: 0.7797
Training Stage 2 (To some extent vs No)...


Stage 2: 100%|█████████████████████████████████████| 123/123 [00:05<00:00, 22.54batch/s, loss=0.625]


Stage 2 Training - Loss: 0.4201, Acc: 0.8092, F1: 0.8092
Overall Training - Acc: 0.7937, F1: 0.7254
Evaluating on validation set...
Stage 1 Validation - Acc: 0.7126, F1: 0.6613
Stage 2 Validation - Acc: 0.6716, F1: 0.6693
Overall Validation - Acc: 0.6846, F1: 0.5216
Saved prediction results to checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/train_predictions_epoch_5.json and checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_5.json

===== Epoch 6/50 =====
Training Stage 1 (Yes vs Non-Yes)...


Stage 1: 100%|█████████████████████████████████████| 123/123 [00:10<00:00, 11.59batch/s, loss=0.453]


Stage 1 Training - Loss: 0.4000, Acc: 0.8196, F1: 0.8131
Training Stage 2 (To some extent vs No)...


Stage 2: 100%|█████████████████████████████████████| 123/123 [00:05<00:00, 22.66batch/s, loss=0.403]


Stage 2 Training - Loss: 0.3404, Acc: 0.8532, F1: 0.8531
Overall Training - Acc: 0.8562, F1: 0.8200
Evaluating on validation set...
Stage 1 Validation - Acc: 0.7126, F1: 0.6726
Stage 2 Validation - Acc: 0.6915, F1: 0.6898
Overall Validation - Acc: 0.6926, F1: 0.5753
Saved prediction results to checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/train_predictions_epoch_6.json and checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_6.json
New best model saved with F1: 0.5753, Acc: 0.6926

===== Epoch 7/50 =====
Training Stage 1 (Yes vs Non-Yes)...


Stage 1: 100%|█████████████████████████████████████| 123/123 [00:10<00:00, 11.58batch/s, loss=0.339]


Stage 1 Training - Loss: 0.3424, Acc: 0.8557, F1: 0.8513
Training Stage 2 (To some extent vs No)...


Stage 2: 100%|█████████████████████████████████████| 123/123 [00:05<00:00, 22.68batch/s, loss=0.156]


Stage 2 Training - Loss: 0.2959, Acc: 0.8799, F1: 0.8798
Overall Training - Acc: 0.9080, F1: 0.8908
Evaluating on validation set...
Stage 1 Validation - Acc: 0.6886, F1: 0.6743
Stage 2 Validation - Acc: 0.6567, F1: 0.6533
Overall Validation - Acc: 0.6248, F1: 0.5409
Saved prediction results to checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/train_predictions_epoch_7.json and checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_7.json

===== Epoch 8/50 =====
Training Stage 1 (Yes vs Non-Yes)...


Stage 1: 100%|█████████████████████████████████████| 123/123 [00:10<00:00, 11.56batch/s, loss=0.607]


Stage 1 Training - Loss: 0.3034, Acc: 0.8740, F1: 0.8707
Training Stage 2 (To some extent vs No)...


Stage 2: 100%|█████████████████████████████████████| 123/123 [00:05<00:00, 22.50batch/s, loss=0.129]


Stage 2 Training - Loss: 0.2246, Acc: 0.9155, F1: 0.9154
Overall Training - Acc: 0.9162, F1: 0.8977
Evaluating on validation set...
Stage 1 Validation - Acc: 0.6846, F1: 0.6778
Stage 2 Validation - Acc: 0.6766, F1: 0.6761
Overall Validation - Acc: 0.5988, F1: 0.5408
Saved prediction results to checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/train_predictions_epoch_8.json and checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_8.json

===== Epoch 9/50 =====
Training Stage 1 (Yes vs Non-Yes)...


Stage 1: 100%|█████████████████████████████████████| 123/123 [00:10<00:00, 11.55batch/s, loss=0.113]


Stage 1 Training - Loss: 0.2418, Acc: 0.9080, F1: 0.9060
Training Stage 2 (To some extent vs No)...


Stage 2: 100%|█████████████████████████████████████| 123/123 [00:05<00:00, 22.57batch/s, loss=0.144]


Stage 2 Training - Loss: 0.1868, Acc: 0.9375, F1: 0.9374
Overall Training - Acc: 0.9543, F1: 0.9448
Evaluating on validation set...
Stage 1 Validation - Acc: 0.6926, F1: 0.6736
Stage 2 Validation - Acc: 0.6418, F1: 0.6414
Overall Validation - Acc: 0.6367, F1: 0.5423
Saved prediction results to checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/train_predictions_epoch_9.json and checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_9.json

===== Epoch 10/50 =====
Training Stage 1 (Yes vs Non-Yes)...


Stage 1: 100%|█████████████████████████████████████| 123/123 [00:10<00:00, 11.56batch/s, loss=0.204]


Stage 1 Training - Loss: 0.2164, Acc: 0.9111, F1: 0.9094
Training Stage 2 (To some extent vs No)...


Stage 2: 100%|█████████████████████████████████████| 123/123 [00:05<00:00, 22.45batch/s, loss=0.123]


Stage 2 Training - Loss: 0.1504, Acc: 0.9387, F1: 0.9387
Overall Training - Acc: 0.9416, F1: 0.9321
Evaluating on validation set...
Stage 1 Validation - Acc: 0.7026, F1: 0.6688
Stage 2 Validation - Acc: 0.6468, F1: 0.6439
Overall Validation - Acc: 0.6607, F1: 0.5345
Saved prediction results to checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/train_predictions_epoch_10.json and checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_10.json

===== Epoch 11/50 =====
Training Stage 1 (Yes vs Non-Yes)...


Stage 1: 100%|█████████████████████████████████████| 123/123 [00:10<00:00, 11.56batch/s, loss=0.370]


Stage 1 Training - Loss: 0.1721, Acc: 0.9329, F1: 0.9315
Training Stage 2 (To some extent vs No)...


Stage 2: 100%|█████████████████████████████████████| 123/123 [00:05<00:00, 22.59batch/s, loss=0.058]


Stage 2 Training - Loss: 0.1279, Acc: 0.9561, F1: 0.9560
Overall Training - Acc: 0.9629, F1: 0.9553
Evaluating on validation set...
Stage 1 Validation - Acc: 0.6906, F1: 0.6652
Stage 2 Validation - Acc: 0.6567, F1: 0.6545
Overall Validation - Acc: 0.6427, F1: 0.5218
Saved prediction results to checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/train_predictions_epoch_11.json and checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_11.json

===== Epoch 12/50 =====
Training Stage 1 (Yes vs Non-Yes)...


Stage 1: 100%|█████████████████████████████████████| 123/123 [00:10<00:00, 11.53batch/s, loss=0.079]


Stage 1 Training - Loss: 0.1532, Acc: 0.9365, F1: 0.9353
Training Stage 2 (To some extent vs No)...


Stage 2: 100%|█████████████████████████████████████| 123/123 [00:05<00:00, 22.48batch/s, loss=0.326]


Stage 2 Training - Loss: 0.1001, Acc: 0.9619, F1: 0.9618
Overall Training - Acc: 0.9807, F1: 0.9758
Evaluating on validation set...
Stage 1 Validation - Acc: 0.7126, F1: 0.6935
Stage 2 Validation - Acc: 0.6418, F1: 0.6418
Overall Validation - Acc: 0.6427, F1: 0.5447
Saved prediction results to checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/train_predictions_epoch_12.json and checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_12.json

===== Epoch 13/50 =====
Training Stage 1 (Yes vs Non-Yes)...


Stage 1: 100%|█████████████████████████████████████| 123/123 [00:10<00:00, 11.52batch/s, loss=0.203]


Stage 1 Training - Loss: 0.1251, Acc: 0.9543, F1: 0.9535
Training Stage 2 (To some extent vs No)...


Stage 2: 100%|█████████████████████████████████████| 123/123 [00:05<00:00, 22.53batch/s, loss=0.077]


Stage 2 Training - Loss: 0.0955, Acc: 0.9630, F1: 0.9629
Overall Training - Acc: 0.9863, F1: 0.9827
Evaluating on validation set...
Stage 1 Validation - Acc: 0.7046, F1: 0.6822
Stage 2 Validation - Acc: 0.6517, F1: 0.6453
Overall Validation - Acc: 0.6487, F1: 0.5363
Saved prediction results to checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/train_predictions_epoch_13.json and checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_13.json

===== Epoch 14/50 =====
Training Stage 1 (Yes vs Non-Yes)...


Stage 1: 100%|█████████████████████████████████████| 123/123 [00:10<00:00, 11.55batch/s, loss=0.112]


Stage 1 Training - Loss: 0.0926, Acc: 0.9665, F1: 0.9659
Training Stage 2 (To some extent vs No)...


Stage 2: 100%|█████████████████████████████████████| 123/123 [00:05<00:00, 22.42batch/s, loss=0.050]


Stage 2 Training - Loss: 0.0945, Acc: 0.9746, F1: 0.9746
Overall Training - Acc: 0.9919, F1: 0.9901
Evaluating on validation set...
Stage 1 Validation - Acc: 0.6707, F1: 0.6616
Stage 2 Validation - Acc: 0.6418, F1: 0.6342
Overall Validation - Acc: 0.5948, F1: 0.5117
Saved prediction results to checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/train_predictions_epoch_14.json and checkpoints_2to2_adjust/hierarchical_bert-base-uncased_fp_b16_e50_len512_lr1e-05/predictions/val_predictions_epoch_14.json
Early stopping triggered after 14 epochs.

Training complete!
Best validation accuracy: 0.6926
Best validation F1 score: 0.5753
