In [1]:
import os
# 设置环境变量，只让程序看到 GPU 2
os.environ['CUDA_VISIBLE_DEVICES'] = '2'


import torch
import torch.nn as nn
import wandb
import random
import argparse
import numpy as np
from tqdm import tqdm
from transformers import BertModel, AutoModel
from transformers import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


# 数据预处理函数

In [2]:
from torch.utils.data import Dataset
import json

class BAE2025Dataset(Dataset):
    def __init__(
            self,
            data_path,
            label_type="Actionability",  # 根据需要可以是 "Mistake_Identification", "Mistake_Location", "Providing_Guidance", "Actionability"
            labels={
                "Yes": 0,
                "To some extent": 1, 
                "No": 2,
            }
    ):
        self.data_path = data_path
        self.label_type = label_type
        self.labels = labels
    #     self._get_data()

        self.data = []  # 初始化为空列表
        
        # 只有在data_path不为None时才加载数据
        if self.data_path is not None:
            self._get_data()
    
    def _get_data(self):
        with open(self.data_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        self.data = []
        for item in data:
            sent1 = item['conversation_history']
            sent2 = item['response']
            
            # 检查item中是否直接包含我们需要的标签
            if self.label_type in item and item[self.label_type] in self.labels:
                self.data.append(((sent1, sent2), self.labels[item[self.label_type]]))
    
    def __len__(self):
        return len(self.data)
    
    def get_labels(self):
        return self.labels

    def __getitem__(self, idx):
        return self.data[idx]

# 数据加载函数

In [3]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import AutoConfig
from transformers import DebertaV2Tokenizer

class BAE2025DataLoader:
    def __init__(
        self,
        dataset,
        batch_size=16,
        max_length=512,
        shuffle=True,
        drop_last=True,
        device=None,
        # tokenizer_name='chinese-bert-wwm-ext'
        # tokenizer_name='chinese-roberta-wwm-ext'
        # tokenizer_name='chinese-roberta-wwm-ext-large'
        # tokenizer_name='/mnt/cfs/huangzhiwei/pykt-moekt/SBM/bge-large-en-v1.5'
        tokenizer_name='/mnt/cfs/huangzhiwei/BAE2025/models/deberta-v3-base'
        # tokenizer_name='/mnt/cfs/huangzhiwei/BAE2025/models/roberta-base'
    ):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.tokenizer.truncation_side = 'left'  # 设置截断方向为左侧,即从句子开头开始截断,假设一个句子过长，则从句子开头开始截断，保留句子结尾的部分
        print("当前使用的 tokenizer 类型：", type(self.tokenizer))
        
        # config = AutoConfig.from_pretrained(tokenizer_name)
        # self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, config=config, use_fast=True)
        
        
        # self.tokenizer = DebertaV2Tokenizer.from_pretrained(tokenizer_name)
        
        self.dataset = dataset
        self.batch_size = batch_size
        self.max_length = max_length
        self.shuffle = shuffle
        self.drop_last = drop_last

        if device is None:
            self.device = torch.device(
                'cuda' if torch.cuda.is_available() else 'cpu'
            )
        else:
            self.device = device

        self.loader = DataLoader(
            dataset=self.dataset,
            batch_size=self.batch_size,
            collate_fn=self.collate_fn,
            shuffle=self.shuffle,
            drop_last=self.drop_last
        )

    def collate_fn(self, data):
        sents = [i[0] for i in data]
        labels = [i[1] for i in data]

        # 修改这里，处理两个句子的情况
        data = self.tokenizer.batch_encode_plus(
            batch_text_or_text_pairs=[(sent[0], sent[1]) for sent in sents],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
            return_length=True
        )
        input_ids = data['input_ids'].to(self.device)
        attention_mask = data['attention_mask'].to(self.device)
        # token_type_ids = data['token_type_ids'].to(self.device)
        labels = torch.LongTensor(labels).to(self.device)

        # return input_ids, attention_mask, token_type_ids, labels
        return input_ids, attention_mask, labels


    def __iter__(self):
        for data in self.loader:
            yield data

    def __len__(self):
        return len(self.loader)



# 模型代码

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel


class ExpertLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,
        )
        
    def forward(self, x):
        # x: [batch_size, seq_len, input_size]
        output, (hidden, _) = self.lstm(x)
        # 返回最后一个时间步的隐藏状态
        return hidden[-1]  # [batch_size, hidden_size]


class ExpertBiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
        super().__init__()
        self.bilstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size // 2,  # 因为是双向的，所以每个方向的隐藏层大小减半
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0,
        )
        
    def forward(self, x):
        # x: [batch_size, seq_len, input_size]
        output, (hidden, _) = self.bilstm(x)
        # 拼接最后一层的正向和反向隐藏状态
        # hidden shape: [num_layers * num_directions, batch_size, hidden_size//2]
        hidden_forward = hidden[-2]  # 正向的最后一层 [batch_size, hidden_size//2]
        hidden_backward = hidden[-1]  # 反向的最后一层 [batch_size, hidden_size//2]
        hidden_concat = torch.cat([hidden_forward, hidden_backward], dim=1)  # [batch_size, hidden_size]
        return hidden_concat


class ExpertRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,
        )
        
    def forward(self, x):
        # x: [batch_size, seq_len, input_size]
        _, hidden = self.rnn(x)
        # 返回最后一个时间步的隐藏状态
        return hidden[-1]  # [batch_size, hidden_size]


class ExpertGRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
        super().__init__()
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,
        )
        
    def forward(self, x):
        # x: [batch_size, seq_len, input_size]
        _, hidden = self.gru(x)
        # 返回最后一个时间步的隐藏状态
        return hidden[-1]  # [batch_size, hidden_size]


class ExpertLinear(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.linear = nn.Sequential(
            nn.Linear(input_size, hidden_size * 2),
            nn.LayerNorm(hidden_size * 2),
            nn.GELU(),
            nn.Linear(hidden_size * 2, hidden_size)
        )
        
    def forward(self, x):
        # x: [batch_size, seq_len, input_size]
        # 我们需要把序列信息压缩为一个向量，可以使用平均池化
        pooled = torch.mean(x, dim=1)  # [batch_size, input_size]
        return self.linear(pooled)  # [batch_size, hidden_size]


class BertClassificationHead(nn.Module):
    def __init__(self, hidden_size=1024, num_classes=3, dropout_prob=0.3):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.out_proj = nn.Linear(hidden_size, num_classes)
    
    def forward(self, features):
        # 提取 [CLS] 标记的表示
        x = features[:, 0, :]  # 使用第一个标记([CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class MoERouter(nn.Module):
    """专家路由器，学习为每个样本分配专家权重"""
    def __init__(self, input_size, num_experts):
        super().__init__()
        self.router = nn.Linear(input_size, num_experts)
        
    def forward(self, x):
        # x: [batch_size, input_size]
        # 计算每个专家的权重 (使用softmax确保权重和为1)
        router_logits = self.router(x)
        router_probs = F.softmax(router_logits, dim=-1)
        return router_probs  # [batch_size, num_experts]


class DeBERTaMoEClassifier(nn.Module):
    def __init__(
        self, 
        pretrained_model_name, 
        num_classes=3, 
        freeze_pooler=0,
        expert_hidden_size=256,
        dropout=0.3,
        num_rnn_layers=1
    ):
        super().__init__()
        
        # 使用 AutoModel 加载 DeBERTa 模型
        self.bert = AutoModel.from_pretrained(pretrained_model_name)
        
        # 获取 bert 隐藏层大小
        self.bert_hidden_size = self.bert.config.hidden_size
        
        # 保留原有的分类头
        self.original_classifier = BertClassificationHead(
            hidden_size=self.bert_hidden_size,
            num_classes=num_classes,
            dropout_prob=dropout
        )
        
        # 创建多个专家模型
        self.experts = nn.ModuleDict({
            'lstm': ExpertLSTM(
                input_size=self.bert_hidden_size, 
                hidden_size=expert_hidden_size,
                num_layers=num_rnn_layers,
                dropout=dropout
            ),
            'bilstm': ExpertBiLSTM(
                input_size=self.bert_hidden_size, 
                hidden_size=expert_hidden_size,
                num_layers=num_rnn_layers,
                dropout=dropout
            ),
            'rnn': ExpertRNN(
                input_size=self.bert_hidden_size, 
                hidden_size=expert_hidden_size,
                num_layers=num_rnn_layers,
                dropout=dropout
            ),
            'gru': ExpertGRU(
                input_size=self.bert_hidden_size, 
                hidden_size=expert_hidden_size,
                num_layers=num_rnn_layers,
                dropout=dropout
            ),
            'linear': ExpertLinear(
                input_size=self.bert_hidden_size, 
                hidden_size=expert_hidden_size
            ),
        })
        
        # 创建路由器 (使用[CLS]标记表示作为路由的输入)
        self.router = MoERouter(self.bert_hidden_size, len(self.experts))
        
        # 各专家模型的输出映射层，将各自的hidden_size映射到统一的输出空间
        self.expert_outputs = nn.ModuleDict({
            expert_name: nn.Linear(expert_hidden_size, num_classes)
            for expert_name in self.experts.keys()
        })
        
        # 最终的融合层，将所有结果拼接后映射到输出类别
        # (1个原始分类头 + 5个专家) * 每个输出num_classes = 6 * num_classes
        combined_dim = num_classes * (1 + len(self.experts))
        self.final_classifier = nn.Sequential(
            nn.Linear(combined_dim, combined_dim // 2),
            nn.LayerNorm(combined_dim // 2),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(combined_dim // 2, num_classes)
        )
        
    def forward(self, input_ids, attention_mask):
        # DeBERTa 编码
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
        # 获取序列隐藏状态
        hidden_states = outputs.last_hidden_state  # [batch_size, seq_len, hidden_size]
        
        # 获取原始分类头结果
        original_logits = self.original_classifier(hidden_states)  # [batch_size, num_classes]
        
        # 获取路由权重
        cls_embedding = hidden_states[:, 0]  # [batch_size, hidden_size]
        routing_weights = self.router(cls_embedding)  # [batch_size, num_experts]
        
        # 获取各专家结果
        expert_outputs = {}
        for expert_name, expert in self.experts.items():
            # 获取专家输出
            expert_hidden = expert(hidden_states)  # [batch_size, expert_hidden_size]
            # 映射到类别空间
            expert_logits = self.expert_outputs[expert_name](expert_hidden)  # [batch_size, num_classes]
            # 存储结果
            expert_outputs[expert_name] = expert_logits
        
        # 根据路由权重加权专家结果
        # 首先，将所有专家的结果拼接到一起
        expert_logits_list = [original_logits]  # 包含原始分类头
        expert_names = list(self.experts.keys())
        
        for expert_name in expert_names:
            expert_logits_list.append(expert_outputs[expert_name])
        
        # 拼接所有结果 [batch_size, (1+num_experts)*num_classes]
        combined_logits = torch.cat(expert_logits_list, dim=1)
        
        # 通过最终分类器输出最终结果
        final_logits = self.final_classifier(combined_logits)
        
        # return {
        #     'logits': final_logits,  # 最终预测
        #     'original_logits': original_logits,  # 原始分类头预测
        #     'expert_logits': expert_outputs,  # 各专家预测
        #     'routing_weights': routing_weights,  # 路由权重
        #     'combined_logits': combined_logits  # 拼接的中间结果
        # }
        
        return final_logits


# FGM

In [5]:
class FGM():
    def __init__(self, model):
        self.model = model
        self.backup = {}
        # 明确指定为word_embeddings
        self.emb_name = 'word_embeddings'
    
    def attack(self, epsilon=1.):
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)
    
    def restore(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}

# 训练参数设置

In [6]:
import os
import wandb
import random
import argparse
from tqdm import tqdm

import torch
import torch.nn as nn
import numpy as np
from transformers import AdamW
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# 如果在Jupyter Notebook中运行，可以使用这个自定义参数函数替代argparser
def get_default_configs():
    """在Jupyter环境中使用的默认配置，避免argparse解析错误"""
    class Args:
        def __init__(self):
            # self.model_name = '/mnt/cfs/huangzhiwei/pykt-moekt/SBM/bge-large-en-v1.5'
            # self.model_name = "/mnt/cfs/huangzhiwei/BAE2025/models/ModernBERT-large"
            # self.model_name = '/mnt/cfs/huangzhiwei/pykt-moekt/SBM/xlm-roberta-large'
            # self.model_name = '/mnt/cfs/huangzhiwei/BAE2025/models/bge-base-en-v1.5'
            # self.model_name = '/mnt/cfs/huangzhiwei/BAE2025/models/bert-base-uncased'
            self.model_name = '/mnt/cfs/huangzhiwei/BAE2025/models/deberta-v3-base'
            # self.model_name = '/mnt/cfs/huangzhiwei/BAE2025/models/roberta-base'
            self.num_classes = 3
            self.dropout = 0.25
            self.freeze_pooler = 8
            self.batch_size = 16
            self.max_length = 512
            self.lr = 2e-5
            self.epochs = 50
            self.device = device
            self.name = None
            self.seed = 42
            self.data_path = '../data_new/all.json'
            self.val_data_path = '../data_new/val.json'
            self.checkpoint_dir = '/mnt/cfs/huangzhiwei/BAE2025/projects/predict/5foldx'
            self.patience = 6
            self.expert_hidden_size = 512
            self.num_rnn_layers = 1
            self.warmup_ratio = 0.1
            self.exp_name = 'BAE2025_track4_bert'

            self.cross_validation = True  # 或通过命令行参数设置
            self.n_folds = 5  # 或通过命令行参数设置

    return Args()


# 训练函数

In [7]:
def train(configs, fold_idx=None):
    
    # 设置随机种子
    random.seed(configs.seed)
    np.random.seed(configs.seed)
    torch.manual_seed(configs.seed)
    
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
    # 创建检查点目录 - 为每个折创建子目录
    # if fold_idx is not None:
    #     checkpoint_dir = os.path.join(configs.checkpoint_dir, configs.exp_name, f'fold_{fold_idx}')
    # else:
    #     checkpoint_dir = os.path.join(configs.checkpoint_dir, configs.exp_name)

    checkpoint_dir = configs.checkpoint_dir
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    # 加载数据集
    if fold_idx is None:
        # 非交叉验证模式 - 使用原始的训练和验证集
        train_dataset = BAE2025Dataset(configs.data_path)
        val_dataset = BAE2025Dataset(configs.val_data_path)
    else:
        # 交叉验证模式 - 使用从全数据集分割的训练集和验证集
        # 这些数据集会在cross_validate函数中传入
        train_dataset = configs.fold_datasets[fold_idx]['train']
        val_dataset = configs.fold_datasets[fold_idx]['val']
    
    # 创建数据加载器
    train_dataloader = BAE2025DataLoader(
        dataset=train_dataset,
        batch_size=configs.batch_size,
        max_length=configs.max_length,
        shuffle=True,
        drop_last=True,
        device=configs.device,
        tokenizer_name=configs.model_name
    )

    val_dataloader = BAE2025DataLoader(
        dataset=val_dataset,
        batch_size=configs.batch_size,
        max_length=configs.max_length,
        shuffle=False,
        drop_last=False,
        device=configs.device,
        tokenizer_name=configs.model_name
    )
    
    # 创建模型
    model = DeBERTaMoEClassifier(
        pretrained_model_name=configs.model_name,
        num_classes=configs.num_classes,
        freeze_pooler=configs.freeze_pooler,
        num_rnn_layers=configs.num_rnn_layers,
        expert_hidden_size=configs.expert_hidden_size,
        dropout=configs.dropout
    ).to(configs.device)

    criterion = nn.CrossEntropyLoss()

    # 定义优化器
    optimizer = AdamW(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=configs.lr
    )
    
    # ===== 添加Warmup + Cosine Decay学习率调度 =====
    from transformers import get_cosine_schedule_with_warmup
    
    # 计算总训练步数
    total_steps = len(train_dataloader) * configs.epochs
    
    # 计算warmup步数 (默认总步数的10%，可通过configs.warmup_ratio调整)
    warmup_ratio = getattr(configs, 'warmup_ratio', 0.1)  # 如果未定义，则使用默认值0.1
    warmup_steps = int(warmup_ratio * total_steps)
    
    # 创建学习率调度器
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )
    # ===============================================
    
    # 初始化最佳验证损失和早停计数器
    best_val_acc = 0.0
    best_val_f1 = 0.0  # 添加F1分数作为评估指标
    best_val_loss = float('inf')
    best_epoch = -1  # 记录达到最佳效果的epoch
    patience_counter = 0
    
    # 定义类别名称
    class_names = ['Yes', 'To some extent', 'No']
    
    # 添加F1计算所需的库
    from sklearn.metrics import f1_score

    # 获得对应embedding名字
    # 打印模型中的层名称，查找嵌入层
    # for name, _ in model.named_parameters():
    #     if 'embed' in name:
    #         print(name)
    
    # # 设置FGM的嵌入层名称
    # fgm = FGM(model)
    # fgm.emb_name = 'word_embeddings'  # 根据打印结果调整正确的名称
    
    # 训练循环
    for epoch in range(configs.epochs):
        # 训练阶段
        model.train()
        train_loss = 0.0
        train_acc = 0.0
        train_preds = []
        train_labels_list = []
        
        with tqdm(
            train_dataloader,
            total=len(train_dataloader),
            desc=f'Fold {fold_idx if fold_idx is not None else "N/A"} - Epoch {epoch + 1}/{configs.epochs}',
            unit='batch',
            ncols=100
        ) as pbar:
            for input_ids, attention_mask, labels in pbar:
                optimizer.zero_grad()
                
                # 前向传播
                logits = model(input_ids, attention_mask)
                
                # 计算损失 - 确保labels是长整型
                labels = labels.long()
                loss = criterion(logits, labels)
                # 反向传播
                loss.backward()

                # # FGM对抗训练
                # fgm.attack()  # 在嵌入层添加扰动
                # # 对抗样本的前向传播
                # logits_adv = model(input_ids, attention_mask)
                # # 计算对抗损失
                # loss_adv = criterion(logits_adv, labels)
                # # 反向传播
                # loss_adv.backward()
                # # 恢复嵌入层参数
                # fgm.restore()
                
                optimizer.step()
                scheduler.step()  # 更新学习率
                
                preds = logits.argmax(dim=1)
                accuracy = (preds == labels).float().mean()
                accuracy_all = (preds == labels).float().sum()
                
                # 收集预测结果和真实标签，用于计算F1
                train_preds.extend(preds.cpu().numpy())
                train_labels_list.extend(labels.cpu().numpy())
                
                train_loss += loss.item()
                train_acc += accuracy_all.item()
                
                # 添加当前学习率到进度条
                curr_lr = scheduler.get_last_lr()[0]
                pbar.set_postfix(
                    loss=f'{loss.item():.3f}',
                    accuracy=f'{accuracy.item():.3f}',
                    lr=f'{curr_lr:.6f}'  # 显示当前学习率
                )
        
        train_loss = train_loss / len(train_dataloader)
        train_acc = train_acc / len(train_dataset)
        
        # 计算训练集的F1分数 - 使用macro平均以处理多分类
        train_f1 = f1_score(train_labels_list, train_preds, average='macro')
        
        print(f'Fold {fold_idx if fold_idx is not None else "N/A"} - Training Loss: {train_loss:.4f}')
        print(f'Fold {fold_idx if fold_idx is not None else "N/A"} - Training Accuracy: {train_acc:.4f}')
        print(f'Fold {fold_idx if fold_idx is not None else "N/A"} - Training F1 Score: {train_f1:.4f}')
        
        # 验证阶段
        model.eval()
        val_loss = 0.0
        val_corrects = 0.0
        val_preds = []
        val_labels_list = []

        with torch.no_grad():
            for input_ids, attention_mask, labels in val_dataloader:
                # 确保labels是长整型
                labels = labels.long()
                
                # 前向传播
                logits = model(input_ids, attention_mask)
                
                loss = criterion(logits, labels)
                val_loss += loss.item()
                preds = logits.argmax(dim=1)
                accuracy = (preds == labels).float().sum()
                val_corrects += accuracy
                
                # 收集预测结果和真实标签，用于计算F1
                val_preds.extend(preds.cpu().numpy())
                val_labels_list.extend(labels.cpu().numpy())
        
        val_loss = val_loss / len(val_dataloader)
        val_acc = val_corrects.double() / len(val_dataset)
        
        # 计算验证集的F1分数
        val_f1 = f1_score(val_labels_list, val_preds, average='macro')
        
        print(f'Fold {fold_idx if fold_idx is not None else "N/A"} - Validation Loss: {val_loss:.4f} Acc: {val_acc:.4f} F1: {val_f1:.4f}')
        
        # 检查是否保存模型并判断是否需要早停
        # 使用F1分数作为主要指标
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_val_acc = val_acc
            best_epoch = epoch + 1  # 记录达到最佳效果的epoch
            
            # 保存模型 - 为每个折添加fold_idx到文件名
            if fold_idx is not None:
                model_path = os.path.join(checkpoint_dir, f'best_model_fold_{fold_idx}.pt')
            else:
                model_path = os.path.join(checkpoint_dir, 'best_model_f1.pt')
            
            state_dict = model.state_dict()
            torch.save(state_dict, model_path)
            print(f'Fold {fold_idx if fold_idx is not None else "N/A"} - New best model saved with F1: {best_val_f1:.4f}, Acc: {best_val_acc:.4f} at Epoch {best_epoch}')
            
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= configs.patience:
                print(f'Fold {fold_idx if fold_idx is not None else "N/A"} - Early stopping triggered after {epoch+1} epochs. Best model was at Epoch {best_epoch}.')
                break
    
    # 返回最佳验证指标，用于交叉验证结果汇总
    return {
        'best_val_f1': best_val_f1,
        'best_val_acc': best_val_acc,
        'best_epoch': best_epoch
    }


def cross_validate(configs, n_folds=5):
    """
    执行n折交叉验证
    
    参数:
    - configs: 配置对象
    - n_folds: 交叉验证的折数，默认为5
    """
    # 导入所需库
    from sklearn.model_selection import StratifiedKFold
    import numpy as np
    
    # 首先加载完整的数据集
    full_dataset = BAE2025Dataset(configs.data_path)
    
    # 获取标签，用于分层抽样 - 修改这一行
    labels = [data[1] for data in full_dataset.data]  # 使用.data而不是.samples
    
    # 创建分层K折交叉验证对象
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=configs.seed)
    
    # 存储每个折的数据集
    configs.fold_datasets = []
    
    # 存储每个折的结果
    fold_results = []
    
    # 遍历每个折
    for fold_idx, (train_indices, val_indices) in enumerate(skf.split(np.zeros(len(labels)), labels)):
        print(f"\n{'='*20} Fold {fold_idx+1}/{n_folds} {'='*20}")
        
        # 为当前折创建训练集和验证集 - 修改这些行
        train_samples = [full_dataset.data[i] for i in train_indices]
        val_samples = [full_dataset.data[i] for i in val_indices]
        
        # 创建新的数据集对象
        train_dataset = BAE2025Dataset(None)  # 创建空数据集
        train_dataset.data = train_samples  # 填充训练样本，使用.data而不是.samples
        
        val_dataset = BAE2025Dataset(None)  # 创建空数据集
        val_dataset.data = val_samples  # 填充验证样本，使用.data而不是.samples
        
        # 保存当前折的数据集
        configs.fold_datasets.append({
            'train': train_dataset,
            'val': val_dataset
        })
        
        # 检查标签分布是否保持一致 - 修改这些行
        train_label_dist = np.bincount(np.array([s[1] for s in train_samples])) / len(train_samples)
        val_label_dist = np.bincount(np.array([s[1] for s in val_samples])) / len(val_samples)
        full_label_dist = np.bincount(np.array(labels)) / len(labels)
        
        print(f"标签分布检查:")
        print(f"原始数据集分布: {full_label_dist}")
        print(f"训练集分布: {train_label_dist}")
        print(f"验证集分布: {val_label_dist}")
        
        # 训练当前折
        fold_result = train(configs, fold_idx=fold_idx)
        fold_results.append(fold_result)
        
        print(f"Fold {fold_idx+1} 完成，最佳F1: {fold_result['best_val_f1']:.4f}, 最佳准确率: {fold_result['best_val_acc']:.4f}")
    
    # 计算交叉验证的平均结果
    avg_f1 = sum(r['best_val_f1'] for r in fold_results) / n_folds
    avg_acc = sum(r['best_val_acc'] for r in fold_results) / n_folds
    avg_epoch = sum(r['best_epoch'] for r in fold_results) / n_folds
    
    print("\n" + "="*50)
    print(f"交叉验证完成! {n_folds}折平均结果:")
    print(f"平均最佳F1: {avg_f1:.4f}")
    print(f"平均最佳准确率: {avg_acc:.4f}")
    print(f"平均最佳Epoch: {avg_epoch:.1f}")
    print("各折结果:")
    for fold_idx, result in enumerate(fold_results):
        print(f"  Fold {fold_idx+1}: F1={result['best_val_f1']:.4f}, Acc={result['best_val_acc']:.4f}, Best Epoch={result['best_epoch']}")
    
    return fold_results


# 在以下主函数中添加判断Jupyter环境的逻辑
if __name__ == '__main__':
    # 判断是否在Jupyter环境中运行
    try:
        # 检查是否在Jupyter中运行
        get_ipython = globals().get('get_ipython', None)
        if get_ipython and 'IPKernelApp' in get_ipython().config:
            # 在Jupyter环境中运行，使用默认配置
            print("Running in Jupyter environment, using default configs")
            configs = get_default_configs()
        else:
            # 在命令行环境中运行，使用argparse
            configs = argparser()
    except:
        # 任何异常都使用argparse处理
        configs = argparser()
    
    # 设置实验名称
    if configs.name is None:
        configs.exp_name = \
            f'{os.path.basename(configs.model_name)}' + \
            f'{"_fp" if configs.freeze_pooler else ""}' + \
            f'_b{configs.batch_size}_e{configs.epochs}' + \
            f'_len{configs.max_length}_lr{configs.lr}'
    else:
        configs.exp_name = configs.name
    
    # 设置设备
    if configs.device is None:
        configs.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu'
        )
    
    # 判断是否使用交叉验证
    use_cv = getattr(configs, 'cross_validation', False)
    n_folds = getattr(configs, 'n_folds', 5)
    
    if use_cv:
        # 执行交叉验证
        print(f"执行{n_folds}折交叉验证...")
        cross_validate(configs, n_folds=n_folds)
    else:
        # 执行常规训练
        print("执行常规训练...")
        train(configs)

Running in Jupyter environment, using default configs
执行5折交叉验证...

标签分布检查:
原始数据集分布: [0.52907916 0.14903069 0.32189015]
训练集分布: [0.52929293 0.1489899  0.32171717]
验证集分布: [0.52822581 0.14919355 0.32258065]




当前使用的 tokenizer 类型： <class 'transformers.models.deberta_v2.tokenization_deberta_v2_fast.DebertaV2TokenizerFast'>
当前使用的 tokenizer 类型： <class 'transformers.models.deberta_v2.tokenization_deberta_v2_fast.DebertaV2TokenizerFast'>


Fold 0 - Epoch 1/50: 100%|█| 123/123 [00:49<00:00,  2.47batch/s, accuracy=0.500, loss=1.047, lr=0.00


Fold 0 - Training Loss: 1.0948
Fold 0 - Training Accuracy: 0.3606
Fold 0 - Training F1 Score: 0.3155
Fold 0 - Validation Loss: 0.9907 Acc: 0.5262 F1: 0.2471
Fold 0 - New best model saved with F1: 0.2471, Acc: 0.5262 at Epoch 1


Fold 0 - Epoch 2/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.886, lr=0.00


Fold 0 - Training Loss: 0.9958
Fold 0 - Training Accuracy: 0.5207
Fold 0 - Training F1 Score: 0.2649
Fold 0 - Validation Loss: 0.9758 Acc: 0.5363 F1: 0.2518
Fold 0 - New best model saved with F1: 0.2518, Acc: 0.5363 at Epoch 2


Fold 0 - Epoch 3/50: 100%|█| 123/123 [00:48<00:00,  2.51batch/s, accuracy=0.625, loss=0.862, lr=0.00


Fold 0 - Training Loss: 0.9968
Fold 0 - Training Accuracy: 0.5293
Fold 0 - Training F1 Score: 0.2868
Fold 0 - Validation Loss: 0.9746 Acc: 0.5302 F1: 0.2349


Fold 0 - Epoch 4/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.438, loss=0.998, lr=0.00


Fold 0 - Training Loss: 0.9940
Fold 0 - Training Accuracy: 0.5313
Fold 0 - Training F1 Score: 0.2958
Fold 0 - Validation Loss: 0.9732 Acc: 0.5423 F1: 0.3592
Fold 0 - New best model saved with F1: 0.3592, Acc: 0.5423 at Epoch 4


Fold 0 - Epoch 5/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.375, loss=1.182, lr=0.00


Fold 0 - Training Loss: 0.9880
Fold 0 - Training Accuracy: 0.5303
Fold 0 - Training F1 Score: 0.3176
Fold 0 - Validation Loss: 0.9733 Acc: 0.5302 F1: 0.2349


Fold 0 - Epoch 6/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.625, loss=0.973, lr=0.00


Fold 0 - Training Loss: 0.9607
Fold 0 - Training Accuracy: 0.5611
Fold 0 - Training F1 Score: 0.3610
Fold 0 - Validation Loss: 0.9161 Acc: 0.5847 F1: 0.4202
Fold 0 - New best model saved with F1: 0.4202, Acc: 0.5847 at Epoch 6


Fold 0 - Epoch 7/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.721, lr=0.00


Fold 0 - Training Loss: 0.9184
Fold 0 - Training Accuracy: 0.6071
Fold 0 - Training F1 Score: 0.4307
Fold 0 - Validation Loss: 0.8855 Acc: 0.6391 F1: 0.4466
Fold 0 - New best model saved with F1: 0.4466, Acc: 0.6391 at Epoch 7


Fold 0 - Epoch 8/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.720, lr=0.00


Fold 0 - Training Loss: 0.8726
Fold 0 - Training Accuracy: 0.6601
Fold 0 - Training F1 Score: 0.4809
Fold 0 - Validation Loss: 0.8720 Acc: 0.6351 F1: 0.4533
Fold 0 - New best model saved with F1: 0.4533, Acc: 0.6351 at Epoch 8


Fold 0 - Epoch 9/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.750, loss=0.718, lr=0.00


Fold 0 - Training Loss: 0.8542
Fold 0 - Training Accuracy: 0.6773
Fold 0 - Training F1 Score: 0.4825
Fold 0 - Validation Loss: 0.8712 Acc: 0.6230 F1: 0.4474


Fold 0 - Epoch 10/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.750, loss=0.766, lr=0.0


Fold 0 - Training Loss: 0.8430
Fold 0 - Training Accuracy: 0.6894
Fold 0 - Training F1 Score: 0.4999
Fold 0 - Validation Loss: 0.8416 Acc: 0.6976 F1: 0.4952
Fold 0 - New best model saved with F1: 0.4952, Acc: 0.6976 at Epoch 10


Fold 0 - Epoch 11/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.750, loss=0.844, lr=0.0


Fold 0 - Training Loss: 0.8195
Fold 0 - Training Accuracy: 0.6975
Fold 0 - Training F1 Score: 0.5049
Fold 0 - Validation Loss: 0.8404 Acc: 0.6895 F1: 0.4863


Fold 0 - Epoch 12/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.750, loss=0.817, lr=0.0


Fold 0 - Training Loss: 0.7978
Fold 0 - Training Accuracy: 0.7217
Fold 0 - Training F1 Score: 0.5207
Fold 0 - Validation Loss: 0.8545 Acc: 0.6754 F1: 0.4693


Fold 0 - Epoch 13/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.562, loss=0.999, lr=0.0


Fold 0 - Training Loss: 0.7964
Fold 0 - Training Accuracy: 0.7253
Fold 0 - Training F1 Score: 0.5248
Fold 0 - Validation Loss: 0.8117 Acc: 0.7097 F1: 0.5047
Fold 0 - New best model saved with F1: 0.5047, Acc: 0.7097 at Epoch 13


Fold 0 - Epoch 14/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.562, loss=1.072, lr=0.0


Fold 0 - Training Loss: 0.7667
Fold 0 - Training Accuracy: 0.7359
Fold 0 - Training F1 Score: 0.5306
Fold 0 - Validation Loss: 0.8154 Acc: 0.7036 F1: 0.4993


Fold 0 - Epoch 15/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.750, loss=0.703, lr=0.0


Fold 0 - Training Loss: 0.7442
Fold 0 - Training Accuracy: 0.7510
Fold 0 - Training F1 Score: 0.5485
Fold 0 - Validation Loss: 0.8277 Acc: 0.6835 F1: 0.4899


Fold 0 - Epoch 16/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.546, lr=0.0


Fold 0 - Training Loss: 0.7344
Fold 0 - Training Accuracy: 0.7566
Fold 0 - Training F1 Score: 0.5589
Fold 0 - Validation Loss: 0.8057 Acc: 0.7036 F1: 0.5026


Fold 0 - Epoch 17/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.688, loss=0.779, lr=0.0


Fold 0 - Training Loss: 0.7135
Fold 0 - Training Accuracy: 0.7672
Fold 0 - Training F1 Score: 0.5852
Fold 0 - Validation Loss: 0.8084 Acc: 0.7016 F1: 0.5103
Fold 0 - New best model saved with F1: 0.5103, Acc: 0.7016 at Epoch 17


Fold 0 - Epoch 18/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.562, loss=0.965, lr=0.0


Fold 0 - Training Loss: 0.6720
Fold 0 - Training Accuracy: 0.7955
Fold 0 - Training F1 Score: 0.6694
Fold 0 - Validation Loss: 0.8118 Acc: 0.6835 F1: 0.5765
Fold 0 - New best model saved with F1: 0.5765, Acc: 0.6835 at Epoch 18


Fold 0 - Epoch 19/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.625, loss=0.943, lr=0.0


Fold 0 - Training Loss: 0.6643
Fold 0 - Training Accuracy: 0.8030
Fold 0 - Training F1 Score: 0.7025
Fold 0 - Validation Loss: 0.8094 Acc: 0.6754 F1: 0.5526


Fold 0 - Epoch 20/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.750, loss=0.658, lr=0.0


Fold 0 - Training Loss: 0.6602
Fold 0 - Training Accuracy: 0.8020
Fold 0 - Training F1 Score: 0.7248
Fold 0 - Validation Loss: 0.8125 Acc: 0.6754 F1: 0.5667


Fold 0 - Epoch 21/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.875, loss=0.643, lr=0.0


Fold 0 - Training Loss: 0.6222
Fold 0 - Training Accuracy: 0.8369
Fold 0 - Training F1 Score: 0.7815
Fold 0 - Validation Loss: 0.8232 Acc: 0.6552 F1: 0.5729


Fold 0 - Epoch 22/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.621, lr=0.0


Fold 0 - Training Loss: 0.6202
Fold 0 - Training Accuracy: 0.8338
Fold 0 - Training F1 Score: 0.7775
Fold 0 - Validation Loss: 0.7983 Acc: 0.6976 F1: 0.6103
Fold 0 - New best model saved with F1: 0.6103, Acc: 0.6976 at Epoch 22


Fold 0 - Epoch 23/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.483, lr=0.0


Fold 0 - Training Loss: 0.5944
Fold 0 - Training Accuracy: 0.8540
Fold 0 - Training F1 Score: 0.8068
Fold 0 - Validation Loss: 0.8180 Acc: 0.6653 F1: 0.5719


Fold 0 - Epoch 24/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=1.000, loss=0.316, lr=0.0


Fold 0 - Training Loss: 0.5740
Fold 0 - Training Accuracy: 0.8662
Fold 0 - Training F1 Score: 0.8208
Fold 0 - Validation Loss: 0.8256 Acc: 0.6593 F1: 0.5688


Fold 0 - Epoch 25/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.750, loss=0.696, lr=0.0


Fold 0 - Training Loss: 0.5607
Fold 0 - Training Accuracy: 0.8768
Fold 0 - Training F1 Score: 0.8463
Fold 0 - Validation Loss: 0.7933 Acc: 0.6976 F1: 0.6035


Fold 0 - Epoch 26/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.938, loss=0.471, lr=0.0


Fold 0 - Training Loss: 0.5718
Fold 0 - Training Accuracy: 0.8667
Fold 0 - Training F1 Score: 0.8248
Fold 0 - Validation Loss: 0.7927 Acc: 0.6875 F1: 0.5895


Fold 0 - Epoch 27/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.875, loss=0.505, lr=0.0


Fold 0 - Training Loss: 0.5402
Fold 0 - Training Accuracy: 0.8859
Fold 0 - Training F1 Score: 0.8579
Fold 0 - Validation Loss: 0.7878 Acc: 0.6915 F1: 0.6174
Fold 0 - New best model saved with F1: 0.6174, Acc: 0.6915 at Epoch 27


Fold 0 - Epoch 28/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.938, loss=0.483, lr=0.0


Fold 0 - Training Loss: 0.5359
Fold 0 - Training Accuracy: 0.8944
Fold 0 - Training F1 Score: 0.8708
Fold 0 - Validation Loss: 0.8163 Acc: 0.6673 F1: 0.5886


Fold 0 - Epoch 29/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.619, lr=0.0


Fold 0 - Training Loss: 0.5268
Fold 0 - Training Accuracy: 0.8955
Fold 0 - Training F1 Score: 0.8756
Fold 0 - Validation Loss: 0.8001 Acc: 0.6935 F1: 0.5896


Fold 0 - Epoch 30/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.938, loss=0.584, lr=0.0


Fold 0 - Training Loss: 0.5238
Fold 0 - Training Accuracy: 0.8990
Fold 0 - Training F1 Score: 0.8798
Fold 0 - Validation Loss: 0.8058 Acc: 0.6815 F1: 0.5872


Fold 0 - Epoch 31/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.591, lr=0.0


Fold 0 - Training Loss: 0.5211
Fold 0 - Training Accuracy: 0.8919
Fold 0 - Training F1 Score: 0.8721
Fold 0 - Validation Loss: 0.7993 Acc: 0.6895 F1: 0.5974


Fold 0 - Epoch 32/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.875, loss=0.615, lr=0.0


Fold 0 - Training Loss: 0.5096
Fold 0 - Training Accuracy: 0.9005
Fold 0 - Training F1 Score: 0.8804
Fold 0 - Validation Loss: 0.7905 Acc: 0.6875 F1: 0.6074


Fold 0 - Epoch 33/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.875, loss=0.528, lr=0.0


Fold 0 - Training Loss: 0.5000
Fold 0 - Training Accuracy: 0.9066
Fold 0 - Training F1 Score: 0.8923
Fold 0 - Validation Loss: 0.8088 Acc: 0.6794 F1: 0.5864
Fold 0 - Early stopping triggered after 33 epochs. Best model was at Epoch 27.
Fold 1 完成，最佳F1: 0.6174, 最佳准确率: 0.6915

标签分布检查:
原始数据集分布: [0.52907916 0.14903069 0.32189015]
训练集分布: [0.52902574 0.14891469 0.32205957]
验证集分布: [0.52929293 0.14949495 0.32121212]




当前使用的 tokenizer 类型： <class 'transformers.models.deberta_v2.tokenization_deberta_v2_fast.DebertaV2TokenizerFast'>
当前使用的 tokenizer 类型： <class 'transformers.models.deberta_v2.tokenization_deberta_v2_fast.DebertaV2TokenizerFast'>


Fold 1 - Epoch 1/50: 100%|█| 123/123 [00:49<00:00,  2.49batch/s, accuracy=0.500, loss=0.955, lr=0.00


Fold 1 - Training Loss: 1.1014
Fold 1 - Training Accuracy: 0.3776
Fold 1 - Training F1 Score: 0.3316
Fold 1 - Validation Loss: 0.9907 Acc: 0.5192 F1: 0.2287
Fold 1 - New best model saved with F1: 0.2287, Acc: 0.5192 at Epoch 1


Fold 1 - Epoch 2/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.500, loss=1.032, lr=0.00


Fold 1 - Training Loss: 1.0019
Fold 1 - Training Accuracy: 0.5154
Fold 1 - Training F1 Score: 0.2782
Fold 1 - Validation Loss: 0.9818 Acc: 0.5232 F1: 0.2328
Fold 1 - New best model saved with F1: 0.2328, Acc: 0.5232 at Epoch 2


Fold 1 - Epoch 3/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.625, loss=0.871, lr=0.00


Fold 1 - Training Loss: 0.9908
Fold 1 - Training Accuracy: 0.5346
Fold 1 - Training F1 Score: 0.2951
Fold 1 - Validation Loss: 0.9758 Acc: 0.5212 F1: 0.2322


Fold 1 - Epoch 4/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.625, loss=0.999, lr=0.00


Fold 1 - Training Loss: 0.9873
Fold 1 - Training Accuracy: 0.5331
Fold 1 - Training F1 Score: 0.3171
Fold 1 - Validation Loss: 0.9781 Acc: 0.5253 F1: 0.2334
Fold 1 - New best model saved with F1: 0.2334, Acc: 0.5253 at Epoch 4


Fold 1 - Epoch 5/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.750, loss=0.842, lr=0.00


Fold 1 - Training Loss: 0.9861
Fold 1 - Training Accuracy: 0.5265
Fold 1 - Training F1 Score: 0.2928
Fold 1 - Validation Loss: 0.9730 Acc: 0.5475 F1: 0.2955
Fold 1 - New best model saved with F1: 0.2955, Acc: 0.5475 at Epoch 5


Fold 1 - Epoch 6/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.562, loss=0.970, lr=0.00


Fold 1 - Training Loss: 0.9993
Fold 1 - Training Accuracy: 0.5300
Fold 1 - Training F1 Score: 0.3161
Fold 1 - Validation Loss: 0.9742 Acc: 0.5253 F1: 0.2407


Fold 1 - Epoch 7/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.750, loss=0.820, lr=0.00


Fold 1 - Training Loss: 0.9323
Fold 1 - Training Accuracy: 0.6083
Fold 1 - Training F1 Score: 0.4174
Fold 1 - Validation Loss: 0.8640 Acc: 0.6586 F1: 0.4731
Fold 1 - New best model saved with F1: 0.4731, Acc: 0.6586 at Epoch 7


Fold 1 - Epoch 8/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.500, loss=1.041, lr=0.00


Fold 1 - Training Loss: 0.9273
Fold 1 - Training Accuracy: 0.6128
Fold 1 - Training F1 Score: 0.4177
Fold 1 - Validation Loss: 0.9934 Acc: 0.5253 F1: 0.2407


Fold 1 - Epoch 9/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.562, loss=0.957, lr=0.00


Fold 1 - Training Loss: 0.9996
Fold 1 - Training Accuracy: 0.5295
Fold 1 - Training F1 Score: 0.2557
Fold 1 - Validation Loss: 0.9818 Acc: 0.5253 F1: 0.2407


Fold 1 - Epoch 10/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.312, loss=1.101, lr=0.0


Fold 1 - Training Loss: 0.9967
Fold 1 - Training Accuracy: 0.5305
Fold 1 - Training F1 Score: 0.2844
Fold 1 - Validation Loss: 0.9795 Acc: 0.5253 F1: 0.2407


Fold 1 - Epoch 11/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.688, loss=0.876, lr=0.0


Fold 1 - Training Loss: 0.9890
Fold 1 - Training Accuracy: 0.5260
Fold 1 - Training F1 Score: 0.2870
Fold 1 - Validation Loss: 0.9627 Acc: 0.5253 F1: 0.2407


Fold 1 - Epoch 12/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.625, loss=0.862, lr=0.0


Fold 1 - Training Loss: 0.9490
Fold 1 - Training Accuracy: 0.5861
Fold 1 - Training F1 Score: 0.3800
Fold 1 - Validation Loss: 0.8414 Acc: 0.7192 F1: 0.5135
Fold 1 - New best model saved with F1: 0.5135, Acc: 0.7192 at Epoch 12


Fold 1 - Epoch 13/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.625, loss=1.000, lr=0.0


Fold 1 - Training Loss: 0.8514
Fold 1 - Training Accuracy: 0.6648
Fold 1 - Training F1 Score: 0.4725
Fold 1 - Validation Loss: 0.7946 Acc: 0.7313 F1: 0.5246
Fold 1 - New best model saved with F1: 0.5246, Acc: 0.7313 at Epoch 13


Fold 1 - Epoch 14/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.500, loss=1.034, lr=0.0


Fold 1 - Training Loss: 0.8319
Fold 1 - Training Accuracy: 0.7012
Fold 1 - Training F1 Score: 0.5062
Fold 1 - Validation Loss: 0.7950 Acc: 0.7293 F1: 0.5229


Fold 1 - Epoch 15/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.750, loss=0.761, lr=0.0


Fold 1 - Training Loss: 0.8384
Fold 1 - Training Accuracy: 0.6921
Fold 1 - Training F1 Score: 0.4956
Fold 1 - Validation Loss: 0.8062 Acc: 0.7111 F1: 0.5091


Fold 1 - Epoch 16/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.688, loss=0.814, lr=0.0


Fold 1 - Training Loss: 0.8186
Fold 1 - Training Accuracy: 0.7092
Fold 1 - Training F1 Score: 0.5084
Fold 1 - Validation Loss: 0.7785 Acc: 0.7414 F1: 0.5314
Fold 1 - New best model saved with F1: 0.5314, Acc: 0.7414 at Epoch 16


Fold 1 - Epoch 17/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.692, lr=0.0


Fold 1 - Training Loss: 0.7989
Fold 1 - Training Accuracy: 0.7224
Fold 1 - Training F1 Score: 0.5206
Fold 1 - Validation Loss: 0.7740 Acc: 0.7414 F1: 0.5323
Fold 1 - New best model saved with F1: 0.5323, Acc: 0.7414 at Epoch 17


Fold 1 - Epoch 18/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.625, loss=0.907, lr=0.0


Fold 1 - Training Loss: 0.7981
Fold 1 - Training Accuracy: 0.7092
Fold 1 - Training F1 Score: 0.5089
Fold 1 - Validation Loss: 0.7665 Acc: 0.7495 F1: 0.5386
Fold 1 - New best model saved with F1: 0.5386, Acc: 0.7495 at Epoch 18


Fold 1 - Epoch 19/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.750, loss=0.934, lr=0.0


Fold 1 - Training Loss: 0.7809
Fold 1 - Training Accuracy: 0.7345
Fold 1 - Training F1 Score: 0.5321
Fold 1 - Validation Loss: 0.7812 Acc: 0.7333 F1: 0.5238


Fold 1 - Epoch 20/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.688, loss=0.890, lr=0.0


Fold 1 - Training Loss: 0.7512
Fold 1 - Training Accuracy: 0.7532
Fold 1 - Training F1 Score: 0.5457
Fold 1 - Validation Loss: 0.7819 Acc: 0.7293 F1: 0.5228


Fold 1 - Epoch 21/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.688, loss=0.768, lr=0.0


Fold 1 - Training Loss: 0.7551
Fold 1 - Training Accuracy: 0.7501
Fold 1 - Training F1 Score: 0.5472
Fold 1 - Validation Loss: 0.7723 Acc: 0.7333 F1: 0.5267


Fold 1 - Epoch 22/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.688, loss=0.989, lr=0.0


Fold 1 - Training Loss: 0.7415
Fold 1 - Training Accuracy: 0.7617
Fold 1 - Training F1 Score: 0.5603
Fold 1 - Validation Loss: 0.7664 Acc: 0.7394 F1: 0.5312


Fold 1 - Epoch 23/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.875, loss=0.506, lr=0.0


Fold 1 - Training Loss: 0.7368
Fold 1 - Training Accuracy: 0.7643
Fold 1 - Training F1 Score: 0.5624
Fold 1 - Validation Loss: 0.7502 Acc: 0.7455 F1: 0.5364


Fold 1 - Epoch 24/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.875, loss=0.534, lr=0.0


Fold 1 - Training Loss: 0.7291
Fold 1 - Training Accuracy: 0.7557
Fold 1 - Training F1 Score: 0.5584
Fold 1 - Validation Loss: 0.7719 Acc: 0.7172 F1: 0.5147
Fold 1 - Early stopping triggered after 24 epochs. Best model was at Epoch 18.
Fold 2 完成，最佳F1: 0.5386, 最佳准确率: 0.7495

标签分布检查:
原始数据集分布: [0.52907916 0.14903069 0.32189015]
训练集分布: [0.52902574 0.14891469 0.32205957]
验证集分布: [0.52929293 0.14949495 0.32121212]




当前使用的 tokenizer 类型： <class 'transformers.models.deberta_v2.tokenization_deberta_v2_fast.DebertaV2TokenizerFast'>
当前使用的 tokenizer 类型： <class 'transformers.models.deberta_v2.tokenization_deberta_v2_fast.DebertaV2TokenizerFast'>


Fold 2 - Epoch 1/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.562, loss=1.013, lr=0.00


Fold 2 - Training Loss: 1.0921
Fold 2 - Training Accuracy: 0.3720
Fold 2 - Training F1 Score: 0.3277
Fold 2 - Validation Loss: 0.9930 Acc: 0.5172 F1: 0.2285
Fold 2 - New best model saved with F1: 0.2285, Acc: 0.5172 at Epoch 1


Fold 2 - Epoch 2/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.625, loss=0.949, lr=0.00


Fold 2 - Training Loss: 0.9954
Fold 2 - Training Accuracy: 0.5215
Fold 2 - Training F1 Score: 0.3123
Fold 2 - Validation Loss: 0.9761 Acc: 0.5293 F1: 0.2307
Fold 2 - New best model saved with F1: 0.2307, Acc: 0.5293 at Epoch 2


Fold 2 - Epoch 3/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.688, loss=1.005, lr=0.00


Fold 2 - Training Loss: 0.9925
Fold 2 - Training Accuracy: 0.5265
Fold 2 - Training F1 Score: 0.3002
Fold 2 - Validation Loss: 0.9759 Acc: 0.5293 F1: 0.2307


Fold 2 - Epoch 4/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.438, loss=1.023, lr=0.00


Fold 2 - Training Loss: 0.9890
Fold 2 - Training Accuracy: 0.5285
Fold 2 - Training F1 Score: 0.3090
Fold 2 - Validation Loss: 0.9716 Acc: 0.5293 F1: 0.2307


Fold 2 - Epoch 5/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.750, loss=0.754, lr=0.00


Fold 2 - Training Loss: 0.9914
Fold 2 - Training Accuracy: 0.5315
Fold 2 - Training F1 Score: 0.3219
Fold 2 - Validation Loss: 0.9627 Acc: 0.5596 F1: 0.2957
Fold 2 - New best model saved with F1: 0.2957, Acc: 0.5596 at Epoch 5


Fold 2 - Epoch 6/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.750, loss=0.786, lr=0.00


Fold 2 - Training Loss: 0.9631
Fold 2 - Training Accuracy: 0.5628
Fold 2 - Training F1 Score: 0.3676
Fold 2 - Validation Loss: 0.8820 Acc: 0.6465 F1: 0.4495
Fold 2 - New best model saved with F1: 0.4495, Acc: 0.6465 at Epoch 6


Fold 2 - Epoch 7/50: 100%|█| 123/123 [00:49<00:00,  2.50batch/s, accuracy=0.500, loss=0.975, lr=0.00


Fold 2 - Training Loss: 0.8912
Fold 2 - Training Accuracy: 0.6355
Fold 2 - Training F1 Score: 0.4489
Fold 2 - Validation Loss: 0.8490 Acc: 0.6848 F1: 0.4806
Fold 2 - New best model saved with F1: 0.4806, Acc: 0.6848 at Epoch 7


Fold 2 - Epoch 8/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.688, loss=0.908, lr=0.00


Fold 2 - Training Loss: 0.8498
Fold 2 - Training Accuracy: 0.6774
Fold 2 - Training F1 Score: 0.4858
Fold 2 - Validation Loss: 0.8077 Acc: 0.7131 F1: 0.5119
Fold 2 - New best model saved with F1: 0.5119, Acc: 0.7131 at Epoch 8


Fold 2 - Epoch 9/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.750, loss=0.759, lr=0.00


Fold 2 - Training Loss: 0.8336
Fold 2 - Training Accuracy: 0.6936
Fold 2 - Training F1 Score: 0.4967
Fold 2 - Validation Loss: 0.8416 Acc: 0.6848 F1: 0.4858


Fold 2 - Epoch 10/50: 100%|█| 123/123 [00:49<00:00,  2.50batch/s, accuracy=0.688, loss=0.744, lr=0.0


Fold 2 - Training Loss: 0.8026
Fold 2 - Training Accuracy: 0.7092
Fold 2 - Training F1 Score: 0.5171
Fold 2 - Validation Loss: 0.7976 Acc: 0.7091 F1: 0.5007


Fold 2 - Epoch 11/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.602, lr=0.0


Fold 2 - Training Loss: 0.7648
Fold 2 - Training Accuracy: 0.7405
Fold 2 - Training F1 Score: 0.5448
Fold 2 - Validation Loss: 0.8167 Acc: 0.7071 F1: 0.4982


Fold 2 - Epoch 12/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.688, loss=0.839, lr=0.0


Fold 2 - Training Loss: 0.7511
Fold 2 - Training Accuracy: 0.7532
Fold 2 - Training F1 Score: 0.5760
Fold 2 - Validation Loss: 0.7738 Acc: 0.7232 F1: 0.5157
Fold 2 - New best model saved with F1: 0.5157, Acc: 0.7232 at Epoch 12


Fold 2 - Epoch 13/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.777, lr=0.0


Fold 2 - Training Loss: 0.7387
Fold 2 - Training Accuracy: 0.7633
Fold 2 - Training F1 Score: 0.6049
Fold 2 - Validation Loss: 0.8010 Acc: 0.7071 F1: 0.4976


Fold 2 - Epoch 14/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.581, lr=0.0


Fold 2 - Training Loss: 0.7348
Fold 2 - Training Accuracy: 0.7547
Fold 2 - Training F1 Score: 0.6449
Fold 2 - Validation Loss: 0.7590 Acc: 0.7455 F1: 0.5351
Fold 2 - New best model saved with F1: 0.5351, Acc: 0.7455 at Epoch 14


Fold 2 - Epoch 15/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.562, loss=0.953, lr=0.0


Fold 2 - Training Loss: 0.7069
Fold 2 - Training Accuracy: 0.7744
Fold 2 - Training F1 Score: 0.6704
Fold 2 - Validation Loss: 0.7613 Acc: 0.7232 F1: 0.6071
Fold 2 - New best model saved with F1: 0.6071, Acc: 0.7232 at Epoch 15


Fold 2 - Epoch 16/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.688, loss=0.702, lr=0.0


Fold 2 - Training Loss: 0.6958
Fold 2 - Training Accuracy: 0.7910
Fold 2 - Training F1 Score: 0.7101
Fold 2 - Validation Loss: 0.7684 Acc: 0.7111 F1: 0.6109
Fold 2 - New best model saved with F1: 0.6109, Acc: 0.7111 at Epoch 16


Fold 2 - Epoch 17/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.938, loss=0.572, lr=0.0


Fold 2 - Training Loss: 0.6685
Fold 2 - Training Accuracy: 0.8147
Fold 2 - Training F1 Score: 0.7438
Fold 2 - Validation Loss: 0.7493 Acc: 0.7374 F1: 0.6126
Fold 2 - New best model saved with F1: 0.6126, Acc: 0.7374 at Epoch 17


Fold 2 - Epoch 18/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.750, loss=0.760, lr=0.0


Fold 2 - Training Loss: 0.6436
Fold 2 - Training Accuracy: 0.8173
Fold 2 - Training F1 Score: 0.7523
Fold 2 - Validation Loss: 0.7524 Acc: 0.7374 F1: 0.5968


Fold 2 - Epoch 19/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.875, loss=0.556, lr=0.0


Fold 2 - Training Loss: 0.6514
Fold 2 - Training Accuracy: 0.8173
Fold 2 - Training F1 Score: 0.7626
Fold 2 - Validation Loss: 0.7767 Acc: 0.7071 F1: 0.5694


Fold 2 - Epoch 20/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.875, loss=0.525, lr=0.0


Fold 2 - Training Loss: 0.6208
Fold 2 - Training Accuracy: 0.8284
Fold 2 - Training F1 Score: 0.7864
Fold 2 - Validation Loss: 0.7543 Acc: 0.7374 F1: 0.6341
Fold 2 - New best model saved with F1: 0.6341, Acc: 0.7374 at Epoch 20


Fold 2 - Epoch 21/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.541, lr=0.0


Fold 2 - Training Loss: 0.6084
Fold 2 - Training Accuracy: 0.8415
Fold 2 - Training F1 Score: 0.7973
Fold 2 - Validation Loss: 0.7584 Acc: 0.7293 F1: 0.5686


Fold 2 - Epoch 22/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.625, loss=1.106, lr=0.0


Fold 2 - Training Loss: 0.5775
Fold 2 - Training Accuracy: 0.8612
Fold 2 - Training F1 Score: 0.8281
Fold 2 - Validation Loss: 0.7586 Acc: 0.7273 F1: 0.6115


Fold 2 - Epoch 23/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.875, loss=0.534, lr=0.0


Fold 2 - Training Loss: 0.5773
Fold 2 - Training Accuracy: 0.8627
Fold 2 - Training F1 Score: 0.8325
Fold 2 - Validation Loss: 0.7535 Acc: 0.7192 F1: 0.6331


Fold 2 - Epoch 24/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.554, lr=0.0


Fold 2 - Training Loss: 0.5621
Fold 2 - Training Accuracy: 0.8688
Fold 2 - Training F1 Score: 0.8389
Fold 2 - Validation Loss: 0.7568 Acc: 0.7293 F1: 0.5786


Fold 2 - Epoch 25/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.875, loss=0.632, lr=0.0


Fold 2 - Training Loss: 0.5535
Fold 2 - Training Accuracy: 0.8819
Fold 2 - Training F1 Score: 0.8582
Fold 2 - Validation Loss: 0.7527 Acc: 0.7273 F1: 0.6112


Fold 2 - Epoch 26/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.938, loss=0.471, lr=0.0


Fold 2 - Training Loss: 0.5401
Fold 2 - Training Accuracy: 0.8844
Fold 2 - Training F1 Score: 0.8630
Fold 2 - Validation Loss: 0.7505 Acc: 0.7475 F1: 0.5736
Fold 2 - Early stopping triggered after 26 epochs. Best model was at Epoch 20.
Fold 3 完成，最佳F1: 0.6341, 最佳准确率: 0.7374

标签分布检查:
原始数据集分布: [0.52907916 0.14903069 0.32189015]
训练集分布: [0.52902574 0.14891469 0.32205957]
验证集分布: [0.52929293 0.14949495 0.32121212]




当前使用的 tokenizer 类型： <class 'transformers.models.deberta_v2.tokenization_deberta_v2_fast.DebertaV2TokenizerFast'>
当前使用的 tokenizer 类型： <class 'transformers.models.deberta_v2.tokenization_deberta_v2_fast.DebertaV2TokenizerFast'>


Fold 3 - Epoch 1/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.438, loss=1.082, lr=0.00


Fold 3 - Training Loss: 1.0895
Fold 3 - Training Accuracy: 0.3796
Fold 3 - Training F1 Score: 0.3351
Fold 3 - Validation Loss: 0.9927 Acc: 0.5333 F1: 0.2498
Fold 3 - New best model saved with F1: 0.2498, Acc: 0.5333 at Epoch 1


Fold 3 - Epoch 2/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.688, loss=0.889, lr=0.00


Fold 3 - Training Loss: 0.9941
Fold 3 - Training Accuracy: 0.5209
Fold 3 - Training F1 Score: 0.2919
Fold 3 - Validation Loss: 0.9790 Acc: 0.5293 F1: 0.2307


Fold 3 - Epoch 3/50: 100%|█| 123/123 [00:49<00:00,  2.50batch/s, accuracy=0.750, loss=0.710, lr=0.00


Fold 3 - Training Loss: 0.9948
Fold 3 - Training Accuracy: 0.5164
Fold 3 - Training F1 Score: 0.3141
Fold 3 - Validation Loss: 0.9765 Acc: 0.5212 F1: 0.2882
Fold 3 - New best model saved with F1: 0.2882, Acc: 0.5212 at Epoch 3


Fold 3 - Epoch 4/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.500, loss=0.957, lr=0.00


Fold 3 - Training Loss: 0.9873
Fold 3 - Training Accuracy: 0.5356
Fold 3 - Training F1 Score: 0.3403
Fold 3 - Validation Loss: 0.9787 Acc: 0.4990 F1: 0.3323
Fold 3 - New best model saved with F1: 0.3323, Acc: 0.4990 at Epoch 4


Fold 3 - Epoch 5/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.562, loss=1.016, lr=0.00


Fold 3 - Training Loss: 0.9656
Fold 3 - Training Accuracy: 0.5462
Fold 3 - Training F1 Score: 0.3604
Fold 3 - Validation Loss: 0.9624 Acc: 0.5495 F1: 0.2733


Fold 3 - Epoch 6/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.688, loss=0.809, lr=0.00


Fold 3 - Training Loss: 0.8942
Fold 3 - Training Accuracy: 0.6421
Fold 3 - Training F1 Score: 0.4569
Fold 3 - Validation Loss: 0.8985 Acc: 0.6485 F1: 0.4340
Fold 3 - New best model saved with F1: 0.4340, Acc: 0.6485 at Epoch 6


Fold 3 - Epoch 7/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.653, lr=0.00


Fold 3 - Training Loss: 0.8315
Fold 3 - Training Accuracy: 0.6951
Fold 3 - Training F1 Score: 0.4973
Fold 3 - Validation Loss: 0.8479 Acc: 0.6586 F1: 0.4723
Fold 3 - New best model saved with F1: 0.4723, Acc: 0.6586 at Epoch 7


Fold 3 - Epoch 8/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.750, loss=0.687, lr=0.00


Fold 3 - Training Loss: 0.7916
Fold 3 - Training Accuracy: 0.7219
Fold 3 - Training F1 Score: 0.5254
Fold 3 - Validation Loss: 0.8156 Acc: 0.7091 F1: 0.5048
Fold 3 - New best model saved with F1: 0.5048, Acc: 0.7091 at Epoch 8


Fold 3 - Epoch 9/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.625, loss=0.757, lr=0.00


Fold 3 - Training Loss: 0.7629
Fold 3 - Training Accuracy: 0.7420
Fold 3 - Training F1 Score: 0.5447
Fold 3 - Validation Loss: 0.8221 Acc: 0.7010 F1: 0.4941


Fold 3 - Epoch 10/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.438, loss=1.061, lr=0.0


Fold 3 - Training Loss: 0.7682
Fold 3 - Training Accuracy: 0.7385
Fold 3 - Training F1 Score: 0.5543
Fold 3 - Validation Loss: 0.7936 Acc: 0.7131 F1: 0.5076
Fold 3 - New best model saved with F1: 0.5076, Acc: 0.7131 at Epoch 10


Fold 3 - Epoch 11/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.688, loss=0.858, lr=0.0


Fold 3 - Training Loss: 0.7195
Fold 3 - Training Accuracy: 0.7597
Fold 3 - Training F1 Score: 0.5998
Fold 3 - Validation Loss: 0.8066 Acc: 0.7071 F1: 0.5953
Fold 3 - New best model saved with F1: 0.5953, Acc: 0.7071 at Epoch 11


Fold 3 - Epoch 12/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.623, lr=0.0


Fold 3 - Training Loss: 0.7298
Fold 3 - Training Accuracy: 0.7577
Fold 3 - Training F1 Score: 0.6506
Fold 3 - Validation Loss: 0.7958 Acc: 0.7253 F1: 0.6072
Fold 3 - New best model saved with F1: 0.6072, Acc: 0.7253 at Epoch 12


Fold 3 - Epoch 13/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.609, lr=0.0


Fold 3 - Training Loss: 0.7027
Fold 3 - Training Accuracy: 0.7754
Fold 3 - Training F1 Score: 0.6760
Fold 3 - Validation Loss: 0.7850 Acc: 0.7333 F1: 0.5986


Fold 3 - Epoch 14/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.875, loss=0.584, lr=0.0


Fold 3 - Training Loss: 0.6951
Fold 3 - Training Accuracy: 0.7784
Fold 3 - Training F1 Score: 0.6759
Fold 3 - Validation Loss: 0.8013 Acc: 0.6929 F1: 0.5996


Fold 3 - Epoch 15/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.713, lr=0.0


Fold 3 - Training Loss: 0.6930
Fold 3 - Training Accuracy: 0.7845
Fold 3 - Training F1 Score: 0.7107
Fold 3 - Validation Loss: 0.7961 Acc: 0.6970 F1: 0.6230
Fold 3 - New best model saved with F1: 0.6230, Acc: 0.6970 at Epoch 15


Fold 3 - Epoch 16/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.875, loss=0.570, lr=0.0


Fold 3 - Training Loss: 0.6448
Fold 3 - Training Accuracy: 0.8248
Fold 3 - Training F1 Score: 0.7658
Fold 3 - Validation Loss: 0.8208 Acc: 0.6768 F1: 0.6010


Fold 3 - Epoch 17/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.875, loss=0.676, lr=0.0


Fold 3 - Training Loss: 0.6575
Fold 3 - Training Accuracy: 0.8057
Fold 3 - Training F1 Score: 0.7495
Fold 3 - Validation Loss: 0.8141 Acc: 0.6929 F1: 0.6084


Fold 3 - Epoch 18/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.938, loss=0.477, lr=0.0


Fold 3 - Training Loss: 0.6186
Fold 3 - Training Accuracy: 0.8354
Fold 3 - Training F1 Score: 0.7902
Fold 3 - Validation Loss: 0.7811 Acc: 0.7131 F1: 0.6232
Fold 3 - New best model saved with F1: 0.6232, Acc: 0.7131 at Epoch 18


Fold 3 - Epoch 19/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=1.000, loss=0.492, lr=0.0


Fold 3 - Training Loss: 0.6084
Fold 3 - Training Accuracy: 0.8460
Fold 3 - Training F1 Score: 0.8039
Fold 3 - Validation Loss: 0.7887 Acc: 0.7212 F1: 0.5999


Fold 3 - Epoch 20/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.670, lr=0.0


Fold 3 - Training Loss: 0.5977
Fold 3 - Training Accuracy: 0.8486
Fold 3 - Training F1 Score: 0.8033
Fold 3 - Validation Loss: 0.8432 Acc: 0.6586 F1: 0.5851


Fold 3 - Epoch 21/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.617, lr=0.0


Fold 3 - Training Loss: 0.5933
Fold 3 - Training Accuracy: 0.8516
Fold 3 - Training F1 Score: 0.8131
Fold 3 - Validation Loss: 0.8096 Acc: 0.6889 F1: 0.6179


Fold 3 - Epoch 22/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.618, lr=0.0


Fold 3 - Training Loss: 0.5904
Fold 3 - Training Accuracy: 0.8511
Fold 3 - Training F1 Score: 0.8136
Fold 3 - Validation Loss: 0.7779 Acc: 0.7111 F1: 0.6306
Fold 3 - New best model saved with F1: 0.6306, Acc: 0.7111 at Epoch 22


Fold 3 - Epoch 23/50: 100%|█| 123/123 [00:48<00:00,  2.51batch/s, accuracy=0.812, loss=0.637, lr=0.0


Fold 3 - Training Loss: 0.5878
Fold 3 - Training Accuracy: 0.8455
Fold 3 - Training F1 Score: 0.8105
Fold 3 - Validation Loss: 0.8081 Acc: 0.6848 F1: 0.6099


Fold 3 - Epoch 24/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.750, loss=0.765, lr=0.0


Fold 3 - Training Loss: 0.5487
Fold 3 - Training Accuracy: 0.8829
Fold 3 - Training F1 Score: 0.8490
Fold 3 - Validation Loss: 0.7994 Acc: 0.6889 F1: 0.6093


Fold 3 - Epoch 25/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.938, loss=0.547, lr=0.0


Fold 3 - Training Loss: 0.5463
Fold 3 - Training Accuracy: 0.8799
Fold 3 - Training F1 Score: 0.8487
Fold 3 - Validation Loss: 0.7991 Acc: 0.6949 F1: 0.6042


Fold 3 - Epoch 26/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.812, loss=0.590, lr=0.0


Fold 3 - Training Loss: 0.5470
Fold 3 - Training Accuracy: 0.8768
Fold 3 - Training F1 Score: 0.8440
Fold 3 - Validation Loss: 0.8109 Acc: 0.6788 F1: 0.6050


Fold 3 - Epoch 27/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.875, loss=0.650, lr=0.0


Fold 3 - Training Loss: 0.5238
Fold 3 - Training Accuracy: 0.8884
Fold 3 - Training F1 Score: 0.8628
Fold 3 - Validation Loss: 0.7680 Acc: 0.7253 F1: 0.6188


Fold 3 - Epoch 28/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.938, loss=0.589, lr=0.0


Fold 3 - Training Loss: 0.5227
Fold 3 - Training Accuracy: 0.8925
Fold 3 - Training F1 Score: 0.8637
Fold 3 - Validation Loss: 0.7781 Acc: 0.7152 F1: 0.6137
Fold 3 - Early stopping triggered after 28 epochs. Best model was at Epoch 22.
Fold 4 完成，最佳F1: 0.6306, 最佳准确率: 0.7111

标签分布检查:
原始数据集分布: [0.52907916 0.14903069 0.32189015]
训练集分布: [0.52902574 0.14941949 0.32155477]
验证集分布: [0.52929293 0.14747475 0.32323232]




当前使用的 tokenizer 类型： <class 'transformers.models.deberta_v2.tokenization_deberta_v2_fast.DebertaV2TokenizerFast'>
当前使用的 tokenizer 类型： <class 'transformers.models.deberta_v2.tokenization_deberta_v2_fast.DebertaV2TokenizerFast'>


Fold 4 - Epoch 1/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.312, loss=1.267, lr=0.00


Fold 4 - Training Loss: 1.0769
Fold 4 - Training Accuracy: 0.3862
Fold 4 - Training F1 Score: 0.3427
Fold 4 - Validation Loss: 0.9906 Acc: 0.5192 F1: 0.2369
Fold 4 - New best model saved with F1: 0.2369, Acc: 0.5192 at Epoch 1


Fold 4 - Epoch 2/50: 100%|█| 123/123 [00:49<00:00,  2.50batch/s, accuracy=0.562, loss=0.954, lr=0.00


Fold 4 - Training Loss: 1.0023
Fold 4 - Training Accuracy: 0.5164
Fold 4 - Training F1 Score: 0.2696
Fold 4 - Validation Loss: 0.9726 Acc: 0.5313 F1: 0.2355


Fold 4 - Epoch 3/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.438, loss=1.105, lr=0.00


Fold 4 - Training Loss: 0.9865
Fold 4 - Training Accuracy: 0.5265
Fold 4 - Training F1 Score: 0.2814
Fold 4 - Validation Loss: 0.9728 Acc: 0.5313 F1: 0.2352


Fold 4 - Epoch 4/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.500, loss=1.090, lr=0.00


Fold 4 - Training Loss: 0.9915
Fold 4 - Training Accuracy: 0.5300
Fold 4 - Training F1 Score: 0.3061
Fold 4 - Validation Loss: 0.9694 Acc: 0.5293 F1: 0.2422
Fold 4 - New best model saved with F1: 0.2422, Acc: 0.5293 at Epoch 4


Fold 4 - Epoch 5/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.688, loss=0.890, lr=0.00


Fold 4 - Training Loss: 0.9811
Fold 4 - Training Accuracy: 0.5321
Fold 4 - Training F1 Score: 0.3043
Fold 4 - Validation Loss: 0.9645 Acc: 0.5253 F1: 0.2444
Fold 4 - New best model saved with F1: 0.2444, Acc: 0.5253 at Epoch 5


Fold 4 - Epoch 6/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.625, loss=0.869, lr=0.00


Fold 4 - Training Loss: 0.9650
Fold 4 - Training Accuracy: 0.5810
Fold 4 - Training F1 Score: 0.3943
Fold 4 - Validation Loss: 0.8579 Acc: 0.6949 F1: 0.4932
Fold 4 - New best model saved with F1: 0.4932, Acc: 0.6949 at Epoch 6


Fold 4 - Epoch 7/50: 100%|█| 123/123 [00:49<00:00,  2.50batch/s, accuracy=0.562, loss=1.036, lr=0.00


Fold 4 - Training Loss: 0.8740
Fold 4 - Training Accuracy: 0.6663
Fold 4 - Training F1 Score: 0.4710
Fold 4 - Validation Loss: 0.8934 Acc: 0.6444 F1: 0.4294


Fold 4 - Epoch 8/50: 100%|█| 123/123 [00:49<00:00,  2.50batch/s, accuracy=0.625, loss=0.888, lr=0.00


Fold 4 - Training Loss: 0.8716
Fold 4 - Training Accuracy: 0.6633
Fold 4 - Training F1 Score: 0.4746
Fold 4 - Validation Loss: 0.8410 Acc: 0.6828 F1: 0.4893


Fold 4 - Epoch 9/50: 100%|█| 123/123 [00:49<00:00,  2.50batch/s, accuracy=0.625, loss=0.911, lr=0.00


Fold 4 - Training Loss: 0.8396
Fold 4 - Training Accuracy: 0.6996
Fold 4 - Training F1 Score: 0.5000
Fold 4 - Validation Loss: 0.8010 Acc: 0.7232 F1: 0.5135
Fold 4 - New best model saved with F1: 0.5135, Acc: 0.7232 at Epoch 9


Fold 4 - Epoch 10/50: 100%|█| 123/123 [00:49<00:00,  2.50batch/s, accuracy=0.750, loss=0.680, lr=0.0


Fold 4 - Training Loss: 0.8130
Fold 4 - Training Accuracy: 0.7143
Fold 4 - Training F1 Score: 0.5146
Fold 4 - Validation Loss: 0.7971 Acc: 0.7192 F1: 0.5147
Fold 4 - New best model saved with F1: 0.5147, Acc: 0.7192 at Epoch 10


Fold 4 - Epoch 11/50: 100%|█| 123/123 [00:49<00:00,  2.50batch/s, accuracy=0.750, loss=0.800, lr=0.0


Fold 4 - Training Loss: 0.8067
Fold 4 - Training Accuracy: 0.7183
Fold 4 - Training F1 Score: 0.5194
Fold 4 - Validation Loss: 0.7878 Acc: 0.7253 F1: 0.5180
Fold 4 - New best model saved with F1: 0.5180, Acc: 0.7253 at Epoch 11


Fold 4 - Epoch 12/50: 100%|█| 123/123 [00:49<00:00,  2.50batch/s, accuracy=0.688, loss=0.934, lr=0.0


Fold 4 - Training Loss: 0.7894
Fold 4 - Training Accuracy: 0.7299
Fold 4 - Training F1 Score: 0.5315
Fold 4 - Validation Loss: 0.7939 Acc: 0.7232 F1: 0.5149


Fold 4 - Epoch 13/50: 100%|█| 123/123 [00:49<00:00,  2.50batch/s, accuracy=0.875, loss=0.610, lr=0.0


Fold 4 - Training Loss: 0.7936
Fold 4 - Training Accuracy: 0.7304
Fold 4 - Training F1 Score: 0.5309
Fold 4 - Validation Loss: 0.7874 Acc: 0.7232 F1: 0.5171


Fold 4 - Epoch 14/50: 100%|█| 123/123 [00:49<00:00,  2.50batch/s, accuracy=0.625, loss=0.991, lr=0.0


Fold 4 - Training Loss: 0.7940
Fold 4 - Training Accuracy: 0.7269
Fold 4 - Training F1 Score: 0.5268
Fold 4 - Validation Loss: 0.7937 Acc: 0.7192 F1: 0.5113


Fold 4 - Epoch 15/50: 100%|█| 123/123 [00:49<00:00,  2.50batch/s, accuracy=0.625, loss=0.977, lr=0.0


Fold 4 - Training Loss: 0.7622
Fold 4 - Training Accuracy: 0.7481
Fold 4 - Training F1 Score: 0.5433
Fold 4 - Validation Loss: 0.7773 Acc: 0.7293 F1: 0.5226
Fold 4 - New best model saved with F1: 0.5226, Acc: 0.7293 at Epoch 15


Fold 4 - Epoch 16/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.875, loss=0.486, lr=0.0


Fold 4 - Training Loss: 0.7565
Fold 4 - Training Accuracy: 0.7562
Fold 4 - Training F1 Score: 0.5491
Fold 4 - Validation Loss: 0.7969 Acc: 0.7131 F1: 0.5063


Fold 4 - Epoch 17/50: 100%|█| 123/123 [00:49<00:00,  2.50batch/s, accuracy=0.688, loss=0.819, lr=0.0


Fold 4 - Training Loss: 0.7490
Fold 4 - Training Accuracy: 0.7476
Fold 4 - Training F1 Score: 0.5464
Fold 4 - Validation Loss: 0.7696 Acc: 0.7354 F1: 0.5274
Fold 4 - New best model saved with F1: 0.5274, Acc: 0.7354 at Epoch 17


Fold 4 - Epoch 18/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.500, loss=1.016, lr=0.0


Fold 4 - Training Loss: 0.7429
Fold 4 - Training Accuracy: 0.7607
Fold 4 - Training F1 Score: 0.5584
Fold 4 - Validation Loss: 0.7956 Acc: 0.7111 F1: 0.5086


Fold 4 - Epoch 19/50: 100%|█| 123/123 [00:49<00:00,  2.51batch/s, accuracy=0.625, loss=1.028, lr=0.0


Fold 4 - Training Loss: 0.7705
Fold 4 - Training Accuracy: 0.7436
Fold 4 - Training F1 Score: 0.5435
Fold 4 - Validation Loss: 0.7991 Acc: 0.7111 F1: 0.5070


Fold 4 - Epoch 20/50: 100%|█| 123/123 [00:49<00:00,  2.50batch/s, accuracy=0.750, loss=0.765, lr=0.0


Fold 4 - Training Loss: 0.7561
Fold 4 - Training Accuracy: 0.7466
Fold 4 - Training F1 Score: 0.5455
Fold 4 - Validation Loss: 0.8018 Acc: 0.7030 F1: 0.5039


Fold 4 - Epoch 21/50: 100%|█| 123/123 [00:49<00:00,  2.50batch/s, accuracy=0.688, loss=0.834, lr=0.0


Fold 4 - Training Loss: 0.7485
Fold 4 - Training Accuracy: 0.7456
Fold 4 - Training F1 Score: 0.5412
Fold 4 - Validation Loss: 0.7821 Acc: 0.7232 F1: 0.5167


Fold 4 - Epoch 22/50: 100%|█| 123/123 [00:49<00:00,  2.50batch/s, accuracy=0.625, loss=0.941, lr=0.0


Fold 4 - Training Loss: 0.7481
Fold 4 - Training Accuracy: 0.7587
Fold 4 - Training F1 Score: 0.5583
Fold 4 - Validation Loss: 0.7740 Acc: 0.7313 F1: 0.5236


Fold 4 - Epoch 23/50: 100%|█| 123/123 [00:49<00:00,  2.50batch/s, accuracy=0.812, loss=0.641, lr=0.0


Fold 4 - Training Loss: 0.7501
Fold 4 - Training Accuracy: 0.7481
Fold 4 - Training F1 Score: 0.5478
Fold 4 - Validation Loss: 0.7707 Acc: 0.7354 F1: 0.5257
Fold 4 - Early stopping triggered after 23 epochs. Best model was at Epoch 17.
Fold 5 完成，最佳F1: 0.5274, 最佳准确率: 0.7354

交叉验证完成! 5折平均结果:
平均最佳F1: 0.5896
平均最佳准确率: 0.7250
平均最佳Epoch: 20.8
各折结果:
  Fold 1: F1=0.6174, Acc=0.6915, Best Epoch=27
  Fold 2: F1=0.5386, Acc=0.7495, Best Epoch=18
  Fold 3: F1=0.6341, Acc=0.7374, Best Epoch=20
  Fold 4: F1=0.6306, Acc=0.7111, Best Epoch=22
  Fold 5: F1=0.5274, Acc=0.7354, Best Epoch=17
