In [1]:
import os
# 设置环境变量，只让程序看到 GPU 2
os.environ['CUDA_VISIBLE_DEVICES'] = '1'


import torch
import torch.nn as nn
import wandb
import random
import argparse
import numpy as np
from tqdm import tqdm
from transformers import BertModel, AutoModel
from transformers import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


# 数据预处理函数

In [2]:
from torch.utils.data import Dataset
import json

class BAE2025Dataset(Dataset):
    def __init__(
            self,
            data_path,
            label_type="Actionability",  # 根据需要可以是 "Mistake_Identification", "Mistake_Location", "Providing_Guidance", "Actionability"
            labels={
                "Yes": 0,
                "To some extent": 1, 
                "No": 2,
            }
    ):
        self.data_path = data_path
        self.label_type = label_type
        self.labels = labels
        self._get_data()
    
    def _get_data(self):
        with open(self.data_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        self.data = []
        for item in data:
            sent1 = item['conversation_history']
            sent2 = item['response']
            
            # 检查item中是否直接包含我们需要的标签
            if self.label_type in item and item[self.label_type] in self.labels:
                self.data.append(((sent1, sent2), self.labels[item[self.label_type]]))
    
    def __len__(self):
        return len(self.data)
    
    def get_labels(self):
        return self.labels

    def __getitem__(self, idx):
        return self.data[idx]

# 数据加载函数

In [3]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import AutoConfig
from transformers import DebertaV2Tokenizer

class BAE2025DataLoader:
    def __init__(
        self,
        dataset,
        batch_size=16,
        max_length=512,
        shuffle=True,
        drop_last=True,
        device=None,
        # tokenizer_name='chinese-bert-wwm-ext'
        # tokenizer_name='chinese-roberta-wwm-ext'
        # tokenizer_name='chinese-roberta-wwm-ext-large'
        # tokenizer_name='/mnt/cfs/huangzhiwei/pykt-moekt/SBM/bge-large-en-v1.5'
        tokenizer_name='/mnt/cfs/huangzhiwei/BAE2025/models/deberta-v3-base'
        # tokenizer_name='/mnt/cfs/huangzhiwei/BAE2025/models/roberta-base'
    ):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.tokenizer.truncation_side = 'left'  # 设置截断方向为左侧,即从句子开头开始截断,假设一个句子过长，则从句子开头开始截断，保留句子结尾的部分
        print("当前使用的 tokenizer 类型：", type(self.tokenizer))
        
        # config = AutoConfig.from_pretrained(tokenizer_name)
        # self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, config=config, use_fast=True)
        
        
        # self.tokenizer = DebertaV2Tokenizer.from_pretrained(tokenizer_name)
        
        self.dataset = dataset
        self.batch_size = batch_size
        self.max_length = max_length
        self.shuffle = shuffle
        self.drop_last = drop_last

        if device is None:
            self.device = torch.device(
                'cuda' if torch.cuda.is_available() else 'cpu'
            )
        else:
            self.device = device

        self.loader = DataLoader(
            dataset=self.dataset,
            batch_size=self.batch_size,
            collate_fn=self.collate_fn,
            shuffle=self.shuffle,
            drop_last=self.drop_last
        )

    def collate_fn(self, data):
        sents = [i[0] for i in data]
        labels = [i[1] for i in data]

        # 修改这里，处理两个句子的情况
        data = self.tokenizer.batch_encode_plus(
            batch_text_or_text_pairs=[(sent[0], sent[1]) for sent in sents],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
            return_length=True
        )
        input_ids = data['input_ids'].to(self.device)
        attention_mask = data['attention_mask'].to(self.device)
        # token_type_ids = data['token_type_ids'].to(self.device)
        labels = torch.LongTensor(labels).to(self.device)

        # return input_ids, attention_mask, token_type_ids, labels
        return input_ids, attention_mask, labels


    def __iter__(self):
        for data in self.loader:
            yield data

    def __len__(self):
        return len(self.loader)



# 模型代码

In [4]:
# import torch
# import torch.nn as nn
# from transformers import BertModel


# class BertClassificationHead(nn.Module):
#     def __init__(self, hidden_size=1024, num_classes=3, dropout_prob=0.3):
#         super().__init__()
#         self.dense = nn.Linear(hidden_size, hidden_size)
#         self.dropout = nn.Dropout(dropout_prob)
#         self.out_proj = nn.Linear(hidden_size, num_classes)  # (输入维度，输出维度)

#     def forward(self, features, **kwargs):
#         x = features[-1][:, 0, :]  # features[-1]是一个三维张量，其维度为[批次大小, 序列长度, 隐藏大小]。
#         x = self.dropout(x)  # 这是一种正则化技术，用于防止模型过拟合。在训练过程中，它通过随机将输入张量中的一部分元素设置为0，来增加模型的泛化能力。
#         x = self.dense(x)  # 这是一个全连接层，它将输入特征映射到一个新的特征空间。这是通过学习一个权重矩阵和一个偏置向量，并使用它们对输入特征进行线性变换来实现的，方便后续可以引入非线性变换。
#         x = torch.tanh(x)  # 这是一个激活函数，它将线性层的输出转换为非线性，使得模型可以学习并表示更复杂的模式。
#         x = self.dropout(x)  # 增加模型的泛化能力。
#         x = self.out_proj(x)  # 这是最后的全连接层，它将特征映射到最终的输出空间。在这个例子中，输出空间的维度等于分类任务的类别数量。
#         return x
    

# class BertClassifier(nn.Module):
#     def __init__(self, pretrained_model_name, num_classes=3, freeze_pooler=0, dropout=0.3, hidden_size=768):
#         super().__init__()
        
#         self.bert = BertModel.from_pretrained(pretrained_model_name, output_hidden_states=True)
        
#         # 冻结BERT底层，保留顶层微调
#         if freeze_pooler > 0:
#             modules = [self.bert.embeddings, *self.bert.encoder.layer[:freeze_pooler]]
#             for module in modules:
#                 for param in module.parameters():
#                     param.requires_grad = False
                    
                    
#         for param in self.bert.pooler.parameters():
#             param.requires_grad = False
        
#         self.dropout = nn.Dropout(dropout)
        
#         # 获取bert隐藏层大小
#         bert_hidden_size = self.bert.config.hidden_size
        
#         self.classifier = BertClassificationHead(
#             hidden_size=self.bert.config.hidden_size,
#             num_classes=3,  # 三分类任务
#             dropout_prob=dropout
#         )
        
#     def forward(self, input_ids, attention_mask, token_type_ids=None):
#         # 获取BERT输出
#         outputs = self.bert(
#             input_ids, 
#             attention_mask=attention_mask
#             # token_type_ids=token_type_ids
#             # output_hidden_states=True  # 获取所有隐藏层
#         )
        
#         # 使用[CLS]表示的序列表示
#         # pooled_output = outputs.pooler_output
        
#         # 可选：结合最后四层的[CLS]表示以获取更丰富的信息
#         # last_4_layers = outputs.hidden_states[-4:]
#         # cls_embeddings = torch.stack([layer[:, 0, :] for layer in last_4_layers], dim=0)
#         # pooled_output = torch.mean(cls_embeddings, dim=0)  # 平均最后四层
        
#         # 应用dropout
#         # pooled_output = self.dropout1(pooled_output)
        
#         # 分类
#         logits = self.classifier(outputs.hidden_states)
        
#         return logits





# 修改 BertClassificationHead 类
class BertClassificationHead(nn.Module):
    def __init__(self, hidden_size=1024, num_classes=3, dropout_prob=0.3):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.out_proj = nn.Linear(hidden_size, num_classes)
    
    def forward(self, features):
        # DeBERTa 使用 last_hidden_state 而非 hidden_states
        # 提取 [CLS] 标记的表示
        x = features[:, 0, :]  # 使用第一个标记([CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

# 修改 BertClassifier 类
class BertClassifier(nn.Module):
    def __init__(self, pretrained_model_name, num_classes=3, freeze_pooler=0, dropout=0.3, hidden_size=768):
        super().__init__()
        
        # 使用 AutoModel 加载 DeBERTa 模型
        self.bert = AutoModel.from_pretrained(pretrained_model_name)
        print("当前使用的模型类型：", type(self.bert))
        
        # 冻结底层参数
        # if freeze_pooler > 0:
        #     # DeBERTa 结构与 BERT 不同，需要适应
        #     for i, layer in enumerate(self.bert.encoder.layer):
        #         if i < freeze_pooler:
        #             for param in layer.parameters():
        #                 param.requires_grad = False
                        
                        
        # if freeze_pooler > 0:
        #     modules = [self.bert.embeddings, *self.bert.encoder.layer[:freeze_pooler]]
        #     for module in modules:
        #         for param in module.parameters():
        #             param.requires_grad = False
                        
        
        # 获取 bert 隐藏层大小
        bert_hidden_size = self.bert.config.hidden_size
        
        self.classifier = BertClassificationHead(
            hidden_size=bert_hidden_size,
            num_classes=num_classes,
            dropout_prob=dropout
        )
    
    def forward(self, input_ids, attention_mask):
        # DeBERTa 不使用 token_type_ids
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
        # 使用 last_hidden_state 而不是 hidden_states
        logits = self.classifier(outputs.last_hidden_state)
        
        return logits




# from transformers import AutoModelForSequenceClassification

# # 修改 DeBERTaClassifier 类
# class BertClassifier(nn.Module):
#     def __init__(self, pretrained_model_name, num_classes=3, freeze_pooler=0, dropout=0.3):
#         super().__init__()
        
#         # 使用 AutoModelForSequenceClassification 加载 DeBERTa 模型
#         self.deberta = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name, num_labels=num_classes)
#         print("当前使用的模型类型：", type(self.deberta))
#         # self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
        
#         # # 可以选择冻结 DeBERTa 编码器的前几层
#         # if freeze_encoder > 0:
#         #     for param in self.deberta.deberta.encoder.layer[:freeze_encoder].parameters():
#         #         param.requires_grad = False
        
#         self.dropout = nn.Dropout(dropout)
    
#     def forward(self, input_ids, attention_mask):
#         outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
#         logits = outputs.logits
#         return logits

# 训练参数设置

In [5]:
import os
import wandb
import random
import argparse
from tqdm import tqdm

import torch
import torch.nn as nn
import numpy as np
from transformers import AdamW
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# 如果在Jupyter Notebook中运行，可以使用这个自定义参数函数替代argparser
def get_default_configs():
    """在Jupyter环境中使用的默认配置，避免argparse解析错误"""
    class Args:
        def __init__(self):
            # self.model_name = '/mnt/cfs/huangzhiwei/pykt-moekt/SBM/bge-large-en-v1.5'
            # self.model_name = "/mnt/cfs/huangzhiwei/BAE2025/models/ModernBERT-large"
            # self.model_name = '/mnt/cfs/huangzhiwei/pykt-moekt/SBM/xlm-roberta-large'
            # self.model_name = '/mnt/cfs/huangzhiwei/BAE2025/models/bge-base-en-v1.5'
            # self.model_name = '/mnt/cfs/huangzhiwei/BAE2025/models/bert-base-uncased'
            self.model_name = '/mnt/cfs/huangzhiwei/BAE2025/models/deberta-v3-base'
            # self.model_name = '/mnt/cfs/huangzhiwei/BAE2025/models/roberta-base'
            self.num_classes = 3
            self.dropout = 0.25
            self.freeze_pooler = 8
            self.batch_size = 16
            self.max_length = 512
            self.lr = 1e-5
            self.epochs = 50
            self.device = device
            self.name = None
            self.seed = 42
            self.data_path = '../data_new/train.json'
            self.val_data_path = '../data_new/val.json'
            self.checkpoint_dir = 'checkpoints_track4'
            self.patience = 6
            self.exp_name = 'BAE2025_track4_bert'
    return Args()


# 训练函数

In [6]:
def train(configs):
    
    # 设置随机种子
    random.seed(configs.seed)
    np.random.seed(configs.seed)
    torch.manual_seed(configs.seed)
    
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
    # 创建检查点目录
    checkpoint_dir = os.path.join(configs.checkpoint_dir, configs.exp_name)
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    # 为保存混淆矩阵创建目录 - 分别为训练集和验证集创建
    train_plot_dir = os.path.join(checkpoint_dir, 'plots', 'train')
    val_plot_dir = os.path.join(checkpoint_dir, 'plots', 'val')
    os.makedirs(train_plot_dir, exist_ok=True)
    os.makedirs(val_plot_dir, exist_ok=True)
    
    # 加载数据集
    train_dataset = BAE2025Dataset(configs.data_path)
    val_dataset = BAE2025Dataset(configs.val_data_path)    

    # 创建数据加载器
    train_dataloader = BAE2025DataLoader(
        dataset=train_dataset,
        batch_size=configs.batch_size,
        max_length=configs.max_length,
        shuffle=True,
        drop_last=True,
        device=configs.device,
        tokenizer_name=configs.model_name
    )

    val_dataloader = BAE2025DataLoader(
        dataset=val_dataset,
        batch_size=configs.batch_size,
        max_length=configs.max_length,
        shuffle=False,
        drop_last=False,
        device=configs.device,
        tokenizer_name=configs.model_name
    )
    
    # 创建模型
    model = BertClassifier(
        pretrained_model_name=configs.model_name,
        num_classes=configs.num_classes,
        freeze_pooler=configs.freeze_pooler,
        dropout=configs.dropout
    ).to(configs.device)

    criterion = nn.CrossEntropyLoss()

    # 定义优化器
    optimizer = AdamW(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=configs.lr
    )

    # 初始化最佳验证损失和早停计数器
    best_val_acc = 0.0
    best_val_f1 = 0.0  # 添加F1分数作为评估指标
    best_val_loss = float('inf')
    patience_counter = 0
    
    # 定义类别名称
    class_names = ['Yes', 'To some extent', 'No']
    
    # 添加F1计算所需的库
    from sklearn.metrics import f1_score, confusion_matrix
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    # 训练循环
    for epoch in range(configs.epochs):
        # 训练阶段
        model.train()
        train_loss = 0.0
        train_acc = 0.0
        train_preds = []
        train_labels_list = []
        
        with tqdm(
            train_dataloader,
            total=len(train_dataloader),
            desc=f'Epoch {epoch + 1}/{configs.epochs}',
            unit='batch',
            ncols=100
        ) as pbar:
            for input_ids, attention_mask, labels in pbar:
                optimizer.zero_grad()
                
                # 前向传播
                logits = model(input_ids, attention_mask)
                
                # 计算损失 - 确保labels是长整型
                labels = labels.long()
                loss = criterion(logits, labels)
                
                # 反向传播
                loss.backward()
                optimizer.step()
                
                preds = logits.argmax(dim=1)
                accuracy = (preds == labels).float().mean()
                accuracy_all = (preds == labels).float().sum()
                
                # 收集预测结果和真实标签，用于计算F1
                train_preds.extend(preds.cpu().numpy())
                train_labels_list.extend(labels.cpu().numpy())
                
                train_loss += loss.item()
                train_acc += accuracy_all.item()
                
                pbar.set_postfix(
                    loss=f'{loss.item():.3f}',
                    accuracy=f'{accuracy.item():.3f}'
                )
        
        train_loss = train_loss / len(train_dataloader)
        train_acc = train_acc / len(train_dataset)
        
        # 计算训练集的F1分数 - 使用macro平均以处理多分类
        train_f1 = f1_score(train_labels_list, train_preds, average='macro')
        
        print(f'Training Loss: {train_loss:.4f}')
        print(f'Training Accuracy: {train_acc:.4f}')
        print(f'Training F1 Score: {train_f1:.4f}')
        
        # # 创建训练集的混淆矩阵
        # # 创建三个二分类混淆矩阵（两两类别之间）
        # class_pairs = [
        #     ([0, 1], ['Yes', 'To some extent']),  # Yes vs To some extent
        #     ([0, 2], ['Yes', 'No']),              # Yes vs No
        #     ([1, 2], ['To some extent', 'No'])    # To some extent vs No
        # ]
        
        # for classes_idx, classes_names in class_pairs:
        #     # 筛选出对应两个类别的预测和标签
        #     mask = np.isin(np.array(train_labels_list), classes_idx)
        #     filtered_preds = np.array(train_preds)[mask]
        #     filtered_labels = np.array(train_labels_list)[mask]
            
        #     # 创建混淆矩阵
        #     cm = confusion_matrix(filtered_labels, filtered_preds, labels=classes_idx)
            
        #     # 计算此对类别的准确率和F1分数
        #     pair_mask = np.isin(np.array(train_labels_list), classes_idx)
        #     pair_acc = np.mean(np.array(train_preds)[pair_mask] == np.array(train_labels_list)[pair_mask])
        #     # 计算二分类F1分数
        #     pair_f1 = f1_score(
        #         np.array(train_labels_list)[pair_mask], 
        #         np.array(train_preds)[pair_mask], 
        #         labels=classes_idx, 
        #         average='macro'
        #     )
            
        #     # 绘制混淆矩阵
        #     plt.figure(figsize=(8, 6))
        #     sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
        #                 xticklabels=[classes_names[i == classes_idx[1]] for i in classes_idx],
        #                 yticklabels=[classes_names[i == classes_idx[1]] for i in classes_idx])
        #     plt.xlabel('Predicted')
        #     plt.ylabel('True')
        #     plt.title(f'Train: {classes_names[0]} vs {classes_names[1]}\nAcc: {pair_acc:.4f}, F1: {pair_f1:.4f}')
            
        #     # 保存图表
        #     matrix_path = os.path.join(train_plot_dir, f'cm_{classes_names[0].replace(" ", "_")}_{classes_names[1].replace(" ", "_")}_epoch_{epoch+1}.png')
        #     plt.savefig(matrix_path)
        #     plt.close()
            
        # # 创建完整的三分类混淆矩阵
        # cm_full = confusion_matrix(train_labels_list, train_preds, labels=[0, 1, 2])
        # plt.figure(figsize=(10, 8))
        # sns.heatmap(cm_full, annot=True, fmt='d', cmap='Blues',
        #             xticklabels=class_names,
        #             yticklabels=class_names)
        # plt.xlabel('Predicted')
        # plt.ylabel('True')
        # plt.title(f'Train: Full Confusion Matrix\nAcc: {train_acc:.4f}, F1: {train_f1:.4f}')
        
        # # 保存完整混淆矩阵
        # matrix_path = os.path.join(train_plot_dir, f'cm_full_epoch_{epoch+1}.png')
        # plt.savefig(matrix_path)
        # plt.close()
        
        # 验证阶段
        model.eval()
        val_loss = 0.0
        val_corrects = 0.0
        val_preds = []
        val_labels_list = []

        with torch.no_grad():
            for input_ids, attention_mask, labels in val_dataloader:
                # 确保labels是长整型
                labels = labels.long()
                
                # 前向传播
                logits = model(input_ids, attention_mask)
                
                loss = criterion(logits, labels)
                val_loss += loss.item()
                preds = logits.argmax(dim=1)
                accuracy = (preds == labels).float().sum()
                val_corrects += accuracy
                
                # 收集预测结果和真实标签，用于计算F1和混淆矩阵
                val_preds.extend(preds.cpu().numpy())
                val_labels_list.extend(labels.cpu().numpy())
        
        val_loss = val_loss / len(val_dataloader)
        val_acc = val_corrects.double() / len(val_dataset)
        
        # 计算验证集的F1分数
        val_f1 = f1_score(val_labels_list, val_preds, average='macro')
        
        print('Validation Loss: {:.4f} Acc: {:.4f} F1: {:.4f}'.format(val_loss, val_acc, val_f1))
        
        # # 创建验证集三个二分类混淆矩阵（两两类别之间）
        # class_pairs = [
        #     ([0, 1], ['Yes', 'To some extent']),  # Yes vs To some extent
        #     ([0, 2], ['Yes', 'No']),              # Yes vs No
        #     ([1, 2], ['To some extent', 'No'])    # To some extent vs No
        # ]
        
        # for classes_idx, classes_names in class_pairs:
        #     # 筛选出对应两个类别的预测和标签
        #     mask = np.isin(np.array(val_labels_list), classes_idx)
        #     filtered_preds = np.array(val_preds)[mask]
        #     filtered_labels = np.array(val_labels_list)[mask]
            
        #     # 创建混淆矩阵
        #     cm = confusion_matrix(filtered_labels, filtered_preds, labels=classes_idx)
            
        #     # 计算此对类别的准确率和F1分数
        #     pair_mask = np.isin(np.array(val_labels_list), classes_idx)
        #     pair_acc = np.mean(np.array(val_preds)[pair_mask] == np.array(val_labels_list)[pair_mask])
        #     # 计算二分类F1分数
        #     pair_f1 = f1_score(
        #         np.array(val_labels_list)[pair_mask], 
        #         np.array(val_preds)[pair_mask], 
        #         labels=classes_idx, 
        #         average='macro'
        #     )
            
        #     # 绘制混淆矩阵
        #     plt.figure(figsize=(8, 6))
        #     sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
        #                 xticklabels=[classes_names[i == classes_idx[1]] for i in classes_idx],
        #                 yticklabels=[classes_names[i == classes_idx[1]] for i in classes_idx])
        #     plt.xlabel('Predicted')
        #     plt.ylabel('True')
        #     plt.title(f'Val: {classes_names[0]} vs {classes_names[1]}\nAcc: {pair_acc:.4f}, F1: {pair_f1:.4f}')
            
        #     # 保存图表
        #     matrix_path = os.path.join(val_plot_dir, f'cm_{classes_names[0].replace(" ", "_")}_{classes_names[1].replace(" ", "_")}_epoch_{epoch+1}.png')
        #     plt.savefig(matrix_path)
        #     plt.close()
            
        # # 创建完整的三分类混淆矩阵
        # cm_full = confusion_matrix(val_labels_list, val_preds, labels=[0, 1, 2])
        # plt.figure(figsize=(10, 8))
        # sns.heatmap(cm_full, annot=True, fmt='d', cmap='Blues',
        #             xticklabels=class_names,
        #             yticklabels=class_names)
        # plt.xlabel('Predicted')
        # plt.ylabel('True')
        # plt.title(f'Val: Full Confusion Matrix\nAcc: {val_acc:.4f}, F1: {val_f1:.4f}')
        
        # # 保存完整混淆矩阵
        # matrix_path = os.path.join(val_plot_dir, f'cm_full_epoch_{epoch+1}.png')
        # plt.savefig(matrix_path)
        # plt.close()
        
        # 检查是否保存模型并判断是否需要早停
        # 使用F1分数作为主要指标
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_val_acc = val_acc
            
            # 保存模型
            # state_dict = model.state_dict()
            # torch.save(state_dict, os.path.join(checkpoint_dir, 'best_model_f1.pt'))
            print(f'New best model saved with F1: {best_val_f1:.4f}, Acc: {best_val_acc:.4f}')
            
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= configs.patience:
                print(f'Early stopping triggered after {epoch+1} epochs.')
                break

        model.train()
        
# 在以下主函数中添加判断Jupyter环境的逻辑
if __name__ == '__main__':
    # 判断是否在Jupyter环境中运行
    try:
        # 检查是否在Jupyter中运行
        get_ipython = globals().get('get_ipython', None)
        if get_ipython and 'IPKernelApp' in get_ipython().config:
            # 在Jupyter环境中运行，使用默认配置
            print("Running in Jupyter environment, using default configs")
            configs = get_default_configs()
        else:
            # 在命令行环境中运行，使用argparse
            configs = argparser()
    except:
        # 任何异常都使用argparse处理
        configs = argparser()
    
    # 设置实验名称
    if configs.name is None:
        configs.exp_name = \
            f'{os.path.basename(configs.model_name)}' + \
            f'{"_fp" if configs.freeze_pooler else ""}' + \
            f'_b{configs.batch_size}_e{configs.epochs}' + \
            f'_len{configs.max_length}_lr{configs.lr}'
    else:
        configs.exp_name = configs.name
    
    # 设置设备
    if configs.device is None:
        configs.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu'
        )
    
    # 调用训练函数
    train(configs)

Running in Jupyter environment, using default configs




当前使用的 tokenizer 类型： <class 'transformers.models.deberta_v2.tokenization_deberta_v2_fast.DebertaV2TokenizerFast'>
当前使用的 tokenizer 类型： <class 'transformers.models.deberta_v2.tokenization_deberta_v2_fast.DebertaV2TokenizerFast'>
当前使用的模型类型： <class 'transformers.models.deberta_v2.modeling_deberta_v2.DebertaV2Model'>


Epoch 1/50: 100%|██████████████████| 123/123 [00:42<00:00,  2.89batch/s, accuracy=0.312, loss=0.984]


Training Loss: 1.0040
Training Accuracy: 0.5071
Training F1 Score: 0.2736
Validation Loss: 0.9327 Acc: 0.5524 F1: 0.2372
New best model saved with F1: 0.2372, Acc: 0.5524


Epoch 2/50: 100%|██████████████████| 123/123 [00:41<00:00,  2.93batch/s, accuracy=0.688, loss=0.814]


Training Loss: 0.9703
Training Accuracy: 0.5515
Training F1 Score: 0.3335
Validation Loss: 0.7664 Acc: 0.7278 F1: 0.5092
New best model saved with F1: 0.5092, Acc: 0.7278


Epoch 3/50: 100%|██████████████████| 123/123 [00:42<00:00,  2.92batch/s, accuracy=0.625, loss=0.944]


Training Loss: 0.7868
Training Accuracy: 0.6859
Training F1 Score: 0.4921
Validation Loss: 0.6730 Acc: 0.7641 F1: 0.5363
New best model saved with F1: 0.5363, Acc: 0.7641


Epoch 4/50: 100%|██████████████████| 123/123 [00:42<00:00,  2.92batch/s, accuracy=0.625, loss=0.626]


Training Loss: 0.6816
Training Accuracy: 0.7288
Training F1 Score: 0.5309
Validation Loss: 0.6747 Acc: 0.7460 F1: 0.5244


Epoch 5/50: 100%|██████████████████| 123/123 [00:42<00:00,  2.92batch/s, accuracy=0.750, loss=0.549]


Training Loss: 0.5958
Training Accuracy: 0.7520
Training F1 Score: 0.5517
Validation Loss: 0.6373 Acc: 0.7762 F1: 0.5444
New best model saved with F1: 0.5444, Acc: 0.7762


Epoch 6/50: 100%|██████████████████| 123/123 [00:42<00:00,  2.92batch/s, accuracy=0.688, loss=0.639]


Training Loss: 0.5423
Training Accuracy: 0.7742
Training F1 Score: 0.6073
Validation Loss: 0.6487 Acc: 0.7762 F1: 0.5447
New best model saved with F1: 0.5447, Acc: 0.7762


Epoch 7/50: 100%|██████████████████| 123/123 [00:42<00:00,  2.92batch/s, accuracy=0.812, loss=0.687]


Training Loss: 0.4968
Training Accuracy: 0.7904
Training F1 Score: 0.6668
Validation Loss: 0.7460 Acc: 0.7661 F1: 0.6440
New best model saved with F1: 0.6440, Acc: 0.7661


Epoch 8/50: 100%|██████████████████| 123/123 [00:42<00:00,  2.92batch/s, accuracy=0.750, loss=0.410]


Training Loss: 0.4458
Training Accuracy: 0.8217
Training F1 Score: 0.7328
Validation Loss: 0.7266 Acc: 0.7802 F1: 0.6039


Epoch 9/50: 100%|██████████████████| 123/123 [00:42<00:00,  2.92batch/s, accuracy=0.875, loss=0.319]


Training Loss: 0.4004
Training Accuracy: 0.8399
Training F1 Score: 0.7764
Validation Loss: 0.7627 Acc: 0.7137 F1: 0.6335


Epoch 10/50: 100%|█████████████████| 123/123 [00:42<00:00,  2.92batch/s, accuracy=0.812, loss=0.387]


Training Loss: 0.3486
Training Accuracy: 0.8672
Training F1 Score: 0.8228
Validation Loss: 0.7831 Acc: 0.7157 F1: 0.6322


Epoch 11/50: 100%|█████████████████| 123/123 [00:42<00:00,  2.91batch/s, accuracy=0.938, loss=0.145]


Training Loss: 0.3078
Training Accuracy: 0.8722
Training F1 Score: 0.8333
Validation Loss: 0.9103 Acc: 0.7560 F1: 0.6393


Epoch 12/50: 100%|█████████████████| 123/123 [00:42<00:00,  2.92batch/s, accuracy=0.875, loss=0.316]


Training Loss: 0.2727
Training Accuracy: 0.8919
Training F1 Score: 0.8600
Validation Loss: 0.9017 Acc: 0.7157 F1: 0.6350


Epoch 13/50: 100%|█████████████████| 123/123 [00:42<00:00,  2.92batch/s, accuracy=0.938, loss=0.250]


Training Loss: 0.2128
Training Accuracy: 0.9278
Training F1 Score: 0.9086
Validation Loss: 1.0393 Acc: 0.6774 F1: 0.6122
Early stopping triggered after 13 epochs.
