In [1]:
import os
# Restrict this process to physical GPU 3 (set before importing torch so CUDA honours it)
os.environ['CUDA_VISIBLE_DEVICES'] = '3'


import torch
import torch.nn as nn
import wandb
import random
import argparse
import numpy as np
from tqdm import tqdm
from transformers import BertModel, AutoModel
from transformers import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


# 数据预处理函数

In [2]:
from torch.utils.data import Dataset
import json

class BAE2025Dataset(Dataset):
    """Multi-label sentence-pair dataset read from a JSON file.

    The file at ``data_path`` must contain a JSON list of objects. Each object
    needs the keys ``'conversation_history'`` and ``'response'`` plus one entry
    per name in ``label_types``. A record is kept only when every one of its
    label values is a key of ``labels``; otherwise it is skipped silently.

    Each stored sample has the form
    ``((conversation_history, response), [label_id, ...])`` with one label id
    per entry of ``label_types``, in the same order.

    Args:
        data_path: Path of the JSON file to load.
        label_types: Names of the label fields to read from every record.
        labels: Mapping from raw label value to integer class id.

    Raises:
        KeyError: If a record lacks ``'conversation_history'`` or ``'response'``.
    """

    def __init__(
            self,
            data_path,
            label_types=["Mistake_Identification", "Mistake_Location", "Providing_Guidance", "Actionability"],
            labels={
                "Yes": 0,
                "To some extent": 1,
                "No": 2,
            }
    ):
        self.data_path = data_path
        # The default arguments are mutable objects shared by every call.
        # Keep private copies so that changing one dataset's mapping (for
        # example through get_labels()) cannot leak into later instances.
        self.label_types = list(label_types)
        self.labels = dict(labels)
        self._get_data()

    def _get_data(self):
        """Load the JSON file and populate ``self.data`` with valid samples."""
        with open(self.data_path, 'r', encoding='utf-8') as f:
            records = json.load(f)

        self.data = []
        for item in records:
            sent1 = item['conversation_history']
            sent2 = item['response']

            raw_labels = [item.get(label_type) for label_type in self.label_types]
            # Skip the whole sample if any label is missing or not a known value.
            if any(raw not in self.labels for raw in raw_labels):
                continue

            label_values = [self.labels[raw] for raw in raw_labels]
            self.data.append(((sent1, sent2), label_values))

    def __len__(self):
        """Return the number of samples that passed label validation."""
        return len(self.data)

    def get_labels(self):
        """Return this instance's raw-label -> class-id mapping."""
        return self.labels

    def __getitem__(self, idx):
        """Return ``((conversation_history, response), label_ids)`` at ``idx``."""
        return self.data[idx]

# 数据加载函数

In [3]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import AutoConfig
from transformers import DebertaV2Tokenizer

class BAE2025DataLoader:
    """Iterable wrapper that batches and tokenizes sentence-pair samples.

    Builds a tokenizer from ``tokenizer_name`` and a torch ``DataLoader`` whose
    ``collate_fn`` tokenizes each batch. Iterating yields
    ``(input_ids, attention_mask, labels)`` tensors already moved to ``device``.

    Args:
        dataset: Dataset whose items look like ``((text_a, text_b), labels)``.
        batch_size: Number of samples per batch.
        max_length: Every sequence is truncated / padded to this many tokens.
        shuffle: Forwarded to the underlying ``DataLoader``.
        drop_last: Forwarded to the underlying ``DataLoader``.
        device: Target device for the batch tensors; when ``None`` CUDA is used
            if available, otherwise the CPU.
        tokenizer_name: Name or path given to ``AutoTokenizer.from_pretrained``.
    """

    def __init__(
        self,
        dataset,
        batch_size=16,
        max_length=512,
        shuffle=True,
        drop_last=True,
        device=None,
        tokenizer_name='/mnt/cfs/huangzhiwei/BAE2025/models/deberta-v3-base'
    ):
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        # Truncate from the left: over-long inputs lose their beginning and
        # keep their ending.
        tokenizer.truncation_side = 'left'
        self.tokenizer = tokenizer
        print("当前使用的 tokenizer 类型：", type(self.tokenizer))

        self.dataset, self.batch_size, self.max_length = dataset, batch_size, max_length
        self.shuffle, self.drop_last = shuffle, drop_last

        self.device = (
            device
            if device is not None
            else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        )

        self.loader = DataLoader(
            dataset=self.dataset,
            batch_size=self.batch_size,
            collate_fn=self.collate_fn,
            shuffle=self.shuffle,
            drop_last=self.drop_last
        )

    def collate_fn(self, data):
        """Tokenize a list of samples into one batch of tensors.

        Args:
            data: List of ``((text_a, text_b), labels)`` samples.

        Returns:
            Tuple ``(input_ids, attention_mask, labels)`` on ``self.device``;
            ``labels`` is a ``torch.long`` tensor built from the label lists.
        """
        text_pairs = []
        targets = []
        for sample in data:
            pair = sample[0]
            text_pairs.append((pair[0], pair[1]))
            targets.append(sample[1])

        # Encode both sentences of every sample together as one paired input,
        # always padded to max_length.
        encoded = self.tokenizer.batch_encode_plus(
            batch_text_or_text_pairs=text_pairs,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
            return_length=True
        )

        target_device = self.device
        input_ids = encoded['input_ids'].to(target_device)
        attention_mask = encoded['attention_mask'].to(target_device)
        # A list of per-sample label lists becomes a 2-D long tensor.
        labels = torch.tensor(targets, dtype=torch.long).to(target_device)

        return input_ids, attention_mask, labels

    def __iter__(self):
        """Yield the collated batches of the underlying ``DataLoader``."""
        yield from self.loader

    def __len__(self):
        """Return the number of batches in the underlying ``DataLoader``."""
        return len(self.loader)



# 模型代码

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel


class ExpertLSTM(nn.Module):
    """Expert that summarizes a sequence with a unidirectional LSTM.

    Args:
        input_size: Feature size of each time step.
        hidden_size: Size of the LSTM hidden state (and of the output vector).
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout passed to ``nn.LSTM``; only forwarded when
            ``num_layers > 1``, otherwise 0 is used.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
        super().__init__()
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

    def forward(self, x):
        """Map ``x`` of shape [batch_size, seq_len, input_size] to [batch_size, hidden_size].

        The result is the last layer's final hidden state. No masking or
        packing happens here, so the state is taken after the full ``seq_len``.
        """
        _, (final_hidden, _final_cell) = self.lstm(x)
        top_layer_state = final_hidden[-1]
        return top_layer_state


class ExpertBiLSTM(nn.Module):
    """Expert that summarizes a sequence with a bidirectional LSTM.

    Each direction uses ``hidden_size // 2`` units so that the concatenated
    forward/backward state has exactly ``hidden_size`` features.

    Args:
        input_size: Feature size of each time step.
        hidden_size: Size of the returned vector; must be even.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout passed to ``nn.LSTM``; only forwarded when
            ``num_layers > 1``, otherwise 0 is used.

    Raises:
        ValueError: If ``hidden_size`` is odd. Halving would then silently
            produce ``hidden_size - 1`` output features and break any layer
            sized for ``hidden_size``.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
        super().__init__()
        if hidden_size % 2 != 0:
            raise ValueError(
                f"ExpertBiLSTM requires an even hidden_size, got {hidden_size}"
            )
        self.bilstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size // 2,  # split the budget across both directions
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0,
        )

    def forward(self, x):
        """Map ``x`` of shape [batch_size, seq_len, input_size] to [batch_size, hidden_size]."""
        _, (hidden, _) = self.bilstm(x)
        # hidden: [num_layers * num_directions, batch_size, hidden_size // 2];
        # the last two entries are the final layer's forward and backward states.
        hidden_forward = hidden[-2]
        hidden_backward = hidden[-1]
        return torch.cat([hidden_forward, hidden_backward], dim=1)


class ExpertRNN(nn.Module):
    """Expert that summarizes a sequence with a plain (Elman) RNN.

    Args:
        input_size: Feature size of each time step.
        hidden_size: Size of the RNN hidden state (and of the output vector).
        num_layers: Number of stacked RNN layers.
        dropout: Dropout passed to ``nn.RNN``; only forwarded when
            ``num_layers > 1``, otherwise 0 is used.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
        super().__init__()
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.rnn = nn.RNN(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

    def forward(self, x):
        """Map ``x`` of shape [batch_size, seq_len, input_size] to [batch_size, hidden_size].

        Returns the last layer's final hidden state; no masking is applied.
        """
        final_hidden = self.rnn(x)[1]
        return final_hidden[-1]


class ExpertGRU(nn.Module):
    """Expert that summarizes a sequence with a unidirectional GRU.

    Args:
        input_size: Feature size of each time step.
        hidden_size: Size of the GRU hidden state (and of the output vector).
        num_layers: Number of stacked GRU layers.
        dropout: Dropout passed to ``nn.GRU``; only forwarded when
            ``num_layers > 1``, otherwise 0 is used.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
        super().__init__()
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.gru = nn.GRU(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

    def forward(self, x):
        """Map ``x`` of shape [batch_size, seq_len, input_size] to [batch_size, hidden_size].

        Returns the last layer's final hidden state; no masking is applied.
        """
        final_hidden = self.gru(x)[1]
        return final_hidden[-1]


class ExpertLinear(nn.Module):
    """Expert that mean-pools the sequence and applies a two-layer MLP.

    The MLP is Linear -> LayerNorm -> GELU -> Linear with an intermediate
    width of ``2 * hidden_size``.

    Args:
        input_size: Feature size of each time step.
        hidden_size: Size of the returned vector.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        wide = hidden_size * 2
        layers = [
            nn.Linear(input_size, wide),
            nn.LayerNorm(wide),
            nn.GELU(),
            nn.Linear(wide, hidden_size),
        ]
        self.linear = nn.Sequential(*layers)

    def forward(self, x):
        """Map ``x`` of shape [batch_size, seq_len, input_size] to [batch_size, hidden_size].

        The sequence axis is collapsed with an unweighted mean over all
        positions before the MLP is applied.
        """
        sequence_mean = x.mean(dim=1)  # [batch_size, input_size]
        return self.linear(sequence_mean)


class BertClassificationHead(nn.Module):
    """Classification head applied to the first token of a sequence.

    Pipeline: dropout -> dense -> ReLU -> dropout -> output projection. The
    same ``nn.Dropout`` module is used for both dropout steps.

    Args:
        hidden_size: Feature size of the incoming token representations.
        num_classes: Number of output logits.
        dropout_prob: Dropout probability.
    """

    def __init__(self, hidden_size=1024, num_classes=3, dropout_prob=0.3):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.out_proj = nn.Linear(hidden_size, num_classes)

    def forward(self, features):
        """Return logits of shape [batch_size, num_classes].

        ``features`` is indexed as ``features[:, 0, :]``, i.e. only the first
        position along the second axis (the [CLS] token) is used.
        """
        first_token = self.dropout(features[:, 0, :])
        activated = self.dropout(torch.relu(self.dense(first_token)))
        return self.out_proj(activated)


class MoERouter(nn.Module):
    """Expert router: produces per-sample weights over the experts.

    Args:
        input_size: Feature size of the routing input.
        num_experts: Number of experts to weight.
    """

    def __init__(self, input_size, num_experts):
        super().__init__()
        self.router = nn.Linear(input_size, num_experts)

    def forward(self, x):
        """Map ``x`` of shape [batch_size, input_size] to [batch_size, num_experts].

        A softmax over the last axis makes each row sum to 1.
        """
        return self.router(x).softmax(dim=-1)


class DeBERTaMoEClassifier(nn.Module):
    """Multi-task classifier: a shared encoder plus a bank of shared experts.

    For every task the model concatenates
      * the logits of a task-specific [CLS] classification head, and
      * the logits obtained by projecting each shared expert's summary vector
        through a task-specific linear layer,
    and feeds the concatenation through a task-specific fusion MLP.

    Args:
        pretrained_model_name: Name or path given to ``AutoModel.from_pretrained``.
        num_classes: Number of classes predicted for every task.
        freeze_pooler: Accepted for interface compatibility; not used by this
            implementation.
        expert_hidden_size: Size of each expert's summary vector.
        dropout: Dropout probability for the heads, fusion MLPs and (when
            ``num_rnn_layers > 1``) the recurrent experts.
        num_rnn_layers: Number of stacked layers in each recurrent expert.
        num_tasks: Number of classification tasks.
    """

    # Raw (pre-softmax) initial loss weights used for the 4-task setup.
    _FOUR_TASK_LOSS_WEIGHTS = (0.1, 0.1, 0.1, 0.7)

    def __init__(
        self,
        pretrained_model_name,
        num_classes=3,
        freeze_pooler=0,
        expert_hidden_size=256,
        dropout=0.3,
        num_rnn_layers=1,
        num_tasks=4
    ):
        super().__init__()

        self.num_tasks = num_tasks

        # Shared encoder, loaded through AutoModel.
        self.bert = AutoModel.from_pretrained(pretrained_model_name)
        self.bert_hidden_size = self.bert.config.hidden_size

        # One independent [CLS] classification head per task.
        self.task_classifiers = nn.ModuleList([
            BertClassificationHead(
                hidden_size=self.bert_hidden_size,
                num_classes=num_classes,
                dropout_prob=dropout
            ) for _ in range(num_tasks)
        ])

        # Experts shared by all tasks.
        self.experts = nn.ModuleDict({
            'lstm': ExpertLSTM(
                input_size=self.bert_hidden_size,
                hidden_size=expert_hidden_size,
                num_layers=num_rnn_layers,
                dropout=dropout
            ),
            'bilstm': ExpertBiLSTM(
                input_size=self.bert_hidden_size,
                hidden_size=expert_hidden_size,
                num_layers=num_rnn_layers,
                dropout=dropout
            ),
            'rnn': ExpertRNN(
                input_size=self.bert_hidden_size,
                hidden_size=expert_hidden_size,
                num_layers=num_rnn_layers,
                dropout=dropout
            ),
            'gru': ExpertGRU(
                input_size=self.bert_hidden_size,
                hidden_size=expert_hidden_size,
                num_layers=num_rnn_layers,
                dropout=dropout
            ),
            'linear': ExpertLinear(
                input_size=self.bert_hidden_size,
                hidden_size=expert_hidden_size
            ),
        })

        # Router over the experts. It is still registered so that parameter
        # names and initialisation order stay unchanged, but forward() does not
        # consult it: expert logits are fused by concatenation, not by routing.
        self.router = MoERouter(self.bert_hidden_size, len(self.experts))

        # Per task: one projection from every expert's summary to class logits.
        self.expert_outputs = nn.ModuleList([
            nn.ModuleDict({
                expert_name: nn.Linear(expert_hidden_size, num_classes)
                for expert_name in self.experts.keys()
            }) for _ in range(num_tasks)
        ])

        # Per task: fusion MLP over the concatenated logits of the task head
        # and all experts.
        combined_dim = num_classes * (1 + len(self.experts))
        self.final_classifiers = nn.ModuleList([
            nn.Sequential(
                nn.Linear(combined_dim, combined_dim // 2),
                nn.LayerNorm(combined_dim // 2),
                nn.Dropout(dropout),
                nn.ReLU(),
                nn.Linear(combined_dim // 2, num_classes)
            ) for _ in range(num_tasks)
        ])

        # Learnable per-task loss weights, stored as raw values; a softmax is
        # applied whenever they are read, so the effective weights are the
        # softmax of these numbers rather than the numbers themselves.
        # The 4-task initialisation favours the last task. For any other task
        # count the weights start uniform, so there is always exactly one
        # weight per task.
        if num_tasks == len(self._FOUR_TASK_LOSS_WEIGHTS):
            initial_loss_weights = torch.tensor(list(self._FOUR_TASK_LOSS_WEIGHTS))
        else:
            initial_loss_weights = torch.ones(num_tasks)
        self.loss_weights = nn.Parameter(initial_loss_weights)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and produce logits for every task.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.

        Returns:
            dict with
              * ``'logits'``: list of ``num_tasks`` tensors, each
                [batch_size, num_classes];
              * ``'loss_weights'``: softmax-normalised per-task loss weights.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        hidden_states = outputs.last_hidden_state  # [batch_size, seq_len, hidden_size]

        # The experts are shared across tasks and only see hidden_states, so
        # their summaries are computed once here instead of once per task.
        expert_features = {
            expert_name: expert(hidden_states)  # [batch_size, expert_hidden_size]
            for expert_name, expert in self.experts.items()
        }

        final_logits_list = []
        for task_idx in range(self.num_tasks):
            task_projections = self.expert_outputs[task_idx]

            # Task head first, then the experts in registration order.
            logits_per_source = [self.task_classifiers[task_idx](hidden_states)]
            logits_per_source.extend(
                task_projections[expert_name](features)
                for expert_name, features in expert_features.items()
            )

            # [batch_size, (1 + num_experts) * num_classes]
            combined_logits = torch.cat(logits_per_source, dim=1)
            final_logits_list.append(self.final_classifiers[task_idx](combined_logits))

        return {
            'logits': final_logits_list,
            'loss_weights': self.get_normalized_loss_weights()
        }

    def get_normalized_loss_weights(self):
        """Return the softmax-normalised per-task loss weights."""
        return F.softmax(self.loss_weights, dim=0)

In [5]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from transformers import AutoModel


# class ExpertLSTM(nn.Module):
#     def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
#         super().__init__()
#         self.lstm = nn.LSTM(
#             input_size=input_size,
#             hidden_size=hidden_size,
#             num_layers=num_layers,
#             batch_first=True,
#             dropout=dropout if num_layers > 1 else 0,
#         )
        
#     def forward(self, x):
#         # x: [batch_size, seq_len, input_size]
#         output, (hidden, _) = self.lstm(x)
#         # 返回最后一个时间步的隐藏状态
#         return hidden[-1]  # [batch_size, hidden_size]


# class ExpertBiLSTM(nn.Module):
#     def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
#         super().__init__()
#         self.bilstm = nn.LSTM(
#             input_size=input_size,
#             hidden_size=hidden_size // 2,  # 因为是双向的，所以每个方向的隐藏层大小减半
#             num_layers=num_layers,
#             batch_first=True,
#             bidirectional=True,
#             dropout=dropout if num_layers > 1 else 0,
#         )
        
#     def forward(self, x):
#         # x: [batch_size, seq_len, input_size]
#         output, (hidden, _) = self.bilstm(x)
#         # 拼接最后一层的正向和反向隐藏状态
#         # hidden shape: [num_layers * num_directions, batch_size, hidden_size//2]
#         hidden_forward = hidden[-2]  # 正向的最后一层 [batch_size, hidden_size//2]
#         hidden_backward = hidden[-1]  # 反向的最后一层 [batch_size, hidden_size//2]
#         hidden_concat = torch.cat([hidden_forward, hidden_backward], dim=1)  # [batch_size, hidden_size]
#         return hidden_concat


# class ExpertRNN(nn.Module):
#     def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
#         super().__init__()
#         self.rnn = nn.RNN(
#             input_size=input_size,
#             hidden_size=hidden_size,
#             num_layers=num_layers,
#             batch_first=True,
#             dropout=dropout if num_layers > 1 else 0,
#         )
        
#     def forward(self, x):
#         # x: [batch_size, seq_len, input_size]
#         _, hidden = self.rnn(x)
#         # 返回最后一个时间步的隐藏状态
#         return hidden[-1]  # [batch_size, hidden_size]


# class ExpertGRU(nn.Module):
#     def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
#         super().__init__()
#         self.gru = nn.GRU(
#             input_size=input_size,
#             hidden_size=hidden_size,
#             num_layers=num_layers,
#             batch_first=True,
#             dropout=dropout if num_layers > 1 else 0,
#         )
        
#     def forward(self, x):
#         # x: [batch_size, seq_len, input_size]
#         _, hidden = self.gru(x)
#         # 返回最后一个时间步的隐藏状态
#         return hidden[-1]  # [batch_size, hidden_size]


# class ExpertLinear(nn.Module):
#     def __init__(self, input_size, hidden_size):
#         super().__init__()
#         self.linear = nn.Sequential(
#             nn.Linear(input_size, hidden_size * 2),
#             nn.LayerNorm(hidden_size * 2),
#             nn.GELU(),
#             nn.Linear(hidden_size * 2, hidden_size)
#         )
        
#     def forward(self, x):
#         # x: [batch_size, seq_len, input_size]
#         # 我们需要把序列信息压缩为一个向量，可以使用平均池化
#         pooled = torch.mean(x, dim=1)  # [batch_size, input_size]
#         return self.linear(pooled)  # [batch_size, hidden_size]


# class BertClassificationHead(nn.Module):
#     def __init__(self, hidden_size=1024, num_classes=3, dropout_prob=0.3):
#         super().__init__()
#         self.dense = nn.Linear(hidden_size, hidden_size)
#         self.dropout = nn.Dropout(dropout_prob)
#         self.out_proj = nn.Linear(hidden_size, num_classes)
    
#     def forward(self, features):
#         # 提取 [CLS] 标记的表示
#         x = features[:, 0, :]  # 使用第一个标记([CLS])
#         x = self.dropout(x)
#         x = self.dense(x)
#         x = torch.relu(x)
#         x = self.dropout(x)
#         x = self.out_proj(x)
#         return x


# class MoERouter(nn.Module):
#     """专家路由器，学习为每个样本分配专家权重"""
#     def __init__(self, input_size, num_experts):
#         super().__init__()
#         self.router = nn.Linear(input_size, num_experts)
        
#     def forward(self, x):
#         # x: [batch_size, input_size]
#         # 计算每个专家的权重 (使用softmax确保权重和为1)
#         router_logits = self.router(x)
#         router_probs = F.softmax(router_logits, dim=-1)
#         return router_probs  # [batch_size, num_experts]


# class DeBERTaMoEClassifier(nn.Module):
#     def __init__(
#         self, 
#         pretrained_model_name, 
#         num_classes=3, 
#         freeze_pooler=0,
#         expert_hidden_size=256,
#         dropout=0.3,
#         num_rnn_layers=1,
#         num_tasks=4  # 新增参数，指定任务数量
#     ):
#         super().__init__()
        
#         self.num_tasks = num_tasks  # 存储任务数量
        
#         # 使用 AutoModel 加载 DeBERTa 模型
#         self.bert = AutoModel.from_pretrained(pretrained_model_name)
        
#         # 获取 bert 隐藏层大小
#         self.bert_hidden_size = self.bert.config.hidden_size
        
#         # 为每个任务创建独立的分类头
#         self.task_classifiers = nn.ModuleList([
#             BertClassificationHead(
#                 hidden_size=self.bert_hidden_size,
#                 num_classes=num_classes,
#                 dropout_prob=dropout
#             ) for _ in range(num_tasks)
#         ])
        
#         # 创建多个专家模型
#         self.experts = nn.ModuleDict({
#             'lstm': ExpertLSTM(
#                 input_size=self.bert_hidden_size, 
#                 hidden_size=expert_hidden_size,
#                 num_layers=num_rnn_layers,
#                 dropout=dropout
#             ),
#             'bilstm': ExpertBiLSTM(
#                 input_size=self.bert_hidden_size, 
#                 hidden_size=expert_hidden_size,
#                 num_layers=num_rnn_layers,
#                 dropout=dropout
#             ),
#             'rnn': ExpertRNN(
#                 input_size=self.bert_hidden_size, 
#                 hidden_size=expert_hidden_size,
#                 num_layers=num_rnn_layers,
#                 dropout=dropout
#             ),
#             'gru': ExpertGRU(
#                 input_size=self.bert_hidden_size, 
#                 hidden_size=expert_hidden_size,
#                 num_layers=num_rnn_layers,
#                 dropout=dropout
#             ),
#             'linear': ExpertLinear(
#                 input_size=self.bert_hidden_size, 
#                 hidden_size=expert_hidden_size
#             ),
#         })
        
#         # 创建路由器 (使用[CLS]标记表示作为路由的输入)
#         self.router = MoERouter(self.bert_hidden_size, len(self.experts))
        
#         # 为每个任务创建专家输出层
#         self.expert_outputs = nn.ModuleList([
#             nn.ModuleDict({
#                 expert_name: nn.Linear(expert_hidden_size, self.bert_hidden_size)
#                 for expert_name in self.experts.keys()
#             }) for _ in range(num_tasks)
#         ])
        
#         # 为每个任务创建融合层
#         combined_dim = self.bert_hidden_size
#         self.final_classifiers = nn.ModuleList([
#             nn.Sequential(
#                 # nn.Linear(combined_dim, combined_dim // 2),
#                 nn.LayerNorm(combined_dim),
#                 nn.Dropout(dropout),
#                 nn.ReLU(),
#                 nn.Linear(combined_dim, num_classes)
#             ) for _ in range(num_tasks)
#         ])
        
#         # 创建可学习的损失权重参数
#         # 初始化Track 4 (Actionability)的权重略大，表示更重视这个任务
#         # self.loss_weights = nn.Parameter(torch.ones(num_tasks))
#         # # 设置初始权重，使Track 4的权重初始值更大
#         # with torch.no_grad():
#         #     # 假设Track 4是索引3
#         #     self.loss_weights[3] = 1.5  # 给Track 4一个更高的初始权重
        
#         # 方法1：使用固定的四个权重值
#         self.loss_weights = nn.Parameter(torch.tensor([0.1, 0.1, 0.2, 0.6]))


#     def forward(self, input_ids, attention_mask):
#         # DeBERTa 编码
#         outputs = self.bert(
#             input_ids=input_ids,
#             attention_mask=attention_mask
#         )
        
#         # 获取序列隐藏状态
#         hidden_states = outputs.last_hidden_state  # [batch_size, seq_len, hidden_size]
        
#         # 获取路由权重
#         cls_embedding = hidden_states[:, 0]  # [batch_size, hidden_size]
#         routing_weights = self.router(cls_embedding)  # [batch_size, num_experts]
        
#         # 处理每个任务
#         final_logits_list = []
        
#         for task_idx in range(self.num_tasks):
        
#             # 在每个任务循环内部重置专家输出列表
#             expert_logits_list = []
        
#             for expert_name, expert in self.experts.items():
#                 # 获取专家输出
#                 expert_hidden = expert(hidden_states)  # [batch_size, expert_hidden_size]
#                 # 映射到类别空间
#                 expert_logits = self.expert_outputs[task_idx][expert_name](expert_hidden)  # [batch_size, num_classes]
#                 # 添加到列表
#                 expert_logits_list.append(expert_logits)
            
#             stack_logits = torch.stack(expert_logits_list, dim=1)  # [b, expert_num, h]
#             moe_logits = torch.sum(routing_weights.unsqueeze(-1) * stack_logits, dim=1)
            
#             # 通过最终分类器输出当前任务的最终结果
#             task_final_logits = self.final_classifiers[task_idx](moe_logits)
            
#             final_logits_list.append(task_final_logits)
        
#         # 返回所有任务的预测结果以及损失权重
#         return {
#             'logits': final_logits_list,  # 每个任务的预测结果列表
#             'loss_weights': F.softmax(self.loss_weights, dim=0)  # 归一化的损失权重
#         }

#     def get_normalized_loss_weights(self):
#         """获取归一化后的损失权重"""
#         return F.softmax(self.loss_weights, dim=0)

# 训练参数设置

In [6]:
import os
import wandb
import random
import argparse
from tqdm import tqdm

import torch
import torch.nn as nn
import numpy as np
from transformers import AdamW
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# 如果在Jupyter Notebook中运行，可以使用这个自定义参数函数替代argparser
def get_default_configs():
    """Return the default run configuration as a plain attribute object.

    Intended for use inside Jupyter, where parsing ``sys.argv`` with argparse
    fails. ``device`` is taken from the notebook-level ``device`` variable at
    call time.
    """
    class Args:
        def __init__(self):
            # Other checkpoints that were tried for `model_name`:
            #   /mnt/cfs/huangzhiwei/pykt-moekt/SBM/bge-large-en-v1.5
            #   /mnt/cfs/huangzhiwei/BAE2025/models/ModernBERT-large
            #   /mnt/cfs/huangzhiwei/pykt-moekt/SBM/xlm-roberta-large
            #   /mnt/cfs/huangzhiwei/BAE2025/models/bge-base-en-v1.5
            #   /mnt/cfs/huangzhiwei/BAE2025/models/bert-base-uncased
            #   /mnt/cfs/huangzhiwei/BAE2025/models/roberta-base
            defaults = {
                'model_name': '/mnt/cfs/huangzhiwei/BAE2025/models/deberta-v3-base',
                'num_classes': 3,
                'dropout': 0.25,
                'freeze_pooler': 8,
                'batch_size': 16,
                'max_length': 512,
                'lr': 2e-5,
                'epochs': 50,
                'device': device,
                'name': None,
                'seed': 42,
                'data_path': '../data_new/all.json',
                'val_data_path': '../data_new/val.json',
                'checkpoint_dir': '/mnt/cfs/huangzhiwei/BAE2025/projects/predict/1234',
                'patience': 8,
                'expert_hidden_size': 512,
                'num_rnn_layers': 1,
                'warmup_ratio': 0.1,
                'num_tasks': 4,  # number of classification tasks
                'exp_name': 'BAE2025_track4_bert',
            }
            for option, value in defaults.items():
                setattr(self, option, value)
    return Args()


# 训练函数

In [7]:
def _save_confusion_matrix(true_labels, pred_labels, class_names, title, out_path):
    """Render a confusion-matrix heatmap and write it to ``out_path``.

    Args:
        true_labels: Sequence of ground-truth class ids.
        pred_labels: Sequence of predicted class ids (same length).
        class_names: Display names; class ids ``0 .. len(class_names) - 1``
            define the rows/columns of the matrix.
        title: Figure title.
        out_path: Destination image path passed to ``plt.savefig``.
    """
    # Plotting libraries are imported lazily; only the training cell needs them.
    from sklearn.metrics import confusion_matrix
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Pin the label set so the matrix keeps its shape even if a class is absent.
    cm_full = confusion_matrix(
        true_labels, pred_labels, labels=list(range(len(class_names)))
    )
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm_full, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(title)
    plt.savefig(out_path)
    plt.close()  # one figure is created per task per epoch; release it


def _evaluate(model, dataloader, criterion, num_tasks):
    """Run a gradient-free pass over ``dataloader`` and collect per-task metrics.

    Args:
        model: Callable as ``model(input_ids, attention_mask)`` returning a
            dict with ``'logits'`` (one entry per task) and ``'loss_weights'``.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` batches
            where ``labels[:, task_idx]`` holds the targets of one task.
        criterion: Loss applied to each task's logits and targets.
        num_tasks: Number of tasks to evaluate.

    Returns:
        Dict with the weighted mean ``'loss'`` plus the per-task lists
        ``'task_losses'``, ``'task_accs'``, ``'task_f1s'`` (macro F1),
        ``'task_preds'`` and ``'task_labels'``.

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    from sklearn.metrics import f1_score

    model.eval()
    total_loss = 0.0
    task_losses = [0.0] * num_tasks
    task_corrects = [0.0] * num_tasks
    task_preds = [[] for _ in range(num_tasks)]
    task_labels = [[] for _ in range(num_tasks)]
    num_batches = 0

    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            outputs = model(input_ids, attention_mask)
            logits_list = outputs['logits']
            loss_weights = outputs['loss_weights']

            batch_losses = []
            for task_idx in range(num_tasks):
                targets = labels[:, task_idx].long()
                logits = logits_list[task_idx]

                task_loss = criterion(logits, targets)
                batch_losses.append(task_loss)
                task_losses[task_idx] += task_loss.item()

                preds = logits.argmax(dim=1)
                task_corrects[task_idx] += (preds == targets).float().sum().item()

                task_preds[task_idx].extend(preds.cpu().numpy())
                task_labels[task_idx].extend(targets.cpu().numpy())

            # Same weighting as the training objective.
            total_loss += sum(
                loss * weight for loss, weight in zip(batch_losses, loss_weights)
            ).item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError('Validation dataloader produced no batches.')

    return {
        'loss': total_loss / num_batches,
        'task_losses': [loss / num_batches for loss in task_losses],
        # Sample-level accuracy: correct predictions over samples actually seen.
        'task_accs': [
            correct / len(seen) for correct, seen in zip(task_corrects, task_labels)
        ],
        'task_f1s': [
            f1_score(seen, preds, average='macro')
            for seen, preds in zip(task_labels, task_preds)
        ],
        'task_preds': task_preds,
        'task_labels': task_labels,
    }


def train(configs):
    """Train the multi-task ``DeBERTaMoEClassifier`` and write checkpoints and plots.

    Per epoch the function prints the weighted loss and per-task loss/accuracy/
    macro-F1, saves one confusion-matrix image per task under
    ``<checkpoint_dir>/<exp_name>/plots/<task>/train`` and stores the loss
    weights returned by the model as ``loss_weights_epoch_<n>.npy``.

    Two stopping modes are supported:

    * Default (``configs.validate`` missing or False): train for a fixed number
      of epochs and save the final weights as ``best_model_f1.pt``. The epoch
      count is ``min(configs.epochs, configs.stop_epoch)`` with ``stop_epoch``
      defaulting to 23. The learning-rate schedule is still laid out over
      ``configs.epochs`` epochs.
    * ``configs.validate = True``: evaluate on the validation set after every
      epoch, save ``best_model_f1.pt`` whenever the macro-F1 of the primary
      task (Track 4 when four tasks are configured) improves, and stop early
      after ``configs.patience`` epochs without improvement.

    Args:
        configs: Object exposing ``seed``, ``checkpoint_dir``, ``exp_name``,
            ``data_path``, ``val_data_path``, ``batch_size``, ``max_length``,
            ``device``, ``model_name``, ``num_classes``, ``freeze_pooler``,
            ``num_rnn_layers``, ``expert_hidden_size``, ``dropout``,
            ``num_tasks``, ``lr`` and ``epochs``. Optional attributes:
            ``warmup_ratio`` (default 0.1), ``stop_epoch`` (default 23),
            ``validate`` (default False) and, in validation mode, ``patience``.

    Raises:
        ValueError: If the training dataloader contains no batches.
    """
    from transformers import get_cosine_schedule_with_warmup
    from sklearn.metrics import f1_score

    # Seed every RNG in use and force deterministic cuDNN kernels.
    random.seed(configs.seed)
    np.random.seed(configs.seed)
    torch.manual_seed(configs.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    run_validation = getattr(configs, 'validate', False)
    # Clamp to configs.epochs so short runs still reach the save step below.
    final_epoch = min(configs.epochs, getattr(configs, 'stop_epoch', 23))

    # Checkpoint directory for this experiment.
    checkpoint_dir = os.path.join(configs.checkpoint_dir, configs.exp_name)
    os.makedirs(checkpoint_dir, exist_ok=True)

    # One train/val plot directory per task; task names follow num_tasks so
    # reporting can never index past the per-task accumulators.
    task_names = [f'track{idx + 1}' for idx in range(configs.num_tasks)]
    plot_dirs = {}
    for task_name in task_names:
        plot_dirs[task_name] = {}
        for split in ('train', 'val'):
            split_dir = os.path.join(checkpoint_dir, 'plots', task_name, split)
            os.makedirs(split_dir, exist_ok=True)
            plot_dirs[task_name][split] = split_dir

    # Datasets and loaders.
    train_dataset = BAE2025Dataset(configs.data_path)
    val_dataset = BAE2025Dataset(configs.val_data_path)

    train_dataloader = BAE2025DataLoader(
        dataset=train_dataset,
        batch_size=configs.batch_size,
        max_length=configs.max_length,
        shuffle=True,
        drop_last=True,
        device=configs.device,
        tokenizer_name=configs.model_name
    )
    num_train_batches = len(train_dataloader)
    if num_train_batches == 0:
        # Happens e.g. when the dataset is smaller than batch_size (drop_last=True).
        raise ValueError(
            f'Training dataloader is empty: {len(train_dataset)} samples with '
            f'batch_size={configs.batch_size} and drop_last=True.'
        )

    val_dataloader = BAE2025DataLoader(
        dataset=val_dataset,
        batch_size=configs.batch_size,
        max_length=configs.max_length,
        shuffle=False,
        drop_last=False,
        device=configs.device,
        tokenizer_name=configs.model_name
    )

    # Multi-task model.
    model = DeBERTaMoEClassifier(
        pretrained_model_name=configs.model_name,
        num_classes=configs.num_classes,
        freeze_pooler=configs.freeze_pooler,
        num_rnn_layers=configs.num_rnn_layers,
        expert_hidden_size=configs.expert_hidden_size,
        dropout=configs.dropout,
        num_tasks=configs.num_tasks
    ).to(configs.device)

    # The same cross-entropy criterion is applied to every task head.
    criterion = nn.CrossEntropyLoss()

    # Optimise only the parameters that are not frozen.
    optimizer = AdamW(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=configs.lr
    )

    # Linear warm-up followed by cosine decay, stepped once per batch.
    total_steps = num_train_batches * configs.epochs
    warmup_ratio = getattr(configs, 'warmup_ratio', 0.1)
    warmup_steps = int(warmup_ratio * total_steps)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    # Model-selection state (only used in validation mode).
    best_val_acc = 0.0
    best_val_f1 = 0.0
    patience_counter = 0
    # Track 4 (index 3) drives model selection when it exists.
    primary_idx = min(3, configs.num_tasks - 1)

    class_names = ['Yes', 'To some extent', 'No']

    for epoch in range(configs.epochs):
        # ---- training pass ----
        model.train()
        train_loss = 0.0
        train_task_losses = [0.0] * configs.num_tasks
        train_task_accs = [0.0] * configs.num_tasks
        train_task_preds = [[] for _ in range(configs.num_tasks)]
        train_task_labels = [[] for _ in range(configs.num_tasks)]

        with tqdm(
            train_dataloader,
            total=num_train_batches,
            desc=f'Epoch {epoch + 1}/{configs.epochs}',
            unit='batch',
            ncols=120
        ) as pbar:
            for input_ids, attention_mask, labels in pbar:
                optimizer.zero_grad()

                outputs = model(input_ids, attention_mask)
                logits_list = outputs['logits']         # one logits tensor per task
                loss_weights = outputs['loss_weights']  # per-task loss weights from the model

                batch_losses = []
                for task_idx in range(configs.num_tasks):
                    task_labels = labels[:, task_idx].long()
                    task_loss = criterion(logits_list[task_idx], task_labels)
                    batch_losses.append(task_loss)
                    train_task_losses[task_idx] += task_loss.item()

                    task_preds = logits_list[task_idx].argmax(dim=1)
                    task_acc = (task_preds == task_labels).float().mean()
                    train_task_accs[task_idx] += task_acc.item()

                    train_task_preds[task_idx].extend(task_preds.cpu().numpy())
                    train_task_labels[task_idx].extend(task_labels.cpu().numpy())

                # Objective: sum of the task losses scaled by their weights.
                final_loss = sum(
                    loss * weight for loss, weight in zip(batch_losses, loss_weights)
                )

                final_loss.backward()
                optimizer.step()
                scheduler.step()

                train_loss += final_loss.item()

                curr_lr = scheduler.get_last_lr()[0]
                curr_weights = loss_weights.detach().cpu().numpy()
                weights_str = " ".join([f"w{i+1}:{w:.2f}" for i, w in enumerate(curr_weights)])

                pbar.set_postfix(
                    loss=f'{final_loss.item():.3f}',
                    lr=f'{curr_lr:.6f}',
                    weights=weights_str
                )

        # ---- epoch summary ----
        # Batch-averaged metrics; batches are equally sized because drop_last=True.
        train_loss = train_loss / num_train_batches
        train_task_losses = [loss / num_train_batches for loss in train_task_losses]
        train_task_accs = [acc / num_train_batches for acc in train_task_accs]
        train_task_f1s = [
            f1_score(seen, preds, average='macro')
            for seen, preds in zip(train_task_labels, train_task_preds)
        ]

        print(f'Training Loss: {train_loss:.4f}')
        for task_idx, task_name in enumerate(task_names):
            print(f'Task {task_name} - Loss: {train_task_losses[task_idx]:.4f}, '
                  f'Acc: {train_task_accs[task_idx]:.4f}, '
                  f'F1: {train_task_f1s[task_idx]:.4f}')

        # Loss weights as returned for the last batch of the epoch.
        epoch_weights = loss_weights.detach().cpu().numpy()
        print(f'Current Loss Weights: {epoch_weights}')

        for task_idx, task_name in enumerate(task_names):
            _save_confusion_matrix(
                train_task_labels[task_idx],
                train_task_preds[task_idx],
                class_names,
                title=(f'Train: {task_name} Confusion Matrix\n'
                       f'Acc: {train_task_accs[task_idx]:.4f}, '
                       f'F1: {train_task_f1s[task_idx]:.4f}'),
                out_path=os.path.join(
                    plot_dirs[task_name]['train'], f'cm_full_epoch_{epoch+1}.png'
                ),
            )

        np.save(
            os.path.join(checkpoint_dir, f'loss_weights_epoch_{epoch+1}.npy'),
            epoch_weights
        )

        if run_validation:
            # ---- validation pass with early stopping ----
            val = _evaluate(model, val_dataloader, criterion, configs.num_tasks)

            print(f'Validation Loss: {val["loss"]:.4f}')
            for task_idx, task_name in enumerate(task_names):
                print(f'Task {task_name} - Loss: {val["task_losses"][task_idx]:.4f}, '
                      f'Acc: {val["task_accs"][task_idx]:.4f}, '
                      f'F1: {val["task_f1s"][task_idx]:.4f}')

            for task_idx, task_name in enumerate(task_names):
                _save_confusion_matrix(
                    val['task_labels'][task_idx],
                    val['task_preds'][task_idx],
                    class_names,
                    title=(f'Val: {task_name} Confusion Matrix\n'
                           f'Acc: {val["task_accs"][task_idx]:.4f}, '
                           f'F1: {val["task_f1s"][task_idx]:.4f}'),
                    out_path=os.path.join(
                        plot_dirs[task_name]['val'], f'cm_full_epoch_{epoch+1}.png'
                    ),
                )

            if val['task_f1s'][primary_idx] > best_val_f1:
                best_val_f1 = val['task_f1s'][primary_idx]
                best_val_acc = val['task_accs'][primary_idx]
                torch.save(model.state_dict(), os.path.join(checkpoint_dir, 'best_model_f1.pt'))
                print(f'New best model saved with {task_names[primary_idx]} '
                      f'F1: {best_val_f1:.4f}, Acc: {best_val_acc:.4f}')
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= configs.patience:
                    print(f'Early stopping triggered after {epoch+1} epochs.')
                    break
        elif epoch + 1 >= final_epoch:
            # Fixed-length run: persist the final weights and stop. The file
            # name is kept so downstream cells that load it keep working.
            torch.save(model.state_dict(), os.path.join(checkpoint_dir, 'best_model_f1.pt'))
            break
        
# Entry point: pick the configuration source depending on whether the code
# runs inside a Jupyter kernel or from the command line.
if __name__ == '__main__':
    # Guard only the environment probe. A bare ``except`` around the whole
    # selection would also swallow SystemExit/KeyboardInterrupt (re-running
    # argparse after a usage error) and hide real failures raised while
    # building the default configs.
    try:
        # Use a separate name so the global ``get_ipython`` is not rebound.
        ipython_getter = globals().get('get_ipython', None)
        in_jupyter = bool(
            ipython_getter and 'IPKernelApp' in ipython_getter().config
        )
    except Exception:
        # Any probe failure means "not Jupyter": fall back to argparse.
        in_jupyter = False

    if in_jupyter:
        # Jupyter passes its own argv, so use the hand-written defaults.
        print("Running in Jupyter environment, using default configs")
        configs = get_default_configs()
    else:
        # Command-line run: parse arguments.
        configs = argparser()

    # Experiment name: derived from the hyper-parameters unless given explicitly.
    if configs.name is None:
        configs.exp_name = \
            f'{os.path.basename(configs.model_name)}' + \
            f'{"_fp" if configs.freeze_pooler else ""}' + \
            f'_b{configs.batch_size}_e{configs.epochs}' + \
            f'_len{configs.max_length}_lr{configs.lr}'
    else:
        configs.exp_name = configs.name

    # Device: default to CUDA when available.
    if configs.device is None:
        configs.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu'
        )

    # Run training.
    train(configs)

Running in Jupyter environment, using default configs




当前使用的 tokenizer 类型： <class 'transformers.models.deberta_v2.tokenization_deberta_v2_fast.DebertaV2TokenizerFast'>
当前使用的 tokenizer 类型： <class 'transformers.models.deberta_v2.tokenization_deberta_v2_fast.DebertaV2TokenizerFast'>


Epoch 1/50: 100%|█| 154/154 [01:28<00:00,  1.74batch/s, loss=0.943, lr=0.000004, weights=w1:0.21 w2:0.21 w3:0.21 w4:0.38


Training Loss: 0.9897
Task track1 - Loss: 0.8796, Acc: 0.6335, F1: 0.2966
Task track2 - Loss: 1.0064, Acc: 0.5503, F1: 0.3337
Task track3 - Loss: 1.0269, Acc: 0.5495, F1: 0.2627
Task track4 - Loss: 1.0206, Acc: 0.4529, F1: 0.3125
Current Loss Weights: [0.20745215 0.20736727 0.20735952 0.37782112]


Epoch 2/50: 100%|█| 154/154 [01:29<00:00,  1.72batch/s, loss=0.730, lr=0.000008, weights=w1:0.21 w2:0.21 w3:0.21 w4:0.38


Training Loss: 0.9303
Task track1 - Loss: 0.7400, Acc: 0.7358, F1: 0.3024
Task track2 - Loss: 0.9328, Acc: 0.6031, F1: 0.2930
Task track3 - Loss: 0.9954, Acc: 0.5686, F1: 0.2456
Task track4 - Loss: 0.9979, Acc: 0.5004, F1: 0.3136
Current Loss Weights: [0.20768104 0.20739464 0.20728901 0.37763524]


Epoch 3/50: 100%|█| 154/154 [01:29<00:00,  1.73batch/s, loss=0.822, lr=0.000012, weights=w1:0.21 w2:0.21 w3:0.21 w4:0.38


Training Loss: 0.9247
Task track1 - Loss: 0.7273, Acc: 0.7447, F1: 0.3174
Task track2 - Loss: 0.9354, Acc: 0.5933, F1: 0.2941
Task track3 - Loss: 0.9916, Acc: 0.5649, F1: 0.2519
Task track4 - Loss: 0.9907, Acc: 0.5073, F1: 0.3062
Current Loss Weights: [0.2080429  0.2074159  0.20715913 0.37738204]


Epoch 4/50: 100%|█| 154/154 [01:28<00:00,  1.73batch/s, loss=0.871, lr=0.000016, weights=w1:0.21 w2:0.21 w3:0.21 w4:0.38


Training Loss: 0.9001
Task track1 - Loss: 0.7091, Acc: 0.7386, F1: 0.3130
Task track2 - Loss: 0.8937, Acc: 0.6185, F1: 0.3137
Task track3 - Loss: 0.9771, Acc: 0.5686, F1: 0.2572
Task track4 - Loss: 0.9668, Acc: 0.5357, F1: 0.3476
Current Loss Weights: [0.20851013 0.20753069 0.20694102 0.37701815]


Epoch 5/50: 100%|█| 154/154 [01:28<00:00,  1.73batch/s, loss=0.896, lr=0.000020, weights=w1:0.21 w2:0.21 w3:0.21 w4:0.38


Training Loss: 0.8551
Task track1 - Loss: 0.6585, Acc: 0.7719, F1: 0.3492
Task track2 - Loss: 0.8583, Acc: 0.6627, F1: 0.4051
Task track3 - Loss: 0.9363, Acc: 0.5942, F1: 0.3481
Task track4 - Loss: 0.9176, Acc: 0.6185, F1: 0.4370
Current Loss Weights: [0.20914547 0.20762795 0.2066485  0.37657815]


Epoch 6/50: 100%|█| 154/154 [01:28<00:00,  1.74batch/s, loss=0.843, lr=0.000020, weights=w1:0.21 w2:0.21 w3:0.21 w4:0.38


Training Loss: 0.8256
Task track1 - Loss: 0.6208, Acc: 0.8263, F1: 0.5066
Task track2 - Loss: 0.8426, Acc: 0.6859, F1: 0.4308
Task track3 - Loss: 0.9210, Acc: 0.6177, F1: 0.3973
Task track4 - Loss: 0.8780, Acc: 0.6757, F1: 0.4795
Current Loss Weights: [0.2098579  0.20762862 0.20626982 0.3762437 ]


Epoch 7/50: 100%|█| 154/154 [01:27<00:00,  1.76batch/s, loss=0.835, lr=0.000020, weights=w1:0.21 w2:0.21 w3:0.21 w4:0.38


Training Loss: 0.8034
Task track1 - Loss: 0.5986, Acc: 0.8344, F1: 0.5298
Task track2 - Loss: 0.8255, Acc: 0.7074, F1: 0.4538
Task track3 - Loss: 0.9096, Acc: 0.6262, F1: 0.4181
Task track4 - Loss: 0.8474, Acc: 0.6660, F1: 0.4751
Current Loss Weights: [0.2105682  0.20758832 0.20583075 0.37601277]


Epoch 8/50: 100%|█| 154/154 [01:26<00:00,  1.78batch/s, loss=0.919, lr=0.000020, weights=w1:0.21 w2:0.21 w3:0.21 w4:0.38


Training Loss: 0.7866
Task track1 - Loss: 0.5919, Acc: 0.8320, F1: 0.5260
Task track2 - Loss: 0.8110, Acc: 0.7127, F1: 0.4594
Task track3 - Loss: 0.8964, Acc: 0.6299, F1: 0.4369
Task track4 - Loss: 0.8221, Acc: 0.6968, F1: 0.4977
Current Loss Weights: [0.21123397 0.20755833 0.20542131 0.37578642]


Epoch 9/50: 100%|█| 154/154 [01:27<00:00,  1.77batch/s, loss=0.787, lr=0.000020, weights=w1:0.21 w2:0.21 w3:0.20 w4:0.38


Training Loss: 0.7686
Task track1 - Loss: 0.5770, Acc: 0.8283, F1: 0.5169
Task track2 - Loss: 0.7990, Acc: 0.7212, F1: 0.4699
Task track3 - Loss: 0.8833, Acc: 0.6408, F1: 0.4585
Task track4 - Loss: 0.7970, Acc: 0.7078, F1: 0.5070
Current Loss Weights: [0.2118851  0.20745765 0.20496753 0.37568974]


Epoch 10/50: 100%|█| 154/154 [01:26<00:00,  1.79batch/s, loss=0.923, lr=0.000019, weights=w1:0.21 w2:0.21 w3:0.20 w4:0.3


Training Loss: 0.7546
Task track1 - Loss: 0.5714, Acc: 0.8401, F1: 0.5278
Task track2 - Loss: 0.7820, Acc: 0.7224, F1: 0.4701
Task track3 - Loss: 0.8746, Acc: 0.6575, F1: 0.5173
Task track4 - Loss: 0.7776, Acc: 0.7382, F1: 0.5311
Current Loss Weights: [0.21250169 0.2073601  0.20454463 0.3755935 ]


Epoch 11/50: 100%|█| 154/154 [01:26<00:00,  1.79batch/s, loss=0.660, lr=0.000019, weights=w1:0.21 w2:0.21 w3:0.20 w4:0.3


Training Loss: 0.7401
Task track1 - Loss: 0.5542, Acc: 0.8417, F1: 0.5359
Task track2 - Loss: 0.7774, Acc: 0.7297, F1: 0.4790
Task track3 - Loss: 0.8585, Acc: 0.6696, F1: 0.5353
Task track4 - Loss: 0.7604, Acc: 0.7350, F1: 0.5280
Current Loss Weights: [0.21310651 0.20721084 0.20412597 0.37555668]


Epoch 12/50: 100%|█| 154/154 [01:26<00:00,  1.79batch/s, loss=0.662, lr=0.000019, weights=w1:0.21 w2:0.21 w3:0.20 w4:0.3


Training Loss: 0.7280
Task track1 - Loss: 0.5528, Acc: 0.8356, F1: 0.5455
Task track2 - Loss: 0.7538, Acc: 0.7451, F1: 0.4960
Task track3 - Loss: 0.8493, Acc: 0.6769, F1: 0.5504
Task track4 - Loss: 0.7475, Acc: 0.7488, F1: 0.5478
Current Loss Weights: [0.21368396 0.20713124 0.20368913 0.37549567]


Epoch 13/50: 100%|█| 154/154 [01:26<00:00,  1.79batch/s, loss=0.755, lr=0.000018, weights=w1:0.21 w2:0.21 w3:0.20 w4:0.3


Training Loss: 0.7043
Task track1 - Loss: 0.5339, Acc: 0.8429, F1: 0.5510
Task track2 - Loss: 0.7271, Acc: 0.7626, F1: 0.5119
Task track3 - Loss: 0.8269, Acc: 0.6924, F1: 0.5714
Task track4 - Loss: 0.7224, Acc: 0.7654, F1: 0.5685
Current Loss Weights: [0.2142418  0.20705909 0.20331177 0.37538725]


Epoch 14/50: 100%|█| 154/154 [01:26<00:00,  1.78batch/s, loss=0.590, lr=0.000018, weights=w1:0.21 w2:0.21 w3:0.20 w4:0.3


Training Loss: 0.6901
Task track1 - Loss: 0.5166, Acc: 0.8523, F1: 0.5648
Task track2 - Loss: 0.7077, Acc: 0.7739, F1: 0.5234
Task track3 - Loss: 0.8085, Acc: 0.7001, F1: 0.5865
Task track4 - Loss: 0.7156, Acc: 0.7654, F1: 0.5991
Current Loss Weights: [0.21479398 0.20704433 0.20291424 0.37524745]


Epoch 15/50: 100%|█| 154/154 [01:26<00:00,  1.79batch/s, loss=0.547, lr=0.000018, weights=w1:0.22 w2:0.21 w3:0.20 w4:0.3


Training Loss: 0.6725
Task track1 - Loss: 0.5105, Acc: 0.8575, F1: 0.5824
Task track2 - Loss: 0.6971, Acc: 0.7886, F1: 0.5355
Task track3 - Loss: 0.7886, Acc: 0.7204, F1: 0.6172
Task track4 - Loss: 0.6891, Acc: 0.7841, F1: 0.6284
Current Loss Weights: [0.21531011 0.20696661 0.20255445 0.37516883]


Epoch 16/50: 100%|█| 154/154 [01:26<00:00,  1.79batch/s, loss=0.698, lr=0.000017, weights=w1:0.22 w2:0.21 w3:0.20 w4:0.3


Training Loss: 0.6584
Task track1 - Loss: 0.5142, Acc: 0.8543, F1: 0.5672
Task track2 - Loss: 0.6834, Acc: 0.7877, F1: 0.5319
Task track3 - Loss: 0.7659, Acc: 0.7273, F1: 0.6257
Task track4 - Loss: 0.6694, Acc: 0.8003, F1: 0.6488
Current Loss Weights: [0.2157625  0.20687877 0.20221183 0.37514693]


Epoch 17/50: 100%|█| 154/154 [01:26<00:00,  1.79batch/s, loss=0.717, lr=0.000017, weights=w1:0.22 w2:0.21 w3:0.20 w4:0.3


Training Loss: 0.6427
Task track1 - Loss: 0.5001, Acc: 0.8547, F1: 0.5557
Task track2 - Loss: 0.6635, Acc: 0.8072, F1: 0.5492
Task track3 - Loss: 0.7578, Acc: 0.7435, F1: 0.6562
Task track4 - Loss: 0.6513, Acc: 0.8101, F1: 0.6727
Current Loss Weights: [0.2161974  0.20681542 0.20185505 0.37513214]


Epoch 18/50: 100%|█| 154/154 [01:26<00:00,  1.78batch/s, loss=0.574, lr=0.000016, weights=w1:0.22 w2:0.21 w3:0.20 w4:0.3


Training Loss: 0.6215
Task track1 - Loss: 0.4773, Acc: 0.8653, F1: 0.5785
Task track2 - Loss: 0.6389, Acc: 0.8239, F1: 0.5619
Task track3 - Loss: 0.7470, Acc: 0.7472, F1: 0.6625
Task track4 - Loss: 0.6275, Acc: 0.8231, F1: 0.6917
Current Loss Weights: [0.21662502 0.2067537  0.20146444 0.37515685]


Epoch 19/50: 100%|█| 154/154 [01:26<00:00,  1.79batch/s, loss=0.619, lr=0.000016, weights=w1:0.22 w2:0.21 w3:0.20 w4:0.3


Training Loss: 0.6099
Task track1 - Loss: 0.4810, Acc: 0.8567, F1: 0.5722
Task track2 - Loss: 0.6319, Acc: 0.8186, F1: 0.5580
Task track3 - Loss: 0.7163, Acc: 0.7707, F1: 0.6983
Task track4 - Loss: 0.6151, Acc: 0.8348, F1: 0.7102
Current Loss Weights: [0.21700901 0.20668548 0.20115222 0.37515324]


Epoch 20/50: 100%|█| 154/154 [01:26<00:00,  1.79batch/s, loss=0.568, lr=0.000015, weights=w1:0.22 w2:0.21 w3:0.20 w4:0.3


Training Loss: 0.6000
Task track1 - Loss: 0.4748, Acc: 0.8649, F1: 0.5820
Task track2 - Loss: 0.6130, Acc: 0.8304, F1: 0.5668
Task track3 - Loss: 0.7130, Acc: 0.7768, F1: 0.7099
Task track4 - Loss: 0.6047, Acc: 0.8377, F1: 0.7202
Current Loss Weights: [0.21737087 0.20664188 0.20082982 0.37515748]


Epoch 21/50: 100%|█| 154/154 [01:26<00:00,  1.79batch/s, loss=0.477, lr=0.000014, weights=w1:0.22 w2:0.21 w3:0.20 w4:0.3


Training Loss: 0.5903
Task track1 - Loss: 0.4730, Acc: 0.8608, F1: 0.5678
Task track2 - Loss: 0.6102, Acc: 0.8332, F1: 0.5683
Task track3 - Loss: 0.6924, Acc: 0.7886, F1: 0.7238
Task track4 - Loss: 0.5927, Acc: 0.8429, F1: 0.7397
Current Loss Weights: [0.2177144  0.20658536 0.20053881 0.37516144]


Epoch 22/50: 100%|█| 154/154 [01:26<00:00,  1.79batch/s, loss=0.578, lr=0.000014, weights=w1:0.22 w2:0.21 w3:0.20 w4:0.3


Training Loss: 0.5798
Task track1 - Loss: 0.4624, Acc: 0.8689, F1: 0.5898
Task track2 - Loss: 0.6080, Acc: 0.8304, F1: 0.5662
Task track3 - Loss: 0.7067, Acc: 0.7723, F1: 0.7017
Task track4 - Loss: 0.5648, Acc: 0.8486, F1: 0.7399
Current Loss Weights: [0.21801776 0.20647296 0.20018995 0.37531927]


Epoch 23/50: 100%|█| 154/154 [01:26<00:00,  1.79batch/s, loss=0.491, lr=0.000013, weights=w1:0.22 w2:0.21 w3:0.20 w4:0.3


Training Loss: 0.5592
Task track1 - Loss: 0.4543, Acc: 0.8689, F1: 0.6055
Task track2 - Loss: 0.5946, Acc: 0.8397, F1: 0.5736
Task track3 - Loss: 0.6788, Acc: 0.7930, F1: 0.7301
Task track4 - Loss: 0.5369, Acc: 0.8620, F1: 0.7664
Current Loss Weights: [0.21828558 0.20632422 0.19985993 0.37553027]
