In [1]:
import os
# 设置环境变量，只让程序看到 GPU 2
os.environ['CUDA_VISIBLE_DEVICES'] = '3'


import torch
import torch.nn as nn
import wandb
import random
import argparse
import numpy as np
from tqdm import tqdm
from transformers import BertModel, AutoModel
from transformers import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


# 数据预处理函数

In [2]:
from torch.utils.data import Dataset
import json

class BAE2025Dataset(Dataset):
    def __init__(
            self,
            data_path,
            label_type="Actionability",  # 根据需要可以是 "Mistake_Identification", "Mistake_Location", "Providing_Guidance", "Actionability"
            labels={
                "Yes": 0,
                "To some extent": 1, 
                "No": 2,
            }
    ):
        self.data_path = data_path
        self.label_type = label_type
        self.labels = labels
        self._get_data()
    
    def _get_data(self):
        with open(self.data_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        self.data = []
        for item in data:
            sent1 = item['conversation_history']
            sent2 = item['response']
            
            # 检查item中是否直接包含我们需要的标签
            if self.label_type in item and item[self.label_type] in self.labels:
                self.data.append(((sent1, sent2), self.labels[item[self.label_type]]))
    
    def __len__(self):
        return len(self.data)
    
    def get_labels(self):
        return self.labels

    def __getitem__(self, idx):
        return self.data[idx]

# 数据加载函数

In [3]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import AutoConfig
from transformers import DebertaV2Tokenizer

class BAE2025DataLoader:
    def __init__(
        self,
        dataset,
        batch_size=16,
        max_length=512,
        shuffle=True,
        drop_last=True,
        device=None,
        # tokenizer_name='chinese-bert-wwm-ext'
        # tokenizer_name='chinese-roberta-wwm-ext'
        # tokenizer_name='chinese-roberta-wwm-ext-large'
        # tokenizer_name='/mnt/cfs/huangzhiwei/pykt-moekt/SBM/bge-large-en-v1.5'
        tokenizer_name='/mnt/cfs/huangzhiwei/BAE2025/models/deberta-v3-base'
        # tokenizer_name='/mnt/cfs/huangzhiwei/BAE2025/models/roberta-base'
    ):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.tokenizer.truncation_side = 'left'  # 设置截断方向为左侧,即从句子开头开始截断,假设一个句子过长，则从句子开头开始截断，保留句子结尾的部分
        print("当前使用的 tokenizer 类型：", type(self.tokenizer))
        
        # config = AutoConfig.from_pretrained(tokenizer_name)
        # self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, config=config, use_fast=True)
        
        
        # self.tokenizer = DebertaV2Tokenizer.from_pretrained(tokenizer_name)
        
        self.dataset = dataset
        self.batch_size = batch_size
        self.max_length = max_length
        self.shuffle = shuffle
        self.drop_last = drop_last

        if device is None:
            self.device = torch.device(
                'cuda' if torch.cuda.is_available() else 'cpu'
            )
        else:
            self.device = device

        self.loader = DataLoader(
            dataset=self.dataset,
            batch_size=self.batch_size,
            collate_fn=self.collate_fn,
            shuffle=self.shuffle,
            drop_last=self.drop_last
        )

    def collate_fn(self, data):
        sents = [i[0] for i in data]
        labels = [i[1] for i in data]

        # 修改这里，处理两个句子的情况
        data = self.tokenizer.batch_encode_plus(
            batch_text_or_text_pairs=[(sent[0], sent[1]) for sent in sents],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
            return_length=True
        )
        input_ids = data['input_ids'].to(self.device)
        attention_mask = data['attention_mask'].to(self.device)
        # token_type_ids = data['token_type_ids'].to(self.device)
        labels = torch.LongTensor(labels).to(self.device)

        # return input_ids, attention_mask, token_type_ids, labels
        return input_ids, attention_mask, labels


    def __iter__(self):
        for data in self.loader:
            yield data

    def __len__(self):
        return len(self.loader)



# 模型代码

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel


class ExpertLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,
        )
        
    def forward(self, x):
        # x: [batch_size, seq_len, input_size]
        output, (hidden, _) = self.lstm(x)
        # 返回最后一个时间步的隐藏状态
        return hidden[-1]  # [batch_size, hidden_size]


class ExpertBiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
        super().__init__()
        self.bilstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size // 2,  # 因为是双向的，所以每个方向的隐藏层大小减半
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0,
        )
        
    def forward(self, x):
        # x: [batch_size, seq_len, input_size]
        output, (hidden, _) = self.bilstm(x)
        # 拼接最后一层的正向和反向隐藏状态
        # hidden shape: [num_layers * num_directions, batch_size, hidden_size//2]
        hidden_forward = hidden[-2]  # 正向的最后一层 [batch_size, hidden_size//2]
        hidden_backward = hidden[-1]  # 反向的最后一层 [batch_size, hidden_size//2]
        hidden_concat = torch.cat([hidden_forward, hidden_backward], dim=1)  # [batch_size, hidden_size]
        return hidden_concat


class ExpertRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,
        )
        
    def forward(self, x):
        # x: [batch_size, seq_len, input_size]
        _, hidden = self.rnn(x)
        # 返回最后一个时间步的隐藏状态
        return hidden[-1]  # [batch_size, hidden_size]


class ExpertGRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
        super().__init__()
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,
        )
        
    def forward(self, x):
        # x: [batch_size, seq_len, input_size]
        _, hidden = self.gru(x)
        # 返回最后一个时间步的隐藏状态
        return hidden[-1]  # [batch_size, hidden_size]


class ExpertLinear(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.linear = nn.Sequential(
            nn.Linear(input_size, hidden_size * 2),
            nn.LayerNorm(hidden_size * 2),
            nn.GELU(),
            nn.Linear(hidden_size * 2, hidden_size)
        )
        
    def forward(self, x):
        # x: [batch_size, seq_len, input_size]
        # 我们需要把序列信息压缩为一个向量，可以使用平均池化
        pooled = torch.mean(x, dim=1)  # [batch_size, input_size]
        return self.linear(pooled)  # [batch_size, hidden_size]


class BertClassificationHead(nn.Module):
    def __init__(self, hidden_size=1024, num_classes=3, dropout_prob=0.3):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.out_proj = nn.Linear(hidden_size, num_classes)
    
    def forward(self, features):
        # 提取 [CLS] 标记的表示
        x = features[:, 0, :]  # 使用第一个标记([CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class MoERouter(nn.Module):
    """专家路由器，学习为每个样本分配专家权重"""
    def __init__(self, input_size, num_experts):
        super().__init__()
        self.router = nn.Linear(input_size, num_experts)
        
    def forward(self, x):
        # x: [batch_size, input_size]
        # 计算每个专家的权重 (使用softmax确保权重和为1)
        router_logits = self.router(x)
        router_probs = F.softmax(router_logits, dim=-1)
        return router_probs  # [batch_size, num_experts]


class DeBERTaMoEClassifier(nn.Module):
    def __init__(
        self, 
        pretrained_model_name, 
        num_classes=3, 
        freeze_pooler=0,
        expert_hidden_size=256,
        dropout=0.3,
        num_rnn_layers=1
    ):
        super().__init__()
        
        # 使用 AutoModel 加载 DeBERTa 模型
        self.bert = AutoModel.from_pretrained(pretrained_model_name)
        
        # 获取 bert 隐藏层大小
        self.bert_hidden_size = self.bert.config.hidden_size
        
        # 保留原有的分类头
        self.original_classifier = BertClassificationHead(
            hidden_size=self.bert_hidden_size,
            num_classes=num_classes,
            dropout_prob=dropout
        )
        
        # 创建多个专家模型
        self.experts = nn.ModuleDict({
            'lstm': ExpertLSTM(
                input_size=self.bert_hidden_size, 
                hidden_size=expert_hidden_size,
                num_layers=num_rnn_layers,
                dropout=dropout
            ),
            'bilstm': ExpertBiLSTM(
                input_size=self.bert_hidden_size, 
                hidden_size=expert_hidden_size,
                num_layers=num_rnn_layers,
                dropout=dropout
            ),
            'rnn': ExpertRNN(
                input_size=self.bert_hidden_size, 
                hidden_size=expert_hidden_size,
                num_layers=num_rnn_layers,
                dropout=dropout
            ),
            'gru': ExpertGRU(
                input_size=self.bert_hidden_size, 
                hidden_size=expert_hidden_size,
                num_layers=num_rnn_layers,
                dropout=dropout
            ),
            'linear': ExpertLinear(
                input_size=self.bert_hidden_size, 
                hidden_size=expert_hidden_size
            ),
        })
        
        # 创建路由器 (使用[CLS]标记表示作为路由的输入)
        self.router = MoERouter(self.bert_hidden_size, len(self.experts))
        
        # 各专家模型的输出映射层，将各自的hidden_size映射到统一的输出空间
        self.expert_outputs = nn.ModuleDict({
            expert_name: nn.Linear(expert_hidden_size, num_classes)
            for expert_name in self.experts.keys()
        })
        
        # 最终的融合层，将所有结果拼接后映射到输出类别
        # (1个原始分类头 + 5个专家) * 每个输出num_classes = 6 * num_classes
        combined_dim = num_classes * (1 + len(self.experts))
        self.final_classifier = nn.Sequential(
            nn.Linear(combined_dim, combined_dim // 2),
            nn.LayerNorm(combined_dim // 2),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(combined_dim // 2, num_classes)
        )
        
    def forward(self, input_ids, attention_mask):
        # DeBERTa 编码
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
        # 获取序列隐藏状态
        hidden_states = outputs.last_hidden_state  # [batch_size, seq_len, hidden_size]
        
        # 获取原始分类头结果
        original_logits = self.original_classifier(hidden_states)  # [batch_size, num_classes]
        
        # 获取路由权重
        cls_embedding = hidden_states[:, 0]  # [batch_size, hidden_size]
        routing_weights = self.router(cls_embedding)  # [batch_size, num_experts]
        
        # 获取各专家结果
        expert_outputs = {}
        for expert_name, expert in self.experts.items():
            # 获取专家输出
            expert_hidden = expert(hidden_states)  # [batch_size, expert_hidden_size]
            # 映射到类别空间
            expert_logits = self.expert_outputs[expert_name](expert_hidden)  # [batch_size, num_classes]
            # 存储结果
            expert_outputs[expert_name] = expert_logits
        
        # 根据路由权重加权专家结果
        # 首先，将所有专家的结果拼接到一起
        expert_logits_list = [original_logits]  # 包含原始分类头
        expert_names = list(self.experts.keys())
        
        for expert_name in expert_names:
            expert_logits_list.append(expert_outputs[expert_name])
        
        # 拼接所有结果 [batch_size, (1+num_experts)*num_classes]
        combined_logits = torch.cat(expert_logits_list, dim=1)
        
        # 通过最终分类器输出最终结果
        final_logits = self.final_classifier(combined_logits)
        
        # return {
        #     'logits': final_logits,  # 最终预测
        #     'original_logits': original_logits,  # 原始分类头预测
        #     'expert_logits': expert_outputs,  # 各专家预测
        #     'routing_weights': routing_weights,  # 路由权重
        #     'combined_logits': combined_logits  # 拼接的中间结果
        # }
        
        return final_logits


# FGM

In [5]:
class FGM():
    def __init__(self, model):
        self.model = model
        self.backup = {}
        # 明确指定为word_embeddings
        self.emb_name = 'word_embeddings'
    
    def attack(self, epsilon=1.):
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)
    
    def restore(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}

# 训练参数设置

In [6]:
import os
import wandb
import random
import argparse
from tqdm import tqdm

import torch
import torch.nn as nn
import numpy as np
from transformers import AdamW
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# 如果在Jupyter Notebook中运行，可以使用这个自定义参数函数替代argparser
def get_default_configs():
    """在Jupyter环境中使用的默认配置，避免argparse解析错误"""
    class Args:
        def __init__(self):
            # self.model_name = '/mnt/cfs/huangzhiwei/pykt-moekt/SBM/bge-large-en-v1.5'
            # self.model_name = "/mnt/cfs/huangzhiwei/BAE2025/models/ModernBERT-large"
            # self.model_name = '/mnt/cfs/huangzhiwei/pykt-moekt/SBM/xlm-roberta-large'
            # self.model_name = '/mnt/cfs/huangzhiwei/BAE2025/models/bge-base-en-v1.5'
            # self.model_name = '/mnt/cfs/huangzhiwei/BAE2025/models/bert-base-uncased'
            self.model_name = '/mnt/cfs/huangzhiwei/BAE2025/models/deberta-v3-base'
            # self.model_name = '/mnt/cfs/huangzhiwei/BAE2025/models/roberta-base'
            self.num_classes = 3
            self.dropout = 0.1
            self.freeze_pooler = 8
            self.batch_size = 8
            self.max_length = 512
            self.lr = 3e-5
            self.epochs = 50
            self.device = device
            self.name = None
            self.seed = 42
            self.data_path = '../data_new/all.json'
            self.val_data_path = '../data_new/val.json'
            self.checkpoint_dir = 'checkpoints_track4'
            self.patience = 6
            self.expert_hidden_size = 768
            self.num_rnn_layers = 2
            self.warmup_ratio = 0.1
            self.exp_name = 'BAE2025_track4_bert'
            self.weight_decay = 0.01
    return Args()


# 训练函数

In [7]:
import wandb
def train(configs):

    # 设置随机种子
    random.seed(configs.seed)
    np.random.seed(configs.seed)
    torch.manual_seed(configs.seed)
    
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
    # 创建检查点目录
    checkpoint_dir = os.path.join(configs.checkpoint_dir, configs.exp_name)
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    # 为保存混淆矩阵创建目录 - 分别为训练集和验证集创建
    train_plot_dir = os.path.join(checkpoint_dir, 'plots', 'train')
    val_plot_dir = os.path.join(checkpoint_dir, 'plots', 'val')
    os.makedirs(train_plot_dir, exist_ok=True)
    os.makedirs(val_plot_dir, exist_ok=True)
    
    # 加载数据集
    train_dataset = BAE2025Dataset(configs.data_path)
    val_dataset = BAE2025Dataset(configs.val_data_path)    

    # 创建数据加载器
    train_dataloader = BAE2025DataLoader(
        dataset=train_dataset,
        batch_size=configs.batch_size,
        max_length=configs.max_length,
        shuffle=True,
        drop_last=True,
        device=configs.device,
        tokenizer_name=configs.model_name
    )

    val_dataloader = BAE2025DataLoader(
        dataset=val_dataset,
        batch_size=configs.batch_size,
        max_length=configs.max_length,
        shuffle=False,
        drop_last=False,
        device=configs.device,
        tokenizer_name=configs.model_name
    )
    
    # 创建模型
    model = DeBERTaMoEClassifier(
        pretrained_model_name=configs.model_name,
        num_classes=configs.num_classes,
        freeze_pooler=configs.freeze_pooler,
        num_rnn_layers=configs.num_rnn_layers,
        expert_hidden_size=configs.expert_hidden_size,
        dropout=configs.dropout
    ).to(configs.device)

    criterion = nn.CrossEntropyLoss()

    # 定义优化器
    optimizer = AdamW(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=configs.lr,
        weight_decay=configs.weight_decay
    )
    
    
    # 初始化最佳验证损失和早停计数器
    best_val_acc = 0.0
    best_val_f1 = 0.0  # 添加F1分数作为评估指标
    best_val_loss = float('inf')
    patience_counter = 0
    

    # 初始化最佳指标记录
    best_metrics = {
        'epoch': 0,
        'val_f1': 0.0,
        'val_acc': 0.0,
        'val_loss': float('inf')
    }

    # 定义类别名称
    class_names = ['Yes', 'To some extent', 'No']
    
    # 添加F1计算所需的库
    from sklearn.metrics import f1_score, confusion_matrix
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    # 训练循环
    for epoch in range(configs.epochs):
        # 训练阶段
        model.train()
        train_loss = 0.0
        train_acc = 0.0
        train_preds = []
        train_labels_list = []
        
        with tqdm(
            train_dataloader,
            total=len(train_dataloader),
            desc=f'Epoch {epoch + 1}/{configs.epochs}',
            unit='batch',
            ncols=100
        ) as pbar:
            for input_ids, attention_mask, labels in pbar:
                optimizer.zero_grad()
                
                # 前向传播
                logits = model(input_ids, attention_mask)
                
                # 计算损失 - 确保labels是长整型
                labels = labels.long()
                loss = criterion(logits, labels)
                # 反向传播
                loss.backward()

                optimizer.step()
                # scheduler.step()  # 更新学习率 - 添加此行
                
                preds = logits.argmax(dim=1)
                accuracy = (preds == labels).float().mean()
                accuracy_all = (preds == labels).float().sum()
                
                # 收集预测结果和真实标签，用于计算F1
                train_preds.extend(preds.cpu().numpy())
                train_labels_list.extend(labels.cpu().numpy())
                
                train_loss += loss.item()
                train_acc += accuracy_all.item()
                
                # 添加当前学习率到进度条
                # curr_lr = scheduler.get_last_lr()[0]
                pbar.set_postfix(
                    loss=f'{loss.item():.3f}',
                    accuracy=f'{accuracy.item():.3f}',
                    # lr=f'{curr_lr:.6f}'  # 显示当前学习率
                )
        
        train_loss = train_loss / len(train_dataloader)
        train_acc = train_acc / len(train_dataset)
        
        # 计算训练集的F1分数 - 使用macro平均以处理多分类
        train_f1 = f1_score(train_labels_list, train_preds, average='macro')
        
        print(f'Training Loss: {train_loss:.4f}')
        print(f'Training Accuracy: {train_acc:.4f}')
        print(f'Training F1 Score: {train_f1:.4f}')
        
        
        
        # # 验证阶段
        # model.eval()
        # val_loss = 0.0
        # val_corrects = 0.0
        # val_preds = []
        # val_labels_list = []

        # with torch.no_grad():
        #     for input_ids, attention_mask, labels in val_dataloader:
        #         # 确保labels是长整型
        #         labels = labels.long()
                
        #         # 前向传播
        #         logits = model(input_ids, attention_mask)
                
        #         loss = criterion(logits, labels)
        #         val_loss += loss.item()
        #         preds = logits.argmax(dim=1)
        #         accuracy = (preds == labels).float().sum()
        #         val_corrects += accuracy
                
        #         # 收集预测结果和真实标签，用于计算F1和混淆矩阵
        #         val_preds.extend(preds.cpu().numpy())
        #         val_labels_list.extend(labels.cpu().numpy())
        
        # val_loss = val_loss / len(val_dataloader)
        # val_acc = val_corrects.double() / len(val_dataset)
        
        # # 计算验证集的F1分数
        # val_f1 = f1_score(val_labels_list, val_preds, average='macro')
        
        # print('Validation Loss: {:.4f} Acc: {:.4f} F1: {:.4f}'.format(val_loss, val_acc, val_f1))

        
        
        # # 检查是否保存模型并判断是否需要早停
        # # 使用F1分数作为主要指标
        # if val_f1 > best_val_f1:
        #     best_val_f1 = val_f1
        #     best_val_acc = val_acc
            
        #     # 保存模型
        #     # state_dict = model.state_dict()
        #     # torch.save(state_dict, os.path.join(checkpoint_dir, 'best_model_f1.pt'))
        #     print(f'New best model saved with F1: {best_val_f1:.4f}, Acc: {best_val_acc:.4f}')
            
        #     patience_counter = 0
        # else:
        #     patience_counter += 1
        #     if patience_counter >= configs.patience:
        #         print(f'Early stopping triggered after {epoch+1} epochs.')
        #         break


        if epoch >= 12  and epoch <= 22:
            # 保存模型
            state_dict = model.state_dict()
            torch.save(state_dict, os.path.join("/mnt/cfs/huangzhiwei/BAE2025/projects/predict/deberta_best", f'best_model_f1_{epoch+1}.pt'))
        
        if epoch > 22:
            break
        # model.train()


        
# 在以下主函数中添加判断Jupyter环境的逻辑
if __name__ == '__main__':
    # 判断是否在Jupyter环境中运行
    try:
        # 检查是否在Jupyter中运行
        get_ipython = globals().get('get_ipython', None)
        if get_ipython and 'IPKernelApp' in get_ipython().config:
            # 在Jupyter环境中运行，使用默认配置
            print("Running in Jupyter environment, using default configs")
            configs = get_default_configs()
        else:
            # 在命令行环境中运行，使用argparse
            configs = argparser()
    except:
        # 任何异常都使用argparse处理
        configs = argparser()
    
    # 设置实验名称
    if configs.name is None:
        configs.exp_name = \
            f'{os.path.basename(configs.model_name)}' + \
            f'{"_fp" if configs.freeze_pooler else ""}' + \
            f'_b{configs.batch_size}_e{configs.epochs}' + \
            f'_len{configs.max_length}_lr{configs.lr}'
    else:
        configs.exp_name = configs.name
    
    # 设置设备
    if configs.device is None:
        configs.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu'
        )
    
    # 调用训练函数
    train(configs)

Running in Jupyter environment, using default configs




当前使用的 tokenizer 类型： <class 'transformers.models.deberta_v2.tokenization_deberta_v2_fast.DebertaV2TokenizerFast'>
当前使用的 tokenizer 类型： <class 'transformers.models.deberta_v2.tokenization_deberta_v2_fast.DebertaV2TokenizerFast'>


Epoch 1/50: 100%|██████████████████| 309/309 [01:30<00:00,  3.43batch/s, accuracy=0.500, loss=1.050]


Training Loss: 1.0309
Training Accuracy: 0.5190
Training F1 Score: 0.2877


Epoch 2/50: 100%|██████████████████| 309/309 [01:29<00:00,  3.45batch/s, accuracy=0.750, loss=0.782]


Training Loss: 1.0256
Training Accuracy: 0.5170
Training F1 Score: 0.2543


Epoch 3/50: 100%|██████████████████| 309/309 [01:29<00:00,  3.45batch/s, accuracy=0.500, loss=0.940]


Training Loss: 1.0105
Training Accuracy: 0.5299
Training F1 Score: 0.2551


Epoch 4/50: 100%|██████████████████| 309/309 [01:29<00:00,  3.45batch/s, accuracy=0.500, loss=1.082]


Training Loss: 1.0075
Training Accuracy: 0.5291
Training F1 Score: 0.2490


Epoch 5/50: 100%|██████████████████| 309/309 [01:29<00:00,  3.45batch/s, accuracy=0.500, loss=1.025]


Training Loss: 1.0024
Training Accuracy: 0.5319
Training F1 Score: 0.2587


Epoch 6/50: 100%|██████████████████| 309/309 [01:29<00:00,  3.45batch/s, accuracy=0.375, loss=1.167]


Training Loss: 0.9953
Training Accuracy: 0.5327
Training F1 Score: 0.2713


Epoch 7/50: 100%|██████████████████| 309/309 [01:29<00:00,  3.45batch/s, accuracy=0.375, loss=1.153]


Training Loss: 0.9944
Training Accuracy: 0.5279
Training F1 Score: 0.2721


Epoch 8/50: 100%|██████████████████| 309/309 [01:29<00:00,  3.45batch/s, accuracy=0.625, loss=1.002]


Training Loss: 0.9934
Training Accuracy: 0.5234
Training F1 Score: 0.2704


Epoch 9/50: 100%|██████████████████| 309/309 [01:29<00:00,  3.45batch/s, accuracy=0.625, loss=0.868]


Training Loss: 0.9919
Training Accuracy: 0.5473
Training F1 Score: 0.3008


Epoch 10/50: 100%|█████████████████| 309/309 [01:29<00:00,  3.45batch/s, accuracy=0.750, loss=0.892]


Training Loss: 0.9913
Training Accuracy: 0.5392
Training F1 Score: 0.3115


Epoch 11/50: 100%|█████████████████| 309/309 [01:29<00:00,  3.46batch/s, accuracy=0.250, loss=1.243]


Training Loss: 0.9947
Training Accuracy: 0.5331
Training F1 Score: 0.2983


Epoch 12/50: 100%|█████████████████| 309/309 [01:29<00:00,  3.46batch/s, accuracy=0.375, loss=1.103]


Training Loss: 0.9868
Training Accuracy: 0.5254
Training F1 Score: 0.2619


Epoch 13/50: 100%|█████████████████| 309/309 [01:29<00:00,  3.46batch/s, accuracy=0.500, loss=0.997]


Training Loss: 0.9867
Training Accuracy: 0.5339
Training F1 Score: 0.2929


Epoch 14/50: 100%|█████████████████| 309/309 [01:29<00:00,  3.46batch/s, accuracy=0.250, loss=1.229]


Training Loss: 0.9859
Training Accuracy: 0.5315
Training F1 Score: 0.2510


Epoch 15/50: 100%|█████████████████| 309/309 [01:29<00:00,  3.46batch/s, accuracy=0.500, loss=1.031]


Training Loss: 0.9886
Training Accuracy: 0.5291
Training F1 Score: 0.2460


Epoch 16/50: 100%|█████████████████| 309/309 [01:29<00:00,  3.45batch/s, accuracy=0.750, loss=0.849]


Training Loss: 0.9843
Training Accuracy: 0.5299
Training F1 Score: 0.2440


Epoch 17/50: 100%|█████████████████| 309/309 [01:29<00:00,  3.46batch/s, accuracy=0.625, loss=1.019]


Training Loss: 0.9816
Training Accuracy: 0.5311
Training F1 Score: 0.2481


Epoch 18/50: 100%|█████████████████| 309/309 [01:29<00:00,  3.45batch/s, accuracy=0.125, loss=1.311]


Training Loss: 0.9804
Training Accuracy: 0.5315
Training F1 Score: 0.2517


Epoch 19/50: 100%|█████████████████| 309/309 [01:29<00:00,  3.45batch/s, accuracy=0.625, loss=0.898]


Training Loss: 0.9823
Training Accuracy: 0.5299
Training F1 Score: 0.2484


Epoch 20/50: 100%|█████████████████| 309/309 [01:29<00:00,  3.46batch/s, accuracy=0.625, loss=0.964]


Training Loss: 0.9827
Training Accuracy: 0.5303
Training F1 Score: 0.2471


Epoch 21/50: 100%|█████████████████| 309/309 [01:29<00:00,  3.46batch/s, accuracy=0.500, loss=0.900]


Training Loss: 0.9789
Training Accuracy: 0.5315
Training F1 Score: 0.2474


Epoch 22/50: 100%|█████████████████| 309/309 [01:29<00:00,  3.45batch/s, accuracy=0.500, loss=1.123]


Training Loss: 0.9831
Training Accuracy: 0.5311
Training F1 Score: 0.2495


Epoch 23/50: 100%|█████████████████| 309/309 [01:29<00:00,  3.46batch/s, accuracy=0.500, loss=0.976]


Training Loss: 0.9820
Training Accuracy: 0.5311
Training F1 Score: 0.2488


Epoch 24/50: 100%|█████████████████| 309/309 [01:29<00:00,  3.46batch/s, accuracy=0.750, loss=0.807]

Training Loss: 0.9844
Training Accuracy: 0.5327
Training F1 Score: 0.2535



