In [1]:
import pandas as pd
# Path (relative to this notebook) of the CSV that provides the
# 'Inbound Message' and 'Label' columns used by the cells below.
data_path = "../../data/open_source_8454_combine_short_description.csv"
df = pd.read_csv(data_path)
# Disabled experiment: randomly drop 3000 rows whose Label is 0 (fixed
# random_state=42). It is commented out, so the full dataset is used.
# df_label_0 = df[df["Label"] == 0]
# df_label_0_to_drop = df_label_0.sample(n=3000, random_state=42)
# df = df.drop(df_label_0_to_drop.index)
# Display the loaded frame as the cell output.
# NOTE(review): the output shows an 'Unnamed: 0' column, which looks like an
# index that was written into the CSV -- confirm whether index_col=0 is wanted.
df

Unnamed: 0,Inbound Message,Label
1,outlook hello team meetings skype meetings etc...,0
4,skype error skype error,0
6,event critical hostname company com value moun...,1
10,engineering tool says connected unable submit ...,0
12,unable login tool sgxqsuojr xwbesorf cards una...,0
...,...,...
8447,erp two accounts added sorry another two accou...,2
8448,tablet needs reimaged due multiple issues crm ...,23
8449,emails coming mail good afternoon receiving em...,22
8452,machine est funcionando unable access machine ...,44


In [2]:
import sys
import os

# Directory that holds the project-local helper modules imported below.
ml_lib_dir = os.path.abspath('../ml_lib')
# Register the directory only once: notebook cells are routinely re-run, and an
# unconditional append would add a duplicate sys.path entry on every run.
if ml_lib_dir not in sys.path:
    sys.path.append(ml_lib_dir)

# NOTE(review): the star import hides where names come from. The cells in this
# notebook use KFOLD_SPLIT, SEED, MAX_LENGTH, BATCH_SIZE, LR, WEIGHT_DECAY,
# EPOCHS and PATIENCE, which presumably all come from `config` -- consider
# importing them explicitly once that is confirmed.
from config import *
from dataset import CustomDataset
from train_eval import train_epoch, eval_model
from device import get_device_info

In [3]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import StratifiedKFold

# Ask the project helper for the compute device; the returned value is what
# the training cell passes to model.to(...).
device = get_device_info()

# Pull the text and target columns out of the frame as plain Python lists.
messages = df['Inbound Message'].tolist()
labels = df['Label'].tolist()

# Tokenizer matching the pretrained checkpoint that is fine-tuned below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Initialise the stratified K-fold splitter (shuffled, seeded from SEED).
skf = StratifiedKFold(
    n_splits=KFOLD_SPLIT,
    random_state=SEED,
    shuffle=True,
)

Using device: cuda
CUDA Available: True
CUDA Device: NVIDIA GeForce RTX 3050 6GB Laptop GPU
Current GPU Memory Allocated: 0.00 GB
Current GPU Memory Cached: 0.00 GB


In [4]:
from collections import Counter
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
import numpy as np
import time

# Cross-validated fine-tuning of BERT: every fold trains a freshly loaded
# model with early stopping on the validation macro F1, and the metrics of the
# fold's best epoch are kept for the final average.
# NOTE: everything deliberately stays at notebook top level -- `epoch_results`
# is inspected from a later cell, so it must remain a global.

# Best-epoch validation metrics, one dict per completed fold.
all_fold_results = []

# The folds partition the full dataset, so the union of a fold's training and
# validation labels is always the full label set: count the classes once
# instead of once per fold.
# NOTE(review): this is used as the size of the classification head, which
# presumably requires label ids to be exactly 0..NUM_LABELS-1 -- confirm.
NUM_LABELS = len(set(labels))

start_time = time.time()

for fold, (train_idx, val_idx) in enumerate(skf.split(messages, labels)):
    print(f"Fold {fold + 1}")

    train_messages = [messages[i] for i in train_idx]
    train_labels = [labels[i] for i in train_idx]
    val_messages = [messages[i] for i in val_idx]
    val_labels = [labels[i] for i in val_idx]

    # Counter tallies every class in a single pass (list.count per class is
    # quadratic-ish); sorting keeps the printed order deterministic.
    print("Training set label distribution:", dict(sorted(Counter(train_labels).items())))
    print("Validation set label distribution:", dict(sorted(Counter(val_labels).items())))

    train_dataset = CustomDataset(train_messages, train_labels, tokenizer, MAX_LENGTH)
    val_dataset = CustomDataset(val_messages, val_labels, tokenizer, MAX_LENGTH)
    # Only the training batches are shuffled; validation order is irrelevant.
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Start every fold from the pretrained checkpoint so folds stay independent.
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=NUM_LABELS)
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    criterion = CrossEntropyLoss()

    # Validation metrics of every epoch run in this fold (reset per fold).
    epoch_results = []
    best_macro_f1 = 0
    epochs_without_improvement = 0

    for epoch in range(EPOCHS):
        print(f"Epoch {epoch + 1}/{EPOCHS}")
        train_loss = train_epoch(model, train_dataloader, optimizer, criterion, device)
        metrics = eval_model(model, val_dataloader, criterion, device, num_labels=NUM_LABELS)
        epoch_results.append(metrics)
        print(metrics)

        current_macro_f1 = metrics["macro_f1-score"]

        if current_macro_f1 > best_macro_f1:
            best_macro_f1 = current_macro_f1  # new best macro F1 for this fold
            epochs_without_improvement = 0    # reset the patience counter
        else:
            epochs_without_improvement += 1   # one more epoch without improvement

        # Early stopping: give up on this fold once PATIENCE consecutive
        # epochs failed to improve the validation macro F1.
        if epochs_without_improvement >= PATIENCE:
            print(f"Early stopping triggered at epoch {epoch + 1}")
            break

    # A fold is represented by its best epoch (highest macro F1), not by the
    # average over all of its epochs.
    best_epoch_metrics = max(epoch_results, key=lambda x: x["macro_f1-score"])
    all_fold_results.append(best_epoch_metrics)

end_time = time.time()
print(f"Total time: {end_time - start_time:.2f}s")

# Average every metric over the folds' best epochs.
final_metrics = {metric: np.mean([result[metric] for result in all_fold_results]) for metric in all_fold_results[0].keys()}
# Report the number of folds actually aggregated instead of a hard-coded 10,
# so the header stays correct when the configured fold count changes.
print(f"Final {len(all_fold_results)}-fold Cross-Validation Results:")
for metric, value in final_metrics.items():
    print(f"{metric}: {value}")



Fold 1
Training set label distribution: {0: 876, 1: 28, 2: 126, 3: 27, 4: 231, 5: 130, 6: 107, 7: 35, 8: 76, 9: 73, 10: 79, 11: 194, 12: 217, 13: 32, 14: 26, 15: 28, 16: 23, 17: 260, 18: 104, 19: 51, 20: 16, 21: 39, 22: 88, 23: 180, 24: 18, 25: 56, 26: 97, 27: 55, 28: 13, 29: 15, 30: 17, 31: 90, 32: 40, 33: 36, 34: 34, 35: 13, 36: 32, 37: 24, 38: 7, 39: 116, 40: 13, 41: 9, 42: 166, 43: 18, 44: 22, 45: 10, 46: 61, 47: 595, 48: 227, 49: 78}
Validation set label distribution: {0: 98, 1: 3, 2: 14, 3: 3, 4: 26, 5: 15, 6: 11, 7: 4, 8: 9, 9: 8, 10: 9, 11: 21, 12: 24, 13: 4, 14: 3, 15: 3, 16: 2, 17: 29, 18: 12, 19: 5, 20: 2, 21: 5, 22: 9, 23: 20, 24: 2, 25: 7, 26: 10, 27: 6, 28: 2, 29: 1, 30: 2, 31: 10, 32: 5, 33: 4, 34: 3, 35: 2, 36: 3, 37: 3, 39: 13, 40: 1, 41: 2, 42: 18, 43: 2, 44: 3, 45: 1, 46: 7, 47: 66, 48: 25, 49: 9}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20


100%|██████████| 307/307 [02:10<00:00,  2.36it/s]


{'val_loss': 2.1378087520599367, 'val_accuracy': 0.41208791208791207, 'precision': 0.28925255435049985, 'recall': 0.41208791208791207, 'f1-score': 0.29951824907403585, 'macro_f1-score': 0.10259991974422954, 'balanced_accuracy': 0.12564264092865682, 'mcc': 0.3563669747549857}
Epoch 2/20


 26%|██▌       | 79/307 [00:33<01:37,  2.35it/s]


KeyboardInterrupt: 

In [5]:
# Show the per-epoch validation metrics collected for the fold that ran last
# (the list is re-created at the start of every fold in the training cell).
epoch_results

[{'val_loss': 2.1378087520599367,
  'val_accuracy': 0.41208791208791207,
  'precision': 0.28925255435049985,
  'recall': 0.41208791208791207,
  'f1-score': 0.29951824907403585,
  'macro_f1-score': 0.10259991974422954,
  'balanced_accuracy': 0.12564264092865682,
  'mcc': 0.3563669747549857}]