In [1]:
import pandas as pd
# Location of the combined short-description dataset, relative to this notebook.
data_path = "../../data/open_source_8454_combine_short_description.csv"
# The first CSV column appears to be a row index written by an earlier to_csv
# call; reading it as the index keeps it from showing up as a stray
# "Unnamed: 0" data column.
df = pd.read_csv(data_path, index_col=0)
# Optional down-sampling of 3000 rows with Label == 0 (currently disabled).
# df_label_0 = df[df["Label"] == 0]
# df_label_0_to_drop = df_label_0.sample(n=3000, random_state=42)
# df = df.drop(df_label_0_to_drop.index)
df

Unnamed: 0,Inbound Message,Label
0,login issue verified user details employee man...,0
1,outlook hello team meetings skype meetings etc...,0
2,cant log vpn cannot log vpn best,0
3,unable access tool page unable access tool page,0
4,skype error skype error,0
...,...,...
8449,emails coming mail good afternoon receiving em...,22
8450,telephony software issue telephony software issue,0
8451,vip windows password reset tifpdchb pedxruyf v...,0
8452,machine est funcionando unable access machine ...,44


In [2]:
import sys
import os
# Make the project's helper modules importable. The relative path is resolved
# against the kernel's current working directory at the time this cell runs.
sys.path.append(os.path.abspath('../ml_lib'))

# NOTE(review): the star import hides where names come from. Constants used in
# later cells (KFOLD_SPLIT, SEED, EPOCHS, PATIENCE, MAX_LENGTH, BATCH_SIZE, LR,
# WEIGHT_DECAY) are presumably defined in config — confirm and import them
# explicitly.
from config import *
from dataset import CustomDataset
from train_eval import train_epoch, eval_model
from device import get_device_info

In [3]:
from sklearn.model_selection import StratifiedKFold

# Resolve the compute device once; run_kfold_experiment reads this global.
device = get_device_info()

# Pull the text column and the target column out as plain Python lists.
messages = df['Inbound Message'].tolist()
labels = df['Label'].tolist()

# Stratified K-fold splitter initialisation.
skf = StratifiedKFold(n_splits=KFOLD_SPLIT, shuffle=True, random_state=SEED)

Using device: cuda
CUDA Available: True
CUDA Device: NVIDIA GeForce RTX 3050 6GB Laptop GPU
Current GPU Memory Allocated: 0.00 GB
Current GPU Memory Cached: 0.00 GB


In [4]:
from transformers import BertTokenizer, RobertaTokenizer, DebertaTokenizer, AutoModelForSequenceClassification

def get_tokenizer(model_name):
    """Return the pretrained tokenizer that matches ``model_name``.

    The tokenizer family is chosen by case-insensitive substring match on the
    checkpoint name.

    Args:
        model_name: Checkpoint identifier, e.g. ``"roberta-base"``.

    Returns:
        The object produced by the matching tokenizer class's
        ``from_pretrained(model_name)``.

    Raises:
        ValueError: If the name matches none of the supported families.
    """
    name = model_name.lower()
    # Order matters: "roberta" and "deberta" both contain the substring "bert",
    # so the more specific families must be tested before plain BERT.
    if "deberta" in name:
        return DebertaTokenizer.from_pretrained(model_name)
    if "roberta" in name:
        return RobertaTokenizer.from_pretrained(model_name)
    if "bert" in name:
        return BertTokenizer.from_pretrained(model_name)
    raise ValueError("Unsupported model tokenizer.")

def get_model(model_name, hybrid=None, num_labels=2):
    """Build the classifier for ``model_name``, optionally with a hybrid head.

    Args:
        model_name: Checkpoint identifier handed to the model constructor.
        hybrid: ``None`` for the plain sequence-classification model, or one
            of ``"lstm"``, ``"bilstm"``, ``"cnn"`` for a hybrid classifier.
        num_labels: Number of output classes.

    Returns:
        The constructed model object.

    Raises:
        ValueError: If ``hybrid`` is not one of the supported values.
    """
    if hybrid is None:
        return AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=num_labels
        )

    # Builders are wrapped in lambdas so a classifier class is only looked up
    # when its variant is actually requested.
    hybrid_builders = (
        ("lstm", lambda: BertLSTMClassifier(model_name, num_labels)),
        ("bilstm", lambda: BertBiLSTMClassifier(model_name, num_labels)),
        ("cnn", lambda: BertCNNClassifier(model_name, num_labels)),
    )
    for tag, build in hybrid_builders:
        if hybrid == tag:
            return build()

    raise ValueError("Unsupported hybrid type.")

def apply_resampling(X, y, method="none", random_state=None):
    """Rebalance a training split with the requested strategy.

    Args:
        X: Training samples. May be a flat sequence (for example a list of
            raw texts) or a 2-D feature matrix.
        y: Labels aligned with ``X``.
        method: ``"none"``, ``"ros"``, ``"smote"`` or ``"textgan"``.
        random_state: Optional seed forwarded to the samplers so that
            resampling can be made reproducible. Defaults to ``None``, which
            leaves the samplers unseeded.

    Returns:
        A ``(X_resampled, y_resampled)`` pair. For ``"none"`` the inputs are
        returned unchanged. For ``"ros"`` with flat input, both elements are
        plain lists.

    Raises:
        ValueError: If ``method`` is not a known strategy.
    """
    if method == "none":
        return X, y

    if method == "ros":
        # Imported locally so this helper works regardless of which cells
        # have already run.
        import numpy as np

        # fit_resample expects a 2-D feature matrix, so a flat list of texts
        # is wrapped into a single object column and unwrapped afterwards.
        flat_input = np.ndim(X) == 1
        if flat_input:
            features = np.asarray(X, dtype=object).reshape(-1, 1)
        else:
            features = X
        sampler = RandomOverSampler(random_state=random_state)
        X_res, y_res = sampler.fit_resample(features, y)
        if flat_input:
            return [row[0] for row in X_res], list(y_res)
        return X_res, y_res

    if method == "smote":
        # NOTE(review): SMOTE interpolates between samples, so X presumably
        # has to be a numeric feature matrix here; raw text will not work.
        return SMOTE(random_state=random_state).fit_resample(X, y)

    if method == "textgan":
        # Assumes a custom textgan_augment function is available.
        return textgan_augment(X, y)

    raise ValueError("Unknown resampling method")

In [5]:
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
import numpy as np
import time

def run_kfold_experiment(
    X, y, model_name, hybrid_type, resample_method, tokenizer,
    kfold=KFOLD_SPLIT, patience=PATIENCE, max_length=MAX_LENGTH,
    batch_size=BATCH_SIZE, lr=LR, weight_decay=WEIGHT_DECAY
):
    """Train and evaluate one model configuration with stratified K-fold CV.

    For each fold a fresh model is built, trained for up to ``EPOCHS`` epochs
    with early stopping on the validation macro F1, and the metrics of the
    best epoch are recorded. Resampling is applied to the training split only.
    The target device is read from the module-level ``device`` variable.

    Args:
        X: Sequence of input samples, indexable by integer position.
        y: Sequence of labels aligned with ``X``.
        model_name: Checkpoint identifier passed to ``get_model``.
        hybrid_type: Hybrid head selector passed to ``get_model``.
        resample_method: Strategy name passed to ``apply_resampling``.
        tokenizer: Tokenizer handed to ``CustomDataset``.
        kfold: Number of stratified folds.
        patience: Consecutive epochs without macro-F1 improvement that are
            tolerated before training stops early.
        max_length: Maximum sequence length handed to ``CustomDataset``.
        batch_size: Batch size for both data loaders.
        lr: Learning rate for AdamW.
        weight_decay: Weight decay for AdamW.

    Returns:
        A list with one entry per fold: the metrics object returned by
        ``eval_model`` for the epoch with the highest ``"macro_f1-score"``.
    """
    skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=SEED)
    # The label set is identical for every fold, so count it once up front.
    # NOTE(review): assumes labels are the contiguous integers
    # 0..num_labels-1 — confirm against the dataset / CustomDataset.
    num_labels = len(set(y))
    all_fold_results = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"\n[Fold {fold + 1}]")
        X_train = [X[i] for i in train_idx]
        y_train = [y[i] for i in train_idx]
        X_val = [X[i] for i in val_idx]
        y_val = [y[i] for i in val_idx]

        # Only the training split is rebalanced; validation keeps the
        # original class distribution.
        X_train, y_train = apply_resampling(X_train, y_train, method=resample_method)

        train_dataset = CustomDataset(X_train, y_train, tokenizer, max_length)
        val_dataset = CustomDataset(X_val, y_val, tokenizer, max_length)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        model = get_model(model_name, hybrid_type, num_labels=num_labels)
        model.to(device)

        optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        criterion = CrossEntropyLoss()

        epoch_results = []
        best_macro_f1 = 0
        epochs_without_improvement = 0

        for epoch in range(EPOCHS):
            print(f"Epoch {epoch + 1}/{EPOCHS}")
            train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
            # Report the training loss as well so it can be compared with the
            # validation metrics printed below.
            print(f"train_loss: {train_loss}")
            metrics = eval_model(model, val_loader, criterion, device, num_labels)
            print(metrics)
            epoch_results.append(metrics)

            current_macro_f1 = metrics["macro_f1-score"]
            if current_macro_f1 > best_macro_f1:
                best_macro_f1 = current_macro_f1
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1

            if epochs_without_improvement >= patience:
                print("Early stopping.")
                break

        best_epoch_metrics = max(epoch_results, key=lambda x: x["macro_f1-score"])
        all_fold_results.append(best_epoch_metrics)

        # Drop this fold's model and optimizer before the next fold builds its
        # own, so two models are not held alive at the same time.
        del model, optimizer

    return all_fold_results


In [6]:
# Experiment grid: every backbone is combined with every hybrid head and every
# resampling strategy.
model_list = ["bert-base-uncased", "roberta-base", "microsoft/deberta-base"]
hybrid_list = [None, "cnn", "lstm", "bilstm"]
resample_list = ["none", "ros", "smote", "textgan"]

# Per-fold metrics of every configuration, keyed by
# (model name, hybrid type or "plain", resampling method), so finished runs
# remain available for comparison after the loop moves on.
all_results = {}

for model_name in model_list:
    # The tokenizer depends only on the backbone, so build it once per model.
    tokenizer = get_tokenizer(model_name)
    for hybrid in hybrid_list:
        for resample in resample_list:
            print(f"▶ Running: {model_name} + {hybrid or 'plain'} + {resample}")
            results = run_kfold_experiment(
                messages, labels, model_name, hybrid, resample, tokenizer
            )
            # Save the results of this configuration.
            all_results[(model_name, hybrid or "plain", resample)] = results


▶ Running: bert-base-uncased + plain + none

[Fold 1]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20


100%|██████████| 476/476 [03:30<00:00,  2.26it/s]


{'val_loss': 1.5768805764756113, 'val_accuracy': 0.6205673758865248, 'precision': 0.4982209971912229, 'recall': 0.6205673758865248, 'f1-score': 0.50601774679372, 'macro_f1-score': 0.10698931571681669, 'balanced_accuracy': 0.11806378673375514, 'mcc': 0.4543086361958166}
Epoch 2/20


  5%|▌         | 25/476 [00:11<03:23,  2.22it/s]


KeyboardInterrupt: 

In [None]:
# NOTE(review): `epoch_results` is assigned only as a local variable inside
# run_kfold_experiment, so on a fresh kernel this cell raises NameError unless
# another cell defines it — confirm, or inspect the returned `results` instead.
epoch_results