---
### Pre-setup
---

In [1]:
import os
data_path_prefix = '/kaggle/input/dm-lab2-with-emo/'
submission_path_prefix = '/kaggle/working/'

# check if local or kaggle by checking if the path exists
if not os.path.exists(data_path_prefix):
    data_path_prefix = ''
    submission_path_prefix = ''
    print('Using local data path')
else:
    print('Using kaggle data path')
    
main_data_path = data_path_prefix + 'tweets_DM.json'
emotion_data_path = data_path_prefix + 'emotion.csv'
identification_data_path = data_path_prefix + 'data_identification.csv'

Using kaggle data path


---
### Model fitting - transformer + BERT

In [2]:
import json
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from torch.amp import autocast
from torch.cuda.amp import GradScaler
import torch.multiprocessing as mp

# 設置多處理啟動方式，解決 os.fork 警告
if __name__ == "__main__":
    mp.set_start_method('spawn', force=True)

    # 載入數據集
    train_data = pd.read_csv(data_path_prefix + 'train_emo.csv')
    test_data = pd.read_csv(data_path_prefix + 'test_emo.csv')

    # 分割數據集
    X_train, X_val, y_train, y_val = train_test_split(
        train_data['text'], train_data['emotion'], test_size=0.2, random_state=42, stratify=train_data['emotion']
    )

    # 標籤編碼
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_val = le.transform(y_val)

    # 計算類別權重
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    class_weights = torch.tensor(class_weights, dtype=torch.float)

    # ======== PyTorch Dataset 定義 ========
    class TweetDataset(Dataset):
        def __init__(self, texts, labels, tokenizer, max_len):
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
            self.max_len = max_len

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, idx):
            text = str(self.texts.iloc[idx])
            label = self.labels[idx]

            encoding = self.tokenizer(
                text,
                max_length=self.max_len,
                padding='max_length',
                truncation=True,
                return_tensors="pt",
            )

            return {
                'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'label': torch.tensor(label, dtype=torch.long)
            }

    # 初始化 Tokenizer
    print('Loading BERT tokenizer...')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    max_len = 128

    # DataLoader，使用 num_workers=0 防止多進程問題
    train_loader = DataLoader(
        TweetDataset(pd.Series(X_train), y_train, tokenizer, max_len),
        batch_size=64, shuffle=True, num_workers=0, pin_memory=True
    )
    val_loader = DataLoader(
        TweetDataset(pd.Series(X_val), y_val, tokenizer, max_len),
        batch_size=64, num_workers=0, pin_memory=True
    )
    test_loader = DataLoader(
        TweetDataset(test_data['text'], [0] * len(test_data), tokenizer, max_len),
        batch_size=64, num_workers=0, pin_memory=True
    )

    # ======== 模型與優化器 ========
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # 包裝模型以支持多 GPU
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=len(le.classes_)
    )
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs")
        model = torch.nn.DataParallel(model)

    model = model.to(device)
    criterion = torch.nn.CrossEntropyLoss(weight=class_weights.to(device))
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    scaler = GradScaler()

    # 學習率調度器
    scheduler = get_scheduler(
        "linear", optimizer=optimizer, num_warmup_steps=int(len(train_loader) * 0.1), num_training_steps=len(train_loader) * 3
    )

    # ======== 訓練與驗證函數 ========
    def train_epoch(model, data_loader, criterion, optimizer, scaler, device, scheduler):
        model.train()
        losses = []
        all_preds = []
        all_labels = []

        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            with autocast(device_type='cuda'):  # 修正過時警告
                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                loss = criterion(logits, labels)

            losses.append(loss.item())
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

        f1 = f1_score(all_labels, all_preds, average='weighted')
        return np.mean(losses), f1

    def eval_model(model, data_loader, criterion, device):
        model.eval()
        losses = []
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in data_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                with autocast(device_type='cuda'):  # 修正過時警告
                    outputs = model(input_ids, attention_mask=attention_mask)
                    logits = outputs.logits
                    loss = criterion(logits, labels)

                losses.append(loss.item())
                preds = torch.argmax(logits, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        f1 = f1_score(all_labels, all_preds, average='weighted')
        return np.mean(losses), f1
    

Loading BERT tokenizer...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 2 GPUs


  scaler = GradScaler()


In [3]:
# ======== 訓練與驗證 ========
epochs = 3
best_f1 = 0
patience = 2
early_stop_counter = 0

for epoch in range(epochs):
    train_loss, train_f1 = train_epoch(model, train_loader, criterion, optimizer, scaler, device, scheduler)
    val_loss, val_f1 = eval_model(model, val_loader, criterion, device)

    print(f'Epoch {epoch + 1}/{epochs}')
    print(f'Train Loss: {train_loss:.4f}, Train F1: {train_f1:.4f}')
    print(f'Val Loss: {val_loss:.4f}, Val F1: {val_f1:.4f}')

    if val_f1 > best_f1:
        best_f1 = val_f1
        early_stop_counter = 0
        torch.save(model.module.state_dict(), 'best_model.pth')  # 保存多 GPU 模型權重
    else:
        early_stop_counter += 1

    if early_stop_counter >= patience:
        print("Early stopping triggered.")
        break


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch 1/3
Train Loss: 1.2512, Train F1: 0.5591
Val Loss: 1.1103, Val F1: 0.6110


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch 2/3
Train Loss: 1.0140, Train F1: 0.6316
Val Loss: 1.0735, Val F1: 0.6290


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch 3/3
Train Loss: 0.8700, Train F1: 0.6639
Val Loss: 1.1117, Val F1: 0.6338


In [None]:

# ======== 測試資料預測 ========
model.module.load_state_dict(torch.load('best_model.pth'))  # 加載多 GPU 模型權重
model.eval()
test_predictions = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        test_predictions.extend(preds.cpu().numpy())

test_data['emotion'] = le.inverse_transform(test_predictions)
submission = test_data[['id', 'emotion']]
submission.to_csv(submission_path_prefix + 'submission.csv', index=False)

print('Submission saved successfully!')

  model.module.load_state_dict(torch.load('best_model.pth'))  # 加載多 GPU 模型權重
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Submission saved successfully!
