# 掛載 Google Drive

In [8]:
!pip install google.colab



In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# 觀看系統設定

In [3]:
!lsb_release -a

No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 22.04.4 LTS
Release:	22.04
Codename:	jammy


In [4]:
!nvidia-smi

Fri Feb 14 09:21:36 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   37C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
!nvcc -V

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [6]:
# 切換目錄 (Colab 預設目錄為 /content，使用 %cd 切換目錄)
%cd /content/drive/My Drive/Colab Notebooks/

/content/drive/My Drive/Colab Notebooks


# 微調模型

In [7]:
# 安裝套件
!pip install torch torchvision torchaudio transformers datasets evaluate accelerate scikit-learn

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia

In [9]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.device(0))

True
0
<torch.cuda.device object at 0x7affa8992e90>


In [10]:
'''
AutoTokenizer：這有助於將我們的文字資料標記為 BERT 可以理解的格式。 「Auto」前綴意味著它可以為各種模型推斷適當的分詞器。
AutoModelForSequenceClassification：一個通用的類別，是用於「序列分類」任務的模型架構。「Auto」前綴使其在各種預訓練模型中具有通用性。
TrainingArguments：定義訓練配置的設定，例如 learning rateb、batch size 和 epoch。
Trainer：用於訓練和評估，使 finetune 變得簡單。
pipeline：使用模型的模型。
DataCollat​​eWithPadding：確保我們分詞化後的資料，以一致的長度串接在一起，並在必要時增加 padding。這對於訓練的穩定性和效率至關重要。
'''
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

import random
from sklearn.metrics import f1_score

# 轉換標籤
label_map = {
    "平淡語氣": 0,
    "開心語調": 1,
    "關切語調": 2,
    "憤怒語調": 3,
    "驚奇語調": 4,
    "悲傷語調": 5,
    "厭惡語調": 6,
    "疑問語調": 7
}

'''
函式
'''
def load_dataset_from_hub(seed=42):
    # 從 Hugging Face 讀取資料集
    dataset = load_dataset("Johnson8187/Chinese_Multi-Emotion_Dialogue_Dataset")

    # 整合訓練資料
    train_data = dataset["train"]
    sentences = train_data["text"]
    labels = [label_map[label] for label in train_data["emotion"]]

    return sentences, labels

# 轉換成 huggingface trainer 可以使用的 datasets
def convert_to_dataset(sentences, labels, tokenizer, max_seq_length):
    # 建立 Dataset
    dataset = Dataset.from_dict({
        'sentences': sentences,
        'labels': labels
    })

    # 回傳切分資料 (訓練 和 驗證)
    dataset = dataset.train_test_split(test_size=0.2)

    # 預處理資料
    def preprocess_data(dataset):
        # 將句子轉換為 token (tokenization)
        return tokenizer(
            dataset['sentences'],
            truncation=True,  # 超過512截斷
            padding=True,     # 不足512補足512
            return_tensors='pt',
            max_length=max_seq_length
        )

    # 轉換資料
    train_data = dataset['train'].map(preprocess_data, batched=True)
    valid_data = dataset['test'].map(preprocess_data, batched=True)

    return DatasetDict({
        'train': train_data,
        'test': valid_data
    })

# 計算模型評估指標
def compute_metrics(predicted_results):
    labels = predicted_results.label_ids
    preds = predicted_results.predictions.argmax(-1)

    f1 = f1_score(labels, preds, average='macro') # binary, micro, macro, weighted
    return {
        'f1': f1,
    }

In [11]:
from sklearn.metrics import f1_score

# 驗證 F1 score 算法
y_true = [0,4,1,2,5,3,6]
y_pred = [0,4,8,7,5,4,6]
print(f1_score(y_true, y_pred, average='macro')) # binary, micro, macro

# 參考: https://blog.csdn.net/qq_40671063/article/details/130447922

0.4074074074074074


In [14]:
# 主程式 - 微調模型
if __name__ == "__main__":
    '''
    設定 hyperparameters
    '''
    model_name = 'google-bert/bert-base-chinese' # 預訓練模型名稱
    max_seq_length = 512 # 可訓練的序列最大長度
    num_labels = 8 # 8元分類
    output_dir = './output' # 輸出模型資料夾

    # 讀取訓練資料
    sentences, labels = load_dataset_from_hub()

    # 載入 tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # 將資料轉換為 huggingface 可以使用的格式
    dataset = convert_to_dataset(
        sentences,
        labels,
        tokenizer,
        max_seq_length
    )

    # 讀取模型
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    # 設定訓練參數
    training_args = TrainingArguments(
        output_dir=output_dir, # 輸出資料夾
        overwrite_output_dir=True,
        num_train_epochs=5, # 訓練回合數
        per_device_train_batch_size=16, # 批次大小
        per_device_eval_batch_size=16, # 批次大小
        gradient_accumulation_steps=2, # 兩次才更新權重(32*2)
        learning_rate=2e-5, # 學習率
        warmup_ratio=0.1, # 處理10%之後才開始往學習率前進
        weight_decay=0.01,
        eval_strategy="steps", # epoch, steps, no
        eval_steps=50, # 多久評估一次 F score
        save_strategy="steps", # epoch, steps, no
        save_steps=50, # 多久存一次檔
        save_total_limit=3, # 最多存兩份
        load_best_model_at_end=True,
        seed=42, # 隨機種子
        # lr_scheduler_type="linear", # https://blog.csdn.net/muyao987/article/details/139319466
        # report_to='wandb', # https://wandb.ai/
    )

    # 設定 Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['test'], #不評估可以不加
        #data_collator=None, # DataCollatorWithPadding(tokenizer),
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics,
    )

    # 開始訓練
    trainer.train()

    # 儲存模型
    trainer.save_model(output_dir) # , safe_serialization=True 比較安全的格式

    # 儲存 tokenizer
    tokenizer.save_pretrained(output_dir)

Map:   0%|          | 0/3327 [00:00<?, ? examples/s]

Map:   0%|          | 0/832 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,F1
50,No log,1.372567,0.662704
100,No log,0.671555,0.80536
150,No log,0.510929,0.837772
200,No log,0.429138,0.862074
250,No log,0.453524,0.851509
300,No log,0.419076,0.86013
350,No log,0.38774,0.883335
400,No log,0.37855,0.884208
450,No log,0.392892,0.874923
500,0.499100,0.385173,0.884985


# 拿微調好的模型，進行預測

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline,
)
from pprint import pprint

model_dir = './output'
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)
pipe = pipeline(task='text-classification', model=model, tokenizer=tokenizer, device=0)

texts = [
    "我每天都能跟她一起上學，我好開心！",
    "最好的朋友要離開臺灣了，以後可能不容易再見面...",
    "我覺得我快不行了...",
    "剛剛收到研究所錄取的通知書！",
    "今年的冬天好像比較晚來。"
]

results = pipe(texts)

reverse_label_map = {str(v): k for k, v in label_map.items()}  # label_map 反轉

print(*map(
    lambda t_p: f"{t_p[0]} => {reverse_label_map[t_p[1]['label'].split('_')[1]]} ({t_p[1]['score']:.2f})",
    zip(texts, pipe(texts))
), sep='\n')


Device set to use cuda:0


我每天都能跟她一起上學，我好開心！ => 開心語調 (0.98)
最好的朋友要離開臺灣了，以後可能不容易再見面... => 悲傷語調 (0.98)
我覺得我快不行了... => 悲傷語調 (0.61)
剛剛收到研究所錄取的通知書！ => 開心語調 (0.95)
今年的冬天好像比較晚來。 => 平淡語氣 (0.98)
