### 1. Import the training data and validation data

In [1]:
# read the train and validation data
import pandas as pd
import numpy as np

train_df = pd.read_csv('./train.tsv', sep='\t', encoding='utf_8')
val_df = pd.read_csv('./val.tsv', sep='\t', encoding='utf_8')

In [2]:
train_df.head()

Unnamed: 0,Sentences,label
0,在一些大城市禁放烟花爆竹，是政府的一项利国利民的英明决策，只可惜好景不长。,1
1,“女性朋友”，又见官员的“女性朋友”。,2
2,不想他正好在乡下，约好了和我相见，当我告诉他要借钱的事，他显得有些无奈。,2
3,还好，现在开始下雨了。,0
4,不过，在雇主与保姆“双选”这一过程中，一些保姆竟然先打听起雇主是从事啥职业的，看对自己家人有...,2


In [3]:
val_df.head()

Unnamed: 0,Sentences,label
0,“老师不会告诉别人的，这是我们的秘密哦。,2
1,当理发女孩迟疑的眼眸对着我时，颤抖的嘴唇断续的音：真的要剪掉吗？,5
2,在这个历史的节点上，传统中国与现代中国，中国与西方，天下与世界，贯通为一。,0
3,现在，我需要的是把自己最真的一面展现出来，而不是萎缩，再也不是！,7
4,小王老师就羞赧着说，麻烦也来了呢！,0


In [3]:
emotion_dict = {0: 'Happy', 1: 'Sad', 2: 'Fear', 3: 'Surprise', 4: 'Anger',
                5: 'Neutral'}
print(emotion_dict)
print(train_df['label'].value_counts())

{0: 'Happy', 1: 'Sad', 2: 'Fear', 3: 'Surprise', 4: 'Anger', 5: 'Neutral'}
2    4322
1    3546
0    2178
5    1609
4     748
3     354
Name: label, dtype: int64


### 2. Create Dataset and Dataloader for model training

In [4]:
"""
實作一個可以用來讀取訓練 / 測試集的 Dataset，這是你需要徹底了解的部分。
此 Dataset 每次將 tsv 裡的一筆成對句子轉換成 Albert 相容的格式，並回傳 3 個 tensors：
- tokens_tensor：兩個句子合併後的索引序列，包含 [CLS] 與 [SEP]
- segments_tensor：可以用來識別兩個句子界限的 binary tensor
- label_tensor：將分類標籤轉換成類別索引的 tensor, 如果是測試集則回傳 None
"""

from torch.utils.data import Dataset

class EmotionDataset(Dataset):
    # 讀取前處理後的 tsv 檔並初始化一些參數
    def __init__(self, mode, tokenizer):
        assert mode in ['train', 'val' ,'test']
        self.mode = mode
        # 大數據你會需要用 iterator=True
        self.df = pd.read_csv(mode + '.tsv', sep='\t').fillna("")
        self.len = len(self.df)
        # self.label_map = {'agreed': 0, 'disagreed': 1, 'unrelated': 2} ## not useful in this emotion df (label already numbers)
        self.tokenizer = tokenizer
    
     # 定義回傳一筆訓練 / 測試數據的函式
    # @pysnooper.snoop()
    def __getitem__(self, idx):
        sentence, label = self.df.iloc[idx, :].values
        # 將 label 也轉換成索引方便轉換成 tensor
        label_tensor = torch.tensor(label)
        
        # 建立第一個句子的 BERT tokens 並加入分隔符號 [SEP]
        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(sentence)
        word_pieces += tokens_a + ["[CLS]"]
        num_of_words = len(word_pieces)
        
        
        # 將整個 token 序列轉換成索引序列
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # 將第一句包含 [SEP] 的 token 位置設為 0，其他為 1 表示第二句
        segments_tensor = torch.tensor([0] * num_of_words, 
                                        dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len

In [5]:
import torch
from transformers import BertTokenizer

pretrained_model_name = "bert-base-chinese"
# define BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)

# create training set
trainset = EmotionDataset("train", tokenizer=tokenizer)
trainset[0]

(tensor([ 101, 1762,  671,  763, 1920, 1814, 2356, 4881, 3123, 4170, 5709, 4255,
         5001, 8024, 3221, 3124, 2424, 4638,  671, 7555, 1164, 1744, 1164, 3696,
         4638, 5739, 3209, 1104, 5032, 8024, 1372, 1377, 2667, 1962, 3250,  679,
         7270,  511,  101]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 tensor(1))

In [7]:
# Create validation set
valset = EmotionDataset("val", tokenizer=tokenizer)
valset[0]

(tensor([ 101, 3189, 3315, 3300, 6387, 1914, 4035, 4514, 1416, 8024, 6006, 6432,
         6929, 4035, 4514,  758, 5709, 1061, 7305, 8024, 5273, 5273, 5344, 5344,
         8024, 4692,  677, 1343, 1282, 1146, 4178, 7317, 8024,  852, 3796, 4035,
         4514, 1416, 4638,  782, 8024, 3187, 6389, 3221, 1920,  782, 6820, 3221,
         2207, 2111, 8024, 1316, 1139, 1936, 1765, 2128, 7474, 8024, 6432, 6929,
         3221, 1369, 5102,  741, 2421,  738, 6387, 3291, 1394, 6844,  511,  101]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 tensor(3))

In [8]:
"""
實作可以一次回傳一個 mini-batch 的 DataLoader
這個 DataLoader 吃我們上面定義的 `FakeNewsDataset`，
回傳訓練 BERT 時會需要的 4 個 tensors：
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# 這個函式的輸入 `samples` 是一個 list，裡頭的每個 element 都是
# 剛剛定義的 `FakeNewsDataset` 回傳的一個樣本，每個樣本都包含 3 tensors：
# - tokens_tensor
# - segments_tensor
# - label_tensor
# 它會對前兩個 tensors 作 zero padding，並產生前面說明過的 masks_tensors
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 測試集有 labels
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pad 到同一序列長度
    tokens_tensors = pad_sequence(tokens_tensors, 
                                  batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, 
                                    batch_first=True)
    
    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, 
                                dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

In [29]:
# 初始化一個每次回傳 64 個訓練樣本的 DataLoader
# 利用 `collate_fn` 將 list of samples 合併成一個 mini-batch 是關鍵

# create dataloader for training data
BATCH_SIZE = 16
trainloader = DataLoader(trainset, shuffle=True, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

In [30]:
# create dataloader for validation data
BATCH_SIZE = 16
valloader = DataLoader(valset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

In [31]:
data = next(iter(trainloader))

tokens_tensors, segments_tensors, \
    masks_tensors, label_ids = data

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")


tokens_tensors.shape   = torch.Size([16, 54]) 
tensor([[ 101, 2769,  679, 4761, 6887, 6821, 3221, 1962, 6820, 3221,  679, 1962,
          511,  101,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0],
        [ 101, 4197, 5445, 8024,  800,  812, 3766, 3300, 5632, 2346, 4638, 1164,
         4660,  807, 6134, 8024, 3300, 1525,  671,  702,  782, 1920,  807, 6134,
         3221, 4696, 3633,  807, 6134,  749, 6821,  763, 1814, 2356, 4683, 3837,
         4638, 4495, 2100, 3326, 1164, 1469, 1164, 4660, 6206, 3724, 1450, 8043,
          101,    0,    0,    0,    0,    0],
        [ 101,  791, 1921, 1962, 1008, 7439,  678,  676, 2428, 1416,  511,  101,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,

### 3. Add a new layer on top of BERT to complete downstream task

- Via BertForSequenceClassification

##### Model 1: Using Default BertForSequenceClassification

In [47]:
from IPython.display import clear_output
from transformers import BertForSequenceClassification

pretrained_model_name = "bert-base-chinese"
num_labels = 6

model = BertForSequenceClassification.from_pretrained(
                pretrained_model_name, num_labels=num_labels)

clear_output()

# high-level 顯示此模型裡的 modules
print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:15} {}".format(name, module))


name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=6, bias=True)


##### Model 2: Increase the Dropout rate (to 0.2) for both fully connected layers and the attention probabilities

In [15]:
# from IPython.display import clear_output
# from transformers import BertForSequenceClassification, BertModel

# pretrained_model_name = "bert-base-chinese"
# num_labels = 9

# model = BertForSequenceClassification.from_pretrained(
#                 pretrained_model_name, num_labels=num_labels,
#                 hidden_dropout_prob = 0.2,
#                 attention_probs_dropout_prob = 0.2)

# clear_output()

# # high-level 顯示此模型裡的 modules
# print("""
# name            module
# ----------------------""")
# for name, module in model.named_children():
#     if name == "bert":
#         for n, _ in module.named_children():
#             print(f"{name}:{n}")
#     else:
#         print("{:15} {}".format(name, module))


name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.2, inplace=False)
classifier      Linear(in_features=768, out_features=9, bias=True)


In [48]:
# 算算整個分類模型以及裡頭的簡單分類器有多少參數：
def get_learnable_params(module):
    return [p for p in module.parameters() if p.requires_grad]
     
model_params = get_learnable_params(model)
clf_params = get_learnable_params(model.classifier)

print(f"""
整個分類模型的參數量：{sum(p.numel() for p in model_params)}
線性分類器的參數量：{sum(p.numel() for p in clf_params)}
""")


整個分類模型的參數量：102272262
線性分類器的參數量：4614



In [49]:
"""
定義一個可以針對特定 DataLoader 取得模型預測結果以及分類準確度的函式
之後也可以用來生成上傳到 Kaggle 競賽的預測結果

2019/11/22 更新：在將 `tokens`、`segments_tensors` 等 tensors
丟入模型時，強力建議指定每個 tensor 對應的參數名稱，以避免 HuggingFace
更新 repo 程式碼並改變參數順序時影響到我們的結果。
"""

def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
      
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            
            # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
            # 且強烈建議在將這些 tensors 丟入 `model` 時指定對應的參數名稱
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
_, acc = get_predictions(model, valloader, compute_acc=True)
print("classification acc:", acc)

device: cuda:0
classification acc: 0.10223116313094367


In [50]:
from torch.utils.tensorboard import SummaryWriter

# default `log_dir` is "runs" - we'll be more specific here
writer = SummaryWriter()

In [51]:
%load_ext tensorboard
%tensorboard --logdir 'runs'

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 24276), started 2:16:48 ago. (Use '!kill 24276' to kill it.)

In [52]:
%%time

# 訓練模式
device = "cuda:0"


# 使用 Adam Optim 更新整個分類模型的參數
learning_rate = 5e-5
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


EPOCHS = 8
train_running_loss = 0.0
val_running_loss = 0.0
for epoch in range(EPOCHS):
    
    # Training 
    model.train()
    for i, data in enumerate(trainloader):
        
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)
        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()
        
        # Record loss to tensorbard
        train_running_loss += loss.item()
        if i % 10 == 9:    # every 10 mini-batches...
            # ...log the running loss
            writer.add_scalar('training loss',
                            loss.item() / 10,
                            epoch * len(trainloader) + i)
            
    # Validation
    model.train()
    
    with torch.no_grad():
        for i, data in enumerate(valloader):
            # forward pass
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors, 
                            labels=labels)
            loss = outputs[0]
            val_running_loss += loss.item()
        # ...log the running loss
        writer.add_scalar('validation loss',
                        val_running_loss/len(valloader),
                        epoch + 1)
    
    
    
    # 計算分類準確率
    _, train_acc = get_predictions(model, trainloader, compute_acc=True)
    _, val_acc = get_predictions(model, valloader, compute_acc=True)

    print('[epoch %d] training loss: %.3f, training acc: %.3f, validation acc: %.3f' %
          (epoch + 1, train_running_loss, train_acc, val_acc))
    
    writer.add_scalar('training accuracy',
                        train_acc,
                        epoch + 1)
    writer.add_scalar('validation accuracy',
                        val_acc,
                        epoch + 1)
    
    # save model for every epoch
    save_directory = f'.\saved_model\Bert_3\Epoch{epoch+1}'   ### Change this before training new model
    tokenizer.save_pretrained(save_directory)
    model.save_pretrained(save_directory)
    
    
    train_running_loss = 0.0
    val_running_loss = 0.0

[epoch 1] training loss: 969.364, training acc: 0.668, validation acc: 0.591
[epoch 2] training loss: 772.521, training acc: 0.740, validation acc: 0.577
[epoch 3] training loss: 605.269, training acc: 0.817, validation acc: 0.597
[epoch 4] training loss: 449.908, training acc: 0.878, validation acc: 0.582


KeyboardInterrupt: 

In [24]:
writer.close()

### 3. Predict new input sentence

In [38]:
# load trained model
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_path = 'D:/HKU/FYP/research_method/Emotion_Detection/Ren_CECps/Bert/saved_model/Bert_2/Epoch4'

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [41]:
emotion_dict = {0: 'Happy', 1: 'Sad', 2: 'Fear', 3: 'Surprise', 4: 'Anger',
                5: 'Neutral'}

In [46]:
%%time

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)

X_train_chinese = ["我很高兴可以见到你", "你在干什么呀你", "我真的是憤怒了", "你真的是疯了", "我想了解机器学习的课程", "今天", "公开试成绩不如理想"]
batch = tokenizer(X_train_chinese, padding=True, truncation=True, max_length=512, return_tensors='pt')
batch = batch.to(device)



with torch.no_grad():
    outputs = model(**batch)
    logits = outputs[0]
    predictions = F.softmax(outputs.logits, dim=1)
    _, label = torch.max(logits.data, 1)
    print(predictions)
    print(label)

device: cuda:0
tensor([[9.8951e-01, 7.8748e-04, 1.1294e-03, 3.2023e-03, 1.6918e-03, 3.1902e-03,
         1.2232e-04, 1.9207e-04, 1.7354e-04],
        [2.5000e-02, 1.8427e-02, 8.6540e-01, 1.9107e-02, 1.8076e-02, 5.3879e-02,
         3.1401e-05, 4.5647e-05, 3.4724e-05],
        [2.6426e-03, 3.5786e-03, 7.8153e-03, 2.8660e-03, 9.8064e-01, 1.4792e-03,
         2.6240e-04, 5.1924e-04, 1.9747e-04],
        [7.4592e-02, 7.4522e-02, 6.5876e-01, 7.7423e-02, 1.0753e-01, 6.9381e-03,
         4.8619e-05, 1.2719e-04, 6.1229e-05],
        [1.0010e-02, 1.7802e-03, 7.2933e-03, 1.8169e-03, 7.0223e-04, 9.7823e-01,
         3.7622e-05, 6.6074e-05, 6.3261e-05],
        [7.1304e-03, 2.1038e-02, 2.8067e-02, 9.7130e-04, 1.0018e-03, 9.4128e-01,
         2.2050e-04, 1.8149e-04, 1.1411e-04],
        [2.0978e-02, 4.8590e-01, 4.7537e-01, 2.8740e-03, 1.0018e-02, 4.7629e-03,
         3.3819e-05, 2.5958e-05, 3.8533e-05]], device='cuda:0')
tensor([0, 2, 4, 2, 5, 5, 1], device='cuda:0')
Wall time: 121 ms
