In [1]:
import os
import numpy as np
import pandas as pd
import csv
import itertools
import pickle
from tqdm import tqdm_notebook, trange
from IPython.display import clear_output

In [2]:
def data_processing(file):
    # read and parse tsv file
    reviews = []
    label = []
    with open(file, encoding='utf-8') as f:
        rows = csv.reader(f, delimiter='\t', quotechar='"')
        for index, row in enumerate(rows):
            if index % 2 == 0:
                reviews.append(row)
            else:
                label.append(row)

        for i, j in zip(reviews, label):
            if len(i) != len(j):
                unequal_idx = reviews.index(i)
                reviews.pop(unequal_idx)
                label.pop(unequal_idx)

        # check whether the length of reviews & sentiment are equal
        for i, j in zip(reviews, label):
            assert len(i) == len(j)
            
    reviews = list(itertools.chain.from_iterable(reviews))
    label = list(itertools.chain.from_iterable(label))
    
    df_bert = pd.DataFrame({
        'reviews': reviews,
        'label': label
    })
    
    return df_bert

file_name = 'training_set.tsv'
data_bert = data_processing(file_name)
data_bert.head()

Unnamed: 0,reviews,label
0,千呼万唤始出来，,neutral
1,尼康的APSC小相机终于发布了，,neutral
2,COOLPIX A. 你怎么看呢？,neutral
3,我看，尼康是挤牙膏挤惯了啊，,neutral
4,1，外观既没有V1时尚，,negative


In [3]:
# negative = 0
# neutral = 1
# positive = 2
1 / (data_bert.label.value_counts() / data_bert.shape[0])

neutral      1.278601
positive     7.142151
negative    12.840000
Name: label, dtype: float64

## 將資料分為訓練及測試集

In [4]:
from sklearn.model_selection import train_test_split
train_data_bert, test_data_bert = train_test_split(data_bert, test_size=0.2, random_state=42)

# os.mkdir('data')
train_data_bert.to_csv('data/train.tsv', sep='\t', index=False)
test_data_bert.to_csv('data/test.tsv', sep='\t', index=False)

In [5]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import BertTokenizer

PRETRAINED_MODEL_NAME = "bert-base-chinese" #指定繁簡中文 BERT-BASE 預訓練模型
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME) # 取得此預訓練模型所使用的 tokenizer

class ReviewsDataset(Dataset):
    # 讀取前處理後的 tsv 檔並初始化一些參數
    def __init__(self, mode, tokenizer):
        assert mode in ["train", "test"]
        self.mode = mode
        self.df = pd.read_csv(r"data/" + mode + ".tsv", sep="\t").fillna("")
        self.len = len(self.df)
        self.label_map = {"negative": 0, "neutral": 1, "positive": 2}
        self.tokenizer = tokenizer
        
    # 定義回傳一筆訓練 / 測試數據的函式
    def __getitem__(self, idx):
        # test mode
        if self.mode == "test": 
            review = self.df.iloc[idx, :1].values[0]
            label_tensor = None # 在 test mode 中，label設定為 None 以用於預測
        
        # train mode
        else:
            review, label = self.df.iloc[idx, :2].values
            label_id = self.label_map[label]
            label_tensor = torch.tensor(label_id)
        
        # 建立句子的 BERT tokens 並加入分隔符號 [SEP]
        word_pieces = ["[CLS]"] # 起始 token
        tokens = self.tokenizer.tokenize(review)
        word_pieces += tokens
        len_tokens = len(word_pieces)
        
        # 將整個 token 序列轉換成索引序列
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # 設定 segments tensors
        segments_tensor = torch.tensor([0]*len_tokens, dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len
    
clear_output()

In [6]:
# 初始化
trainset = ReviewsDataset("train", tokenizer=tokenizer)

In [7]:
# 選擇第一個樣本
sample_idx = 0

# 將原始文本拿出做比較
review, label = trainset.df.iloc[sample_idx].values

# 利用剛才建的 FakeNewsDataset 取出轉換後的 id tensors
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]

# 將 tokens_tensor 還原成文本
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = "".join(tokens)

# 渲染前後差異，毫無反應就是個 print。可以直接看輸出結果
print(f"""[原始文本]
Review：{review}
分類  ：{label}

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：{tokens_tensor}

segments_tensor：{segments_tensor}

label_tensor   ：{label_tensor}

--------------------

[還原 tokens_tensors]
{combined_text}
""")

[原始文本]
Review：什么机器在仓,
分類  ：neutral

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：tensor([ 101,  784,  720, 3322, 1690, 1762,  797,  117])

segments_tensor：tensor([0, 0, 0, 0, 0, 0, 0, 0])

label_tensor   ：1

--------------------

[還原 tokens_tensors]
[CLS]什么机器在仓,



In [9]:
"""
實作可以一次回傳一個 mini-batch 的 DataLoader
這個 DataLoader 吃我們上面定義的 `ReviewsDataset`，
回傳訓練 BERT 時會需要的 4 個 tensors：
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# 這個函式的輸入 `samples` 是一個 list，裡頭的每個 element 都是'ReviewsDataset' 回傳的一個樣本，每個樣本都包含 3 tensors：
# - tokens_tensor
# - segments_tensor
# - label_tensor
# create_mini_batch 會對前兩個 tensors 作 zero padding，並產生 masks_tensors。

def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # train mode 有 labels
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    
    # test mode 無 labels
    else:
        label_ids = None
    
    # zero pad 到同一序列長度
    tokens_tensors = pad_sequence(tokens_tensors, 
                                  batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, 
                                    batch_first=True)
    
    # attention masks，將 tokens_tensors 裡頭不為 zero padding 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, 
                                dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(
        tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids


BATCH_SIZE = 8
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

In [10]:
# 載入一個可以做中文多分類任務的模型，n_class = 3
from transformers import BertForSequenceClassification

PRETRAINED_MODEL_NAME = 'bert-base-chinese'
NUM_LABELS = 3

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

clear_output()

In [11]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
    
    with torch.no_grad():
        for data in tqdm_notebook(dataloader):
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None] 
            
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(
                    input_ids=tokens_tensors, 
                    token_type_ids=segments_tensors, 
                    attention_mask=masks_tensors
            )

            logits = outputs[0]
            _, pred = torch.max(logits.data, 1) # 1 -> 對列取 max
            
            # 計算訓練集的精準度
            if compute_acc:
                labels = data[3]
                total += labels.size(0) # 紀錄目前訓練之總data數目
                correct += (pred == labels).sum().item()
            
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
        
    if compute_acc:
        acc = correct / total
        return predictions, acc

    return predictions

In [12]:
# 讓模型跑在 GPU 上並取得訓練集的分類準確率
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
# _, acc = get_predictions(model, trainloader, compute_acc=True)
# print("classification acc:", acc)

device: cuda:0


In [13]:
# 訓練模式
model.train()

# 使用 Adam Optim 更新整個分類模型的參數
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
optimizer = torch.optim.RMSprop(model.parameters(),
                                weight_decay = 1e-6,
                                lr=1e-5)

m = nn.LogSoftmax(dim=1)
#criterion = nn.NLLLoss(weight=torch.tensor([12.63, 1.28, 7.06]).to(device))
criterion = nn.NLLLoss()
EPOCHS = 12
for epoch in tqdm_notebook(range(EPOCHS)):
    
    running_loss = 0.0
    for data in trainloader:
        
        tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)

        loss = criterion(m(outputs[1]), labels)
        # backward
        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()

        # 紀錄當前 batch loss
        running_loss += loss.item()
        
    # 計算分類準確率
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[Epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=578), HTML(value='')))


[Epoch 1] loss: 255.115, acc: 0.894


HBox(children=(IntProgress(value=0, max=578), HTML(value='')))


[Epoch 2] loss: 154.977, acc: 0.937


HBox(children=(IntProgress(value=0, max=578), HTML(value='')))


[Epoch 3] loss: 93.908, acc: 0.965


HBox(children=(IntProgress(value=0, max=578), HTML(value='')))


[Epoch 4] loss: 65.671, acc: 0.973


HBox(children=(IntProgress(value=0, max=578), HTML(value='')))


[Epoch 5] loss: 42.960, acc: 0.981


HBox(children=(IntProgress(value=0, max=578), HTML(value='')))


[Epoch 6] loss: 33.028, acc: 0.987


HBox(children=(IntProgress(value=0, max=578), HTML(value='')))


[Epoch 7] loss: 30.018, acc: 0.993


HBox(children=(IntProgress(value=0, max=578), HTML(value='')))


[Epoch 8] loss: 17.238, acc: 0.992


HBox(children=(IntProgress(value=0, max=578), HTML(value='')))


[Epoch 9] loss: 19.196, acc: 0.989


HBox(children=(IntProgress(value=0, max=578), HTML(value='')))


[Epoch 10] loss: 14.617, acc: 0.991


HBox(children=(IntProgress(value=0, max=578), HTML(value='')))


[Epoch 11] loss: 16.970, acc: 0.983


HBox(children=(IntProgress(value=0, max=578), HTML(value='')))


[Epoch 12] loss: 14.310, acc: 0.989



In [None]:
[Epoch 1] loss: 717.377, acc: 0.819
A Jupyter widget could not be displayed because the widget state could not be found. This could happen if the kernel storing the widget is no longer available, or if the widget state was not saved in the notebook. You may be able to create the widget by running the appropriate cells.

[Epoch 2] loss: 448.183, acc: 0.829
A Jupyter widget could not be displayed because the widget state could not be found. This could happen if the kernel storing the widget is no longer available, or if the widget state was not saved in the notebook. You may be able to create the widget by running the appropriate cells.

[Epoch 3] loss: 291.641, acc: 0.873
A Jupyter widget could not be displayed because the widget state could not be found. This could happen if the kernel storing the widget is no longer available, or if the widget state was not saved in the notebook. You may be able to create the widget by running the appropriate cells.

[Epoch 4] loss: 202.421, acc: 0.953
A Jupyter widget could not be displayed because the widget state could not be found. This could happen if the kernel storing the widget is no longer available, or if the widget state was not saved in the notebook. You may be able to create the widget by running the appropriate cells.

[Epoch 5] loss: 139.828, acc: 0.963
A Jupyter widget could not be displayed because the widget state could not be found. This could happen if the kernel storing the widget is no longer available, or if the widget state was not saved in the notebook. You may be able to create the widget by running the appropriate cells.

[Epoch 6] loss: 93.589, acc: 0.971

In [14]:
# 建立測試集。
testset = ReviewsDataset("test", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=64, 
                        collate_fn=create_mini_batch,
                        shuffle=False)

# 預測測試集
predictions = get_predictions(model, testloader)

# ground truth
label_mapping = {
    'negative': 0,
    'neutral': 1,
    'positive': 2
    } 
ground_truth = torch.tensor([label_mapping[i] for i in testset.df.iloc[:, 1].values])

HBox(children=(IntProgress(value=0, max=19), HTML(value='')))




In [15]:
def get_test_accuracy(pred, truth):
    # predictions: gpu -> cpu
    corr_count = (pred.cpu() == truth).sum().item()
    return corr_count / len(truth)

In [16]:
get_test_accuracy(predictions, ground_truth)

0.8685121107266436

### F- score

In [17]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

test_Y = ground_truth
pred_Y = predictions.cpu()

accuracy = accuracy_score(test_Y, pred_Y)
precision = precision_score(test_Y, pred_Y, average='macro')
recall = recall_score(test_Y, pred_Y, average='macro')
fscore = f1_score(test_Y, pred_Y, average='macro')

print("Accuracy: %g\tPrecision: %g\tRecall: %g\tF-score: %g" % (
    accuracy, precision, recall, fscore))

Accuracy: 0.868512	Precision: 0.771361	Recall: 0.758205	F-score: 0.759835


In [None]:
result = pd.DataFrame({'true':test_Y, 'predict':pred_Y})
pd.crosstab(result.true, result.predict)