# 專題（二）：建置Bert新聞分類器之資料集

## 專案目標
- 目標：請試著建置 BertForSequenceClassification 看得懂的資料集 NewsDataset
- news_clustering_train.tsv 中有 1800 篇新聞，六種類別的新聞各 300 篇
- news_clustering_test.tsv 中有 600 篇新聞，六種類別的新聞各 100 篇
- 六種類別：體育、財經、科技、旅遊、農業、遊戲

## 實作提示
- STEP1：從 news_clustering_train.tsv 和 news_clustering_test.tsv 中取出標題和類別
- STEP2：繼承 torch.utils.data.Dataset 並實作 NewsDataset，其中需要用到 BERT tokenizer（請參考官方對 BertForSequenceClassification 的說明）
- STEP3：因為每一個從 NewsDataset 取出的樣本長度都不一樣，所以需要實作 collate_fn，以 zero padding 將同一批次的樣本補齊到相同的序列長度
- STEP4：使用 torch.utils.data.DataLoader 來建立 train_loader 和 valid_loader

## 重要知識點：專題結束後你可以學會
- 如何讀取並處理 NLP 資料，產生可以適用 BertForSequenceClassification 的資料集
- 了解 BERT 的 Sequence Classification 任務如何進行

In [1]:
!python --version

Python 3.6.5 :: Anaconda, Inc.


In [2]:
# !pip install -q transformers

In [3]:
import pandas as pd
import numpy as np

import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from transformers import BertTokenizer, BertForSequenceClassification

In [4]:
# Load the train and test splits; both files are tab-separated.
df_train, df_test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('news_clustering_train.tsv', 'news_clustering_test.tsv')
)

In [5]:
# Map each article's 'index' value to its title and to its class name.
# Zipping the two columns builds each mapping in a single pass, instead of
# walking the frame row by row with DataFrame.iterrows() once per mapping.
train_titles = dict(zip(df_train['index'], df_train['title']))
train_classes = dict(zip(df_train['index'], df_train['class']))

valid_titles = dict(zip(df_test['index'], df_test['title']))
valid_classes = dict(zip(df_test['index'], df_test['class']))

In [6]:
# Label vocabulary. A class name's position in this list is used as its
# integer label id, so the order must not change.
ALL_NEWS_CLASSES = [
    '體育',  # sports
    '財經',  # finance
    '科技',  # technology
    '旅遊',  # travel
    '農業',  # agriculture
    '遊戲',  # games
]

In [7]:
# Pretrained checkpoint name passed to BertTokenizer.from_pretrained below.
MODEL_NAME = 'bert-base-chinese'

In [8]:
# Build the dataset
class NewsDataset(Dataset):
    """Map-style dataset of news titles for BertForSequenceClassification.

    Each item is an ``(encoding, label)`` pair. ``encoding`` is whatever the
    tokenizer returns for a single title, and ``label`` is a 0-dim tensor
    holding the position of the title's class name in ``class_names``.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', truncation=True)``.
        titles: Mapping from article index to title text.
        classes: Mapping from article index to class name. It must contain
            every key of ``titles``.
        class_names: Ordered class names that define the label ids. When
            omitted, the module-level ``ALL_NEWS_CLASSES`` is used.

    Raises:
        KeyError: From ``__init__`` if an index in ``titles`` is missing
            from ``classes``.
        ValueError: From ``__getitem__`` if an item's class name is not in
            ``class_names``.
    """

    def __init__(self, tokenizer, titles, classes, class_names=None):
        self.tokenizer = tokenizer
        # Copy so later mutation of the caller's list cannot shift label ids.
        self.class_names = list(
            ALL_NEWS_CLASSES if class_names is None else class_names
        )
        # Keep three parallel lists so items can be addressed by position.
        self.indexes = list(titles)
        self.texts = [titles[index] for index in self.indexes]
        self.labels = [classes[index] for index in self.indexes]

    def __getitem__(self, idx):
        # truncation=True asks the tokenizer to cap the sequence at its
        # configured maximum length, so an unusually long text cannot
        # exceed the model's position limit.
        encoding = self.tokenizer(
            self.texts[idx], return_tensors='pt', truncation=True
        )
        label = torch.tensor(self.class_names.index(self.labels[idx]))

        return encoding, label

    def __len__(self):
        return len(self.texts)


def create_mini_batch(samples):
    """Collate ``(encoding, label)`` samples into padded batch tensors.

    Args:
        samples: Sequence of pairs. Element 0 is a mapping with
            'input_ids', 'token_type_ids' and 'attention_mask' tensors that
            carry a leading axis of size 1; element 1 is a label tensor.

    Returns:
        Tuple ``(input_ids, token_type_ids, attention_mask, labels)``. The
        first three are padded with ``pad_sequence(batch_first=True)`` to the
        longest sequence in the batch; ``labels`` is the stacked labels.
    """
    def _padded(key):
        # Drop the leading size-1 axis, then pad (pad_sequence's default fill
        # value) up to the longest sequence in this batch.
        return pad_sequence(
            [sample[0][key].squeeze(0) for sample in samples],
            batch_first=True,
        )

    input_ids = _padded('input_ids')
    token_type_ids = _padded('token_type_ids')
    attention_mask = _padded('attention_mask')

    labels = torch.stack([sample[1] for sample in samples])

    return input_ids, token_type_ids, attention_mask, labels

In [9]:
batch_size = 32

# Load the tokenizer that matches the checkpoint named by MODEL_NAME.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

train_dataset = NewsDataset(tokenizer, train_titles, train_classes)
valid_dataset = NewsDataset(tokenizer, valid_titles, valid_classes)

# Both loaders pad per batch via create_mini_batch; only the training
# loader shuffles its samples.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=create_mini_batch,
)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=create_mini_batch,
)

I0418 20:13:58.927656  9576 filelock.py:274] Lock 1352020389784 acquired on C:\Users\Guan-Ting Chen/.cache\huggingface\transformers\36acdf4f3edf0a14ffb2b2c68ba47e93abd9448825202377ddb16dae8114fe07.accd894ff58c6ff7bd4f3072890776c14f4ea34fcc08e79cd88c2d157756dceb.lock


Downloading:   0%|          | 0.00/110k [00:00<?, ?B/s]

I0418 20:14:00.212035  9576 filelock.py:318] Lock 1352020389784 released on C:\Users\Guan-Ting Chen/.cache\huggingface\transformers\36acdf4f3edf0a14ffb2b2c68ba47e93abd9448825202377ddb16dae8114fe07.accd894ff58c6ff7bd4f3072890776c14f4ea34fcc08e79cd88c2d157756dceb.lock
I0418 20:14:02.185094  9576 filelock.py:274] Lock 1352020389784 acquired on C:\Users\Guan-Ting Chen/.cache\huggingface\transformers\2dc6085404c55008ba7fc09ab7483ef3f0a4ca2496ccee0cdbf51c2b5d529dff.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f.lock


Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

I0418 20:14:02.897734  9576 filelock.py:318] Lock 1352020389784 released on C:\Users\Guan-Ting Chen/.cache\huggingface\transformers\2dc6085404c55008ba7fc09ab7483ef3f0a4ca2496ccee0cdbf51c2b5d529dff.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f.lock
I0418 20:14:03.562598  9576 filelock.py:274] Lock 1352020389784 acquired on C:\Users\Guan-Ting Chen/.cache\huggingface\transformers\7e23f4e1f58f867d672f84d9a459826e41cea3be6d0fe62502ddce9920f57e48.4495f7812b44ff0568ce7c4ff3fdbb2bac5eaf330440ffa30f46893bf749184d.lock


Downloading:   0%|          | 0.00/269k [00:00<?, ?B/s]

I0418 20:14:05.345240  9576 filelock.py:318] Lock 1352020389784 released on C:\Users\Guan-Ting Chen/.cache\huggingface\transformers\7e23f4e1f58f867d672f84d9a459826e41cea3be6d0fe62502ddce9920f57e48.4495f7812b44ff0568ce7c4ff3fdbb2bac5eaf330440ffa30f46893bf749184d.lock


In [10]:
# Pull one batch from the training loader to inspect the collated tensors.
next(iter(train_loader))

(tensor([[ 101, 1521, 8013,  ...,    0,    0,    0],
         [ 101,  821, 3511,  ...,    0,    0,    0],
         [ 101, 1963, 3362,  ..., 3564, 8043,  102],
         ...,
         [ 101, 8131, 2399,  ...,    0,    0,    0],
         [ 101, 2582, 7938,  ...,    0,    0,    0],
         [ 101, 4257, 4767,  ...,    0,    0,    0]]),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([0, 1, 0, 5, 4, 4, 1, 3, 1, 1, 5, 4, 2, 5, 0, 5, 4, 1, 0, 3, 2, 3, 2, 0,
         1, 4, 1, 4, 3, 0, 2, 5]))