In [2]:
import pandas as pd
import glob
import numpy as np 

In [3]:
# List the CSV files shipped in the competition data directory.
glob.glob("tweet-sentiment-extraction/*.csv")

['tweet-sentiment-extraction/test.csv',
 'tweet-sentiment-extraction/train.csv',
 'tweet-sentiment-extraction/sample_submission.csv']

In [4]:
# Load the raw competition files: labelled training tweets, unlabelled test
# tweets, and the sample submission file.
train_csv = pd.read_csv("tweet-sentiment-extraction/train.csv")
test_csv = pd.read_csv("tweet-sentiment-extraction/test.csv")
submit = pd.read_csv("tweet-sentiment-extraction/sample_submission.csv")

In [5]:
# Column dtypes and non-null counts of the training frame.
train_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27486 entries, 0 to 27485
Data columns (total 4 columns):
textID           27486 non-null object
text             27485 non-null object
selected_text    27485 non-null object
sentiment        27486 non-null object
dtypes: object(4)
memory usage: 859.0+ KB


In [6]:
# Column dtypes and non-null counts of the test frame.
test_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3535 entries, 0 to 3534
Data columns (total 3 columns):
textID       3535 non-null object
text         3535 non-null object
sentiment    3535 non-null object
dtypes: object(3)
memory usage: 82.9+ KB


In [7]:
# Drop training rows where either the tweet text or its selected span is missing.
train_csv = train_csv.dropna(subset=["text", "selected_text"])

In [8]:
# Class balance of the sentiment column.
train_csv["sentiment"].value_counts()

neutral     11117
positive     8582
negative     7786
Name: sentiment, dtype: int64

In [9]:
import torch
from transformers import BertTokenizer

In [10]:
# Name of the pretrained checkpoint; the same constant is reused later when the
# model itself is loaded, so tokenizer and model stay in sync.
PRETRAINED_MODEL_NAME = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)


In [11]:
# Inspect the tokenizer vocabulary and print its size.
vocab = tokenizer.vocab
print("字典大小：", len(vocab))

字典大小： 28996


In [12]:
# Demo: split a sentence into word pieces and map each piece to its vocabulary id.
token = tokenizer.tokenize("I have an apple")
print("Convert token to id")
print(f"{token} ---> {tokenizer.convert_tokens_to_ids(token)}")

Convert token to id
['I', 'have', 'an', 'apple'] ---> [146, 1138, 1126, 12075]


In [13]:
# Keep only rows whose tweet text and selected span are each at most
# MAX_LENGTH characters long (the length is measured in characters, not tokens).
MAX_LENGTH = 128
text_fits = train_csv["text"].map(len) <= MAX_LENGTH
train_csv = train_csv[text_fits]
selection_fits = train_csv["selected_text"].map(len) <= MAX_LENGTH
train_csv = train_csv[selection_fits]
print(f"train data remains after selecting: {len(train_csv)}")

train data remains after selecting: 25810


In [14]:
# Renumber rows 0..n-1 and keep only the columns the Dataset consumes,
# in the order (text, sentiment, selected_text).
train_csv = train_csv.reset_index(drop=True)[["text", "sentiment", "selected_text"]]

# Idempotence: persist the processed frame as a TSV file for the PyTorch Dataset to read.
train_csv.to_csv("train.tsv", sep="\t", index=False)
train_csv.head()

Unnamed: 0,text,sentiment,selected_text
0,Oh! Good idea about putting them on ice cream,positive,Good
1,says good (or should i say bad?) afternoon! h...,neutral,says good (or should i say bad?) afternoon!
2,i dont think you can vote anymore! i tried,negative,i dont think you can vote anymore!
3,haha better drunken tweeting you mean?,positive,better
4,headache wanna see my Julie,negative,headache


In [15]:
# Persist the (unfiltered) test frame as a TSV file next to train.tsv.
test_csv.to_csv("test.tsv", sep="\t", index=False)

# Report how many samples will need a prediction, then preview the frame.
print("預測樣本數：", len(test_csv))
test_csv.head()

預測樣本數： 3535


Unnamed: 0,textID,text,sentiment
0,11aa4945ff,http://twitpic.com/67swx - i wish i was calli...,positive
1,fd1db57dc0,i'm done.haha. HOUSE MD marathon ulet,positive
2,2524332d66,I'm concerned for that family,positive
3,0fb19285b2,HEY GUYS IT'S WORKING NO NEED TO WORRY. i have...,positive
4,e6c9e5e3ab,26th February,neutral


In [16]:
from torch.utils.data import Dataset

class TweetDataset(Dataset):
    """Encode rows of ``<mode>.tsv`` as BERT inputs for span extraction.

    Every item is laid out as ``[CLS] sentiment [SEP] text tokens [SEP]``.
    The sentiment prompt (3 positions) gets segment id 0 and the tweet text
    plus its closing ``[SEP]`` gets segment id 1.

    Args:
        mode: file stem to read (``"train"`` reads ``train.tsv`` and so on).
            ``"train"`` additionally yields span labels and requires a
            ``selected_text`` column; every mode requires ``text`` and
            ``sentiment`` columns.
        tokenizer: object exposing ``tokenize(str) -> list[str]`` and
            ``convert_tokens_to_ids(list[str]) -> list[int]``.
    """

    def __init__(self, mode, tokenizer):
        self.tokenizer = tokenizer
        self.mode = mode
        self.df = pd.read_csv(self.mode + ".tsv", sep="\t")
        self.len = len(self.df)
        # Span of the most recently encoded training item. Kept as attributes
        # for backward compatibility only; labels are computed per item and
        # never read back from here.
        self.start = 0
        self.end = 0

    @staticmethod
    def _find_span(ids, sel_ids, offset):
        """Return (start, end) of the first occurrence of ``sel_ids`` in ``ids`` at or after ``offset``, else None."""
        if not sel_ids:
            return None
        width = len(sel_ids)
        for ind in range(offset, len(ids) - width + 1):
            if ids[ind: ind + width] == sel_ids:
                return ind, ind + width - 1
        return None

    def __getitem__(self, idx):
        """Return ``(tokens_tensor, segments_tensor, label_tensor)`` for row ``idx``.

        ``label_tensor`` is a long tensor ``[start, end]`` (inclusive token
        indices into ``tokens_tensor``) in ``"train"`` mode and ``None``
        otherwise.
        """
        # Look columns up by name: train.tsv and test.tsv have different
        # column orders, so positional unpacking breaks in test mode.
        row = self.df.iloc[idx]
        text = row["text"]
        sentiment = row["sentiment"]

        prefix = ["[CLS]", sentiment, "[SEP]"]
        len_sentiment = len(prefix)

        text_pieces = self.tokenizer.tokenize(text) + ["[SEP]"]
        len_text = len(text_pieces)

        ids = self.tokenizer.convert_tokens_to_ids(prefix + text_pieces)
        tokens_tensor = torch.tensor(ids)
        segments_tensor = torch.tensor([0] * len_sentiment + [1] * len_text, dtype=torch.long)

        label_tensor = None
        if self.mode == "train":
            sel_tok = self.tokenizer.tokenize(row["selected_text"])
            sel_ids = self.tokenizer.convert_tokens_to_ids(sel_tok)
            # Search only inside the tweet text: starting at 0 could match the
            # sentiment prompt token (e.g. a selection that is just "positive").
            span = self._find_span(ids, sel_ids, len_sentiment)
            if span is None:
                # The selection did not tokenize to a contiguous run of the
                # text's word pieces (e.g. it starts mid-word). Fall back to
                # the whole tweet instead of reusing a stale span from a
                # previously encoded row.
                span = (len_sentiment, max(len_sentiment, len(ids) - 2))
            self.start, self.end = span
            label_tensor = torch.tensor(span, dtype=torch.long)

        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        """Number of rows in the backing TSV file."""
        return self.len

# Build the training dataset; this reads train.tsv from the working directory.
trainset = TweetDataset("train", tokenizer=tokenizer)

In [17]:
# Pick the first sample.
sample_idx = 0

# Pull out the raw row for comparison.
# NOTE(review): with the (text, sentiment, selected_text) column order written to
# train.tsv, `text_a` holds the tweet, `text_b` the sentiment and `label` the
# selected text — the names below are misleading.
text_a, text_b, label = trainset.df.iloc[sample_idx].values

# Fetch the converted id tensors from the Dataset built above.
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]

# Map tokens_tensor back to readable word pieces.
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = " ".join(tokens)

# Show the before/after side by side; this is just a print, see the output below.
print(f"""[原始文本]
句子 1：{text_a}
句子 2：{text_b}
分類  ：{label}

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：{tokens_tensor}

segments_tensor：{segments_tensor}

label_tensor   ：{label_tensor}

--------------------

[還原 tokens_tensors]
{combined_text}
""")

[原始文本]
句子 1： Oh! Good idea about putting them on ice cream
句子 2：positive
分類  ：Good

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：tensor([ 101, 3112,  102, 2048,  106, 2750, 1911, 1164, 4518, 1172, 1113, 2854,
        7081,  102])

segments_tensor：tensor([0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

label_tensor   ：tensor([5, 5])

--------------------

[還原 tokens_tensors]
[CLS] positive [SEP] Oh ! Good idea about putting them on ice cream [SEP]



In [18]:
# use to divide data into mini-batch
from torch.utils.data import DataLoader
# make data have the same length
from torch.nn.utils.rnn import pad_sequence

def mini_batch(samples):
    """Collate ``(tokens, segments, label)`` samples into zero-padded batch tensors.

    Args:
        samples: list of 3-item samples; the third item is either a label
            tensor for every sample or ``None`` (decided from the first sample).

    Returns:
        ``(token_tensors, segment_tensors, mask_tensors, label_tensors)`` where
        the first three are ``(batch, longest_seq)`` and ``label_tensors`` is
        the stacked labels or ``None``.
    """
    has_labels = samples[0][2] is not None
    label_tensors = torch.stack([sample[2] for sample in samples]) if has_labels else None

    # Right-pad every sequence with zeros up to the longest one in the batch.
    token_tensors = pad_sequence([sample[0] for sample in samples], batch_first=True)
    segment_tensors = pad_sequence([sample[1] for sample in samples], batch_first=True)

    # Attention mask: 1 wherever the token id is non-zero (i.e. not padding),
    # so that BERT attends only to real tokens.
    mask_tensors = (token_tensors != 0).to(torch.long)

    return token_tensors, segment_tensors, mask_tensors, label_tensors

# Batches are built in dataset order (no shuffle argument is passed) and padded
# per batch by `mini_batch`.
# NOTE(review): consider shuffle=True for the loader used in training — confirm.
BATCH_SIZE = 64
trainloader = DataLoader(trainset, batch_size = BATCH_SIZE, collate_fn = mini_batch)

In [19]:
# Pull the first mini-batch from the loader to check shapes and contents.
data = next(iter(trainloader))

# Order matches the tuple returned by `mini_batch`.
tokens_tensors, segments_tensors, masks_tensors, label_ids = data

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
label_ids.shape        = {label_ids.shape}
{label_ids}
""")


tokens_tensors.shape   = torch.Size([64, 46]) 
tensor([[ 101, 3112,  102,  ...,    0,    0,    0],
        [ 101, 8795,  102,  ...,    0,    0,    0],
        [ 101, 4366,  102,  ...,    0,    0,    0],
        ...,
        [ 101, 4366,  102,  ...,    0,    0,    0],
        [ 101, 3112,  102,  ...,    0,    0,    0],
        [ 101, 8795,  102,  ...,    0,    0,    0]])
------------------------
segments_tensors.shape = torch.Size([64, 46])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
------------------------
masks_tensors.shape    = torch.Size([64, 46])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
label_ids.shape        = torch.Size([64, 2])
ten

In [20]:
from transformers import BertForQuestionAnswering

# Load the question-answering variant of BERT from the same checkpoint name
# that was used for the tokenizer.
# model information: https://huggingface.co/transformers/model_doc/bert.html#bertforquestionanswering
model = BertForQuestionAnswering.from_pretrained(PRETRAINED_MODEL_NAME)

In [21]:
# Display the configuration the model was loaded with.
model.config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "do_sample": false,
  "eos_token_ids": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": null,
  "pruned_heads": {},
  "repetition_penalty": 1.0,
  "temperature": 1.0,
  "top_k": 50,
  "top_p": 1.0,
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16

In [23]:
def get_predictions(model, dataloader, compute_acc=False):
    """Predict a (start, end) token span for every sample in ``dataloader``.

    The model is called without labels, so the first two items of its output
    are taken to be the start and end logits. It is switched to eval mode for
    the duration of the call and restored to its previous mode afterwards, so
    this can be called from inside a training loop.

    Args:
        model: ``torch.nn.Module`` accepting ``input_ids``, ``token_type_ids``
            and ``attention_mask`` keyword arguments and returning a sequence
            whose items 0 and 1 are ``(batch, seq_len)`` start/end logits.
        dataloader: iterable of batches ``(tokens, segments, masks, labels)``.
            ``labels`` is only read when ``compute_acc`` is True and may be
            ``None`` otherwise.
        compute_acc: also compute exact-match accuracy (both start and end
            index correct) against ``labels``.

    Returns:
        A long tensor of shape ``(num_samples, 2)`` holding the predicted
        ``[start, end]`` indices, or ``None`` if the dataloader yields no
        batches. With ``compute_acc=True`` returns ``(predictions, acc)``;
        ``acc`` is ``0.0`` when there were no samples.
    """
    # Follow whatever device the model lives on instead of hard-coding cuda:0.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    batches = []
    correct = 0
    total = 0

    was_training = model.training
    model.eval()  # disable dropout while scoring
    try:
        with torch.no_grad():
            # Walk through the whole dataset.
            for data in dataloader:
                data = [t.to(device) if t is not None else None for t in data]

                # The first 3 tensors are tokens, segments and masks; pass them
                # by keyword so they cannot be mixed up.
                tokens_tensors, segments_tensors, masks_tensors = data[:3]
                outputs = model(input_ids=tokens_tensors,
                                token_type_ids=segments_tensors,
                                attention_mask=masks_tensors)

                starts_pred = outputs[0].argmax(dim=1)
                ends_pred = outputs[1].argmax(dim=1)

                # Exact-match accuracy: both span boundaries must be right.
                if compute_acc:
                    labels = data[3]
                    total += labels.size(0)
                    exact = (starts_pred == labels[:, 0]) & (ends_pred == labels[:, 1])
                    correct += exact.sum().item()

                # Record this batch as rows of [start, end].
                batches.append(torch.stack((starts_pred, ends_pred), dim=1))
    finally:
        model.train(was_training)

    predictions = torch.cat(batches) if batches else None

    if compute_acc:
        acc = correct / total if total else 0.0
        return predictions, acc

    return predictions
    
# Run the model on the GPU when one is available, otherwise on the CPU.
# (The commented-out lines below would compute accuracy on the training set.)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
# _, acc = get_predictions(model, trainloader, compute_acc=True)
# print("classification acc:", acc)

device: cpu


In [25]:
%%time

# Update every parameter of the model with the Adam optimizer.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

EPOCHS = 6  # lucky number
for epoch in range(EPOCHS):
    # Enter training mode at the start of every epoch, so dropout is active
    # even if the end-of-epoch evaluation left the model in eval mode.
    model.train()

    running_loss = 0.0
    for data in trainloader:
        token_tensors, segment_tensors, mask_tensors, labels = [t.to(device) for t in data]

        # Reset the parameter gradients.
        optimizer.zero_grad()

        # Forward pass; with start/end positions supplied, outputs[0] is the loss.
        outputs = model(input_ids=token_tensors,
                        token_type_ids=segment_tensors,
                        attention_mask=mask_tensors,
                        start_positions=labels[:, 0], end_positions=labels[:, 1])

        loss = outputs[0]
        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Accumulate this batch's loss (summed over the epoch, not averaged).
        running_loss += loss.item()

    # Exact-match accuracy on the training set after this epoch.
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, acc: %.3f' % (epoch + 1, running_loss, acc))

KeyboardInterrupt: 

In [None]:
# Scratch check: shapes of items 2 and 1 of the last forward pass's outputs
# (relies on `outputs` left over from the training cell).
print(outputs[2].data.shape)
print(outputs[1].data.shape)

In [None]:
# Scratch check: decode the first training sample's token ids back to word pieces.
" ".join(tokenizer.convert_ids_to_tokens(trainset[0][0].tolist()))

In [None]:
# Scratch check: re-read train.tsv and show the first cell of the first row.
a = pd.read_csv("train.tsv", sep="\t")
a.iloc[0, 0]

In [None]:
#  jaccard --> intersection over union (used to compare the similarity of two sets)
#  lemmatization
# [SEP] 102 [CLS] 101