# Jigsaw Multilingual Toxic Comment Classification - Bert 
#### Members: 資科四 劉上銓 105703030 資科四 邱顯安 105703012 資科四 林瀚軒 105703004

# TPU
1. 安裝新版的 pip，去除 warning
2. 安裝為了使用 TPU 的相依套件

In [None]:
# Upgrade pip first (the notebook does this to silence the outdated-pip warning).
!python -m pip install --upgrade pip
# Download the PyTorch/XLA environment setup script.
# NOTE(review): it is fetched from the repository's master branch, so the script is not pinned to a release.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# Run the setup script, additionally requesting the libomp5 and libopenblas-dev apt packages.
!python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

In [None]:
import random
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score
import IPython
import sys

import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import transformers
from transformers import BertForSequenceClassification, BertPreTrainedModel, BertConfig, BertModel
import torch.nn as nn
import torch.nn.functional as F

import torch_xla
import torch_xla.core.xla_model as xm

# Bert 基礎參數設定

In [None]:
class config:
    """Hyper-parameters read by the data loaders and the training loop."""

    # Optimisation settings: peak learning rate and number of passes over the data.
    LR = 3e-5
    EPOCHS = 1

    # Mini-batch sizes for the train / validation / test loaders.
    BATCH_SIZE = 32
    VAL_BATCH_SIZE = 128
    TEST_BATCH_SIZE = 128

# 讀取資料
1. 從競賽中取出經預處理的訓練資料 `jigsaw-toxic-comment-train-processed-seqlen128.csv` --> 句子的最大長度是 128
2. 也取出處理過的驗證資料 `validation-processed-seqlen128.csv` 和測試資料 `test-processed-seqlen128.csv`
3. 保留 `[id, comment_text, input_word_ids, input_mask, all_segment_id, toxic]`，comment_text 代表 twitter 的留言，toxic 是 1 表示惡意，0 表示安全
4. 與前面兩個模型相同使用 20000 筆資料當作我們最後的訓練資料

In [None]:
# Pre-tokenised (sequence length 128) competition splits plus the submission template.
train = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train-processed-seqlen128.csv")
valid = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation-processed-seqlen128.csv")
test = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test-processed-seqlen128.csv")
submit = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv")

# Keep only the columns the model needs and restrict training to the first 20000 rows.
train = train[
    ['id', 'comment_text', 'input_word_ids', 'input_mask', 'all_segment_id', 'toxic']
].head(20000)

# Dataset
1. 因為放進 Bert 的資料需要三樣東西，分別是 (token_tensor, segment_tensor, mask_tensor)
    - token tensor 就是每一個句子轉換為 id 之後的句子
    - segment tensor 是因為 Bert 的應用中某一些情況是兩個句子的輸入，必須告訴模型哪一些是第一句，哪一些是第二句
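    - mask tensor (attention mask) 用來標示哪些位置是真正的 token (1)、哪些是為了補齊長度的 padding (0)，讓 Bert 在計算 self-attention 時忽略 padding；它與 Bert 預訓練時把單字遮罩起來當作克漏字來預測的 [MASK] 不同 (Bert 的另一個預訓練任務是 Next Sentence Prediction)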
2. 將那三樣東西從 train 拿出來，還有正確答案 label_tensor

In [None]:
class TweetDataset(Dataset):
    """Map-style dataset that turns pre-processed CSV rows into BERT input tensors.

    Args:
        mode: ``"train"`` or ``"valid"`` to read the label from the ``toxic``
            column; any other value (e.g. ``"test"``) yields a placeholder
            label of ``-1``.
        df: DataFrame holding the string columns ``input_word_ids``,
            ``all_segment_id`` and ``input_mask`` (plus ``toxic`` for the
            labelled modes).
    """

    def __init__(self, mode, df):
        self.mode = mode
        self.df = df

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _parse_ids(serialized):
        """Convert a serialized id list such as ``"(1, 2, 3)"`` into a 1-D integer tensor."""
        # The first and last characters are dropped (presumably the enclosing
        # brackets), then the comma-separated integers are parsed.
        return torch.tensor([int(piece) for piece in serialized[1:-1].split(",")])

    def __getitem__(self, idx):
        """Return ``(tokens, segments, masks, label)`` for the row at position ``idx``."""
        # Positional (.iloc) access: samplers hand out 0..len-1, which matches
        # the index *labels* only when the frame has a default RangeIndex.
        # Column-wise lookup also avoids materialising the whole row.
        tokens_tensor = self._parse_ids(self.df["input_word_ids"].iloc[idx])
        segments_tensor = self._parse_ids(self.df["all_segment_id"].iloc[idx])
        masks_tensor = self._parse_ids(self.df["input_mask"].iloc[idx])

        if self.mode in ("train", "valid"):
            label_tensor = torch.tensor(self.df["toxic"].iloc[idx])
        else:
            # Unlabelled data: keep the tuple shape uniform with a dummy label.
            label_tensor = torch.tensor(-1)

        return tokens_tensor, segments_tensor, masks_tensor, label_tensor

# Dataloader
1. 將從 csv 讀出來的資料經過剛剛的 Dataset 進行處理
2. 將不同的語言從 valid 分離出來，分別存放，為了觀察各語言的情況
3. 之後切割出對應的 batch size 

In [None]:
# Display name -> language code used in the validation frame's "lang" column.
lang = {'Spanish': 'es', 'Italian': 'it', 'Turkish': 'tr'}

# Datasets over the complete splits.
trainset = TweetDataset("train", train)
validset = TweetDataset("valid", valid)
testset = TweetDataset("test", test)

# One validation dataset per language; the index is reset so rows are numbered from 0.
validsets = {
    name: TweetDataset("valid", valid[valid["lang"] == code].reset_index(drop=True))
    for name, code in lang.items()
}

# None of the loaders shuffles: predict() compares its outputs with the
# DataFrame rows in their stored order.
trainloader = DataLoader(trainset, batch_size=config.BATCH_SIZE, num_workers=4, shuffle=False)
validloader = DataLoader(validset, batch_size=config.VAL_BATCH_SIZE, num_workers=4, shuffle=False)
testloader = DataLoader(testset, batch_size=config.TEST_BATCH_SIZE, num_workers=4, shuffle=False)
validloaders = {
    name: DataLoader(subset, batch_size=config.VAL_BATCH_SIZE, num_workers=4, shuffle=False)
    for name, subset in validsets.items()
}

# Bert 架構
1. 讀取 bert-base-multilingual-cased 的預訓練模型
2. 取出最後一層的 hidden states，而不是只使用 [CLS] (句首用於分類的特殊 token) 的資訊 --> 經過嘗試，使用 [CLS] 的成績比較低落
3. 將取出來的 hidden states 做一次 average pooling，接著做 max pooling，將兩個的結果串接起來，因此維度變為原來的兩倍 ($2\times768$)
4. 接著 dropout(0.3) --> linear(tanh) --> output


In [None]:
class Model(nn.Module):
    """Multilingual BERT encoder followed by a pooled classification head.

    The last-layer hidden states are mean-pooled and max-pooled over the
    sequence dimension, concatenated, and passed through
    dropout -> linear + tanh -> linear to produce the logits.

    Args:
        labels: Number of output logits per example (default 1).
    """

    def __init__(self, labels=1):
        super().__init__()

        self.bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        # Hidden width of the encoder, read from its pooler layer.
        self.num_features = self.bert.pooler.dense.out_features
        self.labels = labels

        self.drop = nn.Dropout(0.3)
        # Input is twice the hidden width: mean- and max-pooled features are concatenated.
        self.fc1 = nn.Linear(self.num_features * 2, self.num_features)
        self.logit = nn.Linear(self.num_features, self.labels)

    def forward(self, tokens_tensors, segments_tensors, masks_tensors):
        """Return raw (pre-sigmoid) logits for a batch of encoded inputs.

        Args:
            tokens_tensors: Token ids, passed to the encoder as ``input_ids``.
            segments_tensors: Segment ids, passed as ``token_type_ids``.
            masks_tensors: Mask, passed as ``attention_mask``.

        Returns:
            The output of the final linear layer (``labels`` values per example).
        """
        outputs = self.bert(
            input_ids=tokens_tensors,
            token_type_ids=segments_tensors,
            attention_mask=masks_tensors,
        )
        # Index rather than tuple-unpack: when the encoder returns a dict-like
        # ModelOutput, unpacking would yield its *keys* (strings), whereas
        # integer indexing works for both tuples and ModelOutput objects.
        hidden_states = outputs[0]

        # NOTE: pooling runs over every sequence position; masks_tensors is
        # only handed to the encoder and is not applied here.
        avgpool = torch.mean(hidden_states, 1)
        maxpool, _ = torch.max(hidden_states, 1)
        cat = torch.cat((avgpool, maxpool), 1)

        x = self.drop(cat)
        x = torch.tanh(self.fc1(x))
        output = self.logit(x)

        return output

In [None]:
# Instantiate the classifier (its constructor loads the pretrained
# bert-base-multilingual-cased encoder) with the default single output logit.
model = Model()

確認模型的輸出跟所設計的架構是否相同

In [None]:
# Bare expression: as the last statement of a notebook cell it displays the module's layer structure.
model

# Device
- 將整個模型放入 TPU 當中

In [None]:
# Acquire the XLA device and place the model's parameters on it.
# nn.Module.to() moves the module in place and returns it, so rebinding is a no-op.
device = xm.xla_device()
model = model.to(device)
print(f"Now we use {device}\n")

# Train - function
1. optimizer: AdamW, scheduler: lr 先線性 warmup 再隨著步驟線性下降, loss function: BCEWithLogitsLoss
2. 將每一個 input 分別放進 tensor 以利計算
3. loss --> backward --> optimize --> schedule
4. 計算一個 epoch 結束後 valid 的成績

In [None]:
def _evaluate(model, loss_fun):
    """Run ``model`` over ``validloader``; return ``(mean_loss, roc_auc)``."""
    model.eval()
    preds = []
    truths = []
    avg_val_loss = 0.

    with torch.no_grad():
        for data in validloader:
            tokens_tensor, segments_tensor, masks_tensor, labels_tensor = [k.to(device) for k in data if k is not None]
            output = model(tokens_tensor, segments_tensor, masks_tensor)
            loss = loss_fun(output.view(-1).float(), labels_tensor.float())
            avg_val_loss += loss.item() / len(validloader)

            # Probabilities (not logits) are collected for the AUC computation.
            preds.extend(torch.sigmoid(output).cpu().numpy().flatten())
            truths.extend(labels_tensor.cpu().numpy().flatten())

    return avg_val_loss, roc_auc_score(truths, preds)


def training(model, warmup_prop=0.1):
    """Fine-tune ``model`` on ``trainloader`` and print validation metrics after each epoch.

    Uses AdamW with a linear warm-up / linear decay schedule and
    ``BCEWithLogitsLoss``. Reads the module-level ``config``, ``trainloader``,
    ``validloader`` and ``device``.

    Args:
        model: Module called as ``model(tokens, segments, masks)`` and
            returning logits; it is updated in place.
        warmup_prop: Fraction of the total optimisation steps spent warming up
            the learning rate.

    Returns:
        None. One summary line (lr, train loss, validation loss, validation
        AUC) is printed per epoch.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.LR)
    num_warmup_steps = int(warmup_prop * config.EPOCHS * len(trainloader))
    num_training_steps = config.EPOCHS * len(trainloader)
    scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)
    loss_fun = torch.nn.BCEWithLogitsLoss(reduction='mean').to(device)

    for epoch in range(config.EPOCHS):
        model.train()
        optimizer.zero_grad()
        avg_loss = 0

        for data in tqdm(trainloader):
            tokens_tensor, segments_tensor, masks_tensor, labels_tensor = [k.to(device) for k in data if k is not None]
            output = model(tokens_tensor, segments_tensor, masks_tensor)
            # Logits are flattened to match the 1-D label vector.
            loss = loss_fun(output.view(-1).float(), labels_tensor.float())
            loss.backward()
            avg_loss += loss.item() / len(trainloader)

            # XLA-aware optimizer step; barrier=True is passed because the
            # plain DataLoader is iterated directly (see torch_xla docs).
            xm.optimizer_step(optimizer, barrier=True)
            scheduler.step()
            # The optimizer owns every model parameter, so one reset suffices.
            optimizer.zero_grad()

        avg_val_loss, score = _evaluate(model, loss_fun)

        lr = scheduler.get_last_lr()[0]
        print(f'[Epoch {epoch + 1}] lr={lr:.1e} loss={avg_loss:.4f} val_loss={avg_val_loss:.4f} val_auc={score:.4f}')

# Predict
1. 將超過 0.5 定為有惡意的評論
2. 輸入模型、切割好 batch 的 dataloader、跟原來的 dataframe 算出預測的答案

In [None]:
def threshold(x):
    """Map a probability to a hard label: 1 when ``x >= 0.5``, otherwise 0."""
    return 1 if x >= 0.5 else 0


def predict(model, dataloader, df, isAccuracy=True):
    """Run ``model`` over ``dataloader`` and return probabilities, classes and accuracy.

    Args:
        model: Module called as ``model(tokens, segments, masks)`` and
            returning one logit per example.
        dataloader: Loader yielding ``(tokens, segments, masks, label)``
            batches; it must iterate in the same row order as ``df``.
        df: DataFrame the loader was built from; its ``toxic`` column is read
            only when ``isAccuracy`` is true.
        isAccuracy: Whether to compute accuracy against ``df["toxic"]``.

    Returns:
        ``(preds, predicts, accuracy)``: 1-D array of sigmoid probabilities,
        1-D array of 0/1 classes using the 0.5 cut-off, and the accuracy
        (``None`` when ``isAccuracy`` is false).
    """
    model.eval().to(device)
    batches = []

    with torch.no_grad():
        for data in tqdm(dataloader):
            # The label (4th element) is not needed for inference, so it is
            # not transferred to the device.
            tokens_tensor, segments_tensor, masks_tensor = [k.to(device) for k in data[:3]]
            logits = model(tokens_tensor, segments_tensor, masks_tensor)
            batches.append(torch.sigmoid(logits).cpu().numpy())

    # Concatenate once instead of re-copying the growing array every batch.
    # float64 keeps the output dtype independent of the model's precision.
    if batches:
        preds = np.concatenate(batches).astype(np.float64)
    else:
        preds = np.empty((0, 1))
    # One probability per row is expected; this reshape fails otherwise.
    preds = preds.reshape(len(preds))

    # Vectorised equivalent of applying threshold() to every probability.
    predicts = (preds >= 0.5).astype(int)

    accuracy = None
    if isAccuracy:
        accuracy = (df["toxic"].values == predicts).sum() / len(df)

    return preds, predicts, accuracy

# Bert 在訓練之前的成績
- 分別是查看 train、各個語言的 valid 在訓練之前的成果
- 從這裡我們可以看出，不管是 train 還是 valid 的各種語言在沒有訓練之前的 auc 跟 accuracy 的表現都是非常不好的

In [None]:
# Baseline metrics of the model before fine-tuning.
# AUC is computed from the predicted probabilities (`pre`), not the
# thresholded classes, so that it reflects the ranking of the examples and is
# computed the same way as in the post-training evaluation cell.
pre, pre_class, accuracy = predict(model, trainloader, train)
auc = roc_auc_score(train["toxic"].values, pre)
print("Train: ")
print(f"Model before fine-tune accuracy: {accuracy * 100:.3f}%\nModel before fine-tune AUC: {auc:.3f}")

for key, value in validloaders.items():
    # Same per-language slice (with a reset index) the loader was built from.
    valid_lang = valid[valid["lang"] == lang[key]].reset_index(drop=True)
    pre, pre_class, accuracy = predict(model, value, valid_lang)
    auc = roc_auc_score(valid_lang["toxic"].values, pre)
    print(f"{key} Valid: ")
    print(f"Model before fine-tune accuracy: {accuracy * 100:.2f}%\nModel before fine-tune AUC: {auc:.3f}")

pre, pre_class, accuracy = predict(model, validloader, valid)
auc = roc_auc_score(valid["toxic"].values, pre)
print(f"Combined Valid: ")
print(f"Model before fine-tune accuracy: {accuracy * 100:.2f}%\nModel before fine-tune AUC: {auc:.3f}")

# Train
1. 訓練階段只訓練了一個 epoch，因為之前嘗試使用更多 epoch 時，loss 不斷的上升
2. 能在 5 分鐘以內完成 20000 筆資料的訓練 (曾嘗試使用較大量的資料，但 performance 沒有太大的進步)

In [None]:
%%time 

# Fine-tune with the default warm-up proportion; the %%time cell magic above reports the elapsed time.
training(model)

# Bert 在訓練之後的成績
1. 經過參數的微調讓 Bert 更符合現在這個任務
2. 結果可以看出進步非常地顯著，AUC 跟 accuracy 在所有的資料集上幾乎都有達到 0.8 以上
3. 透過 fine-tune 能讓 transfer learning 產生最大的效果


In [None]:
# Metrics of the fine-tuned model on the training data, each validation
# language, and the combined validation set. AUC uses the predicted
# probabilities (`pre`); accuracy uses the 0.5-thresholded classes.
pre, pre_class, accuracy = predict(model, trainloader, train)
auc = roc_auc_score(train["toxic"].values, pre)
print("Train: ")
print(f"Model after fine-tune accuracy: {accuracy * 100:.3f}%\nModel after fine-tune AUC: {auc:.3f}")

for key, value in validloaders.items():
    # Same per-language slice (with a reset index) the loader was built from.
    valid_lang = valid[valid["lang"] == lang[key]].reset_index(drop=True)
    pre, pre_class, accuracy = predict(model, value, valid_lang)
    auc = roc_auc_score(valid_lang["toxic"].values, pre)
    print(f"{key} Valid: ")
    print(f"Model after fine-tune accuracy: {accuracy * 100:.2f}%\nModel after fine-tune AUC: {auc:.3f}")

pre, pre_class, accuracy = predict(model, validloader, valid)
auc = roc_auc_score(valid["toxic"].values, pre)
print(f"Combined Valid: ")
print(f"Model after fine-tune accuracy: {accuracy * 100:.2f}%\nModel after fine-tune AUC: {auc:.3f}")

將 model 的權重儲存起來

In [None]:
# The model lives on the XLA device; xm.save transfers the tensors to CPU
# before serialising, so the checkpoint does not contain XLA tensors.
xm.save(model.state_dict(), "./model.bin")

# 預測測試資料並輸出

In [None]:
# Score the test split (no labels there, so accuracy is skipped) and write
# the predicted probabilities into the submission template.
pre, pre_class, accuracy = predict(model, testloader, test, isAccuracy=False)
submit = submit.assign(toxic=pre)
submit.to_csv('submission.csv', index=False)
submit.head()

# 視覺化 Attention
- 從這個視覺化工具中我們可以看出，Bert 的某些層能知道『他 - 阿明』之間的關係，或是『給 - 阿明』這個動作的關係
- 不同的 head 能監控不一樣的特徵

In [None]:
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
# Make the cloned bertviz checkout importable; the guard avoids duplicate entries on re-runs.
if 'bertviz_repo' not in sys.path:
    sys.path.append('bertviz_repo')

from transformers import BertTokenizer, BertModel
from bertviz import head_view

def call_html():
    """Display an HTML snippet that loads require.js and configures the `base`, `d3` and `jquery` paths."""
    requirejs_setup = '''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.8/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''
    # `display` is the notebook-provided output function.
    display(IPython.core.display.HTML(requirejs_setup))


In [None]:
model_version = 'bert-base-chinese'
# Bound to its own name so the fine-tuned classifier held in `model` is not
# overwritten; re-running the earlier prediction cells keeps working.
viz_model = BertModel.from_pretrained(model_version, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained(model_version)

sentence_a = "阿明去買東西，"
sentence_b = "回來的時候要給他錢。"

# Encode the two sentences as one pair so token_type_ids separates them.
inputs = tokenizer.encode_plus(sentence_a, sentence_b, return_tensors='pt', add_special_tokens=True)
token_type_ids = inputs['token_type_ids']
input_ids = inputs['input_ids']
# Inference only, so gradient tracking is disabled. With
# output_attentions=True the attention weights are the last output element.
with torch.no_grad():
    attention = viz_model(input_ids, token_type_ids=token_type_ids)[-1]
input_id_list = input_ids[0].tolist() # Batch index 0
tokens = tokenizer.convert_ids_to_tokens(input_id_list)
call_html()

head_view(attention, tokens)