# Document Classification

## 2.1 Data Preprocessing

1.How do you choose the tokenizer for this task? Could we use the white space to tokenize
the text? What about using the complicated tokenizer instead? Make some discussion.

In [1]:
import spacy
import csv
from nltk.tokenize import wordpunct_tokenize
from gensim.utils import tokenize
from torchtext.data import get_tokenizer

# OPEN CSV FILE
with open("train.csv") as csvfile:
    """ LOAD CSV FILE """
    row_ex = list(csv.reader(csvfile))

print("Original Eample Sentence:")
print(row_ex[15][3])
print()

print("NLTK EXAMPLE:")
print(wordpunct_tokenize(row_ex[15][3]))
print()

print("spaCy EXAMPLE:")
token_list = []
nlp = spacy.load("en_core_web_sm")
spacy_doc = nlp(row_ex[15][3])
for token in spacy_doc:
    token_list.append(token.text)
print(token_list)
print()

print("Gensim EXAMPLE:")
print(list(tokenize(row_ex[15][3])))
print()

print("TorchText EXAMPLE:")
tokenizer = get_tokenizer("basic_english")
print(tokenizer(row_ex[1][2]))

Original Eample Sentence:
"We make these sick people known worldwide for their horrifying acts, let’s stop that."

NLTK EXAMPLE:
['"', 'We', 'make', 'these', 'sick', 'people', 'known', 'worldwide', 'for', 'their', 'horrifying', 'acts', ',', 'let', '’', 's', 'stop', 'that', '."']

spaCy EXAMPLE:
['"', 'We', 'make', 'these', 'sick', 'people', 'known', 'worldwide', 'for', 'their', 'horrifying', 'acts', ',', 'let', '’s', 'stop', 'that', '.', '"']

Gensim EXAMPLE:
['We', 'make', 'these', 'sick', 'people', 'known', 'worldwide', 'for', 'their', 'horrifying', 'acts', 'let', 's', 'stop', 'that']

TorchText EXAMPLE:
['jets', 'chairman', 'christopher', 'johnson', 'won', "'", 't', 'fine', 'players', 'for', 'anthem', 'protests']


上面列舉出了各種tokenizer的切割方式，個人的話我比較喜歡spacy的切法，但實際去用到transformer後似乎torchtext的效果會最好，
可能比較適合機器的訓練與判讀，因此最後是使用torchtext作為tokenizer。使用空白做為切割也是一種方法，但不一定適用於所有的dataset，還是要看dataset的樣式來選擇。至於使用較複雜的切法則可能因為切得太細導致本來該在一起的單字被切割成了兩個而下降了訓練的品質。

2.Why we need the special tokens like ⟨pad⟩, ⟨unk⟩?

We need to insert additional "special tokens" such as ⟨PAD⟩, ⟨UNK⟩, ⟨STA⟩, ⟨END⟩ in some cases, which is not in the input text but have important meaning that we want the model to act on.

⟨PAD⟩ Which is Padding token : Added to the end of shorter inputs so that all inputs have the same length. This is because inputs to a neural network model are typically batched and the model operates on entire batches.

⟨UNK⟩ Which is Unknown token : Used to limit the number of distinct tokens.

3.Briefly explain how your procedure is run to handle the text data.

一開始先將資料寫入並進行tokenizer的切割，接著將資料包進dataloader並送進我們所架的transformer中進行model training，當train到一定的準確率後在將test資料送進model中預測出最相似的category並讀出csv檔。
Setting 裡面較為重要的部分是我所選擇的tokenizer種類，作業中最後使用了torchtext，以及使用了glove做為embedding pretrain也提升了許多的準確率。

## 2.2 Transformer

In [2]:
!wget http://nlp.stanford.edu/data/wordvecs/glove.6B.zip
!unzip glove.6B.zip

'wget' ���O�����Υ~���R�O�B�i���檺�{���Χ妸�ɡC
'unzip' ���O�����Υ~���R�O�B�i���檺�{���Χ妸�ɡC


In [None]:
import time
import math
import torch
import pandas as pd
import csv
import numpy as np
import spacy
from spacy.lang.en import English
from torch import nn
from torchtext.data.utils import get_tokenizer
import torchtext.vocab as vocab
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from einops import rearrange
import torch.nn.functional as F

# OPEN CSV FILE
with open("train.csv") as csvfile:
    """ LOAD CSV FILE """
    row_train = list(csv.reader(csvfile))

with open("test.csv") as csvfile:
    """ LOAD CSV FILE """
    row_test = list(csv.reader(csvfile))

tokenizer = get_tokenizer("basic_english")
nlp = spacy.load("en_core_web_sm")
glove = vocab.GloVe(name = "6B", dim = 300, cache = '.vector_cache/glove')
embedding_index = {}
count = 0
glove_data = open("glove.6B.300d.txt", encoding = "utf8")
for line in glove_data:
    embedding_index[line.split()[0]] = count
    count += 1
glove_data.close()

# def text_pipeline(x):
    # return [embedding_index[str(token.text.lower())] if (str(token.text.lower()) in embedding_index) else 2 for token in nlp(x)]

def text_pipeline(x):
    return [embedding_index[str(token.lower())] if (str(token.lower()) in embedding_index) else glove.stoi["unk"] for token in tokenizer(x)]

def label_pipeline(x):
    return int(x) - 1

train_data = []
for i in range(len(row_train) - 1):
    train_data.append((row_train[i + 1][1], text_pipeline(row_train[i + 1][2] + "." + row_train[i + 1][3])))

test_data = []
for i in range(len(row_test) - 1):
    test_data.append((row_test[i + 1][1] + "." + row_test[i + 1][2]))

def yield_tokens(data):
    for _, text in data:
        token_list = []
        spacy_doc = nlp(text)
        for token in spacy_doc:
            token_list.append(token.text)
        yield token_list

"""Run on GPU"""
device = "cuda" if torch.cuda.is_available() else "cpu"

max_len = 60

def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(_text, dtype = torch.int64)
        pad = nn.ConstantPad1d((0, max_len - len(processed_text)), glove.stoi["."])   
        processed_text = pad(processed_text)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype = torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim = 0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

class Swish(nn.Module):
    def __init__(self):
        super(Swish, self).__init__()

    def forward(self, x):
        return x * F.sigmoid(x)

class MultiHeadAttention(nn.Module):
    def __init__(self):
        super().__init__()
        self.k = nn.Linear(200, 200).to(device)
        self.q = nn.Linear(200, 200).to(device)
        self.v = nn.Linear(200, 200).to(device)
        self.projection = nn.Linear(200, 200).to(device)
        
    def forward(self, x):
        q = rearrange(self.q(x), "b n (h d) -> b h n d", h=4)
        k = rearrange(self.k(x), "b n (h d) -> b h n d", h=4)
        v  = rearrange(self.v(x), "b n (h d) -> b h n d", h=4)
        energy = torch.einsum('bhqd, bhkd -> bhqk', q, k)
        scaling = 200 ** (1/2)
        att = F.softmax(energy, dim=-1) / scaling
        out = torch.einsum('bhal, bhlv -> bhav ', att, v)
        out = rearrange(out, "b h n d -> b n (h d)")
        out = self.projection(out)
        return out
    
def generate_mask(len):
    return torch.triu(torch.ones(len, len) * float("-inf"), diagonal = 1).to(device)

class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model, class_num, dropout = 0.2, nhead = 2, d_hid = 200, nlayers = 2):
        super(Transformer, self).__init__()
        self.d_model = d_model
        self.embedding = nn.EmbeddingBag(vocab_size, d_model, sparse = False) 
        self.tanh = nn.Tanh()
        self.swish = Swish()
        self.dropout = nn.Dropout(p = dropout)
        self.positional_encoder = PositionalEncoding(d_model) 
        self.num_encoder_layers = nn.TransformerEncoderLayer(d_model, nhead, d_hid, dropout) 
        self.transformer_encoder = nn.TransformerEncoder(self.num_encoder_layers, nlayers) 
        self.decoder = nn.Linear(d_model, class_num) 
        self.init_weights() 

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.copy_(glove.vectors)
        self.decoder.weight.data.uniform_(-initrange, initrange)
        # self.decoder.bias.data.normal_(mean = 0, std = 0.5)
        self.decoder.bias.data.zero_()

    def forward(self, text, offsets):
        # embedded = self.embedding(text, offsets)
        # embedded = self.tanh(embedded)
        # embedded = self.dropout(embedded)
        out = self.embedding(text, offsets) * math.sqrt(self.d_model)
        mask = generate_mask(out.shape[0])
        out = self.positional_encoder(out)  
        # out = self.swish(out)
        # out = self.dropout(out)
        out = self.transformer_encoder(out, mask)
        # out = self.transformer_encoder(out) 
        # out = self.swish(out)
        # out = self.dropout(out)
        # return self.decoder(embedded) 
        return self.decoder(out)  
    
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, rate = 0.3):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(num_heads = num_heads)
        self.ffn = nn.Sequential(
            [nn.Dense(ff_dim, activation = "tanh"), nn.Dense(embed_dim),]
        )
        self.layernorm1 = nn.LayerNormalization(epsilon = 1e-6)
        self.layernorm2 = nn.LayerNormalization(epsilon = 1e-6)
        self.dropout1 = nn.Dropout(rate)
        self.dropout2 = nn.Dropout(rate)

    def forward(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training = training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training = training)
        return self.layernorm2(out1 + ffn_output)
    
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout = 0.2, max_len = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p = dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term) # (5000, d_model)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x += self.pe[:x.size(0)]  # (64, d_model)
        return self.dropout(x)


def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log = 1
    start_time = time.time()
    count = 0

    for index, (label, text, offsets) in enumerate(dataloader):
        count = count + 1
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        
        if index % log == 0 and index > 0:
            elapsed = time.time() - start_time
            print("| epoch: {:3d} | batch:{:4d}/{:4d}"
                "| accuracy: {:5.3f}".format(epoch + 1, index, len(dataloader), total_acc / total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0
    total_loss = 0

    with torch.no_grad():
        for _, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            total_loss += criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc / total_count, total_loss

"""MODEL SETTING"""
class_num = 4
nhead = 5  # number of heads in nn.MultiheadAttention
d_model = 300 # embedding dimension
d_hid = 512  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 1  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
dropout = 0.3
model = Transformer(len(glove), d_model, class_num, dropout = dropout, nhead = nhead, d_hid = d_hid, nlayers = nlayers).to(device)
epochs = 600
lr = 10
batch_size = 40
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr = lr)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma = 0.99)
# lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs)
total_accu = None
num_train = int(len(train_data) * 0.999)
split_train_, split_valid_ = \
    random_split(train_data, [num_train, len(train_data) - num_train])
"""MODEL SETTING"""

train_dataloader = DataLoader(split_train_, batch_size = batch_size,
                            shuffle = True, collate_fn = collate_batch)
# train_dataloader = DataLoader(train_data, batch_size = batch_size,
#                             shuffle = True, collate_fn = collate_batch)
val_dataloader = DataLoader(split_valid_, batch_size = batch_size,
                            shuffle = True, collate_fn = collate_batch)

def predict(text):
    with torch.no_grad():
        text = torch.tensor(text_pipeline(text)).to(device)
        out = model(text, torch.tensor([0]).to(device))
        return out.argmax(1).item() + 1

predict_class = []
id_list = list(range(1, 401))

for epoch in range(epochs):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val, val_loss = evaluate(val_dataloader)

    if epoch + 1 == epochs:
        for i in range(len(test_data)):
            predict_class.append(predict(test_data[i])) 
        col1 = "id"
        col2 = "category"
        data = pd.DataFrame({col1:id_list, col2:predict_class})
        data.to_csv("submission.csv", index = False)

    if total_accu is not None and total_accu > accu_val:
        lr_scheduler.step()
    else:
        total_accu = accu_val
    # lr_scheduler.step()

    print("=" * 100)
    print("| end of epoch: {:3d} | time: {:5.2f}s | "
        "valid accuracy: {:8.3f}".format(epoch + 1, time.time() - epoch_start_time, accu_val))
    print("val loss: %f" %(val_loss))
    print("LR: %f" %lr_scheduler.get_last_lr()[0])
    print("=" * 100)

## Discuss the model structure or hyperparameter setting in your design. (5%) 

Model structure: Input -> Input embedding -> positional embedding -> nlayers times encoder -> decoder -> Linear -> categry。
Parameter choose: d model = 300, nhead = 5, nlayers = 2, d_hid = 300, dropout = 0.3,batch_size = 40, epoch =600, learning rate = 10，經過測試dropout稍微提高可以減少部分overfitting的狀況，但若調太高則會train的不好，d_model的維度則是配合glove因此都選擇300維度，nlayers若是加上太多層會導致模型深度太深，會有train不起來的問題，因此適當的選擇2來用，最後我的transformer用128或256的batch size小果會比40和64的差，故推測小的batch size會更好。