# Self-Attention Text Classification

---
## Data Check

- NSMC 데이터
    - Naver sentiment movie corpus v1.0
    - 네이버 영화 댓글 감정분석 데이터셋
    - 이진 분류
- https://github.com/e9t/nsmc

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('data/ratings_train.txt', sep='\t')
df.head(2)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1


In [2]:
df['label'].value_counts()

label
0    75173
1    74827
Name: count, dtype: int64

In [3]:
df.isna().sum()

id          0
document    5
label       0
dtype: int64

In [4]:
df = df.dropna(how='any', axis=0)
df.shape

(149995, 3)

---
## Tokenization

In [5]:
vocab = set()
for doc in df['document']:
    for token in doc.split():
        vocab.add(token)

In [6]:
len(vocab)

357862

In [7]:
vocab_cnt_dict = {}
for doc in df['document']:
    for token in doc.split():
        if token not in vocab_cnt_dict:
            vocab_cnt_dict[token] = 0
        vocab_cnt_dict[token] += 1

In [8]:
vocab_cnt_list = [(token, cnt) for token, cnt in vocab_cnt_dict.items()]

In [9]:
top_vocabs = sorted(vocab_cnt_list, key= lambda tup: tup[1], reverse=True)

In [10]:
cnts = [cnt for _, cnt in top_vocabs]

In [11]:
n_vocab = sum(np.array(cnts)>=3)
top_vocabs = top_vocabs[:n_vocab]

In [12]:
vocabs = [token for token, _ in top_vocabs]
vocabs[:5]

['영화', '너무', '정말', '진짜', '이']

---
## UNK, PAD Token

In [13]:
vocabs.insert(0, '[UNK]')
vocabs.insert(0, '[PAD]')

In [14]:
idx_to_token = vocabs
token_to_idx = {token: i for i, token in enumerate(idx_to_token)}

In [15]:
class Tokenizer:
    
    def __init__(self, vocabs, use_padding=True, max_padding=64, pad_token='[PAD]', unk_token='[UNK]'):
        self.idx_to_token = vocabs
        self.token_to_idx = {token: i for i, token in enumerate(self.idx_to_token)}
        
        self.use_padding = use_padding
        self.max_padding = max_padding
        
        self.pad_token = pad_token
        self.unk_token = unk_token
        
        self.unk_token_idx = self.token_to_idx[self.unk_token]
        self.pad_token_idx = self.token_to_idx[self.pad_token]
        
    def __call__(self, x:str):
        token_ids = []
        token_list = x.split()
        
        for token in token_list:
            if token in self.token_to_idx:
                token_idx = self.token_to_idx[token]
            else:
                token_idx = self.unk_token_idx
            token_ids.append(token_idx)
        
        if self.use_padding:
            token_ids = token_ids[:self.max_padding]
            n_pads = self.max_padding - len(token_ids)
            token_ids = token_ids + [self.pad_token_idx] * n_pads
        
        return token_ids

In [16]:
tokenizer = Tokenizer(vocabs, use_padding=True, max_padding=50, pad_token='[PAD]', unk_token='[UNK]')

---
## DataLoader

In [17]:
import torch
from torch.utils.data import Dataset, DataLoader

In [18]:
train_valid_df = pd.read_csv('data/ratings_train.txt', sep='\t')
test_df = pd.read_csv('data/ratings_test.txt', sep='\t')

In [19]:
len(train_valid_df), len(test_df)

(150000, 50000)

In [20]:
train_valid_df = train_valid_df.sample(frac=1.)

In [21]:
train_ratio = 0.8
n_train = int(len(train_valid_df) * train_ratio)

train_df = train_valid_df[:n_train]
valid_df = train_valid_df[n_train:]

In [22]:
len(train_df), len(valid_df), len(test_df)

(120000, 30000, 50000)

In [23]:
# 1/10 샘플링
train_df = train_df.sample(frac=0.1)
valid_df = valid_df.sample(frac=0.1)
test_df = test_df.sample(frac=0.1)

In [24]:
class NSMCDataset(Dataset):
    
    def __init__(self, data_df, tokenizer=None):
        self.data_df = data_df
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.data_df)
    
    def __getitem__(self, idx):
        sample_raw = self.data_df.iloc[idx]
        sample = {}
        
        sample['doc'] = str(sample_raw['document'])
        sample['label'] = int(sample_raw['label'])
        
        assert sample['label'] in set([0, 1])
        
        if self.tokenizer is not None:
            sample['doc_ids'] = self.tokenizer(sample['doc']) 
        
        return sample        

In [25]:
def collate_fn(batch):
    keys = [key for key in batch[0].keys()]
    data = {key: [] for key in keys}

    for item in batch:
        for key in keys:
            data[key].append(item[key])
            
    return data

In [26]:
train_dataset = NSMCDataset(
    data_df=train_df,
    tokenizer=tokenizer
)
valid_dataset = NSMCDataset(
    data_df=valid_df,
    tokenizer=tokenizer
)
test_dataset = NSMCDataset(
    data_df=test_df,
    tokenizer=tokenizer
)

train_dataloader = DataLoader(train_dataset,
                              batch_size=64,
                              collate_fn=collate_fn,
                              shuffle=True)
valid_dataloader = DataLoader(valid_dataset,
                              batch_size=64,
                              collate_fn=collate_fn,
                              shuffle=False)
test_dataloader = DataLoader(test_dataset,
                             batch_size=64,
                             collate_fn=collate_fn,
                             shuffle=False)

In [27]:
sample = next(iter(test_dataloader))

In [28]:
sample.keys()

dict_keys(['doc', 'label', 'doc_ids'])

In [29]:
sample['doc'][2]

'벌써 세번째 감상중'

In [30]:
sample['doc_ids'][2][:10]

[1014, 9074, 1, 0, 0, 0, 0, 0, 0, 0]

---
## Self-Attention Modeling

In [31]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


class SelfAttention(nn.Module):

    def __init__(self,
                 vocab_size,
                 embed_dim):

        super().__init__()
        
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.output_dim = embed_dim

        self.embeddings = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        self.q_linear = nn.Linear(embed_dim, embed_dim)
        self.k_linear = nn.Linear(embed_dim, embed_dim)
        self.v_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, X, return_attention_score=False):

        batch_size, seq_len = X.size()
        X = self.embeddings(X)
        
        q, k, v = self.q_linear(X), self.k_linear(X), self.v_linear(X)
        
        attention_score_raw = q @ k.transpose(-2,-1) / math.sqrt(self.embed_dim)
        
        attention_score = torch.softmax(attention_score_raw, dim=2)
        
        weighted_sum = attention_score @ v

        context = torch.mean(weighted_sum, dim=1)
        
        if return_attention_score:
            return context, attention_score
        
        return context

In [32]:
class Classifier(nn.Module):

    def __init__(self, sr_model, output_dim, vocab_size, embed_dim, **kwargs):

        super().__init__()

        self.sr_model = sr_model(vocab_size=vocab_size,
                                 embed_dim=embed_dim,
                                 **kwargs)

        self.input_dim = self.sr_model.output_dim
        self.output_dim = output_dim
        self.fc = nn.Linear(self.input_dim, self.output_dim)

    def forward(self, x):
        return self.fc(self.sr_model(x))

In [33]:
model = Classifier(sr_model=SelfAttention,
                   output_dim=2,
                   vocab_size=len(vocabs),
                   embed_dim=16)

In [34]:
model.sr_model.embeddings.weight[0]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       grad_fn=<SelectBackward0>)

---
## Training

In [35]:
use_cuda = True and torch.cuda.is_available()

if use_cuda:
    model.cuda()

In [37]:
import torch.optim as optim
import numpy as np
from copy import deepcopy

In [38]:
optimizer = optim.Adam(params=model.parameters(), lr=0.01)
calc_loss = nn.CrossEntropyLoss()

In [39]:
n_epoch = 5
global_i = 0

valid_loss_hist = []
train_loss_hist = []

min_valid_loss = 9e+9
best_model = None
best_epoch_i = None

for epoch_i in range(n_epoch):
    model.train()
    
    for batch in train_dataloader:
        optimizer.zero_grad()
        x = torch.tensor(batch['doc_ids'])
        y = torch.tensor(batch['label'])
        
        if use_cuda:
            x = x.cuda()
            y = y.cuda()
            
        y_pred = model(x)
        loss = calc_loss(y_pred, y)
        
        if global_i % 1000 == 0:
            print(f'global_i: {global_i}, epoch_i: {epoch_i}, loss: {loss.item()}')
        
        train_loss_hist.append((global_i, loss.item()))
        
        loss.backward()
        optimizer.step()
        
        global_i += 1
        
    # validation
    model.eval()
    valid_loss_list = []
    
    for batch in valid_dataloader:
        x = torch.tensor(batch['doc_ids'])
        y = torch.tensor(batch['label'])
        
        if use_cuda:
            x = x.cuda()
            y = y.cuda()
            
        y_pred = model(x)
        loss = calc_loss(y_pred, y)
        valid_loss_list.append(loss.item())
        
    valid_loss_mean = np.mean(valid_loss_list)
    valid_loss_hist.append((global_i, valid_loss_mean.item()))
    
    if valid_loss_mean < min_valid_loss:
        min_valid_loss = valid_loss_mean
        best_epoch_i = epoch_i
        best_model = deepcopy(model)
        
    if epoch_i % 1 == 0:
        print('*'*30)
        print(f'valid_loss_mean: {valid_loss_mean}')
        print('*'*30)

print(f'best_epoch_i: {best_epoch_i}, best_global_i: {global_i}')    

global_i: 0, epoch_i: 0, loss: 0.7187510132789612
******************************
valid_loss_mean: 0.6035824697068397
******************************
******************************
valid_loss_mean: 0.5936969138206319
******************************
******************************
valid_loss_mean: 0.6519526744142492
******************************
******************************
valid_loss_mean: 0.7940343428165355
******************************
******************************
valid_loss_mean: 0.9876124662287692
******************************
best_epoch_i: 1, best_global_i: 940
