In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
import torchtext
import nltk
from konlpy.tag import Mecab
from torchtext.data import Field, BucketIterator, TabularDataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [2]:
USE_CUDA = torch.cuda.is_available()
DEVICE = 0 if USE_CUDA else -1

## Goal

Sentiment Classification: Classify Good/Bad from movie reviews

* Task: Many to One
* Use: Attention(to know which words is important to classify the good/bad movie) + LSTM

A STRUCTURED SELF-ATTENTIVE
SENTENCE EMBEDDING: https://arxiv.org/pdf/1703.03130.pdf

## Get Datas

In [3]:
# df_train = pd.read_csv('../data/ratings_train.txt', sep='\t', usecols=[1, 2])
# df_test = pd.read_csv('../data/ratings_test.txt', sep='\t', usecols=[1, 2])
# df_train.to_csv('../data/train_docs.txt', sep='\t', index=False, header=False)
# df_test.to_csv('../data/test_docs.txt', sep='\t', index=False, header=False)

Train_loader

In [4]:
batch_size = 64

In [5]:
tagger = Mecab()
tagger = tagger.morphs

In [6]:
REVIEW = Field(tokenize=tagger, use_vocab=True, lower=True, #init_token="<s>", eos_token="</s>", 
               include_lengths=True, batch_first=True)
LABEL = Field(sequential=False, use_vocab=False, preprocessing=lambda x: int(x))

In [7]:
train_data, test_data = TabularDataset.splits(
                   path="../data/", train='train_docs.txt', validation="test_docs.txt",
                   format='tsv', fields=[('review', REVIEW), ('label', LABEL)])

In [8]:
print(len(train_data), len(test_data))

150000 50000


In [9]:
# Build Vocaburary
REVIEW.build_vocab(train_data)
len(REVIEW.vocab)

53078

In [10]:
# make iterator for splits
train_iter, test_iter = BucketIterator.splits(
    (train_data, test_data), batch_size=batch_size, device=DEVICE, # device -1 : cpu, device 0 : 남는 gpu
    sort_key=lambda x: len(x.review), sort_within_batch=True, repeat=False) # x.TEXT 길이 기준으로 정렬

## Model

In [11]:
class bidirec_LSTM(nn.Module):
    def __init__(self, V, D, H, H_f, O, da, r, num_layers=3, bidirec=False, use_cuda=False):
        """
        V: input_size = vocab_size
        D: embedding_size
        H: hidden_size
        H_f: hidden_size (fully-connected)
        O: output_size (fully-connected)
        da: attenion_dimension (hyperparameter)
        r: keywords (different parts to be extracted from the sentence)
        """
        super(bidirec_LSTM, self).__init__()
        self.r = r
        self.da = da
        self.hidden_size = H
        self.num_layers = num_layers
        self.USE_CUDA = use_cuda
        if bidirec:
            self.num_directions = 2
        else:
            self.num_directions = 1
        
        self.embed = nn.Embedding(V, D)
        self.lstm = nn.LSTM(D, H, num_layers, batch_first=True, bidirectional=bidirec)
        self.attn = nn.Linear(self.num_directions*H, self.da, bias=False)
        self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()
        self.attn2 = nn.Linear(self.da, self.r, bias=False)
        self.attn_dist = nn.Softmax(dim=2)
        
        self.fc = nn.Sequential(
            nn.Linear(r*H*self.num_directions, H_f),
            nn.ReLU(),
            nn.Linear(H_f, O),
        )
            
    def init_LSTM(self, batch_size):
        # (num_layers * num_directions, batch_size, hidden_size)
        hidden = Variable(torch.zeros(self.num_layers*self.num_directions, batch_size, self.hidden_size))
        cell = Variable(torch.zeros(self.num_layers*self.num_directions, batch_size, self.hidden_size))
        if self.USE_CUDA:
            hidden = hidden.cuda()
            cell = cell.cuda()
        return hidden, cell
    
    def penalization_term(self, A):
        """
        A : B, r, T
        Frobenius Norm 
        """
        eye = Variable(torch.eye(A.size(1)).expand(A.size(0), self.r, self.r)) # B, r, r
        if self.USE_CUDA:
            eye = eye.cuda()
        P = torch.bmm(A, A.transpose(1, 2)) - eye # B, r, r
        loss_P = ((P**2).sum(1).sum(1) + 1e-10) ** 0.5
        loss_P = torch.sum(loss_P) / A.size(0)
        return loss_P
        
    def forward(self, inputs, inputs_lengths):
        """
        inputs: B, T, V
         - B: batch_size
         - T: max_len = seq_len
         - V: vocab_size
        inputs_lengths: length of each sentences
        """
        embed = self.embed(inputs)  # B, T, V  --> B, T, D
        hidden, cell = self.init_LSTM(inputs.size(0))  # num_layers * num_directions, B, H
        
        # 패딩된 문장을 패킹(패딩은 연산 안들어가도록)
        packed = pack_padded_sequence(embed, inputs_lengths.tolist(), batch_first=True)
        # packed: B * T, D
        output, (hidden, cell) = self.lstm(packed, (hidden, cell))
        # output: B * T, 2H
        # hidden, cell: num_layers * num_directions, B, H
        
        # 패킹된 문장을 다시 unpack
        output, output_lengths = pad_packed_sequence(output, batch_first=True) 
        # output: B, T, 2H

        # Self Attention
        a1 = self.attn(output)  # Ws1(B, da, 2H) * output(B, T, 2H) -> B, T, da
        tanh_a1 = self.tanh(a1)  # B, T, da
        score = self.attn2(tanh_a1)  # Ws2(B, r, da) * tanh_a1(B, T, da) -> B, T, r
        self.A = self.attn_dist(score.transpose(1, 2))  # B, r, T
        self.M = self.A.bmm(output)  # B, r, T * B, T, 2H -> B, r, 2H 
        
        # Penalization Term
        loss_P = self.penalization_term(self.A)
        
        output = self.fc(self.M.view(self.M.size(0), -1)) # B, r, 2H -> resize to B, r*2H -> B, H_f -> Relu -> B, 1
        
        return output, loss_P
    
    def predict(self, inputs, inputs_lengths):
        preds, _ = self.forward(inputs, inputs_lengths)
        return F.sigmoid(preds).ge(0.5).long().view(-1)

In [12]:
V = len(REVIEW.vocab)
D = 100
H = 200
H_f = 1000
O = 1
da = 300
r = 5
num_layers = 3
num_directions = 2
bidirec = True
batch_size = 64
# weight_decay_rate = 0.0001
LR = 0.01
STEP = 15

In [13]:
model = bidirec_LSTM(V, D, H, H_f, O, da, r, num_layers=num_layers, bidirec=bidirec, use_cuda=USE_CUDA)
if USE_CUDA:
    model = model.cuda()

In [14]:
loss_function = nn.BCEWithLogitsLoss()
# optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=weight_decay_rate)
optimizer = optim.Adam(model.parameters(), lr=LR)
scheduler = optim.lr_scheduler.MultiStepLR(gamma=0.1, milestones=[2, 7, 12], optimizer=optimizer)

### TRAIN

In [15]:
model.train()
for step in range(STEP):
    losses=[]
    scheduler.step()
    for i, batch in enumerate(train_iter):
        inputs, lengths = batch.review
        targets = batch.label
        # prevent length = 0
        if 0 in lengths:
            idxes = torch.arange(inputs.size(0))
            if USE_CUDA:
                idxes = idxes.cuda()
            mask = Variable(idxes[lengths.ne(0)].long())

            inputs = inputs.index_select(0, mask)
            lengths = lengths.masked_select(lengths.ne(0))
            targets = targets.index_select(0, mask)
        
        model.zero_grad()
        
        preds, loss_P = model(inputs, lengths)
        
        loss = loss_function(preds.view(-1), targets.float()) + loss_P
        losses.append(loss.data[0])
        
        loss.backward()
        optimizer.step()
    
    string = '[{}/{}] loss: {:.4f}'.format(step+1, STEP, np.mean(losses))
    print(string)
    losses = []

  """


[1/15] loss: 0.9496
[2/15] loss: 0.8626
[3/15] loss: 0.8415
[4/15] loss: 0.8402
[5/15] loss: 0.8390
[6/15] loss: 0.7450
[7/15] loss: 0.4578
[8/15] loss: 0.3275
[9/15] loss: 0.2908
[10/15] loss: 0.2629
[11/15] loss: 0.2356
[12/15] loss: 0.2132
[13/15] loss: 0.1974
[14/15] loss: 0.1939
[15/15] loss: 0.1914


In [16]:
torch.save(model.state_dict(), '../model/self_attn_3H_r5.model')

In [17]:
# model = bidirec_LSTM(V, D, H, H_f, O, da, r, num_layers=num_layers, bidirec=bidirec, use_cuda=USE_CUDA)
# if USE_CUDA:
#     model = model.cuda()
# model.load_state_dict(torch.load('../model/self_attn_3H_r5.model'))

### TEST

In [18]:
num_equal=0
for i, batch in enumerate(test_iter):
    inputs, lengths = batch.review
    targets = batch.label
    if 0 in lengths:
        idxes = torch.arange(inputs.size(0))
        if USE_CUDA:
            idxes = idxes.cuda()
        mask = Variable(idxes[lengths.ne(0)].long())

        inputs = inputs.index_select(0, mask)
        lengths = lengths.masked_select(lengths.ne(0))
        targets = targets.index_select(0, mask)
    preds = model.predict(inputs, lengths)
    num_equal += torch.eq(preds, targets).sum().data[0]

print("Accuracy : " , num_equal / len(test_data))

Accuracy :  0.85096


  


## Reference) Self Attention

Suppose we have a sentence, which has n tokens, represented in a sequence of word embeddings.
$$S = (w_1, w_2, \cdots, w_n)\qquad\qquad (1)$$
Here $w_i$, is a vector standing for a $d$ dimentional word embedding for the $i$-th word in the sentence.
$S$ is thus a sequence represented as a 2-D matrix, which concatenates all the word embeddings
together. $S$ should have the shape $n$-by-$d$.

Now each entry in the sequence $S$ are independent with each other. To gain some dependency between
adjacent words within a single sentence, we use a bidirectional LSTM to process the sentence:
$$\begin{aligned}
\overrightarrow{h_t} &= \overrightarrow{LSTM}(w_t, \overrightarrow{h_{t-1}})\qquad\qquad (2) \\
\overleftarrow{h_t} &= \overleftarrow{LSTM}(w_t, \overleftarrow{h_{t-1}})\qquad\qquad (3)
\end{aligned}$$

And we concatenate each $\overrightarrow{h_t}$ with $\overleftarrow{h_t}$ to obtain a hidden state $h_t$. Let the hidden unit number for each unidirectional LSTM be $u$. For simplicity, we note all the $n$ $h_t$s as $H$, who have the size $n$-by-2$u$.
$$H = (h_1, h_2, \cdots, h_n)$$

Our aim is to encode a variable length sentence into a fixed size embedding. We achieve that by
choosing a linear combination of the n LSTM hidden vectors in $H$. Computing the linear combination
requires the self-attention mechanism. The attention mechanism takes the whole LSTM hidden
states $H$ as input, and outputs a vector of weights $a$:
$$a = softmax(w_{s2} \tanh (W_{s1}H^T))$$

Here $W_{s1}$ is a weight matrix with a shape of $d_a$-by-2$u$. and $w_{s2}$ is a vector of parameters with
size $d_a$, where $d_a$ is a hyperparameter we can set arbitrarily. Since $H$ is sized $n$-by-2$u$, the annotation vector a will have a size $n$. the $softmax()$ ensures all the computed weights sum up to 1.
Then we sum up the LSTM hidden states $H$ according to the weight provided by a to get a vector representation m of the input sentence.

This vector representation usually focuses on a specific component of the sentence, like a special set of related words or phrases. So it is expected to reflect an aspect, or component of the semantics in a sentence. However, there can be multiple components in a sentence that together forms the overall semantics of the whole sentence, especially for long sentences. (For example, two clauses linked together by an ”and.”) Thus, to represent the overall semantics of the sentence, we need multiple $m$’s that focus on different parts of the sentence. Thus we need to perform multiple hops of attention.

Say we want $r$ different parts to be extracted from the sentence, with regard to this, we extend the
$w_{s2}$ into a $r$-by-$d_a$ matrix, note it as $W_{s2}$, and the resulting annotation vector a becomes annotation matrix $A$. Formallly,
$$A=softmax(W_{s2}tanh(W_{s1}H^T))$$

Here the $softmax()$ is performed along the second dimension of its input. We can deem Equation 6 as a 2-layer MLP without bias, whose hidden unit numbers is $d_a$, and parameters are $\{W_{s2}, W_{s1}\}$.
The embedding vector $m$ then becomes an $r$-by-2$u$ embedding matrix $M$. We compute the $r$
weighted sums by multiplying the annotation matrix $A$ and LSTM hidden states $H$, the resulting
matrix is the sentence embedding:
$$M=AH$$