In [1]:
import numpy as np 
import pandas as pd 
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.data import Field, BucketIterator, TabularDataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import os
# print(os.listdir("../input"))
USE_CUDA = torch.cuda.is_available()
DEVICE = 0 if USE_CUDA else -1

The sentiment labels are:

* 0 : negative
* 1 : somewhat negative
* 2 : neutral
* 3 : somewhat positive
* 4 : positive

In [2]:
train = pd.read_csv('../data/SentimentAnalysis/train.tsv', sep='\t')
test = pd.read_csv('../data/SentimentAnalysis/test.tsv', sep='\t')
train_idx = int(len(train.drop_duplicates('SentenceId')) * 0.9)
print('train_sentence_idx: {}'.format(train_idx))

train_sentence_idx: 7676


In [3]:
split_idx = train.loc[train.SentenceId == train_idx].index[-1] + 1
valid = train.iloc[split_idx:, :]
train = train.iloc[:split_idx, :]
print('train_data: {}, valid_data: {}'.format(len(train), len(valid)))

train_data: 141485, valid_data: 14575


In [4]:
def write_files(path, data, test=False):
    with open(path, 'w', encoding='utf-8') as file:
        if test:
            for phrase in data:
                print(phrase, file=file)
        else:
            for phrase, sent in data:
                print('\t'.join([phrase, str(sent)]), file=file)

In [5]:
write_files('./train_data.txt', train.iloc[:, 2:].values)
write_files('./valid_data.txt', valid.iloc[:, 2:].values)
write_files('./test_data.txt', test.iloc[:, -1].values, test=True)

In [6]:
PHRASE = Field(tokenize=str.split, use_vocab=True, lower=True, include_lengths=True,
               batch_first=True)
SENT = Field(sequential=False, use_vocab=False, preprocessing=lambda x: int(x))

In [7]:
train_data, valid_data = TabularDataset.splits(
       path='./', train='train_data.txt', validation="valid_data.txt", 
       format='tsv', fields=[('phrase', PHRASE), ('sent', SENT)])
test_data = TabularDataset.splits(
       path='./', test='test_data.txt', format='tsv', fields=[('phrase', PHRASE)])

In [8]:
PHRASE.build_vocab(train_data)
print('number of vocabulary: {}'.format(len(PHRASE.vocab)))

number of vocabulary: 15648


In [9]:
BATCH = 64

In [10]:
train_loader, valid_loader, test_loader = BucketIterator.splits(
    (train_data, valid_data, test_data[0]), batch_size=BATCH, device=DEVICE,
    sort_key=lambda x: len(x.phrase), sort_within_batch=True, repeat=False)

In [30]:
class bidirec_GRU(nn.Module):
    def __init__(self, V, D, H, H_f, O, da, r, num_layers=3, bidirec=False, use_cuda=False):
        """
        V: input_size = vocab_size
        D: embedding_size
        H: hidden_size
        H_f: hidden_size (fully-connected)
        O: output_size (fully-connected)
        da: attenion_dimension (hyperparameter)
        r: keywords (different parts to be extracted from the sentence)
        """
        super(bidirec_GRU, self).__init__()
        self.r = r
        self.da = da
        self.hidden_size = H
        self.num_layers = num_layers
        self.USE_CUDA = use_cuda
        self.num_directions = 2 if bidirec else 1
        
        self.embed = nn.Embedding(V, D)
        self.gru = nn.GRU(D, H, num_layers, batch_first=True, bidirectional=bidirec)
        self.attn = nn.Linear(self.num_directions*H, self.da, bias=False)
        self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()
        self.attn2 = nn.Linear(self.da, self.r, bias=False)
        self.attn_dist = nn.Softmax(dim=2)
        
        self.fc = nn.Sequential(
            nn.Linear(r*H*self.num_directions, H_f),
            nn.ReLU(),
            nn.Linear(H_f, O)
        )
            
    def init_GRU(self, batch_size):
        # (num_layers * num_directions, batch_size, hidden_size)
        hidden = torch.zeros(self.num_layers*self.num_directions, batch_size, self.hidden_size)
        if self.USE_CUDA:
            hidden = hidden.cuda()
        return hidden
    
    def penalization_term(self, A):
        """
        A : B, r, T
        Frobenius Norm 
        """
        eye = torch.eye(A.size(1)).expand(A.size(0), self.r, self.r) # B, r, r
        if self.USE_CUDA:
            eye = eye.cuda()
        P = torch.bmm(A, A.transpose(1, 2)) - eye # B, r, r
        loss_P = ((P**2).sum(1).sum(1) + 1e-10) ** 0.5
        loss_P = torch.sum(loss_P) / A.size(0)
        return loss_P
        
    def forward(self, inputs, inputs_lengths):
        """
        inputs: B, T, V
         - B: batch_size
         - T: max_len = seq_len
         - V: vocab_size
        inputs_lengths: length of each sentences
        """
        embed = self.embed(inputs)  # B, T, V  --> B, T, D
        hidden = self.init_GRU(inputs.size(0))  # num_layers * num_directions, B, H
        # pack sentences
        packed = pack_padded_sequence(embed, inputs_lengths.tolist(), batch_first=True)
        # packed: B * real_length, D
        output, hidden = self.gru(packed, hidden)
        # output: B * T, 2H
        # hidden: num_layers * num_directions, B, H
        
        # unpack sentences
        output, output_lengths = pad_packed_sequence(output, batch_first=True) 
        # output: B, T, 2H

        # Self Attention
        a1 = self.attn(output)  # Ws1(B, da, 2H) * output(B, T, 2H) -> B, T, da
        tanh_a1 = self.tanh(a1)  # B, T, da
        score = self.attn2(tanh_a1)  # Ws2(B, r, da) * tanh_a1(B, T, da) -> B, T, r
        self.A = self.attn_dist(score.transpose(1, 2))  # B, r, T
        self.M = self.A.bmm(output)  # B, r, T * B, T, 2H -> B, r, 2H 
        
        # Penalization Term
        loss_P = self.penalization_term(self.A)
        
        output = self.fc(self.M.view(self.M.size(0), -1)) # B, r, 2H -> resize to B, r*2H -> B, H_f -> Relu -> B, 1
        
        return output, loss_P
    
    def predict(self, inputs, inputs_lengths):
        preds, _ = self.forward(inputs, inputs_lengths)
        _, idx = F.softmax(preds, dim=1).max(1)
        return idx

In [51]:
V = len(PHRASE.vocab)
D = 100
H = 300
H_f = 1000
O = 5
DA = 300
R = 10
N_LAYERS = 1
bidirec = True
weight_decay_rate = 0.0001
LR = 0.01
STEP = 25

In [52]:
model = bidirec_GRU(V, D, H, H_f, O, DA, R, 
                    num_layers=N_LAYERS, bidirec=bidirec, use_cuda=USE_CUDA)
if USE_CUDA:
    model = model.cuda()

In [53]:
loss_function = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=LR)
scheduler = optim.lr_scheduler.MultiStepLR(gamma=0.1, milestones=[5, 15, 20], optimizer=optimizer)

In [54]:
valid_losses = [10e5]
model.train()
for step in range(STEP):
    losses=[]
    scheduler.step()
    # train
    for batch in train_loader:
        inputs, lengths = batch.phrase
        targets = batch.sent
        if 0 in lengths:
            inputs = inputs[lengths.ne(0)]
            targets = targets[lengths.ne(0)]
            lengths = lengths[lengths.ne(0)]
        
        model.zero_grad()
        
        preds, loss_P = model(inputs, lengths)
        
        loss = loss_function(preds, targets) + loss_P
        losses.append(loss.item())
        
        loss.backward()
        optimizer.step()
    
    # valid
    model.eval()
    valid_loss = []
    for batch in valid_loader:
        inputs, lengths = batch.phrase
        targets = batch.sent
        if 0 in lengths:
            inputs = inputs[lengths.ne(0)]
            targets = targets[lengths.ne(0)]
            lengths = lengths[lengths.ne(0)]
        
        preds, loss_P = model(inputs, lengths)
        v_loss = loss_function(preds, targets) + loss_P
        valid_loss.append(v_loss.item())
        
    valid_losses.append(np.mean(valid_loss))
    
    if valid_losses[-2] - valid_losses[-1] < 0:
        torch.save(model.state_dict(), './model/model({}_{:.4f})'.format(step, np.mean(valid_loss)))
        
    string = '[{}/{}] train_loss: {:.4f}, valid_loss: {:.4f}, lr: {:.4f}'.format(
        step+1, STEP, np.mean(losses), np.mean(valid_loss), scheduler.get_lr()[0])
    print(string)
    losses = []
    valid_loss = []
    model.train()

  return Variable(arr, volatile=not train), lengths
  return Variable(arr, volatile=not train)


[1/30] train_loss: 4.5219, valid_loss: 4.0295, lr: 0.0100
[2/30] train_loss: 5.9502, valid_loss: 4.7035, lr: 0.0100
[3/30] train_loss: 5.4246, valid_loss: 4.7960, lr: 0.0100
[4/30] train_loss: 8.5222, valid_loss: 7.9284, lr: 0.0100
[5/30] train_loss: 10.8120, valid_loss: 7.4146, lr: 0.0100
[6/30] train_loss: 9.5773, valid_loss: 6.1754, lr: 0.0100
[7/30] train_loss: 10.5724, valid_loss: 6.8577, lr: 0.0100
[8/30] train_loss: 10.3974, valid_loss: 6.8107, lr: 0.0100
[9/30] train_loss: 10.2966, valid_loss: 6.4475, lr: 0.0100
[10/30] train_loss: 9.9651, valid_loss: 6.8658, lr: 0.0100


KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(), './model/model({}_{:.4f})'.format(step, np.mean(valid_loss)))