In [1]:
import torch
import torch.nn as nn
from data_utils import build_tokenizer, build_embedding_matrix, SentenceDataset,Tokenizer, Vocab
from torch.utils.data import DataLoader
from sklearn import metrics
import os
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F


In [2]:
# Select the compute device once; later cells move models and batches to it with .to(device).
# torch has no 'gpu' device type -- the CUDA backend is addressed as 'cuda'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [4]:
# Laptop review XML files: index 0 is the training split, index 1 the test split.
data_files = ['data/datasets/Laptops_Train.xml', 'data/datasets/Laptops_Test.xml']
# data_files = ['data/datasets/Restaurants_Train.xml', 'data/datasets/Restaurants_Test.xml']

# Files handed to the data_utils builders via data_file=.
# NOTE(review): presumably on-disk caches that the builders reuse -- confirm in data_utils.
tokenizer_path = 'data/datasets/{0}_tokenizer.dat'.format('laptops')
embedding_path = 'data/datasets/{0}d_{1}_embedding_matrix.dat'.format('200', 'laptops')

# The tokenizer is built over both splits, with sequences limited to max_length=80.
tokenizer = build_tokenizer(fnames=data_files, max_length=80, data_file=tokenizer_path)
# 200-dimensional embedding matrix for the tokenizer's vocabulary.
embedding_matrix = build_embedding_matrix(vocab=tokenizer.vocab, embed_dim=200, data_file=embedding_path)

train_file, test_file = data_files
trainset = SentenceDataset(train_file, tokenizer, target_dim=3)
testset = SentenceDataset(test_file, tokenizer, target_dim=3)

End
loading word vectors...
data/datasets/glove.twitter.27B.200d.txt


In [6]:
import pickle

In [7]:
# Check that the tokenizer file unpickles. The context manager closes the file
# handle, which a bare open() inside pickle.load() would leave open.
# NOTE(review): pickle can execute arbitrary code while loading; only load files this project wrote.
with open('data/datasets/laptops_tokenizer.dat', 'rb') as tokenizer_file:
    cached_tokenizer = pickle.load(tokenizer_file)
cached_tokenizer

<data_utils.Tokenizer at 0x7f0d9084aa60>

#### Parameters that need to be set before running this model

In [None]:
# Run settings for the plain LSTM model. The model classes, train() and evaluate()
# read these names as globals, so this cell must run before any of them is used.
epoch = 1  # appears unused in the visible cells; train() iterates with its own loop variable
lr = 0.001  # Adam learning rate
l2_reg = 1e-5  # passed to Adam as weight_decay
num_epoch = 20  # passes over the training data
input_cols = ['text']  # batch keys fed to the model, in order
log_step = 5  # evaluate and log every log_step optimizer steps
model_name = 'lstm'
# Label embedded in checkpoint file names. The data cell loads the Laptops files,
# so the label is 'laptops' (it previously read 'restaurant').
dataset = 'laptops'
batch_size = 64
embed_dim = 200  # width of the pretrained word vectors
hidden_dim = 200  # recurrent hidden size
polarities_dim = 3  # number of output classes
# Class id -> readable name.
# NOTE(review): presumably matches the label encoding produced by SentenceDataset -- confirm.
polarity_dict = {0: 'positive', 1: 'negative', 2:'neutral'}

In [None]:
# Batch both splits with the configured batch size. Only the training split is
# shuffled; the test split keeps its order.
train_dataloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(testset, batch_size=batch_size, shuffle=False)

In [None]:
def _count_polarities(dataloader):
    """Return {polarity name: number of examples} summed over every batch of `dataloader`.

    Each batch is expected to expose its labels under the 'polarity' key; label ids
    are translated to names through the global `polarity_dict`.
    """
    # Every counter starts at zero. The earlier initial values 0/1/2 were the label
    # ids, which inflated the 'negative' and 'neutral' totals by 1 and 2.
    counts = {name: 0 for name in polarity_dict.values()}
    for batch in dataloader:
        for label in batch['polarity']:
            counts[polarity_dict[int(label)]] += 1
    return counts


polarity_count_train = _count_polarities(train_dataloader)
polarity_count_test = _count_polarities(test_dataloader)
print("Training dataset : " , polarity_count_train)
print("Testing dataset : " , polarity_count_test)

In [None]:
class DynamicLSTM(nn.Module):
    '''
    Recurrent layer (LSTM, GRU or vanilla RNN) which can hold variable length sequences,
    use like TensorFlow's RNN(input, length...).

    Args:
        input_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional:
            forwarded unchanged to the underlying torch.nn recurrent module.
        only_use_last_hidden_state: when True, forward() returns only the final hidden state.
        rnn_type: one of 'LSTM', 'GRU' or 'RNN'.

    Raises:
        ValueError: if rnn_type is not one of the supported names.
    '''
    # Supported rnn_type values and the torch.nn module each one builds.
    _RNN_CLASSES = {'LSTM': nn.LSTM, 'GRU': nn.GRU, 'RNN': nn.RNN}

    def __init__(self, input_size, hidden_size, num_layers=1, bias=True, batch_first=True, dropout=0,
                 bidirectional=False, only_use_last_hidden_state=False, rnn_type='LSTM'):
        super(DynamicLSTM, self).__init__()
        # Fail at construction time; otherwise self.RNN would simply be missing and the
        # error would only surface on the first forward() call.
        if rnn_type not in self._RNN_CLASSES:
            raise ValueError("rnn_type must be one of {0}, got {1!r}".format(
                sorted(self._RNN_CLASSES), rnn_type))
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bias = bias
        self.batch_first = batch_first
        self.dropout = dropout
        self.bidirectional = bidirectional
        self.only_use_last_hidden_state = only_use_last_hidden_state
        self.rnn_type = rnn_type

        self.RNN = self._RNN_CLASSES[rnn_type](
            input_size=input_size, hidden_size=hidden_size, num_layers=num_layers,
            bias=bias, batch_first=batch_first, dropout=dropout, bidirectional=bidirectional)

    def forward(self, x, x_len):
        '''
        sequence -> sort -> pad and pack -> process using RNN -> unpack -> unsort

        Args:
            x: padded input batch; the batch axis is 0 when batch_first is True, else 1.
            x_len: 1-D tensor holding the true length of every sequence in the batch.

        Returns:
            ht alone when only_use_last_hidden_state is True; otherwise (out, (ht, ct)),
            where out is padded to the longest sequence in the batch and ct is None for
            GRU / RNN. Everything is returned in the original batch order.
        '''
        # sort by decreasing length, remembering how to undo the permutation
        order = torch.sort(x_len, descending=True)[1].long()
        restore = torch.sort(order)[1].long()
        sorted_len = x_len[order]
        # Reorder along the batch axis, which is axis 1 for time-major input.
        x = x[order] if self.batch_first else x[:, order]
        # pack: pack_padded_sequence requires tensor lengths to live on the CPU
        packed = torch.nn.utils.rnn.pack_padded_sequence(x, sorted_len.cpu(), batch_first=self.batch_first)
        # process
        if self.rnn_type == 'LSTM':
            out_pack, (ht, ct) = self.RNN(packed, None)
        else:
            out_pack, ht = self.RNN(packed, None)
            ct = None
        # unsort: hidden states carry the batch on axis 1 regardless of batch_first
        ht = ht[:, restore]
        if self.only_use_last_hidden_state:
            return ht
        out, _ = torch.nn.utils.rnn.pad_packed_sequence(out_pack, batch_first=self.batch_first)
        out = out[restore] if self.batch_first else out[:, restore]
        if ct is not None:
            ct = ct[:, restore]
        return out, (ht, ct)


In [None]:
class LSTM(nn.Module):
    ''' Standard LSTM classifier: embed tokens, run an LSTM, project the final hidden state. '''
    def __init__(self, embedding_matrix):
        super(LSTM, self).__init__()
        # from_pretrained keeps the embedding weights frozen unless freeze=False is passed.
        pretrained = torch.tensor(embedding_matrix, dtype=torch.float)
        self.embed = nn.Embedding.from_pretrained(pretrained)
        # Layer sizes come from the notebook-level embed_dim / hidden_dim / polarities_dim.
        self.lstm = DynamicLSTM(embed_dim, hidden_dim, num_layers=1, batch_first=True)
        self.dense = nn.Linear(hidden_dim, polarities_dim)

    def forward(self, inputs):
        ''' Return one row of class scores per sentence; inputs[0] holds the token ids. '''
        token_ids = inputs[0]
        # Token id 0 is treated as padding, so the count of non-zero ids is the length.
        lengths = (token_ids != 0).sum(dim=-1)
        embedded = self.embed(token_ids)
        _, (last_hidden, _) = self.lstm(embedded, lengths)
        # Index 0 selects the first entry along the leading axis of the final hidden state.
        return self.dense(last_hidden[0])

In [None]:
# Build the plain LSTM classifier from the pretrained embedding matrix and move it to `device`.
model = LSTM(embedding_matrix).to(device)

In [None]:
# Loss, optimizer and TensorBoard writer for the plain LSTM model.
# run() reads `criterion` and `optimizer` as globals, so this cell must be the most
# recently executed setup cell before run(model, writer) is called.
criterion = nn.CrossEntropyLoss()
# Only parameters with requires_grad=True are handed to the optimizer.
# `params` is a one-shot filter iterator: it is empty once Adam has consumed it.
params = filter(lambda p: p.requires_grad, model.parameters())
# l2_reg is applied through Adam's weight_decay argument.
optimizer = torch.optim.Adam(params, lr=lr, weight_decay=l2_reg)
# The log directory name encodes the batch size and learning rate of this run.
writer = SummaryWriter(f"runs/LSTM/BatchSize {batch_size} LR {lr}")

In [None]:
def reset_params(model):
    """Re-initialise every trainable parameter of `model` in place.

    Parameters with more than one dimension get Xavier-normal values; the rest are
    drawn uniformly from [-1/sqrt(n), 1/sqrt(n)], where n is the size of their first
    dimension. Parameters with requires_grad=False are left untouched.
    """
    trainable = (param for param in model.parameters() if param.requires_grad)
    for param in trainable:
        if len(param.shape) <= 1:
            bound = 1. / (param.shape[0]**0.5)
            torch.nn.init.uniform_(param, a=-bound, b=bound)
            continue
        torch.nn.init.xavier_normal_(param)

In [None]:
# Report how many scalar weights of `model` are trainable versus frozen.
# Tensor.numel() returns the element count as a plain int, so no temporary tensor is
# built per parameter and both totals are ordinary Python integers.
n_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
n_nontrainable_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)
print('n_trainable_params: {0}, n_nontrainable_params: {1}'.format(n_trainable_params, n_nontrainable_params))

In [None]:
def train(model, criterion, optimizer, writer, max_test_acc_overall=0, model_name='LSTM'):
    """Train `model` for `num_epoch` epochs and return (best test accuracy, best macro-F1).

    Reads the notebook globals num_epoch, train_dataloader, input_cols, device, log_step,
    dataset and polarities_dim, and calls evaluate() every `log_step` optimizer steps.

    Args:
        model: network called as model([tensor per column in input_cols]).
        criterion: loss applied to (outputs, targets).
        optimizer: optimizer already bound to the model's trainable parameters.
        writer: TensorBoard SummaryWriter that receives loss and accuracy scalars.
        max_test_acc_overall: a checkpoint is written only when test accuracy exceeds this.
        model_name: prefix used in the checkpoint file name.

    Returns:
        (max_test_acc, max_f1) observed at the evaluation steps of this call.
    """
    max_test_acc = 0
    max_f1 = 0
    global_step = 0
    for epoch in range(num_epoch):
        print('>' * 50)
        print('epoch:', epoch)
        n_correct, n_total = 0, 0
        for sample_batched in train_dataloader:
            global_step += 1
            # evaluate() leaves the model in eval mode, so switch back on every step
            model.train()
            optimizer.zero_grad()

            inputs = [sample_batched[col].to(device) for col in input_cols]
            outputs = model(inputs)
            targets = sample_batched['polarity'].to(device)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Log a plain float rather than a tensor that is still attached to the graph.
            writer.add_scalar("Training loss", loss.item(), global_step=global_step)

            # Count every batch, so the reported accuracy is the running accuracy over
            # the whole epoch so far and not only over the batches that land on a log step.
            n_correct += (torch.argmax(outputs, -1) == targets).sum().item()
            n_total += len(outputs)

            if global_step % log_step != 0:
                continue

            train_acc = n_correct / n_total
            writer.add_scalar("Training Accuracy", train_acc, global_step=global_step)
            test_acc, f1 = evaluate(model, writer, global_step)
            if test_acc > max_test_acc:
                max_test_acc = test_acc
                if test_acc > max_test_acc_overall:
                    # exist_ok avoids the check-then-create race of exists() + mkdir()
                    os.makedirs('state_dict', exist_ok=True)
                    path = './state_dict/{0}_{1}_{2}class_acc{3:.4f}'.format(model_name, dataset, polarities_dim, test_acc)
                    torch.save(model.state_dict(), path)
                    print('model saved:', path)
            if f1 > max_f1:
                max_f1 = f1
            print('loss: {:.4f}, acc: {:.4f}, test_acc: {:.4f}, f1: {:.4f}'.format(loss.item(), train_acc, test_acc, f1))
    return max_test_acc, max_f1

In [None]:
def evaluate(model, writer, step):
    """Score `model` on the global test_dataloader and return (accuracy, macro-F1).

    The accuracy is also written to `writer` under "Testing Accuracy" at `step`.
    Reads the notebook globals test_dataloader, input_cols and device.
    """
    # switch model to evaluation mode
    model.eval()
    hits, seen = 0, 0
    target_chunks, output_chunks = [], []
    with torch.no_grad():
        for batch in test_dataloader:
            batch_inputs = [batch[col].to(device) for col in input_cols]
            batch_targets = batch['polarity'].to(device)
            batch_outputs = model(batch_inputs)

            hits += (torch.argmax(batch_outputs, -1) == batch_targets).sum().item()
            seen += len(batch_outputs)

            # keep per-batch results; they are joined once after the loop for the F1 score
            target_chunks.append(batch_targets)
            output_chunks.append(batch_outputs)
    test_acc = hits / seen
    writer.add_scalar("Testing Accuracy", test_acc, global_step=step)
    all_targets = torch.cat(target_chunks, dim=0)
    all_predictions = torch.argmax(torch.cat(output_chunks, dim=0), -1)
    # macro average: unweighted mean of the per-class F1 over labels 0, 1 and 2
    f1 = metrics.f1_score(all_targets.cpu(), all_predictions.cpu(), labels=[0, 1, 2], average='macro')
    return test_acc, f1

In [None]:
def run(model, writer):
    """Re-initialise `model`, train it `repeats` times and print the best scores.

    Uses the notebook globals `criterion`, `optimizer` and `model_name`; the optimizer
    must have been built for this same `model` by the preceding setup cell.

    Args:
        model: network to train.
        writer: TensorBoard SummaryWriter passed through to train().
    """
    max_test_acc_overall = 0
    max_f1_overall = 0
    repeats = 1
    for i in range(repeats):
        print('repeat:', i)
        reset_params(model)
        # Forward the configured model_name; without it every checkpoint is saved under
        # train()'s default 'LSTM' prefix, whichever model is actually being trained.
        max_test_acc, max_f1 = train(model, criterion, optimizer, writer, max_test_acc_overall,
                                     model_name=model_name)
        print('max_test_acc: {0}, max_f1: {1}'.format(max_test_acc, max_f1))
        max_test_acc_overall = max(max_test_acc, max_test_acc_overall)
        max_f1_overall = max(max_f1, max_f1_overall)
        print('#' * 50)
    print('max_test_acc_overall:', max_test_acc_overall)
    print('max_f1_overall:', max_f1_overall)

In [None]:
# Train and evaluate the plain LSTM model, logging to its TensorBoard writer.
run(model, writer)

### Run the latest saved model 

In [None]:
# Pick the most recently modified file in state_dict/ and load it into `model`.
checkpoint_paths = [os.path.join('state_dict', name) for name in os.listdir('state_dict')]
latest_file = max(checkpoint_paths, key=os.path.getmtime)
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU machine still loads on a CPU-only one.
checkpoints = torch.load(latest_file, map_location=device)
model.load_state_dict(checkpoints)
model

In [None]:
# Classify one hand-written sentence with the plain LSTM model.
sample_data = torch.tensor(tokenizer.text_to_sequence("Keyboard is great, very quiet for all the typing that I do."))
# The model receives a list with one (batch, sequence) tensor per input column, just as
# in training; the tensor has to live on the same device as the model's weights.
model.eval()
with torch.no_grad():  # inference only, no gradients needed
    output = model([sample_data.reshape(1, -1).to(device)])
polarity_dict[int(torch.argmax(output, -1))]

#### Parameters that need to be set before running this model

In [None]:
# Run settings for the aspect-embedding LSTM. The model classes, train() and evaluate()
# read these names as globals, so this cell must run before any of them is used.
epoch = 1  # appears unused in the visible cells; train() iterates with its own loop variable
lr = 0.001  # Adam learning rate
l2_reg = 1e-5  # passed to Adam as weight_decay
num_epoch = 20  # passes over the training data
input_cols = ['text', 'aspect']  # batch keys fed to the model, in order
log_step = 5  # evaluate and log every log_step optimizer steps
model_name = 'ae_lstm'
# Label embedded in checkpoint file names. The data cell loads the Laptops files,
# so the label is 'laptops' (it previously read 'restaurant').
dataset = 'laptops'
batch_size = 64
embed_dim = 200  # width of the pretrained word vectors
hidden_dim = 200  # recurrent hidden size
polarities_dim = 3  # number of output classes
# Class id -> readable name.
# NOTE(review): presumably matches the label encoding produced by SentenceDataset -- confirm.
polarity_dict = {0: 'positive', 1: 'negative', 2:'neutral'}

In [None]:
class SqueezeEmbedding(nn.Module):
    '''
    Squeeze sequence embedding length to the longest one in the batch.

    Positions beyond a sequence's own length come back as zeros, because the batch is
    packed and then padded again.
    '''
    def __init__(self, batch_first=True):
        super(SqueezeEmbedding, self).__init__()
        self.batch_first = batch_first

    def forward(self, x, x_len):
        '''
        sequence -> sort -> pad and pack -> unpack -> unsort

        Args:
            x: padded batch; the batch axis is 0 when batch_first is True, else 1.
            x_len: 1-D tensor holding the true length of every sequence in the batch.

        Returns:
            x cut down along the time axis to max(x_len), in the original batch order.
        '''
        # sort by decreasing length, remembering how to undo the permutation
        order = torch.sort(x_len, descending=True)[1].long()
        restore = torch.sort(order)[1].long()
        sorted_len = x_len[order]
        # Reorder along the batch axis, which is axis 1 for time-major input.
        x = x[order] if self.batch_first else x[:, order]
        # pack: pack_padded_sequence requires tensor lengths to live on the CPU
        packed = torch.nn.utils.rnn.pack_padded_sequence(x, sorted_len.cpu(), batch_first=self.batch_first)
        # unpack: pads only up to the longest sequence in this batch
        out, _ = torch.nn.utils.rnn.pad_packed_sequence(packed, batch_first=self.batch_first)
        # unsort
        return out[restore] if self.batch_first else out[:, restore]

In [None]:
class AE_LSTM(nn.Module):
    ''' LSTM with Aspect Embedding: the averaged aspect vector is attached to every sentence token. '''
    def __init__(self, embedding_matrix):
        super(AE_LSTM, self).__init__()
        # from_pretrained keeps the embedding weights frozen unless freeze=False is passed.
        pretrained = torch.tensor(embedding_matrix, dtype=torch.float)
        self.embed = nn.Embedding.from_pretrained(pretrained)
        self.squeeze_embedding = SqueezeEmbedding()
        # The LSTM input is twice as wide: aspect vector and word vector side by side.
        self.lstm = DynamicLSTM(embed_dim*2, hidden_dim, num_layers=1, batch_first=True)
        self.dense = nn.Linear(hidden_dim, polarities_dim)

    def forward(self, inputs):
        ''' Return one row of class scores per example; inputs = [sentence ids, aspect ids]. '''
        sentence_ids, aspect_ids = inputs[0], inputs[1]
        # Token id 0 is treated as padding when counting lengths.
        sentence_len = (sentence_ids != 0).sum(dim=-1)
        longest = torch.max(sentence_len)
        aspect_len = (aspect_ids != 0).sum(dim=-1).float()

        # Sentence embeddings trimmed to the longest sentence in the batch.
        sentence_emb = self.squeeze_embedding(self.embed(sentence_ids), sentence_len)

        # Sum of the aspect token embeddings divided by the number of non-padding tokens.
        aspect_sum = torch.sum(self.embed(aspect_ids), dim=1)
        aspect_mean = aspect_sum / aspect_len.view(aspect_len.size(0), 1)
        # Repeat that vector along the time axis so it lines up with the sentence tokens.
        aspect_tiled = aspect_mean.unsqueeze(1).expand(-1, longest, -1)

        lstm_input = torch.cat((aspect_tiled, sentence_emb), dim=-1)
        _, (last_hidden, _) = self.lstm(lstm_input, sentence_len)
        return self.dense(last_hidden[0])

In [None]:
# Build the aspect-embedding LSTM from the pretrained embedding matrix and move it to `device`.
model_AE = AE_LSTM(embedding_matrix).to(device)

In [None]:
# Display the module structure of the AE-LSTM model.
model_AE

In [None]:
# Loss, optimizer and TensorBoard writer for the AE-LSTM model.
# run() reads `criterion` and `optimizer` as globals, so this cell rebinds them to
# model_AE and must be executed before run(model_AE, writer_AE).
criterion = nn.CrossEntropyLoss()
# Only parameters with requires_grad=True are handed to the optimizer.
# `params` is a one-shot filter iterator: it is empty once Adam has consumed it.
params = filter(lambda p: p.requires_grad, model_AE.parameters())
# l2_reg is applied through Adam's weight_decay argument.
optimizer = torch.optim.Adam(params, lr=lr, weight_decay=l2_reg)
# The log directory name encodes the batch size and learning rate of this run.
writer_AE = SummaryWriter(f"runs/AE_LSTM/BatchSize {batch_size} LR {lr}")

In [None]:
# Train and evaluate the AE-LSTM model, logging to its TensorBoard writer.
run(model_AE , writer_AE)

In [None]:
# Classify one hand-written sentence / aspect pair with the AE-LSTM model.
sample_data = torch.tensor(tokenizer.text_to_sequence("MS Office 2011 for Mac is wonderful, well worth it.")).reshape(1,-1)
# Reshape the tensor, not the tokenizer output: the closing parenthesis used to sit after
# .reshape(...), so the two inputs were built in different ways.
sample_aspect = torch.tensor(tokenizer.text_to_sequence('MS Office 2011 for Mac')).reshape(1,-1)
# Inputs have to live on the same device as the model's weights.
data = [sample_data.to(device), sample_aspect.to(device)]
model_AE.eval()
with torch.no_grad():  # inference only, no gradients needed
    output = model_AE(data)
polarity_dict[int(torch.argmax(output, -1))]

In [None]:
# Number of rows in PBAN's position-embedding table (one row per position id).
# NOTE(review): equals the max_length=80 passed to build_tokenizer -- presumably the two must stay in sync; confirm.
max_length = 80
# Width of each position-embedding vector in PBAN.
position_dim = 100

In [None]:
# Echo the configured value as a quick check.
max_length

In [None]:
class PBAN(nn.Module):
    ''' Position-aware bidirectional attention network '''
    def __init__(self, embedding_matrix):
        super(PBAN, self).__init__()
        # from_pretrained keeps the word-embedding weights frozen unless freeze=False is passed.
        pretrained = torch.tensor(embedding_matrix, dtype=torch.float)
        self.text_embed = nn.Embedding.from_pretrained(pretrained)
        self.pos_embed = nn.Embedding(max_length, position_dim)
        # left_gru encodes the aspect term; right_gru encodes the sentence with position vectors attached.
        self.left_gru = DynamicLSTM(embed_dim, hidden_dim, num_layers=1,
                                    batch_first=True, bidirectional=True, rnn_type='GRU')
        self.right_gru = DynamicLSTM(embed_dim+position_dim, hidden_dim, num_layers=1,
                                     batch_first=True, bidirectional=True, rnn_type='GRU')
        # Attention weights are allocated uninitialised here; run() calls reset_params(),
        # which fills every trainable parameter before training starts.
        attention_dim = hidden_dim*2
        self.weight_m = nn.Parameter(torch.Tensor(attention_dim, attention_dim))
        self.bias_m = nn.Parameter(torch.Tensor(1))
        self.weight_n = nn.Parameter(torch.Tensor(attention_dim, attention_dim))
        self.bias_n = nn.Parameter(torch.Tensor(1))
        self.w_r = nn.Linear(attention_dim, hidden_dim)
        self.w_s = nn.Linear(hidden_dim, polarities_dim)

    def forward(self, inputs):
        ''' Return one row of class scores per example; inputs = [sentence ids, aspect ids, position tags]. '''
        text, aspect_text, position_tag = inputs[0], inputs[1], inputs[2]

        # Sentence representation: position vector and word vector side by side, then the GRU.
        x_len = (text != 0).sum(dim=-1)
        sentence = torch.cat((self.pos_embed(position_tag), self.text_embed(text)), dim=-1)
        h_x, _ = self.right_gru(sentence, x_len)

        # Aspect term representation.
        aspect_len = (aspect_text != 0).sum(dim=-1)
        h_t, _ = self.left_gru(self.text_embed(aspect_text), aspect_len)

        # Aspect term to position-aware sentence attention.
        aspect_to_sentence = torch.bmm(torch.matmul(h_t, self.weight_m), h_x.transpose(1, 2))
        alpha = F.softmax(torch.tanh(aspect_to_sentence + self.bias_m), dim=1)
        s_x = torch.bmm(alpha, h_x)

        # Position-aware sentence attention to aspect term, driven by the length-averaged sentence state.
        h_x_mean = torch.sum(h_x, dim=1) / x_len.float().view(x_len.size(0), 1)
        h_x_pool = h_x_mean.unsqueeze(1)
        sentence_to_aspect = torch.bmm(torch.matmul(h_x_pool, self.weight_n), h_t.transpose(1, 2))
        # NOTE(review): h_x_pool has size 1 on axis 1, so these scores do too and a softmax
        # over dim=1 would yield all ones -- confirm whether the last axis was intended.
        gamma = F.softmax(torch.tanh(sentence_to_aspect + self.bias_n), dim=1)
        h_r = torch.squeeze(torch.bmm(gamma, s_x), dim=1)

        # Output transform.
        return self.w_s(torch.tanh(self.w_r(h_r)))

In [None]:
# Build the PBAN model from the pretrained embedding matrix and move it to `device`.
model_PBAN = PBAN(embedding_matrix).to(device)

In [None]:
# Display the module structure of the PBAN model.
model_PBAN

In [None]:
# Run settings for the PBAN model. train() and evaluate() read these names as globals,
# so this cell must run before run(model_PBAN, writer_PBAN).
epoch = 1  # appears unused in the visible cells; train() iterates with its own loop variable
lr = 0.001  # Adam learning rate
l2_reg = 1e-5  # passed to Adam as weight_decay
num_epoch = 20  # passes over the training data
input_cols = ['text', 'aspect', 'position']  # batch keys fed to the model, in order
log_step = 5  # evaluate and log every log_step optimizer steps
model_name = 'pban_lstm'
# Label embedded in checkpoint file names. The data cell loads the Laptops files,
# so the label is 'laptops' (it previously read 'restaurant').
dataset = 'laptops'
batch_size = 64
embed_dim = 200  # width of the pretrained word vectors
hidden_dim = 200  # recurrent hidden size
polarities_dim = 3  # number of output classes
# Class id -> readable name.
# NOTE(review): presumably matches the label encoding produced by SentenceDataset -- confirm.
polarity_dict = {0: 'positive', 1: 'negative', 2:'neutral'}

In [None]:
# Loss, optimizer and TensorBoard writer for the PBAN model.
# run() reads `criterion` and `optimizer` as globals, so this cell rebinds them to
# model_PBAN and must be executed before run(model_PBAN, writer_PBAN).
criterion = nn.CrossEntropyLoss()
# Only parameters with requires_grad=True are handed to the optimizer.
# `params` is a one-shot filter iterator: it is empty once Adam has consumed it.
params = filter(lambda p: p.requires_grad, model_PBAN.parameters())
# l2_reg is applied through Adam's weight_decay argument.
optimizer = torch.optim.Adam(params, lr=lr, weight_decay=l2_reg)
# The log directory name encodes the batch size and learning rate of this run.
writer_PBAN = SummaryWriter(f"runs/PBAN_LSTM/BatchSize {batch_size} LR {lr}")

In [None]:
# Train and evaluate the PBAN model, logging to its TensorBoard writer.
run(model_PBAN , writer_PBAN)