In [1]:
#! /usr/bin/env python
import os
import argparse
import datetime
import re
import numpy as np
import random


In [2]:
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim

In [3]:
from torchtext import data
from visdom import Visdom
viz = Visdom()

In [4]:
from data import tokenizer, data_split, preprocess_dataset, create_batches
from data import SemEval10_task8

In [5]:
class LSTM_Baseline_Model(nn.Module):
    def __init__(self, word_vocab, label_vocab, word_emb_dim, pos_emb_dim, hidden_dim, output_dim, MAX_POS = 15, use_gpu = True):
        super(LSTM_Baseline_Model, self).__init__()
        
        # Set hyper parameters
        self.word_emb_dim = word_emb_dim
        self.pos_emb_dim = pos_emb_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.input_dim = word_emb_dim + pos_emb_dim * 2
        
        self.MAX_POS = MAX_POS
        
        
        # Set options and other parameters
        self.use_gpu = use_gpu
        self.word_vocab = word_vocab
        self.label_vocab = label_vocab
        #self.pos_vocab = pos_vocab
        
        
        # Free parameters for the model
        # Initialize embeddings (Word and Position embeddings) 
        self.word_emb = nn.Embedding(len(self.word_vocab), self.word_emb_dim).cuda()
        
        self.pos1_emb = nn.Embedding(self.MAX_POS*2+1, self.pos_emb_dim).cuda()
        self.pos1_emb.weight.data.uniform_(-0.04, 0.04)
        self.pos2_emb = nn.Embedding(self.MAX_POS*2+1, self.pos_emb_dim).cuda()
        self.pos2_emb.weight.data.uniform_(-0.04, 0.04)
        
        # Initialize LSTM parameters ()
        self.lstm = nn.LSTM(self.input_dim, hidden_dim, bidirectional=True, batch_first = True).cuda()
        
        
        # Initialize Attention parameters ()
        self.attention_hidden = nn.Linear(hidden_dim * 2, hidden_dim,bias=False).cuda()
        self.attention = nn.Linear(hidden_dim, 1, bias=False).cuda()
        
        # Initialize Classifier parameters ()
        self.classifier = nn.Linear(hidden_dim * 2, output_dim).cuda()
        
        
        self.word_emb.weight.data.copy_(word_vocab.vectors)
    def prepare_inout(self, X, y):
        sents, pos1, pos2 = list(zip(*X))
        #sents = list(zip(*X))
        #pos1 = datas['position_indices_1']
        #pos2 = datas['position_indices_2']
        
        labels = y
        
        words = [ [ self.word_vocab.stoi[word] for word in sent] for sent in sents]
        #print(words)
        words_var = Variable(torch.LongTensor(words).cuda())
        #print(words_var)
        word_embeddings = self.word_emb(words_var)
        #print(word_embedings)
        
        pos1 = np.array(pos1).astype('int')
        #print(pos1)
        pos1_var = Variable(torch.LongTensor(pos1).cuda())
        pos1_embeddings = self.pos1_emb(pos1_var)
        #print(pos1_var)
        #print(pos1_embeddings)

        pos2 = np.array(pos2).astype('int')
        #print(pos2)
        pos2_var = Variable(torch.LongTensor(pos2).cuda())
        pos2_embeddings = self.pos2_emb(pos2_var)
        
        inputs = torch.cat((word_embeddings, pos1_embeddings, pos2_embeddings),-1)
        #print(inputs)
        
        labels = [ self.label_vocab.stoi[label] - 1 for label in labels]
        labels_var = Variable(torch.LongTensor(labels).cuda())
        outputs = labels_var
        
        return inputs, outputs
        
        
        
        
    def forward(self, X, is_train = True):
        # LSTM layer
        X = F.dropout(X, p=0.3, training=is_train)
        hiddens, for_output = self.lstm(X)
        #rev_hiddens, rev_output = self.rev_lstm(X)
        hiddens = F.dropout(hiddens, p=0.5, training=is_train)
        
        # Self Attentive layer
        att_hidden = F.tanh(self.attention_hidden(hiddens))
        
        att_scores = self.attention(att_hidden)
        
        attention_distrib = F.softmax(att_scores, dim = 1)
        context_vector = torch.sum(hiddens * attention_distrib, dim = 1)

        # Classifier
        context_vector = F.dropout(context_vector, p=0.5, training=is_train)
        finals = F.softmax(self.classifier(context_vector), dim = 1)

        return finals
    
    def evaluatation(self, input, output, demonstrate_result = True, analyze = False, header=""):
        batch_Xs, batch_ys = create_batches(input, output, 128, shuffle=False)
        #loss = 0
        tp = 0
        for batch_X, batch_Y in zip(batch_Xs, batch_ys):
            X, Y = self.prepare_inout(batch_X, batch_Y)
            preds = relation_extr(X, is_train = False)
            _, preds_Y = torch.max(preds, -1)
            tp += (preds_Y == Y).float().sum().data.cpu().numpy()[0]
            del X,Y
            
        if demonstrate_result:
            #print('Avg loss: ')
            print(header + " accuracy: ", tp/float(len(output)))
            
            #print('Macro F1-score')
            #print('Micro F1-score')
        

In [6]:
train, dev = data_split(SemEval10_task8(sub_path='SemEval2010_task8_training/TRAIN_FILE.TXT'), test_rate = 0.1)
test = SemEval10_task8(sub_path='SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT')

train_input, train_output = preprocess_dataset(train)
train_words = list(zip(*train_input))[0]

dev_input, dev_output = preprocess_dataset(dev)
dev_words = list(zip(*dev_input))[0]

test_input, test_output = preprocess_dataset(test)
test_words = list(zip(*test_input))[0]

In [7]:
TEXT = data.Field(sequential=True,  lower=False)
TEXT.build_vocab(train_words+test_words+dev_words, vectors="glove.840B.300d")
word_vocab = TEXT.vocab

LABEL = data.Field(sequential=False, use_vocab=False)
LABEL.build_vocab(train_output+test_output+dev_output)
label_vocab = LABEL.vocab

In [8]:
epoch_num = 200

learning_rate = 0.0003
#l2_rate = 10e-4

max_batch_size = 16
#max_num_of_sent = 50
word_emb_dim = 300
pos_emb_dim = 15
hidden_dim = 320

print(len(LABEL.vocab.stoi))
relation_extr = LSTM_Baseline_Model(word_vocab, 
                                    label_vocab, 
                                    word_emb_dim = word_emb_dim, 
                                    pos_emb_dim = pos_emb_dim, 
                                    hidden_dim = hidden_dim, 
                                    output_dim = len(LABEL.vocab.stoi)-1)

20


In [None]:
params = list(relation_extr.parameters())
print(list(relation_extr.named_parameters()))
# gradient clip
torch.nn.utils.clip_grad_norm(params, 5.0)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(params, lr = learning_rate, weight_de)

[('word_emb.weight', Parameter containing:
 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
 0.2720 -0.0620 -0.1884  ...   0.1302 -0.1832  0.1323
          ...             ⋱             ...          
 0.0070 -0.0086 -0.6045  ...   0.6283  0.4505  0.0291
-0.4984  0.5471 -0.1409  ...   0.1861 -0.0487  0.4646
 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
[torch.cuda.FloatTensor of size 26399x300 (GPU 0)]
), ('pos1_emb.weight', Parameter containing:
1.00000e-02 *
 -1.5874  1.7058 -0.3536 -2.8543 -2.0106 -3.2879  2.7891  3.1329  2.1524 -1.6334
  3.9104  0.4944 -0.3396  0.2564 -0.4568  0.9153 -1.7000  2.5671  1.7345  0.2495
  3.1816 -0.6301  2.8543 -2.8053  2.5533  2.9146  0.1264  3.1328 -3.4234 -3.6173
 -0.8508 -1.3331  0.8847 -0.6192 -1.4569 -0.8130  0.8859 -2.6316  1.7681 -1.3402
 -1.2913  2.2774 -1.1795 -2.6168  0.0351  3.6502 -0.6231 -2.1177  0.8026 -1.5547
  1.4115 -1.8352 -3.4587 -2.7188 -2.0758  2.7975  1.3593  0.5849

In [None]:

for i in range(epoch_num):
    epoch_loss = 0
    tp = 0
    batch_Xs, batch_ys = create_batches(train_input, train_output, max_batch_size)
    for batch_X, batch_y in zip(batch_Xs, batch_ys):
        X, Y = relation_extr.prepare_inout(batch_X, batch_y)
        
        optimizer.zero_grad()
        preds = relation_extr(X)
        #print(preds)
        _, preds_Y = torch.max(preds, -1)
        #print(preds_Y)
        batch_size = Y.size()[0]
        loss = loss_func(preds, Y)
        batch_loss = loss #* batch_size / max_batch_size
        #batch_loss = loss
        
        batch_loss.backward()
        optimizer.step()
        
        epoch_loss += loss.data.cpu().numpy()[0]
        tp += (preds_Y == Y).float().sum().data.cpu().numpy()[0]
        del X, Y, loss, batch_loss, preds, preds_Y
    print("Train epoch",i,epoch_loss/len(train_output))
    print("Train accuracy",i,tp/float(len(train_output)))
    relation_extr.evaluatation(dev_input, dev_output,header = 'Dev')
    relation_extr.evaluatation(test_input, test_output,header = 'Test')
    #print(params[0])
    
    del epoch_loss, tp
    
    
    
    print()

Train epoch 0 0.19459782507684495
Train accuracy 0 0.19166666666666668
Dev accuracy:  0.21875
Test accuracy:  0.20794994479205006

Train epoch 1 0.1916097491317325
Train accuracy 1 0.23069444444444445
Dev accuracy:  0.23625
Test accuracy:  0.2270887007729113

Train epoch 2 0.18881865683529112
Train accuracy 2 0.27305555555555555
Dev accuracy:  0.29875
Test accuracy:  0.26683842473316155

Train epoch 3 0.1864943109287156
Train accuracy 3 0.3065277777777778
Dev accuracy:  0.3025
Test accuracy:  0.284504968715495

Train epoch 4 0.18516193721029495
Train accuracy 4 0.3275
Dev accuracy:  0.335
Test accuracy:  0.32057416267942584

Train epoch 5 0.18309821393754747
Train accuracy 5 0.36444444444444446
Dev accuracy:  0.3525
Test accuracy:  0.347442031652558

Train epoch 6 0.18095912913481393
Train accuracy 6 0.3951388888888889
Dev accuracy:  0.36375
Test accuracy:  0.36694884063305117

Train epoch 7 0.17717202994558545
Train accuracy 7 0.45625
Dev accuracy:  0.45125
Test accuracy:  0.464850938

Train epoch 64 0.1400161980258094
Train accuracy 64 0.9869444444444444
Dev accuracy:  0.68125
Test accuracy:  0.7026131762973868

Train epoch 65 0.1399384648601214
Train accuracy 65 0.9872222222222222
Dev accuracy:  0.68875
Test accuracy:  0.7059256532940743

Train epoch 66 0.14005291329489813
Train accuracy 66 0.9870833333333333
Dev accuracy:  0.6875
Test accuracy:  0.7011409642988591

Train epoch 67 0.14015547149711186
Train accuracy 67 0.9852777777777778
Dev accuracy:  0.70625
Test accuracy:  0.7051895472948104

Train epoch 68 0.1399716458717982
Train accuracy 68 0.9866666666666667
Dev accuracy:  0.705
Test accuracy:  0.715126978284873

Train epoch 69 0.1398924806714058
Train accuracy 69 0.9879166666666667
Dev accuracy:  0.6725
Test accuracy:  0.7051895472948104

Train epoch 70 0.13994608011510637
Train accuracy 70 0.9877777777777778
Dev accuracy:  0.70625
Test accuracy:  0.7092381302907619

Train epoch 71 0.1398193806078699
Train accuracy 71 0.9888888888888889
Dev accuracy:  0.705


In [None]:
print(len(batch_X))

In [None]:
X = Variable(torch.rand(64,10,320).cuda())

In [None]:
LSTM_Baseline(X)

In [None]:
a = [[0]*10]*20
print(a)
a[0][0] = 1
print(a)