In [1]:
#! /usr/bin/env python
import os
import argparse
import datetime
import re
import numpy as np
import random


In [2]:
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim

In [3]:
from torchtext import data
from visdom import Visdom
viz = Visdom()

In [4]:
from data import tokenizer, data_split, preprocess_dataset, create_batches
from data import SemEval10_task8

In [5]:
class LSTM_Baseline_Model(nn.Module):
    def __init__(self, word_vocab, label_vocab, word_emb_dim, pos_emb_dim, hidden_dim, output_dim, MAX_POS = 15, use_gpu = True):
        super(LSTM_Baseline_Model, self).__init__()
        
        # Set hyper parameters
        self.word_emb_dim = word_emb_dim
        self.pos_emb_dim = pos_emb_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.input_dim = word_emb_dim + pos_emb_dim * 2
        
        self.MAX_POS = MAX_POS
        
        
        # Set options and other parameters
        self.use_gpu = use_gpu
        self.word_vocab = word_vocab
        self.label_vocab = label_vocab
        #self.pos_vocab = pos_vocab
        
        
        # Free parameters for the model
        # Initialize embeddings (Word and Position embeddings) 
        self.word_emb = nn.Embedding(len(self.word_vocab), self.word_emb_dim).cuda()
        
        self.pos1_emb = nn.Embedding(self.MAX_POS*2+1, self.pos_emb_dim).cuda()
        self.pos1_emb.weight.data.uniform_(-0.04, 0.04)
        self.pos2_emb = nn.Embedding(self.MAX_POS*2+1, self.pos_emb_dim).cuda()
        self.pos2_emb.weight.data.uniform_(-0.04, 0.04)
        
        # Initialize LSTM parameters ()
        self.lstm = nn.LSTM(self.input_dim, hidden_dim, bidirectional=True).cuda()
        
        
        # Initialize Attention parameters ()
        self.attention_hidden = nn.Linear(hidden_dim * 2, hidden_dim).cuda()
        self.attention = nn.Linear(hidden_dim, 1).cuda()
        
        # Initialize Classifier parameters ()
        self.classifier = nn.Linear(hidden_dim * 2, output_dim).cuda()
        
        
        self.word_emb.weight.data.copy_(word_vocab.vectors)
    def prepare_inout(self, X, y):
        sents, pos1, pos2 = list(zip(*X))
        #sents = list(zip(*X))
        #pos1 = datas['position_indices_1']
        #pos2 = datas['position_indices_2']
        
        labels = y
        
        words = [ [ self.word_vocab.stoi[word] for word in sent] for sent in sents]
        #print(words)
        words_var = Variable(torch.LongTensor(words).cuda())
        #print(words_var)
        word_embeddings = self.word_emb(words_var)
        #print(word_embedings)
        
        pos1 = np.array(pos1).astype('int')
        #print(pos1)
        pos1_var = Variable(torch.LongTensor(pos1).cuda())
        pos1_embeddings = self.pos1_emb(pos1_var)
        #print(pos1_var)
        #print(pos1_embeddings)

        pos2 = np.array(pos2).astype('int')
        #print(pos2)
        pos2_var = Variable(torch.LongTensor(pos2).cuda())
        pos2_embeddings = self.pos2_emb(pos2_var)
        
        inputs = torch.cat((word_embeddings, pos1_embeddings, pos2_embeddings),-1)
        #print(inputs)
        
        labels = [ self.label_vocab.stoi[label] - 1 for label in labels]
        labels_var = Variable(torch.LongTensor(labels).cuda())
        outputs = labels_var
        
        return inputs, outputs
        
        
        
        
    def forward(self, X, is_train = True):
        # LSTM layer
        X = F.dropout(X, p=0.3, training=is_train)
        hiddens, for_output = self.lstm(X)
        #rev_hiddens, rev_output = self.rev_lstm(X)
        hiddens = F.dropout(hiddens, p=0.3, training=is_train)
        
        # Self Attentive layer
        att_hidden = F.tanh(self.attention_hidden(hiddens))
        
        att_scores = self.attention(att_hidden)
        
        attention_distrib = F.softmax(att_scores, dim = 1)
        context_vector = F.tanh(torch.sum(hiddens * attention_distrib, dim = 1))

        # Classifier
        context_vector = F.dropout(context_vector, p=0.5, training=is_train)
        finals = F.softmax(self.classifier(context_vector), dim = 1)

        return finals
    
    def evaluatation(self, input, output, demonstrate_result = True, analyze = False, header=""):
        batch_Xs, batch_ys = create_batches(input, output, 128, shuffle=False)
        #loss = 0
        tp = 0
        for batch_X, batch_Y in zip(batch_Xs, batch_ys):
            X, Y = self.prepare_inout(batch_X, batch_Y)
            preds = relation_extr(X, is_train = False)
            _, preds_Y = torch.max(preds, -1)
            tp += (preds_Y == Y).float().sum().data.cpu().numpy()[0]
            del X,Y
            
        if demonstrate_result:
            #print('Avg loss: ')
            print(header + " accuracy: ", tp/float(len(output)))
            
            #print('Macro F1-score')
            #print('Micro F1-score')
        

In [6]:
train, dev = data_split(SemEval10_task8(sub_path='SemEval2010_task8_training/TRAIN_FILE.TXT'), test_rate = 0.1)
test = SemEval10_task8(sub_path='SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT')

train_input, train_output = preprocess_dataset(train)
train_words = list(zip(*train_input))[0]

dev_input, dev_output = preprocess_dataset(dev)
dev_words = list(zip(*dev_input))[0]

test_input, test_output = preprocess_dataset(test)
test_words = list(zip(*test_input))[0]

In [7]:
TEXT = data.Field(sequential=True,  lower=False)
TEXT.build_vocab(train_words+test_words+dev_words, vectors="glove.840B.300d")
word_vocab = TEXT.vocab

LABEL = data.Field(sequential=False, use_vocab=False)
LABEL.build_vocab(train_output+test_output+dev_output)
label_vocab = LABEL.vocab

In [8]:
epoch_num = 200

learning_rate = 0.001
#l2_rate = 10e-4

max_batch_size = 1
#max_num_of_sent = 50
word_emb_dim = 300
pos_emb_dim = 5
hidden_dim = 230

print(len(LABEL.vocab.stoi))
relation_extr = LSTM_Baseline_Model(word_vocab, 
                                    label_vocab, 
                                    word_emb_dim = word_emb_dim, 
                                    pos_emb_dim = pos_emb_dim, 
                                    hidden_dim = hidden_dim, 
                                    output_dim = len(LABEL.vocab.stoi)-1)

20


In [None]:
params = list(relation_extr.parameters())
#print(params)
# gradient clip
torch.nn.utils.clip_grad_norm(params, 5.0)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(params, lr = learning_rate)

In [None]:

for i in range(epoch_num):
    epoch_loss = 0
    tp = 0
    batch_Xs, batch_ys = create_batches(train_input, train_output, max_batch_size)
    for batch_X, batch_y in zip(batch_Xs, batch_ys):
        X, Y = relation_extr.prepare_inout(batch_X, batch_y)
        
        optimizer.zero_grad()
        preds = relation_extr(X)
        #print(preds)
        _, preds_Y = torch.max(preds, -1)
        #print(preds_Y)
        batch_size = Y.size()[0]
        loss = loss_func(preds, Y)
        batch_loss = loss
        #batch_loss = loss
        
        batch_loss.backward()
        optimizer.step()
        
        epoch_loss += loss.data.cpu().numpy()[0]
        tp += (preds_Y == Y).float().sum().data.cpu().numpy()[0]
        del X, Y, loss, batch_loss, preds, preds_Y
    print("Train epoch",i,epoch_loss/len(train_output))
    print("Train accuracy",i,tp/float(len(train_output)))
    relation_extr.evaluatation(dev_input, dev_output,header = 'Dev')
    relation_extr.evaluatation(test_input, test_output,header = 'Test')
    #print(params[0])
    
    del epoch_loss, tp
    
    
    
    print()

Train epoch 0 2.7864286074704596
Train accuracy 0 0.2461111111111111
Dev accuracy:  0.27375
Test accuracy:  0.2686786897313213

Train epoch 1 2.7052458430661095
Train accuracy 1 0.32666666666666666
Dev accuracy:  0.345
Test accuracy:  0.31762973868237027

Train epoch 2 2.6094179738230174
Train accuracy 2 0.42194444444444446
Dev accuracy:  0.4325
Test accuracy:  0.40743467059256533



In [None]:
print(len(batch_X))

In [None]:
X = Variable(torch.rand(64,10,320).cuda())

In [None]:
LSTM_Baseline(X)

In [None]:
a = [[0]*10]*20
print(a)
a[0][0] = 1
print(a)