## Load processed data

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import yaml
import numpy as np
import torch
from collections import Counter
from torch import nn
from matplotlib import pyplot as plt
import torch.nn.functional as F
%matplotlib inline
import copy
import argparse
from helper import data_generator
import sys; sys.argv=['']; del sys
# Run on the GPU when torch can see one; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Experiment settings are read from a YAML file.
file = 'args/semeval2010_8.yaml'
with open(file) as f:
    # NOTE(review): yaml.Loader can construct arbitrary Python objects;
    # only point `file` at trusted configuration files.
    args = yaml.load(f, Loader=yaml.Loader)

# sys.argv is reset to [''] in the import lines above, so parse_args() sees no
# command-line options; every key of the YAML "common" section is then copied
# onto the resulting namespace as an attribute.
parser = argparse.ArgumentParser(description='attention')
config = parser.parse_args()
for k, v in args['common'].items():
    setattr(config, k, v)

In [3]:
import pickle
# Load the pickled (vocab, word2id, id2word) triple stored at config.dic_path.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only use dictionary files from a trusted source.
with open(config.dic_path, 'rb') as f:
    vocab, word2id, id2word = pickle.load(f)

In [4]:
# One data_generator per split, each built from the shared config and that split's path.
train_dg = data_generator(config, config.train_path)
#train_eval_dg = data_generator(config, config.train_path, False)
# dev/test pass False as the third positional argument; presumably this turns
# off a training-only behaviour (e.g. shuffling) — TODO confirm in helper.data_generator.
dev_dg = data_generator(config, config.dev_path, False)
test_dg = data_generator(config, config.test_path, False)

In [6]:
# Size of the first field of every entry in train_dg.data_batch
# (presumably the token-id sequence of each sentence — TODO confirm).
lengths = [len(item[0]) for item in train_dg.data_batch]
# Cell result: the largest such size (91 in the recorded output).
max(lengths)

91

## Train a model

In [7]:
from torch.nn import utils as nn_utils
import torch
from torch import optim
import pickle
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.init as init
from self_defined_lstm_linear import LSTM
from self_defined_gru_linear import GRU
from self_defined_rnn_linear import RNN

# from dynamic_lstm import dynamicLSTM
# from dynamic_gru import dynamicGRU
# from dynamic_rnn import dynamicRNN
# from ngram_rnn import nRNN
# from matrix_multi_model import MatMultiModel

# from self_defined_lstm  import LSTM
# from self_defined_gru  import GRU
# from self_defined_rnn  import RNN

# from self_defined_lstm_no_combination import LSTM
# from self_defined_gru_no_combination import GRU
# from self_defined_rnn_no_combination import RNN
#from self_defined_simple_recurrence2 import RNN as sRNN

In [8]:
class SimpleClassifier(nn.Module):
    """Sentence classifier: embeddings -> recurrent encoder -> linear decoder.

    The final vector returned by the recurrent encoder is passed through
    dropout and a bias-free linear layer to obtain one score per label.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_dim: embedding size, also the encoder's input size.
        hidden_dim: encoder hidden size and decoder input size.
        label_dim: number of output scores; 1 selects the sigmoid output path.
        rnn_type: 'elman', 'gru', 'nrnn', 'srnn' or 'mrnn'; any other value
            (including the default 'lstm') selects ``LSTM``.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, label_dim=1, rnn_type='lstm'):
        super().__init__()

        self.embeddings = nn.Embedding(vocab_size, embed_dim)
        self.dropout = nn.Dropout(0.5)
        self.label_dim = label_dim

        # `affine` and `cnn` are not used by forward(); they are still created
        # so the module keeps the same parameters and initialisation order.
        self.affine = nn.Linear(embed_dim, hidden_dim)
        self.softmax = nn.Softmax(dim=-1)
        self.cnn = nn.Conv1d(embed_dim, hidden_dim, 3, 1, padding=2)
        self.sigmoid = nn.Sigmoid()

        self.rnn = self._build_rnn(rnn_type, embed_dim, hidden_dim)

        # `linear` is likewise unused by forward().
        self.linear = nn.Linear(embed_dim, hidden_dim)
        self.decoder = nn.Linear(hidden_dim, label_dim, bias=False)

    @staticmethod
    def _build_rnn(rnn_type, embed_dim, hidden_dim):
        """Instantiate the unidirectional recurrent encoder named by ``rnn_type``.

        NOTE(review): the imports for nRNN, sRNN and MatMultiModel are commented
        out in the visible import cell, so the 'nrnn', 'srnn' and 'mrnn' choices
        raise NameError unless those names are brought into scope elsewhere.
        """
        if rnn_type == 'mrnn':
            # The matrix model receives sqrt(embed_dim) as its input size.
            return MatMultiModel(int(np.sqrt(embed_dim)), hidden_dim,
                                 bidirectional=False)
        if rnn_type == 'elman':
            encoder_cls = RNN
        elif rnn_type == 'gru':
            encoder_cls = GRU
        elif rnn_type == 'nrnn':
            encoder_cls = nRNN
        elif rnn_type == 'srnn':
            encoder_cls = sRNN
        else:
            encoder_cls = LSTM
        return encoder_cls(embed_dim, hidden_dim, bidirectional=False)

    def forward(self, seq_ids, seq_lengths=None):
        '''
        Args:
            seq_ids: word indexes, batch_size, max_len, Long Tensor
            seq_lengths: lengths of sentences, batch_size, Long Tensor;
                forwarded unchanged to the recurrent encoder
        Returns:
            (probs, scores) with sigmoid probabilities when label_dim == 1,
            otherwise (log(softmax(scores) + 1e-12), scores).
        '''
        embedded = self.dropout(self.embeddings(seq_ids))
        # The unpacking doubles as a check that the embedded input is 3-D.
        batch_size, max_len, emb_dim = embedded.size()

        # Only the encoder's final vector is used; per-step states are discarded.
        _, final_vec = self.rnn(embedded, seq_lengths)
        senti_scores = self.decoder(self.dropout(final_vec))

        if self.label_dim != 1:
            # Multi-class: log-probabilities, with a small constant added before the log.
            log_probs = torch.log(self.softmax(senti_scores) + 0.000000000001)
            return log_probs, senti_scores
        return self.sigmoid(senti_scores), senti_scores

In [9]:
class CNNClassifier(nn.Module):
    """Sentence classifier: embeddings -> strided 1-D convolution -> sum pooling -> linear decoder.

    The convolution uses ``kernel_size=20`` and ``stride=20`` without padding
    or bias, so it emits one feature vector per non-overlapping 20-token window.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_dim: embedding size (convolution input channels).
        hidden_dim: convolution output channels and decoder input size.
        label_dim: number of output scores (> 0); 1 selects the sigmoid path.
        bw: unused (the decoder initialisation that read it is disabled).
        be: embeddings are initialised uniformly in ``[-be, be]``.
        bm: unused (the affine initialisation that read it is disabled).
        dropout: dropout probability applied to embeddings and pooled features.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, label_dim=1, bw=1, be=0.1, bm=0.1, dropout=0.5):
        super(CNNClassifier, self).__init__()

        self.embeddings = nn.Embedding(vocab_size, embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.embeddings.weight.data.uniform_(-be, be)
        self.cnn = nn.Conv1d(embed_dim, hidden_dim, kernel_size=20, padding=0, stride=20, bias=False)
        # `affine` is not used by forward(); kept so the parameter set is unchanged.
        self.affine = nn.Linear(embed_dim, hidden_dim)

        self.decoder = nn.Linear(hidden_dim, label_dim)

        self.log_softmax = nn.LogSoftmax(dim=-1)
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=-1)
        assert label_dim > 0
        self.label_dim = label_dim

    def forward(self, seq_ids, seq_lengths=None):
        '''
        Args:
            seq_ids: word indexes, batch_size, max_len, Long Tensor
            seq_lengths: lengths of sentences, batch_size, Long Tensor.
                When omitted, all convolution windows of every sentence are pooled.
        Returns:
            (probs, scores) with sigmoid probabilities when label_dim == 1,
            otherwise (log(softmax(scores) + 1e-12), scores).
        '''
        batch_size, max_len = seq_ids.size()
        seq_embs = self.dropout(self.embeddings(seq_ids))
        # Conv1d expects (batch, channels, positions).
        # NOTE(review): with kernel_size=20 and no padding, max_len presumably
        # has to be at least 20 for the convolution to succeed — TODO confirm.
        hidden_vecs = self.cnn(seq_embs.transpose(1, 2))

        if seq_lengths is None:
            # The signature advertises None as the default; previously this
            # path crashed on `None[i]`. Pool every window instead.
            final_vec = hidden_vecs.sum(2)
        else:
            # NOTE(review): seq_lengths[i] is applied to the window axis of the
            # convolution output, not to token positions; the slice is kept
            # exactly as before — confirm this is the intended masking.
            final_vec = torch.stack(
                [hidden_vecs[i, :, :seq_lengths[i]].sum(1) for i in range(batch_size)]
            )

        final_vec = self.dropout(final_vec)
        scores = self.decoder(final_vec)
        if self.label_dim == 1:
            probs = self.sigmoid(scores)
        else:
            probs = self.softmax(scores)
            # Small constant added before the log.
            logits = torch.log(probs + 0.000000000001)
            return logits, scores
        return probs, scores

In [10]:
##Evaluation classification
from sklearn.metrics import confusion_matrix, f1_score
def evaluate_cls(dg, model, label_dim=1):
    """Run ``model`` over every batch of ``dg`` and report accuracy and macro-F1.

    Args:
        dg: batch provider exposing ``reset_samples()``, ``index``, ``data_len``
            and ``get_ids_samples()``; each ``next(dg.get_ids_samples())`` is
            unpacked as ``(sent_ids, label_list, sent_lens)``.
        model: module called as ``model(sent_ids, sent_lens)`` and returning a
            pair whose first element holds the per-sentence outputs.
        label_dim: 1 thresholds the outputs at 0.5; any other value takes the
            argmax over dimension 1.

    Returns:
        ``(accuracy, f1)`` where accuracy is the number of correct predictions
        divided by ``dg.data_len`` and f1 is sklearn's macro-averaged F1.

    Relies on the notebook-level ``device``. Switches ``model`` to eval mode
    and prints the accuracy.
    """
    model.eval()
    count = 0
    # Gold and predicted labels of the whole split, for the F1 computation.
    gold_labels = []
    pred_labels = []
    dg.reset_samples()
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        while dg.index < dg.data_len:
            sent_ids, label_list, sent_lens = next(dg.get_ids_samples())
            outputs, _ = model(sent_ids.to(device), sent_lens.to(device))
            if label_dim == 1:
                # reshape(-1) instead of squeeze(): a batch holding a single
                # sentence must stay 1-D, otherwise list(...) below fails on a
                # 0-d array.
                preds = (outputs > 0.5).reshape(-1).cpu()
                num = (preds == label_list.bool()).sum().item()
            else:
                preds = outputs.argmax(1).cpu()
                num = (label_list == preds).sum().item()
            gold_labels += list(label_list.cpu().numpy())
            pred_labels += list(preds.numpy())
            count += num

    accuracy = count*1.0/dg.data_len
    f1 = f1_score(gold_labels, pred_labels, average='macro')
    print('Evaluation accuracy:', accuracy)
    return accuracy, f1

#regression
def evaluate_reg(dg, model):
    """Average the notebook-level ``loss_func`` over every sample served by ``dg``.

    Args:
        dg: batch provider exposing ``reset_samples()``, ``index``, ``data_len``
            and ``get_ids_samples()``; each ``next(dg.get_ids_samples())`` is
            unpacked as ``(sent_ids, label_list, sent_lens)``.
        model: module called as ``model(sent_ids, sent_lens)`` and returning
            ``(probs, scores)``; only ``scores`` is used.

    Returns:
        Sum of (batch loss * batch size) divided by the number of samples seen.
        NOTE(review): this equals the per-sample mean only if ``loss_func``
        averages within a batch — confirm the reduction of the configured loss.

    Relies on the notebook-level ``device`` and ``loss_func``. Switches
    ``model`` to eval mode.
    """
    model.eval()
    count = 0
    error = 0
    dg.reset_samples()
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        while dg.index < dg.data_len:
            sent_ids, label_list, sent_lens = next(dg.get_ids_samples())
            probs, scores = model(sent_ids.to(device), sent_lens.to(device))
            # squeeze(-1) drops only the trailing score dimension, so a batch
            # with a single sentence keeps its batch axis and still lines up
            # with the 1-D label tensor.
            loss = loss_func(scores.squeeze(-1), label_list.float().to(device))
            num = len(sent_lens)
            error += loss.item() * num
            count += num

    mse = error/count
    return mse

In [11]:
def ranking_loss(scores, label_list):
    """Per-sample pairwise ranking loss over class scores.

    For every row of ``scores`` the loss is
    ``log(1 + exp(2 * (2.5 - s_gold))) + log(1 + exp(2 * (0.5 + s_pred)))``
    where ``s_gold`` is the score of the gold class and ``s_pred`` the highest
    score in the row.

    Args:
        scores: float tensor of shape (batch, n_classes).
        label_list: long tensor of shape (batch,) with the gold class ids.

    Returns:
        Tensor of shape (batch,) with one loss value per sample.

    NOTE(review): ``s_pred`` is the argmax over all classes, so it is the gold
    score itself whenever the prediction is correct; ranking losses of this
    form usually take the best *incorrect* class — confirm the intent.
    """
    # Indices must be (batch, 1) for a per-row gather along dim 1. The previous
    # (1, batch) layout read every value from the first row of `scores`.
    gold_index = label_list.to(scores.device).unsqueeze(1)
    pred_index = scores.argmax(1).unsqueeze(1)
    gold_scores = torch.gather(scores, 1, gold_index).squeeze(1)
    pred_scores = torch.gather(scores, 1, pred_index).squeeze(1)
    # softplus(x) == log(1 + exp(x)) but does not overflow for large x.
    left = F.softplus(2 * (2.5 - gold_scores))
    right = F.softplus(2 * (0.5 + pred_scores))
    return left + right

In [14]:
# torch.manual_seed(555)
# np.random.seed(555)
# Number of relation classes; 1 would switch every code path below to binary mode.
label_dim = 19
# BCELoss for the single-output case, NLLLoss (on the model's log-probabilities) otherwise.
loss_func = nn.BCELoss()
if label_dim>1:
    loss_func = nn.NLLLoss()

# Best dev macro-F1 and the matching test macro-F1, one entry per run.
valid_f1_list = []
test_f1_list = []
# `scale` is not read by any active line: the loop simply repeats the whole
# experiment three times. The seeding calls are commented out, so runs are not reproducible.
for scale in range(3):
    #torch.manual_seed(666)
    #model = SimpleAttnClassifier(config.vocab_size, 100, 100, 1, scale)
    hidden_dim = 300
    model = SimpleClassifier(config.vocab_size, 300, hidden_dim, label_dim, 'lstm')
    #model = SimpleClassifier(config.vocab_size, 32*32, 32, label_dim, 'mrnn')
    #model = CNNClassifier(config.vocab_size, 300, 300, label_dim)
    #model.load_vector(config.emb_path, trainable=True)
    
    model = model.to(device)
    ##################################
    ####Weight decay can influence the result, if the value is too large, the model will not converge after iterations
    #optimizer = optim.SGD(model.parameters(), lr=0.5, weight_decay=0.000000)
    #optimizer = optim.Adagrad(model.parameters(), lr=0.5, weight_decay=0.00000, lr_decay=0.001)
    #optimizer = optim.Adagrad(model.parameters(), lr=0.01, weight_decay=0.000001, lr_decay=0.001)


    # Batches per epoch.
    # NOTE(review): when data_len is an exact multiple of batch_size this asks
    # for one batch more than the data holds — check how the generator reacts.
    loop_num = int(train_dg.data_len/config.batch_size)+1
    best_model = None
    # Despite the name, best_acc stores the best dev macro-F1 (see the comparison below).
    best_acc = -1
    # train_acc is never filled; valid_acc collects the dev accuracy of every epoch.
    train_acc = []
    valid_acc = []
    for i in range(40):
        print('Epoch:', i)
        print('#'*20)
        
        # Alternative optimizer settings tried for the other encoders:
        ##For gru
        #optimizer = optim.Adagrad(model.parameters(), lr=0.01, weight_decay=0.00001, lr_decay=0.001)
        #for lstm
        #optimizer = optim.Adagrad(model.parameters(), lr=0.02, weight_decay=0.00002, lr_decay=0.0001)
        #for elman
        #optimizer = optim.Adagrad(model.parameters(), lr=0.02, weight_decay=0.00002, lr_decay=0.0001)
        #for elman
        # NOTE(review): a new Adagrad instance is created at the start of every
        # epoch, which discards the optimizer state accumulated so far —
        # confirm this reset is intended and not a misplaced line.
        optimizer = optim.Adagrad(model.parameters(), lr=0.01, weight_decay=0.00001)

        # total_loss is initialised but never updated.
        total_loss = 0
        model.train()
        #shuffle the training set given the random seed
        train_dg.shuffle_data()
        #sequential sampling, use all the dataset
        train_dg.reset_samples()
        #model.embeddings.required_grad = config.update_emb
        for j in range(loop_num):
            model.zero_grad()
            # Fetch the next training batch: token ids, gold labels, sentence lengths.
            sent_ids,  label_list, sent_lens = next(train_dg.get_sequential_ids_samples())
            logits, scores = model(sent_ids.to(device), sent_lens.to(device))
            #loss = ranking_loss(scores, label_list).mean()
            #label_list.apply_(scale_value)
            if label_dim == 1:
                # Binary case: BCELoss takes float targets.
                loss = loss_func(logits.squeeze(), label_list.float().to(device))
            else:
                # Multi-class case: NLLLoss takes class-index targets.
                loss = loss_func(logits.squeeze(), label_list.to(device))
            
            # Do the backward pass and update the gradient
            #w_h = model.rnn.f_cell.weight_ih
            #loss += w_h.norm(2)**2*0.0001#lstm
            #loss += w_h.norm(2)**2*0.01#gru
#             if i>5:
#                 loss += w_h.norm(2)**2*0.01
            loss.backward()
#             nn.utils.clip_grad_norm_(model.parameters(),0.25)#0.05
            # Clip the global gradient norm to 5 before the parameter update.
            nn.utils.clip_grad_norm_(model.parameters(),5)
            optimizer.step()
        # Every second epoch, print the loss of the last batch only (not an epoch average).
        if i%2== 0:
            l = loss.cpu().item()
            print('Sample Loss:{:.3f}'.format(l))

        #Dev Evaluation
        #print('Training Accuracy')
        #t_acc = evaluate(train_eval_dg, model)
        #print('Dev Accuracy')
        dev_acc, f1 = evaluate_cls(dev_dg, model, label_dim)
        print('f1:', round(f1, 4))
        #train_acc.append(t_acc)
        valid_acc.append(dev_acc)
        # Model selection on dev macro-F1: keep a deep copy of the best model so far.
        if best_acc < f1:
            best_acc = f1
            best_model = copy.deepcopy(model)
            
        #best_model = model
#     #Test performance
#     print('Training Accuracy')
#     t_acc = evaluate(train_eval_dg, model)
    #best_model = copy.copy(model)
    #best_model.load_state_dict(best_model_dict)
    #del best_model_dict
    print('Best valid f1:', round(best_acc,4))
    # Score the selected model on the test split; the accuracy return value is discarded.
    _, f1 = evaluate_cls(test_dg, best_model, label_dim)
    #Test performance       
    print('test f1:', f1)
    valid_f1_list.append(best_acc)
    test_f1_list.append(f1)
#     score_list = []
#     for neg_phrase in negation_phrases_filtered:
#         v = get_phrase_polarity(neg_phrase, best_model, hidden_dim)
#         score_list.append(v)
#     neg_phrase_scores.append(score_list)
    

MVMA-L
Epoch: 0
####################
Sample Loss:2.104
Evaluation accuracy: 0.299
f1: 0.1198
Epoch: 1
####################
Evaluation accuracy: 0.405
f1: 0.261
Epoch: 2
####################
Sample Loss:1.820
Evaluation accuracy: 0.437
f1: 0.3006
Epoch: 3
####################
Evaluation accuracy: 0.486
f1: 0.3788
Epoch: 4
####################
Sample Loss:1.447
Evaluation accuracy: 0.513
f1: 0.4138
Epoch: 5
####################
Evaluation accuracy: 0.519
f1: 0.419
Epoch: 6
####################
Sample Loss:1.238
Evaluation accuracy: 0.537
f1: 0.4546
Epoch: 7
####################
Evaluation accuracy: 0.54
f1: 0.4578
Epoch: 8
####################
Sample Loss:1.121
Evaluation accuracy: 0.531
f1: 0.4378
Epoch: 9
####################
Evaluation accuracy: 0.566
f1: 0.4863
Epoch: 10
####################
Sample Loss:0.941
Evaluation accuracy: 0.554
f1: 0.4792
Epoch: 11
####################
Evaluation accuracy: 0.591
f1: 0.5144
Epoch: 12
####################
Sample Loss:1.051
Evaluation accuracy: 

Evaluation accuracy: 0.596
f1: 0.5578
Epoch: 24
####################
Sample Loss:0.597
Evaluation accuracy: 0.599
f1: 0.5545
Epoch: 25
####################
Evaluation accuracy: 0.604
f1: 0.5686
Epoch: 26
####################
Sample Loss:0.538
Evaluation accuracy: 0.613
f1: 0.5768
Epoch: 27
####################
Evaluation accuracy: 0.604
f1: 0.5684
Epoch: 28
####################
Sample Loss:0.547
Evaluation accuracy: 0.597
f1: 0.5583
Epoch: 29
####################
Evaluation accuracy: 0.608
f1: 0.5753
Epoch: 30
####################
Sample Loss:0.415
Evaluation accuracy: 0.604
f1: 0.5667
Epoch: 31
####################
Evaluation accuracy: 0.591
f1: 0.5574
Epoch: 32
####################
Sample Loss:0.323
Evaluation accuracy: 0.614
f1: 0.5766
Epoch: 33
####################
Evaluation accuracy: 0.615
f1: 0.5715
Epoch: 34
####################
Sample Loss:0.401
Evaluation accuracy: 0.61
f1: 0.572
Epoch: 35
####################
Evaluation accuracy: 0.61
f1: 0.591
Epoch: 36
####################

In [13]:
# One line per run: best dev macro-F1 and the matching test macro-F1,
# both as percentages rounded to one decimal.
for v, t in zip(valid_f1_list, test_f1_list):
    print(round(v*100, 1), round(t*100, 1))

### Extract n-grams for each specific relation

In [107]:
# from elman_ngram_feature import *
from gru_ngram_feature import *
# from lstm_ngram_feature import *
# from all_ngram_feature import *

In [None]:
# Relation label -> class id. The id of a label is its position in the list
# below (0-based), so the order of the entries must not be changed.
label_dict = {
    relation: class_id
    for class_id, relation in enumerate([
        'Message-Topic(e1,e2)',
        'Product-Producer(e2,e1)',
        'Instrument-Agency(e2,e1)',
        'Entity-Destination(e1,e2)',
        'Cause-Effect(e2,e1)',
        'Component-Whole(e1,e2)',
        'Product-Producer(e1,e2)',
        'Member-Collection(e2,e1)',
        'Other',
        'Entity-Origin(e1,e2)',
        'Content-Container(e1,e2)',
        'Entity-Origin(e2,e1)',
        'Cause-Effect(e1,e2)',
        'Component-Whole(e2,e1)',
        'Content-Container(e2,e1)',
        'Instrument-Agency(e1,e2)',
        'Message-Topic(e2,e1)',
        'Member-Collection(e1,e2)',
        'Entity-Destination(e2,e1)',
    ])
}

In [108]:
from nltk import ngrams
# Candidate probe sentences. Every assignment overwrites the previous one, so
# only the LAST `sent` is scored; reorder or comment out lines to probe another.
sent = '<e1> demolition </e1> was the cause of <e2> terror </e2>'.split()
sent = '<e1> damage </e1> caused by the <e2> bombing </e2>'.split()
sent = '<e1> courtyard </e1> of the <e2> castle </e2>'.split()
sent = '<e1> marble </e1> was dropped into the <e2> bowl </e2>'.split()
sent = '<e1> car </e1> left the <e2> plant </e2>'.split()
sent = '<e1> cigarettes </e1> by the major <e2> producer </e2>'.split()
sent = '<e1> cigarettes </e1> are used by <e2> women </e2>'.split()
# Slide a 5-token window over the sentence and score every window.
for ngram in ngrams(sent, 5):
    # word2id[w] raises KeyError for a token that is not in the vocabulary.
    ngram_id = [word2id[w] for w in ngram]
    # 300 matches the hidden_dim literal used when the model was built;
    # presumably get_phrase_polarity_multi returns one score per class — TODO confirm.
    score = get_phrase_polarity_multi(ngram_id, best_model, 300)
    # Index 12 is 'Cause-Effect(e1,e2)' in label_dict.
    print(' '.join(ngram), score[12].item())

<e1> cigarettes </e1> are used 2.0632264614105225
cigarettes </e1> are used by 0.006082646548748016
</e1> are used by <e2> 4.044539928436279
are used by <e2> women 2.1829018592834473
used by <e2> women </e2> 0.6761854290962219


In [37]:
# Rebind the generic name to the "_new" implementation.
# get_phrase_polarity_new is not defined in the visible cells; presumably it
# comes from the star import of the *_ngram_feature module — TODO confirm.
get_phrase_polarity = get_phrase_polarity_new