In [1]:
from collections import namedtuple, defaultdict
import codecs
from config import config
from bs4 import BeautifulSoup
import pdb
import torch
from nltk.tokenize import word_tokenize
#import tokenizer
import numpy as np
import re
import pickle
import random
##Added by Richard Sun
import en_core_web_sm
nlp = en_core_web_sm.load()


In [2]:
from allennlp.modules.elmo import Elmo, batch_to_ids
options_file = "../data/Elmo/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "../data/Elmo/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
elmo = Elmo(options_file, weight_file, 2, dropout=0)

In [3]:
SentInst = namedtuple("SentenceInstance", "id text text_inds opinions")
OpinionInst = namedtuple("OpinionInstance", "target_text polarity class_ind target_mask")

In [49]:
class dataHelper():
    def __init__(self, config):
        '''
        This class is able to:
        1. Load datasets
        2. Split sentences into words
        3. Map words into Idx
        '''
        self.config = config

        # id map to instance
        self.id2label = ["positive", "neutral", "negative"]
        self.label2id = {v:k for k,v in enumerate(self.id2label)}

        self.UNK = "UNK"
        self.EOS = "EOS"

        # data
        self.train_data = None
        self.test_data = None
    
    def read_xml_data(self, file_name):
        '''
        Read XML data
        '''
        f = codecs.open(file_name, "r", encoding="utf-8")
        soup = BeautifulSoup(f.read(), "lxml")
        sentence_tags = soup.find_all("sentence")
        sentence_list = []
        for sent_tag in sentence_tags:
            sent_id = sent_tag.attrs["id"]
            sent_text = sent_tag.find("text").contents[0]
            opinion_list = []
            try:
                asp_tag = sent_tag.find_all("aspectterms")[0]
            except:
                # print "{0} {1} has no opinions".format(sent_id, sent_text)
                #print(sent_tag)
                continue
            opinion_tags = asp_tag.find_all("aspectterm")
            for opinion_tag in opinion_tags:
                term = opinion_tag.attrs["term"]
                if term not in sent_text: pdb.set_trace()
                polarity = opinion_tag.attrs["polarity"]
                opinion_inst = OpinionInst(term, polarity, None, None)
                opinion_list.append(opinion_inst)
            sent_Inst = SentInst(sent_id, sent_text, None, opinion_list)
            sentence_list.append(sent_Inst)

        return sentence_list


    def tokenize(self, sent_str):
        '''
        Split a sentence into tokens
        '''
        # return word_tokenize(sent_str)
        sent_str = " ".join(sent_str.split("-"))
        sent_str = " ".join(sent_str.split("/"))
        sent_str = " ".join(sent_str.split("!"))
        sent = nlp(sent_str)
        return [item.text for item in sent]
        
    # namedtuple is protected!
    def process_raw_data(self, data):
        '''
        Tokenize each sentence, compute aspect mask for each sentence
        '''
        sent_len = len(data)
        print('Sentences Num:', sent_len)
        for sent_i in np.arange(sent_len):
            sent_inst = data[sent_i]
            sent_tokens = self.tokenize(sent_inst.text)
            sent_inst = sent_inst._replace(text_inds = sent_tokens)
            opinion_list = []
            opi_len = len(sent_inst.opinions)
            for opi_i in np.arange(opi_len):
                opi_inst = sent_inst.opinions[opi_i]

                target = opi_inst.target_text
                target_tokens = self.tokenize(target)
                try:
                    target_start = sent_tokens.index(target_tokens[0])
                    target_end = sent_tokens[max(0, target_start - 1):].index(target_tokens[-1])  + max(0, target_start - 1)
                except:
                    #pdb.set_trace()
                    print('Target error '+target_tokens[0])
                    continue
                    
                if target_start < 0 or target_end < 0:
                    #pdb.set_trace()
                    print('Traget not in the vocabulary')
                    continue
                    
                mask = [0] * len(sent_tokens)
                for m_i in range(target_start, target_end + 1):
                    mask[m_i] = 1

                label = opi_inst.polarity
                if label == "conflict":  continue  # ignore conflict ones
                opi_inst = opi_inst._replace(class_ind = self.label2id[label])
                opi_inst = opi_inst._replace(target_mask = mask)
                opinion_list.append(opi_inst)
            
            sent_inst = sent_inst._replace(opinions = opinion_list)
            
            data[sent_i] = sent_inst
        return data

    
    def read(self, train_data, test_data):
        '''
        Preprocess the data
        '''
        self.train_data = self.read_xml_data(train_data)
        self.test_data = self.read_xml_data(test_data)
        print('Training dataset number:', len(self.train_data))
        print('Testing dataset number:', len(self.test_data))
        train_data = self.process_raw_data(self.train_data)
        test_data = self.process_raw_data(self.test_data)
        return train_data, test_data
    
    def generate_sample(self, all_triples):
        '''
        Generate a batch of training dataset
        '''
        batch_size = self.config.batch_size
        select_index = np.random.choice(len(all_triples), batch_size)
        select_trip = [all_triples[i] for i in select_index]
        return select_trip
        

    # shuffle and to batch size
    def to_batches(self, data, if_batch = False):
        all_triples = []
        # list of list
        pair_couter = defaultdict(int)
        print("Sentence size ", len(data))
        for sent_inst in data:
            tokens = sent_inst.text_inds
            #print(tokens)
            for opi_inst in sent_inst.opinions:
                if opi_inst.polarity is None:  continue # conflict one
                mask = opi_inst.target_mask
                polarity = opi_inst.class_ind
                if tokens is None or mask is None or polarity is None: 
                    continue
                all_triples.append([tokens, mask, polarity])
                pair_couter[polarity] += 1
                
        print(pair_couter)

        if if_batch:
            print('Shuffle')
            random.shuffle(all_triples)
            batch_n = int(len(all_triples) / self.config.batch_size + 1)
            print("{0} instances with {1} batches".format(len(all_triples), batch_n))
            ret_triples = []
            
            offset = 0
            for i in range(batch_n):
                start = self.config.batch_size * i
                end = min(self.config.batch_size * (i+1), len(all_triples) )
                ret_triples.append(all_triples[start : end])
            return ret_triples
        else:
            return all_triples

    
    def debug_single_sample(self, batches, batch_n, sent_n):
        sent_ind = batches[batch_n][sent_n][0]
        print(" ".join([self.id2word[x] for x in sent_ind]))
        mask = batches[batch_n][sent_n][1]
        print(mask)
        label = batches[batch_n][sent_n][2]
        print(self.id2label[label])
        

In [59]:
def elmo_transform(triples):
    '''
    Split the triples into three lists
    '''
    token_list, mask_list, label_list = zip(*triples)
    sent_lens = [len(tokens) for tokens in token_list]
    max_len = max(sent_lens)
    batch_size = len(sent_lens)
    character_ids = batch_to_ids(token_list)
    embeddings = elmo(character_ids)
    #batch_size*word_num * 1024
    sent_vecs = embeddings['elmo_representations'][0]
    #Padding the mask to same lengths
    mask_vecs = torch.zeros(batch_size, max_len)
    for i, mask in enumerate(mask_list):
        mask_vecs[i, :len(mask)] = torch.LongTensor(mask)
    return sent_vecs, mask_vecs, label_list, sent_lens

In [19]:
TRAIN_DATA_PATH = "data/2014/Restaurants_Train_v2.xml"
TEST_DATA_PATH = "data/2014/Restaurants_Test_Gold.xml"

# TRAIN_DATA_PATH = "./data/2014/Laptop_Train_v2.xml"
# TEST_DATA_PATH = "./data/2014/Laptops_Test_Gold.xml"

GLOVE_FILE = "../data/word_embeddings/glove.6B.100d.txt"
OUT_FILE = config.embed_path
DATA_FILE = config.data_path
DIC_FILE = config.dic_path

In [50]:
reader = dataHelper(config)

In [15]:
train, test = reader.read(TRAIN_DATA_PATH, TEST_DATA_PATH)

Training dataset number: 2021
Testing dataset number: 606
Sentences Num: 2021
Sentences Num: 606
Target error MEAL


In [58]:
#train_data = reader.read_data(TRAIN_DATA_PATH)
#test_data = reader.read_data(TEST_DATA_PATH)
train_batch = reader.to_batches(train)
test_batch = reader.to_batches(test)
# reader.debug_single_sample(train_batch, 0, 0)
# pdb.set_trace()

Sentence size  2021
defaultdict(<class 'int'>, {2: 805, 0: 2164, 1: 633})
Sentence size  606
defaultdict(<class 'int'>, {0: 727, 1: 196, 2: 196})


In [60]:
sample = reader.generate_sample(train_batch)
sent_vecs, mask_vecs, label_list, sent_lens = elmo_transform(sample)

In [62]:
sent_vecs, mask_vecs, label_list, sent_lens = elmo_transform([test_batch[0]])

In [97]:
# write to pkl
with open(DATA_FILE, "wb") as f:
    pickle.dump([train_batch, test_batch],f)

In [98]:
with codecs.open(DIC_FILE, 'wb') as f:
    pickle.dump(reader.id2word, f)

In [99]:
reader.gen_vectors_glove()

1411 unk out of 5138 vocab


In [93]:
class Reader():
    def __init__(self, config):
        '''
        This class is able to:
        1. Load datasets
        2. Split sentences into words
        3. Map words into Idx
        '''
        self.config = config

        # id map to instance
        self.id2word = []
        self.word2id = {}
        self.id2label = ["positive", "neutral", "negative"]
        self.label2id = {v:k for k,v in enumerate(self.id2label)}

        self.UNK = "UNK"
        self.EOS = "EOS"

        # data
        self.train_data = None
        self.test_data = None
    
    def read_data(self, file_name):
        f = codecs.open(file_name, "r", encoding="utf-8")
        soup = BeautifulSoup(f.read(), "lxml")
        sentence_tags = soup.find_all("sentence")

        sentence_list = []
        for sent_tag in sentence_tags:
            sent_id = sent_tag.attrs["id"]
            sent_text = sent_tag.find("text").contents[0]
            opinion_list = []
            try:
                asp_tag = sent_tag.find_all("aspectterms")[0]
            except:
                # print "{0} {1} has no opinions".format(sent_id, sent_text)
                continue
            opinion_tags = asp_tag.find_all("aspectterm")
            for opinion_tag in opinion_tags:
                term = opinion_tag.attrs["term"]
                if term not in sent_text: pdb.set_trace()
                polarity = opinion_tag.attrs["polarity"]
                opinion_inst = OpinionInst(term, polarity, None, None)
                opinion_list.append(opinion_inst)
            sent_Inst = SentInst(sent_id, sent_text, None, opinion_list)
            sentence_list.append(sent_Inst)

        return sentence_list

    # generate vocabulary
    def gen_dic(self):
        words_set = set()
        label_set = set()

        # unknow
        words_set.add(self.UNK)
        #Is this data leakage
        for data in [self.train_data, self.test_data]:
            sent_counter = 0
            for sent_inst in data:
                sent_counter += 1
                tokens = self.tokenize(sent_inst.text)
                # pdb.set_trace()
                for token in tokens:
                    if token not in words_set:
                        words_set.add(token)
            print("{0} sentences".format(sent_counter)) 
        self.id2word = list(words_set)
        self.word2id = {v:k for k,v in enumerate(self.id2word)}

        print("{0} tokens".format(self.id2word.__len__()))

    def tokenize(self, sent_str):
        # return word_tokenize(sent_str)
        sent_str = " ".join(sent_str.split("-"))
        sent_str = " ".join(sent_str.split("/"))
        sent_str = " ".join(sent_str.split("!"))
        sent = nlp(sent_str)
        return [item.text for item in sent]
        
    # namedtuple is protected!
    def to_index(self, data):
        sent_len = len(data)
        for sent_i in range(sent_len):
            sent_inst = data[sent_i]
            sent_tokens = self.tokenize(sent_inst.text)
#             If x in the vocabulary return its id, otherwise replace it with unknow
#             sent_inds = [self.word2id[x] if x in self.word2id else self.word2id[self.UNK] 
#                 for x in sent_tokens]
#             if sent_inds is None: 
#                 print('sentence is empty')
#                 continue
            #If we wanna use Elmo, comment the mapping    
            #sent_inst = sent_inst._replace(text_inds = sent_inds)
            sent_inst = sent_inst._replace(text_inds = sent_tokens)

            opinion_list = []
            opi_len = len(sent_inst.opinions)
            for opi_i in range(opi_len):
                opi_inst = sent_inst.opinions[opi_i]

                target = opi_inst.target_text
                target_tokens = self.tokenize(target)
                try:
                    target_start = sent_tokens.index(target_tokens[0])
                    target_end = sent_tokens[max(0, target_start - 1):].index(target_tokens[-1])  + max(0, target_start - 1)
                except:
                    #pdb.set_trace()
                    continue
                    print('Target error'+target_tokens[0])
                if target_start < 0 or target_end < 0:
                    #pdb.set_trace()
                    continue
                    print('Traget not in the vocabulary')
                mask = [0] * len(sent_tokens)
                for m_i in range(target_start, target_end + 1):
                    mask[m_i] = 1

                label = opi_inst.polarity
                if label == "conflict":  continue  # ignore conflict ones
                opi_inst = opi_inst._replace(class_ind = self.label2id[label])
                opi_inst = opi_inst._replace(target_mask = mask)
                opinion_list.append(opi_inst)
            
            sent_inst = sent_inst._replace(opinions = opinion_list)
            
            data[sent_i] = sent_inst

    
    def read(self):
        self.train_data = self.read_data(TRAIN_DATA_PATH)
        self.test_data = self.read_data(TEST_DATA_PATH)
        self.gen_dic()
        self.to_index(self.train_data)
        self.to_index(self.test_data)
        return self.train_data, self.test_data

    # shuffle and to batch size
    def to_batches(self, data, if_batch = False):
        all_triples = []
        # list of list
        pair_couter = defaultdict(int)
        print("Sentence size ", len(data))
        for sent_inst in data:
            tokens = sent_inst.text_inds
            
            for opi_inst in sent_inst.opinions:
                if opi_inst.polarity is None:  continue # conflict one
                mask = opi_inst.target_mask
                polarity = opi_inst.class_ind
                if tokens is None or mask is None or polarity is None: pdb.set_trace()
                all_triples.append([tokens, mask, polarity])
                pair_couter[polarity] += 1
        print(pair_couter)

        if if_batch:
            random.shuffle(all_triples)
            batch_n = int(len(all_triples) / self.config.batch_size + 1)
            print("{0} instances with {1} batches".format(len(all_triples), batch_n))
            ret_triples = []
            
            offset = 0
            for i in range(batch_n):
                start = self.config.batch_size * i
                end = min(self.config.batch_size * (i+1), len(all_triples) )
                ret_triples.append(all_triples[start : end])
            return ret_triples
        else:
            return all_triples

    def gen_vectors_glove(self):
        vocab_dic = {}
        with open(GLOVE_FILE) as f:
            for line in f:
                s_s = line.split()
                if s_s[0] in self.word2id:
                    vocab_dic[s_s[0]] = np.array([float(x) for x in s_s[1:]])

        unknowns = np.random.uniform(-0.01, 0.01, config.embed_dim).astype("float32")
        ret_mat = []
        unk_counter = 0
        for token in self.id2word:
            # token = token.lower()
            if token in vocab_dic:
                ret_mat.append(vocab_dic[token])
            else:
                ret_mat.append(unknowns)
                # print token
                unk_counter += 1
        ret_mat = np.vstack(ret_mat)
        with open(OUT_FILE, "wb") as f:
            pickle.dump(ret_mat, f)
        print("{0} unk out of {1} vocab".format(unk_counter, len(self.id2word)))        
    
    def load_vectors(self):
        with open(OUT_FILE, 'rb') as f:
            self.id2vec = pickle.load(f)
    
    def debug_single_sample(self, batches, batch_n, sent_n):
        sent_ind = batches[batch_n][sent_n][0]
        print(" ".join([self.id2word[x] for x in sent_ind]))
        mask = batches[batch_n][sent_n][1]
        print(mask)
        label = batches[batch_n][sent_n][2]
        print(self.id2label[label])
        

In [54]:
from model_att import *
model = attTSA(config)

  "num_layers={}".format(dropout, num_layers))


In [46]:
doc = nlp(u"To be completely fair, the only redeeming factor was the food, which was above average, but couldn't make up for all the other deficiencies of Teodora.")
target = 'food'
for token in doc:
#     if 'food' in token.head.text:
#         print(token)
#     children = [t.text for t in token.children]    
#     if 'food' in children:
#         print(token)
    print(token.text,token.head.text,
           [child for child in token.children])
#     if token.text == 'food':
#         a = token
#     if token.text == 'deficiencies':
#         b= token
    

To be []
be was [To, fair]
completely fair []
fair be [completely]
, was []
the factor []
only factor []
redeeming factor []
factor was [the, only, redeeming]
was was [be, ,, factor, food, ,, but, make, .]
the food []
food was [the, ,, was]
, food []
which was []
was food [which, above]
above was [average]
average above []
, was []
but was []
could make []
n't make []
make was [could, n't, up, for]
up make []
for make [deficiencies]
all deficiencies []
the deficiencies []
other deficiencies []
deficiencies for [all, the, other, of]
of deficiencies [Teodora]
Teodora of []
. was []


In [92]:
for item in doc.noun_chunks:
    print(item)

the only redeeming factor
the food
all the other deficiencies
Teodora


In [91]:
help(doc)

Help on Doc object:

class Doc(builtins.object)
 |  A sequence of Token objects. Access sentences and named entities, export
 |  annotations to numpy arrays, losslessly serialize to compressed binary
 |  strings. The `Doc` object holds an array of `TokenC` structs. The
 |  Python-level `Token` and `Span` objects are views of this array, i.e.
 |  they don't own the data themselves.
 |  
 |  EXAMPLE: Construction 1
 |      >>> doc = nlp(u'Some text')
 |  
 |      Construction 2
 |      >>> from spacy.tokens import Doc
 |      >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
 |                    spaces=[True, False, False])
 |  
 |  Methods defined here:
 |  
 |  __bytes__(...)
 |  
 |  __getitem__(...)
 |      Get a `Token` or `Span` object.
 |      
 |      i (int or tuple) The index of the token, or the slice of the document
 |          to get.
 |      RETURNS (Token or Span): The token at `doc[i]]`, or the span at
 |          `doc[start : end]`.
 |      
 |      EXAMPLE:
 |

In [98]:
with open('data/bailin_data/pre-trained-glove.pkl', 'rb') as f:
            vectors = pickle.load(f)

UnicodeDecodeError: 'ascii' codec can't decode byte 0x82 in position 6: ordinal not in range(128)

In [2]:
from model_glove import *

In [11]:
model = attTSA(config)
parameters = filter(lambda p: p.requires_grad, model.parameters())

Loaded from data/2014/pre-trained-glove.pkl with shape (5135, 300)


  "num_layers={}".format(dropout, num_layers))


In [15]:
def load_data(data_path, if_utf=False):
    f = open(data_path, 'rb')
    obj = pickle.load(f)
    f.close()
    return obj

In [16]:
train_batch, test_batch = load_data('data/bailin_data/data.pkl')

In [24]:
860/len(test_batch)

0.7678571428571429

In [20]:
for triple_list in test_batch:
        model.zero_grad() 
        if len(triple_list) == 0: 
            continue
        ##Modified by Richard Sun
        sent, mask, label = triple_list

In [7]:
import torch.nn.functional as F

In [6]:
a.shape

(32, 10, 64)

In [2]:
import tensorflow as tf
b = tf.Variable(a)