In [2]:
import spacy
nlp = spacy.load("en_core_web_sm")

import re
e1pt = re.compile('<e1>.*</e1>')
e2pt = re.compile('<e2>.*</e2>')
E1LABEL = 'E1LABEL'
E2LABEL = 'E2LABEL'

from nltk.corpus import wordnet as wn

import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [3]:
WORD_DIM = 50

In [4]:
# example
    # sentence
    # entity_idx
    # context_idx
    # hypernyms
    # words
    # label

In [5]:
if WORD_DIM == 50:
    with open('./glove.twitter.27B.50d.txt', 'r') as rf:
        word_dict = {}
        lines = rf.readlines()
        count = 0
        print('length is {}'.format(len(lines)))
        for line in lines:
            ls = line[:-1].split(' ')
            word_dict[ls[0]] = [float(item) for item in ls[1:]]
            if count % 10000 == 0:
                print('\rfinished {}'.format(count), end='', flush=True)
            count += 1
    #         break
        print('')
    print(len(word_dict))
else:
    print('word_dim not right')

length is 1193514
finished 1190000
1193514


In [6]:
# read text file
def read_txt(path):
    with open(path, 'r') as rf:
        lines = rf.readlines()
        li = 0
        sentences = []
        relations = []
        for i in range(0,len(lines), 4):
            sentences.append(lines[i])
            relations.append(lines[i+1])
    print(len(sentences), len(relations))
    return sentences, relations

train_sentences, train_relations = read_txt('./raw/TRAIN_FILE.TXT')
test_sentences, test_relations = read_txt('./raw/TEST_FILE_FULL.TXT')

8000 8000
2717 2717


In [7]:
# process labels
# for i in range(len(train_relations)):
#     if train_relations[i] == 'Other\n':
#         train_relations[i] = train_relations[i][:-1]
#         continue
#     train_relations[i] = train_relations[i][:-8]
# for i in range(len(test_relations)):
#     if test_relations[i] == 'Other\n':
#         test_relations[i] = test_relations[i][:-1]
#         continue
#     test_relations[i] = test_relations[i][:-8]

for i in range(len(train_relations)):
    train_relations[i] = train_relations[i][:-1]
for i in range(len(test_relations)):
    test_relations[i] = test_relations[i][:-1]
    

unique_relations = list(set(test_relations + train_relations))
print(len(train_relations), train_relations[:3])
print(len(test_relations), test_relations[:3])
print(len(unique_relations), unique_relations)

8000 ['Component-Whole(e2,e1)', 'Other', 'Instrument-Agency(e2,e1)']
2717 ['Message-Topic(e1,e2)', 'Product-Producer(e2,e1)', 'Instrument-Agency(e2,e1)']
19 ['Entity-Destination(e1,e2)', 'Message-Topic(e1,e2)', 'Cause-Effect(e1,e2)', 'Content-Container(e2,e1)', 'Instrument-Agency(e1,e2)', 'Entity-Origin(e1,e2)', 'Content-Container(e1,e2)', 'Component-Whole(e2,e1)', 'Other', 'Product-Producer(e2,e1)', 'Entity-Origin(e2,e1)', 'Member-Collection(e1,e2)', 'Product-Producer(e1,e2)', 'Instrument-Agency(e2,e1)', 'Cause-Effect(e2,e1)', 'Message-Topic(e2,e1)', 'Entity-Destination(e2,e1)', 'Component-Whole(e1,e2)', 'Member-Collection(e2,e1)']


In [8]:
# unique_relations = ['Message-Topic', 'Instrument-Agency', 'Product-Producer', 'Content-Container', 'Entity-Origin', 'Component-Whole', 'Entity-Destination', 'Member-Collection', 'Cause-Effect',  'Other']
rel2lb = {}
lb2rel = {}
for i,item in enumerate(unique_relations):
    rel2lb[item] = i
    lb2rel[i] = item
train_labels = [rel2lb[item] for item in train_relations]
test_labels = [rel2lb[item] for item in test_relations]
print(len(train_labels), len(test_labels), test_labels[:3])

8000 2717 [1, 9, 13]


In [9]:
def process_sentence(sentence):
    # regex replace
    sentence = re.sub('\d*\t\"', "", sentence)
    sentence = sentence[:-2]
    e1 = e1pt.search(sentence)
    sentence = re.sub(e1pt, E1LABEL, sentence)
    e2 = e2pt.search(sentence)
    sentence = re.sub(e2pt, E2LABEL, sentence)
    
    # split word
    words= []
    doc = nlp(sentence)
    for token in doc:
        words.append(token.text)
    
    return sentence, (e1.group(0)[4:-5],e2.group(0)[4:-5]), words

def process_sentence_group(org_sentences):
    entities = []
    words_list = []
    processed_sentences = []
    for i in range(len(org_sentences)):
        sentence = org_sentences[i]
        sentence, entity, words =  process_sentence(sentence)
        entities.append(entity)
        words_list.append(words)
        processed_sentences.append(sentence)
        if (i%100 == 0):
            print('\rfinished {}'.format(i),flush=True, end='')
    print('')
    return processed_sentences, entities, words_list

train_processed, train_entities, train_words = process_sentence_group(train_sentences)
test_processed, test_entities, test_words = process_sentence_group(test_sentences)

finished 7900
finished 2700


In [10]:
print(train_processed[:3])
print(train_entities[:3])
print(train_words[:3])

['The system as described above has its greatest application in an arrayed E1LABEL of antenna E2LABEL.', 'The E1LABEL was carefully wrapped and bound into the E2LABEL by means of a cord.', 'The E1LABEL of a keygen uses a E2LABEL to look at the raw assembly code.']
[('configuration', 'elements'), ('child', 'cradle'), ('author', 'disassembler')]
[['The', 'system', 'as', 'described', 'above', 'has', 'its', 'greatest', 'application', 'in', 'an', 'arrayed', 'E1LABEL', 'of', 'antenna', 'E2LABEL', '.'], ['The', 'E1LABEL', 'was', 'carefully', 'wrapped', 'and', 'bound', 'into', 'the', 'E2LABEL', 'by', 'means', 'of', 'a', 'cord', '.'], ['The', 'E1LABEL', 'of', 'a', 'keygen', 'uses', 'a', 'E2LABEL', 'to', 'look', 'at', 'the', 'raw', 'assembly', 'code', '.']]


In [11]:
def lexical_features(org_words):
    context_words = []
    word_idxs = []
    count = 0
    for each in org_words:
        w1 = ''
        w2 = ''
        w3 = ''
        w4 = ''
        idx1 = 0
        idx2 = 0
        for i in range(len(each)):
            if each[i] == E1LABEL:
                w1 = each[i-1] if i-1>=0 else ''
                w2 = each[i+1] if i+1<len(each) else ''
                idx1 = i
            if each[i] == E2LABEL:
                w3 = each[i-1] if i-1>=0 else ''
                w4 = each[i+1] if i+1<len(each) else ''
                idx2 = i
        context_words.append((w1, w2, w3, w4))
        word_idxs.append((idx1, idx2))
        if count%100 == 0:
            print('\rfinished {}'.format(count), flush=True, end='')
        count += 1
    print('')
    return context_words, word_idxs

train_context,train_idxs = lexical_features(train_words)
test_context,test_idxs = lexical_features(test_words)

finished 7900
finished 2700


In [12]:
print(train_context[:3])
print(train_idxs[:3])

[('arrayed', 'of', 'antenna', '.'), ('The', 'was', 'the', 'by'), ('The', 'of', 'a', 'to')]
[(12, 15), (1, 9), (1, 7)]


In [13]:
def replace_word(org_list, org_word_list):
    new_list = []
    for i in range(len(org_list)):
        e1 = (nlp(org_list[i][0])[-1].text).lower()
        e2 = (nlp(org_list[i][1])[-1].text).lower()
        new_list.append((e1,e2))
        for j in range(len(org_word_list[i])):
            if org_word_list[i][j] == E1LABEL:
                org_word_list[i][j] = e1
            if org_word_list[i][j] == E2LABEL:
                org_word_list[i][j] = e2
            org_word_list[i][j] = org_word_list[i][j].lower()
        if i%100 == 0:
            print('\rfinished {}'.format(i), flush=True, end='')
    print('')
    return new_list

train_entities = replace_word(train_entities, train_words)
test_entities = replace_word(test_entities, test_words)
print(train_entities[:3])
print(train_words[:3])

finished 7900
finished 2700
[('configuration', 'elements'), ('child', 'cradle'), ('author', 'disassembler')]
[['the', 'system', 'as', 'described', 'above', 'has', 'its', 'greatest', 'application', 'in', 'an', 'arrayed', 'configuration', 'of', 'antenna', 'elements', '.'], ['the', 'child', 'was', 'carefully', 'wrapped', 'and', 'bound', 'into', 'the', 'cradle', 'by', 'means', 'of', 'a', 'cord', '.'], ['the', 'author', 'of', 'a', 'keygen', 'uses', 'a', 'disassembler', 'to', 'look', 'at', 'the', 'raw', 'assembly', 'code', '.']]


In [14]:
def get_hypernyms(org_entities):
    hypernyms = []
    for i in range(len(org_entities)):
        h1 = ''
        h2 = ''
        temp = wn.synsets(org_entities[i][0])
        if len(temp) > 0:
            t2 = temp[0].hypernyms()
            if len(t2) > 0:
                h1 = t2[0].lemmas()[0].name()
        temp = wn.synsets(org_entities[i][1])
        if len(temp) > 0:
            t2 = temp[0].hypernyms()
            if len(t2) > 0:
                h2 = t2[0].lemmas()[0].name()
        
        h1 = h1.split('_')[-1]
        h2 = h2.split('_')[-1]
        hypernyms.append((h1, h2))
        if i%100 == 0:
            print('\rfinished {}'.format(i), flush=True, end='')
    print('')
    return hypernyms

train_hypernyms = get_hypernyms(train_entities)
test_hypernyms = get_hypernyms(test_entities)
print(train_hypernyms[:3])

finished 7900
finished 2700
[('design', 'weather'), ('juvenile', 'bed'), ('communicator', '')]


In [15]:
unique_words = list(set([w for item in train_words+test_words for w in item]))
unique_words.append('<unknown_word>')
unique_words.append('<padding_word>')
unknown_idx = len(unique_words) - 2
padding_idx = len(unique_words) - 1

In [16]:
word2idx = {}
idx2word = {}
for i,w in enumerate(unique_words):
    word2idx[w] = i
    idx2word[i] = w
print(len(word2idx), len(idx2word))

22549 22549


In [17]:
not_in_count = 0
not_in_words = []
def get_word_embed(unique_words, word_dict):
    global not_in_count, not_in_words
    word_embed = np.zeros([WORD_DIM, len(unique_words)], dtype= 'float32')
    for i, w in enumerate(unique_words):
        if w not in word_dict:
            not_in_count += 1
            not_in_words.append(w)
        word_embed[:, i] = word_dict[w] if w in word_dict else np.random.normal(0,1,50)
    word_embed[:,-1] = np.zeros(50, dtype='float')
    return word_embed
word_embed = get_word_embed(unique_words, word_dict)
print(word_embed[:3,:3])
print(not_in_count, not_in_words[:3],len(unique_words))

[[-0.33719    -0.70544    -0.9214848 ]
 [-0.21405     0.53588    -0.6224373 ]
 [-0.75152     0.29921     0.40707925]]
3277 ['802.11a', 'ingots', 'karelian'] 22549


In [18]:
# example
    # sentence
    # entity_idx
    # entity_words
    # context_words
    # hypernyms
    # words
    # label
train_iter = zip(train_sentences, train_idxs, train_entities, train_context, train_hypernyms, train_words, train_labels)
test_iter = zip(test_sentences, test_idxs, test_entities, test_context, test_hypernyms, test_words, test_labels)


def transform_and_write_tf(data):
    tf_ex_ser = []
    for i,each in enumerate(data):
        sentence = each[0]
        idxs = [item for item in each[1]]
        lexical = each[2] + each[3] + each[4]
        words = each[5]
        label = each[6]
        lexical = [word2idx[item] if item in word2idx else unknown_idx for item in lexical]
        words = [word2idx[item] if item in word2idx else unknown_idx for item in words]
        f1 = tf.train.Feature(bytes_list=tf.train.BytesList(value=[sentence.encode('utf-8')]))
        f2 = tf.train.Feature(int64_list=tf.train.Int64List(value=idxs))
        f3 = tf.train.Feature(int64_list=tf.train.Int64List(value=lexical))
        f4 = tf.train.Feature(int64_list=tf.train.Int64List(value=words))
        f5 = tf.train.Feature(int64_list=tf.train.Int64List(value=[label]))
        feature = {
            'sentece': f1,
            'idxs': f2,
            'lexical': f3,
            'words':f4,
            'label':f5
        }

        example = tf.train.Example(features=tf.train.Features(feature=feature)).SerializeToString()
        tf_ex_ser.append(example)
        if (i+1) % 100 == 0:
            print('\rfinished {}'.format(i+1), end='', flush=True)
    print('')
    return tf_ex_ser
train_tf = transform_and_write_tf(train_iter)
test_tf = transform_and_write_tf(test_iter)
#     print(sentence)
#     print(idxs)
#     print(lexical)
#     print(words)


finished 8000
finished 2700


In [22]:
base_dir = './data/'

In [23]:
record_file = base_dir + 'train.tfrecords'
with tf.io.TFRecordWriter(record_file) as writer:
    for example in train_tf:
        writer.write(example)

record_file = base_dir + 'test.tfrecords'
with tf.io.TFRecordWriter(record_file) as writer:
    for example in test_tf:
        writer.write(example)
print('finished')

finished


In [24]:
import pickle
pickle.dump(word2idx, open(base_dir + 'word2idx.dict', 'wb'))
pickle.dump(idx2word, open(base_dir + 'idx2word.dict', 'wb'))
pickle.dump(lb2rel, open(base_dir + 'lb2rel.dict', 'wb'))
pickle.dump(rel2lb, open(base_dir + 'rel2lb.dict', 'wb'))
pickle.dump(word_embed, open(base_dir + 'word_embed','wb'))
pickle.dump(unique_relations, open(base_dir + 'unique_relations', 'wb'))

In [25]:
print()

['Entity-Destination(e1,e2)', 'Message-Topic(e1,e2)', 'Cause-Effect(e1,e2)', 'Content-Container(e2,e1)', 'Instrument-Agency(e1,e2)', 'Entity-Origin(e1,e2)', 'Content-Container(e1,e2)', 'Component-Whole(e2,e1)', 'Other', 'Product-Producer(e2,e1)', 'Entity-Origin(e2,e1)', 'Member-Collection(e1,e2)', 'Product-Producer(e1,e2)', 'Instrument-Agency(e2,e1)', 'Cause-Effect(e2,e1)', 'Message-Topic(e2,e1)', 'Entity-Destination(e2,e1)', 'Component-Whole(e1,e2)', 'Member-Collection(e2,e1)']
