In [1]:
import numpy as np

In [2]:
# Dataset splits. Later cells split each line on whitespace, read the second
# field as the relation label and treat everything from the third field on
# as the sentence tokens.
train_file_path = './files/train_attn_sp.txt'
val_file_path = './files/val_attn_sp.txt'
test_file_path = './files/test_attn_sp.txt'

# relation_file_path: one "<label name> <integer id>" pair per line.
# emb_google_txt: text-format word vectors (a header line, then one word
#   followed by 300 values per line).
# avg_vec_file: a single line of floats, used as the vector for words that
#   have no pre-trained embedding.
relation_file_path = './files/relations.txt'
emb_google_txt = './word_embeddings/GoogleNews-vectors-negative300.txt'
avg_vec_file = './word_embeddings/GoogleNews-vectors-negative300_avg_vec.txt'

# Destination handed to np.save in the last cell.
data_all_file_path = "./data/data_all"

# Order matters only for readability here: train, validation, test.
files = [train_file_path, val_file_path, test_file_path]


In [3]:
# Build both directions of the relation-label mapping from the relations file,
# where every line holds a label name followed by its integer id.
label_to_int = {}
int_to_label = {}

with open(relation_file_path, 'r') as f:
    for raw_line in f:
        fields = raw_line.strip().split()
        label_name, label_id = str(fields[0]), int(fields[1])
        label_to_int[label_name] = label_id
        int_to_label[label_id] = label_name

print(int_to_label)


{0: 'Other', 1: 'Message-Topic(e1,e2)', 2: 'Message-Topic(e2,e1)', 3: 'Product-Producer(e1,e2)', 4: 'Product-Producer(e2,e1)', 5: 'Instrument-Agency(e1,e2)', 6: 'Instrument-Agency(e2,e1)', 7: 'Entity-Destination(e1,e2)', 8: 'Entity-Destination(e2,e1)', 9: 'Cause-Effect(e1,e2)', 10: 'Cause-Effect(e2,e1)', 11: 'Component-Whole(e1,e2)', 12: 'Component-Whole(e2,e1)', 13: 'Entity-Origin(e1,e2)', 14: 'Entity-Origin(e2,e1)', 15: 'Member-Collection(e1,e2)', 16: 'Member-Collection(e2,e1)', 17: 'Content-Container(e1,e2)', 18: 'Content-Container(e2,e1)'}


In [4]:
# Vocabulary of every distinct token seen in the dataset splits; filled in
# place by create_words_dataset().
words_dataset = set()

def create_words_dataset(file_names=None, out_path='./data/words_dataset.txt', vocab=None):
    """Collect the distinct sentence tokens of the dataset and dump them to disk.

    Each input line is split on whitespace; the first two fields are skipped
    and every remaining field is added to the vocabulary. Splitting uses
    ``str.split()`` (any run of whitespace), the same tokenisation that
    ``create_matrices`` applies when it looks tokens up, so repeated spaces
    can no longer put an empty-string "word" into the vocabulary.

    Parameters
    ----------
    file_names : iterable of str, optional
        Files to scan. Defaults to the module-level ``files`` list.
    out_path : str, optional
        Where the sorted vocabulary is written, one token per line.
    vocab : set, optional
        Set that receives the tokens. Defaults to the module-level
        ``words_dataset`` set, which is updated in place.

    Returns
    -------
    None
        Results are delivered through ``vocab`` and the file at ``out_path``.
    """
    if file_names is None:
        file_names = files
    if vocab is None:
        vocab = words_dataset

    for f_name in file_names:
        # The context manager closes the file even if reading fails part-way.
        with open(f_name, 'r') as f:
            for line in f:
                vocab.update(line.split()[2:])

    print("len(words_dataset)", len(vocab))

    with open(out_path, 'w') as f:
        for w in sorted(vocab):
            f.write(str(w) + "\n")
        print('{} created'.format(out_path))
    
    
# Populate the module-level words_dataset set and write ./data/words_dataset.txt.
create_words_dataset()



len(words_dataset) 25655
./data/words_dataset.txt created


In [5]:

# Keep only the pre-trained vectors that the dataset vocabulary can use.
# A vector is stored under the exact word when that word is in the vocabulary;
# otherwise it is stored under the lower-cased word if that form is in the
# vocabulary and has no vector yet (an exact match seen later overwrites it).
word_to_emb = {}

with open(emb_google_txt, 'r', encoding='utf-8') as f:
    # The first line is a header, not a word vector.
    next(f, None)

    for line in f:
        line = line.strip().split()
        # A valid row is the word followed by exactly 300 values.
        if len(line) != 301:
            continue
        word = str(line[0])
        lowered = word.lower()

        if word in words_dataset:
            target = word
        elif lowered in words_dataset and lowered not in word_to_emb:
            target = lowered
        else:
            # Not needed by the dataset: skip before paying for the float
            # conversion and array allocation of 300 values.
            continue

        word_to_emb[target] = np.array([float(x) for x in line[1:]], dtype='float64')


In [6]:

def get_avg_vec(file_name):
    """Load a vector of floats from the first line of ``file_name``.

    The line is split on whitespace and every field is converted with
    ``float``. The shape of the resulting float64 array is printed and the
    array is returned. Any further lines in the file are ignored.
    """
    with open(file_name, 'r') as handle:
        fields = handle.readline().strip().split()

    avg_vec = np.array(list(map(float, fields)), dtype='float64')
    print("avg_vec.shape", avg_vec.shape)
    return avg_vec

# Fallback vector assigned (in the embedding-matrix cell) to words that have
# no entry in word_to_emb.
avg_vec = get_avg_vec(avg_vec_file)


avg_vec.shape (300,)


In [7]:
# Build the token -> row-index table and the matching embedding matrix.
# Row 0 is the all-zero padding vector; rows 1..N follow the sorted vocabulary.
word_to_int = {}
embedding = []

unknown_words = 0
word_to_int['PADDING'] = 0
embedding.append(np.zeros(300, dtype='float64'))

# The index comes from enumerate rather than len(word_to_int): if the literal
# token 'PADDING' ever appears in the vocabulary, re-assigning that existing
# key does not grow the dict, and a len()-based index would then be one short
# of the embedding row for every later word.
for row, w in enumerate(sorted(words_dataset), start=1):
    word_to_int[w] = row
    vec = word_to_emb.get(w)
    if vec is None:
        # Fall back to the lower-cased form before giving up on the word.
        vec = word_to_emb.get(w.lower())
    if vec is None:
        unknown_words += 1
        vec = avg_vec
    embedding.append(vec)

embedding = np.array(embedding, dtype='float64')
print("len(word_to_int)", len(word_to_int)) # 25656
print("embedding.shape", embedding.shape) # (25656, 300)
print("unknown_words", unknown_words) # 2652


len(word_to_int) 25656
embedding.shape (25656, 300)
unknown_words 2652


In [8]:
def get_max_sent_len(files):
    """Return the largest number of sentence tokens on any line of ``files``.

    Each line is split on whitespace and the first two fields are skipped;
    the remaining fields are counted as tokens. Splitting uses ``str.split()``
    (any run of whitespace), the same tokenisation ``create_matrices`` uses
    when filling its rows, so repeated spaces are not counted as extra
    (empty) tokens.

    Parameters
    ----------
    files : iterable of str
        Paths of the files to scan.

    Returns
    -------
    int
        The maximum token count, or 0 when there are no files or no tokens.
    """
    max_sent_len = 0
    for fname in files:
        # The context manager closes the file even if reading fails part-way.
        with open(fname, 'r') as f:
            for line in f:
                max_sent_len = max(max_sent_len, len(line.split()[2:]))
    return max_sent_len

# Longest sentence over all three splits; passed to create_matrices as the
# padded row width.
max_sent_len = get_max_sent_len(files)
print("max_sent_len", max_sent_len) # 102-1 = 101

max_sent_len 101


In [9]:
def create_matrices(file_name, word_to_int, label_to_int, max_sent_len):
    """Turn one dataset file into padded token-id and label-id arrays.

    Every non-blank line is split on whitespace: the second field is the
    label name and all fields from the third onward are the sentence tokens.

    Parameters
    ----------
    file_name : str
        Path of the dataset file to read.
    word_to_int : dict
        Maps each token to its integer id.
    label_to_int : dict
        Maps each label name to its integer id.
    max_sent_len : int
        Width of every row of ``X``; shorter sentences are right-padded
        with zeros.

    Returns
    -------
    list
        ``[X, Y]`` where ``X`` is an int32 array of shape
        ``(n_sentences, max_sent_len)`` and ``Y`` is an int32 array of shape
        ``(n_sentences,)``. An empty file yields shapes ``(0, max_sent_len)``
        and ``(0,)``.

    Raises
    ------
    KeyError
        If a token or label is missing from its mapping.
    IndexError
        If a line has fewer than two fields, or a sentence has more than
        ``max_sent_len`` tokens.
    """
    with open(file_name, 'r') as f:
        rows = [line.split() for line in f]
    # Blank lines (e.g. a trailing empty line) carry no sample.
    rows = [fields for fields in rows if fields]

    # Pre-allocating keeps X two-dimensional even when there are no rows.
    X = np.zeros((len(rows), max_sent_len), dtype='int32')
    Y = np.zeros(len(rows), dtype='int32')

    for row_idx, fields in enumerate(rows):
        Y[row_idx] = label_to_int[fields[1]]
        tokens = fields[2:]
        if len(tokens) > max_sent_len:
            raise IndexError(
                "sentence %d of %s has %d tokens, more than max_sent_len=%d"
                % (row_idx, file_name, len(tokens), max_sent_len)
            )
        for col, token in enumerate(tokens):
            X[row_idx, col] = word_to_int[token]

    return [X, Y]
    

In [10]:
# Encode the train, validation and test splits (in that order) with the
# shared vocabulary, label mapping and padded width.
train_set, val_set, test_set = [
    create_matrices(split_path, word_to_int, label_to_int, max_sent_len)
    for split_path in (train_file_path, val_file_path, test_file_path)
]

In [11]:
# Everything a downstream notebook needs: the three [X, Y] splits, the
# embedding matrix and both label mappings.
data_all = [train_set, val_set, test_set, embedding, label_to_int, int_to_label]

# data_all mixes lists, a 2-D array and dicts. Passing it straight to np.save
# makes NumPy try to build a regular array from a ragged sequence, which
# raises ValueError on NumPy >= 1.24. Packing the items one by one into an
# explicit object array keeps the same six-element layout on disk.
data_all_packed = np.empty(len(data_all), dtype=object)
for slot, item in enumerate(data_all):
    data_all_packed[slot] = item

# np.save appends ".npy" to the path; loading requires
# np.load(..., allow_pickle=True) because the elements are pickled objects.
np.save(data_all_file_path, data_all_packed, allow_pickle=True)
