In [1]:
from tqdm import tqdm
import tensorflow as tf
import numpy as np
import pprint
import os
import sys

sys.path.append(os.path.dirname(os.getcwd()))
from data import WN18

In [2]:
class Config:
    seed = 21
    n_epochs = 10
    batch_size = 100
    embed_dim = 200

In [3]:
"""
e: entity
s: subject
p: predicate
o: object
"""

def read_triples(path):
    triples = []
    with open(path, 'rt') as f:
        for line in f.readlines():
            s, p, o = line.split()
            triples += [(s.strip(), p.strip(), o.strip())]
    return triples


def load_triple():
    WN18.download()
    triples_tr = read_triples('../data/WN18/wn18/train.txt')
    triples_va = read_triples('../data/WN18/wn18/valid.txt')
    triples_te = read_triples('../data/WN18/wn18/test.txt')
    
    triples_all = triples_tr + triples_va + triples_te
    
    return triples_all, triples_tr, triples_va, triples_te


def build_vocab(triples):
    params = {}
    
    e_set = {s for (s, p, o) in triples} | {o for (s, p, o) in triples}
    p_set = {p for (s, p, o) in triples}

    params['e_vocab_size'] = len(e_set)
    params['p_vocab_size'] = len(p_set)
    params['embed_dim'] = Config.embed_dim

    e2idx = {e: idx for idx, e in enumerate(sorted(e_set))}
    p2idx = {p: idx for idx, p in enumerate(sorted(p_set))}
    
    return e2idx, p2idx, params


def build_train_data(triples_tr, e2idx, p2idx):
    x_s = np.array([e2idx[s] for (s, p, o) in triples_tr], dtype=np.int32)
    x_p = np.array([p2idx[p] for (s, p, o) in triples_tr], dtype=np.int32)
    x_o = np.array([e2idx[o] for (s, p, o) in triples_tr], dtype=np.int32)

    x = {'s': x_s,
         'p': x_p,
         'o': x_o}
    y = np.ones([len(x_s)], dtype=np.float32)
    
    return x, y


def train_input_fn(triples_tr, e2idx, p2idx, random_state, params):
    x, y = build_train_data(triples_tr, e2idx, p2idx)
    s, p, o = x['s'], x['p'], x['o']
    
    s_ = random_state.choice(params['e_vocab_size'], s.shape)
    o_ = random_state.choice(params['e_vocab_size'], o.shape)
    
    x_ = {
        's': np.concatenate([s, s_, s]),
        'p': np.concatenate([p, p, p]),
        'o': np.concatenate([o, o, o_])}
    y_ = np.concatenate([y, np.zeros([2*len(y)], dtype=np.float32)])
    
    return tf.estimator.inputs.numpy_input_fn(x = x_,
                                              y = y_,
                                              batch_size = Config.batch_size,
                                              num_epochs = 1,
                                              shuffle = True)

In [4]:
def s_next_batch(eval_triples,
                 e2idx,
                 p2idx,
                 nb_entities,
                 batch_size):
    for (s, p, o) in tqdm(eval_triples, total=len(eval_triples), ncols=70):
        s_idx, p_idx, o_idx = e2idx[s], p2idx[p], e2idx[o]
        xs = np.arange(nb_entities)
        xp = np.full(shape=(nb_entities,), fill_value=p_idx, dtype=np.int32)
        xo = np.full(shape=(nb_entities,), fill_value=o_idx, dtype=np.int32)
        for i in range(0, len(xs), batch_size):
            yield xs[i: i+batch_size], xp[i: i+batch_size], xo[i: i+batch_size]


def o_next_batch(eval_triples,
                 e2idx, 
                 p2idx,
                 nb_entities,
                 batch_size):
    for (s, p, o) in tqdm(eval_triples, total=len(eval_triples), ncols=70):
        s_idx, p_idx, o_idx = e2idx[s], p2idx[p], e2idx[o]
        xs = np.full(shape=(nb_entities,), fill_value=s_idx, dtype=np.int32)
        xp = np.full(shape=(nb_entities,), fill_value=p_idx, dtype=np.int32)
        xo = np.arange(nb_entities)
        for i in range(0, len(xs), batch_size):
            yield xs[i: i+batch_size], xp[i: i+batch_size], xo[i: i+batch_size]


def s_input_fn(eval_triples,
               e2idx, 
               p2idx,
               nb_entities,
               batch_size):
    dataset = tf.data.Dataset.from_generator(
        lambda: s_next_batch(eval_triples,
                             e2idx, 
                             p2idx,
                             nb_entities,
                             batch_size),
        (tf.int32, tf.int32, tf.int32),
        (tf.TensorShape([None,]),
         tf.TensorShape([None,]),
         tf.TensorShape([None,]),))
    iterator = dataset.make_one_shot_iterator()
    s, p, o = iterator.get_next()
    return {'s': s, 'p': p, 'o': o}


def o_input_fn(eval_triples,
               e2idx, 
               p2idx,
               nb_entities,
               batch_size):
    dataset = tf.data.Dataset.from_generator(
        lambda: o_next_batch(eval_triples,
                             e2idx, 
                             p2idx,
                             nb_entities,
                             batch_size),
        (tf.int32, tf.int32, tf.int32),
        (tf.TensorShape([None,]),
         tf.TensorShape([None,]),
         tf.TensorShape([None,]),))
    iterator = dataset.make_one_shot_iterator()
    s, p, o = iterator.get_next()
    return {'s': s, 'p': p, 'o': o}


def evaluate_rank(model,
                  valid_triples,
                  test_triples,
                  all_triples,
                  e2idx,
                  p2idx,
                  nb_entities,
                  batch_size):

    #for eval_name, eval_triples in [('valid', valid_triples), ('test', test_triples)]:
    for eval_name, eval_triples in [('test', test_triples)]:
        
        _scores_s = list(model.predict(
            lambda: s_input_fn(eval_triples,
                               e2idx, 
                               p2idx,
                               nb_entities,
                               batch_size), yield_single_examples=False))
        
        _scores_o = list(model.predict(
            lambda: o_input_fn(eval_triples,
                               e2idx, 
                               p2idx,
                               nb_entities,
                               batch_size), yield_single_examples=False))

        ScoresS = np.concatenate(_scores_s).reshape([len(eval_triples), nb_entities])
        ScoresO = np.concatenate(_scores_o).reshape([len(eval_triples), nb_entities])

        ranks_s, ranks_o = [], []
        filtered_ranks_s, filtered_ranks_o = [], []

        for ((s, p, o), scores_s, scores_o) in tqdm(zip(eval_triples,
                                                        ScoresS,
                                                        ScoresO),
                                                    total=len(eval_triples),
                                                    ncols=70):
            s_idx, p_idx, o_idx = e2idx[s], p2idx[p], e2idx[o]

            ranks_s += [1 + np.argsort(np.argsort(- scores_s))[s_idx]]
            ranks_o += [1 + np.argsort(np.argsort(- scores_o))[o_idx]]

            filtered_scores_s = scores_s.copy()
            filtered_scores_o = scores_o.copy()

            rm_idx_s = [e2idx[fs] for (fs, fp, fo) in all_triples if fs != s and fp == p and fo == o]
            rm_idx_o = [e2idx[fo] for (fs, fp, fo) in all_triples if fs == s and fp == p and fo != o]

            filtered_scores_s[rm_idx_s] = - np.inf
            filtered_scores_o[rm_idx_o] = - np.inf

            filtered_ranks_s += [1 + np.argsort(np.argsort(- filtered_scores_s))[s_idx]]
            filtered_ranks_o += [1 + np.argsort(np.argsort(- filtered_scores_o))[o_idx]]

        ranks = ranks_s + ranks_o
        filtered_ranks = filtered_ranks_s + filtered_ranks_o

        for setting_name, setting_ranks in [('Raw', ranks), ('Filtered', filtered_ranks)]:
            mean_rank = np.mean(1 / np.asarray(setting_ranks))
            print('[{}] {} MRR: {}'.format(eval_name, setting_name, mean_rank))
            for k in [1, 3, 5, 10]:
                hits_at_k = np.mean(np.asarray(setting_ranks) <= k) * 100
                print('[{}] {} Hits@{}: {}'.format(eval_name, setting_name, k, hits_at_k))

In [5]:
def forward(features, params):
    e_embed = tf.get_variable('e_embed',
                              [params['e_vocab_size'], params['embed_dim']],
                              initializer=tf.contrib.layers.xavier_initializer())
    p_embed = tf.get_variable('p_embed',
                              [params['p_vocab_size'], params['embed_dim']],
                              initializer=tf.contrib.layers.xavier_initializer())
    
    s = tf.nn.embedding_lookup(e_embed, features['s'])
    p = tf.nn.embedding_lookup(p_embed, features['p'])
    o = tf.nn.embedding_lookup(e_embed, features['o'])
    
    logits = tf.reduce_sum(s * p * o, axis=1)
    
    return logits
    
    
def model_fn(features, labels, mode, params):
    logits = forward(features, params)
    
    if mode == tf.estimator.ModeKeys.TRAIN:
        tf.logging.info('\n'+pprint.pformat(tf.trainable_variables()))
        tf.logging.info('params: %d'%count_train_params())
        
        loss_op = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits,
                                                                         labels=labels))
        
        train_op = tf.train.AdamOptimizer().minimize(
            loss_op, global_step=tf.train.get_global_step())
        
        return tf.estimator.EstimatorSpec(mode = mode,
                                          loss = loss_op,
                                          train_op = train_op)
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions = tf.sigmoid(logits))


def count_train_params():
    return np.sum([np.prod([d.value for d in v.get_shape()]) for v in tf.trainable_variables()])

In [6]:
random_state = np.random.RandomState(Config.seed)
triples_all, triples_tr, triples_va, triples_te = load_triple()
e2idx, p2idx, params = build_vocab(triples_tr)

model = tf.estimator.Estimator(model_fn,
                               params = params)

for _ in range(Config.n_epochs):
    model.train(train_input_fn(triples_tr,
                               e2idx,
                               p2idx,
                               random_state,
                               params))
evaluate_rank(model,
              triples_va,
              triples_te,
              triples_all,
              e2idx,
              p2idx,
              params['e_vocab_size'],
              batch_size = 10*Config.batch_size)

Files Already Downloaded
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmph9gee3t2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11cb85898>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:
[<tf.Variable 'e_embed:0' shape=(40943, 200) dtype=float32_ref>,
 <tf.Variable 'p_embed:0' shape=(18, 200) dtype=float32_ref>]
INFO:tensorflow:params: 8192200
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create Ch

INFO:tensorflow:global_step/sec: 23.3872
INFO:tensorflow:loss = 0.6726109, step = 6145 (4.276 sec)
INFO:tensorflow:global_step/sec: 23.7174
INFO:tensorflow:loss = 0.66494447, step = 6245 (4.216 sec)
INFO:tensorflow:global_step/sec: 23.4006
INFO:tensorflow:loss = 0.7015714, step = 6345 (4.274 sec)
INFO:tensorflow:global_step/sec: 23.6506
INFO:tensorflow:loss = 0.64571726, step = 6445 (4.228 sec)
INFO:tensorflow:global_step/sec: 23.5666
INFO:tensorflow:loss = 0.63766456, step = 6545 (4.243 sec)
INFO:tensorflow:global_step/sec: 23.1886
INFO:tensorflow:loss = 0.63489205, step = 6645 (4.313 sec)
INFO:tensorflow:global_step/sec: 19.247
INFO:tensorflow:loss = 0.60622185, step = 6745 (5.196 sec)
INFO:tensorflow:global_step/sec: 18.3756
INFO:tensorflow:loss = 0.35748413, step = 6845 (5.442 sec)
INFO:tensorflow:global_step/sec: 18.9533
INFO:tensorflow:loss = 0.2978887, step = 6945 (5.276 sec)
INFO:tensorflow:global_step/sec: 18.8434
INFO:tensorflow:loss = 0.25057703, step = 7045 (5.307 sec)
INFO

INFO:tensorflow:Saving checkpoints for 12733 into /var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmph9gee3t2/model.ckpt.
INFO:tensorflow:loss = 0.01704003, step = 12733
INFO:tensorflow:global_step/sec: 18.2516
INFO:tensorflow:loss = 0.018545132, step = 12833 (5.480 sec)
INFO:tensorflow:global_step/sec: 20.5244
INFO:tensorflow:loss = 0.017712018, step = 12933 (4.872 sec)
INFO:tensorflow:global_step/sec: 21.4456
INFO:tensorflow:loss = 0.01871293, step = 13033 (4.663 sec)
INFO:tensorflow:global_step/sec: 17.6825
INFO:tensorflow:loss = 0.015625024, step = 13133 (5.655 sec)
INFO:tensorflow:global_step/sec: 19.8977
INFO:tensorflow:loss = 0.015269113, step = 13233 (5.026 sec)
INFO:tensorflow:global_step/sec: 20.3148
INFO:tensorflow:loss = 0.31329942, step = 13333 (4.923 sec)
INFO:tensorflow:global_step/sec: 22.2266
INFO:tensorflow:loss = 0.2799238, step = 13433 (4.499 sec)
INFO:tensorflow:global_step/sec: 23.8025
INFO:tensorflow:loss = 0.29225558, step = 13533 (4.201 sec)
INFO:tensorflow:gl

INFO:tensorflow:global_step/sec: 21.6629
INFO:tensorflow:loss = 0.014801575, step = 19977 (4.616 sec)
INFO:tensorflow:global_step/sec: 23.3141
INFO:tensorflow:loss = 0.012151762, step = 20077 (4.289 sec)
INFO:tensorflow:global_step/sec: 22.1533
INFO:tensorflow:loss = 0.025867328, step = 20177 (4.514 sec)
INFO:tensorflow:global_step/sec: 21.2846
INFO:tensorflow:loss = 0.00982687, step = 20277 (4.698 sec)
INFO:tensorflow:global_step/sec: 18.8553
INFO:tensorflow:loss = 0.017311504, step = 20377 (5.304 sec)
INFO:tensorflow:global_step/sec: 17.2244
INFO:tensorflow:loss = 0.009361813, step = 20477 (5.806 sec)
INFO:tensorflow:global_step/sec: 18.6642
INFO:tensorflow:loss = 0.032713365, step = 20577 (5.358 sec)
INFO:tensorflow:global_step/sec: 17.7069
INFO:tensorflow:loss = 0.03641712, step = 20677 (5.647 sec)
INFO:tensorflow:global_step/sec: 20.3619
INFO:tensorflow:loss = 0.022188032, step = 20777 (4.911 sec)
INFO:tensorflow:global_step/sec: 20.7011
INFO:tensorflow:loss = 0.013359409, step = 

INFO:tensorflow:global_step/sec: 23.1391
INFO:tensorflow:loss = 0.0012555821, step = 26365 (4.322 sec)
INFO:tensorflow:global_step/sec: 23.2662
INFO:tensorflow:loss = 0.0017564258, step = 26465 (4.298 sec)
INFO:tensorflow:global_step/sec: 23.1823
INFO:tensorflow:loss = 0.042840205, step = 26565 (4.313 sec)
INFO:tensorflow:global_step/sec: 23.0005
INFO:tensorflow:loss = 0.0024591233, step = 26665 (4.348 sec)
INFO:tensorflow:global_step/sec: 23.3357
INFO:tensorflow:loss = 0.0022110469, step = 26765 (4.285 sec)
INFO:tensorflow:global_step/sec: 23.1628
INFO:tensorflow:loss = 0.0018130823, step = 26865 (4.318 sec)
INFO:tensorflow:global_step/sec: 23.2201
INFO:tensorflow:loss = 0.0019714388, step = 26965 (4.306 sec)
INFO:tensorflow:global_step/sec: 20.8812
INFO:tensorflow:loss = 0.0014880953, step = 27065 (4.789 sec)
INFO:tensorflow:global_step/sec: 21.5452
INFO:tensorflow:loss = 0.002382321, step = 27165 (4.641 sec)
INFO:tensorflow:global_step/sec: 21.7355
INFO:tensorflow:loss = 0.001108464

INFO:tensorflow:loss = 0.0001654242, step = 33509 (5.275 sec)
INFO:tensorflow:global_step/sec: 17.0391
INFO:tensorflow:loss = 0.0003932778, step = 33609 (5.869 sec)
INFO:tensorflow:global_step/sec: 18.0545
INFO:tensorflow:loss = 7.2048475e-05, step = 33709 (5.539 sec)
INFO:tensorflow:global_step/sec: 19.2269
INFO:tensorflow:loss = 0.00097653596, step = 33809 (5.201 sec)
INFO:tensorflow:global_step/sec: 18.798
INFO:tensorflow:loss = 0.0066415113, step = 33909 (5.320 sec)
INFO:tensorflow:Saving checkpoints for 33952 into /var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmph9gee3t2/model.ckpt.
INFO:tensorflow:Loss for final step: 0.0053888108.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:
[<tf.Variable 'e_embed:0' shape=(40943, 200) dtype=float32_ref>,
 <tf.Variable 'p_embed:0' shape=(18, 200) dtype=float32_ref>]
INFO:tensorflow:params: 8192200
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Resto

INFO:tensorflow:global_step/sec: 17.473
INFO:tensorflow:loss = 0.0003257164, step = 39897 (5.722 sec)
INFO:tensorflow:global_step/sec: 18.2158
INFO:tensorflow:loss = 0.00014598526, step = 39997 (5.490 sec)
INFO:tensorflow:global_step/sec: 17.3956
INFO:tensorflow:loss = 0.00010825575, step = 40097 (5.749 sec)
INFO:tensorflow:global_step/sec: 17.9661
INFO:tensorflow:loss = 0.00070927583, step = 40197 (5.566 sec)
INFO:tensorflow:global_step/sec: 19.0496
INFO:tensorflow:loss = 0.0034223069, step = 40297 (5.249 sec)
INFO:tensorflow:global_step/sec: 18.0324
INFO:tensorflow:loss = 0.00042293247, step = 40397 (5.545 sec)
INFO:tensorflow:global_step/sec: 19.5049
INFO:tensorflow:loss = 0.00044064978, step = 40497 (5.127 sec)
INFO:tensorflow:global_step/sec: 20.3157
INFO:tensorflow:loss = 0.0017988888, step = 40597 (4.922 sec)
INFO:tensorflow:global_step/sec: 23.4485
INFO:tensorflow:loss = 0.06859967, step = 40697 (4.265 sec)
INFO:tensorflow:global_step/sec: 23.1688
INFO:tensorflow:loss = 0.00037

100%|█████████████████████████████| 5000/5000 [02:16<00:00, 36.71it/s]


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmph9gee3t2/model.ckpt-42440
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


100%|█████████████████████████████| 5000/5000 [02:20<00:00, 35.56it/s]
100%|█████████████████████████████| 5000/5000 [04:02<00:00, 20.61it/s]


[test] Raw MRR: 0.5655667568354295
[test] Raw Hits@1: 42.699999999999996
[test] Raw Hits@3: 66.03999999999999
[test] Raw Hits@5: 74.53999999999999
[test] Raw Hits@10: 81.76
[test] Filtered MRR: 0.8155054696611849
[test] Filtered Hits@1: 70.89
[test] Filtered Hits@3: 92.0
[test] Filtered Hits@5: 93.33
[test] Filtered Hits@10: 94.16
