1. Run this file on a single CPU, a single GPU, and the Turing cluster using the command "python filename".
2. Save the model_dir and the output text for each run after training and testing.
3. Change the model_dir parameter for CPU/GPU runs, and use both DB100K and FB15K for now.
4. The output will be three model_dir folders and three output text files, one for each instance: CPU, GPU, and Turing.

In [1]:
from tqdm import tqdm
import tensorflow as tf

import numpy as np
import sklearn
import pprint
import itertools
import os
import time
import sys
import zipfile
# !pip install wget
import wget

# Make the parent of the current working directory importable.
sys.path.append(os.path.dirname(os.getcwd()))
# Show INFO-level TensorFlow log messages.
tf.logging.set_verbosity(tf.logging.INFO)
# Restrict this process to the CUDA device with index 1.
# NOTE(review): this is set after `import tensorflow`; presumably it still
# takes effect because no device has been initialised yet -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"



In [0]:
class Config:
    """Hyper-parameters, input-pipeline settings and paths used by this script."""

    # --- training ---
    n_epochs = 100
    batch_size = 128
    embed_dim = 200

    # --- tf.data pipeline ---
    prefetch_buffer_size = None
    shuffle_buffer_size = 2
    map_threads = 3

    # --- data location ---
    data_path = "data/"  # constant value
    # One of FB15K / WN18 / DB100K.
    # Figures noted by the original author (their meaning is not stated here):
    #   FB15K  272115, maxlen = 843
    #   DB100K 597572, maxlen = 85
    dataset_name = "FB15K"

    # Checkpoint directory; the run tag is "single_cpu" or "single_gpu" and is
    # edited by hand depending on where the script is executed.
    model_dir = "model/conve_{}_bs_{}_epochs_{}_embed_dim_{}_dataset_{}".format(
        "single_gpu", batch_size, n_epochs, embed_dim, dataset_name)

In [3]:
def download(data_path=Config.data_path, dataset_name=Config.dataset_name):
    """Download and unpack the archive for ``dataset_name`` unless already present.

    The dataset directory ``data_path + dataset_name`` is created if missing.
    When ``train.txt`` already exists inside it, nothing is downloaded and
    'Files Already Downloaded' is printed. Names other than "WN18", "FB15K"
    and "DB100K" only get their directory created (no download, no message).

    Args:
        data_path: Directory prefix (expected to end with a path separator)
            under which the dataset directory lives.
        dataset_name: One of "WN18", "FB15K" or "DB100K".
    """
    # dataset name -> (local archive file name, download URL)
    archives = {
        "WN18": ("WN18RR.zip",
                 "https://www.dropbox.com/s/sginaquks2xzv6o/WN18RR.zip?dl=1"),
        "FB15K": ("FB15K.zip",
                  "https://www.dropbox.com/s/kph0mbs79w8itw6/FB15K-237.zip?dl=1"),
        "DB100K": ("DB100K.zip",
                   "https://www.dropbox.com/s/fmbbh712ilx2zrc/DB100K.zip?dl=1"),
    }

    dataset_path = data_path + dataset_name
    if not os.path.exists(dataset_path):
        os.makedirs(dataset_path)
    dataset_path = dataset_path + '/'

    if dataset_name not in archives:
        return

    if os.path.isfile(dataset_path + 'train.txt'):
        print('Files Already Downloaded')
        return

    zip_name, url = archives[dataset_name]
    zip_path = dataset_path + zip_name
    wget.download(url, zip_path)
    try:
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(zip_path, 'r') as archive:
            archive.extractall(dataset_path)
    finally:
        # Always drop the archive so a failed extraction does not leave a
        # stale zip file behind for the next attempt.
        os.remove(zip_path)


download()

Files Already Downloaded


In [0]:
"""
e: entity
s: subject
p: predicate
o: object
"""
def glance_dict(d, n=5):
    """Return a new dict with the first ``n`` items of ``d`` in iteration order."""
    head = itertools.islice(d.items(), n)
    return {key: value for key, value in head}

def read_triples(path):
    """Read whitespace-separated ``subject predicate object`` lines from a file.

    Blank and whitespace-only lines are skipped, so a trailing empty line in
    the file does not abort loading.

    Args:
        path: Path of the text file to read.

    Returns:
        A list of ``(s, p, o)`` string tuples in file order.

    Raises:
        ValueError: If a non-blank line does not contain exactly three fields.
    """
    triples = []
    with open(path, 'rt') as f:
        # Iterate lazily instead of materialising every line with readlines().
        for line in f:
            fields = line.split()
            if not fields:
                continue
            # split() already strips surrounding whitespace from each field.
            s, p, o = fields
            triples.append((s, p, o))
    return triples

def load_triple():
    """Ensure the configured dataset is on disk and load its three splits.

    Returns:
        ``(triples_all, triples_tr, triples_va, triples_te)`` where
        ``triples_all`` is the concatenation train + valid + test.
    """
    download()
    base = "{}{}/".format(Config.data_path, Config.dataset_name)
    split_files = ('train.txt', 'valid.txt', 'test.txt')
    triples_tr, triples_va, triples_te = [read_triples(base + name)
                                          for name in split_files]
    return (triples_tr + triples_va + triples_te,
            triples_tr, triples_va, triples_te)


def build_vocab(triples):
    """Build entity and predicate index maps from ``(s, p, o)`` triples.

    Ids are assigned by position in the sorted order of the distinct names.

    Returns:
        ``(e2idx, p2idx, params)``; ``params`` carries ``'e_vocab_size'`` and
        ``'p_vocab_size'``.
    """
    entities, predicates = set(), set()
    for subj, pred, obj in triples:
        entities.update((subj, obj))
        predicates.add(pred)

    e2idx = dict(zip(sorted(entities), range(len(entities))))
    p2idx = dict(zip(sorted(predicates), range(len(predicates))))
    params = {
        'e_vocab_size': len(entities),
        'p_vocab_size': len(predicates),
    }
    return e2idx, p2idx, params


def build_multi_label(triples_tr, e2idx=None, p2idx=None):
    """Group object ids by ``(subject id, predicate id)``.

    Args:
        triples_tr: Iterable of ``(s, p, o)`` name triples.
        e2idx: Entity name -> id map. When omitted, the module-level
            ``e2idx`` is used (the behaviour existing callers rely on).
        p2idx: Predicate name -> id map. When omitted, the module-level
            ``p2idx`` is used.

    Returns:
        Dict mapping ``(s_id, p_id)`` to a list of distinct object ids in
        first-seen order.
    """
    if e2idx is None:
        e2idx = globals()['e2idx']
    if p2idx is None:
        p2idx = globals()['p2idx']

    sp2o = {}
    # Set-based de-duplication avoids a linear scan of the object list for
    # every triple; the values stay lists because callers index arrays with them.
    seen = set()
    for (_s, _p, _o) in triples_tr:
        key = (e2idx[_s], p2idx[_p], e2idx[_o])
        if key in seen:
            continue
        seen.add(key)
        sp2o.setdefault(key[:2], []).append(key[2])
    return sp2o
  
def build_multi_label_all(triples_all, e2idx=None, p2idx=None):
    """Group object ids by ``(subject id, predicate id)`` over all splits.

    Args:
        triples_all: Iterable of ``(s, p, o)`` name triples.
        e2idx: Entity name -> id map. When omitted, the module-level
            ``e2idx`` is used (the behaviour existing callers rely on).
        p2idx: Predicate name -> id map. When omitted, the module-level
            ``p2idx`` is used.

    Returns:
        Dict mapping ``(s_id, p_id)`` to a list of distinct object ids in
        first-seen order.
    """
    if e2idx is None:
        e2idx = globals()['e2idx']
    if p2idx is None:
        p2idx = globals()['p2idx']

    sp2o_all = {}
    # Set-based de-duplication avoids a linear scan of the object list for
    # every triple; the values stay lists because callers index arrays with them.
    seen = set()
    for (_s, _p, _o) in triples_all:
        key = (e2idx[_s], p2idx[_p], e2idx[_o])
        if key in seen:
            continue
        seen.add(key)
        sp2o_all.setdefault(key[:2], []).append(key[2])
    return sp2o_all


def get_train_y(triples_tr, e2idx, p2idx, sp2o):
    """Build multi-hot training targets for a batch of triples.

    Args:
        triples_tr: ``(s, p, o)`` name triples of the batch.
        e2idx: Entity name -> id map; its length is the target width.
        p2idx: Predicate name -> id map.
        sp2o: Map from ``(s_id, p_id)`` to the list of object ids to mark.

    Returns:
        Float array of shape ``(len(triples_tr), len(e2idx))`` with 1.0 at
        every object id listed in ``sp2o`` for the row's ``(s, p)`` and 0.0
        elsewhere. An empty batch yields shape ``(0, len(e2idx))``.
    """
    triples_tr = list(triples_tr)
    # Allocate the whole batch once instead of stacking per-row arrays.
    y = np.zeros([len(triples_tr), len(e2idx)])
    for row, (_s, _p, _o) in enumerate(triples_tr):
        y[row, sp2o[(e2idx[_s], p2idx[_p])]] = 1.
    return y

def get_eval_y(triples_te, e2idx, p2idx, sp2o_all):
    """Build evaluation targets (one-hot target plus filter mask) per triple.

    Args:
        triples_te: ``(s, p, o)`` name triples of the batch.
        e2idx: Entity name -> id map; its length is the target width.
        p2idx: Predicate name -> id map.
        sp2o_all: Map from ``(s_id, p_id)`` to every known object id.

    Returns:
        Float array of shape ``(len(triples_te), 2, len(e2idx))``:
        ``[:, 0]`` is one-hot at the triple's object id; ``[:, 1]`` is 1.0
        everywhere except -1.0 at the other object ids listed in
        ``sp2o_all`` for the row's ``(s, p)``. An empty batch yields shape
        ``(0, 2, len(e2idx))``.
    """
    triples_te = list(triples_te)
    y = np.empty([len(triples_te), 2, len(e2idx)])
    y[:, 0] = 0.
    y[:, 1] = 1.
    for row, (_s, _p, _o) in enumerate(triples_te):
        s, p, o = e2idx[_s], p2idx[_p], e2idx[_o]
        y[row, 1, sp2o_all[(s, p)]] = -1.
        # The triple's own object stays positive in both planes.
        y[row, 0, o] = 1.
        y[row, 1, o] = 1.
    return y
  
def get_features_labels(triples_tr, e2idx, p2idx, sp2o, entity_vocab_size):
    """Return ``[s_id, p_id]`` features and padded object-id label rows.

    Args:
        triples_tr: ``(s, p, o)`` name triples.
        e2idx: Entity name -> id map.
        p2idx: Predicate name -> id map.
        sp2o: Map from ``(s_id, p_id)`` to a list of object ids.
        entity_vocab_size: Number of entities; ``entity_vocab_size + 1`` is
            used as the padding id.

    Returns:
        ``(features, new_labels)``: ``features`` is a list of ``[s_id, p_id]``
        pairs; ``new_labels`` is an array of shape
        ``(len(triples_tr), longest label list)`` whose rows hold the object
        ids followed by the padding id. Empty input yields shape ``(0, 0)``.
    """
    features = [[e2idx[s], p2idx[p]] for (s, p, o) in triples_tr]
    labels = [sp2o[(s_id, p_id)] for (s_id, p_id) in features]
    # Use the argument rather than a module-level `params` dict so the
    # function works without that global being defined.
    pad_id = entity_vocab_size + 1
    max_len = max((len(objs) for objs in labels), default=0)
    new_labels = np.full([len(labels), max_len], pad_id)
    for row, objs in enumerate(labels):
        new_labels[row, :len(objs)] = objs
    return features, new_labels

In [0]:
def next_train_batch(triples_tr, e2idx, p2idx, sp2o):
    """Yield ``({'s': ids, 'p': ids}, targets)`` per batch of training triples.

    Batches are consecutive slices of ``Config.batch_size`` triples; the last
    one may be shorter. Targets come from ``get_train_y``.
    """
    step = Config.batch_size
    for start in range(0, len(triples_tr), step):
        chunk = triples_tr[start:start + step]
        subject_ids = np.asarray([e2idx[s] for s, _, _ in chunk], dtype=np.int32)
        predicate_ids = np.asarray([p2idx[p] for _, p, _ in chunk], dtype=np.int32)
        targets = get_train_y(chunk, e2idx, p2idx, sp2o)
        yield ({'s': subject_ids, 'p': predicate_ids}, targets)

def train_input_fn(triples_tr, e2idx, p2idx, sp2o):
    """Build the training ``tf.data.Dataset`` from pre-batched generator output.

    Elements come from ``next_train_batch`` and are therefore already
    batches; the pipeline shuffles them, repeats ``Config.n_epochs`` times
    and prefetches.
    """
    def batches():
        return next_train_batch(triples_tr, e2idx, p2idx, sp2o)

    feature_types = {'s': tf.int32, 'p': tf.int32}
    feature_shapes = {'s': tf.TensorShape([None]),
                      'p': tf.TensorShape([None])}
    label_shape = tf.TensorShape([None, len(e2idx)])

    dataset = tf.data.Dataset.from_generator(
        batches,
        (feature_types, tf.float32),
        (feature_shapes, label_shape))
    # No .batch() here: the generator already emits whole batches.
    return (dataset
            .shuffle(buffer_size=Config.shuffle_buffer_size)
            .repeat(Config.n_epochs)
            .prefetch(Config.prefetch_buffer_size))
  
def next_train_single(triples_tr, e2idx, p2idx, sp2o):
    """Yield one ``({'s': id, 'p': id}, multi-hot target)`` pair per triple.

    The target has length ``len(e2idx)`` with 1.0 at every object id listed
    in ``sp2o`` for the triple's ``(s, p)``.
    """
    n_entities = len(e2idx)
    for subj, pred, _ in triples_tr:
        s_id = e2idx[subj]
        p_id = p2idx[pred]
        target = np.zeros([n_entities])
        target[sp2o[(s_id, p_id)]] = 1.
        yield ({'s': s_id, 'p': p_id}, target)
        
def train_input_fn_single(triples_tr, e2idx, p2idx, sp2o):
    """Build the training ``tf.data.Dataset`` from per-example generator output.

    Elements come from ``next_train_single``; the pipeline shuffles them,
    repeats ``Config.n_epochs`` times, batches by ``Config.batch_size`` and
    prefetches.
    """
    def examples():
        return next_train_single(triples_tr, e2idx, p2idx, sp2o)

    feature_types = {'s': tf.int32, 'p': tf.int32}
    feature_shapes = {'s': tf.TensorShape([]), 'p': tf.TensorShape([])}
    label_shape = tf.TensorShape([len(e2idx)])

    dataset = tf.data.Dataset.from_generator(
        examples,
        (feature_types, tf.float32),
        (feature_shapes, label_shape))
    return (dataset
            .shuffle(buffer_size=Config.shuffle_buffer_size)
            .repeat(Config.n_epochs)
            .batch(Config.batch_size)
            .prefetch(Config.prefetch_buffer_size))

def next_eval_batch(triples_te, e2idx, p2idx, sp2o_all):
    """Yield ``({'s': ids, 'p': ids}, targets)`` per batch of evaluation triples.

    Batches are consecutive slices of ``Config.batch_size`` triples; the last
    one may be shorter. Targets come from ``get_eval_y``.
    """
    step = Config.batch_size
    for start in range(0, len(triples_te), step):
        chunk = triples_te[start:start + step]
        subject_ids = np.asarray([e2idx[s] for s, _, _ in chunk], dtype=np.int32)
        predicate_ids = np.asarray([p2idx[p] for _, p, _ in chunk], dtype=np.int32)
        targets = get_eval_y(chunk, e2idx, p2idx, sp2o_all)
        yield ({'s': subject_ids, 'p': predicate_ids}, targets)
  
def eval_input_fn(triples_te, e2idx, p2idx, sp2o_all):
    """Build the evaluation ``tf.data.Dataset`` (single pass, prefetch of 1).

    Elements come from ``next_eval_batch``; labels are declared with shape
    ``[None, 2, len(e2idx)]``.
    """
    def batches():
        return next_eval_batch(triples_te, e2idx, p2idx, sp2o_all)

    feature_types = {'s': tf.int32, 'p': tf.int32}
    feature_shapes = {'s': tf.TensorShape([None]),
                      'p': tf.TensorShape([None])}
    label_shape = tf.TensorShape([None, 2, len(e2idx)])

    dataset = tf.data.Dataset.from_generator(
        batches,
        (feature_types, tf.float32),
        (feature_shapes, label_shape))
    return dataset.prefetch(1)
  
def next_predict_batch(triples, e2idx, p2idx):
    """Yield ``{'s': ids, 'p': ids}`` feature dicts per batch of triples.

    Batches are consecutive slices of ``Config.batch_size`` triples; the last
    one may be shorter. No labels are produced.
    """
    step = Config.batch_size
    for start in range(0, len(triples), step):
        chunk = triples[start:start + step]
        subject_ids = np.asarray([e2idx[s] for s, _, _ in chunk], dtype=np.int32)
        predicate_ids = np.asarray([p2idx[p] for _, p, _ in chunk], dtype=np.int32)
        yield {'s': subject_ids, 'p': predicate_ids}
  
def predict_input_fn(triples, e2idx, p2idx):
    """Build the prediction ``tf.data.Dataset`` (features only, prefetch of 1).

    Elements come from ``next_predict_batch``.
    """
    def batches():
        return next_predict_batch(triples, e2idx, p2idx)

    feature_types = {'s': tf.int32, 'p': tf.int32}
    feature_shapes = {'s': tf.TensorShape([None]),
                      'p': tf.TensorShape([None])}

    dataset = tf.data.Dataset.from_generator(
        batches, feature_types, feature_shapes)
    return dataset.prefetch(1)

In [0]:

def tf_get_rank_order(input, targets, filtered):
    """Return the 1-based rank of each row's target entity, best score first.

    Args:
        input: Score tensor whose last axis indexes entities.
        targets: Tensor where ``targets[:, 0]`` marks the target entity (its
            argmax is used) and ``targets[:, 1]`` is a multiplicative mask.
        filtered: When true, scores are multiplied element-wise by
            ``targets[:, 1]`` before ranking.

    Returns:
        Integer tensor with one rank per row.
    """
    size = tf.shape(input)[-1]
    scores = tf.multiply(input, targets[:, 1]) if filtered else input

    # top_k over the negated scores lists entity ids from lowest to highest
    # score.
    ascending_ids = tf.nn.top_k(-scores, k=size)[1]
    # A second top_k over the negated ids inverts that permutation: entry e
    # is the 0-based position of entity e in the ascending ordering, so
    # `size - position` is its 1-based rank counted from the best score.
    indices_of_ranks = size - tf.nn.top_k(-ascending_ids, k=size)[1]

    indices_of_o = tf.cast(tf.argmax(targets[:, 0], axis=1), tf.int32)
    row_indices = tf.range(tf.shape(indices_of_o)[0])
    full_indices = tf.stack([row_indices, indices_of_o], axis=1)
    return tf.gather_nd(indices_of_ranks, full_indices)

def get_rank(logits, targets, filtered = False):
    """Return per-row target ranks; thin wrapper over ``tf_get_rank_order``."""
    return tf_get_rank_order(logits, targets, filtered)

In [0]:
def forward(features, mode, params):
    """Compute one logit per entity for each ``(s, p)`` pair in the batch.

    Subject and predicate embeddings are each reshaped to a
    ``10 x (Config.embed_dim // 10)`` single-channel image, stacked along the
    height axis, passed through three convolutions and a dense projection
    back to ``Config.embed_dim``, then multiplied with the transposed entity
    embedding table.

    Args:
        features: Dict with integer id tensors under ``'s'`` and ``'p'``.
        mode: A ``tf.estimator.ModeKeys`` value; batch normalisation and
            dropout run in training mode only for ``TRAIN``.
        params: Dict with ``'e_vocab_size'`` and ``'p_vocab_size'``.

    Returns:
        Logits tensor produced by the final matmul against ``e_embed``.

    Raises:
        ValueError: If ``Config.embed_dim`` is not a multiple of 10.
    """
    batch_sz = tf.shape(features['s'])[0]
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # Derive the image width from the embedding size instead of hard-coding
    # 10 x 20, which only matched embed_dim == 200.
    grid_h = 10
    grid_w, remainder = divmod(Config.embed_dim, grid_h)
    if remainder:
        raise ValueError(
            'Config.embed_dim must be a multiple of {}, got {}'.format(
                grid_h, Config.embed_dim))

    e_embed = tf.get_variable('e_embed',
                              [params['e_vocab_size'], Config.embed_dim],
                              initializer=tf.variance_scaling_initializer())
    p_embed = tf.get_variable('p_embed',
                              [params['p_vocab_size'], Config.embed_dim],
                              initializer=tf.variance_scaling_initializer())

    s = tf.nn.embedding_lookup(e_embed, features['s'])
    p = tf.nn.embedding_lookup(p_embed, features['p'])

    x = tf.concat([tf.reshape(s, [batch_sz, grid_h, grid_w, 1]),
                   tf.reshape(p, [batch_sz, grid_h, grid_w, 1])], axis=1)
    x = tf.layers.batch_normalization(x, training=is_training)
    x = tf.layers.dropout(x, 0.2, training=is_training)

    x = tf.layers.conv2d(x, 64, (5,5), activation=tf.nn.relu)
    x = tf.layers.batch_normalization(x, training=is_training)
    x = tf.layers.dropout(x, 0.2, training=is_training)

    x = tf.layers.conv2d(x, 32, (3,3), activation=tf.nn.relu)
    x = tf.layers.batch_normalization(x, training=is_training)
    x = tf.layers.dropout(x, 0.2, training=is_training)

    x = tf.layers.conv2d(x, 16, (3,3), activation=tf.nn.relu)
    x = tf.layers.flatten(x)
    x = tf.layers.batch_normalization(x, training=is_training)
    x = tf.layers.dropout(x, 0.2, training=is_training)

    x = tf.layers.dense(x, Config.embed_dim, tf.nn.relu)
    x = tf.layers.batch_normalization(x, training=is_training)
    x = tf.layers.dropout(x, 0.3, training=is_training)

    # Score every entity by dot product with its embedding row.
    logits = tf.matmul(x, e_embed, transpose_b=True)
    return logits
    
    
def model_fn(features, labels, mode, params):
    """Estimator model function covering TRAIN, PREDICT and EVAL.

    Args:
        features: Dict with integer id tensors under ``'s'`` and ``'p'``.
        labels: Target tensor, or None when the input pipeline yields
            features only (as ``predict_input_fn`` does). In EVAL,
            ``labels[:, 0]`` is the one-hot target used for the loss and the
            full tensor is passed to ``get_rank``.
        mode: A ``tf.estimator.ModeKeys`` value.
        params: Dict forwarded to ``forward``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the given mode.
    """
    # labels can be None, so its shape must not be dereferenced
    # unconditionally.
    label_shape = None if labels is None else labels.shape
    print("Shape of features: {} and labels: {}".format(features['s'].shape, label_shape))
    logits = forward(features, mode, params)

    if mode == tf.estimator.ModeKeys.TRAIN:
        tf.logging.info('\n'+pprint.pformat(tf.trainable_variables()))
        tf.logging.info('params: %d'%count_train_params())

        global_step = tf.train.get_global_step()

        loss_op = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits,
                                                                        labels=labels))

        # Run the collected UPDATE_OPS together with each optimiser step.
        with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            train_op = tf.train.AdamOptimizer().minimize(loss_op,
                                                         global_step = global_step)

        return tf.estimator.EstimatorSpec(mode = mode,
                                          loss = loss_op,
                                          train_op = train_op)

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode = mode, predictions = tf.sigmoid(logits))

    if mode == tf.estimator.ModeKeys.EVAL:
        # Compute the sigmoid once and share it between both rankings.
        scores = tf.sigmoid(logits)
        filtered_rank = tf.cast(get_rank(scores, labels, filtered = True), tf.float32)
        raw_rank = tf.cast(get_rank(scores, labels, filtered = False), tf.float32)
        # Metric names are read back by key after evaluation, so they are
        # kept exactly as they were.
        metrics = {
          'mean_rank_raw': tf.metrics.mean(raw_rank),
          'mean_rank_filtered': tf.metrics.mean(filtered_rank),
          'mean_reciprocal_rank_raw': tf.metrics.mean(1./raw_rank),
          'mean_reciprocal_rank_filtered': tf.metrics.mean(1./filtered_rank),
          'hits_at_1_raw' : tf.metrics.mean(raw_rank <= 1.),
          'hits_at_3_raw' : tf.metrics.mean(raw_rank <= 3.),
          'hits_at_5_raw' : tf.metrics.mean(raw_rank <= 5.),
          'hits_at_10_raw' : tf.metrics.mean(raw_rank <= 10.),
          'filtered_at_1_raw' : tf.metrics.mean(filtered_rank <= 1.),
          'filtered_at_3_raw' : tf.metrics.mean(filtered_rank <= 3.),
          'filtered_at_5_raw' : tf.metrics.mean(filtered_rank <= 5.),
          'filtered_at_10_raw' : tf.metrics.mean(filtered_rank <= 10.)
        }
        loss_op = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits,
                                                                        labels=labels[:,0]))
        return tf.estimator.EstimatorSpec(mode = mode,loss=loss_op, eval_metric_ops=metrics)

def count_train_params():
    """Return the total number of elements across all trainable variables."""
    sizes = []
    for var in tf.trainable_variables():
        dims = [dim.value for dim in var.get_shape()]
        sizes.append(np.prod(dims))
    return np.sum(sizes)

In [10]:
# Load the train/valid/test splits (downloading the dataset first if needed).
triples_all, triples_tr, triples_va, triples_te = load_triple()
# Index entities and predicates over every split; `params` holds the
# vocabulary sizes.
e2idx, p2idx, params = build_vocab(triples_all)
# (s, p) -> object ids seen in the training split.
sp2o = build_multi_label(triples_tr)
# (s, p) -> object ids seen in any split.
sp2o_all = build_multi_label_all(triples_all)
# features, labels = get_features_labels(triples_tr, e2idx, p2idx, sp2o,params['e_vocab_size'] )

Files Already Downloaded


In [13]:
# Build the Estimator; checkpoints are written to Config.model_dir.
config = tf.estimator.RunConfig(model_dir=Config.model_dir)
model = tf.estimator.Estimator(model_fn,
                               params = params,
                              config = config)
# `training_time` first holds the start timestamp and is overwritten with the
# elapsed seconds once training returns.
training_time = time.time()
tf.logging.info('Model Training Started at Time: {}'.format(time.time()))
# No step limit is passed here; the input pipeline repeats Config.n_epochs times.
model.train(lambda: train_input_fn(triples_tr, e2idx, p2idx, sp2o))
tf.logging.info('Model Training Ended at Time: {}'.format(time.time()))
training_time = time.time() - training_time 

INFO:tensorflow:Using config: {'_model_dir': 'model/conve_single_cpu_bs_128_epochs_1_embed_dim_200_dataset_FB15K', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ffa5a78ce10>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Model Training Started at Time: 1544179540.6903284
INFO:tensorflow:Calling model_fn.
Shape of features: (?,) and 

In [14]:
# Evaluate on the test split and log the raw and filtered ranking metrics.
tf.logging.info('Evaluation Started at Time: {}'.format(time.time()))
metrics = model.evaluate(lambda: eval_input_fn(triples_te, e2idx, p2idx, sp2o_all))
tf.logging.info('Evaluation Metrics: {}'.format(metrics))
tf.logging.info('Evaluation Loss:{}'.format(metrics['loss']))
# Raw metrics: ranks computed on the unmasked scores.
tf.logging.info('Raw: Mean Rank: {}\nRaw: Mean Reciprocal Rank: {}\nRaw: Hits at 1: {}\nRaw: Hits at 3: {}\nRaw: Hits at 5: {}\nRaw: Hits at 10: {}'.format(metrics['mean_rank_raw'],metrics['mean_reciprocal_rank_raw'],metrics['hits_at_1_raw'],metrics['hits_at_3_raw'],metrics['hits_at_5_raw'],metrics['hits_at_10_raw']))
# Filtered metrics: the 'filtered_at_k_raw' keys hold the filtered hits@k values.
tf.logging.info('Filtered: Mean Rank: {}\nFiltered: Mean Reciprocal Rank: {}\nFiltered: Hits at 1: {}\nFiltered: Hits at 3: {}\nFiltered: Hits at 5: {}\nFiltered: Hits at 10: {}'.format(metrics['mean_rank_filtered'],metrics['mean_reciprocal_rank_filtered'],metrics['filtered_at_1_raw'],metrics['filtered_at_3_raw'],metrics['filtered_at_5_raw'],metrics['filtered_at_10_raw']))
print(metrics)
tf.logging.info('Evaluation Ended at Time: {}'.format(time.time()))
# Elapsed training seconds measured by the training cell.
tf.logging.info('Training Time: {}'.format(training_time))

INFO:tensorflow:Evaluation Started at Time: 1544179635.0068936
INFO:tensorflow:Calling model_fn.
Shape of features: (?,) and labels: (?, 2, 14541)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-07-10:47:15
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from model/conve_single_cpu_bs_128_epochs_1_embed_dim_200_dataset_FB15K/model.ckpt-2126
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-07-10:47:22
INFO:tensorflow:Saving dict for global step 2126: filtered_at_10_raw = 0.31031954, filtered_at_1_raw = 0.15513535, filtered_at_3_raw = 0.22349262, filtered_at_5_raw = 0.25505716, global_step = 2126, hits_at_10_raw = 0.23580572, hits_at_1_raw = 0.10451481, hits_at_3_raw = 0.15313202, hits_at_5_raw = 0.18127626, loss = 9584.237, mean_rank_filtered = 1949.2825, mean_rank_raw = 1973.6218, mean_reciprocal_rank_filtered = 0.20679656, mean_reciprocal_ran

In [0]:
# Disabled code: the whole cell is wrapped in a string literal, so nothing
# below is executed. It is a NumPy-based ranking evaluation built on
# model.predict, reporting raw and filtered MR / MRR / Hits@k.
# NOTE(review): presumably superseded by the metrics computed inside
# model_fn's EVAL branch -- confirm before deleting.
'''def evaluate_rank(model, triples_va, triples_te, triples_all, e2idx, p2idx):
    for eval_name, eval_triples in [('test', triples_te)]:
        _scores_o = list(model.predict(lambda: predict_input_fn(eval_triples, e2idx, p2idx)))
        ScoresO = np.reshape(_scores_o, [len(eval_triples), len(e2idx)])
        ranks_o, filtered_ranks_o = [], []
        for ((s, p, o), scores_o) in tqdm(zip(eval_triples, ScoresO), total=len(eval_triples), ncols=70):
            s_idx, p_idx, o_idx = e2idx[s], p2idx[p], e2idx[o]
            ranks_o += [1 + np.argsort(np.argsort(- scores_o))[o_idx]]
            filtered_scores_o = scores_o.copy()
            rm_idx_o = [e2idx[fo] for (fs, fp, fo) in triples_all if fs == s and fp == p and fo != o]
            filtered_scores_o[rm_idx_o] = - np.inf
            filtered_ranks_o += [1 + np.argsort(np.argsort(- filtered_scores_o))[o_idx]]
        for setting_name, setting_ranks in [('Raw', ranks_o), ('Filtered', filtered_ranks_o)]:
            mean_rank = np.mean(np.asarray(setting_ranks))
            print('[{}] {} MR: {}'.format(eval_name, setting_name, mean_rank))
            mean_reciprocal_rank = np.mean(1 / np.asarray(setting_ranks))
            print('[{}] {} MRR: {}'.format(eval_name, setting_name, mean_reciprocal_rank))
            for k in [1, 3, 5, 10]:
                hits_at_k = np.mean(np.asarray(setting_ranks) <= k) * 100
                print('[{}] {} Hits@{}: {}'.format(eval_name, setting_name, k, hits_at_k))
                
evaluate_rank(model,triples_va, triples_te, triples_all, e2idx, p2idx,)'''