In [19]:
import tensorflow as tf 

In [45]:
import ampligraph
import numpy as np
import pandas as pd
import tensorflow as tf
from ampligraph.datasets import load_fb15k_237
from ampligraph.evaluation import train_test_split_no_unseen, evaluate_performance, mr_score, mrr_score, hits_at_n_score
from ampligraph.discovery import query_topn, discover_facts, find_clusters
from ampligraph.latent_features import TransE, ComplEx, HolE, DistMult, ConvE, ConvKB
from ampligraph.utils import save_model, restore_model
from tqdm import tqdm

In [21]:
def display_aggregate_metrics(ranks):
    print('Mean Rank:', mr_score(ranks)) 
    print('Mean Reciprocal Rank:', mrr_score(ranks)) 
    print('Hits@1:', hits_at_n_score(ranks, 1))
    print('Hits@10:', hits_at_n_score(ranks, 10))
    print('Hits@100:', hits_at_n_score(ranks, 100))

print('Ampligraph version: {}'.format(ampligraph.__version__))

Ampligraph version: 1.4.0


In [26]:
dataset_new = pd.read_csv("../data/routes.csv")
dataset_new.columns = ['_start', '_end', '_type','_distance']
dataset_new = dataset_new.reindex(columns=['_start', '_type', '_end', '_distance'])
dataset_new.drop(dataset_new.index[dataset_new['_start'] == '_start'], inplace=True)
dataset_new.drop(columns=['_distance'], inplace=True)
# dataset_new = dataset_new.loc[dataset_new['_type'] != "HAS_ROUTE"]
dataset_new.head(10)
ent_to_id ,rel_to_idx = ampligraph.evaluation.create_mappings(dataset_new.to_numpy())
dataset_new = ampligraph.evaluation.to_idx(dataset_new.to_numpy(), rel_to_idx, ent_to_id)
test_train, X_valid = train_test_split_no_unseen(dataset_new, 1000, seed=0)

X_train, X_test = train_test_split_no_unseen(test_train, 2000, seed=0)

print('Total triples:', dataset_new.shape)
print('Size of train:', X_train.shape)
print('Size of valid:', X_valid.shape)
print('Size of test:', X_test.shape)

Total triples: (73954, 3)
Size of train: (70954, 3)
Size of valid: (1000, 3)
Size of test: (2000, 3)


In [27]:
X_train

array([[1640,    4, 8622],
       [ 203,    0, 1739],
       [1473,    0,   66],
       ...,
       [  57,    0, 1173],
       [1206,    4, 8625],
       [ 693,    0,  903]])

In [28]:
model = TransE(k=200,                                                             # embedding size
               epochs=300,                                                        # Num of epochs
               batches_count= 10,                                                 # Number of batches 
               eta=2,                                                             # number of corruptions to generate during training
               loss='pairwise', loss_params={'margin': 1},                        # loss type and it's hyperparameters         
               initializer='xavier', initializer_params={'uniform': False},       # initializer type and it's hyperparameters
               regularizer='LP', regularizer_params= {'lambda': 0.001, 'p': 3},   # regularizer along with its hyperparameters
               optimizer= 'adam', optimizer_params= {'lr': 0.001},                # optimizer to use along with its                
               seed= 0, verbose=True)

model.fit(X_train)

X_filter = np.concatenate([X_train, X_valid, X_test], 0)

ranks = evaluate_performance(X_test, 
                             model=model, 
                             filter_triples=X_filter, filter_unseen=True)

save_model(model, 'test/TransE.pkl')
display_aggregate_metrics(ranks)

Average TransE Loss:   0.023726: 100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [02:14<00:00,  2.24epoch/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:24<00:00, 80.16it/s]


Mean Rank: 163.081
Mean Reciprocal Rank: 0.18552176422303676
Hits@1: 0.0005
Hits@10: 0.542
Hits@100: 0.8315


In [77]:
model = ComplEx(k=200, epochs=300, eta=1, loss='multiclass_nll', 
                initializer='xavier', initializer_params={'uniform': False},
                regularizer='LP', regularizer_params= {'lambda': 0.0001, 'p': 3},

                
                
                
                seed= 0, batches_count= 1, verbose=True)
model.fit(X_train)

X_filter = np.concatenate([X_train, X_valid, X_test], 0)

ranks = evaluate_performance(X_test, 
                             model=model, 
                             filter_triples=X_filter)

save_model(model, 'test/ComplEx.pkl')
display_aggregate_metrics(ranks)




  0%|                                                                                                                           | 0/300 [00:00<?, ?epoch/s]2023-01-08 15:39:59.197072: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 113526400 exceeds 10% of system memory.
2023-01-08 15:39:59.197197: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 113526400 exceeds 10% of system memory.
2023-01-08 15:39:59.197217: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 113526400 exceeds 10% of system memory.
2023-01-08 15:39:59.249474: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 113526400 exceeds 10% of system memory.
2023-01-08 15:39:59.249788: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 113526400 exceeds 10% of system memory.
Average ComplEx Loss:   0.110729: 100%|███████████████████████████████████████████████████████████████████████████████| 300/300 [09:56<00:00,  1.99s/epoch]
100%|████

Mean Rank: 213.251
Mean Reciprocal Rank: 0.2387062421129227
Hits@1: 0.1735
Hits@10: 0.3635
Hits@100: 0.67525





In [78]:
model = DistMult(k=200, epochs=300, eta=1, loss='multiclass_nll', 
                initializer='xavier', initializer_params={'uniform': False},
                regularizer='LP', regularizer_params= {'lambda': 0.0001, 'p': 3},

                
                
                
                seed= 0, batches_count= 1, verbose=True)
model.fit(X_train)

X_filter = np.concatenate([X_train, X_valid, X_test], 0)

ranks = evaluate_performance(X_test, 
                             model=model, 
                             filter_triples=X_filter)

save_model(model, 'test/DistMult.pkl')
display_aggregate_metrics(ranks)



Average DistMult Loss:   0.202328: 100%|██████████████████████████████████████████████████████████████████████████████| 300/300 [02:03<00:00,  2.43epoch/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:22<00:00, 88.22it/s]


Mean Rank: 230.1245
Mean Reciprocal Rank: 0.2541328064527293
Hits@1: 0.191
Hits@10: 0.3725
Hits@100: 0.65575


In [None]:
model = ConvKB(k=200, epochs=100, eta=1, loss='multiclass_nll', 
                initializer='xavier', initializer_params={'uniform': False},
                regularizer='LP', regularizer_params= {'lambda': 0.0001, 'p': 3},
                optimizer= 'adam', optimizer_params= {'lr': 0.001}, 
                seed= 0, 
                batches_count= 5, # Goes OOM (ResourceExhaustedError) if batch count is 1
                verbose=True)
model.fit(X_train)

X_filter = np.concatenate([X_train, X_valid, X_test], 0)

ranks = evaluate_performance(X_test, 
                             model=model, 
                             filter_triples=X_filter)

save_model(model, 'test/ConvKB.pkl')
display_aggregate_metrics(ranks)

Average ConvKB Loss:   0.031474: 100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [17:21<00:00, 10.42s/epoch]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [25:13<00:00,  1.32it/s]

Mean Rank: 236.736
Mean Reciprocal Rank: 0.22788252786336363
Hits@1: 0.13525
Hits@10: 0.405
Hits@100: 0.74075





In [86]:
def predict_scores_train(model_location):
    model = restore_model(model_location)
    data_to_test = pd.DataFrame(X_valid, columns = ['_start', '_type', '_end'])
    uniuqe_predicate = data_to_test._type.unique()
    score = 0
    for i in tqdm(range(len(data_to_test))):
        score +=  model.predict([data_to_test.loc[i,"_start"],data_to_test.loc[i,"_type"],data_to_test.loc[i,"_end"]])
    print(score / len(data_to_test))

In [87]:
def predict_scores(model_location):
    model = restore_model(model_location)
    data_to_test = pd.DataFrame(X_test, columns = ['_start', '_type', '_end'])
    uniuqe_predicate = data_to_test._type.unique()
    score = 0
    for i in tqdm(range(len(data_to_test))):
        score +=  model.predict([data_to_test.loc[i,"_start"],data_to_test.loc[i,"_type"],data_to_test.loc[i,"_end"]])
    print(score / len(data_to_test))
            
                                                                                      



In [89]:
predict_scores("test/TransE.pkl")
print("train")
predict_scores_train("test/TransE.pkl")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:35<00:00, 20.98it/s]


[-9.825143]
train


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:47<00:00, 21.26it/s]

[-9.8588705]





In [90]:
predict_scores("test/ComplEx.pkl")
print("train")
predict_scores_train("test/ComplEx.pkl")



100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:27<00:00, 13.56it/s]

[1.8920759]
train



100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:13<00:00, 13.61it/s]

[1.8427835]





In [91]:
predict_scores("test/DistMult.pkl")
print("train")
predict_scores_train("test/DistMult.pkl")



100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:30<00:00, 22.06it/s]

[1.3430394]
train



100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:45<00:00, 21.99it/s]

[1.321544]





In [92]:
predict_scores("test/ConvKB.pkl")
print("train")
predict_scores_train("test/ConvKB.pkl")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:28<00:00, 13.45it/s]


6.132646740171127
train


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:14<00:00, 13.50it/s]

6.0166484672315415



