In [2]:
import pandas as pd
import json
import numpy as np
from tqdm import tqdm
import ampligraph
# import rdflib
# import itertools

#  https://docs.ampligraph.org/en/1.3.1/tutorials/AmpliGraphBasicsTutorial.html
#  https://docs.ampligraph.org/en/1.3.1/examples.html

In [3]:
# Restore the two triple collections from IPython's %store database.
# NOTE(review): presumably written by an upstream notebook via `%store triples_arr` /
# `%store unseen_triples_arr`; on a fresh machine these lines restore nothing.
%store -r triples_arr
%store -r unseen_triples_arr

In [4]:
# Short working aliases for the two restored triple collections.
X, unseen_triples = triples_arr, unseen_triples_arr

# Sanity check: how many known triples are available for the train/valid/test split.
print(len(X))

29339


In [5]:
#  Sanity check: how many candidate (unseen) triples will be scored further down
print(len(unseen_triples))

560


In [6]:
# Train / test / valid split
#
# Two successive hold-outs of 2400 triples each: first the test set is carved
# out of the full data, then the validation set out of what remains.

from ampligraph.evaluation import train_test_split_no_unseen

X_train_valid, X_test = train_test_split_no_unseen(X, test_size=2400)
X_train, X_valid = train_test_split_no_unseen(X_train_valid, test_size=2400)

# Report the (n_triples, 3) shape of every partition.
for label, split in (('Train set size: ', X_train),
                     ('Test set size: ', X_test),
                     ('Valid set size: ', X_valid)):
    print(label, split.shape)

Train set size:  (24539, 3)
Test set size:  (2400, 3)
Valid set size:  (2400, 3)


#  Train model

In [6]:
#  Define ComplEx model
#
# Hyperparameters are grouped by concern; keyword order has no effect on the model.

from ampligraph.latent_features import ComplEx

model = ComplEx(
    # reproducibility / logging
    seed=555,
    verbose=True,
    # training schedule
    epochs=100,
    batches_count=100,
    # embedding size and negative sampling
    k=200,
    eta=15,
    embedding_model_params={'negative_corruption_entities': 'all'},
    # objective and regularisation (LP with p=1)
    loss='multiclass_nll',
    regularizer='LP',
    regularizer_params={'p': 1, 'lambda': 1e-5},
    # weight initialisation (non-uniform Xavier)
    initializer='xavier',
    initializer_params={'uniform': False},
    # optimisation
    optimizer='adam',
    optimizer_params={'lr': 0.0005},
)


In [8]:
#  Fit model on the training data

import tensorflow as tf

# Only show TensorFlow errors while training. The logging module is reached
# through tf.compat.v1 because the bare `tf.logging` alias is deprecated in
# TF 1.14/1.15 (it prints a deprecation warning itself) and absent in TF 2.x.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Train for the full number of epochs configured on the model; the validation
# split is not used for early stopping here.
model.fit(X_train, early_stopping=False)

Average Loss:   0.108873: 100%|███████████████████████████████████████████████████| 100/100 [18:17<00:00, 10.98s/epoch]


In [23]:
# Persist the fitted model to disk so the training cell does not have to be re-run;
# the restore cell reads the same path.
from ampligraph.latent_features import save_model
save_model(model, './best_model.pkl')

In [8]:
# Reload the model written by save_model, replacing whatever `model` currently holds.
# NOTE(review): the .pkl suffix suggests a pickle file; only restore files you created yourself.
from ampligraph.latent_features import restore_model
model = restore_model('./best_model.pkl')

In [9]:
#  Sanity check: confirm the model in memory (trained or restored) has been fitted

print('The model is fit!' if model.is_fitted
      else 'The model is not fit! Did you skip a step?')

The model is fit!


# Evaluate Model

In [10]:
#  Evaluate performance on the test set

from ampligraph.evaluation import evaluate_performance

# All known triples from the three splits; passed as the filter so that
# corruptions which are themselves known triples are excluded from the ranking.
positives_filter = np.concatenate((X_train, X_test, X_valid))

ranks = evaluate_performance(
    X_test,
    model=model,
    filter_triples=positives_filter,
    use_default_protocol=True,  # corrupt subj and obj separately while evaluating
    verbose=True,
)



100%|██████████████████████████████████████████████████████████████████████████████| 2400/2400 [03:36<00:00, 11.06it/s]


In [11]:
#  See evaluation scores
#
# All metrics are derived from the test-set ranks: they are computed first and
# then printed together, each rounded to two decimals.

from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

mr = mr_score(ranks)
mrr = mrr_score(ranks)
hits_10 = hits_at_n_score(ranks, n=10)
hits_3 = hits_at_n_score(ranks, n=3)
hits_1 = hits_at_n_score(ranks, n=1)

for template, metric in (("MR: %.2f", mr),
                         ("MRR: %.2f", mrr),
                         ("Hits@10: %.2f", hits_10),
                         ("Hits@3: %.2f", hits_3),
                         ("Hits@1: %.2f", hits_1)):
    print(template % (metric))

MR: 279.91
MRR: 0.68
Hits@10: 0.75
Hits@3: 0.69
Hits@1: 0.64


# Predicting new links

In [11]:
# Candidate triples as a NumPy array, the form handed to the model/evaluation calls below.
X_unseen = np.array(unseen_triples)

In [12]:
# Filter for ranking the candidates: every known triple plus the candidates
# themselves, de-duplicated by passing the stacked rows through a set of tuples
# (row order is therefore arbitrary).
unseen_filter = np.array(list(set(map(tuple, np.vstack((positives_filter, X_unseen))))))

In [13]:
# Sanity check: number of candidate triples vs. number of distinct triples in the filter.
print(len(X_unseen))
print(len(unseen_filter)) 

560


17203

In [14]:
# Eyeball the candidate triples (subject, predicate, object URIs per row).
X_unseen

array([['https://data.cooperationdatabank.org/vocab/prop/religiousLevel_H1',
        'https://data.cooperationdatabank.org/vocab/prop/hasNegativeEffectOn',
        'https://data.cooperationdatabank.org/id/dependentvariable/cooperation'],
       ['https://data.cooperationdatabank.org/vocab/prop/religiousLevel_H1',
        'https://data.cooperationdatabank.org/vocab/prop/hasNegativeEffectOn',
        'https://data.cooperationdatabank.org/id/dependentvariable/contributions'],
       ['https://data.cooperationdatabank.org/vocab/prop/religiousLevel_H1',
        'https://data.cooperationdatabank.org/vocab/prop/hasPositiveEffectOn',
        'https://data.cooperationdatabank.org/id/dependentvariable/cooperation'],
       ...,
       ['https://data.cooperationdatabank.org/vocab/prop/feedbackTarget_H5',
        'https://data.cooperationdatabank.org/vocab/prop/hasNegativeEffectOn',
        'https://data.cooperationdatabank.org/id/dependentvariable/cooperation'],
       ['https://data.cooperationd

In [15]:
# Rank every candidate triple against its corruptions.
# The literature protocol is switched off and corrupt_side is 's+o'; per the
# AmpliGraph docs this yields a single rank per triple (the default protocol,
# 's,o', returns separate subject and object ranks instead). One rank per
# triple is what the rankings table built further down zips against.
ranks_unseen = evaluate_performance(
    X_unseen,
    model=model,
    verbose=True,
    use_default_protocol=False,
    corrupt_side='s+o',
    filter_triples=unseen_filter,  # known triples + candidates, built above
)

100%|████████████████████████████████████████████████████████████████████████████████| 560/560 [00:50<00:00, 11.04it/s]


In [16]:
# Raw (uncalibrated) model score for each candidate triple.
scores = model.predict(X_unseen)

In [17]:
#  Calibrate model on the valid set
#  Only positive triples (X_valid) are supplied, together with an assumed
#  positive base rate of 0.5.

model.calibrate(X_valid, positive_base_rate=0.5)

#  Calibrated probability for each candidate triple; must run after calibrate().
probas_pos_neg = model.predict_proba(X_unseen) 

#  https://docs.ampligraph.org/_/downloads/en/1.3.1/pdf/  See page 27 for explanation  

Calibration Loss:   0.300113: 100%|█████████████████████████████████████████████████| 50/50 [00:08<00:00,  5.65epoch/s]


array([0.8755646 , 0.8854853 , 0.9254524 , 0.55554205, 0.9106661 ,
       0.62879634, 0.34563106, 0.30714607, 0.3550626 , 0.8910928 ,
       0.94742215, 0.9231199 , 0.9579115 , 0.9417647 , 0.2027488 ,
       0.27007782, 0.18676949, 0.56751406, 0.8862363 , 0.8574461 ,
       0.55221003, 0.85639894, 0.84536326, 0.48096055, 0.8561038 ,
       0.7897005 , 0.82756066, 0.8542503 , 0.93498766, 0.79566747,
       0.8580024 , 0.8389073 , 0.9486566 , 0.83228755, 0.72864664,
       0.7125771 , 0.8751346 , 0.8856375 , 0.9244466 , 0.9309971 ,
       0.9685943 , 0.5015219 , 0.926574  , 0.63533884, 0.7027925 ,
       0.9183842 , 0.8463392 , 0.6616401 , 0.7130632 , 0.5811899 ,
       0.937739  , 0.73057175, 0.8227493 , 0.82922906, 0.9146973 ,
       0.8721961 , 0.77166   , 0.8505485 , 0.9181563 , 0.5793898 ,
       0.9337336 , 0.5619836 , 0.5727319 , 0.93231237, 0.7883134 ,
       0.67759067, 0.9256048 , 0.82329524, 0.93719447, 0.9806893 ,
       0.85308033, 0.8454279 , 0.97484994, 0.8609417 , 0.75913

In [18]:
# Logistic sigmoid of the raw scores, i.e. an uncalibrated squashing into (0, 1).
# NOTE(review): `probs` is not referenced by any other cell visible in this notebook.
from scipy.special import expit
probs = expit(scores)

In [19]:
# One record per candidate triple: the triple flattened into a single
# space-separated string, its rank, its raw score and its calibrated probability.
statements = [' '.join(triple) for triple in X_unseen]
records = list(zip(statements,
                   ranks_unseen,
                   np.squeeze(scores),
                   np.squeeze(probas_pos_neg)))

# Highest calibrated probability first.
rankings = pd.DataFrame(
    records,
    columns=['statement', 'rank', 'score', 'probas_pos_neg'],
).sort_values('probas_pos_neg', ascending=False)


In [22]:
# Widen the notebook display so the long statement URIs are not cut off and
# up to 350 rows are rendered.
# Both options are addressed by their full key: the bare 'max_rows' shorthand
# is treated as a regex and raises OptionError once it matches more than one
# option (e.g. 'styler.render.max_rows' in pandas >= 1.4).
pd.set_option('display.max_colwidth', 300)
pd.set_option('display.max_rows', 350)
# Renumber the rows 0..n-1 in their current order (the old index is dropped),
# then display the table.
rankings = rankings.reset_index(drop=True)
rankings

Unnamed: 0,statement,rank,score,probas_pos_neg
0,https://data.cooperationdatabank.org/vocab/prop/iteratedStrategy_H6 https://data.cooperationdatabank.org/vocab/prop/hasPositiveEffectOn https://data.cooperationdatabank.org/id/dependentvariable/cooperation,1,7.381446,0.981578
1,https://data.cooperationdatabank.org/vocab/prop/iteratedStrategy_H9 https://data.cooperationdatabank.org/vocab/prop/hasPositiveEffectOn https://data.cooperationdatabank.org/id/dependentvariable/cooperation,2,7.320162,0.980730
2,https://data.cooperationdatabank.org/vocab/prop/ethnicityUS_H1 https://data.cooperationdatabank.org/vocab/prop/hasPositiveEffectOn https://data.cooperationdatabank.org/id/dependentvariable/cooperation,2,7.317283,0.980689
3,https://data.cooperationdatabank.org/vocab/prop/punishmentTreatment_H3 https://data.cooperationdatabank.org/vocab/prop/hasNegativeEffectOn https://data.cooperationdatabank.org/id/dependentvariable/contributions,1,7.157279,0.978288
4,https://data.cooperationdatabank.org/vocab/prop/targetMembership_H5 https://data.cooperationdatabank.org/vocab/prop/hasNegativeEffectOn https://data.cooperationdatabank.org/id/dependentvariable/contributions,1,7.092446,0.977233
...,...,...,...,...
555,https://data.cooperationdatabank.org/vocab/prop/preferenceConditionalCooperation_H4 https://data.cooperationdatabank.org/vocab/prop/hasNegativeEffectOn https://data.cooperationdatabank.org/id/dependentvariable/cooperation,771,0.376581,0.220347
556,https://data.cooperationdatabank.org/vocab/prop/uncertaintyTarget_H1 https://data.cooperationdatabank.org/vocab/prop/hasNegativeEffectOn https://data.cooperationdatabank.org/id/dependentvariable/cooperation,1596,0.235451,0.202749
557,https://data.cooperationdatabank.org/vocab/prop/iteratedStrategy_H14 https://data.cooperationdatabank.org/vocab/prop/hasPositiveEffectOn https://data.cooperationdatabank.org/id/dependentvariable/contributions,1162,0.230638,0.202168
558,https://data.cooperationdatabank.org/vocab/prop/uncertaintyTarget_H1 https://data.cooperationdatabank.org/vocab/prop/hasPositiveEffectOn https://data.cooperationdatabank.org/id/dependentvariable/cooperation,3816,0.099160,0.186769


In [24]:
# Persist the ranking table under the name `rankings_H` in IPython's %store database
# so that other notebooks can load it with `%store -r rankings_H`.
rankings_H = rankings
%store rankings_H

Stored 'rankings_H' (DataFrame)
