In [9]:
import json
import random

import numpy as np
from numpy import mean
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold

from config import VCSLAM
from util.parse import generate_dictionaries, generate_id_dict, encode_triples

In [10]:
dataset = "VC-SLAM"

vcslam = VCSLAM()
with open(vcslam.PARSED_MODELS_PATH, 'r') as file:
    vcslam_models = json.load(file)

models = vcslam_models
template = vcslam_models
triples = []
for model in models[:]:
    for triple in model:
        triples.append(tuple(triple))

classes, predicates = generate_dictionaries(triples)
classes_mapping = generate_id_dict(classes)
predicates_mapping = generate_id_dict(predicates)


In [11]:
len(triples)

2560

In [12]:


# triples = modrel.reduce_relations(triples, targets)

#test_triples = random.sample(triples, floor(len(triples) / 10))
# test_triples = [triple for triple in test_triples if triple[0] == 'http://schema.org/Offer' and triple[2] == str(XSD.string)]



Statistics Recommender Baseline

In [13]:
from modelextension.statistics_recommender import StatisticsRecommender as SR
from util.metrics import calc_hits_mrr
from util.utilities import prepare_data

_, _, X_test, y_test = prepare_data(template, c_map=classes_mapping,
                    p_map=predicates_mapping, shuffle=False, multiply=1, generate_test=True)

encoded_triples = encode_triples(triples, classes_mapping, predicates_mapping)
sr = SR(triples=encoded_triples)

pred = sr.predict_links(X_test)
hits_mrr = calc_hits_mrr( pred, y_test)
print(f'SR {dataset} MRR {hits_mrr["mrr"]}, Hits@1 {hits_mrr["hits@1"]}, Hits@3 {hits_mrr["hits@3"]}')

SR VC-SLAM MRR 0.98046875, Hits@1 0.9609375, Hits@3 1.0


In [14]:
combinations = []
for i in range(len(classes_mapping.items())):
    for j in range(len(classes_mapping.items())):
        combinations.append((i,j))

# comb_pred = sr.predict_links(combinations)

In [15]:
# choices = [np.count_nonzero(prediction) for prediction in comb_pred]

In [16]:
# counts = [count for count in choices if count > 0]
# counts

Train RFC

In [17]:
%%capture
from util.metrics import calc_hits_mrr
from util.utilities import prepare_data

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)

X, y, X_test, y_test = prepare_data(models = template, multiply=3, c_map=classes_mapping,
                    p_map=predicates_mapping, generate_test=True)
rfc = RandomForestClassifier(n_estimators=10, max_depth=20)
rfc_model = rfc.fit(X, y)

In [18]:
mrrs = []
hits_1 = []
hits_3 = []
pred = rfc_model.predict_proba(X_test)
hits_mrr = calc_hits_mrr(pred, y_test)
hits_1.append(hits_mrr['hits@1'])
hits_3.append(hits_mrr['hits@3'])
mrrs.append(hits_mrr['mrr'])

print(f'RFC {dataset} 10 20 | {mean(mrrs):.3f} {mean(hits_1):.3f} {mean(hits_3):.3f}')


RFC VC-SLAM 10 20 | 0.961 0.927 0.993


Train RGCN

In [19]:
import torch
from linkprediction import utils
from linkprediction.rgcn import LinkPredict, node_norm_to_edge_norm

train_triples = triples
test_triples = random.sample(triples, int(len(triples)/10))
print("training triples size", len(train_triples))

train_data = encode_triples(train_triples, classes_mapping, predicates_mapping)
valid_data = encode_triples(test_triples, classes_mapping, predicates_mapping)

# load graph data
num_nodes = len(classes)
num_rels = len(predicates)

# create model
rgcn_model = LinkPredict(in_dim=num_nodes,
                         h_dim=100,
                         num_rels=num_rels,
                         num_bases=10,
                         num_hidden_layers=2,
                         dropout=0.1,
                         use_cuda=False,
                         reg_param=0.01)

# validation and testing triplets
valid_data = torch.LongTensor(valid_data)
test_data = torch.LongTensor(valid_data)

# build test graph
test_graph, test_rel, test_norm = utils.build_test_graph(
    num_nodes, num_rels, train_data)
test_deg = test_graph.in_degrees(
    range(test_graph.number_of_nodes())).float().view(-1, 1)
test_node_id = torch.arange(0, num_nodes, dtype=torch.long).view(-1, 1)
test_rel = torch.from_numpy(test_rel)
test_norm = node_norm_to_edge_norm(test_graph, torch.from_numpy(test_norm).view(-1, 1))

# build adj list and calculate degrees for sampling
adj_list, degrees = utils.get_adj_and_degrees(num_nodes, train_data)

# optimizer
optimizer = torch.optim.Adam(rgcn_model.parameters(), lr=0.001)

forward_time = []
backward_time = []

# training loop
# print("start training...")

epoch = 0
best_mrr = 0
best_hits3 = 0
checkpoint = None
while True:
    rgcn_model.train()
    epoch += 1

    # perform edge neighborhood sampling to generate training graph and data
    g, node_id, edge_type, node_norm, data, labels = \
        utils.generate_sampled_graph_and_labels(
            train_data, 20, 0.3,
            num_rels, adj_list, degrees, 5,
            "neighbor")
    # print("Done edge sampling")

    # set node/edge feature
    node_id = torch.from_numpy(node_id).view(-1, 1).long()
    edge_type = torch.from_numpy(edge_type)
    edge_norm = node_norm_to_edge_norm(g, torch.from_numpy(node_norm).view(-1, 1))
    data, labels = torch.from_numpy(data), torch.from_numpy(labels)
    deg = g.in_degrees(range(g.number_of_nodes())).float().view(-1, 1)

    embed = rgcn_model(g, node_id, edge_type, edge_norm)
    loss = rgcn_model.get_loss(g, embed, data, labels)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(rgcn_model.parameters(), 1.0)  # clip gradients
    optimizer.step()
    optimizer.zero_grad()

    # validation
    if epoch % 100 == 0:
        rgcn_model.eval()
        # print("start eval")
        embed = rgcn_model(test_graph, test_node_id, test_rel, test_norm)
        rgcn_results = utils.calc_mrr(embed, rgcn_model.w_relation, torch.LongTensor(train_data),
                                      valid_data, test_data, hits=[1, 3], eval_bz=100,
                                      eval_p="filtered")
        mrr = rgcn_results['mrr']
        hits3 = rgcn_results['hits@3']
        hits1 = rgcn_results['hits@1']
        print(f"epoch {epoch} MRR {mrr:.2f} hits@1 {hits1:.2f} hits@3 {hits3:.2f}", end="")
        if best_hits3 < hits3:
            best_hits3 = hits3
        if best_mrr <= mrr:
            best_mrr = mrr
            checkpoint = {'state_dict': rgcn_model.state_dict(), 'epoch': epoch}
            print(f"*** | ", end="")

        else:
            print(f" | ", end="")
        # if hits3 == 1:
        #    break

        if epoch >= 8000:
            break


Using backend: pytorch


training triples size 2560


  norm = 1.0 / in_deg


epoch 100 MRR 0.05 hits@1 0.01 hits@3 0.06*** | epoch 200 MRR 0.15 hits@1 0.09 hits@3 0.18*** | epoch 300 MRR 0.20 hits@1 0.12 hits@3 0.23*** | epoch 400 MRR 0.25 hits@1 0.18 hits@3 0.27*** | epoch 500 MRR 0.28 hits@1 0.22 hits@3 0.30*** | epoch 600 MRR 0.31 hits@1 0.24 hits@3 0.33*** | epoch 700 MRR 0.34 hits@1 0.26 hits@3 0.36*** | epoch 800 MRR 0.36 hits@1 0.29 hits@3 0.38*** | epoch 900 MRR 0.38 hits@1 0.30 hits@3 0.41*** | epoch 1000 MRR 0.40 hits@1 0.31 hits@3 0.42*** | epoch 1100 MRR 0.42 hits@1 0.33 hits@3 0.46*** | epoch 1200 MRR 0.43 hits@1 0.34 hits@3 0.47*** | epoch 1300 MRR 0.46 hits@1 0.37 hits@3 0.49*** | epoch 1400 MRR 0.47 hits@1 0.38 hits@3 0.51*** | epoch 1500 MRR 0.47 hits@1 0.37 hits@3 0.50*** | epoch 1600 MRR 0.49 hits@1 0.38 hits@3 0.54*** | epoch 1700 MRR 0.51 hits@1 0.40 hits@3 0.57*** | epoch 1800 MRR 0.50 hits@1 0.39 hits@3 0.54 | epoch 1900 MRR 0.53 hits@1 0.41 hits@3 0.58*** | epoch 2000 MRR 0.54 hits@1 0.44 hits@3 0.59*** | epoch 2100 MRR 0.55 hits@1 0.44 

In [20]:
# use best model checkpoint
print("Using best epoch: {}".format(checkpoint['epoch']))
rgcn_model.eval()
rgcn_model.load_state_dict(checkpoint['state_dict'])
rgcn_embed = rgcn_model(test_graph, test_node_id, test_rel, test_norm)
rgcn_results = utils.calc_mrr(rgcn_embed, rgcn_model.w_relation, torch.LongTensor(train_data), valid_data,
                              test_data, hits=[1, 3], eval_bz=100, eval_p="filtered")
mrr = rgcn_results['mrr']
hits3 = rgcn_results['hits@3']
hits1 = rgcn_results['hits@1']
print(f"RGCN {dataset} MRR {mrr} hits@1 {hits1} hits@3 {hits3}")

Using best epoch: 7700
RGCN VC-SLAM MRR 0.7463410496711731 hits@1 0.62890625 hits@3 0.833984375


Test ME recommendation

In [27]:
with open("modelextensions_single.json","r") as file:
    setups = json.load(fp=file)

In [22]:
with open("modelextensions_filtered.json","r") as file:
    setups = json.load(fp=file)

In [28]:
limit = len(setups)
limit

128

ME + SR

In [29]:
from util.parse import find_relation
from util.metrics import rank

modes = [("oracle",0),("neighbors",0),("similar",30)
    #, ("all", 0)
]

for mode in modes:
    key = mode[0]
    me_weight =mode[1] / 100
    w_me = me_weight
    w_lp = 1-w_me
    sr_ranks = []
    for setup in setups[:limit]:
        anchor = setup['anchor']
        target = setup['target']
        tuples = [[classes_mapping[anchor], classes_mapping[obj[0]]] for obj in setup[key]]
        #print(tuples)
        pred = sr.predict_links(tuples)
        #print(pred.shape)
        predictions = np.argsort(pred)[:, ::-1]
        predictions = predictions[:,:10]
        #print(valid, predicates_mapping[valid[0][1]], predictions)
        probs = [[pred[index][value[i]] for i in range(10)] for  index, value in enumerate(predictions) ]
        #print(probs)
        sr_result = []
        for i in range(len(probs)):
            score_me = setup[key][i][1]
            obj = setup[key][i][0]
            # print(obj, score_me)
            for j in range(predictions.shape[1]):
                score_lp = probs[i][j]
                if score_lp >= 0:
                    pred = find_relation(predicates_mapping=predicates_mapping, i=predictions[i][j])
                    sr_result.append(((anchor, pred, obj), w_me * score_me + w_lp * score_lp))

        #print(sr_result)
        sr_result = sorted(sr_result, key=lambda x:x[1], reverse=True)
        #print(result[:5])

        r = rank(sr_result, target)
        #print(r)
        sr_ranks.append(r)

    # print(len(sr_ranks))
    sr_stats = {}
    sr_stats['mrr'] = np.mean([1 / rank for rank in sr_ranks])
    for value in [1,3]:
        hits_at_value = [1 if pos <= value else 0 for pos in sr_ranks]
        sr_stats['hits@' + str(value)] = np.sum(hits_at_value) / len(hits_at_value)

    print("SR", mode,"\n", sr_ranks, "\n", sr_stats)

SR ('oracle', 0) 
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
 {'mrr': 0.9830729166666666, 'hits@1': 0.96875, 'hits@3': 1.0}
SR ('neighbors', 0) 
 [1, 9, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 8, 4, 1, 3, 1, 3, 1, 1, 6, 7, 2, 1, 2, 2, 6, 5, 6, 1, 7, 2, 11, 5, 1, 2, 12, 2, 2, 1, 8, 54, 15, 3, 1, 4, 1, 1, 6, 1, 2, 1, 1, 1, 5, 1, 11, 24, 1, 1, 2, 8, 4, 1, 2, 1, 1, 1, 3, 2, 6, 1, 1, 3, 2, 1, 1, 8, 4, 1, 3, 2, 1, 4, 1, 3, 2, 4, 2, 1, 2, 3, 2, 2, 3, 6, 4, 1, 8, 9, 1, 2, 13, 1, 1, 5, 1, 1, 10, 1, 1, 1, 2, 1, 6, 8, 3, 1, 1, 2, 2, 1, 3, 4, 2, 2, 2] 
 {'mrr': 0.5817202834780959, 'hits@1': 0.390625, 'hits@3': 0.7109375}
SR ('similar', 30) 
 [1, 3, 1, 1, 1, 100, 1, 1

ME + RFC

In [30]:
from util.parse import find_relation
from util.metrics import rank

w_me = 0.3
w_lp = 1-w_me
for mode in modes:
    rfc_ranks = []
    key = mode[0]
    me_weight =mode[1] / 100
    w_me = me_weight
    for setup in setups[:limit]:
        anchor = setup['anchor']
        target = setup['target']
        tuples = [[classes_mapping[anchor], classes_mapping[obj[0]]] for obj in setup[key]]
        #print(tuples)
        pred = rfc_model.predict_proba(tuples)
        #print(pred, pred.shape)
        predictions = np.argsort(pred)[:, ::-1]
        predictions = predictions[:,:10]
        #print(valid, predicates_mapping[valid[0][1]], predictions)
        probs = [[pred[index][value[i]] for i in range(10)] for  index, value in enumerate(predictions) ]
        #print(probs)
        rfc_result = []
        for i in range(len(probs)):
            score_me = setup[key][i][1]
            obj = setup[key][i][0]
            # print(obj, score_me)

            for j in range(predictions.shape[1]):
                score_lp = probs[i][j]
                if score_lp >= 0:
                    pred = find_relation(predicates_mapping=predicates_mapping, i=predictions[i][j])
                    rfc_result.append(((anchor, pred, obj), w_me * score_me + w_lp * score_lp))

        #print(rfc_result)
        rfc_result = sorted(rfc_result, key=lambda x:x[1], reverse=True)
        # print(len(rfc_result))

        r = rank(rfc_result, target)
        #print(r)
        rfc_ranks.append(r)

    rfc_stats = {}
    rfc_stats['mrr'] = np.mean([1/rank for rank in rfc_ranks])
    for value in [1,3]:
        hits_at_value = [1 if pos <= value else 0 for pos in rfc_ranks]
        rfc_stats['hits@' + str(value)] = np.sum(hits_at_value) / len(hits_at_value)

    print("RFC", mode,"\n", rfc_ranks, "\n", rfc_stats)

RFC ('oracle', 0) 
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
 {'mrr': 0.9869791666666666, 'hits@1': 0.9765625, 'hits@3': 1.0}
RFC ('neighbors', 0) 
 [3, 8, 2, 2, 1, 5, 1, 1, 1, 1, 1, 1, 1, 25, 6, 1, 3, 1, 1, 1, 2, 4, 6, 5, 1, 3, 2, 3, 4, 3, 1, 2, 1, 6, 3, 1, 1, 6, 1, 1, 4, 3, 59, 7, 5, 1, 8, 5, 1, 5, 1, 6, 1, 1, 1, 5, 4, 5, 22, 2, 7, 3, 6, 3, 1, 2, 2, 3, 1, 1, 2, 6, 3, 1, 3, 1, 1, 1, 10, 3, 1, 4, 2, 1, 5, 1, 3, 5, 5, 1, 1, 2, 10, 1, 1, 18, 13, 7, 1, 6, 8, 1, 3, 11, 1, 1, 4, 1, 1, 6, 1, 1, 1, 1, 1, 8, 7, 2, 6, 3, 1, 2, 1, 3, 4, 2, 2, 1] 
 {'mrr': 0.5717230728605198, 'hits@1': 0.4140625, 'hits@3': 0.6640625}
RFC ('similar', 30) 
 [1, 3, 1, 1, 1, 100

ME + RGCN

In [32]:
from util.parse import find_class

for mode in modes:
    rgcn_ranks = []
    key = mode[0]
    me_weight =mode[1] / 100
    for setup in setups[:limit]:

        anchor = setup['anchor']
        target = setup['target']
        tuples = [[classes_mapping[anchor], classes_mapping[obj[0]]] for obj in setup[key]]
        scores_me = [obj[1] for obj in setup[key]]

        candidate_relations = []

        for tuple_i, t in enumerate(tuples):
            s_id = t[0]
            o_id = t[1]
            score_me = scores_me[tuple_i]

            for pred, i in predicates_mapping.items():
                emb_triplet = rgcn_embed[s_id] * rgcn_model.w_relation[i] * rgcn_embed
                scores = torch.sigmoid(torch.sum(emb_triplet, dim=1))
                scores, indices = torch.sort(scores, descending=True)
                p_rank = int((indices == o_id).nonzero())

                score_lp = scores[p_rank]
                if score_lp >= 0:
                    candidate_relations.append(((anchor, pred, find_class(classes_mapping, i=o_id)), w_me * score_me + w_lp * score_lp))

        top_list = sorted(candidate_relations, key=lambda item: item[1], reverse=True)

        rgcn_result = []
        # if filter_invalid:
        #     for tuple in top_list:
        #         if tuple_in_ontology((tuple[0], tuple[1], tuple[2])):
        #             result_list.append(tuple)
        # else:
        rgcn_result = top_list

        r = rank(rgcn_result, target)
        rgcn_ranks.append(r)

    rgcn_stats = {}
    rgcn_stats['mrr'] = np.mean([1/rank for rank in rgcn_ranks])
    for value in [1,3]:
        hits_at_value = [1 if pos <= value else 0 for pos in rgcn_ranks]
        rgcn_stats['hits@' + str(value)] = np.sum(hits_at_value) / len(hits_at_value)

    print("RGCN", mode,"\n", rgcn_ranks, "\n", rgcn_stats)

RGCN ('oracle', 0) 
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 87, 11, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 3, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1] 
 {'mrr': 0.9079614844566353, 'hits@1': 0.84375, 'hits@3': 0.96875}
RGCN ('neighbors', 0) 
 [1, 2, 3, 2, 1, 2, 1, 9, 3, 2, 1, 2, 1, 6, 4, 2, 7, 1, 2, 1, 1, 7, 31, 1, 1, 3, 3, 11, 2, 17, 4, 12, 5, 6, 6, 2, 1, 7, 2, 1, 6, 7, 3177, 284, 1, 1, 13, 5, 1, 10, 1, 3, 1, 2, 2, 4, 1, 18, 5, 1, 4, 6, 7, 14, 2, 5, 24, 1, 1, 5, 4, 105, 1, 1, 1, 4, 1, 1, 41, 2, 1, 2, 2, 1, 4, 2, 57, 2, 4, 2, 1, 2, 24, 1, 1, 3, 2, 8, 3, 1, 13, 2, 3, 6, 1, 1, 2, 1, 2, 17, 1, 1, 1, 3, 2, 20, 7, 1, 2, 2, 1, 2, 12, 2, 8, 1, 1, 1] 
 {'mrr': 0.5210561895744013, 'hits@1': 0.3359375, 'hits@3': 0.6328125}
RGCN ('similar', 30)