In [1]:
import json
import random

from numpy import mean
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold

from config import ARMSTRADER, VCSLAM
from util.parse import generate_dictionaries, generate_id_dict, encode_triples

In [2]:
dataset = "combined"

vcslam = VCSLAM()
with open(vcslam.PARSED_MODELS_PATH, 'r') as file:
    vcslam_models = json.load(file)

armstrader = ARMSTRADER()
with open(armstrader.PARSED_MODELS_PATH, 'r') as file:
    armstrader_models = json.load(file)

models = armstrader_models + vcslam_models
vcslam_x3 = armstrader_models + vcslam_models + vcslam_models + vcslam_models
triples = []
for model in models[:]:
    for triple in model:
        triples.append(tuple(triple))

classes, predicates = generate_dictionaries(triples)
classes_mapping = generate_id_dict(classes)
predicates_mapping = generate_id_dict(predicates)


In [3]:
len(triples)

173265

In [4]:


# triples = modrel.reduce_relations(triples, targets)

#test_triples = random.sample(triples, floor(len(triples) / 10))
# test_triples = [triple for triple in test_triples if triple[0] == 'http://schema.org/Offer' and triple[2] == str(XSD.string)]



Statistics Recommender Baseline

In [5]:
from modelextension.statistics_recommender import StatisticsRecommender as SR
from util.metrics import calc_hits_mrr
from util.utilities import prepare_data

_, _, X_test, y_test = prepare_data(models, c_map=classes_mapping,
                    p_map=predicates_mapping, shuffle=False, multiply=1, generate_test=True)

encoded_triples = encode_triples(triples, classes_mapping, predicates_mapping)
sr = SR(triples=encoded_triples)

pred = sr.predict_links(X_test)
hits_mrr = calc_hits_mrr( pred, y_test)
print(f'SR {dataset} MRR {hits_mrr["mrr"]}, Hits@1 {hits_mrr["hits@1"]}, Hits@3 {hits_mrr["hits@3"]}')

SR combined MRR 0.7439168402749149, Hits@3 0.8812766939859171, Hits@1 0.5891146254184463


Train RFC

In [6]:
%%capture
from util.metrics import calc_hits_mrr
from util.utilities import prepare_data

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)

X, y, X_test, y_test = prepare_data(vcslam_x3, c_map=classes_mapping,
                                    p_map=predicates_mapping, shuffle=False, multiply=1, generate_test=True)
rfc = RandomForestClassifier(n_estimators=10, max_depth=20)
est = rfc.fit(X,y)

In [7]:
mrrs = []
hits_1 = []
hits_3 = []
pred = est.predict_proba(X_test)
hits_mrr = calc_hits_mrr(pred, y_test)
hits_1.append(hits_mrr['hits@1'])
hits_3.append(hits_mrr['hits@3'])
mrrs.append(hits_mrr['mrr'])

print(f'RFC {dataset} 10 20 | {mean(mrrs):.3f} {mean(hits_1):.3f} {mean(hits_3):.3f}')

RFC combined 10 20 | 0.749 0.601 0.879


Train RGCN

In [8]:
import torch
from linkprediction import utils
from linkprediction.rgcn import LinkPredict, node_norm_to_edge_norm

train_triples = triples
test_triples = random.sample(triples, int(len(triples)/10))
print("training triples size", len(train_triples))

train_data = encode_triples(train_triples, classes_mapping, predicates_mapping)
valid_data = encode_triples(test_triples, classes_mapping, predicates_mapping)

# load graph data
num_nodes = len(classes)
num_rels = len(predicates)

# create model
rgcn_model = LinkPredict(in_dim=num_nodes,
                         h_dim=100,
                         num_rels=num_rels,
                         num_bases=10,
                         num_hidden_layers=2,
                         dropout=0.1,
                         use_cuda=False,
                         reg_param=0.01)

# validation and testing triplets
valid_data = torch.LongTensor(valid_data)
test_data = torch.LongTensor(valid_data)

# build test graph
test_graph, test_rel, test_norm = utils.build_test_graph(
    num_nodes, num_rels, train_data)
test_deg = test_graph.in_degrees(
    range(test_graph.number_of_nodes())).float().view(-1, 1)
test_node_id = torch.arange(0, num_nodes, dtype=torch.long).view(-1, 1)
test_rel = torch.from_numpy(test_rel)
test_norm = node_norm_to_edge_norm(test_graph, torch.from_numpy(test_norm).view(-1, 1))

# build adj list and calculate degrees for sampling
adj_list, degrees = utils.get_adj_and_degrees(num_nodes, train_data)

# optimizer
optimizer = torch.optim.Adam(rgcn_model.parameters(), lr=0.001)

forward_time = []
backward_time = []

# training loop
# print("start training...")

epoch = 0
best_mrr = 0
best_hits3 = 0
checkpoint = None
while True:
    rgcn_model.train()
    epoch += 1

    # perform edge neighborhood sampling to generate training graph and data
    g, node_id, edge_type, node_norm, data, labels = \
        utils.generate_sampled_graph_and_labels(
            train_data, 20, 0.3,
            num_rels, adj_list, degrees, 5,
            "neighbor")
    # print("Done edge sampling")

    # set node/edge feature
    node_id = torch.from_numpy(node_id).view(-1, 1).long()
    edge_type = torch.from_numpy(edge_type)
    edge_norm = node_norm_to_edge_norm(g, torch.from_numpy(node_norm).view(-1, 1))
    data, labels = torch.from_numpy(data), torch.from_numpy(labels)
    deg = g.in_degrees(range(g.number_of_nodes())).float().view(-1, 1)

    embed = rgcn_model(g, node_id, edge_type, edge_norm)
    loss = rgcn_model.get_loss(g, embed, data, labels)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(rgcn_model.parameters(), 1.0)  # clip gradients
    optimizer.step()
    optimizer.zero_grad()

    # validation
    if epoch % 100 == 0:
        rgcn_model.eval()
        # print("start eval")
        embed = rgcn_model(test_graph, test_node_id, test_rel, test_norm)
        results = utils.calc_mrr(embed, rgcn_model.w_relation, torch.LongTensor(train_data),
                                 valid_data, test_data, hits=[1, 3], eval_bz=100,
                                 eval_p="filtered")
        mrr = results['mrr']
        hits3 = results['hits@3']
        hits1 = results['hits@1']
        print(f"epoch {epoch} MRR {mrr:.2f} hits@1 {hits1:.2f} hits@3 {hits3:.2f}", end="")
        if best_hits3 < hits3:
            best_hits3 = hits3
        if best_mrr <= mrr:
            best_mrr = mrr
            checkpoint = {'state_dict': rgcn_model.state_dict(), 'epoch': epoch}
            print(f"*** | ", end="")

        else:
            print(f" | ", end="")
        # if hits3 == 1:
        #    break

        if epoch >= 8000:
            break


Using backend: pytorch


training triples size 173265


  norm = 1.0 / in_deg


epoch 100 MRR 0.01 hits@1 0.00 hits@3 0.00*** | epoch 200 MRR 0.02 hits@1 0.00 hits@3 0.00*** | epoch 300 MRR 0.06 hits@1 0.02 hits@3 0.02*** | epoch 400 MRR 0.09 hits@1 0.02 hits@3 0.05*** | epoch 500 MRR 0.10 hits@1 0.02 hits@3 0.07*** | epoch 600 MRR 0.33 hits@1 0.18 hits@3 0.38*** | epoch 700 MRR 0.37 hits@1 0.26 hits@3 0.39*** | epoch 800 MRR 0.37 hits@1 0.26 hits@3 0.41 | epoch 900 MRR 0.41 hits@1 0.28 hits@3 0.47*** | epoch 1000 MRR 0.67 hits@1 0.59 hits@3 0.74*** | epoch 1100 MRR 0.37 hits@1 0.24 hits@3 0.43 | epoch 1200 MRR 0.33 hits@1 0.20 hits@3 0.37 | epoch 1300 MRR 0.52 hits@1 0.39 hits@3 0.57 | epoch 1400 MRR 0.59 hits@1 0.51 hits@3 0.63 | epoch 1500 MRR 0.61 hits@1 0.54 hits@3 0.60 | epoch 1600 MRR 0.65 hits@1 0.57 hits@3 0.67 | epoch 1700 MRR 0.76 hits@1 0.68 hits@3 0.82*** | epoch 1800 MRR 0.61 hits@1 0.51 hits@3 0.70 | epoch 1900 MRR 0.61 hits@1 0.50 hits@3 0.70 | epoch 2000 MRR 0.70 hits@1 0.59 hits@3 0.78 | epoch 2100 MRR 0.74 hits@1 0.64 hits@3 0.83 | epoch 2200 MR

In [9]:

# use best model checkpoint
print("Using best epoch: {}".format(checkpoint['epoch']))
rgcn_model.eval()
rgcn_model.load_state_dict(checkpoint['state_dict'])
rgcn_embed = rgcn_model(test_graph, test_node_id, test_rel, test_norm)
results = utils.calc_mrr(rgcn_embed, rgcn_model.w_relation, torch.LongTensor(train_data), valid_data,
                         test_data, hits=[1, 3], eval_bz=100, eval_p="filtered")
mrr = results['mrr']
hits3 = results['hits@3']
hits1 = results['hits@1']
print(f"RGCN {dataset} MRR {mrr} hits@1 {hits1} hits@3 {hits3}")

Using best epoch: 6700
RGCN combined MRR 0.8652116060256958 hits@1 0.8303416967391968 hits@3 0.8822867274284363
