Notebook written by [Zhedong Zheng](https://github.com/zhedongzheng)

In [1]:
"""
pip3 install rdflib
"""
import rdflib
import urllib.parse
import pprint
import itertools
import os
import sys
import time
from tqdm import tqdm

sys.path.append(os.path.dirname(os.getcwd()))
from data import WN18

In [2]:
# Base IRI that to_uri() prepends to every percent-encoded resource name.
URI_PREFIX = 'http://example.org/'

In [3]:
def read_triples(path):
    """Read whitespace-separated (subject, predicate, object) triples from a text file.

    Every non-blank line must hold exactly three whitespace-separated fields.
    Blank or whitespace-only lines (for instance a trailing empty line at the
    end of the file) are skipped instead of aborting the whole load.

    Args:
        path: Location of the text file to read.

    Returns:
        A list of ``(s, p, o)`` string tuples, in file order.

    Raises:
        ValueError: If a non-blank line does not contain exactly three fields.
    """
    triples = []
    with open(path, 'rt') as f:
        # Iterate the handle directly so lines are streamed rather than
        # materialised all at once with readlines().
        for line_no, line in enumerate(f, start=1):
            # split() with no separator already discards surrounding
            # whitespace, so the fields need no further strip().
            fields = line.split()
            if not fields:
                continue
            if len(fields) != 3:
                raise ValueError(
                    "%s:%d: expected 3 whitespace-separated fields, got %d"
                    % (path, line_no, len(fields))
                )
            triples.append(tuple(fields))
    return triples

def load_triple():
    """Load the WN18 train/valid/test triple files.

    WN18.download() is called first; presumably it makes sure the dataset
    files exist under ../data/WN18 (its implementation is not visible here,
    so confirm against the project's ``data`` module).

    Returns:
        A 4-tuple ``(all, train, valid, test)`` of triple lists, where ``all``
        is the concatenation train + valid + test.
    """
    WN18.download()
    split_files = (
        '../data/WN18/wn18/train.txt',
        '../data/WN18/wn18/valid.txt',
        '../data/WN18/wn18/test.txt',
    )
    # Files are read in the order listed: train, then valid, then test.
    train, valid, test = [read_triples(split_file) for split_file in split_files]
    combined = train + valid + test
    return combined, train, valid, test

def get_resources(triples):
    """Collect the distinct elements appearing in any position of any triple.

    Args:
        triples: Iterable of triples (each itself an iterable of hashable items).

    Returns:
        A set containing every subject, predicate and object seen.
    """
    seen = set()
    for entry in triples:
        # update() adds each element of the triple individually.
        seen.update(entry)
    return seen

def to_uri(suffix):
    """Build the URI string for a resource name.

    The name is percent-encoded with urllib.parse.quote (default settings)
    and appended to the module-level URI_PREFIX.

    Args:
        suffix: Resource name to encode.

    Returns:
        The resulting URI as a plain string.
    """
    encoded = urllib.parse.quote(suffix)
    return URI_PREFIX + encoded

def glance_dict(d, n=5):
    """Return a new dict holding only the first ``n`` items of ``d``.

    Items are taken in the dict's own iteration order; handy for previewing a
    large mapping without printing all of it.

    Args:
        d: Mapping to preview.
        n: Maximum number of items to keep (default 5).

    Returns:
        A dict with at most ``n`` key/value pairs.
    """
    leading_items = itertools.islice(d.items(), n)
    return {key: value for key, value in leading_items}

def build_rdf_graph(triples):
    """Turn (s, p, o) name triples into an rdflib graph.

    Every distinct resource name is mapped to a URI with to_uri(); each triple
    is then added to the graph as three rdflib.URIRef terms. A tqdm progress
    bar wraps the insertion loop.

    Args:
        triples: Sized collection of 3-element triples of resource names.

    Returns:
        A pair ``(graph, uri2resource)``: the populated rdflib.Graph and a dict
        mapping each URI string back to the resource name it was built from.
    """
    name_to_uri = {name: to_uri(name) for name in get_resources(triples)}
    uri_to_name = {uri: name for name, uri in name_to_uri.items()}

    kg = rdflib.Graph()
    for subj, pred, obj in tqdm(triples, total=len(triples), ncols=70):
        statement = tuple(
            rdflib.URIRef(name_to_uri[term]) for term in (subj, pred, obj)
        )
        kg.add(statement)
    return kg, uri_to_name

In [4]:
# Look up every subject ?s for which (?s, tgt_p, tgt_o) is in the training graph.
tgt_p, tgt_o = '_has_part', '04371774'

triples_all, triples_tr, triples_va, triples_te = load_triple()
rdf_graph, uri2resource = build_rdf_graph(triples_tr)

# Address predicate and object by their full IRIs, produced by the same
# to_uri() helper that build_rdf_graph() uses. Interpolating the raw names as
# prefixed local names would miss (or fail to parse) whenever a name contains
# characters that to_uri() percent-encodes or that SPARQL does not allow in a
# local name.
query_str = (
    """
    SELECT DISTINCT ?s WHERE {
        ?s <%s> <%s> .
    }
    """ % (to_uri(tgt_p), to_uri(tgt_o))
)

# The timer spans both issuing the query and consuming its rows.
t0 = time.time()
res = []
for row in rdf_graph.query(query_str):
    # row.s is a URI term; convert it back to the original resource name.
    s = uri2resource[row.s.toPython()]
    res.append((s, tgt_p, tgt_o))
# One result tuple per returned row, so no separate counter is needed.
count = len(res)

print("Query Spent %.2f sec" % (time.time() - t0))
print("Get %d / %d Results" % (count, len(triples_tr)))
pprint.pprint(res)

100%|██████████████████████| 141442/141442 [00:05<00:00, 25679.66it/s]


Query Spent 1.51 sec
Get 1 / 141442 Results
[('03963645', '_has_part', '04371774')]
