In [1]:
import rdflib
import urllib
import pprint
import itertools
import os
import sys
import time
from tqdm import tqdm

sys.path.append(os.path.dirname(os.getcwd()))
from data import WN18

In [2]:
def read_triples(path):
    triples = []
    with open(path, 'rt') as f:
        for line in f.readlines():
            s, p, o = line.split()
            triples += [(s.strip(), p.strip(), o.strip())]
    return triples

def load_triple():
    WN18.download()
    triples_tr = read_triples('../data/WN18/wn18/train.txt')
    triples_va = read_triples('../data/WN18/wn18/valid.txt')
    triples_te = read_triples('../data/WN18/wn18/test.txt')
    triples_all = triples_tr + triples_va + triples_te
    return triples_all, triples_tr, triples_va, triples_te

def get_resources(triples):
    return set([r for triple in triples for r in triple])

def to_uri(suffix, uri_prefix):
    return uri_prefix + urllib.parse.quote(suffix)

def glance_dict(d, n=5):
    return dict(itertools.islice(d.items(), n))

def build_rdf_graph(triples, uri_prefix='http://www.example.org/'):
    graph = rdflib.Graph()
    resources = get_resources(triples)
    resource2uri = {r: to_uri(r, uri_prefix) for r in resources}
    uri2resource = {uri: r for r, uri in resource2uri.items()}
    for (s, p, o) in tqdm(triples, total=len(triples), ncols=70):
        s_uri = rdflib.URIRef(resource2uri[s])
        p_uri = rdflib.URIRef(resource2uri[p])
        o_uri = rdflib.URIRef(resource2uri[o])
        graph.add((s_uri, p_uri, o_uri))
    return graph, uri2resource

In [3]:
triples_all, triples_tr, triples_va, triples_te = load_triple()
rdf_graph, uri2resource = build_rdf_graph(triples_tr)
query_str = (
    """
    SELECT DISTINCT ?s ?p ?o ?x ?any_p_1 ?any_p_2 WHERE {
        ?s ?p ?o .
        ?s ?any_p_1 ?x .
        ?x ?any_p_2 ?o .
    }
    """
)

t0 = time.time()
res = []
count = 1
for row in rdf_graph.query(query_str):
    s = uri2resource[row.s.toPython()]
    p = uri2resource[row.p.toPython()]
    o = uri2resource[row.o.toPython()]
    x = uri2resource[row.x.toPython()]
    p_1 = uri2resource[row.any_p_1.toPython()]
    p_2 = uri2resource[row.any_p_2.toPython()]
    res.append([(s, p, o), (s, p_1, x), (x, p_2, o)])
    count += 1

print("Query Spent %.2f sec" % (time.time() - t0))
print("Get %d / %d Results" % (count, len(triples_tr)))
pprint.pprint(res[0])
print(res[0][2] in triples_tr)

Files Already Downloaded


100%|██████████████████████| 141442/141442 [00:05<00:00, 25848.47it/s]


Query Spent 246.93 sec
Get 25184 / 141442 Results
[('02673134', '_verb_group', '01602318'),
 ('02673134', '_derivationally_related_form', '14002279'),
 ('14002279', '_derivationally_related_form', '01602318')]
True
