In [None]:
import pandas as pd

from collections import defaultdict
from tqdm import tqdm

from pykeen.triples import TriplesFactory

from SPARQLWrapper import SPARQLWrapper
from SPARQLWrapper import JSON 

In [None]:
# SPARQL endpoint of the YAGO4 repository. The host name is specific to the
# machine this notebook was written on; change it here when running elsewhere.
YAGO4_ENDPOINT = "http://pop-os:7200/repositories/YAGO4"

# One shared client for every query in this notebook; results are requested as JSON.
sparql = SPARQLWrapper(endpoint=YAGO4_ENDPOINT)
sparql.setReturnFormat(JSON)

In [None]:
# Both YAGO namespaces hang off the same host; derive them from one base so
# the two constants cannot drift apart.
_yago_base = "http://yago-knowledge.org"

# Namespace of YAGO resources (entities).
yago_prefix = f"{_yago_base}/resource/"
# Namespace of the YAGO schema.
yago_schema_prefix = f"{_yago_base}/schema#"

In [None]:
# Ask the endpoint for every property declared as an owl:ObjectProperty.
# There is deliberately no ORDER BY clause: the bindings are collapsed into a
# set right below, so any server-side ordering (this query previously used
# ORDER BY RAND()) would be discarded and only cost time.
query = """
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    select ?p
    WHERE {
        ?p a owl:ObjectProperty .
    }
"""

sparql.setQuery(query)
rows = sparql.queryAndConvert()["results"]["bindings"]

# Unique predicate IRIs returned by the endpoint.
predicates = {r["p"]["value"] for r in rows}

# schema.org predicates added by hand.
# NOTE(review): presumably these are not typed as owl:ObjectProperty in the
# store and would otherwise be missed - confirm.
predicates.update(
    [
        "http://schema.org/albumRelease",
        "http://schema.org/alumni",
        "http://schema.org/member",
        "http://schema.org/subjectOf",
    ]
)

# Render the predicates as the body of a SPARQL VALUES clause: "{<iri1> <iri2> ...}".
# Sorting makes the generated query text identical from run to run instead of
# depending on set iteration order.
predicates_string = "{" + " ".join(f"<{p}>" for p in sorted(predicates)) + "}"

In [None]:
# Fetch every (?s, ?p, ?o) triple whose predicate is one of the IRIs listed in
# `predicates_string` (a ready-made VALUES body defined in an earlier cell).
query = f"""
    SELECT ?s ?p ?o
    WHERE {{
        VALUES ?p {predicates_string}
        ?s ?p ?o
    }}
"""

sparql.setQuery(query)

# Run the query, then dig the list of binding rows out of the JSON response.
response = sparql.queryAndConvert()
rows = response["results"]["bindings"]

In [None]:
def get_s(row):
    """Return the value bound to ?s in one SPARQL JSON binding row."""
    return row["s"]["value"]


def get_o(row):
    """Return the value bound to ?o in one SPARQL JSON binding row."""
    return row["o"]["value"]


def get_p(row):
    """Return the value bound to ?p in one SPARQL JSON binding row."""
    return row["p"]["value"]


# Flatten the binding rows into plain (subject, predicate, object) tuples.
triples = [
    (get_s(binding), get_p(binding), get_o(binding))
    for binding in tqdm(rows)
]

In [None]:
# Minimum number of triple memberships (as subject or object) an entity needs
# in the unfiltered graph in order to be kept.
MIN_TRIPLES_PER_ENTITY = 20

# Index every triple under both of its endpoints.
# NOTE: a triple whose subject equals its object is appended twice under that
# entity, so it counts twice towards the threshold below.
entity_to_triples = defaultdict(list)

for s, p, o in tqdm(triples):
    entity_to_triples[s].append((s, p, o))
    entity_to_triples[o].append((s, p, o))

# Discard an entity when it is not a YAGO resource IRI or when it is too
# sparsely connected. The resource test is a real prefix check
# (`startswith`), not substring containment, so a value that merely embeds
# the YAGO namespace somewhere in the middle is rejected as well.
entities_to_discard = {
    entity
    for entity, cur_triples in tqdm(entity_to_triples.items())
    if not entity.startswith(yago_prefix)
    or len(cur_triples) < MIN_TRIPLES_PER_ENTITY
}

# Keep only the triples whose two endpoints both survived. This is a single
# pass: counts are taken on the unfiltered graph and are not re-checked after
# neighbours have been removed.
filtered_triples = [
    (s, p, o)
    for s, p, o in tqdm(triples)
    if s not in entities_to_discard and o not in entities_to_discard
]

# Subjects and objects of the surviving triples, and the union of both.
s = [head for head, _, _ in filtered_triples]
o = [tail for _, _, tail in filtered_triples]

sample_entities = set(s) | set(o)

In [None]:
def _strip_resource_prefix(iri):
    """Return what follows the last YAGO resource prefix in `iri` (all of `iri` if absent)."""
    return iri.split("http://yago-knowledge.org/resource/")[-1]


def _last_path_segment(iri):
    """Return whatever follows the final "/" in `iri`."""
    return iri.split("/")[-1]


triples_df = pd.DataFrame(filtered_triples, columns=["s", "p", "o"])

# Shorten subjects and objects to their local resource name.
for endpoint_column in ("s", "o"):
    triples_df[endpoint_column] = triples_df[endpoint_column].map(_strip_resource_prefix)

# Shorten predicates to the last path segment of their IRI.
triples_df["p"] = triples_df["p"].map(_last_path_segment)

# Headerless, index-free TSV: one "s<TAB>p<TAB>o" line per triple.
triples_df.to_csv("../data/YAGO4-20/data.txt", sep="\t", index=False, header=False)

In [None]:
# Fixed seed so that re-running the notebook reproduces exactly the same
# train/test/validation partition (without it every run yields a new split).
SPLIT_SEED = 42


def _export_labeled_triples(factory, path):
    """Write the triples of `factory` to `path` as a headerless TSV.

    The factory's mapped triples are converted to a DataFrame and the
    integer-ID columns (head_id, relation_id, tail_id) are dropped, so only
    the remaining columns are written.

    Returns the DataFrame that was written.
    """
    labeled = factory.tensor_to_df(factory.mapped_triples).drop(
        columns=["head_id", "relation_id", "tail_id"]
    )
    labeled.to_csv(path, sep="\t", index=False, header=False)
    return labeled


tf = TriplesFactory.from_path("../data/YAGO4-20/data.txt")

# 80% / 10% / 10% split; the factories are unpacked in the order the ratios are given.
training, testing, validation = tf.split([.8, .1, .1], random_state=SPLIT_SEED)

train_df = _export_labeled_triples(training, "../data/YAGO4-20/train.txt")
test_df = _export_labeled_triples(testing, "../data/YAGO4-20/test.txt")
valid_df = _export_labeled_triples(validation, "../data/YAGO4-20/valid.txt")