In [None]:
import pandas as pd

from rdflib import Graph
from rdflib import URIRef
from rdflib import RDF

from tqdm import tqdm

from SPARQLWrapper import SPARQLWrapper
from SPARQLWrapper import JSON 

In [None]:
# Fetch every IRI typed as owl:ObjectProperty from the YAGO4 repository.
sparql = SPARQLWrapper(endpoint="http://pop-os:7200/repositories/YAGO4")
sparql.setReturnFormat(JSON)

# No ORDER BY clause: the bindings are collected into a set below, so the
# result order is irrelevant and a random ordering only adds endpoint work.
query = """
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    select ?p
    WHERE { 
        ?p a owl:ObjectProperty .
    }
"""

sparql.setQuery(query)
rows = sparql.queryAndConvert()["results"]["bindings"]

predicates = {row["p"]["value"] for row in rows}

# Predicates added by hand in addition to whatever the query returned.
predicates.update(
    {
        "http://schema.org/albumRelease",
        "http://schema.org/alumni",
        "http://schema.org/member",
        "http://schema.org/subjectOf",
    }
)

# Map the last path segment of each IRI to the full IRI. Iterating in sorted
# order keeps the mapping deterministic should two IRIs share a last segment
# (the lexicographically greatest IRI then wins); iterating the set directly
# could pick a different winner from one kernel session to the next.
predicates_map = {
    predicate.split("/")[-1]: predicate for predicate in sorted(predicates)
}

In [None]:
# Load the tab-separated (subject, predicate, object) sample.
# dtype=str together with keep_default_na=False keeps identifiers such as
# "NA", "null" or "nan" as literal text instead of letting pandas turn them
# into NaN (which would then be formatted into the IRI as ".../nan").
triples = pd.read_csv(
    "../data/YAGO4-20/data.txt",
    sep="\t",
    names=["s", "p", "o"],
    dtype=str,
    keep_default_na=False,
)

# Resolve predicate local names to full IRIs and fail early when one is
# unknown, rather than letting a None predicate flow into later cells.
predicate_iris = triples["p"].map(predicates_map.get)
unmapped = sorted(triples.loc[predicate_iris.isna(), "p"].astype(str).unique())
if unmapped:
    raise KeyError(f"No IRI found in predicates_map for predicate(s): {unmapped}")

# Expand subject/object local names into YAGO resource IRIs.
triples["s"] = triples["s"].map(lambda x: f"http://yago-knowledge.org/resource/{x}")
triples["p"] = predicate_iris
triples["o"] = triples["o"].map(lambda x: f"http://yago-knowledge.org/resource/{x}")

In [None]:
# Collect every distinct IRI that appears in the sample, first per position
# (subject / object) and then combined into a single entity set.
subjects = set(triples["s"].tolist())
objects = set(triples["o"].tolist())
entities = subjects | objects

In [None]:
# Parse the N-Triples file of type statements into an in-memory rdflib graph.
types = Graph()

types.parse(source="../data/YAGO4-20/yago-wd-full-types.nt", format="nt")

In [None]:
# Start the sample graph with the rdf:type statements of every sampled entity.
sample_graph = Graph()

for entity in tqdm(entities):
    # Pattern matching all (entity, rdf:type, ?) triples in the types graph.
    type_pattern = (URIRef(entity), RDF.type, None)
    # Materialise the matches before copying them into the sample graph.
    for type_triple in list(types.triples(type_pattern)):
        sample_graph.add(type_triple)

In [None]:
# Turn the DataFrame into a list of plain (s, p, o) tuples for the
# graph-building loop.
# - itertuples(index=False, name=None) yields plain tuples without building a
#   Series per row the way iterrows does.
# - total= gives tqdm the length of the generator so it can show real progress.
# - `triples` is rebound from a DataFrame to a list here; the isinstance guard
#   keeps the cell re-runnable once that conversion has already happened.
if isinstance(triples, pd.DataFrame):
    triples = list(
        tqdm(
            triples[["s", "p", "o"]].itertuples(index=False, name=None),
            total=len(triples),
        )
    )

In [None]:
# Add each sampled statement to the graph, wrapping all three terms as IRIs.
for s, p, o in tqdm(triples):
    statement = (URIRef(s), URIRef(p), URIRef(o))
    sample_graph.add(statement)

In [None]:
# Write the sample graph (type statements plus sampled triples) as N-Triples.
sample_graph.serialize(
    destination="../data/YAGO4-20/YAGO4-20_no_schema.nt",
    format="nt",
    encoding="utf-8",
)

In [None]:
# Concatenate the schema file, the class file and the sample written above
# into the final YAGO4-20.nt dataset. The `>` redirection overwrites any
# existing YAGO4-20.nt.
!cat ../data/YAGO4-20/yago-wd-schema.nt ../data/YAGO4-20/yago-wd-class.nt ../data/YAGO4-20/YAGO4-20_no_schema.nt > ../data/YAGO4-20/YAGO4-20.nt