In [1]:
import random

import pandas as pd

from collections import defaultdict
from tqdm import tqdm

from rdflib import Graph
from rdflib import RDF, RDFS, OWL

from SPARQLWrapper import SPARQLWrapper
from SPARQLWrapper import JSON
from SPARQLWrapper import GET, POST

In [2]:
# Shared client for the YAGO45 repository; every query below goes through it
# and results are requested in JSON form.
sparql = SPARQLWrapper("http://pop-os:7200/repositories/YAGO45")
sparql.setReturnFormat(JSON)

In [3]:
# Namespace IRIs: `yago_prefix` is interpolated into the queries and stripped
# from result values; `yago_schema_prefix` is bound to the `ys:` prefix in the
# full-graph query.
yago_prefix = "http://yago-knowledge.org/resource/"
yago_schema_prefix = "http://yago-knowledge.org/schema#"

In [None]:
# Collect up to 1,000,000 distinct non-literal vertices in the YAGO resource
# namespace that occur as the subject or the object of any triple and are not
# typed as rdfs:Class.
# The rdf:/rdfs: prefixes are declared explicitly so the query does not rely on
# namespaces the endpoint may or may not pre-register, and the namespace filter
# reuses `yago_prefix` instead of repeating the literal.
query = f"""
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT DISTINCT ?vertex
    WHERE
    {{
        {{
            ?vertex ?p []
        }}
        UNION
        {{
            [] ?p ?vertex
            FILTER(!IsLiteral(?vertex))
        }}
        FILTER(STRSTARTS(STR(?vertex), "{yago_prefix}"))
        FILTER NOT EXISTS {{ ?vertex rdf:type rdfs:Class }}
    }}
    LIMIT 1000000
"""
sparql.setQuery(query)

rows = sparql.queryAndConvert()["results"]["bindings"]
# Full IRIs of the matched vertices, one per result row.
entities = [r["vertex"]["value"] for r in rows]

In [None]:
# Draw the working sample of entity IRIs without replacement.
# The size is capped at the population size so a smaller-than-expected entity
# list does not raise ValueError, and a dedicated seeded generator makes the
# draw repeatable without touching the global `random` state.
# NOTE(review): the draw is only reproducible if the endpoint returns
# `entities` in the same order on each run - confirm, or sort `entities` first.
SAMPLE_SIZE = 150000
SAMPLE_SEED = 42
sample = random.Random(SAMPLE_SEED).sample(entities, min(SAMPLE_SIZE, len(entities)))

In [None]:
def get_s(row):
    """Return the subject of a result binding with the YAGO resource prefix removed."""
    return row["s"]["value"].replace(yago_prefix, "")


def get_o(row):
    """Return the object of a result binding with the YAGO resource prefix removed."""
    return row["o"]["value"].replace(yago_prefix, "")


def get_p(row):
    """Return the predicate IRI of a result binding unchanged."""
    return row["p"]["value"]


def _entity_triples_query(position, entity_iri):
    """Build the query for all triples with `entity_iri` pinned at `position`.

    `position` is "s" (entity as subject) or "o" (entity as object).
    rdf:type and owl:sameAs triples are excluded in both cases; literal
    objects are additionally filtered out when the entity is the subject
    (when it is the object, ?o is the entity IRI itself).
    """
    literal_filter = "FILTER(!IsLiteral(?o))" if position == "s" else ""
    # The entity is injected as a full IRI rather than a prefixed name so that
    # local names containing characters that are not legal in a prefixed name
    # cannot break the query. rdf:/owl: are declared explicitly instead of
    # relying on endpoint-provided default namespaces.
    return f"""
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX owl: <http://www.w3.org/2002/07/owl#>

        SELECT DISTINCT ?s ?p ?o
        WHERE {{
            VALUES ?{position} {{ <{entity_iri}> }}
            ?s ?p ?o .

            FILTER (?p != rdf:type && ?p != owl:sameAs)
            {literal_filter}
        }}
    """


sparql.setMethod(GET)

local_names = [e.replace(yago_prefix, "") for e in sample]
query_entities = [f"yago:{value}" for value in local_names]

# Maps each `yago:`-prefixed entity to the list of {"s", "p", "o"} dicts for
# the triples where it is the subject, followed by those where it is the object.
entity_triples = {}
for local_name in tqdm(local_names):
    entity = f"yago:{local_name}"
    entity_iri = f"{yago_prefix}{local_name}"

    bindings = []
    for position in ("s", "o"):
        sparql.setQuery(_entity_triples_query(position, entity_iri))
        bindings.extend(sparql.queryAndConvert()["results"]["bindings"])

    entity_triples[entity] = [
        {"s": get_s(t), "p": get_p(t), "o": get_o(t)} for t in bindings
    ]

In [None]:
# Queries are sent with POST.
# NOTE(review): presumably because a VALUES list of this size would not fit in
# a GET URL - confirm.
sparql.setMethod(POST)

yago_prefix = "http://yago-knowledge.org/resource/"


def _as_prefixed_name(value):
    """Return `value` as a `yago:` prefixed name.

    Values that already carry the `yago:` prefix are returned unchanged, so
    re-running this cell does not produce `yago:yago:...` names.
    """
    if value.startswith("yago:"):
        return value
    local_name = value.replace(yago_prefix, "")
    return f"yago:{local_name}"


sample = [_as_prefixed_name(e) for e in sample]

# Accumulates one binding per (entity, class) pair; entities without any class
# get a single row whose class_ is the empty string (see COALESCE below).
rows = []
batch_size = 50000
for batch_start in range(0, len(sample), batch_size):
    batch_end = min(batch_start + batch_size, len(sample))
    print(batch_start, batch_end)
    # Slice from batch_start, not 0: each batch must contain only its own
    # entities, otherwise earlier entities are re-queried and duplicated.
    batch = sample[batch_start:batch_end]
    values_clause = "{" + " ".join(batch) + "}"

    # For every entity, keep only the asserted classes for which the entity
    # has no other asserted type that is a subclass of that class.
    # rdfs: is declared explicitly rather than relying on endpoint defaults.
    query = f"""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX yago: <{yago_prefix}>
        SELECT DISTINCT ?entity_uri (COALESCE(?class, "") as ?class_)
        WHERE {{
            VALUES ?entity_uri {values_clause}

            OPTIONAL {{
                ?entity_uri a ?class .
                FILTER NOT EXISTS {{
                    ?subClass rdfs:subClassOf ?class .
                    ?entity_uri a ?subClass .
                    FILTER (?subClass != ?class)
                }}
            }}
        }}
    """

    sparql.setQuery(query)
    rows.extend(sparql.queryAndConvert()["results"]["bindings"])

In [None]:
def get_entity(row):
    """Return the entity_uri binding value with the YAGO resource prefix removed."""
    return row["entity_uri"]["value"].replace(yago_prefix, "")


def get_class(row):
    """Return the last '/'-separated segment of the class_ binding value."""
    return row["class_"]["value"].split("/")[-1]


# One record per result row, turned directly into a two-column frame.
entities = pd.DataFrame(
    [{"class": get_class(row), "entity": get_entity(row)} for row in rows]
)

def to_set(classes):
    """Collapse a Series of class names into a set of its distinct values.

    A group whose only value is the empty string (the placeholder for
    "no class") yields an empty set; any other group is returned as-is.
    """
    distinct = set(classes.tolist())
    if distinct == {""}:
        return set()
    return distinct

# Collapse to one row per entity, with its classes gathered into a set, then
# persist the result. `entities` is rebound from the per-row frame to the
# aggregated one.
entities = (
    entities
    .groupby(entities["entity"])
    .agg(to_set)
    .reset_index()
)

entities.to_csv('entities.csv', index=False)

In [None]:
def get_s(row):
    """Return the subject of a result binding with the YAGO resource prefix removed."""
    return row["s"]["value"].replace(yago_prefix, "")


def get_o(row):
    """Return the object of a result binding with the YAGO resource prefix removed."""
    return row["o"]["value"].replace(yago_prefix, "")


def get_p(row):
    """Return the predicate IRI of a result binding unchanged."""
    return row["p"]["value"]


# Fetch every triple whose object is not a literal, excluding the predicates
# listed in the FILTER below.
# rdf:/rdfs:/owl: are declared explicitly so the query does not depend on
# namespaces pre-registered by the endpoint.
query = f"""
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX yago: <{yago_prefix}>
    PREFIX ys: <{yago_schema_prefix}>
    PREFIX shacl: <http://www.w3.org/ns/shacl#>

    SELECT ?s ?p ?o
    WHERE
    {{
        ?s ?p ?o
        FILTER (
            !IsLiteral(?o) &&
            ?p != ys:fromClass && ?p != ys:fromProperty &&
            ?p != rdf:type && ?p != rdfs:subClassOf && ?p != rdfs:subPropertyOf &&
            ?p != shacl:property && ?p != shacl:path && ?p != shacl:datatype &&
            ?p != owl:disjointWith && ?p != owl:sameAs
        )
    }}
"""

sparql.setMethod(POST)
sparql.setQuery(query)
rows = sparql.queryAndConvert()["results"]["bindings"]

# Build the (s, p, o) tuples in a single pass; an intermediate list of dicts
# over the full result set would only cost memory.
triples = [(get_s(t), get_p(t), get_o(t)) for t in rows]