In [1]:
from rdflib import Graph, URIRef

from pathlib import Path

In [11]:
# Name of the knowledge-graph folder under ./kgs that this notebook processes.
DATASET = "YAGO4-20-C-MATERIALIZE"

# Every path is derived from the directory the kernel was started in.
home_path = Path.cwd().absolute()
datasets_path = home_path.joinpath("kgs")
dataset_path = datasets_path.joinpath(DATASET)
# N-Triples file holding the training split of the ABox.
triples_path = dataset_path.joinpath("abox/splits/train.nt")

In [12]:
# Read the N-Triples training split into a fresh in-memory graph.
# parse() is left as the last expression so its return value is still displayed.
g = Graph()
g.parse(source=triples_path, format="nt")

<Graph identifier=N000958953cc748c1a795a06f81d8d5e7 (<class 'rdflib.graph.Graph'>)>

In [13]:
# Drop every triple whose object is owl:AnnotationProperty or owl:Class.
schema_objects = (
    URIRef("http://www.w3.org/2002/07/owl#AnnotationProperty"),
    URIRef("http://www.w3.org/2002/07/owl#Class"),
)

for schema_object in schema_objects:
    # g.triples() is a lazy generator over the store, so the matches are
    # materialised first: deleting from the graph while that generator is
    # still running would mutate the very index being iterated.
    matching = list(g.triples((None, None, schema_object)))
    for s, p, o in matching:
        print(s, p, o)
        g.remove((s, p, o))

http://schema.org/superEvent http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://www.w3.org/2002/07/owl#AnnotationProperty
http://schema.org/isPartOf http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://www.w3.org/2002/07/owl#AnnotationProperty
http://schema.org/containsPlace http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://www.w3.org/2002/07/owl#AnnotationProperty
http://schema.org/locationCreated http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://www.w3.org/2002/07/owl#AnnotationProperty
http://schema.org/familyName http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://www.w3.org/2002/07/owl#AnnotationProperty
http://schema.org/director http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://www.w3.org/2002/07/owl#AnnotationProperty
http://schema.org/copyrightHolder http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://www.w3.org/2002/07/owl#AnnotationProperty
http://schema.org/alumniOf http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://www.w3.org/2002/07/owl#Anno

In [14]:
# Write the filtered graph back over the N-Triples file it was loaded from.
g.serialize(destination=triples_path, format="nt")

from tqdm import tqdm

# Also dump the same triples as one tab-separated "s<TAB>p<TAB>o" line each.
tsv_path = dataset_path / "abox/splits/train.tsv"
with open(tsv_path, mode="w", encoding="utf-8") as tsv_file:
    tsv_file.writelines(f"{s}\t{p}\t{o}\n" for s, p, o in tqdm(g))

100%|██████████| 627353/627353 [00:02<00:00, 250360.79it/s]


In [15]:
# Collect every IRI that occurs in subject or object position of a triple;
# terms that are not URIRef instances are skipped.
inds = {
    term
    for subj, _pred, obj in g
    for term in (subj, obj)
    if isinstance(term, URIRef)
}

In [18]:
# Namespaces that are stripped from an IRI to obtain a short label.
# Defined once, outside the loop, because the list never changes per IRI.
prefixes = [
    "http://yago-knowledge.org/resource/",
    "http://schema.org/",
    "http://bioschemas.org/"
]

# Maps each collected IRI to its shortened label.
inds_labels = {}
print(len(inds))
for ind in inds:
    label = str(ind)

    # The first namespace that matches wins.
    for prefix in prefixes:
        if label.startswith(prefix):
            # Cut off only the leading namespace; str.replace would also
            # delete any later occurrence of the same text inside the IRI.
            label = label[len(prefix):]
            break
        if label == prefix[:-1]:
            # The IRI is the namespace itself minus its last character
            # (the trailing slash): keep its last path segment instead.
            label = label.split("/")[-1]
            break

    # Surface labels that still contain a URL scheme after shortening.
    if "http://" in label:
        print(label)

    inds_labels[ind] = label

93213


In [None]:
# Namespaces that are stripped from an IRI to obtain a short label.
# Defined once, outside the loop, because the list never changes per IRI.
# NOTE(review): this cell rebinds inds_labels using DBpedia namespaces; it
# looks like an alternative to the other labelling cells - confirm which one
# is meant to run for the selected DATASET.
prefixes = [
    "http://dbpedia.org/resource/",
    "http://dbpedia.org/ontology/"
]

# Maps each collected IRI to its shortened label.
inds_labels = {}
print(len(inds))
for ind in inds:
    label = str(ind)

    # The first namespace that matches wins.
    for prefix in prefixes:
        if label.startswith(prefix):
            # Cut off only the leading namespace; str.replace would also
            # delete any later occurrence of the same text inside the IRI.
            label = label[len(prefix):]
            break
        if label == prefix[:-1]:
            # The IRI is the namespace itself minus its last character
            # (the trailing slash): keep its last path segment instead.
            label = label.split("/")[-1]
            break

    # Surface labels that still contain a URL scheme after shortening.
    if "http://" in label:
        print(label)

    inds_labels[ind] = label

In [None]:
# Namespaces that are stripped from an IRI to obtain a short label.
# Defined once, outside the loop, because the list never changes per IRI.
# Order matters: the first matching entry wins, so a more specific namespace
# (e.g. ".../TI/DayOfWeek/") must stay ahead of its parent (".../TI/").
prefixes = [
    "https://w3id.org/italia/onto/SM/Review/",
    "https://w3id.org/italia/onto/SM/ContactPoint/",
    "https://w3id.org/italia/onto/CLV/StreetToponym/",
    "https://w3id.org/italia/onto/CLV/StreetNumber/",
    "https://w3id.org/italia/onto/CLV/Province/",
    "https://w3id.org/italia/onto/CLV/Address/",
    "https://w3id.org/italia/onto/POI/PointOfInterest/",
    "https://w3id.org/italia/onto/CLV/OpeningHours/",
    "https://w3id.org/italia/onto/SM/Email/",
    "https://w3id.org/italia/onto/CLV/Geometry/",
    "https://w3id.org/italia/onto/SM/WebSite/",
    "https://w3id.org/italia/onto/CLV/City/",
    "https://apuliatravel.org/td/",
    "https://w3id.org/italia/onto/l0/Object/",
    "https://w3id.org/italia/onto/SM/Rating/",
    "https://w3id.org/italia/onto/CLV/AddressArea/",
    "https://w3id.org/italia/onto/TI/DayOfWeek/",
    "https://w3id.org/italia/onto/TI/",
    "https://w3id.org/italia/onto/l0/Entity/",
    "https://w3id.org/italia/onto/AccessCondition/AccessCondition/",
    "https://w3id.org/italia/onto/CLV/Feature/",
    "https://w3id.org/italia/onto/SM/OnlineContactPoint/",
    "https://w3id.org/italia/onto/CLV/SpatialObject/",
    "https://w3id.org/italia/onto/l0/Location/",
    "https://w3id.org/italia/onto/AccessCondition/OpeningHoursSpecification/",
    "https://w3id.org/italia/onto/SM/Post/",
    "https://w3id.org/italia/onto/CLV/AdminUnitComponent/",
    # NOTE(review): unlike every other entry this one has no trailing slash,
    # so an IRI equal to it gets an empty label - confirm whether intended.
    "https://w3id.org/italia/onto/CLV/CivicNumbering",
    "https://w3id.org/italia/onto/AccessCondition/mandatoryBooking/",
    "https://w3id.org/italia/onto/CLV/AddressComponent/",
    "https://w3id.org/italia/onto/l0/Description/",
    "https://w3id.org/italia/onto/l0/Characteristic/"
]

# Maps each collected IRI to its shortened label.
inds_labels = {}

for ind in inds:
    label = str(ind)

    # The first namespace that matches wins.
    for prefix in prefixes:
        if label.startswith(prefix):
            # Cut off only the leading namespace; str.replace would also
            # delete any later occurrence of the same text inside the IRI.
            label = label[len(prefix):]
            break
        if label == prefix[:-1]:
            # The IRI is the namespace itself minus its last character
            # (normally the trailing slash): keep its last path segment.
            label = label.split("/")[-1]
            break

    # Surface labels that still contain a URL scheme after shortening.
    if "https://" in label:
        print(label)

    inds_labels[ind] = label

In [1]:
import json


# Load the label mapping from the working directory. The encoding is given
# explicitly so decoding does not depend on the platform's locale default.
with open("ind_labels.json", "r", encoding="utf-8") as f:
    inds_labels = json.load(f)

In [2]:
# Keep only the first entry of each value.
# NOTE(review): values are presumably lists of candidate labels - confirm
# against whatever produced ind_labels.json.
# The list check makes this cell safe to re-run: the result is saved back to
# the same file, so on a later run the values are already plain strings and an
# unconditional v[0] would cut each label down to its first character.
inds_labels = {
    k: (v[0] if isinstance(v, list) else v)
    for k, v in inds_labels.items()
}

In [3]:
import json


# Overwrite ind_labels.json with the current mapping. The encoding is given
# explicitly so the file does not depend on the platform's locale default.
with open("ind_labels.json", "w", encoding="utf-8") as f:
    json.dump(inds_labels, f)

In [None]:
# Distinct terms that appear in predicate position anywhere in the graph.
props = set(triple[1] for triple in g)

In [None]:
# Full IRI of rdf:type, which gets the fixed label "type".
rdf_type_iri = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"

# Maps each predicate IRI (as a plain string) to a short label.
prop_labels = {}

for prop in props:
    # Compare as plain strings: an rdflib URIRef never compares equal to a
    # built-in str, so testing `prop == "<iri>"` directly is always False and
    # the "type" label would never be assigned.
    prop_iri = str(prop)
    if prop_iri == rdf_type_iri:
        prop_labels[prop_iri] = "type"
    else:
        # Otherwise use whatever follows the last "/" of the IRI.
        prop_labels[prop_iri] = prop_iri.split("/")[-1]