# Analysing the ontology from the OpenAIRE Research Graph

We will try to understand how classes and properties from the OpenAIRE ontology are actually used in the Research Graph.

## Classes

### Listing all classes

Let's create a list of all classes in the Research Graph.

First, we will try to list all entities of type `rdfs:Class`.

In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Wrapper around the OpenAIRE SPARQL endpoint.
sparql = SPARQLWrapper("http://lod.openaire.eu/sparql")

# Ask for every resource explicitly typed as rdfs:Class.
query = """
SELECT ?class
WHERE {
    ?class a rdfs:Class.
}
"""

sparql.setQuery(query)
sparql.setReturnFormat(JSON)

results = sparql.query().convert()

# Plain list of the class URIs returned by this query; it will be necessary
# later, to exclude duplicates.
graph_classes_results_1 = [
    binding["class"]["value"] for binding in results["results"]["bindings"]
]

# One dictionary per class, initially holding only its URI.
graph_classes = [{"URI": class_uri} for class_uri in graph_classes_results_1]

We can try the same thing with `owl:Class`, but the results (only `owl:Thing`, `owl:Nothing` and blank nodes) are not relevant and can be discarded.

In [2]:
# List every resource typed as owl:Class.
query = """
SELECT ?class
WHERE {
    ?class a owl:Class.
}
"""

sparql.setQuery(query)
sparql.setReturnFormat(JSON)

results = sparql.query().convert()

# Only print the URIs; nothing from this query is stored.
for binding in results["results"]["bindings"]:
    class_uri = binding["class"]["value"]
    print(class_uri)

http://www.w3.org/2002/07/owl#Thing
http://www.w3.org/2002/07/owl#Nothing
nodeID://b10001
nodeID://b136512
nodeID://b1609135
nodeID://b1609139
nodeID://b1609142
nodeID://b1609145
nodeID://b1609148
nodeID://b1609151
nodeID://b1609154
nodeID://b1609157
nodeID://b1609160
nodeID://b1609163
nodeID://b1609169
nodeID://b1609173


Now, we will try to find other classes by looking for the objects of property `rdf:type`.

In [3]:
# Collect every distinct object of rdf:type, i.e. everything used as a class.
query = """
SELECT DISTINCT ?type
WHERE {
    ?s a ?type.
}
"""

sparql.setQuery(query)
sparql.setReturnFormat(JSON)

results = sparql.query().convert()

# Append only the types whose URI is not listed in graph_classes_results_1.
graph_classes.extend(
    {"URI": binding["type"]["value"]}
    for binding in results["results"]["bindings"]
    if binding["type"]["value"] not in graph_classes_results_1
)

### Checking counts of these classes

In [4]:
# Count the instances of every class and store the number under "cnt".
# The return format is the same for every query, so it is set once up front.
sparql.setReturnFormat(JSON)

for class_dictionary in graph_classes:
    # SPARQL 1.1 requires an aggregate that is projected under a new name to
    # be wrapped in parentheses: (COUNT(?s) AS ?cnt).
    query = f"""
    SELECT (COUNT(?s) AS ?cnt)
    WHERE {{
    ?s a <{class_dictionary["URI"]}>.
    }}
    """
    sparql.setQuery(query)
    results = sparql.query().convert()
    for result in results["results"]["bindings"]:
        # Convert with int() so that "cnt" is stored as a number.
        class_dictionary["cnt"] = int(result["cnt"]["value"])

### Looking for predicates and objects of these classes

In [5]:
# For every class, fetch all triples that have the class itself as subject
# and store each object under the URI of its predicate.
for class_dictionary in graph_classes:
    class_uri = class_dictionary["URI"]
    query = f"""
    SELECT *
    WHERE {{
        <{class_uri}> ?p ?o
    }}
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    for binding in results["results"]["bindings"]:
        predicate = binding["p"]["value"]
        value = binding["o"]["value"]
        if predicate not in class_dictionary:
            # First value for this predicate: store it directly, not in a list.
            class_dictionary[predicate] = value
            continue
        stored = class_dictionary[predicate]
        if not isinstance(stored, list):
            # Second value: promote the existing single entry to a list.
            stored = [stored]
            class_dictionary[predicate] = stored
        stored.append(value)

### Analyzing the instances of these classes

In [6]:
import re

# For every class, fetch its instances and record their distinct "base URIs"
# under "Examples (base URI)". The base URI is the instance URI cut after its
# last "/"; a value without any "/" is kept unchanged.
base_uri_pattern = re.compile(r"(.+/).*")
examples_key = "Examples (base URI)"

for class_dictionary in graph_classes:
    class_uri = class_dictionary["URI"]
    query = f"""
    SELECT ?s
    WHERE {{
        ?s a <{class_uri}>
    }}
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    for binding in results["results"]["bindings"]:
        base_uri = base_uri_pattern.sub(r"\1", binding["s"]["value"])
        if examples_key not in class_dictionary:
            # First example: stored directly, not in a list.
            class_dictionary[examples_key] = base_uri
            continue
        examples = class_dictionary[examples_key]
        if not isinstance(examples, list):
            # From the second instance on, the examples are kept in a list.
            examples = [examples]
            class_dictionary[examples_key] = examples
        if base_uri not in examples:
            examples.append(base_uri)

Let's import everything into a DataFrame and export as CSV.

In [7]:
import pandas as pd

# Build a table with one row per class dictionary and write it to
# "graph_classes.csv".
classes_df = pd.DataFrame(data=graph_classes)

classes_df.to_csv(path_or_buf="graph_classes.csv")

## Properties

Let's make a list of all properties present in the Research Graph, starting with instances of `rdf:Property`.

In [8]:
# Ask for every resource explicitly typed as rdf:Property.
query = """
SELECT ?property
WHERE {
    ?property a rdf:Property.
}
"""

sparql.setQuery(query)
sparql.setReturnFormat(JSON)

results = sparql.query().convert()

# Plain list of the property URIs returned by this query; it will be
# necessary later, to exclude duplicates.
graph_properties_results_1 = [
    binding["property"]["value"] for binding in results["results"]["bindings"]
]

# One dictionary per property, initially holding only its URI.
graph_properties = [
    {"URI": property_uri} for property_uri in graph_properties_results_1
]

Now, we will try to find other properties by looking for predicates of triples.

In [9]:
# Collect every distinct predicate that appears in any triple.
query = """
SELECT DISTINCT ?p
WHERE {
    ?s ?p ?o.
}
"""

sparql.setQuery(query)
sparql.setReturnFormat(JSON)

results = sparql.query().convert()

# Append only the predicates whose URI is not listed in
# graph_properties_results_1.
graph_properties.extend(
    {"URI": binding["p"]["value"]}
    for binding in results["results"]["bindings"]
    if binding["p"]["value"] not in graph_properties_results_1
)

### Checking counts of these properties

In [10]:
# Count the triples using every property and store the number under "cnt".
# The return format is the same for every query, so it is set once up front.
sparql.setReturnFormat(JSON)

for property_dictionary in graph_properties:
    # SPARQL 1.1 requires an aggregate that is projected under a new name to
    # be wrapped in parentheses: (COUNT(*) AS ?cnt).
    query = f"""
    SELECT (COUNT(*) AS ?cnt)
    WHERE {{
    ?s <{property_dictionary["URI"]}> ?o.
    }}
    """
    sparql.setQuery(query)
    results = sparql.query().convert()
    for result in results["results"]["bindings"]:
        # Convert with int() so that "cnt" is stored as a number.
        property_dictionary["cnt"] = int(result["cnt"]["value"])

### Looking for predicates and objects of these properties

In [11]:
# For every property, fetch all triples that have the property itself as
# subject and store each object under the URI of its predicate.
for property_dictionary in graph_properties:
    property_uri = property_dictionary["URI"]
    query = f"""
    SELECT *
    WHERE {{
        <{property_uri}> ?p ?o
    }}
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    for binding in results["results"]["bindings"]:
        predicate = binding["p"]["value"]
        value = binding["o"]["value"]
        if predicate not in property_dictionary:
            # First value for this predicate: store it directly, not in a list.
            property_dictionary[predicate] = value
            continue
        stored = property_dictionary[predicate]
        if not isinstance(stored, list):
            # Second value: promote the existing single entry to a list.
            stored = [stored]
            property_dictionary[predicate] = stored
        stored.append(value)

### Looking for domain and range of these properties

Let's first check the domain of these properties, i.e. the classes of the subjects they are actually used with.

In [12]:
# For every property, collect the distinct classes of the subjects it is
# used with and store them under "Domain".
for property_dictionary in graph_properties:
    property_uri = property_dictionary["URI"]
    query = f"""
    SELECT DISTINCT ?sclass
    WHERE {{
        ?s <{property_uri}> ?o.
        ?s a ?sclass.
    }}
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    for binding in results["results"]["bindings"]:
        subject_class = binding["sclass"]["value"]
        if "Domain" not in property_dictionary:
            # First class seen: stored directly, not in a list.
            property_dictionary["Domain"] = subject_class
            continue
        known = property_dictionary["Domain"]
        if isinstance(known, list):
            if subject_class not in known:
                known.append(subject_class)
        elif known != subject_class:
            # A second, different class turns the single entry into a list.
            property_dictionary["Domain"] = [known, subject_class]

Now let's check the range, i.e. the classes of the objects these properties point to.

In [13]:
# For every property, collect the distinct classes of the objects it points
# to and store them under "Range".
for property_dictionary in graph_properties:
    property_uri = property_dictionary["URI"]
    query = f"""
    SELECT DISTINCT ?oclass
    WHERE {{
        ?s <{property_uri}> ?o.
        ?o a ?oclass.
    }}
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    for binding in results["results"]["bindings"]:
        object_class = binding["oclass"]["value"]
        if "Range" not in property_dictionary:
            # First class seen: stored directly, not in a list.
            property_dictionary["Range"] = object_class
            continue
        known = property_dictionary["Range"]
        if isinstance(known, list):
            if object_class not in known:
                known.append(object_class)
        elif known != object_class:
            # A second, different class turns the single entry into a list.
            property_dictionary["Range"] = [known, object_class]

Let's also check if these properties have literals as objects.

In [14]:
# For every property, ask whether at least one triple using it has a literal
# as object, and store the boolean answer under "Literal?".
for property_dictionary in graph_properties:
    property_uri = property_dictionary["URI"]
    query = f"""
    ASK
    {{
        ?s <{property_uri}> ?o
        FILTER(isLiteral(?o))
    }}
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    has_literal_object = results["boolean"]
    property_dictionary["Literal?"] = has_literal_object

Let's import everything into a DataFrame and export as CSV.

In [None]:
import pandas as pd

# Build a table with one row per property dictionary and write it to
# "graph_properties.csv".
properties_df = pd.DataFrame(data=graph_properties)

properties_df.to_csv(path_or_buf="graph_properties.csv")