Skip to content

Commit

Permalink
rdflib utility: sparql_results_to_df
Browse files Browse the repository at this point in the history
  • Loading branch information
dhimmel committed Apr 24, 2022
1 parent e131f73 commit e38c757
Show file tree
Hide file tree
Showing 5 changed files with 120 additions and 4 deletions.
Empty file.
62 changes: 62 additions & 0 deletions nxontology_data/tests/utils_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import pytest
import rdflib

from nxontology_data.utils import sparql_results_to_df


@pytest.fixture
def rdflib_foaf_graph() -> rdflib.Graph:
    """
    Provide rdflib's example FOAF (Friend of a Friend) graph for testing.

    The N3 file is parsed from a URL pinned to a specific rdflib commit,
    so using this fixture requires network access.
    """
    foaf_url = "https://github.com/RDFLib/rdflib/raw/56dc4207ce6e7b11ed7b45fb4fd4020ba548e718/examples/foaf.n3"
    return rdflib.Graph().parse(source=foaf_url, format="n3")


# SPARQL query used by test_sparql_results_to_df.
# Triples are grouped by ?subject and ?subject_is_tim (whether the subject
# equals the Berners-Lee card URI). Each group reports its triple count, the
# minimum predicate, and a sample of ?missing, which comes from an OPTIONAL
# pattern on <this_predicate_does_not_exist>.
# Rows are ordered by descending triple count, then by subject, and limited to 10.
_foaf_sparql = """\
SELECT
?subject
?subject_is_tim
(COUNT(*) AS ?n_triples)
(MIN(?predicate) AS ?sample_predicate)
(SAMPLE(?missing) AS ?missing)
WHERE {
?subject ?predicate ?object.
BIND(?subject = <http://www.w3.org/People/Berners-Lee/card#i> AS ?subject_is_tim)
OPTIONAL {?subject <this_predicate_does_not_exist> ?missing .}
}
GROUP BY ?subject ?subject_is_tim
ORDER BY DESC(?n_triples) ?subject
LIMIT 10
"""


@pytest.mark.slow
def test_sparql_results_to_df(rdflib_foaf_graph: rdflib.Graph) -> None:
    """
    Run the FOAF query and check the resulting DataFrame's row count,
    column names and order, and the Python types of the first row.
    """
    expected_columns = [
        "subject",
        "subject_is_tim",
        "n_triples",
        "sample_predicate",
        "missing",
    ]
    rdf_type_uri = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
    df = sparql_results_to_df(rdflib_foaf_graph.query(_foaf_sparql))
    # the query is limited to 10 rows
    assert df.shape[0] == 10
    # column names are plain strings without the "?" prefix, in SELECT order
    assert df.columns.tolist() == expected_columns
    top = next(df.itertuples())
    # URI subject is converted to str
    assert top.subject == "http://www.w3.org/People/Berners-Lee/card#i"
    # boolean literal is converted to bool
    assert top.subject_is_tim is True
    # integer literal is converted to int
    assert top.n_triples == 61
    # URI predicate is converted to str
    assert top.sample_predicate == rdf_type_uri
    # unbound variable is represented as None
    assert top.missing is None
14 changes: 14 additions & 0 deletions nxontology_data/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import pandas as pd
from rdflib.plugins.sparql.processor import SPARQLResult


def sparql_results_to_df(results: SPARQLResult) -> pd.DataFrame:
    """
    Convert the results of an rdflib SPARQL query to a `pandas.DataFrame`.

    Column names are the string form of each entry in `results.vars`.
    Each value in a row is converted with its `toPython()` method, and
    values that are None are kept as None.

    Background: https://github.com/RDFLib/rdflib/issues/1179
    and https://github.com/RDFLib/sparqlwrapper/issues/205.
    """
    column_names = list(map(str, results.vars))
    records = []
    for row in results:
        converted = []
        for term in row:
            # None values have no toPython() method, so pass them through
            converted.append(term.toPython() if term is not None else None)
        records.append(converted)
    return pd.DataFrame(data=records, columns=column_names)
45 changes: 41 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ openpyxl = "^3.0.9"
pandas = "^1.4.1"
papermill = "^2.3.3"
requests = "^2.26.0"
rdflib = "^6.1.1"
fsspec = "^2022.3.0"

[tool.poetry.dev-dependencies]
pre-commit = "^2.15.0"
Expand All @@ -51,6 +53,7 @@ module = [
"fire.*",
"networkx.*",
"pandas.*",
"rdflib.*",
"requests.*",
]
ignore_missing_imports = true

0 comments on commit e38c757

Please sign in to comment.