In [1]:
import os

from rdfframes.knowledge_graph import KnowledgeGraph
from rdfframes.dataset.rdfpredicate import RDFPredicate
from rdfframes.utils.constants import JoinType
from rdfframes.client.http_client import HttpClientDataFormat, HttpClient

In [3]:
# Handle on the DBpedia graph. The prefixes declared here are the short
# names used in the predicate strings of the queries built from `graph`.
graph = KnowledgeGraph(
    graph_uri='http://dbpedia.org',
    prefixes={
        'dcterms': 'http://purl.org/dc/terms/',
        'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
        'dbpprop': 'http://dbpedia.org/property/',
        'dbpr': 'http://dbpedia.org/resource/',
    },
)

# Base dataset: (film, actor) pairs linked by dbpprop:starring, expanded with
# the actor's birth place and label and the film's label, subject and country.
# cache() is called on it before it is reused by the three queries below.
dataset = (
    graph.feature_domain_range('dbpprop:starring', domain_col_name='film', range_col_name='actor')
    .expand('actor', [
        RDFPredicate('dbpprop:birthPlace', 'actor_country'),
        RDFPredicate('rdfs:label', 'actor_name'),
    ])
    .expand('film', [
        RDFPredicate('rdfs:label', 'film_name'),
        RDFPredicate('dcterms:subject', 'subject'),
        RDFPredicate('dbpprop:country', 'film_country'),
    ])
    .cache()
)

# Rows whose actor_country matches the regex "USA".
# Recorded run: 26928 rows, 4273 msec.
american_actors = dataset.filter(
    {'actor_country': ['regex(str(?actor_country), "USA")']}
)

# Actors with between 20 and 30 distinct films (both bounds inclusive).
# Recorded run: 1606 rows, 7659 msec.
prolific_actors = (
    dataset.group_by(['actor'])
    .count('film', 'film_count', unique=True)
    .filter({'film_count': ['>= 20', '<=30']})
)

# Combine both actor sets on `actor`, join back to the base dataset and keep
# only the three columns of interest.
# Recorded run: 663,769 rows, 76704 msec.
# NOTE(review): the first join is an outer join, so actors that satisfy only
# one of the two conditions are presumably kept as well -- confirm that this
# is intended rather than an inner join.
films = (
    american_actors
    .join(prolific_actors, join_col_name1='actor', join_type=JoinType.OuterJoin)
    .join(dataset, join_col_name1='actor')
    .select_cols(["film_name", "actor_name", "subject"])
)

# Render the composed dataset as a SPARQL query string and show it, so the
# generated query can be inspected before it is sent anywhere.
sparql_query = films.to_sparql()

print(sparql_query)

# SPARQL endpoint to query. The default is the private-network address this
# notebook was originally run against; set the SPARQL_ENDPOINT environment
# variable to point the notebook at another endpoint without editing code.
endpoint = os.environ.get('SPARQL_ENDPOINT', 'http://10.161.202.101:8890/sparql/')
output_format = HttpClientDataFormat.PANDAS_DF

# Client used to execute queries against `endpoint`, configured with the
# PANDAS_DF return format.
client = HttpClient(endpoint_url=endpoint, return_format=output_format)

PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dbpprop: <http://dbpedia.org/property/>
PREFIX dbpr: <http://dbpedia.org/resource/>
SELECT ?film_name ?actor_name ?subject 
FROM <http://dbpedia.org>
WHERE {
	?film dbpprop:starring ?actor .
	?actor dbpprop:birthPlace ?actor_country .
	?actor rdfs:label ?actor_name .
	?film rdfs:label ?film_name .
	?film dcterms:subject ?subject .
	?film dbpprop:country ?film_country .


		{
		SELECT * 
		WHERE {
			?film dbpprop:starring ?actor .
			?actor dbpprop:birthPlace ?actor_country .
			?actor rdfs:label ?actor_name .
			?film rdfs:label ?film_name .
			?film dcterms:subject ?subject .
			?film dbpprop:country ?film_country .
		
		
			FILTER (regex(str(?actor_country), "USA") ) 
		}
		
		}
		UNION
		{
		SELECT ?actor  (COUNT(DISTINCT ?film) AS ?film_count) 
		WHERE {
			?film dbpprop:starring ?actor .
			?actor dbpprop:birthPlace ?actor_country .
			?actor rdfs:label ?actor_name .
			?film r

In [None]:
# Execute the composed query through the HTTP client and print the result.
# Recorded result shape: [663769 rows x 8 columns]
df = films.execute(
    client,
    return_format=output_format,
)
print(df)

time of the query preparation 0.0011327266693115234
