# Parsing a SPARQL query to extract basic graph patterns

Functions to parse a SPARQL query and extract the basic graph patterns (BGPs) it contains, so that they can be loaded into an RDFLib graph. Federated queries (`SERVICE` clauses) are supported.

In [2]:
from sparql_llm.validate_sparql import validate_sparql_with_void

# sparql_query = """
# PREFIX up: <http://purl.uniprot.org/core/>
# PREFIX rh: <http://rdf.rhea-db.org/>
# SELECT ?uniprotCount ?rhea ?accession ?equation
# WHERE {
#   SERVICE <https://sparql.uniprot.org/sparql> {
#       SELECT ?rhea (count(?uniprot) as ?uniprotCount) {
#           ?uniprot up:annotation ?rhea .
#           ?uniprot up:toast ?toooast .
#       }
#       GROUP BY ?rhea
#   }
#   ?rhea rh:accession ?accession .
#   ?rhea rh:equation ?equation .
# }
# """

# Hard query: a federated SELECT spanning three SERVICE blocks
# (sparql.uniprot.org, sparql.omabrowser.org and www.bgee.org), used as validator input.
# NOTE(review): some predicates look intentionally wrong so that the validator has
# something to report -- the in-query hints suggest the intended terms
# (`# skos:prefLabel` next to `rdfs:label`, `# 79` next to `sio:SIO_010078`), and
# `up:scientificNam` in the OMA block lacks the trailing "e" used in the UniProt block.
# Confirm this is deliberate before "fixing" any of them.
# Everything between the triple quotes, including the `#` comments, is part of the
# string literal and is sent as-is.
sparql_query = """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX up:<http://purl.uniprot.org/core/>
PREFIX taxon:<http://purl.uniprot.org/taxonomy/>
PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
PREFIX orth:<http://purl.org/net/orth#>
PREFIX dcterms:<http://purl.org/dc/terms/>
PREFIX obo:<http://purl.obolibrary.org/obo/>
PREFIX lscr:<http://purl.org/lscr#>
PREFIX genex:<http://purl.org/genex#>
PREFIX sio: <http://semanticscience.org/resource/>
SELECT DISTINCT ?diseaseLabel ?humanProtein ?hgncSymbol ?orthologRatProtein ?orthologRatGene
WHERE {
    SERVICE <https://sparql.uniprot.org/sparql> {
        SELECT DISTINCT * WHERE {
            ?humanProtein a up:Protein ;
                up:organism/up:scientificName 'Homo sapiens' ;
                up:annotation ?annotation ;
                rdfs:seeAlso ?hgnc .
            ?hgnc up:database <http://purl.uniprot.org/database/HGNC> ;
                rdfs:label ?hgncSymbol . # comment
            ?annotation a up:Disease_Annotation ;
                up:disease ?disease .
            ?disease a up:Disease ;
                rdfs:label ?diseaseLabel . # skos:prefLabel
            FILTER CONTAINS(?diseaseLabel, "cancer")
        }
    }
    SERVICE <https://sparql.omabrowser.org/sparql/> {
        SELECT ?humanProtein ?orthologRatProtein ?orthologRatGene WHERE {
            ?humanProteinOma a orth:Protein ;
                lscr:xrefUniprot ?humanProtein .
            ?orthologRatProtein a orth:Protein ;
                sio:SIO_010078 ?orthologRatGene ; # 79
                orth:organism/obo:RO_0002162/up:scientificNam 'Rattus norvegicus' .
            ?cluster a orth:OrthologsCluster .
            ?cluster orth:hasHomologousMember ?node1 .
            ?cluster orth:hasHomologousMember ?node2 .
            ?node1 orth:hasHomologousMember* ?humanProteinOma .
            ?node2 orth:hasHomologousMember* ?orthologRatProtein .
            FILTER(?node1 != ?node2)
        }
    }
    SERVICE <https://www.bgee.org/sparql/> {
        ?orthologRatGene genex:isExpressedIn ?anatEntity ;
            orth:organism ?ratOrganism .
        ?anatEntity rdfs:label 'brain' .
        ?ratOrganism obo:RO_0002162 taxon:10116 .
    }
}
"""

# Run the validator on the query, passing the UniProt endpoint URL as the second
# argument, then show each reported issue on its own line.
issues = validate_sparql_with_void(sparql_query, "https://sparql.uniprot.org/sparql")
print(*issues, sep="\n")

# sparql_query = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
# PREFIX up: <http://purl.uniprot.org/core/>
# PREFIX genex: <http://purl.org/genex#>
# PREFIX obo: <http://purl.obolibrary.org/obo/>
# PREFIX orth: <http://purl.org/net/orth#>
# PREFIX dcterms: <http://purl.org/dc/terms/>
# PREFIX sio: <http://semanticscience.org/resource/>

# SELECT DISTINCT ?flyEnsemblGene ?orthologTaxon ?orthologEnsemblGene ?orthologOmaLink WHERE {
# 	{
#         SELECT DISTINCT ?gene ?flyEnsemblGene {
#         ?gene a orth:Gene ;
#             genex:isExpressedIn/rdfs:label 'eye' ;
#             orth:organism/obo:RO_0002162 ?taxon ;
#             dcterms:identifier ?flyEnsemblGene .
#         ?taxon up:commonName 'fruit fly' .
#         } LIMIT 100
#     }
#     SERVICE <https://sparql.omabrowser.org/sparql> {
#         ?protein2 a orth:Protein .
#         ?protein1 a orth:Protein .
#         ?clusterPrimates a orth:OrthologsCluster .
#         ?cluster a orth:OrthologsCluster ;
#             orth:hasHomologousMember ?node1 ;
#             orth:hasHomologousMember ?node2 .
#         ?node1 orth:hasHomologousMember* ?protein1 .
#         ?node2 orth:hasHomologousMember* ?clusterPrimates .
#         ?clusterPrimates orth:hasHomologousMember* ?protein2 .
#         ?protein1 sio:SIO_010079 ?gene . # is encoded by
#         ?protein2 rdfs:seeAlso ?orthologOmaLink ;
#             orth:organism/obo:RO_0002162 ?orthologTaxonUri ;
#             sio:SIO_010079 ?orthologGene . # is encoded by
#         ?clusterPrimates orth:hasTaxonomicRange ?taxRange .
#         ?taxRange orth:taxRange 'Primates' .
#         FILTER ( ?node1 != ?node2 )
#     }
#     ?orthologTaxonUri up:commonName ?orthologTaxon .
#     ?orthologGene dcterms:identifier ?orthologEnsemblGene .
# }"""

# try:
#     validate_sparql_with_void(sparql_query, "https://www.bgee.org/sparql")
# except Exception as e:
#     print(e)


# # Failing query:
# sparql_query = """PREFIX up: <http://purl.uniprot.org/core/>
# PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
# PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
# SELECT ?protein ?sequence
# WHERE {
#     ?protein a up:Protein .
#     ?protein up:reviewed true .
#     ?protein up:organism taxon:10090 .
#     ?protein up:matureProtein ?matureProtein .
#     ?matureProtein rdf:value ?sequence .
#     FILTER (STRSTARTS(?sequence, "G")) .
# }"""


# print("\n".join(error_msgs))
# print(len(error_msgs))
# qres = g.query("""SELECT ?s ?p ?o WHERE {
#     ?s ?p ?o
# } LIMIT 10""")
# for row in qres:
#     print(f"{row.s} knows {row.o}")

Subject ?hgnc in endpoint https://sparql.uniprot.org/sparql does not support the predicate rdfs:label. Correct predicate might be one of the following: owl:sameAs, rdfs:seeAlso, up:transcribedFrom, up:translatedTo, rdfs:comment, up:database (we inferred this variable might be of the type up:Transcript_Resource)
Subject ?disease with type up:Disease in endpoint https://sparql.uniprot.org/sparql does not support the predicate rdfs:label. It can have the following predicates: skos:altLabel, rdfs:comment, up:mnemonic, skos:prefLabel, rdfs:seeAlso
Subject ?orthologRatProtein with type orth:Protein in endpoint https://sparql.omabrowser.org/sparql/ does not support the predicate sio:SIO_010078. It can have the following predicates: lscr:xrefUniprot, orth:organism, obo:RO_0001018, lscr:xrefEnsemblProtein, lscr:xrefEnsemblTranscript, sio:SIO_010079, lscr:xrefSwissProt, rdfs:label, dc:identifier, rdfs:comment, lscr:xrefNCBIRefSeq, lscr:xrefNCBIProtein, rdfs:seeAlso, lscr:xrefEMBLSequence
Subject

In [2]:
import json

from sparql_llm.validate_sparql import sparql_query_to_dict

# Query whose second triple pattern uses a property path combining alternatives (`|`)
# with a sequence (`/`); it is the input handed to `sparql_query_to_dict` in this cell.
path_query = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>
SELECT ?protein ?anyKindOfName WHERE {
		?protein a up:Protein .
		?protein (up:recommendedName|up:alternativeName)|((up:domain|up:component)/(up:recommendedName|up:alternativeName)) ?structuredName .
		?structuredName ?anyKindOfName  "HLA class I histocompatibility antigen, B-73 alpha chain" .
		?anyKindOfName rdfs:subPropertyOf up:structuredNameType .
}"""

# Empty placeholder for pasting another query to try.
# NOTE(review): this rebinds `sparql_query` to an empty string, replacing any query
# assigned to that name earlier in the session -- confirm nothing run afterwards
# still expects the earlier value.
sparql_query = """"""

# Convert the property-path query into a dictionary for the UniProt endpoint.
# To inspect the other query instead, swap in the commented call:
# query_dict = sparql_query_to_dict(sparql_query, "https://sparql.uniprot.org/sparql")
query_dict = sparql_query_to_dict(path_query, "https://sparql.uniprot.org/sparql")

# Pretty-print with a 2-space indent. The recorded output nests
# endpoint URL -> subject -> predicate -> list of objects.
# Plain dump for quick debugging: print(query_dict)
query_dict_json = json.dumps(query_dict, indent=2)
print(query_dict_json)

{
  "https://sparql.uniprot.org/sparql": {
    "?anyKindOfName": {
      "http://www.w3.org/2000/01/rdf-schema#subPropertyOf": [
        "http://purl.uniprot.org/core/structuredNameType"
      ]
    },
    "?protein": {
      "http://www.w3.org/1999/02/22-rdf-syntax-ns#type": [
        "http://purl.uniprot.org/core/Protein"
      ],
      "http://purl.uniprot.org/core/recommendedName": [
        "?structuredName"
      ],
      "http://purl.uniprot.org/core/alternativeName": [
        "?structuredName"
      ],
      "http://purl.uniprot.org/core/domain": [
        "?pathVar2"
      ],
      "http://purl.uniprot.org/core/component": [
        "?pathVar2"
      ]
    },
    "?structuredName": {
      "?anyKindOfName": [
        "HLA class I histocompatibility antigen, B-73 alpha chain"
      ]
    },
    "?pathVar2": {
      "http://purl.uniprot.org/core/recommendedName": [
        "?structuredName"
      ],
      "http://purl.uniprot.org/core/alternativeName": [
        "?structuredNam

In [3]:
# from rdflib.plugins.sparql import prepareQuery

# NOTE: the rdflib parser has a bug when SERVICE clauses are nested, see https://github.com/RDFLib/rdflib/issues/2136
# query = """
# PREFIX wikibase: <http://wikiba.se/ontology#>
# PREFIX wdt: <http://www.wikidata.org/prop/direct/>
# PREFIX bd: <http://www.bigdata.com/rdf#>
# PREFIX wd: <http://www.wikidata.org/entity/>
# PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
# SELECT ?item ?pic
# WHERE
# {
#     SERVICE <https://query.wikidata.org/sparql> {
#         ?item wdt:P31 wd:Q146 .
# 		?item wdt:P18 ?pic
#         SERVICE wikibase:label {
#             bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en".
#         }
#     }
# }

# """
# parsed_query = parseQuery(query)
# print(sq.algebra.name)
# print(prepareQuery(toast_query).algebra)
# triples = extract_triples(prepareQuery(sparql_query).algebra)

# print(triples)
# print(len(triples))