In [1]:
import json

import requests
from SPARQLWrapper import JSON, SPARQLWrapper

# Download the BioSODA question catalogue; the loop below reads its "questions"
# list (SPARQL templates plus the default values of their variables).
_queries_response = requests.get(
    "https://raw.githubusercontent.com/biosoda/bioquery/master/biosoda_frontend/src/biosodadata.json",
    timeout=20,
)
# Fail fast with the HTTP status instead of an opaque JSON decoding error when
# the file is missing or the host answers with an error page.
_queries_response.raise_for_status()
queries_obj = _queries_response.json()

# print(json.dumps(queries_obj, indent=2))

# Lookup templates keyed by datasource name. Despite the verb-like name this is
# a plain dict, not a function. Each entry holds:
#   "fetchUrl":     endpoint URL template containing a $$query$$ placeholder
#   "fetchQuery":   SPARQL template; $$searchString$$ (and $$extra_0$$ in
#                   "anatomicentities_extraspecies") are placeholders
#   "question":     natural-language description of what the query returns
#   "queryHeaders": only on the www.bgee.org entries; presumably the HTTP
#                   headers to send with the request (verify against caller)
# Nothing in the code visible in this cell reads this dict or fills in its
# placeholders.
# NOTE(review): the "$$&limit=limit$$" fragment in the fetchUrl values looks
# like a malformed "$$limit$$" placeholder -- confirm against the upstream
# biosodadata.json before relying on it.
get_datasources = {
    "species": {
        "fetchUrl": "https://sparql.omabrowser.org/lode/servlet/query?query=$$query$$&format=JSON$$&limit=limit$$&offset=$$offset$$&inference=false",
        "fetchQuery": 'PREFIX up: <http://purl.uniprot.org/core/>\nSELECT DISTINCT ?value WHERE {\n?s a up:Taxon .\n?s up:scientificName ?value .\n?s up:rank up:Species .\nFILTER(CONTAINS(lcase(str(?value)), "$$searchString$$"))\n} LIMIT 100',
        "question": "Which species are available?",
    },
    "uniprotmnemonics": {
        "fetchUrl": "https://sparql.uniprot.org/sparql?query=$$query$$&format=JSON$$&limit=limit$$&offset=$$offset$$&inference=false",
        "fetchQuery": 'PREFIX up: <http://purl.uniprot.org/core/>\nSELECT ?value\nWHERE\n{\n ?protein a up:Protein .\n ?protein up:mnemonic ?value\nFILTER(CONTAINS(lcase(str(?value)), "$$searchString$$"))\n} LIMIT 100',
        "question": "What are the UniProt mnemonics matching the search string?",
    },
    "uniprottaxons": {
        "fetchUrl": "https://sparql.uniprot.org/sparql?query=$$query$$&format=JSON$$&limit=limit$$&offset=$$offset$$&inference=false",
        "fetchQuery": 'PREFIX up: <http://purl.uniprot.org/core/>\nSELECT DISTINCT ?name ?value\nWHERE\n{\n?value a up:Taxon ;\nup:scientificName ?name\nFILTER(CONTAINS(lcase(str(?name)), "$$searchString$$"))\n} LIMIT 100',
        "question": "What are the taxonomic names and their corresponding IRIs?",
    },
    "uniprotnaturalannotations": {
        "fetchUrl": "https://sparql.uniprot.org/sparql?query=$$query$$&format=JSON$$&limit=limit$$&offset=$$offset$$&inference=false",
        "fetchQuery": "PREFIX up: <http://purl.uniprot.org/core/>\nPREFIX taxon: <http://purl.uniprot.org/taxonomy/>\nPREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\nSELECT ?value\nWHERE\n{\n?protein a up:Protein .\n?protein up:organism taxon:9606 .\n?protein up:annotation ?annotation .\n?annotation a up:Natural_Variant_Annotation .\n?annotation rdfs:comment ?value .\nFILTER (CONTAINS(?value, '$$searchString$$'))\n} LIMIT 100",
        "question": "What are the natural variant annotations for proteins in humans matching the search string?",
    },
    "ensemblgenes": {
        "fetchUrl": "https://sparql.omabrowser.org/lode/servlet/query?query=$$query$$&format=JSON$$&limit=limit$$&offset=$$offset$$&inference=false",
        "fetchQuery": 'PREFIX dct: <http://purl.org/dc/terms/>\nPREFIX sio: <http://semanticscience.org/resource/>\nPREFIX lscr: <http://purl.org/lscr#>\nSELECT ?value\nWHERE {\n?protein1 sio:SIO_010079/lscr:xrefEnsemblGene ?ens .\n\n?ens dct:identifier ?value . FILTER(CONTAINS(lcase(str(?ens)), "$$searchString$$"))\n} LIMIT 100',
        "question": "What are the Ensembl gene identifiers matching the search string?",
    },
    "omaproteins": {
        "fetchUrl": "https://sparql.omabrowser.org/lode/servlet/query?query=$$query$$&format=JSON$$&limit=limit$$&offset=$$offset$$&inference=false",
        "fetchQuery": 'PREFIX orth: <http://purl.org/net/orth#>\nPREFIX dct: <http://purl.org/dc/terms/>\nPREFIX sio: <http://semanticscience.org/resource/>\nPREFIX lscr: <http://purl.org/lscr#>\nSELECT ?value\nWHERE {\n?protein1 a orth:Protein .\n?protein1 dct:identifier ?value . FILTER(CONTAINS(lcase(str(?value)), "$$searchString$$"))\n} LIMIT 100',
        "question": "Which OMA protein identifiers match the search string?",
    },
    "taxlevels": {
        "fetchUrl": "https://sparql.omabrowser.org/lode/servlet/query?query=$$query$$&format=JSON$$&limit=limit$$&offset=$$offset$$&inference=false",
        "fetchQuery": 'PREFIX orth: <http://purl.org/net/orth#>\nSELECT DISTINCT ?value\nWHERE {\n?taxRange orth:taxRange ?value .\nFILTER(CONTAINS(lcase(str(?value)), "$$searchString$$"))\n} LIMIT 100',
        "question": "What are the taxonomic levels matching the search string?",
    },
    "speciescommon": {
        "fetchUrl": "https://sparql.uniprot.org/sparql?query=$$query$$&format=JSON$$&limit=limit$$&offset=$$offset$$&inference=false",
        "fetchQuery": 'PREFIX up: <http://purl.uniprot.org/core/> SELECT DISTINCT ?value WHERE {  ?s a up:Taxon .  ?s up:commonName ?value . ?s up:rank up:Species . FILTER(CONTAINS(lcase(str(?value)), "$$searchString$$"))}',
        "question": "What are the common names of species matching the search string?",
    },
    "speciescommonIRI": {
        "fetchUrl": "https://sparql.uniprot.org/sparql?query=$$query$$&format=JSON$$&limit=limit$$&offset=$$offset$$&inference=false",
        "fetchQuery": 'PREFIX up: <http://purl.uniprot.org/core/> SELECT DISTINCT ?name ?value WHERE {  ?value a up:Taxon .  ?value up:commonName ?name . ?value up:rank up:Species . FILTER(CONTAINS(lcase(str(?name)), "$$searchString$$"))}',
        "question": "What are the common names and their corresponding IRIs of species matching the search string?",
    },
    "anatomicentities": {
        "fetchUrl": "https://www.bgee.org/sparql/?query=$$query$$",
        "queryHeaders": {"Accept": "application/sparql-results+json"},
        "fetchQuery": 'PREFIX genex: <http://purl.org/genex#>\nPREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\nSELECT DISTINCT ?value {\n?anatEntity a genex:AnatomicalEntity .\n?anatEntity rdfs:label ?value .\nFILTER(CONTAINS(lcase(str(?value)), "$$searchString$$"))\n}',
        "question": "What are the anatomical entities matching the search string?",
    },
    "anatomicentities_extraspecies": {
        "fetchUrl": "https://www.bgee.org/sparql/?query=$$query$$",
        "queryHeaders": {"Accept": "application/sparql-results+json"},
        "fetchQuery": 'PREFIX genex: <http://purl.org/genex#>\nPREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\nPREFIX orth: <http://purl.org/net/orth#>\nPREFIX up: <http://purl.uniprot.org/core/>\nPREFIX obo: <http://purl.obolibrary.org/obo/>\nSELECT DISTINCT ?value {\n?gene genex:isExpressedIn ?anat .\n?anat rdfs:label ?value .\n?gene orth:organism ?o .\n?o obo:RO_0002162 ?taxon .\n?taxon up:commonName "$$extra_0$$" .\n?anat a genex:AnatomicalEntity .\nFILTER(CONTAINS(lcase(str(?value)), "$$searchString$$"))\n}',
        "question": "What are the anatomical entities for the species '$$extra_0$$' matching the search string?",
    },
    "stages": {
        "fetchUrl": "https://www.bgee.org/sparql/?query=$$query$$",
        "queryHeaders": {"Accept": "application/sparql-results+json"},
        "fetchQuery": 'PREFIX genex: <http://purl.org/genex#>\nPREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\nSELECT DISTINCT ?value { ?stage a <http://www.ebi.ac.uk/efo/EFO_0000399> .\n?stage rdfs:label ?value .\nFILTER(CONTAINS(lcase(str(?value)), "$$searchString$$"))\n} LIMIT 100',
        "question": "What are the developmental stages matching the search string?",
    },
    "genesbgee": {
        "fetchUrl": "https://www.bgee.org/sparql/?query=$$query$$",
        "queryHeaders": {"Accept": "application/sparql-results+json"},
        "fetchQuery": 'PREFIX orth: <http://purl.org/net/orth#>\nPREFIX genex: <http://purl.org/genex#>\nPREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\nSELECT DISTINCT ?value { ?gene a orth:Gene .\n?gene rdfs:label ?value .\nFILTER(STRSTARTS(lcase(str(?value)), "$$searchString$$"))\n} LIMIT 100',
        "question": "What are the genes matching the search string?",
    },
    "speciesbgee": {
        "fetchUrl": "https://www.bgee.org/sparql/?query=$$query$$",
        "queryHeaders": {"Accept": "application/sparql-results+json"},
        "fetchQuery": 'PREFIX up: <http://purl.uniprot.org/core/>\nSELECT DISTINCT ?value {\n?species a up:Taxon .\n?species up:scientificName ?value .\n?species up:rank up:Species .\n\nFILTER(CONTAINS(lcase(str(?value)), "$$searchString$$"))\n} ORDER BY ?value',
        "question": "What are the species names available in Bgee matching the search string?",
    },
    "speciesbgeecommon": {
        "fetchUrl": "https://www.bgee.org/sparql/?query=$$query$$",
        "queryHeaders": {"Accept": "application/sparql-results+json"},
        "fetchQuery": 'PREFIX up: <http://purl.uniprot.org/core/>\nSELECT DISTINCT ?value {\n?species a up:Taxon .\n?species up:commonName ?value .\n?species up:rank up:Species .\n\nFILTER(CONTAINS(lcase(str(?value)), "$$searchString$$"))\n} ORDER BY ?value',
        "question": "What are the common names of species available in Bgee matching the search string?",
    },
    "speciesbgeecommonIRI": {
        "fetchUrl": "https://www.bgee.org/sparql/?query=$$query$$",
        "queryHeaders": {"Accept": "application/sparql-results+json"},
        "fetchQuery": 'PREFIX up: <http://purl.uniprot.org/core/>\nSELECT DISTINCT ?name ?value {\n?value a up:Taxon .\n?value up:commonName ?name .\n?value up:rank up:Species .\n\nFILTER(CONTAINS(lcase(str(?name)), "$$searchString$$"))\n} ORDER BY ?value',
        "question": "What are the common names and their corresponding IRIs of species available in Bgee matching the search string?",
    },
    "proteinsatoma": {
        "fetchUrl": "https://sparql.omabrowser.org/lode/servlet/query?query=$$query$$&format=JSON$$&limit=limit$$&offset=$$offset$$&inference=false",
        "fetchQuery": 'PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\nPREFIX orth: <http://purl.org/net/orth#>\nSELECT DISTINCT ?value WHERE {\n?proteinOMA a orth:Protein;\nrdfs:label ?value .\nFILTER(STRSTARTS(lcase(str(?value)), "$$searchString$$"))}',
        "question": "What are the protein identifiers at OMA matching the search string?",
    },
    "proteins": {
        "fetchUrl": "https://sparql.uniprot.org/sparql?query=$$query$$&format=JSON$$&limit=limit$$&offset=$$offset$$&inference=false",
        "fetchQuery": "PREFIX up: <http://purl.uniprot.org/core/>\nPREFIX skos: <http://www.w3.org/2004/02/skos/core#>\nSELECT DISTINCT ?value\nWHERE\n{\n?protein a up:Protein .\n?protein up:recommendedName ?recommended .\n?recommended up:fullName ?value .\n?protein up:encodedBy ?gene .\n?gene skos:prefLabel ?text .\nFILTER CONTAINS(?text, '$$searchString$$')\n} LIMIT 100",
        "question": "What are the protein names matching the search string?",
    },
    "genesuniprot": {
        "fetchUrl": "https://sparql.uniprot.org/sparql?query=$$query$$&format=JSON$$&limit=limit$$&offset=$$offset$$&inference=false",
        "fetchQuery": "PREFIX up:<http://purl.uniprot.org/core/>\nPREFIX taxon:<http://purl.uniprot.org/taxonomy/>\nPREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>\nPREFIX skos:<http://www.w3.org/2004/02/skos/core#>\nSELECT DISTINCT ?name ?value WHERE {\n\t?value a up:Protein . \n\t?value up:organism taxon:9606 . \n\t?value up:encodedBy ?gene . \n\t?gene skos:prefLabel ?name . \n\tFILTER CONTAINS(lcase(str(?name)), lcase('$$searchString$$'))\n} ORDER BY ?value LIMIT 100",
        "question": "What are the gene names and their corresponding IRIs for proteins in humans matching the search string?",
    },
}


def exec_query(query: str, endpoint: str) -> int:
    """Run a SPARQL query and return how many result bindings it produced.

    Args:
        query: SPARQL query text to execute.
        endpoint: URL of the SPARQL endpoint the query is sent to.

    Returns:
        The number of entries in ``results.bindings`` of the JSON response.
        Returns 0 when the query fails for any reason (network error, timeout,
        unparsable or unexpectedly shaped response); the failure is reported
        on stderr so it can be told apart from a genuinely empty result set.
    """
    import sys  # function-scope import keeps this fix self-contained

    try:
        sparql_endpoint = SPARQLWrapper(endpoint)
        sparql_endpoint.setReturnFormat(JSON)
        sparql_endpoint.setTimeout(200)
        sparql_endpoint.setQuery(query)
        results = sparql_endpoint.query().convert()
        return len(results["results"]["bindings"])
    except Exception as exc:
        # Deliberately best-effort: one broken query must not abort a batch
        # run, but the cause should not vanish silently either.
        print(
            f"[exec_query] {endpoint}: {type(exc).__name__}: {exc}",
            file=sys.stderr,
        )
        return 0


# Run every templated question against its endpoint and bucket it by outcome.
results = []  # questions whose query returned at least one binding
failed = []  # questions whose query returned 0 bindings
total_questions = len(queries_obj["questions"])
for i, question in enumerate(queries_obj["questions"]):
    # Only entries that carry both a SPARQL template and its variables are run.
    if "SPARQL" not in question or "vars" not in question:
        continue

    sparql_query = question["SPARQL"]
    question_str = question["question"]
    for var in question["vars"]:
        placeholder = f"$${var['name']}$$"
        # The query takes "defaultvalue" when present, else "default";
        # the human-readable question always uses "default".
        query_value = var.get("defaultvalue", var["default"])
        sparql_query = sparql_query.replace(placeholder, query_value)
        question_str = question_str.replace(placeholder, var["default"])

    print(f"[{i}/{total_questions}] {question_str}")
    sparql_query = sparql_query.replace("$$innerlimit$$", "LIMIT 10")
    # Alternative kept from the original: drop the inner limit entirely.
    # sparql_query = sparql_query.replace("$$innerlimit$$", "")

    # Redirect the biosoda.expasy.org:4443 endpoint to Bgee, and rewrite the
    # OMA "lode" endpoint URL to the plain /sparql/ one.
    short_url = question["fetchUrlShort"]
    if short_url.startswith("https://biosoda.expasy.org:4443"):
        endpoint_url = "https://www.bgee.org/sparql/"
    else:
        endpoint_url = short_url.replace(
            "https://sparql.omabrowser.org/lode/sparql",
            "https://sparql.omabrowser.org/sparql/",
        )

    res_count = exec_query(sparql_query, endpoint_url)
    # Debug shortcut kept from the original:
    # res_count = 1
    record = {
        "query": sparql_query,
        "endpoint": endpoint_url,
        "question": question_str,
        "results": res_count,
    }
    if res_count > 0:
        print(endpoint_url)
        print(sparql_query)
        results.append(record)
    else:
        failed.append(record)
    print(f"{len(results)} / {len(failed)}")

# Resolve entities using datasources?
# https://github.com/biosoda/bioquery/blob/master/biosoda_frontend/src/biosodadata.json#L1491

# Dump both buckets as JSON, then the final success/failure counts.
print(json.dumps(results, indent=2))
print(json.dumps(failed, indent=2))
print(len(results), len(failed))

# NOTE: all questions processed up to 13/52

  from .autonotebook import tqdm as notebook_tqdm


[1/52] The orthologs of a gene that is expressed in the fruit fly’s brain and the UniProt annotations of these orthologs.
0 / 1
[2/52] Which are the genes in Primates orthologous to a gene that is expressed in the fruit fly’s brain and the UniProt annotations of the genes.
0 / 2
[3/52] Which are the genes in Human associated to a disease that are orthologous to a gene expressed in the rat’s brain?
https://www.bgee.org/sparql/
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX genex: <http://purl.org/genex#>
PREFIX lscr: <http://purl.org/lscr#>
PREFIX orth: <http://purl.org/net/orth#>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX sio: <http://semanticscience.org/resource/>
SELECT ?gene ?OMA_LINK ?uniprot ?annotation_text WHERE {
	SERVICE <https://www.bgee.org/sparql/> {
		SELECT ?gene {
			?gene genex:isExpressedIn ?anat .
			?anat rdfs:label 'brain' .
			?gene orth:organism ?o .
			?o



25 / 13
[39/52] Retrieve hierarchical orthologous groups from the OMA database that contain the gene with the P68871 UniProt accession number
https://sparql.omabrowser.org/sparql/
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX orth: <http://purl.org/net/orth#>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX lscr: <http://purl.org/lscr#>
SELECT DISTINCT ?root_hog ?species_name ?protein1_uniprot (?protein1 as ?protein1_OMA) ?taxLevel {
	VALUES ?protein2_uniprot {
		<http://purl.uniprot.org/uniprot/P68871>
	}
	?root_hog obo:CDAO_0000148 ?hog_cluster. #has_Root
	?hog_cluster orth:hasHomologousMember* ?node1.
	?node1 a orth:OrthologsCluster.
	?node1 orth:hasTaxonomicRange ?level.
	?level orth:taxRange ?taxLevel.
	?node1 orth:hasHomologousMember* ?protein1.
	?hog_cluster orth:hasHomologousMember* ?protein2.
	?protein1 a orth:Protein.
	?protein1 orth:organism ?organism.
	?organism obo:RO_0002162 ?taxon.
	?taxon up:scientificName ?speci