# 🐢 Generate RDF files for Bgee SPARQL queries

Queries from https://github.com/RIKEN-DKO/Generation_SPARQL/blob/main/augment_queries/augment_queries.ipynb

To be added to https://github.com/sib-swiss/sparql-examples

In [1]:
# Bgee queries (with variants) from tutorial online:
# https://www.bgee.org/support/tutorial-query-bgee-knowledge-graph-sparql#querying-with-controlled-vocabularies-and-identifiers

qs = [None] * 16

qs[0] = []

qs[1] = ["""
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?species WHERE {
	?species a up:Taxon .
}
"""]

qs[2] = ["""
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?species ?sci_name ?common_name WHERE {
	?species a up:Taxon ;
		up:scientificName ?sci_name ;
        up:rank up:Species .
	OPTIONAL { ?species up:commonName ?common_name . }
}
"""]

qs[3] = ["""
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>

SELECT DISTINCT ?anat ?anatName WHERE {
	?seq a orth:Gene ;
		genex:isExpressedIn ?anat ;
		rdfs:label "APOC1" .
	?anat a genex:AnatomicalEntity ;
		rdfs:label ?anatName .
}
"""]

qs[4] = ["""
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT DISTINCT ?anat ?anatName WHERE {
	?seq a orth:Gene ;
		genex:isExpressedIn ?anat ;
		rdfs:label "APOC1" .
	?anat a genex:AnatomicalEntity ;
		rdfs:label ?anatName .
	?seq orth:organism ?organism .
	?organism obo:RO_0002162  ?species .
	?species a up:Taxon ;
		up:scientificName "Homo sapiens" .
}
"""]

qs[5] = ["""
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX efo: <http://www.ebi.ac.uk/efo/>

SELECT DISTINCT ?anat ?anatName {
	?seq a orth:Gene ;
		genex:isExpressedIn ?condition ;
		rdfs:label "APOC1" .
    ?condition a genex:ExpressionCondition ;
		genex:hasAnatomicalEntity ?anat ;
		genex:hasAnatomicalEntity obo:GO_0005575 ;
		genex:hasDevelopmentalStage ?stage ;
		genex:hasSex "any" ;
		genex:hasStrain ?strain .
    ?anat a genex:AnatomicalEntity ;
		rdfs:label ?anatName .
    ?stage a efo:EFO_0000399 ;
		rdfs:label "life cycle" .
	?strain rdfs:label "wild-type" .
}
"""]

qs[6] = ["""
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX efo: <http://www.ebi.ac.uk/efo/>

SELECT DISTINCT ?anat ?anatName ?stage WHERE {
	?seq a orth:Gene .
	?seq genex:isExpressedIn ?condition.
    ?condition a genex:ExpressionCondition.
	?seq rdfs:label "APOC1" .
	?condition genex:hasAnatomicalEntity ?anat .
	?anat rdfs:label ?anatName .
		?condition genex:hasAnatomicalEntity obo:GO_0005575 .
		?condition genex:hasDevelopmentalStage ?stage .
        ?stage a efo:EFO_0000399 .
	?stage rdfs:label "post-juvenile" .
	?condition genex:hasSex "any".
	?condition genex:hasStrain ?strain .
    ?strain a efo:EFO_0005135.
		?strain rdfs:label "wild-type" .
		?seq orth:organism ?organism .
		?organism obo:RO_0002162  ?species .
		?species a up:Taxon .
		?species up:commonName "human" .
}
"""]

qs[7] = ["""
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX efo: <http://www.ebi.ac.uk/efo/>

SELECT DISTINCT ?anat ?anatName ?stage WHERE {
	?seq a orth:Gene .
	?seq genex:isExpressedIn ?condition.
	?seq rdfs:label "APOC1" .
    ?condition a genex:ExpressionCondition.
	?condition genex:hasAnatomicalEntity ?anat .
    ?anat a genex:AnatomicalEntity.
	?anat rdfs:label ?anatName .
		?condition genex:hasAnatomicalEntity obo:GO_0005575 .
		?condition genex:hasDevelopmentalStage ?stage .
        ?stage a efo:EFO_0000399 .
	?stage rdfs:label "post-juvenile" .
		?seq orth:organism ?organism .
		?organism obo:RO_0002162  ?species .
		?species a up:Taxon .
		?species up:commonName "human" .
}
"""]

qs[8] = [None] * 3

qs[8][0] = """
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT DISTINCT ?anat ?anatName ?score ?stage WHERE {
	?seq a orth:Gene .
	?expression a genex:Expression .
	?expression genex:hasExpressionCondition ?condition .
	?expression genex:hasExpressionLevel ?score .
	?expression genex:hasSequenceUnit ?seq .
	?seq rdfs:label "APOC1" .
	?condition a genex:ExpressionCondition.
	?condition genex:hasAnatomicalEntity ?anat .
	?condition genex:hasAnatomicalEntity obo:GO_0005575 .
	?anat a genex:AnatomicalEntity.
	?anat rdfs:label ?anatName .
	?condition genex:hasDevelopmentalStage ?stage .
	?stage rdfs:label "post-juvenile" .
	?condition genex:hasSex "any".
	?condition genex:hasStrain ?strain .
	?strain rdfs:label "wild-type" .
	?seq orth:organism ?organism .
	?organism obo:RO_0002162  ?species .
	?species a up:Taxon .
	?species up:commonName "human" .
FILTER (?anat !=  obo:GO_0005575)
} ORDER BY DESC(?score)
"""

qs[8][1] = """
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX up-taxon:<http://purl.uniprot.org/taxonomy/>
PREFIX ensembl: <http://rdf.ebi.ac.uk/resource/ensembl/>
PREFIX lscr: <http://purl.org/lscr#>
PREFIX efo: <http://www.ebi.ac.uk/efo/>

SELECT DISTINCT ?anat ?anatName ?score WHERE {
	?seq a orth:Gene .
	?expression a genex:Expression ;
		genex:hasExpressionCondition ?condition ;
		genex:hasExpressionLevel ?score ;
		genex:hasSequenceUnit ?seq .
	?seq lscr:xrefEnsemblGene ensembl:ENSG00000130208 .
	?condition a genex:ExpressionCondition ;
		genex:hasAnatomicalEntity ?anat .
	?anat a genex:AnatomicalEntity ;
		rdfs:label ?anatName .
	?condition genex:hasDevelopmentalStage obo:UBERON_0000113 ;
		genex:hasSex "any" ;
		genex:hasStrain ?strain .
    ?strain a efo:EFO_0005135.
	?strain rdfs:label "wild-type" .
	?seq orth:organism ?organism .
	?organism obo:RO_0002162  up-taxon:9606 .
    FILTER (?anat !=  obo:GO_0005575)
} ORDER BY DESC(?score)
"""

qs[8][2] = """
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX up-taxon:<http://purl.uniprot.org/taxonomy/>
PREFIX ensembl: <http://rdf.ebi.ac.uk/resource/ensembl/>
PREFIX lscr: <http://purl.org/lscr#>

SELECT DISTINCT ?anat ?anatName ?score  {
	?seq a orth:Gene .
	?expression a genex:Expression .
	?expression genex:hasExpressionCondition ?condition .
	?expression genex:hasExpressionLevel ?score .
	?expression genex:hasSequenceUnit ?seq .
	?seq lscr:xrefEnsemblGene ensembl:ENSG00000130208 .
    ?condition a genex:ExpressionCondition.
	?condition genex:hasAnatomicalEntity ?anat .
    ?anat a genex:AnatomicalEntity.
	?anat rdfs:label ?anatName .
	?condition genex:hasDevelopmentalStage obo:UBERON_0000113 .
	?condition genex:hasSex "any".
	?condition genex:hasStrain ?strain .
    ?strain a efo:EFO_0005135.
	?strain rdfs:label "wild-type" .
FILTER (?anat !=  obo:GO_0005575)
} ORDER BY DESC(?score)
"""

qs[9] = ["""
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX efo: <http://www.ebi.ac.uk/efo/>

SELECT DISTINCT ?anat ?cellType ?anatName ?cellTypeName ?score ?stage WHERE {
	?seq a orth:Gene .
	?expression a genex:Expression .
	?expression genex:hasExpressionCondition ?condition .
	?expression genex:hasExpressionLevel ?score .
	?expression genex:hasSequenceUnit ?seq .
	?seq rdfs:label "APOC1" .
    ?condition a genex:ExpressionCondition.
	?condition genex:hasAnatomicalEntity ?anat .
    ?anat a genex:AnatomicalEntity.
	?anat rdfs:label ?anatName .
	?condition genex:hasAnatomicalEntity ?cellType .
	?cellType rdfs:label ?cellTypeName .
	?condition genex:hasDevelopmentalStage ?stage .
	?stage rdfs:label "post-juvenile" .
	?condition genex:hasSex "any".
	?condition genex:hasStrain ?strain .
    ?strain a efo:EFO_0005135.
	?strain rdfs:label "wild-type" .
	?seq orth:organism ?organism .
	?organism obo:RO_0002162  ?species .
	?species a up:Taxon .
	?species up:commonName "human" .
    FILTER (?anat != obo:GO_0005575)
    FILTER (?anat != ?cellType)
} ORDER BY DESC(?score)
"""]

qs[10] = ["""
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX efo: <http://www.ebi.ac.uk/efo/>

SELECT DISTINCT ?anat ?cellType ?anatName ?cellTypeName ?score ?stage WHERE {
	?seq a orth:Gene .
	?expression a genex:Expression .
	?expression genex:hasExpressionCondition ?condition .
	?expression genex:hasExpressionLevel ?score .
	?expression genex:hasSequenceUnit ?seq .
	?seq rdfs:label "APOC1" .
    ?condition a genex:ExpressionCondition.
	?condition genex:hasAnatomicalEntity ?anat .
    ?anat a genex:AnatomicalEntity.
	?anat rdfs:label ?anatName .
	?condition genex:hasAnatomicalEntity ?cellType .
	?cellType rdfs:label ?cellTypeName .
	?condition genex:hasDevelopmentalStage ?stage .
    ?stage rdf:type efo:EFO_0000399 .
	?stage rdfs:label "post-juvenile" .
	?seq orth:organism ?organism .
	?organism obo:RO_0002162  ?species .
	?species a up:Taxon .
	?species up:commonName "human" .
    FILTER (?anat !=  obo:GO_0005575)
    FILTER (?anat != ?cellType)
} ORDER BY DESC(?score)
"""]

qs[11] = [None] * 2

qs[11][1] = """
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX efo: <http://www.ebi.ac.uk/efo/EFO_0000399>

SELECT DISTINCT ?stage ?stageName ?stageDescription WHERE {
	?stage rdf:type efo:EFO_0000399 .
	?stage rdfs:label ?stageName .
	?stage dcterms:description ?stageDescription .
}
"""

qs[11][0] = """
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX efo: <http://www.ebi.ac.uk/efo/EFO_0000399>

SELECT DISTINCT ?stage ?stageName ?stageDescription WHERE {
        ?stage rdf:type efo:EFO_0000399 .
		?stage rdfs:label ?stageName .
		?stage dcterms:description ?stageDescription .
		FILTER (CONTAINS(?stageName,"post-juvenile"))
}
"""

qs[12] = [None] * 3

qs[12][0] = """
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX up-taxon:<http://purl.uniprot.org/taxonomy/>
PREFIX lscr: <http://purl.org/lscr#>
PREFIX efo: <http://www.ebi.ac.uk/efo/>

SELECT DISTINCT ?anat ?anatName ?stageIRI ?score WHERE {
	?seq a orth:Gene .
	?expression a genex:Expression .
	?expression genex:hasExpressionCondition ?condition .
	?expression genex:hasExpressionLevel ?score .
	?expression genex:hasSequenceUnit ?seq .
	?seq lscr:xrefNCBIGene <https://www.ncbi.nlm.nih.gov/gene/118230125> .
	?condition genex:hasAnatomicalEntity ?anat .
    ?anat a genex:AnatomicalEntity.
	?anat rdfs:label ?anatName .
    ?condition a genex:ExpressionCondition.
	?condition genex:hasDevelopmentalStage ?stageIRI .
    ?stageIRI rdf:type efo:EFO_0000399 .
	?condition genex:hasSex "any".
	?condition genex:hasStrain ?strain .
    ?strain a efo:EFO_0005135.
	?strain rdfs:label "wild-type" .
    FILTER (?anat != obo:GO_0005575)
} ORDER BY DESC(?score)
"""

qs[12][1] = """
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX up-taxon:<http://purl.uniprot.org/taxonomy/>
PREFIX lscr: <http://purl.org/lscr#>
PREFIX dcterms: <http://purl.org/dc/terms/>

SELECT DISTINCT ?anat ?anatName ?stageIRI ?score WHERE {
	?seq a orth:Gene .
	?expression a genex:Expression .
	?expression genex:hasExpressionCondition ?condition .
	?expression genex:hasExpressionLevel ?score .
	?expression genex:hasSequenceUnit ?seq .
	?seq dcterms:identifier "118230125" .
    ?condition a genex:ExpressionCondition.
	?condition genex:hasAnatomicalEntity ?anat .
	?anat rdfs:label ?anatName .
	?condition genex:hasDevelopmentalStage ?stageIRI .
	?condition genex:hasSex "any".
	?condition genex:hasStrain ?strain .
    ?strain a efo:EFO_0005135.
	?strain rdfs:label "wild-type" .
FILTER (?anat !=  obo:GO_0005575)
} ORDER BY DESC(?score)
"""

qs[13] = ["""
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up-protein:<http://purl.uniprot.org/uniprot/>
PREFIX lscr: <http://purl.org/lscr#>

SELECT DISTINCT ?anat ?anatName WHERE {
	?seq a orth:Gene .
	?seq genex:isExpressedIn ?anat .
	?seq lscr:xrefUniprot up-protein:P02654 .
	?anat a genex:AnatomicalEntity .
	?anat rdfs:label ?anatName .
}
"""]

qs[14] = ["""
PREFIX orth: <http://purl.org/net/orth#>
PREFIX lscr: <http://purl.org/lscr#>
PREFIX dcterms: <http://purl.org/dc/terms/>

SELECT DISTINCT ?symbol ?description ?id
?links ?organism ?uniprot ?ensembl ?ncbi WHERE {
    ?seq a orth:Gene .
    ?seq rdfs:label ?symbol .
    ?seq rdfs:seeAlso ?links .
    ?seq dcterms:description ?description .
    ?seq dcterms:identifier ?id .
    ?seq orth:organism ?organism .
    OPTIONAL{?seq lscr:xrefUniprot ?uniprot .}
    OPTIONAL{?seq lscr:xrefEnsemblGene ?ensembl .}
    OPTIONAL{?seq lscr:xrefNCBIGene ?ncbi .}
    FILTER (?id = "ENSG00000130208")
}
"""]

qs[15] = ["""
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT DISTINCT ?anat ?anatName WHERE {
	?seq a orth:Gene .
	?seq genex:isAbsentIn ?anat.
	?seq rdfs:label "APOC1" .
	?anat a genex:AnatomicalEntity .
	?anat rdfs:label ?anatName .
	?seq orth:organism ?organism .
	?organism obo:RO_0002162  ?species .
	?species a up:Taxon .
	?species up:scientificName "Homo sapiens" .
}
"""]

In [2]:
# also store NL questions corresponding to the queries

nlqs = [None] *16

nlqs[0] = ""
nlqs[1] = "What are the species present in Bgee?"
nlqs[2] = "What are the species present in Bgee and their scientific and common names?"
nlqs[3] = "What are the anatomical entities where the APOC1 gene is expressed?"
nlqs[4] = "What are the anatomical entities where the APOC1 Homo sapiens gene is expressed?"
nlqs[5] = "What are the anatomical entities where the APOC1 gene is expressed independently of the developmental stage, sex, strain and cell type?"
nlqs[6] = "What are the anatomical entities where the human gene APOC1 is expressed in the post-juvenile stage?"
nlqs[7] = "What are the anatomical entities where the human gene APOC1 is expressed in the post-juvenile stage?"
nlqs[8] = "What are the anatomical entities where the human gene APOC1 is expressed in the post-juvenile stage along with its expression score independently of the strain, sex, and cell type?"
nlqs[9] = "What are the anatomical entities including cell types, if any, where the human gene APOC1 is expressed at the post-juvenile stage along with its expression score independently of the strain and sex?"
nlqs[10] = "What are the anatomical entities including cell types, if any, where the human gene APOC1 is expressed at the post-juvenile stage along with its expression score independently of the strain and sex?"
nlqs[11] = "What are the developmental stages present in Bgee?"
nlqs[12] = "What are the anatomical entities where the eel gene apoc1 is expressed along with its expression score independently of the strain, sex, and cell type?"
nlqs[13] = "What are the anatomical entities where the P02654 gene is expressed? Note that P02654 is a UniProtKB identifier of the APOC1 human gene."
nlqs[14] = "What is all the metadata related to the ENSG00000130208 gene, where ENSG00000130208 is the identifier of the APOC1 human gene. "
nlqs[15] = "What are the anatomical entities where the APOC1 Homo sapiens gene is not expressed, that is where is APOC1 absent ?"

In [3]:
import os

from rdflib import RDF, RDFS, BNode, Graph, Literal, Namespace
from SPARQLWrapper import JSON, SPARQLWrapper

queries_dir = "../../sparql-examples/bgee"
# queries_dir = "../data/bgee_queries"
os.makedirs(queries_dir, exist_ok=True)

EX = Namespace("https://www.bgee.org/sparql/.well-known/sparql-examples/")
UP = Namespace("http://purl.uniprot.org/core/")
SH = Namespace("http://www.w3.org/ns/shacl#")

sparql_endpoint = SPARQLWrapper("https://www.bgee.org/sparql/")

for i in range(1, len(nlqs)):
    g = Graph()
    g.bind("ex", EX)
    g.bind("up", UP)
    g.bind("sh", SH)
    g.bind("rdf", RDF)
    g.bind("rdfs", RDFS)

    query_resource = EX[str(i)]
    g.add((query_resource, RDF.type, SH.SPARQLSelectExecutable))
    g.add((query_resource, RDF.type, SH.SPARQLExecutable))
    g.add((query_resource, SH.prefixes, BNode("sparql_examples_prefixes")))

    g.add((query_resource, RDFS.comment, Literal(nlqs[i])))
    # TODO: what do we do for 8, 11 and 12 that have multiple queries for 1 description?
    query = qs[i][0]
    g.add((query_resource, SH.select, Literal(query)))

    # Check query gets results
    try:
        sparql_endpoint.setQuery(query)
        sparql_endpoint.setReturnFormat(JSON)
        results = sparql_endpoint.query().convert()
        n_res = len(results["results"]["bindings"])
        print(f"✅ Query {i} got {n_res} results")
    except Exception as e:
        print(f"❌ Query {i} failed: {e}")

    # NOTE: Dirty hack to get the BNode formatted as we want
    ttl_str = g.serialize(format="turtle").replace("sh:prefixes [ ] ;", "sh:prefixes _:sparql_examples_prefixes ;")
    with open(f"{queries_dir}/{i}.ttl", "w") as f:
        f.write(ttl_str)

    # g.serialize(f"../data/bgee_queries/{i}.ttl", format="turtle")

✅ Query 1 got 52 results
✅ Query 2 got 52 results
✅ Query 3 got 584 results
✅ Query 4 got 526 results
✅ Query 5 got 585 results
✅ Query 6 got 457 results
✅ Query 7 got 457 results
✅ Query 8 got 456 results
✅ Query 9 got 456 results
✅ Query 10 got 456 results
✅ Query 11 got 0 results
✅ Query 12 got 99 results
✅ Query 13 got 526 results
✅ Query 14 got 16 results
✅ Query 15 got 6 results
