# Answering questions with SPARQL queries to SIB endpoints

Questions:
* Which info can I get from the UniProt endpoint?
* Give me an example to access cross references from the UniProt SPARQL endpoint to all the databases available in the endpoint

In [2]:
import re

from bs4 import BeautifulSoup
from SPARQLWrapper import JSON, SPARQLWrapper

# System message for the chat model: sets the assistant persona and asks for
# SPARQL queries that factorize predicates/objects per subject.
system_prompt = """You are Expasy, an assistant that helps users to query the databases from the Swiss Institute of Bioinformatics, such as UniProt or Bgee.
When writing the SPARQL query try to factorize the predicates/objects of a subject as much as possible, so that the user can understand the query and the results.
"""
# Header of the few-shot section of the user message. Example question/query
# pairs retrieved from the endpoints are appended to this string further down.
examples_prompt: str = "Here are a list of questions and queries that Expasy has learned to answer, use them as base when answering the question from the user:"

# SPARQL endpoints that are asked for their example queries and prefix declarations.
endpoints = [
    "https://sparql.uniprot.org/sparql/",
    "https://www.bgee.org/sparql/",
]

# Selects, for every resource typed sh:SPARQLExecutable, its rdfs:comment
# (bound to ?comment) and its sh:select value (bound to ?query).
get_queries = """PREFIX sh: <http://www.w3.org/ns/shacl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?comment ?query
WHERE
{
    ?sq a sh:SPARQLExecutable ;
        rdfs:comment ?comment ;
        sh:select ?query .
}"""

# Selects every (prefix, namespace) pair described with sh:prefix / sh:namespace
# in the endpoint, ordered by prefix.
get_prefixes = """PREFIX sh: <http://www.w3.org/ns/shacl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?prefix ?namespace
WHERE {
    [] sh:namespace ?namespace ;
        sh:prefix ?prefix .
} ORDER BY ?prefix"""


def remove_a_tags(html_text: str) -> str:
    """Return the plain text of ``html_text`` with every <a> element unwrapped.

    Each anchor is replaced in the parsed tree by its own text content, and
    the text of the whole document is then returned.
    """
    document = BeautifulSoup(html_text, "html.parser")
    anchors = document.find_all("a")
    for anchor in anchors:
        # Swap the element for its text so only the link label remains.
        anchor.replace_with(anchor.get_text())
    return document.get_text()


def _add_missing_prefixes(query: str, prefix_map: dict) -> str:
    """Prepend a PREFIX line for each known prefix the query uses but does not declare.

    Args:
        query: SPARQL query text.
        prefix_map: mapping of prefix label -> namespace IRI.

    Returns:
        The query, with one ``PREFIX <label>: <<namespace>>`` line prepended
        for every prefix that is used in the query without being declared.
    """
    for prefix, namespace in prefix_map.items():
        # Prefix labels may contain regex metacharacters (e.g. "."), so they
        # must be escaped before being embedded in a pattern.
        escaped_prefix = re.escape(prefix)
        # The PREFIX keyword is case-insensitive in SPARQL and the whitespace
        # around the label is flexible; any existing declaration of this label
        # counts, so a second declaration is never stacked on top of it.
        already_declared = re.search(
            rf"PREFIX\s+{escaped_prefix}:\s*<", query, re.IGNORECASE
        )
        # A use is "<prefix>:" that is not the tail of a longer name and not
        # part of an IRI ("<http:", "urn:isbn:"). This also catches uses at the
        # start of a line and after "^^", "^", "!", ",", ";", "{" or "[".
        is_used = re.search(rf"(?<![\w.\-:<]){escaped_prefix}:", query)
        if is_used and not already_declared:
            query = f"PREFIX {prefix}: <{namespace}>\n{query}"
    return query


# For each endpoint: fetch its prefix declarations and its example queries,
# complete the queries with the prefixes they are missing, and append every
# (question, query) pair to the few-shot prompt.
for endpoint_url in endpoints:
    print(f"Processing endpoint {endpoint_url}")
    sparql_endpoint = SPARQLWrapper(endpoint_url)
    sparql_endpoint.setReturnFormat(JSON)

    sparql_endpoint.setQuery(get_prefixes)
    results = sparql_endpoint.query().convert()
    prefix_map = {
        row["prefix"]["value"]: row["namespace"]["value"]
        for row in results["results"]["bindings"]
    }

    sparql_endpoint.setQuery(get_queries)
    results = sparql_endpoint.query().convert()
    queries_list = [
        {
            "comment": row["comment"]["value"],
            "query": row["query"]["value"],
        }
        for row in results["results"]["bindings"]
    ]

    print(f"Found {len(queries_list)} queries")

    for q in queries_list:
        query = _add_missing_prefixes(q["query"], prefix_map)
        # The comments are HTML snippets; keep only their text for the prompt.
        example_question = remove_a_tags(q["comment"])
        examples_prompt += f"\n\n- {example_question}:\n{query}"

examples_prompt += "\n\nThe question from the user is:"

Processing endpoint https://sparql.uniprot.org/sparql/
Found 51 queries
Processing endpoint https://www.bgee.org/sparql/
Found 0 queries


  soup = BeautifulSoup(html_text, "html.parser")


In [7]:
from openai import OpenAI

# Client is created with no explicit arguments (presumably credentials come
# from the environment — confirm for your setup).
client = OpenAI()

user_asks = "Give me an example to access cross references from the UniProt SPARQL endpoint to all the databases available in the endpoint"

# One system turn with the persona, then a single user turn that carries the
# few-shot examples followed by the actual question.
chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"{examples_prompt}\n\n{user_asks}"},
]
response = client.chat.completions.create(model="gpt-4o", messages=chat_messages)

# Print the text of the first returned choice.
answer = response.choices[0].message.content
print(answer)

Sure, here's an example SPARQL query that retrieves cross-references from UniProt entries to all available external databases in the UniProt SPARQL endpoint:

```sparql
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?protein ?externalDatabase ?crossReference
WHERE
{
  ?protein a up:Protein ;
           rdfs:seeAlso ?crossReference .
  ?crossReference up:database ?externalDatabase .
}
LIMIT 100
```

This query does the following:

- Selects all proteins (`?protein`) in the UniProt dataset.
- Retrieves external cross-references (`?crossReference`) associated with each protein.
- Identifies the external database (`?externalDatabase`) to which each cross-reference points.

You can remove the `LIMIT 100` clause to retrieve all results, but keep in mind that this could result in a large number of results depending on the breadth of data in the UniProt database.
