In [9]:

import httpx
from typing import Optional, Any


def query_sparql(
    query: str,
    endpoint_url: str,
    post: bool = False,
    timeout: Optional[int] = None,
    client: Optional[httpx.Client] = None
) -> Any:
    """Execute a SPARQL query on a SPARQL endpoint using httpx.

    Args:
        query: SPARQL query text.
        endpoint_url: URL of the SPARQL endpoint.
        post: When True the query is sent as form data via POST; otherwise
            it is sent as the ``query`` URL parameter of a GET request.
        timeout: Timeout passed to httpx. For an internally created client,
            ``None`` is forwarded to ``httpx.Client`` as-is. For a
            caller-supplied client, ``None`` leaves that client's own
            timeout in effect, and any other value overrides it for this
            request only.
        client: Optional existing ``httpx.Client`` to reuse. It is never
            closed here. When omitted, a temporary client that follows
            redirects and asks for ``application/sparql-results+json`` is
            created and closed before returning.

    Returns:
        The decoded JSON body on success. This function does not raise for
        HTTP or transport failures: on ``httpx.HTTPStatusError``,
        ``httpx.RequestError`` or an undecodable (non-JSON) body it returns
        the caught exception instance, so callers should test the result
        with ``isinstance(result, Exception)``.
    """
    owns_client = client is None
    if owns_client:
        client = httpx.Client(
            follow_redirects=True,
            headers={"Accept": "application/sparql-results+json"},
            timeout=timeout,
        )

    # Only override the timeout per request when one was actually given;
    # passing timeout=None explicitly would switch off a supplied client's
    # configured timeout instead of leaving it alone.
    request_kwargs = {} if timeout is None else {"timeout": timeout}

    try:
        if post:
            resp = client.post(endpoint_url, data={"query": query}, **request_kwargs)
        else:
            resp = client.get(endpoint_url, params={"query": query}, **request_kwargs)

        resp.raise_for_status()

        try:
            return resp.json()
        except ValueError as e:
            # A 2xx response whose body is not JSON (e.g. an HTML page) is
            # reported the same way as the other failures.
            return e

    except (httpx.HTTPStatusError, httpx.RequestError) as e:
        # Non-2xx status or transport-level failure: hand the exception back
        # to the caller rather than raising.
        return e

    finally:
        # Close only the client created here; a caller-supplied one stays open.
        if owns_client:
            client.close()

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON
from typing import Optional, Any

def query_sparql_wrapper(
    query: str,
    endpoint_url: str,
    post: bool = False,
    timeout: Optional[int] = None
) -> Any:
    """
    Run a SPARQL query through SPARQLWrapper and return the converted JSON result.

    The request is sent with POST when ``post`` is true and with GET
    otherwise. ``timeout`` is applied only when it is not None. Any
    exception raised while querying or converting is swallowed and the
    literal string "error" is returned instead.
    """
    wrapper = SPARQLWrapper(endpoint_url)
    wrapper.setQuery(query)
    wrapper.setReturnFormat(JSON)
    wrapper.setMethod('POST' if post else 'GET')

    # Leave SPARQLWrapper's own default untouched unless a timeout was requested.
    if timeout is not None:
        wrapper.setTimeout(timeout)

    try:
        return wrapper.query().convert()
    except Exception:
        # Best-effort contract: every failure is reported via the sentinel string.
        return "error"

In [3]:
# Ground-truth federated query. The SERVICE clause asks the Rhea endpoint for
# reactions whose participants include CHEBI:32395; the outer pattern then
# selects UniProt entries whose catalytic-activity annotation points at those
# reactions, with the gene label optional. Results are capped by LIMIT 100.
ground_truth_query = """PREFIX CHEBI: <http://purl.obolibrary.org/obo/CHEBI_>
PREFIX rh: <http://rdf.rhea-db.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX up: <http://purl.uniprot.org/core/>


SELECT 
    DISTINCT
        ?uniprot
        ?uniprotID
        ?recname
        ?gene
        ?chebi
        ?uniprotName
WHERE {
  SERVICE <https://sparql.rhea-db.org/sparql> {
     VALUES (?chebi) { (CHEBI:32395) }
     ?rhea rh:side/rh:contains/rh:compound ?compound .
     ?compound rh:chebi ?chebi .
     ?chebi up:name ?uniprotName .
  }
  ?uniprot up:annotation/up:catalyticActivity/up:catalyzedReaction ?rhea .
  ?uniprot up:mnemonic ?uniprotID .
  ?uniprot up:recommendedName/up:fullName ?recname .
  OPTIONAL {?uniprot up:encodedBy/skos:prefLabel ?gene .}
}
LIMIT 100
"""

# Endpoint the query above is sent to (UniProt is the outer endpoint here).
ground_truth_endpoint = "https://sparql.uniprot.org/sparql/"

In [4]:
# Run the ground-truth query through the SPARQLWrapper-based helper and dump
# the raw result.
result = query_sparql_wrapper(ground_truth_query, ground_truth_endpoint)
print(result)

{'head': {'vars': ['uniprot', 'uniprotID', 'recname', 'gene', 'chebi', 'uniprotName']}, 'results': {'bindings': [{'uniprotName': {'type': 'literal', 'value': '(5Z,8Z,11Z,14Z)-eicosatetraenoate'}, 'chebi': {'type': 'uri', 'value': 'http://purl.obolibrary.org/obo/CHEBI_32395'}, 'gene': {'type': 'literal', 'value': 'CYP1A2'}, 'uniprot': {'type': 'uri', 'value': 'http://purl.uniprot.org/uniprot/O77809'}, 'uniprotID': {'type': 'literal', 'value': 'CP1A2_MACFA'}, 'recname': {'type': 'literal', 'value': 'Cytochrome P450 1A2'}}, {'uniprotName': {'type': 'literal', 'value': '(5Z,8Z,11Z,14Z)-eicosatetraenoate'}, 'chebi': {'type': 'uri', 'value': 'http://purl.obolibrary.org/obo/CHEBI_32395'}, 'gene': {'type': 'literal', 'value': 'CYP1A2'}, 'uniprot': {'type': 'uri', 'value': 'http://purl.uniprot.org/uniprot/O77810'}, 'uniprotID': {'type': 'literal', 'value': 'CP1A2_CALJA'}, 'recname': {'type': 'literal', 'value': 'Cytochrome P450 1A2'}}, {'uniprotName': {'type': 'literal', 'value': '(5Z,8Z,11Z,14

In [6]:
# Same query through the httpx-based helper, kept in a separate variable so
# the two results can be compared.
result_2 = query_sparql(ground_truth_query, ground_truth_endpoint)
print(result_2)

{'head': {'vars': ['uniprot', 'uniprotID', 'recname', 'gene', 'chebi', 'uniprotName']}, 'results': {'bindings': [{'uniprotName': {'type': 'literal', 'value': '(5Z,8Z,11Z,14Z)-eicosatetraenoate'}, 'chebi': {'type': 'uri', 'value': 'http://purl.obolibrary.org/obo/CHEBI_32395'}, 'gene': {'type': 'literal', 'value': 'CYP1A2'}, 'uniprot': {'type': 'uri', 'value': 'http://purl.uniprot.org/uniprot/O77809'}, 'uniprotID': {'type': 'literal', 'value': 'CP1A2_MACFA'}, 'recname': {'type': 'literal', 'value': 'Cytochrome P450 1A2'}}, {'uniprotName': {'type': 'literal', 'value': '(5Z,8Z,11Z,14Z)-eicosatetraenoate'}, 'chebi': {'type': 'uri', 'value': 'http://purl.obolibrary.org/obo/CHEBI_32395'}, 'gene': {'type': 'literal', 'value': 'CYP1A2'}, 'uniprot': {'type': 'uri', 'value': 'http://purl.uniprot.org/uniprot/O77810'}, 'uniprotID': {'type': 'literal', 'value': 'CP1A2_CALJA'}, 'recname': {'type': 'literal', 'value': 'Cytochrome P450 1A2'}}, {'uniprotName': {'type': 'literal', 'value': '(5Z,8Z,11Z,14

In [3]:
# Second ground-truth case, with the federation reversed: the outer query runs
# on the Rhea endpoint and the SERVICE clause calls out to UniProt for reviewed
# entries of taxon 83333 that are annotated with a Rhea reaction.
# NOTE(review): the comment embedded in the query says it returns "the number
# of UniProtKB entries", but the SELECT projects rows, not a count — confirm
# which is intended.
# This rebinds ground_truth_query / ground_truth_endpoint from the earlier cell.
ground_truth_query = """PREFIX rh: <http://rdf.rhea-db.org/>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>

# Query 13
# Select all Rhea reactions used to annotate Escherichia coli (taxid=83333) in UniProtKB/Swiss-Prot
# return the number of UniProtKB entries
# 
# Federated query using a service to UniProt SPARQL endpoint
#
# This query cannot be performed using the Rhea search website
SELECT ?uniprot ?mnemo ?rhea ?accession ?equation 
WHERE {
  SERVICE <https://sparql.uniprot.org/sparql> { 
    VALUES (?taxid) { (taxon:83333) }
    GRAPH <http://sparql.uniprot.org/uniprot> {
      ?uniprot up:reviewed true . 
      ?uniprot up:mnemonic ?mnemo . 
      ?uniprot up:organism ?taxid .
      ?uniprot up:annotation/up:catalyticActivity/up:catalyzedReaction ?rhea . 
    }
  }
  ?rhea rh:accession ?accession .
  ?rhea rh:equation ?equation .
}"""
# Endpoint the query above is sent to (Rhea is the outer endpoint here).
ground_truth_endpoint = "https://sparql.rhea-db.org/sparql"

In [15]:
# Send the Rhea-hosted query through the httpx helper. query_sparql returns
# the exception object instead of raising, so print() shows the error text
# when the endpoint rejects the request.
result = query_sparql(ground_truth_query, ground_truth_endpoint)
print(result)

Client error '400 ' for url 'https://sparql.rhea-db.org/sparql?query=PREFIX+rh%3A+%3Chttp%3A%2F%2Frdf.rhea-db.org%2F%3E%0APREFIX+taxon%3A+%3Chttp%3A%2F%2Fpurl.uniprot.org%2Ftaxonomy%2F%3E%0APREFIX+up%3A+%3Chttp%3A%2F%2Fpurl.uniprot.org%2Fcore%2F%3E%0A%0A%23+Query+13%0A%23+Select+all+Rhea+reactions+used+to+annotate+Escherichia+coli+%28taxid%3D83333%29+in+UniProtKB%2FSwiss-Prot%0A%23+return+the+number+of+UniProtKB+entries%0A%23+%0A%23+Federated+query+using+a+service+to+UniProt+SPARQL+endpoint%0A%23%0A%23+This+query+cannot+be+performed+using+the+Rhea+search+website%0ASELECT+%3Funiprot+%3Fmnemo+%3Frhea+%3Faccession+%3Fequation+%0AWHERE+%7B%0A++SERVICE+%3Chttps%3A%2F%2Fsparql.uniprot.org%2Fsparql%3E+%7B+%0A++++VALUES+%28%3Ftaxid%29+%7B+%28taxon%3A83333%29+%7D%0A++++GRAPH+%3Chttp%3A%2F%2Fsparql.uniprot.org%2Funiprot%3E+%7B%0A++++++%3Funiprot+up%3Areviewed+true+.+%0A++++++%3Funiprot+up%3Amnemonic+%3Fmnemo+.+%0A++++++%3Funiprot+up%3Aorganism+%3Ftaxid+.%0A++++++%3Funiprot+up%3Aannotation%2Fup%3

In [16]:
# Repeat the Rhea-hosted query with the SPARQLWrapper-based helper for
# comparison with the httpx result above.
result = query_sparql_wrapper(ground_truth_query, ground_truth_endpoint)
print(result)

QueryBadFormed: A bad request has been sent to the endpoint: probably the SPARQL query is badly formed. 

Response:
b'<!DOCTYPE html SYSTEM "about:legacy-compat">\n<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head><title>Rhea</title><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"/><link href="/" rel="home"/><link href="/base.css" type="text/css" rel="stylesheet"/><link type="image/vnd.microsoft.icon" href="https://www.rhea-db.org//favicon.ico" rel="shortcut icon"/><link href="/rhea-sparql.css" type="text/css" rel="stylesheet"/><link href="https://creativecommons.org/licenses/by/4.0/" rel="license"/><script src="https://cdn.jsdelivr.net/npm/yasgui-yasqe@2.11.22/dist/yasqe.bundled.min.js"></script><link type="text/css" rel="stylesheet" href="https://cdn.jsdelivr.net/yasqe/2.2/yasqe.min.css"/><meta content="width=device-width, initial-scale=1" name="viewport"/><meta content="no-referrer" name="referrer"/></head><body typeof="schema:WebPage"><heade

In [1]:
# Redefines the CHEBI:32395 ground-truth query and its UniProt endpoint (same
# text as the earlier cell) — presumably so the metric cells below can run
# from a fresh kernel; confirm before removing the duplicate.
ground_truth_query = """PREFIX CHEBI: <http://purl.obolibrary.org/obo/CHEBI_>
PREFIX rh: <http://rdf.rhea-db.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX up: <http://purl.uniprot.org/core/>


SELECT 
    DISTINCT
        ?uniprot
        ?uniprotID
        ?recname
        ?gene
        ?chebi
        ?uniprotName
WHERE {
  SERVICE <https://sparql.rhea-db.org/sparql> {
     VALUES (?chebi) { (CHEBI:32395) }
     ?rhea rh:side/rh:contains/rh:compound ?compound .
     ?compound rh:chebi ?chebi .
     ?chebi up:name ?uniprotName .
  }
  ?uniprot up:annotation/up:catalyticActivity/up:catalyzedReaction ?rhea .
  ?uniprot up:mnemonic ?uniprotID .
  ?uniprot up:recommendedName/up:fullName ?recname .
  OPTIONAL {?uniprot up:encodedBy/skos:prefLabel ?gene .}
}
LIMIT 100
"""

# Endpoint the query above is sent to.
ground_truth_endpoint = "https://sparql.uniprot.org/sparql/"

In [2]:
import os
import sys

# Make the project package importable. The checkout location can be overridden
# with the SPARQL_RAG_AGENT_ROOT environment variable; when it is unset the
# original local path is used, so existing behavior is unchanged.
PROJECT_ROOT = os.environ.get(
    "SPARQL_RAG_AGENT_ROOT",
    '/Users/sebastian/Documents/Bachelor Thesis/sparql-rag-agent/sparql-rag-agent',
)
# Guard the append so re-running this cell does not stack duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


from experiments.utilities.result_metric import calculate_column_metrics_with_label_similarity, format_query_result_dataframe


# The same query/endpoint pair is passed twice, so both returned frames come
# from the ground-truth query. NOTE(review): the argument order is presumably
# (ground-truth query, endpoint, predicted query, endpoint) — confirm against
# format_query_result_dataframe.
df_ground_truth, df_predicted = format_query_result_dataframe(ground_truth_query, ground_truth_endpoint, ground_truth_query, ground_truth_endpoint)



  Using cached result for query
Querying ground truth endpoint...


In [3]:
# Preview the first rows of the ground-truth result frame.
df_ground_truth.head()

Unnamed: 0,uniprot,uniprotID,recname,gene,chebi,uniprotName
0,http://purl.uniprot.org/uniprot/O77809,CP1A2_MACFA,Cytochrome P450 1A2,CYP1A2,http://purl.obolibrary.org/obo/CHEBI_32395,"(5Z,8Z,11Z,14Z)-eicosatetraenoate"
1,http://purl.uniprot.org/uniprot/O77810,CP1A2_CALJA,Cytochrome P450 1A2,CYP1A2,http://purl.obolibrary.org/obo/CHEBI_32395,"(5Z,8Z,11Z,14Z)-eicosatetraenoate"
2,http://purl.uniprot.org/uniprot/P00184,CP1A1_MOUSE,Cytochrome P450 1A1,Cyp1a1,http://purl.obolibrary.org/obo/CHEBI_32395,"(5Z,8Z,11Z,14Z)-eicosatetraenoate"
3,http://purl.uniprot.org/uniprot/P00186,CP1A2_MOUSE,Cytochrome P450 1A2,Cyp1a2,http://purl.obolibrary.org/obo/CHEBI_32395,"(5Z,8Z,11Z,14Z)-eicosatetraenoate"
4,http://purl.uniprot.org/uniprot/P00187,CP1A2_RABIT,Cytochrome P450 1A2,CYP1A2,http://purl.obolibrary.org/obo/CHEBI_32395,"(5Z,8Z,11Z,14Z)-eicosatetraenoate"


In [4]:
# Preview the first rows of the predicted result frame.
df_predicted.head()

Unnamed: 0,uniprot,uniprotID,recname,gene,chebi,uniprotName
0,http://purl.uniprot.org/uniprot/O77809,CP1A2_MACFA,Cytochrome P450 1A2,CYP1A2,http://purl.obolibrary.org/obo/CHEBI_32395,"(5Z,8Z,11Z,14Z)-eicosatetraenoate"
1,http://purl.uniprot.org/uniprot/O77810,CP1A2_CALJA,Cytochrome P450 1A2,CYP1A2,http://purl.obolibrary.org/obo/CHEBI_32395,"(5Z,8Z,11Z,14Z)-eicosatetraenoate"
2,http://purl.uniprot.org/uniprot/P00184,CP1A1_MOUSE,Cytochrome P450 1A1,Cyp1a1,http://purl.obolibrary.org/obo/CHEBI_32395,"(5Z,8Z,11Z,14Z)-eicosatetraenoate"
3,http://purl.uniprot.org/uniprot/P00186,CP1A2_MOUSE,Cytochrome P450 1A2,Cyp1a2,http://purl.obolibrary.org/obo/CHEBI_32395,"(5Z,8Z,11Z,14Z)-eicosatetraenoate"
4,http://purl.uniprot.org/uniprot/P00187,CP1A2_RABIT,Cytochrome P450 1A2,CYP1A2,http://purl.obolibrary.org/obo/CHEBI_32395,"(5Z,8Z,11Z,14Z)-eicosatetraenoate"


In [5]:
# Keeps as DataFrame (with column name and structure)
df_predicted = df_predicted[["uniprotID"]]
df_ground_truth = df_ground_truth[["uniprotID"]]

In [6]:
# Keep only the first 10 predicted rows while the ground truth stays complete
# — presumably to simulate a prediction with partial recall. df_predicted is
# rebound here, so the full frame is no longer available afterwards.
df_predicted = df_predicted[:10]
print(len(df_predicted))
print(len(df_ground_truth))

10
100


In [7]:
# Rename the predicted column to a near-miss label ("iprotID") so the column
# names no longer match exactly — presumably to exercise the label-similarity
# matching inside the metric helper.
df_predicted = df_predicted.rename(columns={"uniprotID": "iprotID"})


# Compare ground truth against the truncated, renamed prediction.
result = calculate_column_metrics_with_label_similarity(df_ground_truth, df_predicted)

uniprotID -> iprotID (similarity: 0.8259257078170776)
100
10
{('CP2D6_PANPA',), ('A0A8I3VYX7_CALJA',), ('C5IMY1_PAPCY',), ('B6EY23_MACFA',), ('A0A0D9RP42_CHLSB',), ('CP1B1_RAT',), ('A0A8C5XD37_MICMU',), ('B6EY24_MACFA',), ('H0XKZ2_OTOGA',), ('CP1A2_FELCA',), ('CP2D6_PANTR',), ('A0A2K6T358_SAIBB',), ('CP1A1_MESAU',), ('LX15B_HUMAN',), ('CP1A2_MACFU',), ('CP1A2_HUMAN',), ('A0A7N9D4P8_MACFA',), ('B6EY14_MACFA',), ('CP240_MOUSE',), ('A0A2R8ZXU3_PANPA',), ('B6EY34_MACFA',), ('CP1A2_CAVPO',), ('B6EY19_MACFA',), ('CP1A1_RABIT',), ('A0A2K5PV53_CEBIM',), ('CP4Z1_HUMAN',), ('A0A2K5VAK7_MACFA',), ('CP1A2_RAT',), ('CP1A1_CAVPO',), ('B6EY36_MACFA',), ('A0A2K6Q3V3_RHIRO',), ('CP1A2_MOUSE',), ('CP2CN_RAT',), ('CP1A2_CALJA',), ('A0A2R9B9N0_PANPA',), ('CP1A1_FELCA',), ('CP1A2_CANLF',), ('A4F3V8_HUMAN',), ('A0A8C5UY91_MICMU',), ('A0A2K5MMW4_CERAT',), ('A0A2I3LF45_PAPAN',), ('G7P943_MACFA',), ('B6EY30_MACFA',), ('A0A2K5VMH7_MACFA',), ('CP1A2_RABIT',), ('C5IN69_PAPCY',), ('A0A2K6Q3U5_RHIRO',), ('CP3A4_HUM

In [8]:
# Show the metrics returned by calculate_column_metrics_with_label_similarity.
print(result)

{'precision': 1.0, 'recall': 0.1, 'f1_score': 0.18181818181818182}
