# Testing Expasy query helper

Testing our Expasy query helper by reusing the example queries that succeeded, retrieved from `test_example_queries.ipynb`.

Potentially interesting questions to use for evaluation:
- Which are the human genes associated with cancer and their orthologs expressed in the rat brain?
- Select the number of proteins for each of the subclasses of ec:1.1.1.- EC (Enzyme Commission) class (from biosoda)
- 

In [14]:
import re
import os
import requests

from qdrant_client.models import FieldCondition, Filter, MatchValue, ScoredPoint
from rdflib.plugins.sparql import prepareQuery, prepareUpdate
from rdflib.plugins.sparql.algebra import translateQuery
from SPARQLWrapper import JSON, TURTLE, XML, SPARQLWrapper
from dotenv import load_dotenv

from expasy_chat.utils import extract_sparql_queries

# Let python-dotenv populate the process environment first, then read the
# API key from it; the key is None when the variable is not set.
load_dotenv()
expasy_api_key = os.environ.get("EXPASY_API_KEY")

# Reference examples used by the evaluation loop in this cell. Each entry has:
#   "question": the natural-language question sent to the chat API
#   "endpoint": the SPARQL endpoint URL associated with the reference query
#   "query":    the reference SPARQL query whose results are the expected answer
example_queries = [
  # Human (taxon:9606) proteins encoded by a gene whose preferred label is "LCT"
  {
    "question": "what is the accession number in uniprot of the human gene LCT? Return only the distinct protein URIs",
    "endpoint": "https://sparql.uniprot.org/sparql/",
    "query": """PREFIX up: <http://purl.uniprot.org/core/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>

SELECT ?protein
WHERE
{
    ?protein a up:Protein .
    ?protein up:organism taxon:9606 .
    ?protein up:encodedBy ?gene .
    ?gene skos:prefLabel "LCT" .
}""",
  },
  # Reviewed mouse (taxon:10090) proteins whose sequence value starts with "G"
  {
    # NOTE: The mature part of the question makes it harder to answer
    # "question": "How do I filter for reviewed (mouse) proteins whose mature form carries an N-terminal glycine? Return protein URI and AA sequence",
    "question": "How do I filter for reviewed mouse proteins which carry an N-terminal glycine? Return protein URI and AA sequence",
    "endpoint": "https://sparql.uniprot.org/sparql/",
    "query": """PREFIX up: <http://purl.uniprot.org/core/>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
SELECT ?protein ?sequence
WHERE
{
    ?protein a up:Protein ;
        up:organism taxon:10090 ;  # Taxonomy ID for Mus musculus (Mouse)
        up:reviewed true ;
        up:sequence ?isoform .
    ?isoform rdf:value ?sequence .
    # Ensure the N-terminal amino acid is Glycine (G)
    FILTER (STRSTARTS(?sequence, "G"))
}""",
  },

  # Distinct Rhea reactions catalyzed by reviewed proteins where the
  # catalytic-activity statement is attributed to evidence ECO_0000269
  {
    "question": "How could I download a table that only includes the Rhea reactions for which there is experimental evidence? Return only the rhea URI",
    "endpoint": "https://sparql.uniprot.org/sparql/",
    "query": """PREFIX up: <http://purl.uniprot.org/core/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
SELECT DISTINCT ?rhea
WHERE {
  # ECO 269 is experimental evidence
  BIND (<http://purl.obolibrary.org/obo/ECO_0000269> as ?evidence)
  GRAPH <http://sparql.uniprot.org/uniprot> {
    ?protein up:reviewed true ;
      up:annotation ?a ;
      up:attribution ?attribution  .

    ?a a up:Catalytic_Activity_Annotation ;
      up:catalyticActivity ?ca .
    ?ca up:catalyzedReaction ?rhea .

    [] rdf:subject ?a ;
      rdf:predicate up:catalyticActivity ;
      rdf:object ?ca ;
      up:attribution ?attribution .

    ?attribution up:evidence ?evidence .
  }
}""",
  },

  # Federated query: reactions are selected through a SERVICE call to the Rhea
  # endpoint (compounds under CHEBI:18059, which the question refers to as
  # lipids), then joined with human proteins on the UniProt side
  {
    "question": "Which human proteins are enzymes catalyzing a reaction involving lipids? Return the protein, lipid and reaction URI",
    "endpoint": "https://sparql.uniprot.org/sparql/",
    "query": """PREFIX up: <http://purl.uniprot.org/core/>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX rh: <http://rdf.rhea-db.org/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX CHEBI: <http://purl.obolibrary.org/obo/CHEBI_>
SELECT DISTINCT ?protein ?lipid ?reaction
WHERE {
  SERVICE <https://sparql.rhea-db.org/sparql> {
    ?reaction rdfs:subClassOf rh:Reaction .
    ?reaction rh:side/rh:contains/rh:compound ?compound .
    ?compound rh:chebi ?lipid .
    ?lipid rdfs:subClassOf* CHEBI:18059 .
  }
  ?protein a up:Protein ;
    up:organism taxon:9606 ;
    up:annotation/up:catalyticActivity/up:catalyzedReaction ?reaction .
}""",
  },

]

# How many times each question is submitted to the chat API.
number_of_tries = 3

# Running tallies updated by the evaluation loop. `fail` counts every failed
# try, result mismatches included, so pure errors are `fail - mismatch`.
success, mismatch, fail = 0, 0, 0

def _freeze_row(value):
    """Return a hashable stand-in for `value` that compares equal exactly when the originals do.

    Plain dicts and lists are converted recursively and tagged with their kind
    so a list never collides with an equal-looking tuple. Exact type checks
    (not `isinstance`) are deliberate: subclasses such as `OrderedDict` have
    their own equality rules, so they are returned untouched and make the
    caller fall back to a linear scan.
    """
    if type(value) is dict:
        return ("dict", frozenset((key, _freeze_row(val)) for key, val in value.items()))
    if type(value) is list:
        return ("list", tuple(_freeze_row(val) for val in value))
    return value


def result_sets_are_same(gen_set, ref_set):
    """Check that two collections of result rows contain the same distinct rows.

    Order and duplicates are ignored. The first row found on one side only is
    printed before returning.

    Args:
        gen_set: Iterable of rows produced by the generated query.
        ref_set: Iterable of rows produced by the reference query.

    Returns:
        True when every row of each collection also appears in the other,
        False otherwise.
    """
    gen_set, ref_set = list(gen_set), list(ref_set)
    try:
        # Hashed lookups keep the comparison linear; scanning a list for every
        # row is quadratic and very slow on result sets with tens of thousands
        # of rows.
        gen_keys = [_freeze_row(item) for item in gen_set]
        ref_keys = [_freeze_row(item) for item in ref_set]
        gen_lookup, ref_lookup = set(gen_keys), set(ref_keys)
    except TypeError:
        # Some row is not hashable even after freezing: compare by equality
        # with a plain linear scan instead.
        gen_keys, ref_keys = gen_set, ref_set
        gen_lookup, ref_lookup = gen_set, ref_set

    for item, key in zip(gen_set, gen_keys):
        if key not in ref_lookup:
            print(f"Missing from reference: {item}")
            return False
    for item, key in zip(ref_set, ref_keys):
        if key not in gen_lookup:
            print(f"Missing from generated: {item}")
            return False
    return True


def _run_select_query(endpoint_url, query, timeout=200):
    """Run `query` against the SPARQL endpoint at `endpoint_url` and return its result bindings.

    A fresh `SPARQLWrapper` is built for every call so a query can never be
    sent to an endpoint left over from a previous request.

    Args:
        endpoint_url: URL of the SPARQL endpoint to query.
        query: SPARQL query text.
        timeout: Timeout in seconds handed to `SPARQLWrapper.setTimeout`.

    Returns:
        The `results.bindings` list of the JSON response.
    """
    endpoint = SPARQLWrapper(endpoint_url)
    endpoint.setReturnFormat(JSON)
    endpoint.setTimeout(timeout)
    endpoint.setQuery(query)
    return endpoint.query().convert()["results"]["bindings"]


print(f"🧪 Testing {len(example_queries)} queries")
for i, test_query in enumerate(example_queries):
    # Debugging aid: uncomment to skip the first example(s)
    # if i < 1:
    #     continue

    # Execute the reference query on the endpoint declared for this example.
    # This is deliberately outside the try block: if the reference itself
    # cannot be run there is nothing to compare against.
    res_from_ref = _run_select_query(test_query["endpoint"], test_query["query"])

    for t in range(number_of_tries):
        # Reset for every try so the error report below never shows a query
        # left over from a previous try.
        generated_sparql = None
        try:
            # The chat call is inside the try so a timeout or bad response
            # counts as one failed try instead of aborting the whole run.
            resp = requests.post(
                "http://localhost:8000/chat",
                json={
                    "messages": [{"role": "user", "content": test_query["question"]}],
                    "stream": False,
                    "api_key": expasy_api_key,
                },
                timeout=60,
            )
            resp.raise_for_status()
            chat_resp_md = resp.json()["choices"][0]["message"]["content"]

            # The last query found in the answer is the one that gets evaluated
            generated_sparql = extract_sparql_queries(chat_resp_md)[-1]
            if generated_sparql["query"].strip() == test_query["query"].strip():
                print(f"✅ {t+1}/{number_of_tries} {test_query['question']}. EXACT MATCH")
                success += 1
                continue

            # Execute the generated query
            res_from_generated = _run_select_query(generated_sparql["endpoint"], generated_sparql["query"])

            if not result_sets_are_same(res_from_generated, res_from_ref):
                mismatch += 1
                print(f"Ref: {len(res_from_ref)} != gen: {len(res_from_generated)}")
                # Raised so the handler below reports the mismatch; this is
                # why `fail` also includes mismatches.
                raise ValueError("\nResults mismatch")

            print(f"✅ {t+1}/{number_of_tries} {test_query['question']} = {len(res_from_generated)}")
            success += 1

        except Exception as e:
            fail += 1
            print(f"❌ {t+1}/{number_of_tries} {test_query['question']} {e}. Generated query:")
            if generated_sparql is None:
                print("<no SPARQL query could be extracted from the response>")
            else:
                print(generated_sparql["query"])
            print("Correct query:")
            print(test_query["query"])
            print("")

    # Mismatches are included in `fail`, so pure errors are the difference
    print(f"⚖️ Success: {success}, Mismatch: {mismatch}, Error: {fail-mismatch}")

🧪 Testing 4 queries
✅ what is the accession number in uniprot of the human gene LCT? Return only the distinct protein URIs = 4
✅ what is the accession number in uniprot of the human gene LCT? Return only the distinct protein URIs = 4
✅ what is the accession number in uniprot of the human gene LCT? Return only the distinct protein URIs = 4
⚖️ Success: 3, Mismatch: 0, Error: 0
✅ How do I filter for reviewed mouse proteins which carry an N-terminal glycine? Return protein URI and AA sequence = 20
✅ How do I filter for reviewed mouse proteins which carry an N-terminal glycine? Return protein URI and AA sequence = 20
✅ How do I filter for reviewed mouse proteins which carry an N-terminal glycine? Return protein URI and AA sequence = 20
⚖️ Success: 6, Mismatch: 0, Error: 0
✅ How could I download a table that only includes the Rhea reactions for which there is experimental evidence? Return only the rhea URI = 47936
✅ How could I download a table that only includes the Rhea reactions for which