# Running example queries

Trying to run the example SPARQL queries using `SPARQLWrapper`. It is a painful experience: the wrapper does not work well, even with a basic Virtuoso SPARQL endpoint.

## Questions to test

Select all human UniProt entries with a sequence variant that leads to a tyrosine to phenylalanine substitution



In [None]:
import json
import re
import time

from qdrant_client.models import FieldCondition, Filter, MatchValue
from rdflib.plugins.sparql import prepareQuery
from SPARQLWrapper import JSON, TURTLE, SPARQLWrapper

from sparql_llm.config import settings
from sparql_llm.index import get_vectordb

# Obtain the vector database handle from the project helper, targeting "localhost".
vectordb = get_vectordb("localhost")

# Only keep records whose `doc_type` payload field equals "sparql".
_sparql_docs_only = Filter(
    must=[FieldCondition(key="doc_type", match=MatchValue(value="sparql"))]
)

# Fetch at most 1000 matching records, with payloads but without vectors.
# The second element of the returned pair is discarded (presumably the
# next-page offset -- TODO confirm; if so, collections with more than 1000
# matching records would be silently truncated here).
all_queries, _ = vectordb.scroll(
    collection_name=settings.docs_collection_name,
    scroll_filter=_sparql_docs_only,
    with_vectors=False,
    with_payload=True,
    limit=1000,
)


# Matches a LIMIT clause at the very end of a query, optionally followed by an
# OFFSET clause (SPARQL allows both `LIMIT n` and `LIMIT n OFFSET m`).
# Compiled once at module level instead of on every call.
_TRAILING_LIMIT_PATTERN = re.compile(
    r"LIMIT\s+\d+(?:\s+OFFSET\s+\d+)?\s*$", re.IGNORECASE
)


def ensure_limit(query: str) -> str:
    """Return *query* with a trailing ``LIMIT 1`` added if it has no final LIMIT.

    A query that already ends with ``LIMIT <n>`` (case-insensitive), optionally
    followed by ``OFFSET <m>``, is returned unchanged. Otherwise trailing
    whitespace is stripped and `` LIMIT 1`` is appended.

    Args:
        query: The SPARQL query text.

    Returns:
        The original query when it already ends with a LIMIT clause, else the
        right-stripped query followed by `` LIMIT 1``.
    """
    stripped = query.rstrip()
    if _TRAILING_LIMIT_PATTERN.search(stripped):
        # Already limited: appending a second LIMIT would make the query invalid.
        return query
    return stripped + " LIMIT 1"


# Run every example query against its endpoint and collect the ones that
# return at least one result.
questions_results = []

# Query forms (rdflib algebra names) whose response is RDF rather than a JSON result set.
queries_returning_rdf = ["DescribeQuery", "ConstructQuery"]
# Indices previously skipped by hand to avoid timeouts (kept for reference):
# skip_queries = [7, 27, 32, 37, 38, 42, 43, 47, 62, 63]
print(len(all_queries))
for i, query_record in enumerate(all_queries):
    start_time = time.time()
    try:
        # The stored question is "<prefix>: <question>"; keep the part after the first colon.
        question = query_record.payload["question"].split(":", 1)[1].strip()
        endpoint = query_record.payload["endpoint"]
        # The stored answer wraps the query in a ```sparql ... ``` markdown fence.
        query = (
            query_record.payload["answer"].removeprefix("```sparql").removesuffix("```")
        )

        print(f"[{i}/{len(all_queries)}] {question} <{endpoint}>")

        # Parse locally with rdflib to find out which query form this is.
        query_form = prepareQuery(query).algebra.name

        if query_form in queries_returning_rdf:
            # NOTE: for some reason CONSTRUCT queries are failing directly with every format,
            # so RDF-returning queries are not executed. Say so explicitly instead of
            # leaving the header line above without any status.
            print(f"⏩️ skipped {query_form} in {time.time() - start_time:.2f}s")
            continue

        sparql_endpoint = SPARQLWrapper(endpoint)
        sparql_endpoint.setReturnFormat(JSON)
        sparql_endpoint.setQuery(query)
        sparql_endpoint.setTimeout(200)

        results = sparql_endpoint.query().convert()
        if query_form == "AskQuery":
            # ASK responses carry a single boolean instead of bindings.
            res_count = 1 if results["boolean"] else 0
        else:
            res_count = len(results["results"]["bindings"])

        if res_count > 0:
            string_resp = "✅"
            questions_results.append(
                {
                    "question": question,
                    "endpoint": endpoint,
                    "query": query,
                    "results": res_count,
                    "runtime": int(time.time() - start_time),
                }
            )
            # Dump progress after every 10th successful query. Doing this only when
            # a result was just added avoids re-printing the same list repeatedly.
            if len(questions_results) % 10 == 0:
                print(json.dumps(questions_results, indent=4))
        else:
            string_resp = "❌"
    except Exception as e:
        # Best-effort run: report the failure for this query and move on to the next.
        res_count = str(e)
        string_resp = "💣️"

    print(f"{string_resp} results {res_count} in {time.time() - start_time:.2f}s")

print(json.dumps(questions_results, indent=4))

129
[0/129] Was any UniProt entry integrated on the 9th of January 2013 <https://sparql.uniprot.org/sparql/>
✅ results 1 in 0.09s
[1/129] Construct new triples of the type 'HumanProtein' from all human UniProt entries <https://sparql.uniprot.org/sparql/>
[2/129] Select all taxa from the UniProt taxonomy <https://sparql.uniprot.org/sparql/>
✅ results 2941742 in 321.27s
[3/129] Select all human UniProt entries with a sequence variant that leads to a tyrosine to phenylalanine substitution <https://sparql.uniprot.org/sparql/>
✅ results 128 in 9.82s
[4/129] Select all UniProt entries with annotated transmembrane regions and the regions' begin and end coordinates on the canonical sequence <https://sparql.uniprot.org/sparql/>
💣️ results The read operation timed out in 5294.79s
[5/129] Select all UniProt entries that were integrated on the 30th of November 2010 <https://sparql.uniprot.org/sparql/>
✅ results 643989 in 180.40s
[6/129] Select all triples that relate to the EMBL CDS entry AA089367