Checkout all the example queries for an endpoint to see which classes/properties that are present in the VoID description are missing from the examples

So we can improve completeness of examples

In [2]:
from qdrant_client.models import FieldCondition, Filter, MatchValue

from sparql_llm.config import settings
from sparql_llm.embed import get_vectordb
from sparql_llm.validate_sparql import get_void_dict, sparql_query_to_dict

check_endpoints = {
    "UniProt": "https://sparql.uniprot.org/sparql/",
    "OMA": "https://sparql.omabrowser.org/sparql/",
    "Bgee": "https://www.bgee.org/sparql/",
}

vectordb = get_vectordb("localhost")

ns_to_ignore = [
    "http://www.w3.org/ns/sparql-service-description#",
    "http://www.w3.org/ns/shacl#",
    "http://www.w3.org/2002/07/owl#",
    "http://rdfs.org/ns/void#",
    "http://purl.org/query/voidext#",
    "http://www.w3.org/2001/XMLSchema#",
]


def ignore_namespaces(cls) -> bool:
    return any(cls.startswith(ns) for ns in ns_to_ignore)


for endpoint_name, endpoint_url in check_endpoints.items():
    print(f"\n\n    🔎 Checking {endpoint_name} at {endpoint_url}")
    all_cls = set()
    all_preds = set()
    unfiltered_void_dict = get_void_dict(endpoint_url)
    void_dict = {}
    for cls, cls_dict in unfiltered_void_dict.items():
        # if cls.startswith("http://www.w3.org/2002/07/owl#"):
        if ignore_namespaces(cls):
            continue
        void_dict[cls] = cls_dict
        all_cls.add(cls)
        for pred, pred_dict in cls_dict.items():
            all_preds.add(pred)
            for cls2 in pred_dict:
                if not ignore_namespaces(cls2):
                    all_cls.add(cls2)

    # print(json.dumps(void_dict, indent=2))
    total_cls_count = len(all_cls)
    total_preds_count = len(all_preds)

    # Get indexed queries for this endpoint
    queries, _ = vectordb.scroll(
        collection_name=settings.docs_collection_name,
        scroll_filter=Filter(
            must=[
                FieldCondition(
                    key="doc_type",
                    match=MatchValue(value="sparql"),
                ),
                FieldCondition(
                    key="endpoint",
                    match=MatchValue(value=endpoint_url),
                ),
            ]
        ),
        limit=1000,
        with_payload=True,
        with_vectors=False,
    )
    for query_record in queries:
        sparql_query = query_record.payload["answer"].removeprefix("```sparql").removesuffix("```")
        query_dict = sparql_query_to_dict(sparql_query, endpoint_url)
        # print(query_dict)
        if not query_dict or endpoint_url not in query_dict:
            # print(f"Issue parsing query {sparql_query}")
            continue

        for subj, subj_dict in query_dict[endpoint_url].items():
            if "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" in subj_dict:
                for subj_type in subj_dict["http://www.w3.org/1999/02/22-rdf-syntax-ns#type"]:
                    if subj_type in void_dict:
                        del void_dict[subj_type]
                    if subj_type in all_cls:
                        all_cls.remove(subj_type)
                        # print(f"Deleted {subj_type}")

            for pred, pred_dict in subj_dict.items():
                if pred in all_preds:
                    all_preds.remove(pred)

    print(f"  CLASSES not used explicitly in {endpoint_name} examples")
    print("\n".join(all_cls))
    print(f"\n  PREDICATES not used in {endpoint_name} examples")
    print("\n".join(all_preds))
    print(
        f"🧾 {endpoint_name}: {total_cls_count-len(all_cls)}/{total_cls_count} classes and {total_preds_count-len(all_preds)}/{total_preds_count} predicates used"
    )



    🔎 Checking UniProt at https://sparql.uniprot.org/sparql/
  CLASSES not used explicitly in UniProt examples
http://purl.uniprot.org/core/Electronic_Citation
http://purl.uniprot.org/core/PH_Dependence_Annotation
http://purl.uniprot.org/core/Self_Interaction
http://purl.uniprot.org/core/Transit_Peptide_Annotation
http://purl.uniprot.org/core/Non-terminal_Residue_Annotation
http://purl.uniprot.org/core/Helix_Annotation
http://purl.uniprot.org/core/Redox_Potential_Annotation
http://biohackathon.org/resource/faldo#ExactPosition
http://purl.uniprot.org/core/Member_Of_Redudant_Proteome
http://purl.uniprot.org/core/Submission_Citation
http://purl.uniprot.org/core/Pathway
http://purl.uniprot.org/core/Alternative_Sequence_Annotation
http://purl.uniprot.org/core/Alternative_Promoter_Usage_Annotation
http://purl.uniprot.org/core/Intramembrane_Annotation
http://purl.uniprot.org/core/Erroneous_Termination_Annotation
http://purl.uniprot.org/core/Erroneous_Translation_Annotation
http://purl.unipr