Check all the example queries for an endpoint to see which classes/properties present in its VoID description are missing from the examples.

This lets us improve the completeness of the examples.

In [None]:
from sparql_llm.sparql_examples_loader import GET_SPARQL_EXAMPLES_QUERY
from sparql_llm.utils import query_sparql
from sparql_llm.validate_sparql import get_schema_for_endpoint, sparql_query_to_dict

# SPARQL endpoints to check: for each one, the example queries it serves are
# compared against the classes/predicates of its VoID description.
check_endpoints = [
    "https://sparql.omabrowser.org/sparql/",
    "https://www.bgee.org/sparql/",
    "https://sparql.uniprot.org/sparql/",
]

# Namespace prefixes whose terms are left out of the coverage report.
ns_to_ignore = [
    "http://www.w3.org/ns/sparql-service-description#",
    "http://www.w3.org/ns/shacl#",
    "http://www.w3.org/2002/07/owl#",
    "http://rdfs.org/ns/void#",
    "http://purl.org/query/voidext#",
    "http://www.w3.org/2001/XMLSchema#",
]


def ignore_namespaces(cls) -> bool:
    """Tell whether the IRI `cls` starts with one of the `ns_to_ignore` prefixes."""
    # str.startswith accepts a tuple of candidates, so one call covers every prefix.
    ignored_prefixes = tuple(ns_to_ignore)
    return cls.startswith(ignored_prefixes)


# IRI of rdf:type. A class only counts as "used" when an example query names it
# as the object of this predicate.
RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"

# For every endpoint: collect the classes/predicates of its VoID description,
# strike out the ones found in its example queries, and report what is left.
for endpoint_url in check_endpoints:
    print(f"\n\n    🔎 Checking {endpoint_url}")
    # Sets of all classes and predicates in the VoID description.
    # Entries are removed as they are found in the example queries, so whatever
    # remains at the end is not covered by any example.
    all_cls = set()
    all_preds = set()

    # Get all classes and predicates from the VoID description
    unfiltered_void_dict = get_schema_for_endpoint(endpoint_url)
    void_dict = {}
    for cls, cls_dict in unfiltered_void_dict.items():
        if ignore_namespaces(cls):
            continue
        void_dict[cls] = cls_dict
        all_cls.add(cls)
        for pred, pred_dict in cls_dict.items():
            all_preds.add(pred)
            # Entries listed under a predicate are also counted as classes,
            # unless they belong to an ignored namespace.
            all_cls.update(cls2 for cls2 in pred_dict if not ignore_namespaces(cls2))

    # Remember the totals before removing anything, for the final summary line
    total_cls_count = len(all_cls)
    total_preds_count = len(all_preds)

    # Get all example queries for this endpoint
    example_bindings = query_sparql(GET_SPARQL_EXAMPLES_QUERY, endpoint_url)["results"]["bindings"]
    for query_example in example_bindings:
        sparql_query = query_example["query"]["value"]
        query_dict = sparql_query_to_dict(sparql_query, endpoint_url)
        if not query_dict or endpoint_url not in query_dict:
            # Nothing usable was extracted for this endpoint from the query
            continue

        for subj_dict in query_dict[endpoint_url].values():
            # Classes explicitly used as rdf:type of a subject are covered
            for subj_type in subj_dict.get(RDF_TYPE, ()):
                void_dict.pop(subj_type, None)
                all_cls.discard(subj_type)

            # Every predicate appearing on this subject is covered
            all_preds.difference_update(subj_dict)

    # Sort the leftovers so the report is stable from one run to the next
    # (iteration order of a set of strings is not).
    print(f"  CLASSES not used explicitly in {endpoint_url} examples")
    print("\n".join(sorted(all_cls)))
    print(f"\n  PREDICATES not used explicitly in {endpoint_url} examples")
    print("\n".join(sorted(all_preds)))
    print(
        f"\n🧾 {endpoint_url}: {total_cls_count - len(all_cls)}/{total_cls_count} classes and {total_preds_count - len(all_preds)}/{total_preds_count} predicates used"
    )



    🔎 Checking https://sparql.omabrowser.org/sparql/
  CLASSES not used explicitly in https://sparql.omabrowser.org/sparql/ examples
http://purl.uniprot.org/core/Rank
http://www.w3.org/1999/02/22-rdf-syntax-ns#Property
http://purl.uniprot.org/core/Protein
http://rdf.ebi.ac.uk/resource/ensembl/protein
http://purl.obolibrary.org/obo/SO_0000673
http://purl.org/net/orth#Gene
http://purl.org/net/orth#OrthologyDataset
http://semanticscience.org/resource/SIO_000750
http://purl.obolibrary.org/obo/SO_0000704
http://purl.org/net/orth#Organism
http://www.w3.org/2000/01/rdf-schema#Class
http://purl.org/net/orth#TaxonomicRange
http://purl.org/net/orth#HierarchicalGeneTree

  PREDICATES not used explicitly in https://sparql.omabrowser.org/sparql/ examples
http://purl.org/lscr#xrefNCBIGene
http://purl.org/lscr#xrefEnsemblTranscript
http://purl.org/lscr#xrefEnsemblProtein
http://www.w3.org/2000/01/rdf-schema#subPropertyOf
http://purl.org/lscr#xrefNCBIProtein
http://purl.obolibrary.org/obo/CDAO_00001