Generate statistics on the example queries. Requires the `sparql-examples` repository to be cloned alongside this repository.

In [6]:
import os
from collections import defaultdict
from statistics import mean, median

import pandas as pd
from rdflib import Graph

from sparql_llm.validate_sparql import sparql_query_to_dict

# SPARQL query used to extract the examples from an RDF graph: it selects every
# `sh:SPARQLExecutable` resource together with its target endpoint
# (`schema:target`) and its query text, whichever of `sh:select`,
# `sh:construct`, `sh:describe` or `sh:ask` holds it.
GET_EXAMPLE_QUERY = """PREFIX sh: <http://www.w3.org/ns/shacl#>
PREFIX schema: <https://schema.org/>
SELECT DISTINCT ?sq ?query ?endpoint
WHERE {
    ?sq a sh:SPARQLExecutable ;
        schema:target ?endpoint ;
        sh:select|sh:construct|sh:describe|sh:ask ?query .
}"""


def _find_example_files(directory: str, max_depth: int) -> list:
    """Return the paths of the example `.ttl` files found under `directory`.

    Only folders nested at most `max_depth` levels below `directory` are
    searched, and files whose name ends with `prefixes.ttl` are skipped.
    Folders and files are visited in sorted order so that the result does not
    depend on the order in which the filesystem lists entries.
    """
    ttl_files = []
    for root, dirs, files in os.walk(directory):
        # Derive the depth from the relative path rather than from a string
        # slice, so a trailing separator on `directory` cannot shift it by one.
        rel_root = os.path.relpath(root, directory)
        depth = 0 if rel_root == os.curdir else rel_root.count(os.sep) + 1
        if depth <= max_depth:
            ttl_files.extend(
                os.path.join(root, file)
                for file in sorted(files)
                if file.endswith(".ttl") and not file.endswith("prefixes.ttl")
            )
        if depth >= max_depth:
            # Nothing below this folder is collected: stop descending.
            dirs[:] = []
        else:
            dirs.sort()
    return ttl_files


def _count_triple_patterns(query_dict: dict) -> int:
    """Return the number of objects stored in the innermost level of `query_dict`.

    `query_dict` is expected to be nested three levels deep
    (presumably endpoint -> subject -> predicate -> objects, as returned by
    `sparql_query_to_dict` -- confirm against that helper).
    """
    return sum(
        len(objects)
        for subjects in query_dict.values()
        for predicates in subjects.values()
        for objects in predicates.values()
    )


def _summarize(label: str, counts: list) -> dict:
    """Build one row of the stats table for `counts`, using zeros when it is empty."""
    if not counts:
        return {
            "endpoint": label,
            "queries_count": 0,
            "average_triples_patterns": 0,
            "median_triples_patterns": 0,
            "max_triples_patterns": 0,
        }
    return {
        "endpoint": label,
        "queries_count": len(counts),
        "average_triples_patterns": round(mean(counts), 2),
        "median_triples_patterns": median(counts),
        "max_triples_patterns": max(counts),
    }


def compute_queries_stats(directory: str, max_depth: int = 2) -> pd.DataFrame:
    """Compute triple pattern statistics for the example queries found in `directory`.

    Every example `.ttl` file is parsed as Turtle, the queries it declares are
    extracted with `GET_EXAMPLE_QUERY`, and each one is converted with
    `sparql_query_to_dict` to count its triple patterns. A short summary
    (queries, triples and errors) is printed to stdout.

    Args:
        directory: Root folder containing the example `.ttl` files.
        max_depth: How many folder levels below `directory` are searched
            (0 means only `directory` itself).

    Returns:
        A DataFrame with one row per target endpoint, followed by a `total`
        row, with the columns `endpoint`, `queries_count`,
        `average_triples_patterns`, `median_triples_patterns` and
        `max_triples_patterns`.

    Raises:
        Any exception raised while parsing a `.ttl` file is propagated; only
        failures of the query conversion are caught and counted as errors.
    """
    ttl_files = _find_example_files(directory, max_depth)

    triples_per_query = []
    # Triple pattern counts of the queries, grouped by target endpoint
    triples_per_endpoint = defaultdict(list)
    parsing_errors = 0

    for ttl_file in ttl_files:
        g = Graph()
        g.parse(ttl_file, format="turtle")
        for row in g.query(GET_EXAMPLE_QUERY):
            query = row["query"]
            main_target_endpoint = str(row["endpoint"])
            try:
                query_dict = sparql_query_to_dict(query, main_target_endpoint)
                num_triples = _count_triple_patterns(query_dict)
            except RecursionError:
                # Known issue with nested SERVICE clauses https://github.com/RDFLib/rdflib/issues/2136
                parsing_errors += 1
            except Exception as e:
                print(f"Error parsing query for {ttl_file} {main_target_endpoint}: {e}")
                print(query)
                parsing_errors += 1
            else:
                triples_per_query.append(num_triples)
                triples_per_endpoint[main_target_endpoint].append(num_triples)

    # One row per endpoint, then the overall total
    data = [
        _summarize(endpoint, triples_list)
        for endpoint, triples_list in triples_per_endpoint.items()
    ]
    data.append(_summarize("total", triples_per_query))

    print(f"Total number of queries overall: {len(triples_per_query)}")
    print(f"Total number of triples overall: {sum(triples_per_query)}")
    print(f"Errors while parsing queries: {parsing_errors}\n")

    return pd.DataFrame(data)


# Compute the stats for the examples of the sibling `sparql-examples` checkout
# and print them as CSV (without the index column)
examples_dir = "../../sparql-examples/examples"
df = compute_queries_stats(examples_dir)
csv_report = df.to_csv(index=False)
print(csv_report)

Total number of queries overall: 1016
Total number of triples overall: 6632
Errors while parsing queries: 3

endpoint,queries_count,average_triples_patterns,median_triples_patterns,max_triples_patterns
https://biosoda.unil.ch/graphdb/repositories/emi-dbgi,9,21.44,22.0,35
https://sparql.rhea-db.org/sparql/,20,7.05,5.5,22
https://glyconnect.expasy.org/sparql,5,5.0,3.0,9
https://sparql.omabrowser.org/sparql/,15,10.93,9.0,19
https://sparql.nextprot.org/sparql,773,5.95,5.0,26
https://sparql.uniprot.org/sparql/,110,6.71,5.0,32
https://sparql.swisslipids.org/sparql/,22,6.73,5.0,16
https://sparql.orthodb.org/sparql/,20,11.2,8.0,34
https://hamap.expasy.org/sparql/,4,2.0,1.0,6
https://rdf.metanetx.org/sparql/,13,7.31,7.0,12
https://www.bgee.org/sparql/,25,11.88,9.0,25
total,1016,6.53,5.0,35

