# Extract queries from Virtuoso logs 

In [1]:
import os
import re
import urllib.parse


def parse_log_file(file_path):
    """Parse a single log file and collect the SPARQL queries it contains.

    Every line is matched against an access-log pattern for
    ``GET /sparql?query=...`` requests. Lines that do not match are ignored.
    Only the text before the first ``&`` of the captured query string is kept,
    then URL-decoded (``+`` becomes a space) and stripped of surrounding
    whitespace.

    Args:
        file_path: Path of the log file to read.

    Returns:
        A dict mapping each decoded query to the user agent found on the last
        matching line that carried this query.
    """
    log_pattern = re.compile(
        r'(?P<ip>\d+\.\d+\.\d+\.\d+) - - \[(?P<date>.*?)\] "GET /sparql\?query=(?P<query>.*?) HTTP.*?" (?P<status>\d+) \d+ ".*?" "(?P<user_agent>.*?)"'
    )

    queries_dict = {}
    # Decode explicitly as UTF-8 and replace undecodable bytes: one stray raw
    # byte in a log line must not abort the parsing of the whole file.
    with open(file_path, encoding="utf-8", errors="replace") as file:
        # Stream the file line by line instead of loading it entirely in memory
        for entry in file:
            match = log_pattern.match(entry)
            if not match:
                continue
            query_encoded = match.group("query").split("&")[0]  # Get the query part only
            query_decoded = urllib.parse.unquote_plus(query_encoded)
            # Same query seen again: the most recent user agent overwrites the previous one
            queries_dict[query_decoded.strip()] = match.group("user_agent")

    return queries_dict

def parse_log_directory(directory_path):
    """Parse every ``.log`` file of a directory and merge the extracted queries.

    Args:
        directory_path: Directory whose direct entries are scanned; only names
            ending with ``.log`` are passed to ``parse_log_file``.

    Returns:
        A dict merging the per-file results (query -> user agent). When the
        same query shows up in several files, the value from the file whose
        name sorts last is kept.
    """
    queries_dict = {}
    # os.listdir gives entries in arbitrary order: sort them so that the merge
    # result (which duplicate wins, insertion order) is reproducible.
    for filename in sorted(os.listdir(directory_path)):
        if filename.endswith(".log"):
            file_path = os.path.join(directory_path, filename)
            # Merge in place rather than rebuilding the whole dict for each file
            queries_dict.update(parse_log_file(file_path))
    return queries_dict

log_directory = "../data/logs-virtuoso"

# Longest queries first. Query length is a rough stand-in for complexity;
# ranking by BGP count would be more accurate, but this is enough for now.
queries_dict = dict(
    sorted(
        parse_log_directory(log_directory).items(),
        key=lambda pair: len(pair[0]),
        reverse=True,  # the sort stays stable, so equal-length queries keep their order
    )
)

for query, agent in queries_dict.items():
    print(f"{query}\n[{agent}]\n")

print(f"{len(queries_dict)} queries found in logs")

PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>

CONSTRUCT {
   ?seq genex:isExpressedIn ?anatEntity ;
        rdfs:label ?geneName .
} WHERE {
    VALUES ?g_label {"SCN5A" "KCC2D" "FGF12" "ZMY19" "EMC9" "BANP" "TEKT4" "PTN3"}
    ?seq a orth:Gene;
         orth:organism ?organism ;
         rdfs:label ?g_label .

    ?organism obo:RO_0002162 <http://purl.uniprot.org/taxonomy/9606> . 
    ?seq genex:isExpressedIn ?anatEntity.
    ?anatEntity a genex:AnatomicalEntity ;
                rdfs:label ?anatName .
}
[sparqlwrapper 2.0.0 (rdflib.github.io/sparqlwrapper)]

PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>     PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>     PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>     PREFIX foaf: <http://xmlns.com/foaf/0.1/>     PREFIX ex: <http://example.org/>     PREFIX owl: <http://www.w3.org/2002/07/owl#>  SELECT DISTINCT ?class ?subclass WHERE {     graph <