In [1]:
from question_types.sparql import SparqlQueries
from tabulate import tabulate
import json
import os
import unicodedata
import re

In [2]:
# Build the shared SPARQL helper used by execute_query() from the graph file.
# The log output of this cell shows roughly 50 seconds between "Initializing"
# and "Graph parsed", so the cell is expensive to re-run.
sparql = SparqlQueries("../dataset/14_graph.nt")

2024-10-31 13:59:05,271 | INFO | __init__ | Initializing SparqlQueries class...
2024-10-31 13:59:58,094 | INFO | __init__ | Graph parsed
2024-10-31 13:59:58,095 | INFO | __init__ | SparqlQueries class initialized successfully.


In [3]:
def execute_query(query):
    """Run a SPARQL query and return its rows as lists of strings.

    Args:
        query: SPARQL query text, handed unchanged to ``sparql.execute_query``.

    Returns:
        A list with one inner list per result row. Each value is converted
        with ``str``; a row that is not a tuple is wrapped as a single-column
        row. When the query yields no rows, a notice is printed and an empty
        list is returned, so callers can always slice or iterate the result.

    Note:
        Because every value goes through ``str``, a value of ``None`` in a row
        (presumably an unbound OPTIONAL variable -- confirm against the
        backend) comes back as the string ``"None"``.
    """
    rows = [
        [str(value) for value in (row if isinstance(row, tuple) else (row,))]
        for row in sparql.execute_query(query)
    ]
    if not rows:
        # Keep the notebook feedback, but return a list rather than the None
        # produced by print(), which made callers crash on slicing.
        print("Results were empty")
    return rows

In [4]:
def get_all_humans_with_optional_occupation():
    """Query all humans with their English label and, if present, an occupation.

    The query selects entities that are an instance (``wdt:P31``) of
    ``wd:Q5`` and have an English ``rdfs:label``. The occupation
    (``wdt:P106``) and its English label sit inside an OPTIONAL block, so
    humans without one are still returned.

    Returns:
        Whatever ``execute_query`` returns for this query; the selected
        columns are, in order, human, human label, occupation and
        occupation label.
    """
    return execute_query("""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>

        SELECT ?human ?humanLabel ?occupation ?occupationLabel WHERE {
            ?human wdt:P31 wd:Q5 ;           # Ensure the entity is a human
                   rdfs:label ?humanLabel .  # Get human label

            OPTIONAL {
                ?human wdt:P106 ?occupation .           # Occupation property (optional)
                ?occupation rdfs:label ?occupationLabel  # Occupation label (optional)
                FILTER(LANG(?occupationLabel) = "en")    # English labels for occupations
            }

            FILTER(LANG(?humanLabel) = "en")             # English labels for humans
        }
    """)

In [5]:
def normalize_string(s):
    """Reduce an entity name to a uniform, ASCII-only, lowercase form.

    Steps, in order: lowercase the text, apply NFKD decomposition so accented
    letters split into a base letter plus combining marks, drop everything
    that is not ASCII, strip characters that are neither word characters nor
    whitespace, and collapse whitespace runs into single spaces (which also
    trims both ends).

    Args:
        s: The string to normalize.

    Returns:
        The normalized string. Characters with no ASCII decomposition are
        removed rather than transliterated.
    """
    decomposed = unicodedata.normalize('NFKD', s.lower())
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8')
    without_punctuation = re.sub(r'[^\w\s]', '', ascii_only)
    return ' '.join(without_punctuation.split())

def ensure_directory_exists(directory):
    """Create ``directory`` (including missing parents) if it does not exist.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If the path already exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check, which could race with
    # another process and silently accepted a regular file at this path.
    os.makedirs(directory, exist_ok=True)
               
def export_people_json():
    """Export a mapping of human entity IDs to normalized names as JSON.

    Fetches all humans via ``get_all_humans_with_optional_occupation``, prints
    a grid preview of the first five rows, and writes
    ``exports/people_db.json`` with the first column of each row as key and
    the normalized second column as value. Rows whose name is empty or has
    three characters or fewer are skipped.

    If the query yields nothing (an empty list or ``None``), a notice is
    printed and the export file is left untouched instead of crashing.

    Returns:
        None.
    """
    ensure_directory_exists('exports')

    # The query helper may return None rather than a list for an empty result.
    people_lst = get_all_humans_with_optional_occupation() or []
    if not people_lst:
        print("No people found; exports/people_db.json was not written")
        return

    print(tabulate(people_lst[:5], headers=["Entity ID", "Name", "Occupation ID", "Occupation Label"], tablefmt="grid"))

    # The same entity can appear in several rows; keying by entity ID keeps one
    # entry per entity, with the last row winning.
    people_json = {
        entity_id: normalize_string(name)
        for entity_id, name, *_ in people_lst
        if name and len(name) > 3
    }

    with open('exports/people_db.json', 'w', encoding="utf-8") as file:
        json.dump(people_json, file, ensure_ascii=False, indent=2)
    

In [6]:
# Run the export: prints a five-row preview table and writes exports/people_db.json.
export_people_json()

+-------------------------------------------+------------------+---------------------------------------+--------------------+
| Entity ID                                 | Name             | Occupation ID                         | Occupation Label   |
| http://www.wikidata.org/entity/Q100423423 | Viktor Krištof   | http://www.wikidata.org/entity/Q33999 | actor              |
+-------------------------------------------+------------------+---------------------------------------+--------------------+
| http://www.wikidata.org/entity/Q1012658   | Yuji Nomi        | None                                  | None               |
+-------------------------------------------+------------------+---------------------------------------+--------------------+
| http://www.wikidata.org/entity/Q1019375   | Béatrice Thiriet | None                                  | None               |
+-------------------------------------------+------------------+---------------------------------------+--------------