In [1]:
from question_types.sparql import SparqlQueries
from tabulate import tabulate
import json
import os
import unicodedata
import re

In [2]:
sparql = SparqlQueries("../dataset/14_graph.nt")

2024-10-17 17:42:42,956 | INFO | __init__ | [92mParsing graph[0m
2024-10-17 17:43:41,144 | INFO | __init__ | [92mGraph parsed[0m


In [3]:
def execute_query(query):
    sparql_result = sparql.execute_query(query)
    result_lst = [
        [str(item) for item in (row if isinstance(row, tuple) else [row])]
        for row in sparql_result
    ]
    if not len(result_lst):
        return print("Results were empty")
    headers = [f"Col {idx}" for idx in range(len(result_lst[0]))]
    return result_lst

In [13]:
def get_all_of_type_film():
    query = f"""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>

        SELECT ?movie ?movieLabel ?type WHERE {{
            ?movie wdt:P31/wdt:P279* wd:Q11424 ;   # Match film or its subclasses
                   rdfs:label ?movieLabel .        # Get the label of the movie
            FILTER(LANG(?movieLabel) = "en")       # Only English labels
            BIND("movie" AS ?type)
        }}
    """
    return execute_query(query)


def get_all_of_type_actor():
    query = f"""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>

        SELECT ?actor ?actorLabel ?type WHERE {{
            ?actor wdt:P31 wd:Q5 ;  # Ensure the entity is a human
                   wdt:P106/wdt:P279* wd:Q33999 ;  # Occupation is actor or any subclass
                   rdfs:label ?actorLabel .  # Get actor label
            FILTER(LANG(?actorLabel) = "en")  # English labels only
            BIND("actor" AS ?type)
        }}
    """
    return execute_query(query)

def get_all_of_type_film_professionals():
    query = f"""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>

        SELECT ?professional ?label ?roleLabel WHERE {{
            ?professional rdfs:label ?label ;        # Get any label
                          wdt:P31 wd:Q5 ;            # Instance of human
                          wdt:P106 ?role .           # Occupation property
            ?role rdfs:label ?roleLabel .            # Get role label (for debugging)
            FILTER NOT EXISTS {{
                ?role wdt:P279* wd:Q33999 .          # Exclude actors and subclasses
            }}
            FILTER(LANG(?roleLabel) = "en") .        # English labels for roles
            FILTER(LANG(?label) = "en")              # English labels only
        }}
    """
    return execute_query(query)


In [16]:
def normalize_string(s):
    """Cleans the input entity to a uniform naming convention, by removing non ascii characters, encoding it to utf, setting it to lowercase, and removing redundant spaces"""
    s = s.lower()
    s = unicodedata.normalize('NFKD', s)
    s = s.encode('ascii', 'ignore').decode('utf-8')
    s = re.sub(r'[^\w\s]', '', s)
    s = ' '.join(s.split())
    return s

def ensure_directory_exists(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
        
def extend_if_exists(db, lst):
    """Extends the db with the given list of entities, if the entity already exists, it appends the type to the existing entity"""
    for row in lst:
        key = row[0]
        name = normalize_string(row[1])
        type = row[2]
        if key in db:
            db[key][1].append(type)
        else:
            db[key] = (name, [type])
               
def export_entities_json():
    
    entity_json = dict()
    
    ensure_directory_exists('exports')
    
    actor_lst = get_all_of_type_actor()
    prof_lst = get_all_of_type_film_professionals()
    film_lst = get_all_of_type_film()
    
    extend_if_exists(entity_json, actor_lst)
    extend_if_exists(entity_json, prof_lst)
    extend_if_exists(entity_json, film_lst)
     
    with open('exports/entity_db.json', 'w', encoding="utf-8") as file:
            json.dump(entity_json, file, ensure_ascii=False, indent=2)
    
    print(tabulate(list(entity_json.items())[:10], headers=["Entity ID", "Name", "Type"], tablefmt="grid"))


In [17]:
export_entities_json()

+-------------------------------------------+--------------------------------------+
| Entity ID                                 | Name                                 |
| http://www.wikidata.org/entity/Q100423423 | ('viktor kristof', ['actor'])        |
+-------------------------------------------+--------------------------------------+
| http://www.wikidata.org/entity/Q102290694 | ('oleg kapanets', ['actor'])         |
+-------------------------------------------+--------------------------------------+
| http://www.wikidata.org/entity/Q102963    | ('alexander geringas', ['actor'])    |
+-------------------------------------------+--------------------------------------+
| http://www.wikidata.org/entity/Q105683337 | ('giovanni korporaal', ['actor'])    |
+-------------------------------------------+--------------------------------------+
| http://www.wikidata.org/entity/Q106273    | ('jurgen knieper', ['actor'])        |
+-------------------------------------------+--------------------