In [1]:
from question_types.sparql import SparqlQueries
from tabulate import tabulate
import json
import os
import unicodedata
import re

In [2]:
# Parse the graph file once and keep the wrapper at module level; execute_query()
# reads this `sparql` global. The log output below shows parsing takes ~47 s,
# so avoid re-running this cell unnecessarily.
sparql = SparqlQueries("../dataset/14_graph.nt")

2024-10-31 14:00:50,206 | INFO | __init__ | Initializing SparqlQueries class...
2024-10-31 14:01:37,894 | INFO | __init__ | Graph parsed
2024-10-31 14:01:37,895 | INFO | __init__ | SparqlQueries class initialized successfully.


In [3]:
def execute_query(query):
    """Run a SPARQL query and return its rows as lists of strings.

    Args:
        query: SPARQL query text, passed unchanged to ``sparql.execute_query``.

    Returns:
        A list with one inner list per result row, every cell converted with
        ``str``. A row that is not a tuple is treated as a single-column row.
        When the query yields no rows, a notice is printed and an empty list
        is returned, so callers can always iterate over the result.
    """
    rows = [
        [str(cell) for cell in (row if isinstance(row, tuple) else [row])]
        for row in sparql.execute_query(query)
    ]
    if not rows:
        # The notice is informational only. The previous version returned the
        # value of print() (i.e. None), which made iterating callers raise
        # TypeError on an empty result set.
        print("Results were empty")
    return rows

In [4]:
def get_all_of_type_predicate():
    """Fetch every distinct predicate that appears in any triple of the graph.

    The query selects ``?propertyLabel`` first and ``?property`` second. The
    label comes from an OPTIONAL ``rdfs:label`` restricted to English, so it
    may be unbound for predicates that have no English label.

    Returns:
        Whatever ``execute_query`` returns for this query.
    """
    return execute_query("""
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        
        SELECT DISTINCT ?propertyLabel ?property WHERE {
          ?s ?property ?o .
          OPTIONAL { ?property rdfs:label ?propertyLabel . FILTER (lang(?propertyLabel) = 'en') }
        }
    """)

In [5]:
def normalize_string(s):
    """Normalize an entity or predicate label to a uniform lookup key.

    Steps: lowercase, NFKD-decompose, drop every non-ASCII code point, lowercase
    again, strip punctuation (anything that is neither a word character nor
    whitespace), and collapse runs of whitespace into single spaces.

    Args:
        s: The label to normalize.

    Returns:
        The normalized, lowercase, ASCII-only string; may be empty.
    """
    s = s.lower()
    s = unicodedata.normalize('NFKD', s)
    s = s.encode('ascii', 'ignore').decode('utf-8')
    # Some caseless symbols decompose to uppercase ASCII under NFKD
    # (e.g. U+2122 -> "TM", U+2116 -> "No"), so lowercase once more to keep
    # the promised all-lowercase result.
    s = s.lower()
    s = re.sub(r'[^\w\s]', '', s)
    s = ' '.join(s.split())
    return s

def ensure_directory_exists(directory):
    """Create ``directory`` (and any missing parents) if it is not there yet.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first.
    os.makedirs(directory, exist_ok=True)
               
def export_predicate_json(export_dir='exports', filename='predicate_db.json'):
    """Export a ``{normalized label: predicate URI}`` mapping as JSON.

    Fetches the predicates via ``get_all_of_type_predicate``, keeps those whose
    raw label is longer than three characters, keys them by
    ``normalize_string(label)``, writes the mapping to ``export_dir/filename``
    and prints the first five entries as a grid table. If two labels normalize
    to the same key, the later row wins.

    Args:
        export_dir: Directory the JSON file is written to; created if missing.
        filename: Name of the JSON file inside ``export_dir``.
    """
    ensure_directory_exists(export_dir)

    # An empty result set may come back as None rather than a list; treat it
    # as "no predicates" instead of crashing while iterating.
    predicate_lst = get_all_of_type_predicate() or []

    predicate_json = {}
    for label, wiki_id in predicate_lst:
        # NOTE(review): rows are stringified upstream, so a predicate without an
        # English label presumably shows up as the literal text "None" (which
        # would pass the length filter) -- skip it rather than export a bogus
        # "none" key. Confirm against the query backend.
        if label == "None" or len(label) <= 3:
            continue
        key = normalize_string(label)
        # A label made only of non-ASCII characters or punctuation normalizes
        # to "", which is useless as a lookup key.
        if key:
            predicate_json[key] = wiki_id

    with open(os.path.join(export_dir, filename), 'w', encoding="utf-8") as file:
        json.dump(predicate_json, file, ensure_ascii=False, indent=2)

    print(tabulate(list(predicate_json.items())[:5], headers=["Predicate Label", "Predicate ID"], tablefmt="grid"))

In [6]:
# Write the predicate label -> URI mapping to exports/predicate_db.json and
# print a preview of its first five entries.
export_predicate_json()

+------------------------------------+--------------------------------------------+
| Predicate Label                    | Predicate ID                               |
| country of citizenship             | http://www.wikidata.org/prop/direct/P27    |
+------------------------------------+--------------------------------------------+
| node label                         | http://www.w3.org/2000/01/rdf-schema#label |
+------------------------------------+--------------------------------------------+
| cast member                        | http://www.wikidata.org/prop/direct/P161   |
+------------------------------------+--------------------------------------------+
| languages spoken written or signed | http://www.wikidata.org/prop/direct/P1412  |
+------------------------------------+--------------------------------------------+
| place of birth                     | http://www.wikidata.org/prop/direct/P19    |
+------------------------------------+--------------------------------------