In [136]:
from question_types.sparql import SparqlQueries
from tabulate import tabulate
import json
import os

In [137]:
# Load the graph file into the module-level `sparql` object that execute_query
# uses; the log of the recorded run shows parsing took roughly 1.5 minutes.
sparql = SparqlQueries("../dataset/14_graph.nt")

2024-10-10 21:29:11,142 | INFO | __init__ | [92mParsing graph[0m
2024-10-10 21:30:37,034 | INFO | __init__ | [92mGraph parsed[0m


In [138]:
def execute_query(query):
    """Run a SPARQL query on the loaded graph and return its rows as strings.

    Args:
        query: SPARQL query text, passed unchanged to ``sparql.execute_query``.

    Returns:
        A list with one inner list per result row. Every cell is converted
        with ``str()``, so a ``None`` cell becomes the string ``"None"``. A row
        that is not a tuple is wrapped as a single-cell row. An empty list is
        returned (after printing a notice) when the query yields no rows.
    """
    rows = []
    for row in sparql.execute_query(query):
        cells = row if isinstance(row, tuple) else [row]
        rows.append([str(cell) for cell in cells])

    if not rows:
        # Keep the notice, but return an iterable so callers can loop over the
        # result without special-casing None.
        print("Results were empty")
    return rows

In [139]:
def get_all_of_type_film():
    """Query every entity that is an instance of film (wd:Q11424) or of one of
    its subclasses, together with its English label.

    Returns whatever ``execute_query`` returns for the
    ``?movie ?movieLabel`` selection.
    """
    film_query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>

        SELECT ?movie ?movieLabel WHERE {
            ?movie wdt:P31/wdt:P279* wd:Q11424 ;   # Match film or its subclasses
                   rdfs:label ?movieLabel .        # Get the label of the movie
            FILTER(LANG(?movieLabel) = "en")       # Only English labels
        }
    """
    film_rows = execute_query(film_query)
    return film_rows


def get_all_of_type_actor():
    """Query every human (wd:Q5) whose occupation is actor (wd:Q33999) or a
    subclass of it, together with the English label.

    Returns whatever ``execute_query`` returns for the
    ``?actor ?actorLabel`` selection.
    """
    actor_query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>

        SELECT ?actor ?actorLabel WHERE {
            ?actor wdt:P31 wd:Q5 ;  # Ensure the entity is a human
                   wdt:P106/wdt:P279* wd:Q33999 ;  # Occupation is actor or any subclass
                   rdfs:label ?actorLabel .  # Get actor label
            FILTER(LANG(?actorLabel) = "en")  # English labels only
        }
    """
    actor_rows = execute_query(actor_query)
    return actor_rows

def get_all_of_type_film_professionals():
    """Query humans (wd:Q5) together with each occupation that is neither
    actor (wd:Q33999) nor one of its subclasses.

    The selection is ``?professional ?label ?roleLabel`` with English labels
    only, so an entity matches once per qualifying occupation. Returns
    whatever ``execute_query`` returns for that selection.
    """
    professionals_query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>

        SELECT ?professional ?label ?roleLabel WHERE {
            ?professional rdfs:label ?label ;        # Get any label
                          wdt:P31 wd:Q5 ;            # Instance of human
                          wdt:P106 ?role .           # Occupation property
            ?role rdfs:label ?roleLabel .            # Get role label (for debugging)
            FILTER NOT EXISTS {
                ?role wdt:P279* wd:Q33999 .          # Exclude actors and subclasses
            }
            FILTER(LANG(?roleLabel) = "en") .        # English labels for roles
            FILTER(LANG(?label) = "en")              # English labels only
        }
    """
    professional_rows = execute_query(professionals_query)
    return professional_rows


In [140]:
def ensure_directory_exists(directory):
    """Create ``directory`` (including missing parents) unless it already exists.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory, so the
            problem surfaces here rather than at a later ``open()`` call.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists().
    os.makedirs(directory, exist_ok=True)

def export_actor_json():
    """Write the actor query result to ``exports/actors_db.json``.

    The file holds one JSON object mapping the first column of each row to the
    second column. Any exception is caught and reported with a printed
    message instead of being raised.
    """
    try:
        ensure_directory_exists('exports')
        label_by_uri = dict((row[0], row[1]) for row in get_all_of_type_actor())
        with open('exports/actors_db.json', 'w', encoding="utf-8") as out:
            json.dump(label_by_uri, out, ensure_ascii=False)
        print(f"Actor data exported successfully. Stored {len(label_by_uri)} actors.")
    except Exception as err:
        print(f"An error occurred: {err}")

def export_professionals_json():
    """Write the film-professional query result to ``exports/professionals_db.json``.

    The file holds one JSON object mapping the first column of each row to the
    pair (second column, third column). Any exception is caught and reported
    with a printed message instead of being raised.
    """
    try:
        ensure_directory_exists('exports')
        professional_lst = get_all_of_type_film_professionals()
        # NOTE(review): if the query yields several rows for one entity (for
        # example one per occupation), only the last row survives here.
        db = {row[0]: (row[1], row[2]) for row in professional_lst}
        with open('exports/professionals_db.json', 'w', encoding="utf-8") as file:
            json.dump(db, file, ensure_ascii=False)
        print(f"Professional data exported successfully. Stored {len(db)} professionals.")
    except Exception as e:
        print(f"An error occurred: {e}")

def export_film_json():
    """Write the film query result to ``exports/film_db.json``.

    The file holds one JSON object mapping the first column of each row to the
    second column. Any exception is caught and reported with a printed
    message instead of being raised.
    """
    try:
        ensure_directory_exists('exports')
        label_by_uri = dict((row[0], row[1]) for row in get_all_of_type_film())
        with open('exports/film_db.json', 'w', encoding="utf-8") as out:
            json.dump(label_by_uri, out, ensure_ascii=False)
        print(f"Film data exported successfully. Stored {len(label_by_uri)} films.")
    except Exception as err:
        print(f"An error occurred: {err}")


In [141]:
def search_anything_by_name(name: str):
    """Find up to 10 entities whose English label contains ``name``.

    The match is a case-insensitive substring test. An English description is
    included when the entity has one.

    Args:
        name: Text to look for; it is escaped before being embedded in the
            SPARQL string literal.

    Returns:
        Whatever ``execute_query`` returns for the
        ``?entity ?label ?description`` selection.
    """
    # Escape the backslash first so the escapes added afterwards are not doubled.
    safe_name = (
        str(name)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    query = f"""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
        PREFIX schema: <http://schema.org/>

        SELECT ?entity ?label ?description WHERE {{
            ?entity rdfs:label ?label .        # Get any entity with a label
            OPTIONAL {{ ?entity schema:description ?description FILTER(LANG(?description) = "en") }}  # Get descriptions if available
            FILTER(CONTAINS(LCASE(?label), LCASE("{safe_name}"))) .  # Match name substring, case-insensitive
            FILTER(LANG(?label) = "en")  # English labels only
        }}
        LIMIT 10
    """
    return execute_query(query)

def get_all_predicates_by_id(entity_id: str) -> list:
    """Fetch every predicate/object pair stored for ``wd:<entity_id>``.

    Rows are selected as ``[predicate_id, predicate_name, object_id,
    object_name]`` and ordered by predicate. Both labels are optional and
    restricted to English. The return value is whatever ``execute_query``
    returns for that query.
    """
    query_template = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
        
        SELECT ?predicate ?predicateLabel ?object ?objectLabel WHERE {{
            wd:{entity_id} ?predicate ?object .  # Retrieve all properties and values for the given entity
            OPTIONAL {{ ?predicate rdfs:label ?predicateLabel . FILTER(LANG(?predicateLabel) = "en") }}  # Get predicate label
            OPTIONAL {{ ?object rdfs:label ?objectLabel . FILTER(LANG(?objectLabel) = "en") }}           # Get object label
        }}
        ORDER BY ?predicate
    """
    predicate_rows = execute_query(query_template.format(entity_id=entity_id))
    return predicate_rows

def id_to_predicate_dict(ids: list) -> dict:
    """Group the labelled predicate values of each listed entity.

    Args:
        ids: Rows whose first cell is an entity URI and whose second cell is
            the entity's name; extra cells are ignored. ``None`` or an empty
            list gives an empty result.

    Returns:
        A dict keyed by the last ``/``-separated segment of each URI. Each
        value is a dict holding ``"entity_name"`` plus, per predicate label, the
        list of object labels. Objects whose label cell is the string
        ``"None"`` are skipped.
    """
    all_predicates = dict()
    for id_elem in ids or []:
        entity_id = id_elem[0].split("/")[-1]

        # Check for repeats before querying, so a repeated entity (for example
        # one row per occupation) costs only one predicate query.
        if entity_id in all_predicates:
            continue
        entity_predicates = {"entity_name": id_elem[1]}
        all_predicates[entity_id] = entity_predicates

        # `or []` tolerates a predicate lookup that returns None.
        for pred in get_all_predicates_by_id(entity_id) or []:
            if pred[3] == "None":
                continue
            # NOTE(review): a predicate without a label would be grouped
            # under the key "None" - confirm whether that is acceptable.
            entity_predicates.setdefault(pred[1], []).append(pred[3])
    return all_predicates

def actor_to_predicate(actor_name) -> None:
    """Print, as indented JSON, the labelled predicates of every matching actor.

    An actor matches when it is a human (wd:Q5) with occupation actor
    (wd:Q33999) or a subclass, and its English label contains ``actor_name``
    (case-insensitive). When nothing matches, ``{}`` is printed.

    Args:
        actor_name: Text to look for; it is escaped before being embedded in
            the SPARQL string literal.
    """
    # Escape the backslash first so the escapes added afterwards are not doubled.
    safe_name = (
        str(actor_name)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    # Selected rows: [[actor_id, actor_name]]
    possible_ids = execute_query(f"""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>

        SELECT ?actor ?actorLabel WHERE {{
            ?actor wdt:P31 wd:Q5 ;                         # Ensure the entity is a human
                   wdt:P106/wdt:P279* wd:Q33999 ;          # Occupation is actor or any subclass
                   rdfs:label ?actorLabel .                # Get actor label

            FILTER(CONTAINS(LCASE(?actorLabel), LCASE("{safe_name}"))) .  # Match name substring in any label
            FILTER(LANG(?actorLabel) = "en")  # English labels only
        }}
    """)

    # A query without matches may yield None; treat it as "no candidates".
    all_predicates = id_to_predicate_dict(possible_ids or [])
    print(json.dumps(all_predicates, indent=2, ensure_ascii=False))
        
def movie_to_predicate(movie_name: str) -> None:
    """Print, as indented JSON, the labelled predicates of every matching film.

    A film matches when it is an instance of film (wd:Q11424) or a subclass
    and its English label contains ``movie_name`` (case-insensitive). When
    nothing matches, ``{}`` is printed. Nothing is returned.

    Args:
        movie_name: Text to look for; it is escaped before being embedded in
            the SPARQL string literal.
    """
    # Escape the backslash first so the escapes added afterwards are not doubled.
    safe_name = (
        str(movie_name)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    possible_ids = execute_query(f"""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>

        SELECT ?movie ?movieLabel WHERE {{
            ?movie wdt:P31/wdt:P279* wd:Q11424 ;   # Match film or its subclasses
                   rdfs:label ?movieLabel .        # Get the label of the movie
            FILTER(CONTAINS(LCASE(?movieLabel), LCASE("{safe_name}"))) .  # Substring match
            FILTER(LANG(?movieLabel) = "en")       # Only English labels
        }}
    """)
    # A query without matches may yield None; treat it as "no candidates".
    all_predicates = id_to_predicate_dict(possible_ids or [])
    print(json.dumps(all_predicates, indent=2, ensure_ascii=False))

def professional_to_predicate(professional_name: str) -> None:
    """Print, as indented JSON, the labelled predicates of matching professionals.

    A match is a human (wd:Q5) with at least one occupation that is neither
    actor (wd:Q33999) nor a subclass of it, and whose English label contains
    ``professional_name`` (case-insensitive). When nothing matches, ``{}`` is
    printed. Nothing is returned.

    Args:
        professional_name: Text to look for; it is escaped before being
            embedded in the SPARQL string literal.
    """
    # Escape the backslash first so the escapes added afterwards are not doubled.
    safe_name = (
        str(professional_name)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    possible_ids = execute_query(
        f"""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>

        SELECT ?professional ?label ?roleLabel WHERE {{
            ?professional rdfs:label ?label ;        # Get any label
                          wdt:P31 wd:Q5 ;            # Instance of human
                          wdt:P106 ?role .           # Occupation property
            ?role rdfs:label ?roleLabel .            # Get role label (for debugging)
            FILTER NOT EXISTS {{
                ?role wdt:P279* wd:Q33999 .          # Exclude actors and subclasses
            }}
            FILTER(LANG(?roleLabel) = "en") .        # English labels for roles
            FILTER(CONTAINS(LCASE(?label), LCASE("{safe_name}"))) .  # Match name substring in any label
            FILTER(LANG(?label) = "en")              # English labels only
        }}
        """
    )
    # A query without matches may yield None; treat it as "no candidates".
    all_predicates = id_to_predicate_dict(possible_ids or [])
    print(json.dumps(all_predicates, indent=2, ensure_ascii=False))
    
def get_imdb_id_by_name(name: str) -> None:
    """Print the IMDb ID (wdt:P345) and URI of one entity matching ``name``.

    The match is a case-insensitive substring test on English labels and the
    query is limited to a single row, so with several candidates the choice is
    up to the query engine. The result of ``execute_query`` is printed as is;
    nothing is returned.

    Args:
        name: Text to look for; it is escaped before being embedded in the
            SPARQL string literal.
    """
    # Escape the backslash first so the escapes added afterwards are not doubled.
    safe_name = (
        str(name)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    possible_ids = execute_query(f"""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>

        SELECT ?imdbId ?entity  WHERE {{
            ?entity rdfs:label ?label ;
                    wdt:P345 ?imdbId .                # IMDb ID property
            FILTER(CONTAINS(LCASE(?label), LCASE("{safe_name}"))) .  # Match name substring
            FILTER(LANG(?label) = "en")               # English labels only
        }}
        LIMIT 1
    """)
    print(possible_ids)


In [142]:
# Write the three JSON lookup files under exports/; each call prints its own
# success or error message.
export_actor_json()
export_film_json()
export_professionals_json()

Actor data exported successfully. Stored 56373 actors.
Film data exported successfully. Stored 27117 films.
Actor data exported successfully. Stored 17992 actors.


In [143]:
# Print the labelled predicates of every actor whose English label contains "Brad Pitt".
actor_to_predicate("Brad Pitt")

{
  "Q35332": {
    "entity_name": "Brad Pitt",
    "native language": [
      "American English"
    ],
    "occupation": [
      "executive producer",
      "actor",
      "model"
    ],
    "student of": [
      "Roy London"
    ],
    "described by source": [
      "Obalky knih.cz"
    ],
    "nominated for": [
      "Academy Award for Best Picture",
      "Academy Award for Best Actor",
      "Academy Award for Best Supporting Actor",
      "European Film Award – People's Choice Award for Best European Film"
    ],
    "languages spoken, written or signed": [
      "English"
    ],
    "award received": [
      "Golden Globe Award",
      "Academy Award for Best Picture",
      "Academy Award for Best Supporting Actor",
      "Saturn Award for Best Supporting Actor",
      "New York Film Critics Circle Award for Best Actor",
      "National Society of Film Critics Award for Best Actor",
      "Rembrandt Award",
      "Screen Actors Guild Award for Outstanding Performance by a Cast

In [144]:
# Substring match: every film whose English label contains "Transformers" is printed.
movie_to_predicate("Transformers")

{
  "Q24251026": {
    "entity_name": "Transformers: The Last Knight",
    "film editor": [
      "Roger Barton"
    ],
    "genre": [
      "science fiction action film"
    ],
    "executive producer": [
      "Steven Spielberg"
    ],
    "takes place in fictional universe": [
      "Transformers universe"
    ],
    "follows": [
      "Transformers: Age of Extinction"
    ],
    "followed by": [
      "Bumblebee"
    ],
    "cast member": [
      "Liam Garrigan",
      "Mark Wahlberg",
      "Laura Haddock",
      "Josh Duhamel",
      "Isabela Moner",
      "Jerrod Carmichael",
      "Stanley Tucci",
      "John Turturro",
      "Mitch Pileggi",
      "Tyrese Gibson",
      "Santiago Cabrera",
      "Anthony Hopkins"
    ],
    "MPAA film rating": [
      "PG-13"
    ],
    "part of the series": [
      "Transformers"
    ],
    "uses": [
      "product placement"
    ],
    "NMHH film rating": [
      "Category III"
    ],
    "BBFC rating": [
      "12A certificate"
    ],
    "

In [145]:
# Print the labelled predicates of humans with a non-actor occupation whose
# English label contains "Hans Zimmer".
professional_to_predicate("Hans Zimmer")

{
  "Q76364": {
    "entity_name": "Hans Zimmer",
    "native language": [
      "German"
    ],
    "occupation": [
      "musician"
    ],
    "described by source": [
      "Obalky knih.cz"
    ],
    "religion": [
      "Christianity"
    ],
    "nominated for": [
      "Academy Award for Best Original Musical or Comedy Score",
      "Academy Award for Best Original Dramatic Score",
      "Academy Award for Best Original Score"
    ],
    "languages spoken, written or signed": [
      "German"
    ],
    "award received": [
      "Stephen Hawking Medal For Science Communication",
      "Academy Award for Best Original Score"
    ],
    "place of birth": [
      "Frankfurt am Main"
    ],
    "country of citizenship": [
      "Germany"
    ],
    "instance of": [
      "human"
    ],
    "work location": [
      "Los Angeles"
    ]
  }
}


In [146]:
# Print one [IMDb ID, entity URI] row for a label containing "Brad Pitt".
get_imdb_id_by_name("Brad Pitt")

[['nm0000093', 'http://www.wikidata.org/entity/Q35332']]


In [147]:
# Read the exported lookup files back into memory. The paths match the ones
# written by the export_*_json functions, so those must have run first.
with open("exports/film_db.json", "r", encoding="utf-8") as file:
    film_dict = json.load(file)
with open("exports/actors_db.json", "r", encoding="utf-8") as file:
    actor_dict = json.load(file)
# JSON has no tuple type, so each value in this file loads as a list.
with open("exports/professionals_db.json", "r", encoding="utf-8") as file:
    professional_dict = json.load(file)