In [1]:
import pandas as pd
import json
import random

import igraph as ig
from tqdm.notebook import tqdm

import unicodedata
import re
from collections import defaultdict

import warnings
warnings.filterwarnings("ignore")

# Triple table used by every cell below (columns read later in this notebook:
# subject_label, predicate_label, object_label).
# NOTE(review): unpickling can execute arbitrary code - only load export files
# that were produced by a trusted pipeline.
db = pd.read_pickle("exports/extended_graph_triples.pkl")

In [7]:
# Show every triple whose predicate is "genre".
db.loc[db["predicate_label"] == "genre"]

Unnamed: 0,subject_id,subject_label,predicate_label,object_label
350,http://www.wikidata.org/entity/Q1000825,jan dara,genre,lgbtrelated film
370,http://www.wikidata.org/entity/Q1000826,guns of the magnificent seven,genre,western film
427,http://www.wikidata.org/entity/Q1001114,buddy baker,genre,film score
472,http://www.wikidata.org/entity/Q1001214,buddy ebsen,genre,western
677,http://www.wikidata.org/entity/Q1001759,fist of legend,genre,"martial arts film,action film"
...,...,...,...,...
1714040,http://www.wikidata.org/entity/Q998377,what about bob,genre,comedy film
1714059,http://www.wikidata.org/entity/Q998396,lemmerdeur,genre,"comedy film,buddy film"
1714096,http://www.wikidata.org/entity/Q99871,robert thalheim,genre,"political satire,drama,comedydrama"
1714205,http://www.wikidata.org/entity/Q999318,buck hill,genre,jazz


In [38]:
# First 500 object labels of the "genre" triples.
is_genre_row = db["predicate_label"] == "genre"
genre_data = db.loc[is_genre_row, "object_label"].iloc[:500]

# One label can carry several comma-separated genres, optionally wrapped in
# double quotes; collect every distinct, whitespace-trimmed genre.
unique_genres = {
    genre.strip()
    for genres in genre_data
    for genre in genres.strip('"').split(',')
}

In [52]:
from collections import Counter

# The filter below needs movie_values / people_values, which this notebook only
# defines in later cells. Load them here so the cell also runs on a fresh
# kernel executed top to bottom (same files, same resulting sets).
with open('exports/movie_db.json') as f:
    movie_values = set(json.load(f).values())
with open('exports/people_db.json') as f:
    people_values = set(json.load(f).values())

# Flatten the comma-separated genre strings into one list; duplicates are kept
# on purpose because they are counted below.
# NOTE: genre_data is already cut to 500 rows where it is defined, so the
# [:10000] slice is currently a no-op upper bound.
unique_genres = []
for genres in genre_data[:10000]:
    genres = genres.strip('"')
    unique_genres.extend(genre.strip() for genre in genres.split(','))

# Drop labels that are really movie titles or people names.
unique_genres = [
    g for g in unique_genres
    if g not in movie_values and g not in people_values
]

# Keep the 150 most frequent genres together with their counts.
genre_counter = Counter(unique_genres)
top_n_genres = genre_counter.most_common(150)

top_n_genres_df = pd.DataFrame(top_n_genres, columns=['Genre', 'Occurrences'])


In [54]:
# Serialise the Genre column as a plain JSON array of strings.
json_path = 'exports/genre_db.json'
top_n_genres_json = top_n_genres_df['Genre'].to_json(orient='values')
genre_names = json.loads(top_n_genres_json)

# Saving the JSON to a file
with open(json_path, 'w') as json_file:
    json.dump(genre_names, json_file)

In [62]:
# Rows of the top-genre table whose genre name starts with "wes".
top_n_genres_df.loc[top_n_genres_df["Genre"].str.startswith("wes")]

Unnamed: 0,Genre,Occurrences
25,western film,9
152,western classical music,1


In [40]:
# Movie export: keep the raw mapping and a set of its values for membership tests.
with open(r'exports/movie_db.json') as f:
    movie_data = json.load(f)
movie_values = set(movie_data.values())

In [41]:
# People export: keep the raw mapping and a set of its values for membership tests.
with open(r"exports/people_db.json") as f:
    people_data = json.load(f)
people_values = set(people_data.values())

In [42]:
# How many genre labels are neither a movie title nor a person name.
sum(1 for g in unique_genres if not (g in movie_values or g in people_values))

191

In [43]:
# The genre labels that are neither a movie title nor a person name.
[g for g in unique_genres if not (g in movie_values or g in people_values)]

['the residents',
 'spy film',
 'comic science fiction',
 'gay trilogy',
 'mystery film',
 'animated film',
 'macross delta',
 'rockumentary',
 'teen film',
 'slasher film',
 'huis clos',
 'speculative fiction',
 'lol',
 'romance novel',
 'epic film',
 'inuyasha',
 'anime film',
 'fantasy film',
 'suspense',
 'neonoir',
 'wd:Q28974159',
 'lgbtrelated film',
 'dystopian fiction',
 'code geass',
 'horror anime and manga',
 'swashbuckler film',
 'crime film',
 'television western',
 'drama television series',
 'war film',
 'social problem film',
 'world music',
 'biographical film',
 '3',
 'puzzle video game',
 '360',
 'erotic thriller',
 'concert film',
 'miniseries',
 'speculative fiction novel',
 'science fiction comic',
 'don',
 'art film',
 'science fiction action film',
 'sword and sorcery',
 'film noir',
 'hana yori dango returns',
 'xxy',
 'tokusatsu',
 'parasyte',
 'gangster film',
 'actionadventure film',
 'historical film',
 'trial film',
 'karate tiger',
 'science fiction',
 '

In [71]:
# Movie labels beginning with "ryan".
list(filter(lambda title: title.startswith("ryan"), movie_values))

['ryan', 'ryans daughter']

In [72]:
# People labels beginning with "ryan g".
list(filter(lambda name: name.startswith("ryan g"), people_values))

['ryan gage', 'ryan gosling', 'ryan guzman']

In [3]:
# Graph Construction
# Only triples with one of these predicates contribute edges to the graph.
relevant_predicates = ["director",
                       "performer",
                       "genre",
                       "screenwriter",
                       "cast member",
                       "publication date",
                       "mpaa film rating"]

# Copy BEFORE adding/changing columns: assigning into the result of a boolean
# selection raises SettingWithCopyWarning (silenced in this notebook by the
# global warnings filter), so make the frame explicitly independent first.
db_filtered = db[db.predicate_label.isin(relevant_predicates)].copy()
db_filtered['object_label'] = db_filtered['object_label'].astype(str)

# publication_year = text before the first "-" of the publication date string.
is_publication_date = db_filtered['predicate_label'] == "publication date"
db_filtered.loc[is_publication_date, 'publication_year'] = (
    db_filtered.loc[is_publication_date, 'object_label'].str.split("-").str[0]
)

with open(r'exports/movie_db.json') as f:
    movie_data = json.load(f)
movie_values = set(movie_data.values())

G = ig.Graph(directed=False)

# Label <-> vertex-index bookkeeping. Vertices are appended to G in exactly the
# order labels are appended to index_to_node, so the list position is used as
# the igraph vertex index when edges are added below.
node_to_index = {}
index_to_node = []

def get_or_add_node(node_label):
    """Return the vertex index of node_label, creating the vertex in G on first use."""
    index = node_to_index.get(node_label)
    if index is None:
        index = len(index_to_node)
        node_to_index[node_label] = index
        index_to_node.append(node_label)
        G.add_vertex(name=node_label)
    return index

# One row per (subject, single object value): comma-separated object labels
# are split and exploded, then trimmed.
df = db_filtered.copy()
df['object_label'] = df['object_label'].str.split(',')
df = df.explode('object_label')
df['object_label'] = df['object_label'].str.strip()

# edge -> number of triples connecting the two labels (used as edge weight).
edge_dict = {}

# Only the two label columns are needed, so iterate over them directly instead
# of materialising a Series per row with iterrows().
label_pairs = zip(df['subject_label'], df['object_label'])
for movie, individual_value in tqdm(label_pairs, desc="Processing rows", total=len(df)):
    # Same creation order as before (value first, then subject) so vertex
    # indices stay identical.
    value_index = get_or_add_node(individual_value)
    movie_index = get_or_add_node(movie)

    # The graph is undirected: store each pair in one canonical order so that
    # (a, b) and (b, a) accumulate into a single weighted edge instead of
    # becoming two parallel edges.
    if movie_index <= value_index:
        edge = (movie_index, value_index)
    else:
        edge = (value_index, movie_index)

    edge_dict[edge] = edge_dict.get(edge, 0) + 1

edges_to_add = list(edge_dict.keys())
weights = list(edge_dict.values())

G.add_edges(edges_to_add)
G.es["weight"] = weights

print("Graph construction complete.")

################################
# Migration Igraph -> NetworkX
################################
import networkx as nx

# Rebuild the same undirected graph in NetworkX, keyed by vertex label
# instead of igraph's integer vertex index.
G_nx = nx.Graph()
G_nx.add_nodes_from(vertex["name"] for vertex in G.vs)

# Each igraph edge becomes a NetworkX edge between the two endpoint labels,
# carrying the same "weight" attribute.
G_nx.add_edges_from(
    (
        G.vs[edge.source]["name"],
        G.vs[edge.target]["name"],
        {"weight": edge["weight"]},
    )
    for edge in G.es
)

print("Converted igraph to NetworkX.")


Processing rows:   0%|          | 0/884749 [00:00<?, ?it/s]

Graph construction complete.
Converted igraph to NetworkX.


In [4]:
def rp_beta_recommendations_aggregate(entities, G, movie_list, num_walks=50, walk_length_range=(2, 4), beta_range=(0, 0.1), top_n=10):
    """
    Generate aggregated recommendations from weighted random walks with restart.

    For every starting entity, `num_walks` walks are simulated. Each walk draws
    its own length and restart probability (beta). At each step the walk either
    jumps back to the starting entity (with probability beta) or moves to a
    neighbour chosen proportionally to the edge weight. Whenever a step lands
    on a node that is in `movie_list` and is not one of the starting entities,
    the weight of the traversed edge is added to that node's score.

    Parameters:
        - entities: Iterable of starting nodes (e.g., movies or people).
          Entities that are not nodes of G are skipped.
        - G: Graph exposing `G.neighbors(node)`, `node in G` and
          `G[u][v].get("weight", 1)` (e.g. a networkx.Graph).
        - movie_list: Collection of nodes that may be recommended.
        - num_walks: Number of random walks to perform per entity.
        - walk_length_range: Inclusive (min, max) range for the walk length;
          a walk of length L takes L - 1 steps.
        - beta_range: (min, max) range from which the restart probability is drawn.
        - top_n: Number of recommendations to return.

    Returns:
        - A list of (movie, aggregated score) tuples, highest score first,
          at most `top_n` long. Empty if no walk reaches a candidate movie.
    """
    relevance_scores = defaultdict(float)
    seed_entities = set(entities)
    # Membership is tested on every step; a set makes that O(1) even when the
    # caller passes a list, as the docstring allows.
    if isinstance(movie_list, (set, frozenset)):
        candidate_movies = movie_list
    else:
        candidate_movies = set(movie_list)

    for entity in entities:
        # An extracted entity is not guaranteed to exist in the graph; skip it
        # instead of letting G.neighbors() raise and abort the whole request.
        if entity not in G:
            continue

        for _ in range(num_walks):
            current_node = entity
            walk_length = random.randint(*walk_length_range)
            beta = random.uniform(*beta_range)

            for _ in range(walk_length - 1):
                if random.random() < beta:
                    # Restart: jump back to the seed; this consumes one step.
                    current_node = entity
                    continue

                neighbors = list(G.neighbors(current_node))
                if not neighbors:
                    break

                weights = [G[current_node][neighbor].get("weight", 1) for neighbor in neighbors]
                if sum(weights) <= 0:
                    # No usable transition probabilities from this node.
                    break

                # random.choices normalises the weights itself.
                next_node = random.choices(neighbors, weights=weights, k=1)[0]

                # Seeds are never recommended back to the user.
                if next_node in candidate_movies and next_node not in seed_entities:
                    relevance_scores[next_node] += G[current_node][next_node].get("weight", 1)

                current_node = next_node

    sorted_recommendations = sorted(
        relevance_scores.items(),
        key=lambda item: item[1],
        reverse=True
    )
    return sorted_recommendations[:top_n]

In [5]:
# Custom Stopword set, TO DO: Refine if necessary
STOPWORDS = {"a", "an", "the", "of", "on", "and", "in", "at", "for", "to", "is", "it"}

def normalize_string(s):
    """
    Produce the canonical matching form of a string.

    Steps, in order: lowercase, NFKD-decompose, delete every character that
    is neither a word character nor whitespace, then drop stopwords and
    re-join the remaining words with single spaces.

    Args:
        s (str): The string to normalize.

    Returns:
        str: The normalized string (may be empty).
    """
    decomposed = unicodedata.normalize('NFKD', s.lower())
    without_punctuation = re.sub(r'[^\w\s]', '', decomposed)
    kept_words = filter(lambda word: word not in STOPWORDS, without_punctuation.split())
    return ' '.join(kept_words)

class PrefixNode:
    """
    One node of the token-level Prefix Tree (Trie).

    Attributes:
        children (dict): Token -> child PrefixNode.
        is_end_of_entity (bool): True when an entity ends at this node.
        original_entity (str): Original entity string for an end node, else None.
    """

    def __init__(self):
        # A fresh node has no children and terminates no entity.
        self.children, self.is_end_of_entity, self.original_entity = {}, False, None

class PrefixTree:
    """
    Token-level Prefix Tree (Trie) used to find known entities inside text.

    Attributes:
        root (PrefixNode): The root node of the Prefix Tree.
    """

    def __init__(self):
        self.root = PrefixNode()

    def insert(self, entity_tokens, original_entity):
        """
        Add an entity to the tree, one node per token.

        Args:
            entity_tokens (list of str): Tokens that spell the entity.
            original_entity (str): The entity string to report on a match.
        """
        node = self.root
        for token in entity_tokens:
            child = node.children.get(token)
            if child is None:
                child = node.children[token] = PrefixNode()
            node = child
        # Mark the final node; a later insert of the same tokens overwrites it.
        node.original_entity = original_entity
        node.is_end_of_entity = True

    def search(self, tokens, start_index):
        """
        Find the longest entity that starts at `start_index` in `tokens`.

        Args:
            tokens (list of str): The list of tokens to search within.
            start_index (int): The index to start searching from.

        Returns:
            tuple: (entity, index of the last matched token) on success,
                   otherwise (None, start_index).
        """
        node = self.root
        best_entity = None
        best_index = -1

        for position in range(start_index, len(tokens)):
            node = node.children.get(tokens[position])
            if node is None:
                # The path through the tree ends here.
                break
            if node.is_end_of_entity:
                # Remember the match but keep walking for a longer one.
                best_index, best_entity = position, node.original_entity

        if best_index == -1:
            return None, start_index
        return best_entity, best_index

def tokenize_query(query):
    """
    Normalize a query and split it into whitespace-separated tokens.

    Args:
        query (str): The query string to tokenize.

    Returns:
        list of str: The tokens of the normalized query.
    """
    return normalize_string(query).split()

def extract_entities(query, tries):
    """
    Extract entities from the query using the provided Prefix Trees.

    The query is tokenized, then scanned left to right. At every position all
    tries are searched and the LONGEST match wins, so a short entity in one
    trie (e.g. the movie "ryan") no longer shadows a longer entity in another
    trie (e.g. the person "ryan gosling"). When two tries match the same
    number of tokens, the trie that comes first in `tries` wins.

    Args:
        query (str): The query string from which to extract entities.
        tries (dict): A dictionary mapping entity types to their respective PrefixTrees.

    Returns:
        dict: A dictionary mapping entity types to lists of matched entities,
              in order of appearance. Types without a match are absent.
    """
    tokens = tokenize_query(query)
    matched_entities = defaultdict(list)
    i = 0
    while i < len(tokens):
        best_type = None
        best_match = None
        best_end = -1
        for entity_type, trie in tries.items():
            match, end_index = trie.search(tokens, i)
            # Strict ">" keeps the earlier trie when match lengths are equal.
            if match and end_index > best_end:
                best_type, best_match, best_end = entity_type, match, end_index

        if best_match is None:
            i += 1  # Move to the next token if no match
        else:
            matched_entities[best_type].append(best_match)
            i = best_end + 1  # Move past the matched entity
    return matched_entities


In [6]:
# Labels whose normalized form has fewer non-space characters than this are
# not indexed.
MIN_TITLE_LENGTH = 4
MIN_NAME_LENGTH = 4


def _normalized_lookup(labels, min_length):
    """Map normalized label -> original label, skipping labels that normalize too short."""
    lookup = {}
    for label in labels:
        key = normalize_string(label)
        if len(key.replace(' ', '')) >= min_length:
            # Labels sharing a normalized form overwrite each other; last one wins.
            lookup[key] = label
    return lookup


def _trie_from_lookup(lookup):
    """Build a PrefixTree holding one entry per normalized label in `lookup`."""
    trie = PrefixTree()
    for key, label in lookup.items():
        trie.insert(key.split(), label)
    return trie


with open('exports/movie_db.json') as f:
    movie_data = json.load(f)
normalized_movie_titles = _normalized_lookup(movie_data.values(), MIN_TITLE_LENGTH)

with open('exports/people_db.json') as f:
    people_data = json.load(f)
normalized_people_names = _normalized_lookup(people_data.values(), MIN_NAME_LENGTH)

movie_trie = _trie_from_lookup(normalized_movie_titles)
people_trie = _trie_from_lookup(normalized_people_names)

# Entity type -> trie.
tries = {
    'movies': movie_trie,
    'people': people_trie
}


In [7]:
# Sample questions used to eyeball what the entity extractor finds.
queries = [
    "Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween.",
    "I love movies like The Matrix and Inception.",
    "Can you recommend films directed by Christopher Nolan?",
    "Looking for movies starring Tom Hanks and Meryl Streep.",
    "What are some movies similar to The Godfather and Scarface?",
    "Suggest movies such as Toy Story or Finding Nemo.",
    "Who directed Pulp Fiction and Kill Bill?",
    "Find films with performances by Robert De Niro.",
    "Are there any movies like Titanic or Avatar?",
    "Could you recommend some films by Steven Spielberg?",
    "Where was Angelina Jolie born?",
    "Who won the Ballon dOr?",
    "Who is Hans Zimmer?",
    "Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies?",
    "Did Leonardo diCaprio play in Inception?",
    "Who directed Good Will Hunting?"
]

# Print, per query, the distinct entities found for each entity type.
for q in queries:
    matched_entities = extract_entities(q, tries)
    print(f"\nQuery: {q}\nMatched Entities:")
    for entity_type in matched_entities:
        # De-duplicate repeated mentions before printing.
        unique_entities = set(matched_entities[entity_type])
        print(f"{entity_type.capitalize()}: {', '.join(unique_entities)}")


Query: Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween.
Matched Entities:
Movies: friday the 13th, halloween, a nightmare on elm street

Query: I love movies like The Matrix and Inception.
Matched Entities:
Movies: the matrix, inception, love

Query: Can you recommend films directed by Christopher Nolan?
Matched Entities:
People: christopher nolan

Query: Looking for movies starring Tom Hanks and Meryl Streep.
Matched Entities:
People: meryl streep, tom hanks

Query: What are some movies similar to The Godfather and Scarface?
Matched Entities:
Movies: scarface, the godfather

Query: Suggest movies such as Toy Story or Finding Nemo.
Matched Entities:
Movies: finding nemo, toy story

Query: Who directed Pulp Fiction and Kill Bill?
Matched Entities:
Movies: pulp fiction

Query: Find films with performances by Robert De Niro.
Matched Entities:
People: robert de niro

Query: Are there any movies like Titanic or Avatar?
Matched Entities:
Movies: avatar, titanic

In [15]:
def get_query_entities(query):
    """
    Collect every entity found in a query, regardless of entity type.

    Args:
        query (str): The user query.

    Returns:
        set: The distinct entity strings matched by any trie in `tries`.
    """
    per_type_matches = extract_entities(query, tries)
    # Merge the per-type lists into one de-duplicated set.
    return set().union(*per_type_matches.values())


recommendation_queries = [
    "Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween.",
    "I love movies like The Matrix and Inception. Recommend similar ones",
    "Can you recommend films directed by Christopher Nolan?",
    "Looking for movies starring Tom Hanks and Meryl Streep.",
    "What are some movies similar to The Godfather and Scarface?",
    "Suggest movies such as Toy Story or Finding Nemo.",
    "Find films with performances by Robert De Niro.",
    "Are there any movies like Titanic or Avatar?",
    "Could you recommend some films by Steven Spielberg?",
    "Given that I like The Lion King, Pocahontas, and The Beauty and the Beast, can you recommend some movies?",
    "My favorite movies are Inception and Pocahontas, recommend movies I may like",
    "I like Inception a lot, can you recommend similar movies?"
]

# For each query: extract entities, run the random-walk recommender on them,
# and print five movies (padded with random movies at score 0 if needed).
for q in recommendation_queries:
    print(f"\nQuery: {q}")

    extracted_entities = get_query_entities(q)
    print(f"Extracted Entities: {extracted_entities}")

    recommendations = rp_beta_recommendations_aggregate(
        entities=extracted_entities,
        G=G_nx,
        movie_list=movie_values,
        num_walks=300,
        walk_length_range=(2, 4),
        beta_range=(0, 0.05),
        top_n=20
    )

    # Best-scored movies first, never echoing a query entity; keep five.
    recommended_movies = [
        (movie, score)
        for movie, score in recommendations
        if movie not in extracted_entities
    ][:5]

    if len(recommended_movies) < 5:
        # Not enough walk-based results: fill the gap with randomly ordered
        # movies that are neither query entities nor already recommended.
        already_recommended = [movie for movie, _ in recommended_movies]
        additional_movies = [
            m for m in movie_values
            if m not in extracted_entities and m not in already_recommended
        ]
        random.shuffle(additional_movies)
        missing = 5 - len(recommended_movies)
        recommended_movies.extend((m, 0) for m in additional_movies[:missing])

    print("Recommended Movies:")
    for movie, score in recommended_movies:
        print(f"{movie}: Score = {score:.2f}")


Query: Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween.
Extracted Entities: {'friday the 13th', 'halloween', 'a nightmare on elm street'}
Recommended Movies:
the fog: Score = 16.00
crystal lake memories the complete history of friday the 13th: Score = 14.00
whats the matter with helen: Score = 13.00
going to pieces the rise and fall of the slasher film: Score = 10.00
assault on precinct 13: Score = 10.00

Query: I love movies like The Matrix and Inception. Recommend similar ones
Extracted Entities: {'the matrix', 'inception', 'love'}
Recommended Movies:
the signal: Score = 15.00
x night of vengeance: Score = 13.00
the ugly duckling: Score = 13.00
batman begins: Score = 7.00
the prestige: Score = 6.00

Query: Can you recommend films directed by Christopher Nolan?
Extracted Entities: {'christopher nolan'}
Recommended Movies:
inception: Score = 60.00
dunkirk: Score = 58.00
the prestige: Score = 57.00
interstellar: Score = 51.00
the dark knight: Score = 48.00