In [80]:
import pandas as pd
import json

import warnings
warnings.filterwarnings("ignore")

db = pd.read_pickle("exports/extended_graph_triples.pkl")

In [81]:
relevant_predicates = ["director", 
                       "performer",
                       "genre",
                       "screenwriter",
                       "cast member",
                       "publication date",
                       "mpaa film rating"]

db_filtered = db[db.predicate_label.isin(relevant_predicates)]
db_filtered['object_label'] = db_filtered['object_label'].astype(str)
db_filtered = db_filtered.copy()

db_filtered.loc[db_filtered['predicate_label'] == "publication date", 'publication_year'] = (
    db_filtered.loc[db_filtered['predicate_label'] == "publication date", 'object_label']
    .apply(lambda x: x.split("-")[0])
)

In [82]:
with open(r'exports/movie_db.json') as f:
    movie_data = json.load(f)
    movie_values = set(movie_data.values())

In [83]:
import igraph as ig
import pandas as pd
from tqdm.notebook import tqdm

G = ig.Graph(directed=False)

node_to_index = {}
index_to_node = []

def get_or_add_node(node_label):
    if node_label not in node_to_index:
        index = len(index_to_node)
        node_to_index[node_label] = index
        index_to_node.append(node_label)
        G.add_vertex(name=node_label)
    return node_to_index[node_label]

df = db_filtered.copy()
df['object_label'] = df['object_label'].str.split(',')
df = df.explode('object_label')
df['object_label'] = df['object_label'].str.strip()

edge_dict = {}

for _, row in tqdm(df.iterrows(), desc="Processing rows", total=len(df)):
    label = row['predicate_label']
    individual_value = row['object_label']
    movie = row['subject_label']
    
    value_index = get_or_add_node(individual_value)
    movie_index = get_or_add_node(movie)
    
    edge = (movie_index, value_index)
    
    if edge in edge_dict:
        edge_dict[edge] += 1
    else:
        edge_dict[edge] = 1

edges_to_add = list(edge_dict.keys())
weights = list(edge_dict.values())

G.add_edges(edges_to_add)
G.es["weight"] = weights

print("Graph construction complete.")


Processing rows:   0%|          | 0/884749 [00:00<?, ?it/s]

Graph construction complete.


In [84]:
################################
# Migration Igraph -> NetworkX
################################
import networkx as nx

G_nx = nx.Graph()

for vertex in G.vs:
    G_nx.add_node(vertex["name"])

for edge in G.es:
    source = edge.source
    target = edge.target
    weight = edge["weight"]
    source_label = G.vs[source]["name"]
    target_label = G.vs[target]["name"]
    G_nx.add_edge(source_label, target_label, weight=weight)

print("Converted igraph to NetworkX.")

Converted igraph to NetworkX.


In [86]:
import random
from collections import defaultdict

def rp_beta_recommendations_weighted(movie, G, movie_list, num_walks=50, walk_length_range=(2,4), beta_range=(0, 0.1), top_n=10):
    """
    Generate recommendations using weighted RP-beta 3 with relevance-based sorting.
    
    Parameters:
        - movie: The starting node (e.g., "Hans Zimmer", "The Lion King") for the random walk.
        - G: The graph with movies and object labels.
        - movie_list: A list of valid movies for recommendations.
        - num_walks: Number of random walks to perform.
        - walk_length_range: Range for walk lengths (min, max).
        - beta_range: Range for restart probability (min, max).
        - top_n: Number of recommendations to return.
    
    Returns:
        - A list of recommended movies and their relevance scores based on weighted RP-beta 3.
    """
    all_walks = []
    relevance_scores = defaultdict(float)
    
    for _ in range(num_walks):
        walk = [movie]
        current_node = movie
        walk_length = random.randint(*walk_length_range)
        beta = random.uniform(*beta_range)
        
        for _ in range(walk_length - 1):
            if random.random() < beta:
                current_node = movie
            else:
                neighbors = list(G.neighbors(current_node))
                if not neighbors:
                    break
                
                weights = [G[current_node][neighbor].get("weight", 1) for neighbor in neighbors]
                total_weight = sum(weights)
                probabilities = [weight / total_weight for weight in weights]
                
                next_node = random.choices(neighbors, probabilities, k=1)[0]
                
                if next_node != movie:
                    relevance_scores[next_node] += G[current_node][next_node].get("weight", 1)
                
                walk.append(next_node)
                current_node = next_node
        
        all_walks.extend(walk)
    
    sorted_recommendations = sorted(
        ((movie, score) for movie, score in relevance_scores.items() if movie in movie_list),
        key=lambda x: x[1],
        reverse=True
    )
    
    recommendations = sorted_recommendations[:top_n]
    return recommendations

recommendations = rp_beta_recommendations_weighted("the lion king", G_nx, movie_values)

print("Top recommendations by relevance:")
for movie, score in recommendations:
    print(f"{movie}: {score:.2f}")


Top recommendations by relevance:
family film: 3.00
open season: 2.00
whisper 3: 1.00
a hatful of rain: 1.00
school of rock: 1.00
neighbors 2: 1.00
hodet over vannet: 1.00
alvin and the chipmunks chipwrecked: 1.00
samson: 1.00
some kind of monster: 1.00


In [87]:
recommendations_one = rp_beta_recommendations_weighted("a nightmare on elm street", G_nx, movie_values)
print("Top recommendations by relevance:")
for movie, score in recommendations_one:
    print(f"{movie}: {score:.2f}")

Top recommendations by relevance:
paris je taime: 3.00
childs play: 3.00
fear clinic: 2.00
salvage: 2.00
r xmas: 1.00
final analysis: 1.00
urban legends final cut: 1.00
the girl with the dragon tattoo: 1.00
sheitan: 1.00
the appeared: 1.00


In [88]:
recommendations_two = rp_beta_recommendations_weighted("friday the 13th", G_nx, movie_values)
print("Top recommendations by relevance:")
for movie, score in recommendations_two:
    print(f"{movie}: {score:.2f}")

Top recommendations by relevance:
crystal lake memories the complete history of friday the 13th: 3.00
silent night deadly night: 2.00
going to pieces the rise and fall of the slasher film: 2.00
the gingerdead man: 2.00
nothing: 1.00
american pastime: 1.00
house of wax: 1.00
final destination 5: 1.00
hansel gretel witch hunters: 1.00
reeker: 1.00


In [89]:
recommendations_three = rp_beta_recommendations_weighted("halloween", G_nx, movie_values)
print("Top recommendations by relevance:")
for movie, score in recommendations_three:
    print(f"{movie}: {score:.2f}")

Top recommendations by relevance:
the fog: 4.00
black christmas: 3.00
george washington: 2.00
halloween ii: 2.00
invaders from mars: 2.00
dawn of the dead: 2.00
manglehorn: 2.00
hairspray: 2.00
machete: 1.00
the brady bunch movie: 1.00


In [283]:
def normalize_string(s):
    """Normalizes strings by removing non-ASCII characters, punctuation, and selected stopwords."""
    return ' '.join(re.sub(r'[^\w\s]', '', unicodedata.normalize('NFKD', s.lower())
                                 .encode('ascii', 'ignore').decode('utf-8')).split())
    

In [284]:
import logging
import unicodedata
import re

def fuzzy_match(query_str, comparison_list):
    normalized_query = normalize_string(query_str)    
    entities = {**movie_data, **people_data}
    name_to_id = {v: k for k, v in entities.items()}

    longest_full_match = ""
    longest_full_length = 0
    longest_prefix_match = ""
    longest_prefix_length = 0
    longest_suffix_match = ""
    longest_suffix_length = 0

    full_matches = []
    prefix_matches = []
    suffix_matches = []
                
    for subject in comparison_list:
        if "porn" in subject:
            continue
        
        subject_tokens = normalize_string(subject).split()
        normalized_subject = normalize_string(subject)
        
        # Check for full match within the query
        if normalized_subject in normalized_query:
            full_matches.append((subject, len(subject)))
            if len(subject) > longest_full_length:
                longest_full_match = subject
                longest_full_length = len(subject)

        # Check for prefix match
        for i in range(len(normalized_subject), 0, -1):
            if normalized_subject[:i] == normalized_query[:i] and len(normalized_subject) > len(normalized_query):
                prefix_matches.append((subject, i))
                if i > longest_prefix_length:
                    longest_prefix_match = subject
                    longest_prefix_length = i
                break
        
        # Check for suffix match
        for i in range(len(subject_tokens), 0, -1):
            suffix_sequence = " ".join(subject_tokens[-i:])
            if suffix_sequence in normalized_query:
                suffix_matches.append((subject, len(suffix_sequence)))
                if len(suffix_sequence) > longest_suffix_length:
                    longest_suffix_match = subject
                    longest_suffix_length = len(suffix_sequence)
                break
        
    top_full_matches = sorted(full_matches, key=lambda x: x[1], reverse=True)[:5]
    top_prefix_matches = sorted(prefix_matches, key=lambda x: x[1], reverse=True)[:5]
    top_suffix_matches = sorted(suffix_matches, key=lambda x: x[1], reverse=True)[:5]
    
    if top_full_matches:
        print("Top FULL matches:")
        for match in set(top_full_matches):
            print(f"Match: {match[0]}, Length: {match[1]}")
            
    elif top_prefix_matches:
        print("Top PREFIX matches:")
        for match in set(top_prefix_matches):
            print(f"Match: {match[0]}, Length: {match[1]}")
    
    elif top_suffix_matches:
        print("Top SUFFIX matches:")
        for match in set(top_suffix_matches):
            print(f"Match: {match[0]}, Length: {match[1]}")    
    

In [285]:
example_query = "Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween."

In [286]:
with open(r'exports/movie_db.json') as f:
    movie_data = json.load(f)
    movie_ids = set(movie_data.keys())
movie_db = pd.DataFrame(list(movie_data.items()), columns=["entity_id", "entity_label"])

with open(r'exports/people_db.json') as f:
    people_data = json.load(f)
    people_ids = set(people_data.keys())
people_db = pd.DataFrame(list(people_data.items()), columns=["entity_id", "entity_label"])

In [287]:
fuzzy_match(example_query, movie_db.entity_label.tolist())
fuzzy_match(example_query, people_db.entity_label.tolist())

Top FULL matches:
Match: friday the 13th, Length: 15
Match: halloween, Length: 9
Top FULL matches:
Match: , Length: 0


In [288]:
movie_db[movie_db.entity_label.str.endswith("elm street")]

Unnamed: 0,entity_id,entity_label
19785,http://www.wikidata.org/entity/Q4660616,a wet dream on elm street
22206,http://www.wikidata.org/entity/Q300508,a nightmare on elm street
23875,http://www.wikidata.org/entity/Q329434,a nightmare on elm street
