In [59]:
import pandas as pd
import json

db = pd.read_pickle("exports/extended_graph_triples.pkl")

In [116]:
relevant_predicates = ["director", 
                       "performer",
                       "genre",
                       "screenwriter",
                       "cast member",
                       "publication date",
                       "narrative location"]

db_filtered = db[db.predicate_label.isin(relevant_predicates)]

In [64]:
with open(r'exports/movie_db.json') as f:
    movie_data = json.load(f)
    movie_values = set(movie_data.values())

In [97]:
df[df.subject_label == "the lion king"]

Unnamed: 0,subject_id,subject_label,predicate_label,object_label
193614,http://www.wikidata.org/entity/Q134138,the lion king,publication date,1994-12-08
721412,http://www.wikidata.org/entity/Q27044293,the lion king,director,jon favreau
721421,http://www.wikidata.org/entity/Q27044293,the lion king,genre,"comedy film,computeranimated film,family film,..."
721441,http://www.wikidata.org/entity/Q27044293,the lion king,publication date,2019-07-17
721443,http://www.wikidata.org/entity/Q27044293,the lion king,screenwriter,jeff nathanson
1010124,http://www.wikidata.org/entity/Q36479,the lion king,director,"roger allers,rob minkoff"
1010138,http://www.wikidata.org/entity/Q36479,the lion king,genre,"drama,childrens film,traditionally animated fi..."
1010155,http://www.wikidata.org/entity/Q36479,the lion king,performer,elton john
1010159,http://www.wikidata.org/entity/Q36479,the lion king,publication date,1994-06-15
1010162,http://www.wikidata.org/entity/Q36479,the lion king,screenwriter,"jonathan roberts,irene mecchi,linda woolverton"


In [117]:
import networkx as nx
from tqdm.notebook import tqdm 

G = nx.Graph()

df = db_filtered.copy()

for label in tqdm(df['predicate_label'].unique(), desc="Processing predicate labels"):
    label_df = df[df['predicate_label'] == label]
    
    for value in tqdm(label_df['object_label'].unique(), desc=f"Processing object labels for {label}", leave=False):
        
        movies_with_value = label_df[label_df['object_label'] == value]['subject_label'].unique()
        
        if value not in G:
            G.add_node(value, type='object_label', label=label)
        
        for movie in movies_with_value:
            if movie not in G:
                G.add_node(movie, type='movie')
            if not G.has_edge(movie, value):
                G.add_edge(movie, value, weight=1)
            else:
                G[movie][value]['weight'] += 1


Processing predicate labels:   0%|          | 0/7 [00:00<?, ?it/s]

Processing object labels for narrative location:   0%|          | 0/3897 [00:00<?, ?it/s]

Processing object labels for cast member:   0%|          | 0/74423 [00:00<?, ?it/s]

Processing object labels for performer:   0%|          | 0/8256 [00:00<?, ?it/s]

Processing object labels for director:   0%|          | 0/21670 [00:00<?, ?it/s]

Processing object labels for genre:   0%|          | 0/9485 [00:00<?, ?it/s]

Processing object labels for publication date:   0%|          | 0/6578 [00:00<?, ?it/s]

Processing object labels for screenwriter:   0%|          | 0/26685 [00:00<?, ?it/s]

In [311]:
import random
from collections import Counter

def rp_beta_recommendations_unweighted(movie, G, movie_list, num_walks=50, walk_length_range=(3,6), beta_range=(0.1, 0.2), top_n=100):
    """
    Generate recommendations using unweighted RP-beta 3.
    
    Parameters:
        - movie: The starting node (e.g., "Hans Zimmer") for the random walk.
        - G: The graph with movies and object labels.
        - num_walks: Number of random walks to perform.
        - walk_length: Length of each walk.
        - beta: Probability of returning to the starting node at each step.
        - top_n: Number of recommendations to return.
    
    Returns:
        - A list of recommended movies based on unweighted RP-beta 3.
    """
    all_walks = []
    
    for _ in range(num_walks):
        walk = [movie]
        current_node = movie
        walk_length = random.randint(*walk_length_range)
        beta = random.uniform(*beta_range)
        
        for _ in range(walk_length - 1):
            if random.random() < beta:
                current_node = movie
            else:
                neighbors = list(G.neighbors(current_node))
                if not neighbors:
                    break
                next_node = random.choice(neighbors)
                
                walk.append(next_node)
                current_node = next_node
        
        all_walks.extend(walk)
    
    movie_counts = Counter(all_walks)
    del movie_counts[movie]
    
    recommendations = [movie for movie, count in movie_counts.most_common(top_n) if movie in movie_list]
    return recommendations

recommendations = rp_beta_recommendations_unweighted("the lion king", G, movie_values)
print("Top recommendations:\n", recommendations)


Top recommendations:
 ['jungle', 'the little prince', 'speed 2 cruise control', 'paradise beach', 'the last samurai', 'crypto', 'jexi', 'sherlock holmes', 'rush hour 3', 'maronas fantastic tale', 'pinocchio', 'aladin', 'this is not berlin']


In [312]:
def order_by_edge_weight(recommendations, start_movie, G):
    weighted_scores = {}
    
    for rec in recommendations:
        score = sum(
            G[start_movie][neighbor]['weight'] * G[rec][neighbor]['weight']
            for neighbor in set(G.neighbors(start_movie)).intersection(G.neighbors(rec))
            if G.has_edge(start_movie, neighbor) and G.has_edge(rec, neighbor)
        )
        weighted_scores[rec] = score
    
    ordered_recommendations = sorted(weighted_scores, key=weighted_scores.get, reverse=True)
    return ordered_recommendations

ordered_recommendations = order_by_edge_weight(recommendations, "the lion king", G)
print("Recommendations ordered by edge weight:\n", ordered_recommendations)

Recommendations ordered by edge weight:
 ['sherlock holmes', 'the little prince', 'speed 2 cruise control', 'paradise beach', 'the last samurai', 'crypto', 'jexi', 'rush hour 3', 'maronas fantastic tale', 'pinocchio', 'this is not berlin', 'jungle', 'aladin']


In [330]:
recommendations_one = rp_beta_recommendations_unweighted("a nightmare on elm street", G, movie_values)
print("Top recommendations:\n", recommendations_one)

Top recommendations:
 ['truth or dare', 'juanita', 'the texas chainsaw massacre', 'angel', 'boogeyman 3', 'kit kittredge an american girl', 'ive heard the mermaids singing', 'slashers', 'tokyo', 'godzilla vs megaguirus', 'a wet dream on elm street', 'the bostonians', 'scarecrow slayer', 'enders game', 'gorko', 'dead in 3 days', 'longhorns', 'shocker', 'leatherface the texas chainsaw massacre iii', 'psycho iii', 'the pirate', 'halloween the curse of michael myers', 'the last horror film', 'battleship', 'the texas chainsaw massacre the beginning']


In [328]:
recommendations_two = rp_beta_recommendations_unweighted("friday the 13th", G, movie_values)
print("Top recommendations:\n", recommendations_two)

Top recommendations:
 ['crystal lake memories the complete history of friday the 13th', 'friday the 13th the final chapter', 'mortal thoughts', 'terror train', 'gardener of eden', 'london river', 'white material', 'frequently asked questions about time travel', 'cop land', 'the hills run red', 'satisfaction', 'mystery team', 'mutants']


In [329]:
recommendations_three = rp_beta_recommendations_unweighted("halloween", G, movie_values)
print("Top recommendations:\n", recommendations_three)

Top recommendations:
 ['halloween 5 the revenge of michael myers', 'halloween ii', 'can you ever forgive me', 'our brand is crisis', 'wildlife', 'salvage', 'mid90s', 'the mule', 'left for dead', 'candyman farewell to the flesh', 'drowning ghost', 'when i grow up ill be a kangaroo', 'the last gang', 'lucas', 'george washington', 'nh10', 'dead mary', 'life in a metro', 'macabre', 'undertow', 'cheaper by the dozen']


In [283]:
def normalize_string(s):
    """Normalizes strings by removing non-ASCII characters, punctuation, and selected stopwords."""
    return ' '.join(re.sub(r'[^\w\s]', '', unicodedata.normalize('NFKD', s.lower())
                                 .encode('ascii', 'ignore').decode('utf-8')).split())
    

In [284]:
import logging
import unicodedata
import re

def fuzzy_match(query_str, comparison_list):
    normalized_query = normalize_string(query_str)    
    entities = {**movie_data, **people_data}
    name_to_id = {v: k for k, v in entities.items()}

    longest_full_match = ""
    longest_full_length = 0
    longest_prefix_match = ""
    longest_prefix_length = 0
    longest_suffix_match = ""
    longest_suffix_length = 0

    full_matches = []
    prefix_matches = []
    suffix_matches = []
                
    for subject in comparison_list:
        if "porn" in subject:
            continue
        
        subject_tokens = normalize_string(subject).split()
        normalized_subject = normalize_string(subject)
        
        # Check for full match within the query
        if normalized_subject in normalized_query:
            full_matches.append((subject, len(subject)))
            if len(subject) > longest_full_length:
                longest_full_match = subject
                longest_full_length = len(subject)

        # Check for prefix match
        for i in range(len(normalized_subject), 0, -1):
            if normalized_subject[:i] == normalized_query[:i] and len(normalized_subject) > len(normalized_query):
                prefix_matches.append((subject, i))
                if i > longest_prefix_length:
                    longest_prefix_match = subject
                    longest_prefix_length = i
                break
        
        # Check for suffix match
        for i in range(len(subject_tokens), 0, -1):
            suffix_sequence = " ".join(subject_tokens[-i:])
            if suffix_sequence in normalized_query:
                suffix_matches.append((subject, len(suffix_sequence)))
                if len(suffix_sequence) > longest_suffix_length:
                    longest_suffix_match = subject
                    longest_suffix_length = len(suffix_sequence)
                break
        
    top_full_matches = sorted(full_matches, key=lambda x: x[1], reverse=True)[:5]
    top_prefix_matches = sorted(prefix_matches, key=lambda x: x[1], reverse=True)[:5]
    top_suffix_matches = sorted(suffix_matches, key=lambda x: x[1], reverse=True)[:5]
    
    if top_full_matches:
        print("Top FULL matches:")
        for match in set(top_full_matches):
            print(f"Match: {match[0]}, Length: {match[1]}")
            
    elif top_prefix_matches:
        print("Top PREFIX matches:")
        for match in set(top_prefix_matches):
            print(f"Match: {match[0]}, Length: {match[1]}")
    
    elif top_suffix_matches:
        print("Top SUFFIX matches:")
        for match in set(top_suffix_matches):
            print(f"Match: {match[0]}, Length: {match[1]}")    
    

In [285]:
example_query = "Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween."

In [286]:
with open(r'exports/movie_db.json') as f:
    movie_data = json.load(f)
    movie_ids = set(movie_data.keys())
movie_db = pd.DataFrame(list(movie_data.items()), columns=["entity_id", "entity_label"])

with open(r'exports/people_db.json') as f:
    people_data = json.load(f)
    people_ids = set(people_data.keys())
people_db = pd.DataFrame(list(people_data.items()), columns=["entity_id", "entity_label"])

In [287]:
fuzzy_match(example_query, movie_db.entity_label.tolist())
fuzzy_match(example_query, people_db.entity_label.tolist())

Top FULL matches:
Match: friday the 13th, Length: 15
Match: halloween, Length: 9
Top FULL matches:
Match: , Length: 0


In [288]:
movie_db[movie_db.entity_label.str.endswith("elm street")]

Unnamed: 0,entity_id,entity_label
19785,http://www.wikidata.org/entity/Q4660616,a wet dream on elm street
22206,http://www.wikidata.org/entity/Q300508,a nightmare on elm street
23875,http://www.wikidata.org/entity/Q329434,a nightmare on elm street
