In [13]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer
from datasketch import MinHash, MinHashLSH
import re
import numpy as np
from sklearn.preprocessing import StandardScaler
from numpy.linalg import norm
from scipy.sparse import csr_matrix

In [None]:
# Locate the bronze-layer CSV files relative to the current working directory
# and load each one into its own untouched "original" frame.
movie_path = os.path.abspath(os.path.join("..", "data", "bronze", "movies.csv"))
rating_path = os.path.abspath(os.path.join("..", "data", "bronze", "ratings.csv"))

df_movies_original = pd.read_csv(movie_path)
df_ratings_original = pd.read_csv(rating_path)

In [3]:
# Working copies: the *_original frames stay exactly as they were read from disk.
df_movies, df_ratings = df_movies_original.copy(), df_ratings_original.copy()

In [4]:
# Rebuild the working ratings frame from the raw load instead of mutating it in
# place, so this cell gives the same result no matter how often it is re-run
# (an in-place drop raises KeyError on "timestamp" the second time).
# "movieId" is kept as a column *and* used as the index (drop=False).
df_ratings = (
    df_ratings_original
    .drop(columns=["timestamp"])
    .set_index("movieId", drop=False)
)

In [5]:
def extract_year(title: str):
    """Split the release year off a movie title.

    The year is taken from the LAST parenthesised four-digit group in
    ``title``, so an earlier four-digit parenthetical that belongs to the name
    itself is not mistaken for the release year.

    Parameters
    ----------
    title : str
        Raw title, e.g. ``"Toy Story (1995)"``.

    Returns
    -------
    tuple
        ``(clean_title, year)``. ``clean_title`` is the text before the chosen
        year group with surrounding whitespace stripped; anything after that
        group is discarded. ``year`` is an ``int``, or ``pd.NA`` when the title
        contains no ``(YYYY)`` group (the whole stripped title is returned
        then).
    """
    year_groups = list(re.finditer(r"\((\d{4})\)", title))
    if not year_groups:
        return title.strip(), pd.NA

    # The release year is the last "(YYYY)" group, not the first one found.
    match = year_groups[-1]
    clean_title = title[:match.start()].strip()
    return clean_title, int(match.group(1))

# Parse every raw title exactly once. The frame is rebuilt from
# df_movies_original so the cell is idempotent: re-running it on an already
# cleaned frame would blank out every year and then fail on the genre split.
parsed_titles = [extract_year(raw_title) for raw_title in df_movies_original["title"]]

df_movies = (
    df_movies_original
    .assign(
        title=[clean_title for clean_title, _ in parsed_titles],
        # Nullable integer dtype: titles without a year stay <NA> instead of
        # forcing the whole column to float/object.
        year=pd.array([year for _, year in parsed_titles], dtype="Int64"),
        # "A|B|C" -> ["A", "B", "C"]
        genres=lambda frame: frame["genres"].map(lambda g: g.split("|")),
    )
    # movieId stays available as a column and also becomes the index.
    .set_index("movieId", drop=False)
)

In [16]:
# Set of rating user ids for every movie. df_ratings carries "movieId" both as
# a column and as the index name, so grouping is done on the index level to
# avoid the ambiguity. Movies without any rating get an empty set.
movie_users = (
    df_ratings
    .groupby(level="movieId")["userId"]
    .apply(lambda ids: {int(u) for u in ids.to_numpy()})
    .reindex(df_movies.index, fill_value=set())
)

# Set of whitespace-stripped genre names for every movie.
movie_genres = df_movies["genres"].apply(lambda names: {name.strip() for name in names})

# One combined token set per movie: "g:<genre>" tokens plus "u:<userId>" tokens.
movie_tokens = {
    mid: {f"g:{g}" for g in movie_genres.loc[mid]} | {f"u:{u}" for u in movie_users.loc[mid]}
    for mid in df_movies.index
}


In [43]:
# Signature length shared by every MinHash and by the LSH index,
# and the similarity threshold the index is built for.
num_perm = 128
lsh_threshold = 0.3

minhashes = {}
lsh = MinHashLSH(threshold=lsh_threshold, num_perm=num_perm)

# Build one signature per movie from its token set and register it in the
# index right away. Index keys are the movie ids rendered as strings.
for mid, tokens in movie_tokens.items():
    signature = MinHash(num_perm=num_perm)
    for token in tokens:
        signature.update(token.encode("utf-8"))
    minhashes[mid] = signature
    lsh.insert(str(mid), signature)


In [44]:
def jaccard(a, b):
    """Return |a ∩ b| / |a ∪ b| for two sets, or 0.0 when the union is empty."""
    if not (a or b):
        return 0.0

    shared = len(a & b)
    total = len(a | b)
    if total == 0:
        return 0.0
    return shared / total

def token_breakdown(tokens_a, tokens_b):
    """Count genre ("g:") and user ("u:") tokens in two token sets and in their overlap.

    Returns a dict with the overlap counts first, followed by the per-set
    genre counts and the per-set user counts.
    """
    def count_prefixed(tokens, prefix):
        # Number of tokens that start with the given prefix.
        return sum(1 for token in tokens if token.startswith(prefix))

    shared = tokens_a & tokens_b
    return {
        "overlap_genres": count_prefixed(shared, "g:"),
        "overlap_users": count_prefixed(shared, "u:"),
        "size_genres_a": count_prefixed(tokens_a, "g:"),
        "size_genres_b": count_prefixed(tokens_b, "g:"),
        "size_users_a": count_prefixed(tokens_a, "u:"),
        "size_users_b": count_prefixed(tokens_b, "u:"),
    }


In [45]:
def similar_movies_lsh(
    movie_id: int,
    top_k: int = 15,
    lsh_index: MinHashLSH = lsh,
    signatures: dict = minhashes,
    tokens: dict = movie_tokens,
    movies_df: pd.DataFrame = df_movies,
):
    """Rank the LSH candidates of a movie by exact Jaccard similarity.

    The LSH index is only used to shortlist candidates; each candidate is then
    scored with the exact Jaccard similarity of the full token sets.

    Parameters
    ----------
    movie_id : int
        Id of the query movie; must be a key of ``signatures``.
    top_k : int
        Maximum number of rows to return.
    lsh_index : MinHashLSH
        Index to query. Its keys must be convertible with ``int()``.
    signatures : dict
        movieId -> MinHash signature.
    tokens : dict
        movieId -> token set ("g:..." / "u:..." strings).
    movies_df : pd.DataFrame
        Frame indexed by movieId with ``title`` and ``year`` columns.

    Returns
    -------
    pd.DataFrame
        At most ``top_k`` rows with columns movieId, title, year, jaccard and
        the ``token_breakdown`` counts, ordered by descending Jaccard score and
        then ascending movieId. Empty (same columns) when there are no candidates.

    Raises
    ------
    ValueError
        If ``movie_id`` is not in ``signatures``.

    Notes
    -----
    The default index/signature/token/frame objects are bound when this cell is
    executed. After rebuilding any of them, re-run this cell or pass the new
    objects explicitly.
    """
    if movie_id not in signatures:
        raise ValueError(f"movie_id {movie_id} not found")

    result_columns = [
        "movieId", "title", "year", "jaccard", "overlap_genres", "overlap_users",
        "size_genres_a", "size_genres_b", "size_users_a", "size_users_b",
    ]

    # Convert the index keys back to ints, drop the query movie itself, and
    # sort the ids: the order in which the index returns candidates is not
    # something this function should depend on, so a fixed order keeps the
    # resulting row labels reproducible between runs.
    cand_ids = sorted(
        cid
        for cid in map(int, lsh_index.query(signatures[movie_id]))
        if cid != movie_id
    )

    base_tokens = tokens[movie_id]
    rows = []
    for cid in cand_ids:
        ctokens = tokens[cid]
        year = movies_df.loc[cid, "year"]
        rows.append({
            "movieId": cid,
            "title": movies_df.loc[cid, "title"],
            # Missing years (<NA>) become None rather than failing in int().
            "year": int(year) if pd.notna(year) else None,
            "jaccard": jaccard(base_tokens, ctokens),
            **token_breakdown(base_tokens, ctokens),
        })

    if not rows:
        return pd.DataFrame(columns=result_columns)

    # Tie-break equal scores on movieId so the ranking (and therefore the
    # top_k cut-off) is deterministic.
    ranked = pd.DataFrame(rows).sort_values(
        ["jaccard", "movieId"], ascending=[False, True]
    )
    return ranked.head(top_k)


In [46]:
# Find movieid based on partial title
def find_movie_by_title(partial_title: str, df_movies: pd.DataFrame) -> pd.DataFrame:
    """Return the movies whose title contains ``partial_title`` (case-insensitive).

    Parameters
    ----------
    partial_title : str
        Text to look for. It is passed to ``Series.str.contains`` with that
        method's default pattern handling, i.e. it is interpreted as a regular
        expression.
    df_movies : pd.DataFrame
        Frame with ``title`` and ``genres`` columns. Its index labels are kept
        in the result (movieId in this notebook).

    Returns
    -------
    pd.DataFrame
        The ``title`` and ``genres`` columns of every matching row. Rows with a
        missing title never match. The frame is returned, not printed, so it
        renders as the cell output and can be reused by the caller.
    """
    mask = df_movies["title"].str.contains(partial_title, case=False, na=False)
    return df_movies.loc[mask, ["title", "genres"]]

In [61]:
# Look up the movieId of the query film by a fragment of its title.
find_movie_by_title("mononoke", df_movies)

                                     title  \
movieId                                      
3000     Princess Mononoke (Mononoke-hime)   

                                                 genres  
movieId                                                  
3000     [Action, Adventure, Animation, Drama, Fantasy]  


In [48]:
# Inspect the full record of a single movie, selected by its movieId column.
df_movies.loc[df_movies["movieId"] == 858]

Unnamed: 0_level_0,movieId,title,genres,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
858,858,"Godfather, The","[Crime, Drama]",1972


In [62]:
# Example: the ten most similar movies to movieId 3000 (Princess Mononoke).
target_id = 3000
similar_movies_lsh(movie_id=target_id, top_k=10)


Unnamed: 0,movieId,title,year,jaccard,overlap_genres,overlap_users,size_genres_a,size_genres_b,size_users_a,size_users_b
86,31658,Howl's Moving Castle (Hauru no ugoku shiro),2004,0.362024,3,6285,5,4,13136,10512
43,5971,My Neighbor Totoro (Tonari no Totoro),1988,0.358857,3,5935,5,4,13136,9340
126,5618,Spirited Away (Sen to Chihiro no kamikakushi),2001,0.35281,3,9350,5,3,13136,22719
106,7099,Nausicaä of the Valley of the Wind (Kaze no ta...,1984,0.298465,4,4236,5,5,13136,5300
35,6350,Laputa: Castle in the Sky (Tenkû no shiro Rapy...,1986,0.292396,4,4203,5,6,13136,5448
116,741,Ghost in the Shell (Kôkaku kidôtai),1995,0.268281,1,4552,5,2,13136,8381
144,5690,Grave of the Fireflies (Hotaru no haka),1988,0.238906,2,3465,5,3,13136,4835
87,26662,Kiki's Delivery Service (Majo no takkyûbin),1989,0.236155,4,3369,5,5,13136,4510
107,48394,"Pan's Labyrinth (Laberinto del fauno, El)",2006,0.210339,2,5735,5,3,13136,19868
66,4878,Donnie Darko,2001,0.19811,1,6667,5,4,13136,27181
