In [1]:
import pandas as pd
import os
from datasketch import MinHash, MinHashLSH
import re
import numpy as np

In [2]:
# Resolve both CSV locations to absolute paths (relative to the current working directory).
movie_path, rating_path = map(
    os.path.abspath,
    ('../data/bronze/movies.csv', '../data/bronze/ratings.csv'),
)

# Raw frames exactly as read from disk.
df_movies_original = pd.read_csv(movie_path)
df_ratings_original = pd.read_csv(rating_path)

In [3]:
# Work on copies so the frames read from disk stay unmodified.
df_movies, df_ratings = df_movies_original.copy(), df_ratings_original.copy()

In [4]:
# Drop the timestamp column and index the ratings by movieId while keeping
# movieId as a regular column too (drop=False).
# No in-place mutation, and errors="ignore" on the drop, so re-running this
# cell after the column is already gone does not raise KeyError.
df_ratings = (
    df_ratings
    .drop(columns=["timestamp"], errors="ignore")
    .set_index("movieId", drop=False)
)

In [5]:
def extract_year(title: str):
    """Split a trailing ``(YYYY)`` year off a movie title.

    Only a four-digit group in parentheses at the very end of the title
    (trailing whitespace is ignored) counts as the year. Parenthesised
    four-digit numbers earlier in the title are left in place, so the title
    is never truncated in the middle.

    Args:
        title: Raw title string, e.g. ``"Toy Story (1995)"``.

    Returns:
        ``(clean_title, year)``: the title without the year suffix (stripped)
        and the year as ``int``; ``year`` is ``pd.NA`` and the title is only
        stripped when no trailing ``(YYYY)`` is present.
    """
    # Anchored with \s*$ so that only a year at the end of the title matches.
    match = re.search(r"\((\d{4})\)\s*$", title)
    if match is None:
        return title.strip(), pd.NA
    return title[:match.start()].strip(), int(match.group(1))

# Build the cleaned movie frame from the untouched original on every run.
# Re-running the previous version of this cell would re-parse already-cleaned
# titles (turning every year into <NA>) and then fail on genres that had
# already been split into lists.
_title_year = pd.DataFrame(
    [extract_year(raw_title) for raw_title in df_movies_original["title"]],
    index=df_movies_original.index,
    columns=["title", "year"],
)

df_movies = df_movies_original.assign(
    title=_title_year["title"],
    # Nullable integer dtype so missing years stay <NA> instead of becoming floats.
    year=_title_year["year"].astype("Int64"),
    # "A|B|C" -> ["A", "B", "C"]
    genres=df_movies_original["genres"].apply(lambda g: g.split("|")),
).set_index("movieId", drop=False)  # movieId is both the index and a column

In [None]:
# Set of rater ids per movie. Grouping on the index level avoids the ambiguity
# with the identically named "movieId" column; movies absent from the ratings
# get an empty set through the reindex.
movie_users = (
    df_ratings["userId"]
    .groupby(level="movieId")
    .apply(lambda user_ids: {int(uid) for uid in user_ids.to_numpy()})
    .reindex(df_movies.index, fill_value=set())
)

# Genre labels per movie, stripped of surrounding whitespace.
movie_genres = df_movies["genres"].apply(lambda labels: {label.strip() for label in labels})

# One combined token set per movie: genres prefixed "g:", raters prefixed "u:".
movie_tokens = {
    mid: (
        {f"g:{g}" for g in movie_genres.loc[mid]}
        | {f"u:{u}" for u in movie_users.loc[mid]}
    )
    for mid in df_movies.index
}

In [10]:
num_perm = 128
lsh_threshold = 0.3

# Build one MinHash signature per movie and register it in the LSH index in
# the same pass. Index keys are the movie ids converted to strings.
minhashes = {}
lsh = MinHashLSH(threshold=lsh_threshold, num_perm=num_perm)
for mid, tokens in movie_tokens.items():
    signature = MinHash(num_perm=num_perm)
    for token in tokens:
        signature.update(token.encode("utf-8"))
    minhashes[mid] = signature
    lsh.insert(str(mid), signature)

In [11]:
def jaccard(a, b):
    """Return |a & b| / |a | b| for two sets, or 0.0 when both are empty."""
    if not (a or b):
        return 0.0
    shared = len(a & b)
    total = len(a | b)
    if total:
        return shared / total
    return 0.0

def token_breakdown(tokens_a, tokens_b):
    """Count genre ("g:") and user ("u:") tokens in two token sets and in their overlap.

    Returns a dict with the overlap counts (``overlap_genres``,
    ``overlap_users``) and the per-set counts (``size_genres_a/b``,
    ``size_users_a/b``).
    """
    def _count(tokens, prefix):
        # Number of tokens starting with the given prefix.
        return sum(token.startswith(prefix) for token in tokens)

    shared = tokens_a & tokens_b
    return {
        "overlap_genres": _count(shared, "g:"),
        "overlap_users": _count(shared, "u:"),
        "size_genres_a": _count(tokens_a, "g:"),
        "size_genres_b": _count(tokens_b, "g:"),
        "size_users_a": _count(tokens_a, "u:"),
        "size_users_b": _count(tokens_b, "u:"),
    }


In [12]:
def similar_movies_lsh(
    movie_id: int,
    top_k: int = 15,
    lsh_index: MinHashLSH = lsh,
    signatures: dict = minhashes,
    tokens: dict = movie_tokens,
    movies_df: pd.DataFrame = df_movies,
):
    """Rank the LSH candidates of one movie by exact Jaccard similarity.

    The stored signature of ``movie_id`` is used to query ``lsh_index``. Every
    returned candidate other than the movie itself is scored with ``jaccard``
    on the token sets and described with ``token_breakdown``.

    Returns:
        DataFrame sorted by ``jaccard`` (descending), limited to ``top_k``
        rows; an empty frame with the same columns when there are no
        candidates.

    Raises:
        ValueError: if ``movie_id`` has no entry in ``signatures``.
    """
    if movie_id not in signatures:
        raise ValueError(f"movie_id {movie_id} not found")

    # Index keys are strings; convert them back and drop the query movie.
    neighbour_ids = []
    for key in lsh_index.query(signatures[movie_id]):
        if int(key) != movie_id:
            neighbour_ids.append(int(key))

    base_tokens = tokens[movie_id]

    def _describe(cid):
        # One result row: exact similarity, overlap statistics and metadata.
        candidate_tokens = tokens[cid]
        similarity = jaccard(base_tokens, candidate_tokens)
        stats = token_breakdown(base_tokens, candidate_tokens)
        title = movies_df.loc[cid, "title"]
        year = movies_df.loc[cid, "year"]
        return {
            "movieId": cid,
            "title": title,
            "year": int(year) if pd.notna(year) else None,
            "jaccard": similarity,
            **stats,
        }

    rows = [_describe(cid) for cid in neighbour_ids]
    if len(rows) == 0:
        return pd.DataFrame(columns=["movieId","title","year","jaccard","overlap_genres","overlap_users",
                                     "size_genres_a","size_genres_b","size_users_a","size_users_b"])

    ranked = pd.DataFrame(rows).sort_values("jaccard", ascending=False)
    return ranked.head(top_k)


In [13]:
# Find movieid based on partial title
def find_movie_by_title(partial_title: str, df_movies: pd.DataFrame) -> None:
    """Print title and genres of every movie whose title contains ``partial_title``.

    Matching is case-insensitive and missing titles never match. The query is
    applied as a regular expression when it compiles as one; a query that is
    not a valid pattern (for example an unbalanced "(" copied from a title)
    is matched as literal text instead of raising.

    Args:
        partial_title: Text (or regular expression) to look for in the titles.
        df_movies: Frame with at least the columns ``title`` and ``genres``.

    Returns:
        None. The matching rows are printed together with their row index.
    """
    try:
        re.compile(partial_title)
        as_regex = True
    except re.error:
        # Not a valid pattern: fall back to plain substring matching.
        as_regex = False

    mask = df_movies['title'].str.contains(
        partial_title, case=False, na=False, regex=as_regex
    )
    print(df_movies[mask][['title', 'genres']])

In [54]:
# List every title that contains "matrix" (case-insensitive).
find_movie_by_title(partial_title="matrix", df_movies=df_movies)

                                                  title  \
movieId                                                   
2571                                        Matrix, The   
6365                               Matrix Reloaded, The   
6934                            Matrix Revolutions, The   
27660                                    Animatrix, The   
132490   Return to Source: The Philosophy of The Matrix   
157721                            Armitage: Dual Matrix   
172255                             The Matrix Revisited   
179489                                The Living Matrix   
181103                                   Matrix of Evil   

                                                   genres  
movieId                                                    
2571                           [Action, Sci-Fi, Thriller]  
6365          [Action, Adventure, Sci-Fi, Thriller, IMAX]  
6934          [Action, Adventure, Sci-Fi, Thriller, IMAX]  
27660                  [Action, Animation, Drama, 

In [23]:
# Show the metadata row of movieId 3000.
df_movies.loc[df_movies['movieId'].eq(3000)]

Unnamed: 0_level_0,movieId,title,genres,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3000,3000,Princess Mononoke (Mononoke-hime),"[Action, Adventure, Animation, Drama, Fantasy]",1997


In [15]:
# Example: the ten most similar movies to movieId 3000
target_id = 3000
similar_movies_lsh(movie_id=target_id, top_k=10)

Unnamed: 0,movieId,title,year,jaccard,overlap_genres,overlap_users,size_genres_a,size_genres_b,size_users_a,size_users_b
55,31658,Howl's Moving Castle (Hauru no ugoku shiro),2004,0.362024,3,6285,5,4,13136,10512
134,5971,My Neighbor Totoro (Tonari no Totoro),1988,0.358857,3,5935,5,4,13136,9340
73,5618,Spirited Away (Sen to Chihiro no kamikakushi),2001,0.35281,3,9350,5,3,13136,22719
75,7099,Nausicaä of the Valley of the Wind (Kaze no ta...,1984,0.298465,4,4236,5,5,13136,5300
142,6350,Laputa: Castle in the Sky (Tenkû no shiro Rapy...,1986,0.292396,4,4203,5,6,13136,5448
57,741,Ghost in the Shell (Kôkaku kidôtai),1995,0.268281,1,4552,5,2,13136,8381
116,5690,Grave of the Fireflies (Hotaru no haka),1988,0.238906,2,3465,5,3,13136,4835
11,26662,Kiki's Delivery Service (Majo no takkyûbin),1989,0.236155,4,3369,5,5,13136,4510
103,48394,"Pan's Labyrinth (Laberinto del fauno, El)",2006,0.210339,2,5735,5,3,13136,19868
0,4878,Donnie Darko,2001,0.19811,1,6667,5,4,13136,27181


# Recommendations from multiple seed movies

In [16]:
def build_profile_tokens(
    movie_ids,
    tokens: dict[int, set[str]] = movie_tokens,
    use_user_tokens: bool = True,
    use_genre_tokens: bool = True,
):
    """
    Merge the token sets of several movies into one synthetic profile.

    Movie ids without an entry in ``tokens`` are skipped. Genre tokens
    ("g:" prefix) and user tokens ("u:" prefix) are included only when the
    corresponding flag is truthy; tokens with any other prefix are never
    included.
    """
    # Prefixes that are allowed into the profile under the current flags.
    wanted_prefixes = tuple(
        prefix
        for prefix, enabled in (("g:", use_genre_tokens), ("u:", use_user_tokens))
        if enabled
    )

    profile = set()
    for mid in movie_ids:
        if mid in tokens:
            profile.update(t for t in tokens[mid] if t.startswith(wanted_prefixes))
    return profile


In [17]:
from datasketch import MinHash

def profile_minhash(profile_tokens, num_perm=num_perm):
    """Build a MinHash signature from the UTF-8 encoded profile tokens."""
    signature = MinHash(num_perm=num_perm)
    for encoded in (token.encode("utf-8") for token in profile_tokens):
        signature.update(encoded)
    return signature


In [57]:
def similar_movies_for_profile(
    movie_ids,
    top_k: int = 15,
    lsh_index: MinHashLSH = lsh,
    tokens: dict = movie_tokens,
    signatures: dict = minhashes,
    movies_df: pd.DataFrame = df_movies,
    use_user_tokens: bool = True,
    use_genre_tokens: bool = True,
):
    """
    Recommend movies for a 'virtual user' whose taste is defined
    by a list of liked movie_ids.

    The token sets of the liked movies are merged into one profile
    (``build_profile_tokens``), the profile's MinHash is used to query
    ``lsh_index``, and every candidate that is not among the liked movies is
    scored with exact Jaccard similarity against the profile tokens.

    Args:
        movie_ids: Ids of the liked (seed) movies.
        top_k: Maximum number of rows to return.
        lsh_index: LSH index to query; its keys are movie ids as strings.
        tokens: Mapping movie id -> token set.
        signatures: Not used by this function; kept so existing calls that
            pass it keep working.
        movies_df: Frame indexed by movie id with ``title`` and ``year`` columns.
        use_user_tokens: Include "u:" tokens in the profile.
        use_genre_tokens: Include "g:" tokens in the profile.

    Returns:
        DataFrame sorted by ``jaccard`` (descending), limited to ``top_k``
        rows. The columns are the same whether or not any candidate was found.

    Raises:
        ValueError: if the profile ends up without any token.
    """
    # Shared schema so the empty result matches the populated one.
    result_columns = [
        "movieId", "title", "year", "jaccard",
        "overlap_genres", "overlap_users",
        "size_genres_a", "size_genres_b", "size_users_a", "size_users_b",
    ]

    # 1) Build profile tokens
    base_tokens = build_profile_tokens(
        movie_ids,
        tokens=tokens,
        use_user_tokens=use_user_tokens,
        use_genre_tokens=use_genre_tokens,
    )
    if not base_tokens:
        raise ValueError("Profile has no tokens; check input movie_ids or token settings.")

    # 2) Build profile MinHash and query LSH
    q_sig = profile_minhash(base_tokens)
    candidates = lsh_index.query(q_sig)

    # Never recommend a movie that is part of the profile itself.
    liked_set = set(movie_ids)
    cand_ids = [int(cid) for cid in candidates if int(cid) not in liked_set]

    # 3) Score every candidate exactly against the profile tokens
    rows = []
    for cid in cand_ids:
        ctokens = tokens[cid]
        score = jaccard(base_tokens, ctokens)
        b = token_breakdown(base_tokens, ctokens)

        title = movies_df.loc[cid, "title"]
        year = movies_df.loc[cid, "year"]

        rows.append({
            "movieId": cid,
            "title": title,
            "year": int(year) if pd.notna(year) else None,
            "jaccard": score,
            **b,
        })

    if not rows:
        return pd.DataFrame(columns=result_columns)

    df = pd.DataFrame(rows).sort_values("jaccard", ascending=False)
    return df.head(top_k)


In [58]:
# A virtual user defined by three liked movies (example movieIds).
seed_movies = [2571, 6365, 6934]
recs = similar_movies_for_profile(
    seed_movies,
    top_k=20,
    use_user_tokens=True,
    use_genre_tokens=True,
)
recs


Unnamed: 0,movieId,title,year,jaccard,overlap_genres,overlap_users,size_genres_a,size_genres_b,size_users_a,size_users_b
81,4993,"Lord of the Rings: The Fellowship of the Ring,...",2001,0.535374,1,45569,5,2,74945,55736
130,2959,Fight Club,1999,0.530688,2,46361,5,4,74945,58773
110,7153,"Lord of the Rings: The Return of the King, The",2003,0.502994,2,42082,5,4,74945,50797
109,5952,"Lord of the Rings: The Two Towers, The",2002,0.501036,1,42087,5,2,74945,51138
132,1196,Star Wars: Episode V - The Empire Strikes Back,1980,0.494736,3,43791,5,3,74945,57361
86,260,Star Wars: Episode IV - A New Hope,1977,0.486882,3,47042,5,3,74945,68717
24,1198,Raiders of the Lost Ark (Indiana Jones and the...,1981,0.46192,2,40956,5,2,74945,54675
100,3578,Gladiator,2000,0.446562,2,36922,5,3,74945,44656
25,318,"Shawshank Redemption, The",1994,0.446387,0,48279,5,2,74945,81482
102,2762,"Sixth Sense, The",1999,0.444896,0,37462,5,3,74945,46713


In [59]:
# Genres of movieId 4993.
print(df_movies.loc[df_movies['movieId'] == 4993, 'genres'])

movieId
4993    [Adventure, Fantasy]
Name: genres, dtype: object


In [61]:
# Metadata row of movieId 6365.
print(df_movies.loc[df_movies['movieId'].eq(6365)])

         movieId                 title  \
movieId                                  
6365        6365  Matrix Reloaded, The   

                                              genres  year  
movieId                                                     
6365     [Action, Adventure, Sci-Fi, Thriller, IMAX]  2003  


## Hold-out evaluation

In [3]:
# -------------------------------------------------------------------
# 0. Load data
# -------------------------------------------------------------------
# Fresh copies of the raw frames for the evaluation pipeline.
df_movies_eval = df_movies_original.copy()

# The timestamp column is dropped; no movieId index is set on this frame here.
df_ratings_eval = df_ratings_original.copy().drop(columns=["timestamp"])

# -------------------------------------------------------------------
# 1. Movie preprocessing (same style as your original code)
# -------------------------------------------------------------------

def extract_year(title: str):
    """Split a trailing ``(YYYY)`` year off a movie title.

    Only a four-digit group in parentheses at the very end of the title
    (trailing whitespace is ignored) counts as the year. Parenthesised
    four-digit numbers earlier in the title are left in place, so the title
    is never truncated in the middle.

    Args:
        title: Raw title string, e.g. ``"Toy Story (1995)"``.

    Returns:
        ``(clean_title, year)``: the title without the year suffix (stripped)
        and the year as ``int``; ``year`` is ``pd.NA`` and the title is only
        stripped when no trailing ``(YYYY)`` is present.
    """
    # Anchored with \s*$ so that only a year at the end of the title matches.
    match = re.search(r"\((\d{4})\)\s*$", title)
    if match is None:
        return title.strip(), pd.NA
    return title[:match.start()].strip(), int(match.group(1))

# Replace the raw title with the cleaned title and add the parsed year.
df_movies_eval[["title", "year"]] = df_movies_eval["title"].apply(
    lambda raw_title: pd.Series(extract_year(raw_title))
)

# Cast year to nullable Int64, split "A|B|C" genre strings into lists, and
# index by movieId while keeping movieId as a column.
df_movies_eval = (
    df_movies_eval
    .assign(
        year=lambda frame: frame["year"].astype("Int64"),
        genres=lambda frame: frame["genres"].apply(lambda g: g.split("|")),
    )
    .set_index("movieId", drop=False)
)

In [4]:
# -------------------------------------------------------------------
# 2. Train / Test split by user (optimized with mask)
# -------------------------------------------------------------------

def train_test_split_by_user_fast(
    df_ratings_flat: pd.DataFrame,
    test_size: float = 0.2,
    min_items: int = 5,
    random_state: int = 42,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Per-user random train/test split using pandas groupby+sample.

    - Users with < min_items ratings: all go to train.
    - Users with >= min_items ratings: a random fraction `test_size`
      of their ratings go to test (this may be 0 rows for users with
      few ratings; that's usually fine for evaluation).

    Args:
        df_ratings_flat: Ratings with the columns ``userId`` and ``movieId``
            and an index without duplicate labels (e.g. the default
            RangeIndex from ``read_csv``).
        test_size: Fraction of each eligible user's ratings sampled into test.
        min_items: Minimum number of ratings a user needs to be split at all.
        random_state: Seed passed to the per-group sampling.

    Returns:
        ``(df_train, df_test)``. ``df_train`` gets a fresh 0..n-1 index;
        ``df_test`` keeps the index labels of the sampled input rows.

    Raises:
        ValueError: if the index of ``df_ratings_flat`` has duplicate labels.
            Test rows are removed from train by index label, so duplicates
            would silently drop additional rows from the training set.
    """
    if not df_ratings_flat.index.is_unique:
        raise ValueError(
            "df_ratings_flat must have a unique index (e.g. call "
            ".reset_index(drop=True) first); rows are excluded from train by index label."
        )

    # Count ratings per user (broadcast back onto every rating row)
    counts = df_ratings_flat.groupby("userId")["movieId"].transform("size")

    # Users with enough ratings to split
    mask_big = counts >= min_items
    df_big = df_ratings_flat[mask_big]
    df_small = df_ratings_flat[~mask_big]  # all train

    # For "big" users, sample a frac for test within each user
    df_test_big = (
        df_big
        .groupby("userId", group_keys=False)
        .sample(frac=test_size, random_state=random_state)
    )

    # Mask-based exclusion instead of .drop(); relies on the unique index checked above
    test_index = df_test_big.index
    mask = df_big.index.isin(test_index)
    df_train_big = df_big.loc[~mask]

    # Final train/test
    df_train = pd.concat([df_train_big, df_small], axis=0, ignore_index=True)
    df_test = df_test_big.copy()

    return df_train, df_test


# Hold out 20% of the ratings of every user that has at least 5 ratings.
df_train, df_test = train_test_split_by_user_fast(
    df_ratings_eval, test_size=0.2, min_items=5, random_state=42
)
print(f"Train ratings: {len(df_train)}, Test ratings: {len(df_test)}")

# Index the training ratings by movieId (kept as a column as well), which is
# what the groupby on the "movieId" index level further down expects.
df_train = df_train.set_index("movieId", drop=False)


Train ratings: 20000887, Test ratings: 4999208


In [8]:
# -------------------------------------------------------------------
# 3. Build tokens and LSH on TRAIN only
# -------------------------------------------------------------------

# Set of rater ids per movie, built from the TRAIN ratings only; movies
# without a training rating get an empty set through the reindex.
# NOTE: this rebinds the notebook-level names movie_users and movie_genres.
movie_users = (
    df_train["userId"]
    .groupby(level="movieId")
    .apply(lambda user_ids: {int(uid) for uid in user_ids.to_numpy()})
    .reindex(df_movies_eval.index, fill_value=set())
)

# Genre labels per movie, stripped of surrounding whitespace.
movie_genres = df_movies_eval["genres"].apply(lambda labels: {label.strip() for label in labels})

# One combined token set per movie: genres prefixed "g:", raters prefixed "u:".
movie_tokens_eval = {
    mid: (
        {f"g:{g}" for g in movie_genres.loc[mid]}
        | {f"u:{u}" for u in movie_users.loc[mid]}
    )
    for mid in df_movies_eval.index
}

# NOTE: rebinds the notebook-level names num_perm, lsh_threshold, minhashes and lsh.
num_perm = 128
lsh_threshold = 0.3

# Build one MinHash signature per movie from the evaluation token sets and
# register it in the LSH index in the same pass. Keys are movie ids as strings.
minhashes = {}
lsh = MinHashLSH(threshold=lsh_threshold, num_perm=num_perm)
for mid, tokens in movie_tokens_eval.items():
    signature = MinHash(num_perm=num_perm)
    for token in tokens:
        signature.update(token.encode("utf-8"))
    minhashes[mid] = signature
    lsh.insert(str(mid), signature)

In [9]:
# -------------------------------------------------------------------
# 4. Similarity helpers
# -------------------------------------------------------------------

def jaccard(a, b):
    """Return |a & b| / |a | b| for two sets, or 0.0 when both are empty."""
    if not (a or b):
        return 0.0
    shared = len(a & b)
    total = len(a | b)
    if total:
        return shared / total
    return 0.0

def token_breakdown(tokens_a, tokens_b):
    """Count genre ("g:") and user ("u:") tokens in two token sets and in their overlap.

    Returns a dict with the overlap counts (``overlap_genres``,
    ``overlap_users``) and the per-set counts (``size_genres_a/b``,
    ``size_users_a/b``).
    """
    def _count(tokens, prefix):
        # Number of tokens starting with the given prefix.
        return sum(token.startswith(prefix) for token in tokens)

    shared = tokens_a & tokens_b
    return {
        "overlap_genres": _count(shared, "g:"),
        "overlap_users": _count(shared, "u:"),
        "size_genres_a": _count(tokens_a, "g:"),
        "size_genres_b": _count(tokens_b, "g:"),
        "size_users_a": _count(tokens_a, "u:"),
        "size_users_b": _count(tokens_b, "u:"),
    }

def build_profile_tokens(
    movie_ids,
    tokens: dict[int, set[str]] = movie_tokens_eval,
    use_user_tokens: bool = True,
    use_genre_tokens: bool = True,
):
    """
    Merge the token sets of several movies into one synthetic profile.

    Movie ids without an entry in ``tokens`` are skipped. Genre tokens
    ("g:" prefix) and user tokens ("u:" prefix) are included only when the
    corresponding flag is truthy; tokens with any other prefix are never
    included.
    """
    # Prefixes that are allowed into the profile under the current flags.
    wanted_prefixes = tuple(
        prefix
        for prefix, enabled in (("g:", use_genre_tokens), ("u:", use_user_tokens))
        if enabled
    )

    profile = set()
    for mid in movie_ids:
        if mid in tokens:
            profile.update(t for t in tokens[mid] if t.startswith(wanted_prefixes))
    return profile

def profile_minhash(profile_tokens, num_perm=num_perm):
    """Build a MinHash signature from the UTF-8 encoded profile tokens."""
    signature = MinHash(num_perm=num_perm)
    for encoded in (token.encode("utf-8") for token in profile_tokens):
        signature.update(encoded)
    return signature

In [10]:
def similar_movies_for_profile(
    movie_ids,
    top_k: int = 15,
    lsh_index: MinHashLSH = lsh,
    tokens: dict = movie_tokens_eval,
    signatures: dict = minhashes,
    movies_df: pd.DataFrame = df_movies_eval,
    use_user_tokens: bool = True,
    use_genre_tokens: bool = True,
):
    """
    Recommend movies for a 'virtual user' described by liked movie_ids.

    The liked movies' token sets are merged into one profile, the profile's
    MinHash is used to query ``lsh_index``, and each candidate outside the
    liked set is scored with exact Jaccard similarity against the profile.
    ``signatures`` is accepted but not used by this function.

    Returns a DataFrame sorted by ``jaccard`` (descending) with at most
    ``top_k`` rows, or an empty frame with the same columns when there are
    no candidates. Raises ValueError when the profile has no tokens.
    """
    # 1) Profile token set
    base_tokens = build_profile_tokens(
        movie_ids,
        tokens=tokens,
        use_user_tokens=use_user_tokens,
        use_genre_tokens=use_genre_tokens,
    )
    if len(base_tokens) == 0:
        raise ValueError("Profile has no tokens; check input movie_ids or token settings.")

    # 2) Candidate lookup through the LSH index
    candidates = lsh_index.query(profile_minhash(base_tokens))

    # Index keys are strings; convert back and skip the liked movies themselves.
    liked_set = set(movie_ids)
    cand_ids = []
    for key in candidates:
        if int(key) not in liked_set:
            cand_ids.append(int(key))

    def _describe(cid):
        # One result row: exact similarity, overlap statistics and metadata.
        candidate_tokens = tokens[cid]
        similarity = jaccard(base_tokens, candidate_tokens)
        stats = token_breakdown(base_tokens, candidate_tokens)
        title = movies_df.loc[cid, "title"]
        year = movies_df.loc[cid, "year"]
        return {
            "movieId": cid,
            "title": title,
            "year": int(year) if pd.notna(year) else None,
            "jaccard": similarity,
            **stats,
        }

    # 3) Exact scoring of every candidate
    rows = [_describe(cid) for cid in cand_ids]
    if len(rows) == 0:
        return pd.DataFrame(columns=[
            "movieId","title","year","jaccard",
            "overlap_genres","overlap_users",
            "size_genres_a","size_genres_b","size_users_a","size_users_b"
        ])

    ranked = pd.DataFrame(rows).sort_values("jaccard", ascending=False)
    return ranked.head(top_k)

In [35]:
# Peek at the 6th to 10th distinct user ids found in the test split.
test_user_ids = df_test["userId"].unique()
print(test_user_ids[5:10])

[ 6  7  8  9 10]


In [43]:
# Held-out ratings of user 7.
df_test.loc[df_test['userId'].eq(7)]

Unnamed: 0,userId,movieId,rating
1281,7,28,4.0
1289,7,288,3.0
1293,7,308,4.0
1297,7,457,3.0
1294,7,329,3.0


In [44]:
# Print the metadata row of each of these four movie ids, one frame per id.
for seed_id in (28, 288, 308, 457):
    print(df_movies_eval[df_movies_eval['movieId'] == seed_id])

         movieId       title            genres  year
movieId                                             
28            28  Persuasion  [Drama, Romance]  1995
         movieId                 title                     genres  year
movieId                                                                
288          288  Natural Born Killers  [Action, Crime, Thriller]  1994
         movieId                                     title           genres  \
movieId                                                                       
308          308  Three Colors: White (Trzy kolory: Bialy)  [Comedy, Drama]   

         year  
movieId        
308      1994  
         movieId          title      genres  year
movieId                                          
457          457  Fugitive, The  [Thriller]  1993


In [45]:
# Metadata row of movieId 329.
print(df_movies_eval.loc[df_movies_eval['movieId'].eq(329)])

         movieId                   title                      genres  year
movieId                                                                   
329          329  Star Trek: Generations  [Adventure, Drama, Sci-Fi]  1994


In [46]:
# A virtual user defined by four liked movies (these ids could come from a test user).
seed_movies = [28, 288, 308, 457]
recs = similar_movies_for_profile(seed_movies, top_k=20, use_user_tokens=True, use_genre_tokens=True)
recs

Unnamed: 0,movieId,title,year,jaccard,overlap_genres,overlap_users,size_genres_a,size_genres_b,size_users_a,size_users_b
25,592,Batman,1989,0.389662,3,24006,6,3,50245,35370
5,380,True Lies,1994,0.388381,4,23394,6,5,50245,33387
18,480,Jurassic Park,1993,0.385051,2,28176,6,4,50245,51103
30,150,Apollo 13,1995,0.382133,1,24629,6,3,50245,38830
28,589,Terminator 2: Judgment Day,1991,0.375607,1,26278,6,2,50245,45990
44,377,Speed,1994,0.373421,3,22648,6,3,50245,33055
22,590,Dances with Wolves,1990,0.368147,1,22467,6,3,50245,33244
32,593,"Silence of the Lambs, The",1991,0.363157,2,29157,6,3,50245,59198
48,110,Braveheart,1995,0.362768,2,25990,6,3,50245,47387
0,296,Pulp Fiction,1994,0.361264,4,30258,6,4,50245,63774
