In [None]:
# Import packages
import os
import re
import pandas as pd
from datasketch import MinHash, MinHashLSH

# -------------------------------------------------------------------
# 0. Load data
# -------------------------------------------------------------------
# Resolve the bronze-layer CSV locations relative to the working directory.
movie_path = os.path.abspath('../data/bronze/movies.csv')
rating_path = os.path.abspath('../data/bronze/ratings.csv')

# Keep the frames exactly as read from disk; later cells work on copies.
df_movies_original = pd.read_csv(movie_path)
df_ratings_original = pd.read_csv(rating_path)
df_movies = df_movies_original.copy()
df_ratings = df_ratings_original.copy()

# -------------------------------------------------------------------
# Train/validation split (80/20), stratified by userId
# -------------------------------------------------------------------
# The split below is ACTIVE: after this cell df_ratings holds only the 80%
# training part, and df_ratings_val keeps the remaining 20% for validation
# or evaluation. random_state is fixed so the split is reproducible.
from sklearn.model_selection import train_test_split

df_ratings_train, df_ratings_val = train_test_split(
    df_ratings,
    test_size=0.2,
    random_state=42,
    stratify=df_ratings["userId"],
)
print(f"Train ratings: {len(df_ratings_train)}, Validation ratings: {len(df_ratings_val)}")

# Ratings used to build the model: the training part, without the timestamp
# column (it is not needed by any later step).
df_ratings = df_ratings_train.drop(columns=["timestamp"])

In [4]:
# -------------------------------------------------------------------
# 1. Movie preprocessing
# -------------------------------------------------------------------
def extract_year(title: str):
    """Split a raw title into ``(clean_title, year)``.

    The year is read from the LAST ``(YYYY)`` group in the title, so an
    earlier four-digit parenthetical (e.g. an alternate title such as
    ``"Odyssey (2001) (1968)"``) is kept as part of the title instead of
    being mistaken for the release year.

    Parameters
    ----------
    title : str
        Raw title, presumably in the ``"Name (YYYY)"`` form - verify against
        the source data.

    Returns
    -------
    tuple
        ``(clean_title, year)`` where ``clean_title`` is the stripped text
        before the year group and ``year`` is an ``int``. When no ``(YYYY)``
        group exists the stripped title and ``pd.NA`` are returned.
    """
    year_groups = list(re.finditer(r"\((\d{4})\)", title))
    if not year_groups:
        return title.strip(), pd.NA

    last_group = year_groups[-1]
    clean_title = title[:last_group.start()].strip()
    return clean_title, int(last_group.group(1))

# Parse every raw title exactly once. The frame is rebuilt from
# df_movies_original instead of being mutated in place, so this cell can be
# re-run safely (a second run no longer calls .split on already-split genre
# lists) and no per-row pd.Series objects are created.
_parsed_titles = [extract_year(raw_title) for raw_title in df_movies_original["title"]]

df_movies = df_movies_original.assign(
    title=[clean_title for clean_title, _ in _parsed_titles],
    # Nullable integer dtype: movies without a year hold <NA>.
    year=pd.array([year for _, year in _parsed_titles], dtype="Int64"),
    # "Action|Sci-Fi" -> ["Action", "Sci-Fi"]
    genres=df_movies_original["genres"].apply(lambda g: g.split("|")),
).set_index("movieId", drop=False)  # movieId is both the index and a column

In [5]:
# -------------------------------------------------------------------
# 2. Build per-movie sets: genres, liked users, disliked users
# -------------------------------------------------------------------

# Rating threshold: rating >= t is a "like", rating < t is a "dislike"
RATING_THRESHOLD = 3.0  # you can tune this

df_pos = df_ratings[df_ratings["rating"] >= RATING_THRESHOLD]
df_neg = df_ratings[df_ratings["rating"] < RATING_THRESHOLD]


def _users_per_movie(ratings: pd.DataFrame, movie_index: pd.Index) -> pd.Series:
    """Return a Series (indexed like ``movie_index``) of user-id sets.

    Each entry is the set of ``userId`` values (as Python ints) that appear
    for that ``movieId`` in ``ratings``. Movies without any row in
    ``ratings`` get their OWN fresh empty set; ``reindex(fill_value=set())``
    would place one shared set object in every such slot, so mutating one
    entry would silently change all of them.
    """
    grouped = (
        ratings.groupby("movieId")["userId"]
        .apply(lambda s: set(map(int, s.to_numpy())))
        .reindex(movie_index)  # movies with no ratings become NaN here
    )
    return grouped.apply(lambda users: users if isinstance(users, set) else set())


# Users who liked each movie
movie_users_pos = _users_per_movie(df_pos, df_movies.index)

# Users who disliked each movie
movie_users_neg = _users_per_movie(df_neg, df_movies.index)

# Genres per movie as a clean set
movie_genres = df_movies["genres"].apply(
    lambda gs: set(g.strip() for g in gs)
)

In [6]:
# -------------------------------------------------------------------
# 3. Build token sets for LSH (single index)
# -------------------------------------------------------------------
# Every movie becomes one set of string tokens; the prefix keeps the three
# information channels from colliding with each other:
#   g:<genre>           -> genre info
#   u+:<userId>         -> positive rating
#   u-:<userId>         -> negative rating

movie_tokens_lsh = {
    mid: (
        {f"g:{g}" for g in movie_genres.loc[mid]}
        | {f"u+:{u}" for u in movie_users_pos.loc[mid]}
        | {f"u-:{u}" for u in movie_users_neg.loc[mid]}
    )
    for mid in df_movies.index
}

In [7]:
# -------------------------------------------------------------------
# 4. MinHash signatures and single LSH index
# -------------------------------------------------------------------
num_perm = 128        # permutations per MinHash signature
lsh_threshold = 0.3   # Jaccard threshold passed to MinHashLSH

minhashes = {}
lsh = MinHashLSH(threshold=lsh_threshold, num_perm=num_perm)

# One pass per movie: hash its tokens into a signature, remember the
# signature by movieId, and register it in the index under the string key.
for mid, tokens in movie_tokens_lsh.items():
    signature = MinHash(num_perm=num_perm)
    for token in tokens:
        signature.update(token.encode("utf-8"))
    minhashes[mid] = signature
    lsh.insert(str(mid), signature)

In [12]:
# -------------------------------------------------------------------
# 5. Helper functions: Jaccard + combined similarity
# -------------------------------------------------------------------
def jaccard(a: set, b: set) -> float:
    """Jaccard index |a & b| / |a | b| of two sets.

    Returns 0.0 when both inputs are empty (the ratio would be 0/0).
    """
    if not (a or b):
        return 0.0

    shared = a & b
    combined = a | b
    if not combined:
        return 0.0
    return len(shared) / len(combined)

def double_jaccard(ap: set, bp: set, am: set, bm: set) -> float:
    """Pooled Jaccard over two paired channels.

    The intersection sizes of (ap, bp) and (am, bm) are added together and
    divided by the sum of the two union sizes. Returns 0.0 when all four
    sets are empty.
    """
    if not (ap or bp or am or bm):
        return 0.0

    shared_total = len(ap & bp) + len(am & bm)
    union_total = len(ap | bp) + len(am | bm)
    if union_total == 0:
        return 0.0
    return shared_total / union_total

# Weights for user likes/dislikes and genre vs user balance.
# NOTE: ALPHA_POS and BETA_NEG are not read by any active code in this
# notebook; only LAMBDA_GENRE is used (by combined_similarity).
ALPHA_POS = 1.0   # weight of shared likes
BETA_NEG = 1.0    # weight of shared dislikes
LAMBDA_GENRE = 0.4  # 0 = only user-based, 1 = only genre-based

def combined_similarity(
    genres_a: set, genres_b: set,
    users_pos_a: set, users_pos_b: set,
    users_neg_a: set, users_neg_b: set,
) -> dict:
    """
    Blend genre overlap and rating overlap of two items into one score.

    Returned keys (in this order):
      J_genre    : Jaccard of the two genre sets
      J_rating   : pooled Jaccard over liked-user and disliked-user sets
                   (see double_jaccard)
      S_user     : user-channel similarity; currently identical to J_rating
      similarity : LAMBDA_GENRE * J_genre + (1 - LAMBDA_GENRE) * S_user
    """
    genre_score = jaccard(genres_a, genres_b)
    rating_score = double_jaccard(users_pos_a, users_pos_b, users_neg_a, users_neg_b)

    # The user channel is the pooled like/dislike Jaccard, used as-is.
    user_score = rating_score

    # Convex mix of the genre channel and the user channel.
    blended = LAMBDA_GENRE * genre_score + (1.0 - LAMBDA_GENRE) * user_score

    scores = {}
    scores["J_genre"] = genre_score
    scores["J_rating"] = rating_score
    scores["S_user"] = user_score
    scores["similarity"] = blended
    return scores

In [13]:
# -------------------------------------------------------------------
# 6. Profile building (multi-movie virtual user)
# -------------------------------------------------------------------
def build_profile_sets(
    movie_ids,
    use_genre: bool = True,
    use_user_pos: bool = True,
    use_user_neg: bool = True,
):
    """
    Merge several movies into one 'virtual item' profile.

    For every id in `movie_ids` that exists in df_movies.index, the movie's
    genre set, liked-user set and disliked-user set are unioned into the
    profile. Ids that are not in the index are skipped silently. A channel
    whose flag is False stays an empty set.

    Returns a tuple (profile_genres, profile_users_pos, profile_users_neg).
    """
    genres_acc, likers_acc, dislikers_acc = set(), set(), set()

    for movie_id in movie_ids:
        if movie_id in df_movies.index:
            if use_genre:
                genres_acc |= movie_genres.loc[movie_id]
            if use_user_pos:
                likers_acc |= movie_users_pos.loc[movie_id]
            if use_user_neg:
                dislikers_acc |= movie_users_neg.loc[movie_id]

    return genres_acc, likers_acc, dislikers_acc

def build_lsh_tokens_from_sets(
    genres: set,
    users_pos: set,
    users_neg: set,
) -> set:
    """
    Encode genre / user sets as the prefixed string tokens used by LSH:
    "g:<genre>", "u+:<user>" for likes and "u-:<user>" for dislikes.
    """
    genre_tokens = {f"g:{genre}" for genre in genres}
    like_tokens = {f"u+:{user}" for user in users_pos}
    dislike_tokens = {f"u-:{user}" for user in users_neg}
    return genre_tokens | like_tokens | dislike_tokens

def profile_minhash_from_sets(
    genres: set,
    users_pos: set,
    users_neg: set,
    num_perm: int = num_perm,
) -> MinHash:
    """
    Turn a (genres, users_pos, users_neg) profile into a MinHash signature.

    The sets are first converted to LSH tokens, then every token is fed,
    UTF-8 encoded, into a MinHash with `num_perm` permutations.
    """
    encoded_tokens = [
        token.encode("utf-8")
        for token in build_lsh_tokens_from_sets(genres, users_pos, users_neg)
    ]
    signature = MinHash(num_perm=num_perm)
    for raw in encoded_tokens:
        signature.update(raw)
    return signature



In [14]:
# -------------------------------------------------------------------
# 7. Similar movies for a single movie (using combined similarity)
# -------------------------------------------------------------------
def similar_movies_lsh(
    movie_id: int,
    top_k: int = 15,
    lsh_index: MinHashLSH = lsh,
    signatures: dict = minhashes,
    movies_df: pd.DataFrame = df_movies,
):
    """
    Find movies similar to a single movie_id using the shared LSH index
    and the combined (genre + pooled like/dislike) similarity.

    Parameters
    ----------
    movie_id : id of the query movie; must be a key of `signatures`.
    top_k : maximum number of rows to return.
    lsh_index : index that is queried for candidate movies.
    signatures : mapping movieId -> MinHash signature.
    movies_df : frame indexed by movieId, read for "title" and "year".

    Returns
    -------
    pd.DataFrame with columns movieId, title, year, J_genre, J_rating,
    S_user, similarity, sorted by similarity (descending), at most `top_k`
    rows. When the index yields no other candidate, an empty frame with
    the same columns is returned.

    Raises
    ------
    ValueError if `movie_id` has no signature.

    Notes
    -----
    The default `lsh_index`, `signatures` and `movies_df` are bound when
    this `def` is executed; re-run this cell after rebuilding them. Genre
    and user sets are always read from the module-level `movie_genres`,
    `movie_users_pos` and `movie_users_neg`.
    """
    if movie_id not in signatures:
        raise ValueError(f"movie_id {movie_id} not found")

    # Sets describing the query movie
    g_a = movie_genres.loc[movie_id]
    up_a = movie_users_pos.loc[movie_id]
    un_a = movie_users_neg.loc[movie_id]

    # Candidate keys are converted back to int; the query movie itself is dropped.
    candidates = lsh_index.query(signatures[movie_id])
    cand_ids = [int(cid) for cid in candidates if int(cid) != movie_id]

    rows = []
    for cid in cand_ids:
        sim = combined_similarity(
            g_a, movie_genres.loc[cid],
            up_a, movie_users_pos.loc[cid],
            un_a, movie_users_neg.loc[cid],
        )
        year = movies_df.loc[cid, "year"]
        rows.append({
            "movieId": cid,
            "title": movies_df.loc[cid, "title"],
            "year": int(year) if pd.notna(year) else None,
            **sim,
        })

    if not rows:
        # Keep the schema identical to the non-empty result (the keys that
        # combined_similarity returns), so callers can rely on the columns.
        return pd.DataFrame(columns=[
            "movieId", "title", "year",
            "J_genre", "J_rating", "S_user", "similarity",
        ])

    df = pd.DataFrame(rows).sort_values("similarity", ascending=False)
    return df.head(top_k)



In [15]:
# -------------------------------------------------------------------
# 8. Similar movies for a profile (multi-movie virtual user)
# -------------------------------------------------------------------
def similar_movies_for_profile(
    movie_ids,
    top_k: int = 15,
    lsh_index: MinHashLSH = lsh,
    movies_df: pd.DataFrame = df_movies,
    use_genre: bool = True,
    use_user_pos: bool = True,
    use_user_neg: bool = True,
):
    """
    Recommend movies for a 'virtual user' whose taste is defined
    by a list of liked movie_ids, using a single LSH index and
    a combined (genre + pooled like/dislike) similarity.

    Parameters
    ----------
    movie_ids : iterable of seed movie ids (any iterable, including a
        generator); ids unknown to the profile builder are ignored there.
    top_k : maximum number of rows to return.
    lsh_index : index that is queried for candidate movies.
    movies_df : frame indexed by movieId, read for "title" and "year".
    use_genre, use_user_pos, use_user_neg : switch the individual profile
        channels on or off.

    Returns
    -------
    pd.DataFrame with columns movieId, title, year, J_genre, J_rating,
    S_user, similarity, sorted by similarity (descending), at most `top_k`
    rows, never containing a seed movie. When there is no candidate, an
    empty frame with the same columns is returned.

    Raises
    ------
    ValueError if the resulting profile is completely empty.

    Notes
    -----
    The default `lsh_index` and `movies_df` are bound when this `def` is
    executed; re-run this cell after rebuilding them.
    """
    # Materialize once: the ids are iterated twice below (profile building
    # and seed exclusion), which would exhaust a generator on the first pass.
    movie_ids = list(movie_ids)

    # 1) Build profile sets (genres, pos users, neg users)
    g_prof, up_prof, un_prof = build_profile_sets(
        movie_ids,
        use_genre=use_genre,
        use_user_pos=use_user_pos,
        use_user_neg=use_user_neg,
    )

    if not g_prof and not up_prof and not un_prof:
        raise ValueError("Profile is empty; check input movie_ids or channel flags.")

    # 2) Build profile MinHash and query LSH
    q_sig = profile_minhash_from_sets(g_prof, up_prof, un_prof)
    candidates = lsh_index.query(q_sig)

    # Never recommend a movie that defines the profile.
    liked_set = set(movie_ids)
    cand_ids = [int(cid) for cid in candidates if int(cid) not in liked_set]

    # 3) Score every candidate against the profile
    rows = []
    for cid in cand_ids:
        sim = combined_similarity(
            g_prof, movie_genres.loc[cid],
            up_prof, movie_users_pos.loc[cid],
            un_prof, movie_users_neg.loc[cid],
        )
        year = movies_df.loc[cid, "year"]
        rows.append({
            "movieId": cid,
            "title": movies_df.loc[cid, "title"],
            "year": int(year) if pd.notna(year) else None,
            **sim,
        })

    if not rows:
        # Keep the schema identical to the non-empty result (the keys that
        # combined_similarity returns), so callers can rely on the columns.
        return pd.DataFrame(columns=[
            "movieId", "title", "year",
            "J_genre", "J_rating", "S_user", "similarity",
        ])

    df = pd.DataFrame(rows).sort_values("similarity", ascending=False)
    return df.head(top_k)



In [16]:
# -------------------------------------------------------------------
# 9. Example usage
# -------------------------------------------------------------------

# Example: similar to a single movie.
# target_id must be a movieId that has a MinHash signature; otherwise
# similar_movies_lsh raises ValueError.
target_id = 3000
print("Similar to single movie:")
# Last expression of the cell: the returned DataFrame is rendered as output.
similar_movies_lsh(target_id, top_k=10)

Similar to single movie:


Unnamed: 0,movieId,title,year,J_genre,J_rating,S_user,similarity
53,5618,Spirited Away (Sen to Chihiro no kamikakushi),2001.0,0.6,0.337125,0.337125,0.442275
55,7099,Nausicaä of the Valley of the Wind (Kaze no ta...,1984.0,0.666667,0.286532,0.286532,0.438586
22,31658,Howl's Moving Castle (Hauru no ugoku shiro),2004.0,0.5,0.342797,0.342797,0.405678
12,5971,My Neighbor Totoro (Tonari no Totoro),1988.0,0.5,0.338893,0.338893,0.403336
20,6350,Laputa: Castle in the Sky (Tenkû no shiro Rapy...,1986.0,0.571429,0.280242,0.280242,0.396717
1,1274,Akira,1988.0,0.5,0.235177,0.235177,0.341106
31,1287,Ben-Hur,1959.0,0.6,0.068028,0.068028,0.280817
8,29,"City of Lost Children, The (Cité des enfants p...",1995.0,0.428571,0.118817,0.118817,0.242718
3,1136,Monty Python and the Holy Grail,1975.0,0.333333,0.154286,0.154286,0.225905
35,741,Ghost in the Shell (Kôkaku kidôtai),1995.0,0.166667,0.245053,0.245053,0.213698


In [37]:
# Example: virtual user who likes several movies.
# The seeds are excluded from the recommendations by similar_movies_for_profile.
seed_movies = [6365, 2571]   # example movieIds (Matrix Reloaded, The / Matrix, The)
print("\nSimilar for profile:")
# All three channels (genres, liked users, disliked users) feed the profile.
similar_movies_for_profile(
    seed_movies,
    top_k=20,
    use_genre=True,
    use_user_pos=True,
    use_user_neg=True,
)


Similar for profile:


Unnamed: 0,movieId,title,year,J_genre,J_rating,S_user,similarity
88,480,Jurassic Park,1993,0.8,0.295685,0.295685,0.497411
58,1196,Star Wars: Episode V - The Empire Strikes Back,1980,0.6,0.415535,0.415535,0.489321
57,260,Star Wars: Episode IV - A New Hope,1977,0.6,0.403822,0.403822,0.482293
52,44191,V for Vendetta,2006,0.8,0.243799,0.243799,0.466279
83,1210,Star Wars: Episode VI - Return of the Jedi,1983,0.6,0.357088,0.357088,0.454253
47,541,Blade Runner,1982,0.6,0.280736,0.280736,0.408441
4,122882,Mad Max: Fury Road,2015,0.8,0.121053,0.121053,0.392632
74,79132,Inception,2010,0.5,0.319255,0.319255,0.391553
6,1198,Raiders of the Lost Ark (Indiana Jones and the...,1981,0.4,0.384288,0.384288,0.390573
56,2959,Fight Club,1999,0.285714,0.446317,0.446317,0.382076


In [86]:
# Inspect the stored record (title, genres, year) for movieId 480.
df_movies[df_movies['movieId'] == 480]

Unnamed: 0_level_0,movieId,title,genres,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
480,480,Jurassic Park,"[Action, Adventure, Sci-Fi, Thriller]",1993


In [18]:
# Find movieid based on partial title
def find_movie_by_title(partial_title: str, df_movies: pd.DataFrame) -> pd.DataFrame:
    mask = df_movies['title'].str.contains(partial_title, case=False, na=False)
    print(df_movies[mask][['title', 'genres']])

In [60]:
# Look up the movieIds of all titles containing "matrix" (case-insensitive).
find_movie_by_title("matrix", df_movies=df_movies)

                                                  title  \
movieId                                                   
2571                                        Matrix, The   
6365                               Matrix Reloaded, The   
6934                            Matrix Revolutions, The   
27660                                    Animatrix, The   
132490   Return to Source: The Philosophy of The Matrix   
157721                            Armitage: Dual Matrix   
172255                             The Matrix Revisited   
179489                                The Living Matrix   
181103                                   Matrix of Evil   

                                                   genres  
movieId                                                    
2571                           [Action, Sci-Fi, Thriller]  
6365          [Action, Adventure, Sci-Fi, Thriller, IMAX]  
6934          [Action, Adventure, Sci-Fi, Thriller, IMAX]  
27660                  [Action, Animation, Drama, 

In [35]:
import numpy as np
from tqdm import tqdm

# -------------------------------------------------------------------
# Evaluation: per-movie hits, averaged per user
# -------------------------------------------------------------------
# For a random sample of users, half of each user's liked movies form a
# profile and the other half are held out; a hit is a held-out movie that
# shows up among the top recommendations for that profile.
# This cell does not read df_ratings_val.

# One seeded generator drives BOTH the user sampling below and the per-user
# shuffles in the loop, so results depend on the order of those calls.
RNG_SEED = 42
rng = np.random.RandomState(RNG_SEED)

# df_pos already has rating >= RATING_THRESHOLD (T = 3.0).
# Per user: sorted list of distinct liked movieIds (sorted so the later
# shuffle starts from a deterministic order).
user_liked_movies = (
    df_pos.groupby("userId")["movieId"]
    .apply(lambda s: sorted(set(s.astype(int))))
)

MIN_MOVIES_PER_USER = 5        # require at least this many liked movies
SUBSET_SIZE = 50              # max users to evaluate
TOP_K_EVAL = 30              # recommendations requested per user; only the first 10 / 20 are scored

# Users with enough liked movies to be split into profile and held-out parts
eligible_users = [
    user_id for user_id, movies in user_liked_movies.items()
    if len(movies) >= MIN_MOVIES_PER_USER
]

if not eligible_users:
    raise ValueError("No users have enough liked movies to evaluate on.")

# Sample without replacement; never ask for more users than are eligible.
subset_size = min(SUBSET_SIZE, len(eligible_users))
user_subset = rng.choice(eligible_users, size=subset_size, replace=False)

# One record per evaluated user (filled by the loop)
per_user_results = []

# Pooled counters over all held-out movies of all evaluated users
total_hits_10 = 0
total_hits_20 = 0
total_test_movies = 0


# NOTE(review): the profile and candidate user sets come from
# movie_users_pos / movie_users_neg, which presumably still contain the
# evaluated user's own id for their held-out movies - confirm whether this
# leakage is acceptable for the reported hit rates.
for user_id in tqdm(user_subset):
    # Shuffle a copy of the user's liked movies (consumes the shared rng).
    movies = np.array(user_liked_movies[user_id])
    rng.shuffle(movies)

    # 50/50 split: first half builds the profile, second half is held out.
    split_idx = int(len(movies) * 0.5)
    # Skip degenerate splits where one side would be empty.
    if split_idx == 0 or split_idx == len(movies):
        continue

    train_movies = movies[:split_idx].tolist()
    test_movies = movies[split_idx:].tolist()

    try:
        recs = similar_movies_for_profile(
            train_movies,
            top_k=TOP_K_EVAL,
            use_genre=True,
            use_user_pos=True,
            use_user_neg=True,
        )
    except ValueError:
        # similar_movies_for_profile raises ValueError for an empty profile;
        # such users are dropped from the evaluation without a record.
        continue

    if recs.empty:
        # No recs: all test movies are misses
        user_hit10_count = 0
        user_hit20_count = 0
    else:
        # recs is already sorted by similarity, so slicing gives the top-k.
        rec_ids = recs["movieId"].tolist()
        top10 = set(rec_ids[:10])
        top20 = set(rec_ids[:20])

        # For EACH movie in the test set:
        #   if it is in top10 -> add 1, else 0
        #   same for top20
        user_hit10_count = sum(1 for m in test_movies if m in top10)
        user_hit20_count = sum(1 for m in test_movies if m in top20)

    n_test = len(test_movies)
    # Defensive only: the split guard above already ensures test_movies is
    # non-empty, so this branch cannot trigger.
    if n_test == 0:
        continue

    # Per-user hit rate: fraction of the held-out movies found in the top-k
    user_hit10_rate = user_hit10_count / n_test
    user_hit20_rate = user_hit20_count / n_test

    per_user_results.append({
        "userId": user_id,
        "n_liked": len(movies),
        "n_test": n_test,
        "hit@10_user": user_hit10_rate,
        "hit@20_user": user_hit20_rate,
    })

    # Global counters (movie-level)
    total_hits_10 += user_hit10_count
    total_hits_20 += user_hit20_count
    total_test_movies += n_test


# Collect the per-user records into one table
df_eval = pd.DataFrame(per_user_results)

# Macro average: mean of each user's own hit rate (NaN if nobody was evaluated)
if df_eval.empty:
    hit10_user_avg = float("nan")
    hit20_user_avg = float("nan")
else:
    hit10_user_avg = df_eval["hit@10_user"].mean()
    hit20_user_avg = df_eval["hit@20_user"].mean()

# Micro average: hits pooled over every held-out movie of every user
if total_test_movies > 0:
    hit10_global = total_hits_10 / total_test_movies
    hit20_global = total_hits_20 / total_test_movies
else:
    hit10_global = float("nan")
    hit20_global = float("nan")

print(f"Users evaluated           : {len(df_eval)}")
print(f"Avg per-user Hit@10       : {hit10_user_avg:.4f}")
print(f"Avg per-user Hit@20       : {hit20_user_avg:.4f}")
print(f"Global movie Hit@10 (opt) : {hit10_global:.4f}")
print(f"Global movie Hit@20 (opt) : {hit20_global:.4f}")

100%|██████████| 50/50 [02:58<00:00,  3.57s/it]

Users evaluated           : 50
Avg per-user Hit@10       : 0.0610
Avg per-user Hit@20       : 0.0899
Global movie Hit@10 (opt) : 0.0395
Global movie Hit@20 (opt) : 0.0540





In [None]:
# Preview the first per-user evaluation records.
df_eval.head()

Unnamed: 0,userId,n_liked,n_test,hit@10_user,hit@20_user
0,80933,52,11,0.0,0.090909
1,23851,45,9,0.0,0.0
2,161314,24,5,0.0,0.0
3,74492,20,4,0.0,0.0
4,39061,25,5,0.0,0.0


In [34]:
# Inspect one user: print the title and genres of every movie they rated
# in df_ratings.
INSPECT_USER_ID = 74492

_rated_movie_ids = df_ratings.loc[df_ratings["userId"] == INSPECT_USER_ID, "movieId"]

# df_movies is indexed by movieId, so each movie is a direct label lookup
# instead of two full-table boolean scans per rating.
genres = [df_movies.loc[movie_id, "genres"] for movie_id in _rated_movie_ids]
movietitles = [df_movies.loc[movie_id, "title"] for movie_id in _rated_movie_ids]

for movie_title, movie_genre_list in zip(movietitles, genres):
    print(movie_title, movie_genre_list)

What's Eating Gilbert Grape ['Drama']
Schindler's List ['Drama', 'War']
L.A. Confidential ['Crime', 'Film-Noir', 'Mystery', 'Thriller']
Chasing Amy ['Comedy', 'Drama', 'Romance']
Rain Man ['Drama']
Saving Private Ryan ['Action', 'Drama', 'War']
Commitments, The ['Comedy', 'Drama', 'Musical']
Fisher King, The ['Comedy', 'Drama', 'Fantasy', 'Romance']
9 1/2 Weeks (Nine 1/2 Weeks) ['Drama', 'Romance']
Beverly Hills Cop ['Action', 'Comedy', 'Crime', 'Drama']
Moscow Does Not Believe in Tears (Moskva slezam ne verit) ['Drama', 'Romance']
Unfaithful ['Drama', 'Thriller']
About a Boy ['Comedy', 'Drama', 'Romance']
Thirteen Conversations About One Thing (a.k.a. 13 Conversations) ['Drama']
Bourne Identity, The ['Action', 'Mystery', 'Thriller']
Minority Report ['Action', 'Crime', 'Mystery', 'Sci-Fi', 'Thriller']
Harry Potter and the Chamber of Secrets ['Adventure', 'Fantasy']
Adaptation ['Comedy', 'Drama', 'Romance']
Chicago ['Comedy', 'Crime', 'Drama', 'Musical']
Love Actually ['Comedy', 'Drama'