In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer
from datasketch import MinHash, MinHashLSH
import re
import numpy as np
from sklearn.preprocessing import StandardScaler
from numpy.linalg import norm
from scipy.sparse import csr_matrix

In [2]:
# Resolve the bronze-layer CSV locations (relative to the working directory), then load both files.
movie_path, rating_path = (
    os.path.abspath(relative_csv)
    for relative_csv in ('../data/bronze/movies.csv', '../data/bronze/ratings.csv')
)
df_movies_original = pd.read_csv(movie_path)
df_ratings_original = pd.read_csv(rating_path)

In [3]:
# Work on copies so the frames as loaded from disk stay untouched by later cells.
df_movies, df_ratings = df_movies_original.copy(), df_ratings_original.copy()

In [4]:
# No later cell in this notebook reads the rating timestamp, so drop it.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this cell
# safe to re-run: a second execution no longer raises KeyError on the missing column.
df_ratings = df_ratings.drop(columns=["timestamp"], errors="ignore")

In [5]:
def extract_year(title: str):
    """Split a parenthesised release year off a movie title.

    A ``(YYYY)`` group that closes the title (trailing whitespace allowed) is preferred,
    so ``"Remake (1984) (2010)"`` yields year 2010 rather than 1984. If the title does
    not end with such a group, the first ``(YYYY)`` group found anywhere is used instead.

    Args:
        title: Raw title string, e.g. ``"Toy Story (1995)"``.

    Returns:
        ``(clean_title, year)`` where ``clean_title`` is the stripped text before the
        matched group and ``year`` is an ``int``; ``(title.strip(), pd.NA)`` when the
        title contains no ``(YYYY)`` group at all.
    """
    # Try the trailing year first; only fall back to "anywhere" when there is none.
    match = re.search(r"\((\d{4})\)\s*$", title) or re.search(r"\((\d{4})\)", title)
    if match is None:
        return title.strip(), pd.NA
    return title[:match.start()].strip(), int(match.group(1))

# Split every raw title into (clean title, year). The tuples are collected into ONE
# DataFrame instead of allocating a pd.Series per row inside .apply, which is far
# cheaper on a large movie table.
df_movies[["title", "year"]] = pd.DataFrame(
    df_movies["title"].map(extract_year).tolist(),
    index=df_movies.index,
    columns=["title", "year"],
)

# Nullable integer dtype: titles without a year hold <NA>, the rest stay integers.
df_movies["year"] = df_movies["year"].astype("Int64")

# Turn the pipe-delimited genre string into a list of genre names, then index the
# frame by movieId while also keeping movieId as a regular column.
df_movies["genres"] = df_movies["genres"].str.split("|")
df_movies = df_movies.set_index("movieId", drop=False)

In [6]:
# Per-movie rating statistics (mean and count in a single groupby pass), joined with
# the movie metadata on the movieId index.
movie_features = (
    df_ratings.groupby("movieId")["rating"]
    .agg(avg_rating="mean", num_ratings="count")
    .join(df_movies)
)

# One-hot encode genres
mlb = MultiLabelBinarizer()
genre_features = pd.DataFrame(
    mlb.fit_transform(movie_features["genres"]),
    columns=mlb.classes_,
    index=movie_features.index,
)
X = pd.concat([genre_features, movie_features[["avg_rating", "num_ratings"]]], axis=1).fillna(0)
print("Initial shape of X:", X.shape)

# Keep only movies with at least MIN_NUM_RATINGS ratings. The threshold applied here
# has always been 5; the earlier comment/message claiming 50 did not match the code.
MIN_NUM_RATINGS = 5
X = X[X["num_ratings"] >= MIN_NUM_RATINGS]
print(f"Shape of X after filtering movies with fewer than {MIN_NUM_RATINGS} ratings:", X.shape)

Initial shape of X: (59047, 22)
Shape of X after filtering movies with less than 50 ratings: (32720, 22)


In [7]:
# Find movieid based on partial title
def find_movie_by_title(partial_title: str, df_movies: pd.DataFrame) -> None:
    """Print the title and genres of every movie whose title contains ``partial_title``.

    The match is a case-insensitive *literal* substring test, so titles containing
    regex metacharacters (e.g. ``"M*A*S*H"`` or ``"(500) Days"``) can be searched for
    as typed. Rows with a missing title never match.

    Args:
        partial_title: Text to look for inside the ``title`` column.
        df_movies: Frame with at least ``title`` and ``genres`` columns; its index is
            printed alongside the matching rows.

    Returns:
        None. The matching rows are printed, not returned.
    """
    # regex=False: treat the query as plain text rather than a regular expression.
    mask = df_movies['title'].str.contains(partial_title, case=False, na=False, regex=False)
    print(df_movies.loc[mask, ['title', 'genres']])

In [8]:
# --- Cell 1: Build weighted cosine features (genres + optional avg_rating) ---

def build_cosine_features(
    genre_features: pd.DataFrame,
    movie_features: pd.DataFrame,
    *,
    w_genre: float = 1.0,
    w_rating: float = 0.0
):
    """
    Build the weighted feature matrix used for cosine similarity.

    Args:
      genre_features: one-hot genre frame; rows are selected by movie_features.index.
      movie_features: frame whose index defines the movies (and their order) to keep.
        Its "avg_rating" column is only read when w_rating != 0.
      w_genre: multiplier applied to every genre column.
      w_rating: multiplier applied to the standardized average rating; 0 leaves the
        rating out of the feature matrix entirely.

    Returns:
      X_cos (pd.DataFrame): weighted float32 feature matrix used for cosine similarity
      X_np (np.ndarray): X_cos.values
      movie_ids (np.ndarray): X_cos.index as an array
      x_norms (np.ndarray): row-wise L2 norms of X_np, floored at 1e-12
    Notes:
      - If w_rating == 0, similarity is based on genres only and no rating
        standardization is performed at all.
      - avg_rating is standardized before weighting; missing values are filled with
        the column mean (or 3.0 when every value is missing) first.
    """
    # 1) Genres block (float32), aligned to the requested movies
    G = genre_features.loc[movie_features.index].astype(np.float32)
    if w_genre != 1.0:
        G = G.mul(w_genre)
    blocks = [G]

    # 2) Avg rating block (standardized, then weighted) -- built only when it is used,
    #    so the default genre-only call does no scaling work.
    if w_rating != 0.0:
        r = movie_features["avg_rating"].copy()
        if r.isna().all():
            # safe fallback if no ratings at all
            r = r.fillna(3.0)
        else:
            r = r.fillna(r.mean())

        scaler = StandardScaler()
        r_scaled = scaler.fit_transform(r.to_frame())[:, 0].astype(np.float32)
        R = pd.DataFrame({"avg_rating_scaled": r_scaled}, index=movie_features.index)
        blocks.append(R.mul(w_rating))

    # 3) Final cosine features: genres (+ optional rating)
    X_cos = pd.concat(blocks, axis=1).astype(np.float32)

    # Arrays for fast math; the norm floor avoids division by zero for all-zero rows
    X_np = X_cos.values
    movie_ids = X_cos.index.to_numpy()
    x_norms = np.maximum(norm(X_np, axis=1), 1e-12)

    return X_cos, X_np, movie_ids, x_norms


In [9]:
# --- Cell 2: Random Hyperplane LSH for cosine ---

# NOTE(review): this import sits mid-notebook rather than in the import cell at the top.
from numpy.random import default_rng
# Module-level generator seeded with 42. build_lsh below creates its own generator from
# its `random_state` argument and does not read this one.
rng = default_rng(42)

def build_lsh(X_np, n_tables=10, n_planes=18, random_state=42):
    """Build random-hyperplane hash tables over the rows of ``X_np``.

    For each of the ``n_tables`` tables, ``n_planes`` Gaussian hyperplanes are drawn and
    every row is hashed to the byte string of its sign pattern (1 where the projection
    is >= 0, else 0). Rows with the same pattern share a bucket.

    Args:
        X_np: 2-D array, one feature vector per row.
        n_tables: number of independent hash tables.
        n_planes: number of hyperplanes (signature bits) per table.
        random_state: seed for the generator that draws the hyperplanes.

    Returns:
        ``(tables, hyperplanes)``: a list of dicts mapping signature bytes to the list
        of row positions in that bucket, and the list of float32 ``(n_planes, d)``
        hyperplane matrices, one per table.
    """
    num_rows, dim = X_np.shape
    generator = default_rng(random_state)

    # Draw one (n_planes, d) block per table, in table order, so a given seed always
    # produces the same hyperplanes.
    hyperplanes = []
    for _ in range(n_tables):
        hyperplanes.append(generator.normal(size=(n_planes, dim)).astype(np.float32))

    tables = []
    for planes in hyperplanes:
        buckets = {}
        for row in range(num_rows):
            signature = ((planes @ X_np[row]) >= 0).astype(np.uint8).tobytes()
            if signature in buckets:
                buckets[signature].append(row)
            else:
                buckets[signature] = [row]
        tables.append(buckets)

    return tables, hyperplanes

def lsh_candidates(idx, tables, hyperplanes, X_np):
    """Collect the rows that share a bucket with row ``idx`` in at least one table.

    The query row is hashed with each table's hyperplanes (sign pattern of the
    projections, 1 where >= 0) and the matching buckets are unioned.

    Args:
        idx: row position of the query vector in ``X_np``.
        tables: list of dicts mapping signature bytes to lists of row positions.
        hyperplanes: list of hyperplane matrices, paired with ``tables`` by position.
        X_np: 2-D feature array.

    Returns:
        List of distinct candidate row positions, excluding ``idx`` itself; the order
        is whatever set iteration yields.
    """
    query_vec = X_np[idx]
    neighbours = set()
    for buckets, planes in zip(tables, hyperplanes):
        projections = planes @ query_vec
        bucket_key = (projections >= 0).astype(np.uint8).tobytes()
        # A signature never seen at build time simply contributes no candidates.
        neighbours.update(buckets.get(bucket_key, []))
    neighbours.discard(idx)
    return list(neighbours)

def cosine_sim_batch(v, V, v_norm, V_norms):
    """Cosine similarity between one vector and each row of a matrix.

    Args:
        v: query vector.
        V: matrix whose rows are compared against ``v``.
        v_norm: precomputed norm of ``v``.
        V_norms: precomputed norms of the rows of ``V``.

    Returns:
        ``(V @ v) / (V_norms * v_norm + 1e-12)``; the small constant keeps the
        division defined when a norm is zero.
    """
    denominators = V_norms * v_norm + 1e-12
    return np.matmul(V, v) / denominators


In [10]:
# --- Cell 3: Build weighted features + LSH (choose your weights here) ---

# Relative weights of the two feature blocks fed to the cosine similarity.
# W_RATING = 0.0 removes the rating column, giving pure genre similarity;
# lower it below W_GENRE if the rating should only nudge the result.
W_GENRE, W_RATING = 1.0, 1.0

# Features are built only for the movies that survived the num_ratings filter on X.
X_cos, X_np, movie_ids, x_norms = build_cosine_features(
    genre_features,
    movie_features.loc[X.index],
    w_rating=W_RATING,
    w_genre=W_GENRE,
)

tables, hyperplanes = build_lsh(X_np, random_state=42, n_planes=18, n_tables=10)


In [11]:
# --- Cell 4: Query similar movies
# Rank primarily by cosine similarity; optionally re-rank by avg_rating afterwards.

def recommend_similar(
    query_movie_id: int,
    *,
    df_meta: pd.DataFrame,   # movie_features with title, year, avg_rating, num_ratings
    X_np: np.ndarray,
    x_norms: np.ndarray,
    movie_ids: np.ndarray,
    tables,
    hyperplanes,
    top_k: int = 10,
    min_sim: float = 0.2,
    rerank_by_rating: bool = True,
    min_votes: int = 20
) -> pd.DataFrame:
    """Return the movies most similar to ``query_movie_id`` among its LSH candidates.

    Args:
        query_movie_id: id of the query movie; must appear in ``movie_ids``.
        df_meta: metadata indexed by movie id with ``title``, ``year``, ``avg_rating``
            and ``num_ratings`` columns. The index does not need to be named.
        X_np: feature matrix, one row per entry of ``movie_ids``.
        x_norms: row norms of ``X_np``.
        movie_ids: movie id for each row of ``X_np``.
        tables, hyperplanes: LSH structures as returned by ``build_lsh``.
        top_k: maximum number of rows returned.
        min_sim: candidates with cosine similarity below this are discarded.
        rerank_by_rating: if True, ties in similarity are broken by ``avg_rating`` and
            then ``num_ratings``; otherwise by ``num_ratings`` only.
        min_votes: preferred minimum ``num_ratings``. Applied only when at least one
            candidate meets it; otherwise all candidates are kept (best effort).
            A falsy value disables the filter.

    Returns:
        DataFrame with columns ``movieId, title, year, similarity, avg_rating,
        num_ratings`` sorted best-first; empty (same columns) when there are no
        candidates or none reaches ``min_sim``.

    Raises:
        KeyError: if ``query_movie_id`` is not present in ``movie_ids``.
    """
    result_columns = ["movieId", "title", "year", "similarity", "avg_rating", "num_ratings"]
    meta_columns = ["title", "year", "avg_rating", "num_ratings"]

    # locate row index
    positions = np.flatnonzero(movie_ids == query_movie_id)
    if positions.size == 0:
        raise KeyError(f"movieId {query_movie_id} not present in the feature matrix (maybe filtered by num_ratings).")
    idx = int(positions[0])

    # LSH candidate pool
    cand_idx = lsh_candidates(idx, tables, hyperplanes, X_np)
    if not cand_idx:
        return pd.DataFrame(columns=result_columns)
    cand_idx = np.asarray(cand_idx)

    # cosine similarity to candidates, then drop everything below the threshold
    sims = cosine_sim_batch(X_np[idx], X_np[cand_idx], x_norms[idx], x_norms[cand_idx])
    keep = sims >= min_sim
    if not keep.any():
        return pd.DataFrame(columns=result_columns)

    # attach metadata: look the surviving ids up directly and name the id column
    # explicitly, so this works whether or not df_meta's index is called "movieId"
    # and never assigns into a filtered slice.
    kept_ids = movie_ids[cand_idx[keep]]
    out = (
        df_meta.loc[kept_ids, meta_columns]
        .assign(similarity=sims[keep])
        .rename_axis("movieId")
        .reset_index()
    )

    # optionally keep only items with reasonable vote count (skipped when nobody
    # qualifies, so the result cannot become empty here)
    if min_votes:
        has_floor = out["num_ratings"] >= min_votes
        if has_floor.any():
            out = out[has_floor]

    # final ordering
    if rerank_by_rating:
        sort_keys = ["similarity", "avg_rating", "num_ratings"]
    else:
        sort_keys = ["similarity", "num_ratings"]
    out = out.sort_values(by=sort_keys, ascending=False)

    return out.head(top_k).reset_index(drop=True)[result_columns]


In [12]:
# --- Cell 5: Example usage ---

# Metadata restricted to the same filtered movie set the feature matrix was built from.
df_meta = movie_features.loc[X.index]
query_id = 858

print("Query:", *df_meta.loc[query_id, ["title", "year"]])
recs = recommend_similar(
    query_id,
    df_meta=df_meta,
    X_np=X_np,
    x_norms=x_norms,
    movie_ids=movie_ids,
    tables=tables,
    hyperplanes=hyperplanes,
    # Results are ordered by similarity first; avg_rating only breaks ties.
    rerank_by_rating=True,
    min_sim=0.25,
    min_votes=30,
    top_k=10,
)
recs


Query: Godfather, The 1972


Unnamed: 0,movieId,title,year,similarity,avg_rating,num_ratings
0,1221,"Godfather: Part II, The",1974,0.999683,4.261759,34188
1,318,"Shawshank Redemption, The",1994,0.999462,4.413576,81482
2,1213,Goodfellas,1990,0.998144,4.180525,32663
3,169906,The Night Of,2016,0.996838,4.141328,467
4,2329,American History X,1998,0.996787,4.140001,31157
5,1945,On the Waterfront,1954,0.996282,4.127636,5880
6,2731,"400 Blows, The (Les quatre cents coups)",1959,0.995245,4.105151,3281
7,3147,"Green Mile, The",1999,0.990349,4.027754,30482
8,55820,No Country for Old Men,2007,0.990223,4.026145,18474
9,8656,"Short Film About Killing, A (Krótki film o zab...",1988,0.988785,4.008711,574


In [24]:
# Look up the movieId (shown as the index) and genres of titles containing
# "snow white and"; the match ignores case.
find_movie_by_title("snow white and", df_movies)

                                    title  \
movieId                                     
594       Snow White and the Seven Dwarfs   
94780         Snow White and the Huntsman   
124519   Snow White and the Three Stooges   
134900          Snow White and 7 Wise Men   
175853         Snow White and Russian Red   
206645            Snow White and Rose Red   

                                                 genres  
movieId                                                  
594      [Animation, Children, Drama, Fantasy, Musical]  
94780                        [Action, Adventure, Drama]  
124519    [Adventure, Children, Comedy, Drama, Fantasy]  
134900                                         [Comedy]  
175853                                          [Drama]  
206645                             [Animation, Fantasy]  
