In [3]:
!pip install faiss-cpu
!pip install surprise

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Installing collected packages: surprise
Successfully installed surprise-0.1


In [4]:
# mini_hybrid_rec.py
import os, zipfile, tempfile, urllib.request, random, numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import faiss
from sklearn.model_selection import train_test_split
from surprise import Dataset, Reader, SVD
from surprise.prediction_algorithms.knns import KNNBasic
from collections import defaultdict



# ------------------------------------------------------------------
# 2. Load & merge (no sampling happens here; Filtered_Ratings.csv is presumably already reduced upstream — TODO confirm)
# ------------------------------------------------------------------
# Raw inputs: the (pre-filtered) ratings plus the MovieLens movie and tag tables.
ratings = pd.read_csv("/kaggle/working/Filtered_Ratings.csv")
movies  = pd.read_csv("/kaggle/input/movielens-20m-dataset/movie.csv")
tags    = pd.read_csv("/kaggle/input/movielens-20m-dataset/tag.csv")

# Collapse all tags of a movie into one space-separated string.
# Missing tags become "" so the join never sees NaN.
tags_agg = (tags.groupby("movieId")["tag"]
              .apply(lambda x: " ".join(x.fillna("").astype(str)))
              .reset_index(name="tags"))

# "|" is the regex alternation operator; regex=False makes the literal replacement
# explicit instead of relying on the pandas-version-dependent default.
movies["genres"] = movies["genres"].str.replace("|", " ", regex=False)

# Left join keeps movies without any tag; their NaN tags (and any other NaN cell)
# are turned into "" so the string concatenation below is always defined.
movies = movies.merge(tags_agg, on="movieId", how="left").fillna("")

# Text document per movie that the TF-IDF vectorizer is fitted on.
movies["content"] = movies["genres"] + " " + movies["tags"]

# ------------------------------------------------------------------
# 3. Content-based: FAISS index on TF-IDF  (FIXED)
# ------------------------------------------------------------------
# TF-IDF over the per-movie "content" text (genres + tags), capped at 5 000 terms.
vectorizer = TfidfVectorizer(max_features=5_000, stop_words='english')
tfidf_sparse = vectorizer.fit_transform(movies["content"])      # sparse matrix

# Cast to float32 while the matrix is still sparse, then densify. This avoids the
# intermediate full-size float64 np.matrix that `.todense()` would allocate before
# the float32 copy. ascontiguousarray is a no-op when the layout is already C-order.
tfidf_mat = np.ascontiguousarray(tfidf_sparse.astype(np.float32).toarray())

# Rows are L2-normalised in place before being added, so the inner product
# computed by IndexFlatIP equals cosine similarity.
index = faiss.IndexFlatIP(tfidf_mat.shape[1])
faiss.normalize_L2(tfidf_mat)
index.add(tfidf_mat)

def content_recommend(movie_id, k=5):
    """Return up to ``k`` movies whose TF-IDF content is closest to ``movie_id``.

    Parameters
    ----------
    movie_id : value compared against ``movies["movieId"]``
        The movie to find neighbours for.
    k : int, default 5
        Maximum number of neighbours to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` (original index kept) with columns ``movieId`` and
        ``title``, ordered as returned by the FAISS search. The query movie
        itself is never included.

    Raises
    ------
    IndexError
        If ``movie_id`` does not occur in ``movies["movieId"]``.
    """
    # Positional row lookup: tfidf_mat and the FAISS index are addressed by row
    # position, not by the DataFrame's index labels.
    positions = np.flatnonzero((movies["movieId"] == movie_id).to_numpy())
    if positions.size == 0:
        raise IndexError(f"movieId {movie_id!r} not found in movies")
    pos = int(positions[0])

    # Copy the row: normalize_L2 works in place and a plain slice would be a view
    # into tfidf_mat (the data backing the index was added from that array).
    query = tfidf_mat[pos:pos + 1].copy()
    faiss.normalize_L2(query)

    # Ask for one extra hit so that k results remain after removing the query.
    _, I = index.search(query, k + 1)

    # Drop the query movie explicitly instead of assuming it is the first hit:
    # movies with identical content vectors tie with it and may be ranked ahead.
    # -1 marks "no result" padding and must not be used as a row position.
    hits = [int(i) for i in I[0] if i != -1 and i != pos][:k]
    return movies.iloc[hits][["movieId", "title"]]

# ------------------------------------------------------------------
# 4. Collaborative: quick SVD
# ------------------------------------------------------------------
# Wrap the (user, item, rating) triples in a Surprise dataset; the reader declares
# the rating scale as 0.5 .. 5.0.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(
    ratings.loc[:, ["userId", "movieId", "rating"]],
    reader,
)

# No hold-out split: the model is trained on every available rating.
trainset = data.build_full_trainset()

# 50 latent factors; fixed random_state so repeated runs give the same model.
svd = SVD(n_factors=50, random_state=42)
svd.fit(trainset)

def collab_recommend(user_id, k=5):
    """Return the ``k`` unseen movies with the highest SVD-predicted rating.

    Every movie in ``movies`` that ``user_id`` has not rated in ``ratings`` is
    scored with ``svd.predict``; the result is a DataFrame with columns
    ``movieId`` and ``title`` ordered from highest to lowest estimate.
    """
    already_rated = set(ratings.loc[ratings["userId"] == user_id, "movieId"])

    # Score each candidate the user has not rated yet.
    predictions = []
    for candidate in movies["movieId"]:
        if candidate in already_rated:
            continue
        predictions.append(svd.predict(user_id, candidate))

    # Stable descending sort on the estimated rating, then keep the best k.
    ranked = sorted(predictions, key=lambda pred: pred.est, reverse=True)

    rows = []
    for pred in ranked[:k]:
        title = movies.loc[movies["movieId"] == pred.iid, "title"].iloc[0]
        rows.append((int(pred.iid), title))
    return pd.DataFrame(rows, columns=["movieId", "title"])

# ------------------------------------------------------------------
# 5. Hybrid: simple 50-50 score merge
# ------------------------------------------------------------------
def hybrid_recommend(user_id, k=5):
    """Blend collaborative and content-based scores 50/50 and return the top ``k``.

    Parameters
    ----------
    user_id : value compared against ``ratings["userId"]``
        The user to recommend for.
    k : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId`` and ``title``, best blended score first. Only movies
        the user has not rated are considered. If the user has no rating >= 4
        the content part is empty and the ranking is purely collaborative; if
        there are no unseen movies the frame is empty.
    """
    seen = set(ratings[ratings["userId"] == user_id]["movieId"])
    cands = [iid for iid in movies["movieId"] if iid not in seen]

    # Collaborative score: SVD-estimated rating for every candidate.
    collab_scores = {iid: svd.predict(user_id, iid).est for iid in cands}

    # Content score: for the last 10 movies the user rated >= 4, count how often
    # each movie appears among that movie's 50 nearest content neighbours.
    liked = list(ratings[(ratings.userId == user_id) & (ratings.rating >= 4)].movieId.tail(10))
    content_scores = defaultdict(float)
    for lid in liked:
        for mid in content_recommend(lid, k=50)["movieId"]:
            content_scores[mid] += 1.0   # crude count

    def norm(d):
        """Min-max scale the values of ``d`` to [0, 1); empty input gives {}."""
        # min()/max() raise ValueError on an empty sequence, which happens when
        # the user has no liked movies or no unseen candidates.
        if not d:
            return {}
        lo, hi = min(d.values()), max(d.values())
        # The epsilon avoids division by zero when all values are equal.
        # NOTE(review): the minimum value maps to 0, i.e. the same score a movie
        # absent from the dict gets below via .get(iid, 0) — confirm intended.
        return {key: (val - lo) / (hi - lo + 1e-8) for key, val in d.items()}

    cs, cls = norm(content_scores), norm(collab_scores)
    hybrid = {iid: 0.5 * cs.get(iid, 0) + 0.5 * cls.get(iid, 0) for iid in cands}
    top = sorted(hybrid.items(), key=lambda x: x[1], reverse=True)[:k]
    return pd.DataFrame([(int(iid), movies[movies.movieId == iid].title.iloc[0]) for iid, _ in top],
                        columns=["movieId", "title"])

# ------------------------------------------------------------------
# 6. Demo
# ------------------------------------------------------------------
if __name__ == "__main__":
    # Pick one user at random and show all three recommenders side by side.
    user = random.choice(ratings.userId.unique())
    print(f"--- Recommendations for user {user} ---")
    print("Content-based (likes similar to last high-rated):")
    # The content demo needs a seed movie: the user's last rating of 4 or more.
    # Not every user has one, and .iloc[-1] on an empty Series raises IndexError.
    liked_ids = ratings.loc[(ratings.userId == user) & (ratings.rating >= 4), "movieId"]
    if liked_ids.empty:
        print("(user has no ratings >= 4; nothing to base content recommendations on)")
    else:
        print(content_recommend(liked_ids.iloc[-1]))
    print("\nCollaborative (SVD):")
    print(collab_recommend(user))
    print("\nHybrid (50-50 blend):")
    print(hybrid_recommend(user))

--- Recommendations for user 77549 ---
Content-based (likes similar to last high-rated):
       movieId                                              title
2469      2554                      Children of the Damned (1963)
2468      2553                       Village of the Damned (1960)
5087      5183  He Knows You're Alone (a.k.a. Blood Wedding) (...
17462    88015                                       Elvis (1979)
13420    66171                                        Push (2009)

Collaborative (SVD):
   movieId                                              title
0      232        Eat Drink Man Woman (Yin shi nan nu) (1994)
1      527                            Schindler's List (1993)
2       50                         Usual Suspects, The (1995)
3      265  Like Water for Chocolate (Como agua para choco...
4      318                   Shawshank Redemption, The (1994)

Hybrid (50-50 blend):
   movieId                                              title
0      232        Eat Drink Man Woma