
# 🎬 Hybrid Recommender System (Content + Collaborative)

This notebook demonstrates a **Hybrid Recommender System** that combines **Content-Based Filtering** (TF-IDF over movie titles + Cosine Similarity) with **Collaborative Filtering** (SVD Matrix Factorization).  
We evaluate the collaborative model with **RMSE, Precision@K, and Recall@K**, and visualize the **Top-N hybrid recommendations**.

**Dataset:** MovieLens 100K


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

from collections import defaultdict


In [None]:

# Load the MovieLens 100K ratings: tab-separated, no header row in the file.
ratings = pd.read_csv(
    "https://files.grouplens.org/datasets/movielens/ml-100k/u.data",
    sep="\t",
    names=["userId", "movieId", "rating", "timestamp"],
)

# Load the item file (pipe-separated, latin-1 encoded, no header row) and keep
# only its first two columns, relabelled as the movie id and the title.
movies = pd.read_csv(
    "https://files.grouplens.org/datasets/movielens/ml-100k/u.item",
    sep="|",
    encoding="latin-1",
    header=None,
)[[0, 1]].rename(columns={0: "movieId", 1: "title"})

# Quick look at the first rows of each frame.
print(ratings.head())
print(movies.head())

# Histogram of the rating values, one bin per star level.
plt.figure(figsize=(6, 4))
sns.histplot(data=ratings, x="rating", bins=5, kde=False)
plt.title("Ratings Distribution")
plt.xlabel("Rating")
plt.ylabel("Count")
plt.show()


In [None]:

# Content features: TF-IDF over the movie titles only (kept deliberately
# simple); missing titles are treated as empty strings.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(movies["title"].fillna(""))

# Item-item similarity as the dot product of the TF-IDF rows with themselves.
# NOTE(review): this equals cosine similarity only while the vectorizer keeps
# its L2 row normalisation — confirm if `norm` is ever changed.
cosine_sim = linear_kernel(tfidf_matrix)
print("Cosine similarity matrix shape:", cosine_sim.shape)


In [None]:

# Wrap the ratings frame in a Surprise dataset; ratings are on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)

# Seed both the train/test split and the SVD initialisation so that RMSE,
# Precision@K / Recall@K and the recommendations further down are
# reproducible across "Restart Kernel -> Run All".
SEED = 42
trainset, testset = train_test_split(data, test_size=0.2, random_state=SEED)

# Train the matrix-factorisation model and predict the held-out ratings.
svd = SVD(random_state=SEED)
svd.fit(trainset)
predictions = svd.test(testset)

# RMSE on the held-out 20% (accuracy.rmse also prints the value).
rmse = accuracy.rmse(predictions)


In [None]:

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute precision@k and recall@k separately for every user.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_rating,
            estimated_rating, details)``; only the user id and the two
            ratings are used.
        k: number of highest-estimated items per user that count as
            "recommended".
        threshold: a rating (true or estimated) at or above this value counts
            as relevant / recommended.

    Returns:
        ``(precisions, recalls)``: two dicts keyed by user id. A user with no
        recommended item in the top k gets precision 0; a user with no
        relevant item gets recall 0.
    """
    # Collect (estimated, true) pairs per user.
    by_user = defaultdict(list)
    for prediction in predictions:
        uid, _, actual, estimated, _ = prediction
        by_user[uid].append((estimated, actual))

    precisions = {}
    recalls = {}
    for uid, pairs in by_user.items():
        # Rank the user's items from highest to lowest estimated rating.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items overall, recommended items in the top k, and items
        # that are both relevant and recommended in the top k.
        n_rel = sum((actual >= threshold) for (_, actual) in ranked)
        n_rec_k = sum((estimated >= threshold) for (estimated, _) in top_k)
        n_rel_and_rec_k = sum(
            ((actual >= threshold) and (estimated >= threshold))
            for (estimated, actual) in top_k
        )

        if n_rec_k != 0:
            precisions[uid] = n_rel_and_rec_k / n_rec_k
        else:
            precisions[uid] = 0

        if n_rel != 0:
            recalls[uid] = n_rel_and_rec_k / n_rel
        else:
            recalls[uid] = 0

    return precisions, recalls

# Average the per-user metrics over all users present in the test predictions.
precisions, recalls = precision_recall_at_k(predictions, k=10)
print("Precision@10:", sum(precisions.values()) / len(precisions))
print("Recall@10:", sum(recalls.values()) / len(recalls))


In [None]:

# Hybrid = 0.7*CF + 0.3*Content
def hybrid_score(user_id, movie_id, alpha=0.7):
    """Blend the collaborative and content scores for one (user, movie) pair.

    Args:
        user_id: raw user id passed to ``svd.predict``.
        movie_id: raw movie id; also used to locate the movie's row in
            ``cosine_sim`` via its position in ``movies``.
        alpha: weight of the collaborative score; the content score gets
            ``1 - alpha``.

    Returns:
        ``alpha * cf_score + (1 - alpha) * content_score``. ``cf_score`` is
        the SVD rating estimate and ``content_score`` is the mean of the
        movie's row in ``cosine_sim``. Either component falls back to 0 when
        it cannot be computed.

    NOTE(review): the content component is the movie's mean similarity to
    every title, so it does not depend on ``user_id``, and it is on a
    different scale from the SVD estimate — confirm this is the intended
    blend.
    """
    try:
        cf_score = svd.predict(user_id, movie_id).est
    except Exception:
        # Best-effort fallback kept from the original, but no longer a bare
        # ``except`` so KeyboardInterrupt / SystemExit can stop the loop that
        # calls this for every movie.
        cf_score = 0

    try:
        # Use the row *position* in ``movies`` (not its index label), because
        # ``cosine_sim`` is a plain array indexed positionally.
        row = np.flatnonzero((movies["movieId"] == movie_id).to_numpy())[0]
        content_score = np.mean(cosine_sim[row])
    except IndexError:
        # Unknown movie id (no matching row) or a row outside the matrix.
        content_score = 0

    return alpha * cf_score + (1 - alpha) * content_score


In [None]:

def get_top_n_recommendations(user_id, n=10, alpha=0.7):
    """Return the ``n`` highest-scoring movies the user has not rated yet.

    Args:
        user_id: user to recommend for; movies this user has rated in
            ``ratings`` are excluded.
        n: number of recommendations to return.
        alpha: collaborative weight forwarded to ``hybrid_score``.

    Returns:
        A list of ``(movieId, hybrid_score, title)`` tuples sorted by score,
        highest first.
    """
    rated = set(ratings.loc[ratings["userId"] == user_id, "movieId"].values)

    # Build the id -> title lookup once instead of scanning ``movies`` for
    # every returned item; ``setdefault`` keeps the first title per id.
    titles = {}
    for mid, title in zip(movies["movieId"].values, movies["title"].values):
        titles.setdefault(mid, title)

    # Skip already-rated movies *before* scoring so no hybrid score is
    # computed only to be thrown away.
    scores = [
        (mid, hybrid_score(user_id, mid, alpha=alpha))
        for mid in movies["movieId"].values
        if mid not in rated
    ]
    top_movies = sorted(scores, key=lambda pair: pair[1], reverse=True)[:n]

    return [(mid, sc, titles[mid]) for mid, sc in top_movies]

# Example: the ten best hybrid recommendations for user 1, printed as
# "movieId | score | title" rows.
top_n = get_top_n_recommendations(1, n=10, alpha=0.7)
print("Top-10 Hybrid Recommendations for User 1:")
for recommendation in top_n:
    mid, sc, title = recommendation
    print(f"{mid:4d} | {sc:.4f} | {title}")


In [None]:

# Horizontal bar chart of the hybrid scores for user 1's top-N list.
# Each entry of top_n is (movieId, score, title).
movie_titles = [rec[2] for rec in top_n]
scores = [rec[1] for rec in top_n]

plt.figure(figsize=(8, 5))
sns.barplot(x=scores, y=movie_titles)
plt.title("Top-10 Hybrid Recommendations (User 1)")
plt.xlabel("Hybrid Score")
plt.ylabel("Movie Title")
plt.show()


In [None]:

# Side-by-side bars: pure SVD estimate vs. hybrid score for each of user 1's
# top-N movies. Each entry of top_n is (movieId, score, title).
cf_scores = [svd.predict(1, rec[0]).est for rec in top_n]
hy_scores = [rec[1] for rec in top_n]

df_compare = pd.DataFrame(
    {
        "Movie": movie_titles,
        "CF": cf_scores,
        "Hybrid": hy_scores,
    }
)

df_compare.plot(kind="bar", x="Movie", figsize=(10, 5))
plt.title("CF vs Hybrid Scores for User 1")
plt.ylabel("Score")
plt.xticks(rotation=45, ha="right")
plt.show()
