In [2]:
import pandas as pd
import os

In [3]:


file_path = r"C:\Users\pricc\Downloads\movies_enriched_full.csv"
df = pd.read_csv(file_path, header=None)
print(df.head())


movie_df = ("movies_enriched_full.csv")

# File paths
ratings_path = r"C:\Users\pricc\Downloads\ratings.dat"
users_path = r"C:\Users\pricc\Downloads\users.dat"

# Load ratings.dat
ratings = pd.read_csv(
    ratings_path,
    sep="::",
    engine="python",
    names=["userId", "movieId", "rating", "timestamp"]
)

# Load users.dat
users = pd.read_csv(
    users_path,
    sep="::",
    engine="python",
    names=["userId", "gender", "age", "occupation", "zip"]
)

# Preview the data
print("Ratings:")
print(ratings.head())
print("\nUsers:")
print(users.head())


        0                         1                             2     3   \
0  movieId                     title                        genres  year   
1        1          Toy Story (1995)   Animation|Children's|Comedy  1995   
2        2            Jumanji (1995)  Adventure|Children's|Fantasy  1995   
3        3   Grumpier Old Men (1995)                Comedy|Romance  1995   
4        4  Waiting to Exhale (1995)                  Comedy|Drama  1995   

                  4        5   \
0        clean_title  tmdb_id   
1          Toy Story    862.0   
2            Jumanji   8844.0   
3   Grumpier Old Men  15602.0   
4  Waiting to Exhale  31357.0   

                                                  6   \
0                                           overview   
1  Led by Woody, Andy's toys live happily in his ...   
2  When siblings Judy and Peter discover an encha...   
3  A family wedding reignites the ancient feud be...   
4  Cheated on, mistreated and stepped on, the wom...   

       

In [26]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os

# ----------------------------------------
# Load Enriched Movie Data
# ----------------------------------------
df = pd.read_csv("movies_enriched_full.csv")

# ----------------------------------------
# SCombine Metadata Fields
# ----------------------------------------

def combine_metadata(row):
    return " ".join([
        str(row["tmdb_genres"]) if pd.notnull(row["tmdb_genres"]) else "",
        str(row["keywords"]) if pd.notnull(row["keywords"]) else "",
        str(row["top_3_cast"]) if pd.notnull(row["top_3_cast"]) else "",
        str(row["directors"]) if pd.notnull(row["directors"]) else ""
    ]).lower().replace(",", " ").replace(":", " ").replace("-", " ")

df["metadata"] = df.apply(combine_metadata, axis=1)

# ----------------------------------------
#  Build Vectorizers
# ----------------------------------------

# Count Vectorizer
count_vectorizer = CountVectorizer(stop_words='english')
count_matrix = count_vectorizer.fit_transform(df["metadata"])

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df["metadata"])

# ----------------------------------------
#  Compute Cosine Similarity
# ----------------------------------------

cosine_sim_count = cosine_similarity(count_matrix)
cosine_sim_tfidf = cosine_similarity(tfidf_matrix)

# ----------------------------------------
#  Save Results
# ----------------------------------------

# Save similarity matrices as NumPy arrays
np.save("cosine_sim_count.npy", cosine_sim_count)
np.save("cosine_sim_tfidf.npy", cosine_sim_tfidf)

# Optional: Save similarity matrices as CSVs
pd.DataFrame(cosine_sim_count, index=df["title"], columns=df["title"]).to_csv("cosine_sim_count.csv")
pd.DataFrame(cosine_sim_tfidf, index=df["title"], columns=df["title"]).to_csv("cosine_sim_tfidf.csv")

print(" Models built and similarity matrices saved.")

 Models built and similarity matrices saved.


In [25]:
def recommend_movies(title, similarity_matrix, df, top_n=10):
    idx = df[df["title"] == title].index[0]
    sim_scores = list(enumerate(similarity_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    similar_movies = [(df.iloc[i]["title"], score) for i, score in sim_scores]
    return similar_movies

# Example: Recommend movies similar to "Toy Story (1995)"
print("\nTop 5 similar movies using TF-IDF:")
for movie, score in recommend_movies("Toy Story (1995)", cosine_sim_tfidf, df, top_n=5):
    print(f"{movie} (Score: {score:.4f})")


Top 5 similar movies using TF-IDF:
Small Soldiers (1998) (Score: 0.3903)
Toy Story 2 (1999) (Score: 0.3539)
Indian in the Cupboard, The (1995) (Score: 0.3101)
Toys (1992) (Score: 0.2656)
Babes in Toyland (1961) (Score: 0.2394)


CountVectorizer

In [19]:
import pandas as pd
import numpy as np

# Load movie data and similarity matrices
df = pd.read_csv("movies_enriched_full.csv")
cosine_sim_count = np.load("cosine_sim_count.npy")
cosine_sim_tfidf = np.load("cosine_sim_tfidf.npy")
print(" Cosine Similarity Matrix (CountVectorizer):")
print(pd.DataFrame(cosine_sim_count, index=df["title"], columns=df["title"]).iloc[:5, :5])

 Cosine Similarity Matrix (CountVectorizer):
title                               Toy Story (1995)  Jumanji (1995)  \
title                                                                  
Toy Story (1995)                            1.000000        0.054433   
Jumanji (1995)                              0.054433        1.000000   
Grumpier Old Men (1995)                     0.054433        0.000000   
Waiting to Exhale (1995)                    0.066227        0.064889   
Father of the Bride Part II (1995)          0.046676        0.034300   

title                               Grumpier Old Men (1995)  \
title                                                         
Toy Story (1995)                                   0.054433   
Jumanji (1995)                                     0.000000   
Grumpier Old Men (1995)                            1.000000   
Waiting to Exhale (1995)                           0.097333   
Father of the Bride Part II (1995)                 0.068599   

title   

Toy story has 1.000000 self-similarity to Toy Story

0.054433 is how similar Toy Story is to Jumanji

0.000000 means no content overlap between the movies for example Grumpier Old Men and Jumanji have no overlapping keywords/cast/etc.

Waiting to Exhale has 6.6% similarity to Toy Story under CountVectoriz

TF-IDF

In [18]:
print("\n Cosine Similarity Matrix (TF-IDF):")
print(pd.DataFrame(cosine_sim_tfidf, index=df["title"], columns=df["title"]).iloc[:5, :5])



 Cosine Similarity Matrix (TF-IDF):
title                               Toy Story (1995)  Jumanji (1995)  \
title                                                                  
Toy Story (1995)                            1.000000        0.014150   
Jumanji (1995)                              0.014150        1.000000   
Grumpier Old Men (1995)                     0.022040        0.000000   
Waiting to Exhale (1995)                    0.028133        0.017912   
Father of the Bride Part II (1995)          0.009242        0.009244   

title                               Grumpier Old Men (1995)  \
title                                                         
Toy Story (1995)                                   0.022040   
Jumanji (1995)                                     0.000000   
Grumpier Old Men (1995)                            1.000000   
Waiting to Exhale (1995)                           0.017067   
Father of the Bride Part II (1995)                 0.024845   

title           

Print Top 5 Similar Movies for a Given Title

In [24]:
def print_recommendations(title, similarity_matrix, df, top_n=5):
    idx = df[df["title"] == title].index[0]
    sim_scores = list(enumerate(similarity_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    print(f"\n Top {top_n} similar movies to '{title}':")
    for i, (movie_idx, score) in enumerate(sim_scores, 1):
        print(f"{i}. {df.iloc[movie_idx]['title']} (Similarity: {score:.4f})")

In [23]:
print('TF-IDF Recommendations')
print_recommendations("Toy Story (1995)", cosine_sim_tfidf, df, top_n=5)

print(' CountVectorizer Recommendations')
print_recommendations("Toy Story (1995)", cosine_sim_count, df, top_n=5)

TF-IDF Recommendations

 Top 5 similar movies to 'Toy Story (1995)':
1. Small Soldiers (1998) (Similarity: 0.3903)
2. Toy Story 2 (1999) (Similarity: 0.3539)
3. Indian in the Cupboard, The (1995) (Similarity: 0.3101)
4. Toys (1992) (Similarity: 0.2656)
5. Babes in Toyland (1961) (Similarity: 0.2394)
 CountVectorizer Recommendations

 Top 5 similar movies to 'Toy Story (1995)':
1. Toy Story 2 (1999) (Similarity: 0.4518)
2. Small Soldiers (1998) (Similarity: 0.3790)
3. Indian in the Cupboard, The (1995) (Similarity: 0.2887)
4. Big (1988) (Similarity: 0.2502)
5. Babes in Toyland (1961) (Similarity: 0.2485)


In [4]:
# Memory-Based Collaborative Filtering (Bias-Normalized)

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances

# --- Step 1: Create Mean-Centered User-Item Matrix ---
def create_normalized_user_item_matrix(ratings):
    matrix = ratings.pivot(index='userId', columns='movieId', values='rating')
    user_means = matrix.mean(axis=1)
    return matrix.sub(user_means, axis=0).fillna(0), user_means

# --- Step 2: Compute Cosine Similarity ---
def compute_similarity(matrix, kind='user'):
    if kind == 'user':
        sim = 1 - pairwise_distances(matrix, metric='cosine')
    elif kind == 'item':
        sim = 1 - pairwise_distances(matrix.T, metric='cosine')
    else:
        raise ValueError("kind must be 'user' or 'item'")
    return sim

# --- Step 3: Recommend Items ---
def recommend_memory_based(user_id, user_item_matrix, user_means, similarity_matrix, kind='user', top_n=10):
    model_label = f"{kind.title()}-Based CF"

    if kind == 'user':
        user_sim_scores = similarity_matrix[user_id - 1]
        normalized_ratings = user_item_matrix.values
        weighted_scores = user_sim_scores @ normalized_ratings
        sum_weights = np.abs(user_sim_scores).sum()
        if sum_weights == 0:
            return pd.DataFrame(columns=['userId', 'movieId', 'score', 'model'])
        predicted_ratings = weighted_scores / sum_weights
        user_seen = user_item_matrix.loc[user_id]
        unseen_mask = user_seen == 0
        recs = pd.Series(predicted_ratings, index=user_item_matrix.columns)[unseen_mask]
        recs += user_means.loc[user_id]

    elif kind == 'item':
        user_ratings = user_item_matrix.loc[user_id]
        scores = user_ratings @ similarity_matrix
        sum_weights = (user_ratings != 0) @ np.abs(similarity_matrix)
        with np.errstate(divide='ignore', invalid='ignore'):
            predicted_ratings = np.true_divide(scores, sum_weights)
            predicted_ratings[sum_weights == 0] = 0
        unseen_mask = user_ratings == 0
        recs = pd.Series(predicted_ratings, index=user_item_matrix.columns)[unseen_mask]
    else:
        raise ValueError("kind must be 'user' or 'item'")

    recs = recs.sort_values(ascending=False).head(top_n)
    return pd.DataFrame({
        'userId': [user_id] * len(recs),
        'movieId': recs.index,
        'score': recs.values,
        'model': model_label
    })

# --- Step 4: Evaluation Metrics ---
def compute_rmse(preds, truth):
    mask = ~truth.isna()
    mse = ((truth[mask] - preds[mask]) ** 2).mean().mean()
    return np.sqrt(mse)

def precision_recall_ndcg_at_k(preds, truth, k=10, threshold=4.0):
    precisions, recalls, ndcgs = [], [], []

    for uid in preds.index:
        if uid not in truth.index:
            continue
        pred_scores = preds.loc[uid].dropna().sort_values(ascending=False).head(k)
        actual_ratings = truth.loc[uid]

        relevant = actual_ratings[actual_ratings >= threshold].index
        recommended = pred_scores.index

        true_positives = len(set(recommended) & set(relevant))
        precisions.append(true_positives / k)
        recalls.append(true_positives / len(relevant) if len(relevant) else 0)

        dcg = sum([
            1 / np.log2(i + 2) if rec in relevant else 0
            for i, rec in enumerate(recommended)
        ])
        idcg = sum([1 / np.log2(i + 2) for i in range(min(len(relevant), k))])
        ndcgs.append(dcg / idcg if idcg > 0 else 0)

    return np.mean(precisions), np.mean(recalls), np.mean(ndcgs)

# --- Step 5: Helper to Convert to Prediction Matrix ---
def to_pred_matrix(recs_df, users, items):
    matrix = pd.DataFrame(index=users, columns=items)
    for _, row in recs_df.iterrows():
        matrix.at[row['userId'], row['movieId']] = row['score']
    return matrix.astype(float)

# === Load Ratings ===
ratings = pd.read_csv(
    ratings_path,
    sep="::",
    engine="python",
    names=["userId", "movieId", "rating", "timestamp"]
)

# === Build Matrix & Similarities ===
user_item_matrix, user_means = create_normalized_user_item_matrix(ratings)
user_sim_matrix = compute_similarity(user_item_matrix, kind='user')
item_sim_matrix = compute_similarity(user_item_matrix, kind='item')

# === Generate Recommendations for All Users ===
user_recs_all = []
for uid in ratings['userId'].unique()[:100]:
    recs = recommend_memory_based(uid, user_item_matrix, user_means, user_sim_matrix, kind='user', top_n=10)
    user_recs_all.append(recs)
user_recs_df = pd.concat(user_recs_all, ignore_index=True)

# === Convert to Prediction Matrix ===
all_users = ratings['userId'].unique()
all_movies = ratings['movieId'].unique()
pred_user_cf = to_pred_matrix(user_recs_df, all_users, all_movies)

# === Build Truth Matrix ===
truth = ratings.pivot(index='userId', columns='movieId', values='rating')

# === Evaluate ===
rmse_user = compute_rmse(pred_user_cf, truth)
prec_user, rec_user, ndcg_user = precision_recall_ndcg_at_k(pred_user_cf, truth, k=10)

print(f"User-Based CF Evaluation:")
print(f"  RMSE: {rmse_user:.4f}")
print(f"  Precision@10: {prec_user:.4f}")
print(f"  Recall@10: {rec_user:.4f}")
print(f"  NDCG@10: {ndcg_user:.4f}")

User-Based CF Evaluation:
  RMSE: nan
  Precision@10: 0.0000
  Recall@10: 0.0000
  NDCG@10: 0.0000


In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
import time

# ==============================
# Load and Subset Data
# ==============================

ratings =  pd.read_csv(
    ratings_path,
    sep="::",
    engine="python",
    names=["userId", "movieId", "rating", "timestamp"]
)

movies = pd.read_csv(file_path)[['movieId', 'title']]

# TEMP: Use a smaller subset for faster testing
subset_users = ratings['userId'].value_counts().head(500).index
subset_movies = ratings['movieId'].value_counts().head(500).index
ratings_small = ratings[ratings['userId'].isin(subset_users) & ratings['movieId'].isin(subset_movies)]

# ==============================
# Step 1: Create Bias-Normalized Matrix
# ==============================

def create_normalized_user_item_matrix(ratings):
    matrix = ratings.pivot(index='userId', columns='movieId', values='rating')
    user_means = matrix.mean(axis=1)
    return matrix.sub(user_means, axis=0).fillna(0), user_means

user_item_matrix, user_means = create_normalized_user_item_matrix(ratings_small)

# ==============================
# Step 2: Compute Similarity Matrices
# ==============================

def compute_similarity(matrix, kind='user'):
    if kind == 'user':
        sim = 1 - pairwise_distances(matrix, metric='cosine')
    elif kind == 'item':
        sim = 1 - pairwise_distances(matrix.T, metric='cosine')
    else:
        raise ValueError("kind must be 'user' or 'item'")
    print(f"{kind.title()}-based similarity computed. Shape: {sim.shape}")
    return sim

start = time.time()
user_sim_matrix = compute_similarity(user_item_matrix, kind='user')
print(f"User similarity computed in {time.time() - start:.2f} sec")

start = time.time()
item_sim_matrix = compute_similarity(user_item_matrix, kind='item')
print(f"Item similarity computed in {time.time() - start:.2f} sec")

# ==============================
# Step 3: Recommendation Function
# ==============================

def recommend_memory_based(user_id, user_item_matrix, user_means, similarity_matrix, kind='user', top_n=50):
    model_label = f"{kind.title()}-Based CF"

    if user_id not in user_item_matrix.index:
        print(f"User {user_id} not in matrix.")
        return pd.DataFrame(columns=['movieId', 'score', 'model'])

    if kind == 'user':
        user_sim_scores = similarity_matrix[user_item_matrix.index.get_loc(user_id)]
        normalized_ratings = user_item_matrix.values

        weighted_scores = user_sim_scores @ normalized_ratings
        sum_weights = np.abs(user_sim_scores).sum()

        if sum_weights == 0:
            print("No similar users found.")
            return pd.DataFrame(columns=['movieId', 'score', 'model'])

        predicted_ratings = weighted_scores / sum_weights
        user_seen = user_item_matrix.loc[user_id]
        unseen_mask = user_seen == 0
        recs = pd.Series(predicted_ratings, index=user_item_matrix.columns)[unseen_mask]\
            .sort_values(ascending=False).head(top_n)

        recs += user_means.loc[user_id]

    elif kind == 'item':
        user_ratings = user_item_matrix.loc[user_id]
        scores = user_ratings @ similarity_matrix
        sum_weights = (user_ratings != 0) @ np.abs(similarity_matrix)

        with np.errstate(divide='ignore', invalid='ignore'):
            predicted_ratings = np.true_divide(scores, sum_weights)
            predicted_ratings[sum_weights == 0] = 0

        unseen_mask = user_ratings == 0
        recs = pd.Series(predicted_ratings, index=user_item_matrix.columns)[unseen_mask]\
            .sort_values(ascending=False).head(top_n)

    else:
        raise ValueError("kind must be 'user' or 'item'") 

    return pd.DataFrame({
        'movieId': recs.index,
        'score': recs.values,
        'model': model_label
    })

# ==============================
# Step 4: Generate and Merge Recommendations
# ==============================

test_user_id = user_item_matrix.index[0]  # Just pick the first user in the small set

user_cf_recs = recommend_memory_based(
    test_user_id, user_item_matrix, user_means, user_sim_matrix, kind='user', top_n=50
)

item_cf_recs = recommend_memory_based(
    test_user_id, user_item_matrix, user_means, item_sim_matrix, kind='item', top_n=50
)

user_cf_recs = user_cf_recs.merge(movies, on="movieId", how="left")
item_cf_recs = item_cf_recs.merge(movies, on="movieId", how="left")

# ==============================
# Step 5: Display
# ==============================

print(f"\nTop 50 User-Based CF Recommendations for User {test_user_id}:")
print(user_cf_recs[['movieId', 'title', 'score']].head())

print(f"\nTop 50 Item-Based CF Recommendations for User {test_user_id}:")
print(item_cf_recs[['movieId', 'title', 'score']].head())


User-based similarity computed. Shape: (500, 500)
User similarity computed in 0.03 sec
Item-based similarity computed. Shape: (500, 500)
Item similarity computed in 0.02 sec

Top 50 User-Based CF Recommendations for User 48:
   movieId                                              title     score
0      750  Dr. Strangelove or: How I Learned to Stop Worr...  3.818245
1      904                                 Rear Window (1954)  3.756734
2      908                          North by Northwest (1959)  3.723104
3     1089                              Reservoir Dogs (1992)  3.708471
4     1234                                  Sting, The (1973)  3.702266

Top 50 Item-Based CF Recommendations for User 48:
   movieId                         title     score
0      953  It's a Wonderful Life (1946)  0.433139
1     1272                 Patton (1970)  0.418386
2     1961               Rain Man (1988)  0.416326
3     2329     American History X (1998)  0.413722
4     1234             Sting, The (19

In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

# Load ratings
ratings =  pd.read_csv(
    ratings_path,
    sep="::",
    engine="python",
    names=["userId", "movieId", "rating", "timestamp"]
)

# ==============================
# Subset: Top 500 Users & Movies
# ==============================
subset_users = ratings['userId'].value_counts().head(500).index
subset_movies = ratings['movieId'].value_counts().head(500).index
ratings_small = ratings[ratings['userId'].isin(subset_users) & ratings['movieId'].isin(subset_movies)]

# ==============================
# Create Bias-Normalized Matrix
# ==============================
def create_normalized_user_item_matrix(ratings):
    matrix = ratings.pivot(index='userId', columns='movieId', values='rating')
    user_means = matrix.mean(axis=1)
    return matrix.sub(user_means, axis=0).fillna(0), user_means

user_item_matrix, user_means = create_normalized_user_item_matrix(ratings_small)

# ==============================
# Similarity Calculation
# ==============================
def compute_similarity(matrix, kind='user'):
    if kind == 'user':
        sim = 1 - pairwise_distances(matrix, metric='cosine')
    elif kind == 'item':
        sim = 1 - pairwise_distances(matrix.T, metric='cosine')
    else:
        raise ValueError("kind must be 'user' or 'item'")
    return sim

user_sim_matrix = compute_similarity(user_item_matrix, kind='user')
item_sim_matrix = compute_similarity(user_item_matrix, kind='item')

# ==============================
# Recommend Function
# ==============================
def recommend_memory_based(user_id, user_item_matrix, user_means, similarity_matrix, kind='user', top_n=10):
    model_label = f"{kind.title()}-Based CF"

    if kind == 'user':
        user_sim_scores = similarity_matrix[user_item_matrix.index.get_loc(user_id)]
        normalized_ratings = user_item_matrix.values

        weighted_scores = user_sim_scores @ normalized_ratings
        sum_weights = np.abs(user_sim_scores).sum()
        if sum_weights == 0:
            return pd.DataFrame(columns=['movieId', 'score', 'model'])

        predicted_ratings = weighted_scores / sum_weights
        user_seen = user_item_matrix.loc[user_id]
        unseen_mask = user_seen == 0
        recs = pd.Series(predicted_ratings, index=user_item_matrix.columns)[unseen_mask] \
            .sort_values(ascending=False).head(top_n)
        recs += user_means.loc[user_id]

    elif kind == 'item':
        user_ratings = user_item_matrix.loc[user_id]
        scores = user_ratings @ similarity_matrix
        sum_weights = (user_ratings != 0) @ np.abs(similarity_matrix)
        predicted_ratings = np.divide(scores, sum_weights, out=np.zeros_like(scores), where=sum_weights != 0)
        unseen_mask = user_ratings == 0
        recs = pd.Series(predicted_ratings, index=user_item_matrix.columns)[unseen_mask] \
            .sort_values(ascending=False).head(top_n)

    return pd.DataFrame({
        'movieId': recs.index,
        'score': recs.values,
        'model': model_label
    })

# ==============================
# Run Example
# ==============================
example_user_id = user_item_matrix.index[0]  # pick any from subset
user_cf_recs = recommend_memory_based(example_user_id, user_item_matrix, user_means, user_sim_matrix, kind='user')
item_cf_recs = recommend_memory_based(example_user_id, user_item_matrix, user_means, item_sim_matrix, kind='item')

print("User-Based CF Recommendations:")
print(user_cf_recs.head())

print("\nItem-Based CF Recommendations:")
print(item_cf_recs.head())

User-Based CF Recommendations:
   movieId     score          model
0      750  3.818245  User-Based CF
1      904  3.756734  User-Based CF
2      908  3.723104  User-Based CF
3     1089  3.708471  User-Based CF
4     1234  3.702266  User-Based CF

Item-Based CF Recommendations:
   movieId     score          model
0      953  0.433139  Item-Based CF
1     1272  0.418386  Item-Based CF
2     1961  0.416326  Item-Based CF
3     2329  0.413722  Item-Based CF
4     1234  0.413218  Item-Based CF


In [11]:
from math import sqrt
from sklearn.metrics import mean_squared_error
import numpy as np

from math import sqrt
from sklearn.metrics import mean_squared_error
import numpy as np

def compute_rmse(preds_df, truth_df):
    # Align both dataframes by index and columns
    preds_aligned, truth_aligned = preds_df.align(truth_df, join='inner', axis=0)
    preds_aligned, truth_aligned = preds_aligned.align(truth_df, join='inner', axis=1)

    # Flatten and keep only pairs where both values are not NaN
    mask = (~preds_aligned.isna()) & (~truth_aligned.isna())
    y_true = truth_aligned[mask].values
    y_pred = preds_aligned[mask].values

    if y_true.size == 0:
        print("No overlapping ratings to compare.")
        return np.nan

    return sqrt(mean_squared_error(y_true, y_pred))


def precision_recall_ndcg_at_k(preds, truth, k=10):
    """
    Compute Precision@K, Recall@K, and NDCG@K.
    """
    precisions, recalls, ndcgs = [], [], []

    for user in truth.index:
        true_ratings = truth.loc[user].dropna()
        if true_ratings.empty:
            continue

        true_top_k = set(true_ratings.sort_values(ascending=False).head(k).index)
        pred_top_k = set(preds.loc[user].sort_values(ascending=False).head(k).index)

        tp = len(true_top_k & pred_top_k)
        prec = tp / k
        rec = tp / len(true_top_k) if true_top_k else 0

        dcg = sum([1 / np.log2(i+2) if m in true_top_k else 0 for i, m in enumerate(preds.loc[user].sort_values(ascending=False).head(k).index)])
        idcg = sum([1 / np.log2(i+2) for i in range(min(k, len(true_top_k)))])
        ndcg = dcg / idcg if idcg != 0 else 0

        precisions.append(prec)
        recalls.append(rec)
        ndcgs.append(ndcg)

    return np.mean(precisions), np.mean(recalls), np.mean(ndcgs)


In [12]:
def train_test_split_userwise(df, test_ratio=0.2):
    """
    For each user, split their ratings into train and test sets.
    """
    train_list = []
    test_list = []

    for user_id, user_df in df.groupby('userId'):
        user_df = user_df.sample(frac=1, random_state=42)
        test_size = int(len(user_df) * test_ratio)
        test_list.append(user_df.iloc[:test_size])
        train_list.append(user_df.iloc[test_size:])

    train_df = pd.concat(train_list)
    test_df = pd.concat(test_list)
    return train_df, test_df


In [13]:
# Split data
train_ratings, test_ratings = train_test_split_userwise(ratings_small, test_ratio=0.2)

# Create train matrix
user_item_train, user_means_train = create_normalized_user_item_matrix(train_ratings)
user_sim = compute_similarity(user_item_train, kind='user')
item_sim = compute_similarity(user_item_train, kind='item')

# Predict ratings for test users
test_users = test_ratings['userId'].unique()
pred_user_cf = pd.DataFrame(index=user_item_train.index, columns=user_item_train.columns)
pred_item_cf = pd.DataFrame(index=user_item_train.index, columns=user_item_train.columns)

for user in test_users:
    # User-based prediction
    recs_user = recommend_memory_based(user, user_item_train, user_means_train, user_sim, kind='user', top_n=len(user_item_train.columns))
    pred_user_cf.loc[user, recs_user['movieId']] = recs_user['score'].values

    # Item-based prediction
    recs_item = recommend_memory_based(user, user_item_train, user_means_train, item_sim, kind='item', top_n=len(user_item_train.columns))
    pred_item_cf.loc[user, recs_item['movieId']] = recs_item['score'].values

# Create ground truth matrix
truth_matrix = test_ratings.pivot(index='userId', columns='movieId', values='rating')

# Evaluate
rmse_user = compute_rmse(pred_user_cf, truth_matrix)
rmse_item = compute_rmse(pred_item_cf, truth_matrix)

prec_user, rec_user, ndcg_user = precision_recall_ndcg_at_k(pred_user_cf, truth_matrix, k=10)
prec_item, rec_item, ndcg_item = precision_recall_ndcg_at_k(pred_item_cf, truth_matrix, k=10)

print(f"User-based CF:\n RMSE={rmse_user:.4f}, Precision@10={prec_user:.4f}, Recall@10={rec_user:.4f}, NDCG@10={ndcg_user:.4f}")
print(f"Item-based CF:\n RMSE={rmse_item:.4f}, Precision@10={prec_item:.4f}, Recall@10={rec_item:.4f}, NDCG@10={ndcg_item:.4f}")


ValueError: Input contains NaN.