In [1]:
# Data from https://grouplens.org/datasets/movielens/latest/
# Put the downloaded csv files in the "Downloads" folder

import os

# Later cells read ratings.csv / movies.csv relative to the working directory,
# so switch into "Downloads". Only do it once: re-running this cell in a live
# kernel would otherwise look for Downloads/Downloads and raise FileNotFoundError.
if os.path.basename(os.getcwd()) != "Downloads":
    os.chdir("Downloads")
print(os.getcwd())


c:\Users\vonks\Documents\GitHub\Movie_Recs\Downloads


In [2]:
import pandas as pd

# Raw MovieLens tables, read from the current working directory.
ratings = pd.read_csv("ratings.csv")
movies = pd.read_csv("movies.csv")

# One row per rating with the movie's metadata attached; the left join keeps
# every rating row even when its movieId has no entry in movies.csv.
data = pd.merge(ratings, movies, on="movieId", how="left")

In [4]:
# Users with more than 1,000 ratings are treated as likely bots or outliers and dropped.
# See Data_Exploration.ipynb for more details about cleaning and outlier detection.

ratings_per_user = data.groupby('userId').size()
is_outlier = ratings_per_user > 1000  # alternative cut-off that was considered: 10,000
outliers = ratings_per_user.loc[is_outlier]
filtered_data = data.loc[~data['userId'].isin(outliers.index)]

# Table of removed users: one row per dropped user with the number of ratings they had
removed_users_table = outliers.reset_index(name='num_ratings')

#print(removed_users_table)


## User Similarity Recommendation

In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

# Parameters
min_ratings_per_user = 20 # For selection of active users
sample_size = 100
K = 10 # Number of recommendations
liked_threshold = 4
top_n_input = 4 # Number of liked movies to use for similarity

# Select active users: at least `min_ratings_per_user` ratings overall AND
# at least 5 ratings at or above `liked_threshold`.
liked_mask = filtered_data['rating'] >= liked_threshold
liked_counts = filtered_data[liked_mask].groupby('userId').size()
active_users = filtered_data.groupby('userId').size()
active_users = active_users[(active_users >= min_ratings_per_user)].index
users_with_5_liked = liked_counts[liked_counts >= 5].index
active_users = active_users.intersection(users_with_5_liked)

# Random sample of users because data is large
np.random.seed(123)
sampled_users = np.random.choice(active_users, size=sample_size, replace=False)

# For each sampled user, split their movie ratings into train (80%) and test (20%).
# The per-user pieces are collected in lists and concatenated once; growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows every time.
train_parts = []
test_parts = []

for uid in sampled_users:
    user_data = filtered_data[filtered_data['userId'] == uid]
    train, test = train_test_split(user_data, test_size=0.2, random_state=123)
    train_parts.append(train)
    test_parts.append(test)

train_ratings = pd.concat(train_parts)
test_ratings = pd.concat(test_parts)

# -----------------------------
# Build matrix from training data
# -----------------------------
user_ids = train_ratings['userId'].unique()
movie_ids = train_ratings['movieId'].unique()
user_to_idx = {uid:i for i, uid in enumerate(user_ids)} # Rows
movie_to_idx = {mid:i for i, mid in enumerate(movie_ids)} # Columns

# Empty matrix (0 means "not rated in the training split")
n_users = len(user_ids)
n_movies = len(movie_ids)
user_item_matrix = np.zeros((n_users, n_movies))

# Fill matrix
for row in train_ratings.itertuples():
    u_idx = user_to_idx[row.userId]
    m_idx = movie_to_idx[row.movieId]
    user_item_matrix[u_idx, m_idx] = row.rating

# -----------------------------
# Generate recommendations using only top liked movies
# -----------------------------
user_metrics = []

for uid in sampled_users:
    u_idx = user_to_idx[uid]
    user_vector = user_item_matrix[u_idx, :] # Ratings

    # Select top-N liked movies only (positive ratings); users without any
    # liked movie in their training split are skipped.
    liked_indices = np.where(user_vector >= liked_threshold)[0]
    if len(liked_indices) == 0:
        continue

    top_indices = np.argsort(user_vector[liked_indices])[-top_n_input:]
    top_indices = liked_indices[top_indices]

    # Find relevant users who rated at least one of these top-N movies
    rel_users = np.any(user_item_matrix[:, top_indices] > 0, axis=1)
    rel_users[u_idx] = False  # never compare the user with themselves
    relevant_users_matrix = user_item_matrix[rel_users, :]

    if relevant_users_matrix.shape[0] == 0:
        continue

    # Cosine similarity on top-N liked movies
    sims = cosine_similarity([user_vector[top_indices]], relevant_users_matrix[:, top_indices])[0]

    # Compute weighted average for unseen movies based on similar users = prediction.
    # One matrix product replaces the per-movie Python loop: entry j is
    # sum_i sims[i] * rating[i, unseen[j]], divided by the total similarity.
    # Unrated entries count as 0, exactly as in the element-wise formulation.
    unseen = np.where(user_vector == 0)[0]
    weighted_scores = (sims @ relevant_users_matrix[:, unseen]) / (np.sum(sims) + 1e-8)

    # Top recommendations. Column j of the matrix is movie_ids[j], so the ids
    # are looked up directly instead of rebuilding list(movie_to_idx.keys())
    # for every recommended item.
    top_recs_idx = np.argsort(weighted_scores)[::-1][:K] # Top K
    recommended_movie_ids = list(movie_ids[unseen[top_recs_idx]])

    # Test set: held-out ratings of this user, split by the liked threshold
    test_all = test_ratings[test_ratings['userId'] == uid]
    test_liked = set(test_all[test_all['rating'] >= liked_threshold]['movieId'])
    test_not_liked = set(test_all[test_all['rating'] < liked_threshold]['movieId'])

    user_metrics.append({
        'recommended': recommended_movie_ids,
        'test_liked': test_liked,
        'test_not_liked': test_not_liked
    })


In [26]:
# Evaluation of the user-similarity recommendations against each user's held-out ratings

all_movie_ids = set(filtered_data['movieId'].unique())
K = 10  # top-K recommendations
liked_threshold = 4.0

counts_per_user = []
hit_rates, precisions, recalls = [], [], []

#filtered_user_metrics = [um for um in user_metrics if len(um['test_liked']) > 0]

for um in user_metrics:
    # Per-user inputs: recommended ids plus the liked / not-liked held-out movies
    recommended = set(um['recommended'])
    test_liked, test_not_liked = um['test_liked'], um['test_not_liked']
    test_movies = test_liked | test_not_liked
    nonrec = all_movie_ids - recommended

    # Contingency counts: (liked / not liked / not in test) x (recommended / not recommended)
    rec_watched_liked = len(recommended & test_liked)
    counts_per_user.append({
        'Recommended_Watched+Liked': rec_watched_liked,
        'Recommended_Watched+NotLiked': len(recommended & test_not_liked),
        'Recommended_NotWatched': len(recommended - test_movies),
        'NotRecommended_Watched+Liked': len(nonrec & test_liked),
        'NotRecommended_Watched+NotLiked': len(nonrec & test_not_liked),
        'NotRecommended_NotWatched': len(nonrec - test_movies)
    })

    # "hit" is the share of held-out liked movies that were recommended, i.e.
    # the same formula as recall; the binary at-least-one-hit variant is kept
    # commented out for reference.
    #hit = 1 if len(recommended & test_liked) > 0 else 0
    if test_liked:
        hit = recall = rec_watched_liked / len(test_liked)
    else:
        hit = recall = 0

    # Precision: share of the recommended movies that are held-out liked movies
    precision = rec_watched_liked / len(recommended) if recommended else 0

    hit_rates.append(hit)
    precisions.append(precision)
    recalls.append(recall)

# Table with per-user averages of the contingency counts
counts_df = pd.DataFrame(counts_per_user)
average_counts = counts_df.mean()

recommended_keys = [
    'Recommended_Watched+Liked',
    'Recommended_Watched+NotLiked',
    'Recommended_NotWatched'
]
not_recommended_keys = [
    'NotRecommended_Watched+Liked',
    'NotRecommended_Watched+NotLiked',
    'NotRecommended_NotWatched'
]
table = pd.DataFrame(
    {
        'Recommended': average_counts[recommended_keys].tolist(),
        'Not Recommended': average_counts[not_recommended_keys].tolist()
    },
    index=['Liked', 'Not Liked', 'Not Watched']
)

# Add totals column
table['Total'] = table['Recommended'] + table['Not Recommended']

print("Recommendations based on user similarity of liked movies:")
print(table)

avg_hit_rate = np.mean(hit_rates)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)

print("\nAverage metrics:")
print(f"Fraction of liked movies in {K} recs: {avg_hit_rate:.3f}")
print(f"Precision in {K} recs: {avg_precision:.3f}")
print(f"Recall in {K} recs: {avg_recall:.3f}")


Recommendations based on user similarity of liked movies:
             Recommended  Not Recommended         Total
Liked           1.305263        15.000000     16.305263
Not Liked       0.515789        16.810526     17.326316
Not Watched     8.178947     59433.189474  59441.368421

Average metrics:
Fraction of liked movies in 10 recs: 0.099
Precision in 10 recs: 0.131
Recall in 10 recs: 0.099


## Recommending based on top 10 rated movies overall

In [7]:
import pandas as pd
import numpy as np

K = 10
min_ratings_per_movie = 10000  # threshold applied on the ENTIRE dataset

# Mean rating and number of ratings per movie, over all of filtered_data
movie_stats_all = filtered_data.groupby('movieId')['rating'].agg(['mean', 'count']).reset_index()
has_enough_ratings = movie_stats_all['count'] >= min_ratings_per_movie
eligible_movies = movie_stats_all.loc[has_enough_ratings]

# The K eligible movies with the highest average rating
top_movies_by_avg_all = eligible_movies.sort_values('mean', ascending=False).head(K)

# Attach titles, give the statistics readable names and fix the column order
top_movies_summary = (
    top_movies_by_avg_all
    .merge(movies[['movieId', 'title']], on='movieId', how='left')
    .rename(columns={'mean': 'average_rating', 'count': 'num_ratings'})
    .loc[:, ['movieId', 'title', 'average_rating', 'num_ratings']]
)

# Display top movies
print(f"Top-{K} Movies in the Entire Dataset (min {min_ratings_per_movie} ratings):")
print(top_movies_summary)

# Save movie IDs for later recommendation block
top_movie_ids = top_movies_summary['movieId'].tolist()


Top-10 Movies in the Entire Dataset (min 10000 ratings):
   movieId                                              title  average_rating  \
0      318                   Shawshank Redemption, The (1994)        4.419662   
1   202439                                    Parasite (2019)        4.353265   
2      858                              Godfather, The (1972)        4.326352   
3       50                         Usual Suspects, The (1995)        4.274643   
4     1221                     Godfather: Part II, The (1974)        4.273383   
5     1203                                12 Angry Men (1957)        4.273294   
6     2019        Seven Samurai (Shichinin no samurai) (1954)        4.261119   
7      527                            Schindler's List (1993)        4.246293   
8     5618  Spirited Away (Sen to Chihiro no kamikakushi) ...        4.240083   
9     2959                                  Fight Club (1999)        4.239283   

   num_ratings  
0       118898  
1        11163  


In [27]:
import numpy as np
import pandas as pd

# --------------------------
# Parameters (consistent with block 1)
# --------------------------
liked_threshold = 4
K = 10              # number of recommendations
top_n_input = 4     # top liked movies used as input in block 1
all_movie_ids = set(filtered_data['movieId'].unique())

user_metrics_top = []

# --------------------------
# Build per-user evaluation
# --------------------------
# Popularity baseline: every user gets the same list (top_movie_ids).
# NOTE(review): unlike the user-similarity recommender, this list is not
# filtered for movies the user already rated in the training split -- confirm
# whether that asymmetry is intended when comparing the two methods.
for uid in sampled_users:
    # Test set for this user
    test_all = test_ratings[test_ratings['userId'] == uid]

    # Top-N liked movies from TRAIN (same as block 1); users without any liked
    # training movie are skipped.
    train_user = train_ratings[train_ratings['userId'] == uid]
    liked_train = train_user[train_user['rating'] >= liked_threshold]
    if liked_train.empty:
        continue

    top_liked_train = liked_train.sort_values('rating', ascending=False).head(top_n_input)['movieId'].tolist()

    # Recommendations for this user (baseline: top_movie_ids)
    recommended = set(top_movie_ids)

    # Exclude top-N liked movies from test liked set
    test_liked = set(test_all[test_all['rating'] >= liked_threshold]['movieId']) - set(top_liked_train)
    test_not_liked = set(test_all[test_all['rating'] < liked_threshold]['movieId'])

    user_metrics_top.append({
        'recommended': recommended,
        'test_liked': test_liked,
        'test_not_liked': test_not_liked
    })

# --------------------------
# Compute counts and evaluation metrics
# --------------------------
counts_per_user = []
hit_rates, precisions, recalls = [], [], []

for um in user_metrics_top:
    recommended = um['recommended']
    test_liked = um['test_liked']
    test_not_liked = um['test_not_liked']
    test_movies = test_liked | test_not_liked

    rec_watched_liked = len(recommended & test_liked)
    rec_watched_not_liked = len(recommended & test_not_liked)
    rec_not_watched = len(recommended - test_movies)

    nonrec = all_movie_ids - recommended
    nonrec_watched_liked = len(nonrec & test_liked)
    nonrec_watched_not_liked = len(nonrec & test_not_liked)
    nonrec_not_watched = len(nonrec - test_movies)

    counts_per_user.append({
        'Recommended_Watched+Liked': rec_watched_liked,
        'Recommended_Watched+NotLiked': rec_watched_not_liked,
        'Recommended_NotWatched': rec_not_watched,
        'NotRecommended_Watched+Liked': nonrec_watched_liked,
        'NotRecommended_Watched+NotLiked': nonrec_watched_not_liked,
        'NotRecommended_NotWatched': nonrec_not_watched
    })

    # Hit rate, precision, recall.
    # Precision is divided by the number of movies actually recommended (as in
    # the user-similarity evaluation), not by K: the two differ whenever fewer
    # than K movies pass the popularity threshold.
    hit_rates.append(rec_watched_liked / len(test_liked) if len(test_liked) > 0 else 0)
    precisions.append(rec_watched_liked / len(recommended) if len(recommended) > 0 else 0)
    recalls.append(rec_watched_liked / len(test_liked) if len(test_liked) > 0 else 0)

# --------------------------
# Build table
# --------------------------
counts_df = pd.DataFrame(counts_per_user)
avg_counts = counts_df.mean()

table = pd.DataFrame({
    'Recommended': [
        avg_counts['Recommended_Watched+Liked'],
        avg_counts['Recommended_Watched+NotLiked'],
        avg_counts['Recommended_NotWatched']
    ],
    'Not Recommended': [
        avg_counts['NotRecommended_Watched+Liked'],
        avg_counts['NotRecommended_Watched+NotLiked'],
        avg_counts['NotRecommended_NotWatched']
    ]
}, index=['Liked', 'Not Liked', 'Not Watched'])

table['Total'] = table['Recommended'] + table['Not Recommended']

avg_hit_rate = np.mean(hit_rates)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)

# Report the number of users that were actually evaluated; users skipped above
# are not part of the averages.
print(f"Top-{K} Recommendations Validation for {len(user_metrics_top)} Users:")
print(table)
print("\nAverage metrics:")
print(f"Fraction of liked movies in {K} recs: {avg_hit_rate:.3f}")
print(f"Precision in {K} recs: {avg_precision:.3f}")
print(f"Recall in {K} recs: {avg_recall:.3f}")


Top-10 Recommendations Validation for 100 Users:
             Recommended  Not Recommended     Total
Liked               0.51            16.04     16.55
Not Liked           0.11            17.78     17.89
Not Watched         9.38         59431.18  59440.56

Average metrics:
Fraction of liked movies in 10 recs: 0.034
Precision in 10 recs: 0.051
Recall in 10 recs: 0.034


In [24]:
import numpy as np
import pandas as pd

# --------------------------
# Parameters (consistent with block 1)
# --------------------------
# NOTE(review): this cell repeats the popularity-baseline evaluation of the
# previous cell; the differing recorded outputs presumably come from it having
# been run against a different train/test state -- confirm and consider
# keeping only one copy.
liked_threshold = 4
K = 10              # same as in block 1
top_n_input = 4     # same as in block 1
all_movie_ids = set(filtered_data['movieId'].unique())

user_metrics_top = []

for uid in sampled_users:
    # Test set for this user
    test_all = test_ratings[test_ratings['userId'] == uid]

    # Find top-N liked movies from TRAIN (same as block 1); users without any
    # liked training movie are skipped.
    train_user = train_ratings[train_ratings['userId'] == uid]
    liked_train = train_user[train_user['rating'] >= liked_threshold]
    if liked_train.empty:
        continue

    top_liked_train = liked_train.sort_values('rating', ascending=False).head(top_n_input)['movieId'].tolist()

    # Recommendations (provided externally, e.g. popular or top_movie_ids)
    recommended = set(top_movie_ids)

    # Exclude top-N liked movies (since they were used as input)
    test_liked = set(test_all[test_all['rating'] >= liked_threshold]['movieId']) - set(top_liked_train)
    test_not_liked = set(test_all[test_all['rating'] < liked_threshold]['movieId'])

    user_metrics_top.append({
        'recommended': recommended,
        'test_liked': test_liked,
        'test_not_liked': test_not_liked
    })

# --------------------------
# Compute counts and evaluation metrics
# --------------------------
counts_per_user = []
hit_rates, precisions, recalls = [], [], []

for um in user_metrics_top:
    recommended = um['recommended']
    test_liked = um['test_liked']
    test_not_liked = um['test_not_liked']
    test_movies = test_liked | test_not_liked

    rec_watched_liked = len(recommended & test_liked)
    rec_watched_not_liked = len(recommended & test_not_liked)
    rec_not_watched = len(recommended - test_movies)

    nonrec = all_movie_ids - recommended
    nonrec_watched_liked = len(nonrec & test_liked)
    nonrec_watched_not_liked = len(nonrec & test_not_liked)
    nonrec_not_watched = len(nonrec - test_movies)

    counts_per_user.append({
        'Recommended_Watched+Liked': rec_watched_liked,
        'Recommended_Watched+NotLiked': rec_watched_not_liked,
        'Recommended_NotWatched': rec_not_watched,
        'NotRecommended_Watched+Liked': nonrec_watched_liked,
        'NotRecommended_Watched+NotLiked': nonrec_watched_not_liked,
        'NotRecommended_NotWatched': nonrec_not_watched
    })

    # Hit rate, precision, recall.
    # Precision uses the size of the recommended set rather than K, so it stays
    # correct (and consistent with the other evaluation cells) when fewer than
    # K movies are recommended.
    hit_rates.append(rec_watched_liked / len(test_liked) if len(test_liked) > 0 else 0)
    precisions.append(rec_watched_liked / len(recommended) if len(recommended) > 0 else 0)
    recalls.append(rec_watched_liked / len(test_liked) if len(test_liked) > 0 else 0)

# --------------------------
# Table
# --------------------------
counts_df = pd.DataFrame(counts_per_user)
avg_counts = counts_df.mean()

table = pd.DataFrame({
    'Recommended': [
        avg_counts['Recommended_Watched+Liked'],
        avg_counts['Recommended_Watched+NotLiked'],
        avg_counts['Recommended_NotWatched']
    ],
    'Not Recommended': [
        avg_counts['NotRecommended_Watched+Liked'],
        avg_counts['NotRecommended_Watched+NotLiked'],
        avg_counts['NotRecommended_NotWatched']
    ]
}, index=['Liked', 'Not Liked', 'Not Watched'])

table['Total'] = table['Recommended'] + table['Not Recommended']

avg_hit_rate = np.mean(hit_rates)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)

# Report the number of users actually evaluated (skipped users are excluded
# from the averages above).
print(f"Top-{K} Recommendations Validation for {len(user_metrics_top)} Users:")
print(table)
print("\nAverage metrics:")
print(f"Fraction of liked movies in {K} recs: {avg_hit_rate:.3f}")
print(f"Precision in {K} recs: {avg_precision:.3f}")
print(f"Recall in {K} recs: {avg_recall:.3f}")


Top-10 Recommendations Validation for 100 Users:
             Recommended  Not Recommended     Total
Liked               0.50            15.38     15.88
Not Liked           0.08            17.66     17.74
Not Watched         9.42         59431.96  59441.38

Average metrics:
Fraction of liked movies in 10 recs: 0.038
Precision in 10 recs: 0.050
Recall in 10 recs: 0.038


In [9]:
# Introducing genres
import pandas as pd

# Split the pipe-separated genre string into a list of labels per movie
movies['genre_list'] = movies['genres'].str.split('|')

# Genre vocabulary: every distinct label, in alphabetical order
all_genres = sorted(set(movies['genre_list'].explode()))

# One-hot encode genres: one 0/1 column per label, added to `movies` in place
for g in all_genres:
    movies[g] = [int(g in labels) for labels in movies['genre_list']]

# Attach the genre indicator columns to every (outlier-filtered) rating row
filtered_data_genres = pd.merge(
    filtered_data,
    movies[['movieId'] + all_genres],
    on='movieId',
    how='left',
)


In [10]:
# Minimum ratings thresholds to make data manageable

min_movie_ratings = 20
min_user_ratings = 5

# Filter movies: keep only movies with at least `min_movie_ratings` ratings
movie_counts = filtered_data_genres.groupby('movieId')['rating'].count()
movies_to_keep = movie_counts.index[(movie_counts >= min_movie_ratings).to_numpy()]
keep_movie_rows = filtered_data_genres['movieId'].isin(movies_to_keep)
filtered_data_genres = filtered_data_genres.loc[keep_movie_rows]

# Filter users: counted AFTER the movie filter, so a user needs at least
# `min_user_ratings` ratings among the remaining movies
user_counts = filtered_data_genres.groupby('userId')['rating'].count()
users_to_keep = user_counts.index[(user_counts >= min_user_ratings).to_numpy()]
keep_user_rows = filtered_data_genres['userId'].isin(users_to_keep)
filtered_data_genres = filtered_data_genres.loc[keep_user_rows]


In [11]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from joblib import Parallel, delayed

K = 10
liked_threshold = 4.0
test_fraction = 0.2
n_jobs = -1
n_users_debug = 1000  # set number of users to debug 

# Reproducible sample of at most `n_users_debug` users
np.random.seed(123)
all_user_ids = filtered_data_genres['userId'].unique()
n_sampled = min(n_users_debug, len(all_user_ids))
sample_user_ids = np.random.choice(all_user_ids, n_sampled, replace=False)

# Filter dataset for these users
in_sample = filtered_data_genres['userId'].isin(sample_user_ids)
data_subset = filtered_data_genres[in_sample].copy()

# --------------------------
# Build movie-genre sparse matrix
# --------------------------
# One row per distinct movie (taken from the full filtered data, not only the
# sampled users); each row holds the genre indicators, L2-normalized by
# sklearn's `normalize` default.
genre_columns = all_genres 
unique_movies = filtered_data_genres.drop_duplicates('movieId').set_index('movieId')
movie_ids = unique_movies.index.values
movie_genre_sparse = normalize(csr_matrix(unique_movies[genre_columns].values), axis=1)
# movieId -> row position in movie_genre_sparse
movieid_to_idx = dict(zip(movie_ids, range(len(movie_ids))))

# --------------------------
# Train/test split per user
# --------------------------
# Each user's ratings are shuffled with a fixed seed; the first
# (1 - test_fraction) share goes to train, the remainder to test.
train_parts = []
test_parts = []
for user_id, group in data_subset.groupby('userId'):
    group = group.sample(frac=1, random_state=123)  # shuffle
    split_idx = int(len(group) * (1 - test_fraction))
    train_part, test_part = group.iloc[:split_idx], group.iloc[split_idx:]
    train_parts.append(train_part)
    test_parts.append(test_part)

train_data = pd.concat(train_parts)
test_data = pd.concat(test_parts)

# --------------------------
# Build user profiles from training data
# --------------------------
user_profiles = {}
user_rated_train = {}
user_test_liked = {}
user_test_not_liked = {}

for user_id, group in train_data.groupby('userId'):
    liked = group[group['rating'] >= liked_threshold]
    not_liked = group[group['rating'] < liked_threshold]  # not used for the profile

    # A profile needs at least one liked training movie with a genre row
    if liked.empty:
        continue

    liked_indices = [movieid_to_idx[movie] for movie in liked['movieId'] if movie in movieid_to_idx]
    if not liked_indices:
        continue

    # Profile = normalized mean of the genre vectors of the liked movies.
    # The sparse mean comes back as np.matrix, hence the asarray conversion.
    mean_genres = np.asarray(movie_genre_sparse[liked_indices].mean(axis=0))
    profile_vector = normalize(mean_genres)
    user_profiles[user_id] = profile_vector.ravel()
    user_rated_train[user_id] = set(group['movieId'])
# --------------------------
# Recommendation function
# --------------------------
def recommend_for_user(user_id, profile_vector, user_rated_train, movie_ids, movie_genre_sparse, K=10):
    """Recommend up to K not-yet-rated movies whose genre vector best matches a user profile.

    Parameters
    ----------
    user_id : hashable
        Key used to look up the user's rated movies; echoed back in the result.
    profile_vector : array-like
        Genre profile of the user; scored against every candidate row with a dot product.
    user_rated_train : dict
        Maps user id -> set of movie ids the user rated in training. Users
        missing from the dict are treated as having rated nothing.
    movie_ids : array-like
        Movie ids, positionally aligned with the rows of ``movie_genre_sparse``
        (row ``i`` belongs to ``movie_ids[i]``).
    movie_genre_sparse : sparse matrix
        One genre row per movie.
    K : int, default 10
        Maximum number of recommendations.

    Returns
    -------
    dict
        ``{'userId': user_id, 'recommended': set_of_movie_ids}``; the set is
        empty when the user has already rated every movie.
    """
    movie_ids = np.asarray(movie_ids)
    rated_movies = user_rated_train.get(user_id, set())

    # Candidate rows are found by position. This relies only on the arguments
    # (no lookup in a module-level movieId -> index dict) and replaces two
    # per-movie Python loops with one vectorized membership test.
    candidate_indices = np.flatnonzero(~np.isin(movie_ids, list(rated_movies)))
    if candidate_indices.size == 0:
        return {'userId': user_id, 'recommended': set()}
    candidate_ids = movie_ids[candidate_indices]

    # Similarity of every candidate movie to the profile, highest first
    sims = np.asarray(movie_genre_sparse[candidate_indices].dot(profile_vector.T)).ravel()
    top_idx = np.argsort(-sims)[:min(K, len(candidate_ids))]
    recommended = set(candidate_ids[top_idx])
    return {'userId': user_id, 'recommended': recommended}

# Score every user profile in parallel; each result is
# {'userId': ..., 'recommended': set_of_movie_ids}.
results = Parallel(n_jobs=n_jobs)(
    delayed(recommend_for_user)(
        user_id, profile, user_rated_train, movie_ids, movie_genre_sparse, K
    )
    for user_id, profile in user_profiles.items()
)

counts_per_user = []
hit_rates, precisions, recalls = [], [], []

# Loop-invariant work is done once instead of once per user: the full movie
# set, and the held-out liked / not-liked movie sets keyed by user (previously
# the whole test frame was re-filtered for every user).
all_candidate_movies = set(movie_ids)
liked_test_rows = test_data[test_data['rating'] >= liked_threshold]
not_liked_test_rows = test_data[test_data['rating'] < liked_threshold]
test_liked_by_user = {
    uid: set(ids) for uid, ids in liked_test_rows.groupby('userId')['movieId']
}
test_not_liked_by_user = {
    uid: set(ids) for uid, ids in not_liked_test_rows.groupby('userId')['movieId']
}

for res in results:
    user_id = res['userId']
    recommended = res['recommended']

    # Users without liked (or not-liked) test ratings simply get an empty set
    test_liked = test_liked_by_user.get(user_id, set())
    test_not_liked = test_not_liked_by_user.get(user_id, set())
    test_movies = test_liked | test_not_liked

    rec_watched_liked = len(recommended & test_liked)
    rec_watched_not_liked = len(recommended & test_not_liked)
    rec_not_watched = len(recommended - test_movies)

    nonrec = all_candidate_movies - recommended
    nonrec_watched_liked = len(nonrec & test_liked)
    nonrec_watched_not_liked = len(nonrec & test_not_liked)
    nonrec_not_watched = len(nonrec - test_movies)

    counts_per_user.append({
        'Recommended_Watched+Liked': rec_watched_liked,
        'Recommended_Watched+NotLiked': rec_watched_not_liked,
        'Recommended_NotWatched': rec_not_watched,
        'NotRecommended_Watched+Liked': nonrec_watched_liked,
        'NotRecommended_Watched+NotLiked': nonrec_watched_not_liked,
        'NotRecommended_NotWatched': nonrec_not_watched
    })

    # The "hit rate" here is the share of held-out liked movies that were
    # recommended, i.e. the same formula as recall.
    hit_rates.append(rec_watched_liked / len(test_liked) if len(test_liked) > 0 else 0)
    precisions.append(rec_watched_liked / len(recommended) if len(recommended) > 0 else 0)
    recalls.append(rec_watched_liked / len(test_liked) if len(test_liked) > 0 else 0)

# Table
counts_df = pd.DataFrame(counts_per_user)
average_counts = counts_df.mean()
table = pd.DataFrame({
    'Recommended': [
        average_counts['Recommended_Watched+Liked'],
        average_counts['Recommended_Watched+NotLiked'],
        average_counts['Recommended_NotWatched']
    ],
    'Not Recommended': [
        average_counts['NotRecommended_Watched+Liked'],
        average_counts['NotRecommended_Watched+NotLiked'],
        average_counts['NotRecommended_NotWatched']
    ]
}, index=['Liked', 'Not Liked', 'Not Watched'])
table['Total'] = table['Recommended'] + table['Not Recommended']

# The header used to read "20/80 train/test split", which is the reverse of the
# actual split (train gets 1 - test_fraction); it is now derived from test_fraction.
print(f"Genre-based Recommendations ({1 - test_fraction:.0%} train / {test_fraction:.0%} test split):")
print(table)

print("\nAverage metrics:")
print(f"Fraction of liked movies recommended in {K} recs: {np.mean(hit_rates):.3f}")
print(f"Precision in {K} recs: {np.mean(precisions):.3f}")
print(f"Recall in {K} recs: {np.mean(recalls):.3f}")


Genre-based Recommendations (20/80 train/test split):
             Recommended  Not Recommended         Total
Liked           0.033469         9.629817      9.663286
Not Liked       0.025355         8.492901      8.518256
Not Watched     9.941176     16657.877282  16667.818458

Average metrics:
Fraction of liked movies recommended in 10 recs: 0.006
Precision in 10 recs: 0.003
Recall in 10 recs: 0.006


Some thoughts after this
- Genre based seems to be the worst in both absolute number of liked movies and ratio with not liked.
- Top 10 movies used for recommendation have a good ratio of liked to not liked, but this is expected from the highest rated movies in the dataset.
- User similarity has the highest absolute value of liked movies from the recommendations, indicating that it is often the case that people actually watch the movies that were recommended to them.
- Movies that are unwatched make it more difficult to interpret recommendations, as it is unknown if the user would like this recommendation or not.

## User similarity clustering

In [None]:
# Block A: build sparse user-topN matrix, SVD embeddings, MiniBatchKMeans clustering
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import MiniBatchKMeans

# -----------------------
# Parameters
# -----------------------
top_n_input = 4             # number of top liked movies used to define user profile
liked_threshold = 4.0       # movies >= this considered liked
min_ratings_per_user = 20   # already used earlier to sample active users
sample_size = None          # all users
n_components = 50           # SVD dimension (lower = faster); 50 is a good start
n_clusters = 200            # number of clusters to produce (tune)
mbk_batch_size = 1024       # minibatch size for KMeans
random_state = 123

# -----------------------
# Select users to use
# -----------------------
# user_metrics should exist from your earlier pipeline; otherwise build similar list
# We'll build sparse profiles from train_ratings (so no test leakage)
# Use train_ratings from your earlier code
# NOTE(review): train_ratings as built in the user-similarity cell only holds
# the users sampled there, so "all users" here means all of those users, not
# the whole dataset -- confirm this is the intended population.

# Choose active users (same logic you used before)
active_users = train_ratings.groupby('userId').size()
active_users = active_users[active_users >= min_ratings_per_user].index.values

# NOTE: this rebinds `sampled_users` and `n_users`, which earlier cells also
# define; re-running those cells afterwards will see the values set here.
if sample_size is not None and sample_size < len(active_users):
    np.random.seed(random_state)
    sampled_users = np.random.choice(active_users, size=sample_size, replace=False)
else:
    sampled_users = active_users

sampled_users = np.array(sampled_users)  # ensure numpy array
n_users = len(sampled_users)
print(f"Using {n_users} users for embedding/clustering")

# -----------------------
# Build index maps
# -----------------------
user_to_pos = {uid: i for i, uid in enumerate(sampled_users)}

# We'll only include movies that appear as top-N liked for at least someone.
from collections import Counter
movie_counter = Counter()

# Create user -> rating groups from train_ratings
train_by_user = train_ratings[train_ratings['userId'].isin(sampled_users)].groupby('userId')

# Single pass: each user's top-N liked movies are computed once, remembered for
# the matrix build below and counted for the candidate feature list. (They were
# previously recomputed in a second, identical loop.)
user_top_movies = {}
for uid, grp in train_by_user:
    # select liked movies in this user's training set
    liked = grp[grp['rating'] >= liked_threshold]
    if liked.empty:
        continue
    # sort by rating desc then keep the first top_n_input movie ids
    top = liked.sort_values('rating', ascending=False)['movieId'].values[:top_n_input]
    user_top_movies[uid] = top
    movie_counter.update(top)

# Keep movies that appear at least once (optionally threshold by frequency to reduce dims)
candidate_movies = np.array([mid for mid, cnt in movie_counter.items() if cnt >= 1])
movie_to_pos = {mid: i for i, mid in enumerate(candidate_movies)}
n_movies_candidate = len(candidate_movies)
print(f"Candidate movie features from top-N liked across users: {n_movies_candidate}")

# -----------------------
# Build sparse user x movie matrix (binary or rating-weighted)
# -----------------------
rows = []
cols = []
data_vals = []

for uid, top in user_top_movies.items():
    upos = user_to_pos.get(uid, None)
    if upos is None:
        continue
    for mid in top:
        mpos = movie_to_pos.get(mid, None)
        if mpos is None:
            continue
        rows.append(upos)
        cols.append(mpos)
        # Binary indicator; switch to the rating value for a rating-weighted profile
        data_vals.append(1.0)

# Create CSR matrix (users without any liked movie keep an all-zero row)
if len(rows) == 0:
    raise RuntimeError("No user top-N liked entries found. Check thresholds or sampled users.")
X_sparse = sparse.csr_matrix((data_vals, (rows, cols)), shape=(n_users, n_movies_candidate), dtype=np.float32)
print("Built sparse user profile matrix:", X_sparse.shape, "nnz=", X_sparse.nnz)

# -----------------------
# Dimensionality reduction (TruncatedSVD on sparse)
# -----------------------
# The SVD rank cannot exceed what the feature matrix supports; shrink it (with
# a notice) rather than failing when there are few candidate movies.
max_components = max(1, n_movies_candidate - 1)
if n_components > max_components:
    print(f"Requested n_components={n_components} is too large for {n_movies_candidate} features; using {max_components}.")
    n_components = max_components

svd = TruncatedSVD(n_components=n_components, random_state=random_state)
user_embeddings = svd.fit_transform(X_sparse)   # result: (n_users, n_components)
explained = svd.explained_variance_ratio_.sum()
print(f"SVD done. Embedding shape: {user_embeddings.shape}. Explained variance_ratio sum: {explained:.3f}")

# Optional: L2-normalize embeddings if you plan to use cosine similarity
from sklearn.preprocessing import normalize
user_embeddings = normalize(user_embeddings, norm='l2', axis=1)

# -----------------------
# Clustering (MiniBatchKMeans)
# -----------------------
# KMeans raises when asked for more clusters than there are samples, which is
# the case for the default n_clusters whenever fewer users than that are
# available. Clamp to the number of users and say so.
if n_clusters > n_users:
    print(f"Requested n_clusters={n_clusters} exceeds the {n_users} available users; using {n_users} clusters instead.")
    n_clusters = n_users

mbk = MiniBatchKMeans(n_clusters=n_clusters, batch_size=mbk_batch_size, random_state=random_state)
cluster_labels = mbk.fit_predict(user_embeddings)
print("Clustering done. Number of clusters:", n_clusters)
# cluster assignments: cluster_labels (len n_users)

# Save outputs to use later
# create dataframe aligning user ids with embeddings and cluster labels
user_embed_df = pd.DataFrame({
    'userId': sampled_users,
    'cluster': cluster_labels
})
# optionally include embedding columns
for i in range(user_embeddings.shape[1]):
    user_embed_df[f'emb_{i}'] = user_embeddings[:, i]

# quick cluster sizes
cluster_sizes = pd.Series(cluster_labels).value_counts().sort_index()
print("Cluster sizes (first 10):")
print(cluster_sizes.head(10))


In [None]:
# Block B: cluster distances & evaluation (uses outputs from Block A)
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances, silhouette_score
from sklearn.metrics.pairwise import cosine_similarity

# user_embeddings (n_users x n_components), mbk (fitted), cluster_labels from Block A
# user_embed_df exists mapping userId -> embedding & cluster

# 1) Centroids and pairwise centroid distances (Euclidean)
centroids = mbk.cluster_centers_   # shape (n_clusters, n_components)
centroid_dists = pairwise_distances(centroids, metric='euclidean')  # (n_clusters,n_clusters)

# Summarize centroid distances. The strict upper triangle (k=1) holds every
# unordered pair of distinct centroids exactly once, so it is extracted a single
# time and reused for all three statistics.
centroid_pair_dists = centroid_dists[np.triu_indices_from(centroid_dists, k=1)]
centroid_mean = centroid_pair_dists.mean()
centroid_min = centroid_pair_dists.min()
centroid_max = centroid_pair_dists.max()
print(f"Centroid pairwise distances (eucl): mean={centroid_mean:.3f}, min={centroid_min:.3f}, max={centroid_max:.3f}")

# 2) Average intra-cluster cosine similarity and inter-cluster cosine similarity
from collections import defaultdict

# Dedicated generator seeded with the same random_state Block A passes to
# TruncatedSVD / MiniBatchKMeans, so re-running this cell reports the same numbers
# regardless of what happened to the global NumPy RNG in earlier cells.
# Assumes random_state is an int seed (or None) -- TODO confirm in Block A.
sim_rng = np.random.default_rng(random_state)

# to keep fast, use at most max_per_cluster embeddings per cluster
max_per_cluster = 200
# number of random cluster pairs used for the inter-cluster estimate
n_pair_samples = 200


def _subsample_rows(embs, limit, rng):
    """Return at most `limit` rows of the 2-D array `embs`.

    Parameters:
        embs: array with one embedding per row.
        limit: maximum number of rows to keep.
        rng: numpy Generator used to draw the rows (without replacement).

    Returns:
        `embs` itself when it has `limit` rows or fewer, otherwise `limit`
        randomly chosen rows.
    """
    if embs.shape[0] <= limit:
        return embs
    return embs[rng.choice(embs.shape[0], size=limit, replace=False)]


# Group the user embeddings by cluster label
emb_by_cluster = defaultdict(list)
for i, c in enumerate(cluster_labels):
    emb_by_cluster[c].append(user_embeddings[i])

# Stack each cluster's embeddings into one array up front instead of
# re-converting the lists on every sampled pair below.
cluster_arrays = {c: np.asarray(embs) for c, embs in emb_by_cluster.items()}

# Intra-cluster: mean pairwise cosine similarity inside each cluster
intra_sims = []
for embs in cluster_arrays.values():
    if embs.shape[0] <= 1:
        continue  # a single member has no pair to compare
    sims = cosine_similarity(_subsample_rows(embs, max_per_cluster, sim_rng))
    # upper triangle without the diagonal = every distinct pair once
    intra_sims.append(np.mean(sims[np.triu_indices_from(sims, k=1)]))

# Inter-cluster: pick many random pairs of distinct clusters and compute the
# mean cross-similarity between their (sub-sampled) members.
inter_sims = []
cluster_ids = list(cluster_arrays)
if len(cluster_ids) >= 2:  # with fewer than two clusters there is no pair to draw
    for _ in range(n_pair_samples):
        # draw positions rather than label values so the dict lookup uses the stored keys
        pos_a, pos_b = sim_rng.choice(len(cluster_ids), size=2, replace=False)
        a_embs = _subsample_rows(cluster_arrays[cluster_ids[pos_a]], max_per_cluster, sim_rng)
        b_embs = _subsample_rows(cluster_arrays[cluster_ids[pos_b]], max_per_cluster, sim_rng)
        inter_sims.append(np.mean(cosine_similarity(a_embs, b_embs)))

print(f"Avg intra-cluster cosine sim: {np.mean(intra_sims):.4f} (std {np.std(intra_sims):.4f})")
print(f"Avg inter-cluster cosine sim: {np.mean(inter_sims):.4f} (std {np.std(inter_sims):.4f})")

# 3) Silhouette score (compute on a subsample to save time)
# silhouette uses a distance metric; for large n_users it is computed on a sample
max_sil_sample = 20000
# Let silhouette_score draw the subsample itself: sample_size=None means "use
# every row", and passing random_state pins which rows are drawn, so the reported
# score is repeatable across runs instead of depending on the global NumPy RNG.
sil_sample_size = max_sil_sample if user_embeddings.shape[0] > max_sil_sample else None
sil = silhouette_score(
    user_embeddings,
    cluster_labels,
    metric='cosine',
    sample_size=sil_sample_size,
    random_state=random_state,
)
print(f"Silhouette score (cosine) on sample: {sil:.4f}")

# 4) How well "top-4 similarity" differentiates users:
# For each probed user, collect the other users who share at least one top-N
# movie and measure which fraction of them landed in the same cluster.

# Build map user -> set(topN movieIds) from the training ratings of the sampled users
user_topn = {}
train_by_user = train_ratings[train_ratings['userId'].isin(sampled_users)].groupby('userId')
for uid, grp in train_by_user:
    liked = grp[grp['rating'] >= liked_threshold]
    # NOTE(review): ties in rating are ordered by the default sort; this should
    # match however Block A picked each user's top-N -- confirm.
    top = liked.sort_values('rating', ascending=False)['movieId'].values[:top_n_input]
    user_topn[uid] = set(top)

# Create quick lookup: movie -> users who have it in their topN
movie_to_users = defaultdict(list)
for uid, mids in user_topn.items():
    for m in mids:
        movie_to_users[m].append(uid)

# userId -> cluster lookup table, built once. The previous per-candidate boolean
# mask over user_embed_df scanned every row for every candidate of every probe.
# drop_duplicates keeps the first row per user, matching the former `.values[0]`.
cluster_of_user = (
    user_embed_df
    .drop_duplicates('userId')
    .set_index('userId')['cluster']
    .to_dict()
)

# Seeded generator so the set of probed users is the same on every run.
# Assumes random_state (used by Block A's estimators) is an int seed or None -- TODO confirm.
probe_rng = np.random.default_rng(random_state)
n_probe = min(1000, len(sampled_users))
probe_users = probe_rng.choice(sampled_users, size=n_probe, replace=False)

same_cluster_rates = []
for uid in probe_users:
    u_set = user_topn.get(uid, set())
    if not u_set:
        continue  # user has no liked movies in the training split
    # collect candidate users who share at least one movie
    candidates = set()
    for m in u_set:
        candidates.update(movie_to_users.get(m, []))
    candidates.discard(uid)
    if not candidates:
        continue
    # fraction of those candidates assigned to the same cluster as uid
    uid_cluster = cluster_of_user[uid]
    same = sum(cluster_of_user[cand] == uid_cluster for cand in candidates)
    same_cluster_rates.append(same / len(candidates))

print(f"Avg fraction of users who share top-N items and are in same cluster: {np.mean(same_cluster_rates):.3f}")

# 5) Summarize results in a simple dict
# Every metric computed above is converted to a plain Python float, keyed by a
# descriptive name, in the order the metrics were computed.
results_summary = {
    metric_name: float(metric_value)
    for metric_name, metric_value in (
        ("centroid_mean_dist", centroid_mean),
        ("centroid_min_dist", centroid_min),
        ("centroid_max_dist", centroid_max),
        ("avg_intra_sim", np.mean(intra_sims)),
        ("avg_inter_sim", np.mean(inter_sims)),
        ("silhouette_sampled", sil),
        ("avg_same_cluster_rate_for_shared_topN", np.mean(same_cluster_rates)),
    )
}

#print("Summary:", results_summary)
