In [1]:
# Data from https://grouplens.org/datasets/movielens/latest/
# Put the downloaded csv files in the "Downloads" folder

import os
# Change into the data folder only when we are not already inside it.
# os.chdir with a relative path is not idempotent: re-running this cell after
# the first change would look for "Downloads/Downloads" and raise.
if os.path.basename(os.getcwd()) != "Downloads":
    os.chdir("Downloads")
print(os.getcwd())


c:\Users\vonks\Documents\GitHub\Movie_Recs\Downloads


In [2]:
import pandas as pd

# Read the two MovieLens tables from the current working directory
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

# Left join on movieId: every rating row is kept, movie columns are attached
data = pd.merge(ratings, movies, how="left", on="movieId")

In [3]:
# Users with more than 10,000 ratings are dropped as likely bots or outliers.
# Data_Exploration.ipynb holds the details about cleaning and outlier detection.

ratings_per_user = data.groupby('userId').size()
outliers = ratings_per_user.loc[ratings_per_user.gt(10000)]

# Keep only the rows whose userId is not among the flagged users
outlier_rows = data['userId'].isin(outliers.index)
filtered_data = data.loc[~outlier_rows]

# Table of removed users: one row per dropped user with their rating count
removed_users_table = outliers.reset_index(name='num_ratings')

print(removed_users_table)


   userId  num_ratings
0  189614        33332


## User Similarity Recommendation

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

# Parameters
min_ratings_per_user = 20 # For selection of active users
sample_size = 100
K = 10 # Number of recommendations
liked_threshold = 4
top_n_input = 4 # Number of liked movies to use for similarity

# Select active users: userIds with at least `min_ratings_per_user` ratings
active_users = filtered_data.groupby('userId').size()
active_users = active_users[active_users >= min_ratings_per_user].index

# Random sample of users because data is large.
# The size is capped so that a dataset with fewer active users than
# `sample_size` does not make np.random.choice raise (replace=False).
np.random.seed(123)
sampled_users = np.random.choice(
    active_users,
    size=min(sample_size, len(active_users)),
    replace=False,
)

# Restrict to the sampled users once, so the per-user selection below scans
# only the sampled rows instead of the full ratings table on every iteration.
# Row order is unchanged, so each user's split is the same as before.
sampled_data = filtered_data[filtered_data['userId'].isin(sampled_users)]

# For each sampled user, split their movie ratings into train (80%) and test (20%).
# The pieces are collected in lists and concatenated once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows each time.
train_parts = []
test_parts = []

for uid in sampled_users:
    user_data = sampled_data[sampled_data['userId'] == uid]
    train, test = train_test_split(user_data, test_size=0.2, random_state=42)
    train_parts.append(train)
    test_parts.append(test)

train_ratings = pd.concat(train_parts)
test_ratings = pd.concat(test_parts)

# -----------------------------
# Build matrix from training data
# -----------------------------
# Row/column order follows first appearance in train_ratings
user_ids = train_ratings['userId'].unique()
movie_ids = train_ratings['movieId'].unique()
user_to_idx = {uid:i for i, uid in enumerate(user_ids)} # Rows
movie_to_idx = {mid:i for i, mid in enumerate(movie_ids)} # Columns

# Empty matrix; a 0 entry means "no rating in the training data"
n_users = len(user_ids)
n_movies = len(movie_ids)
user_item_matrix = np.zeros((n_users, n_movies))

# Fill matrix in one vectorised assignment instead of iterating row by row.
# get_indexer returns, for every training row, the position of its id in
# user_ids / movie_ids, i.e. the same indices as user_to_idx / movie_to_idx.
row_positions = pd.Index(user_ids).get_indexer(train_ratings['userId'])
col_positions = pd.Index(movie_ids).get_indexer(train_ratings['movieId'])
# If a (user, movie) pair occurred twice, the last rating wins, as in a loop.
user_item_matrix[row_positions, col_positions] = train_ratings['rating'].to_numpy()

# -----------------------------
# Generate recommendations using only top liked movies
# -----------------------------
user_metrics = []

# Column index -> movieId lookup, built once. movie_to_idx was filled in column
# order, so its key order is the column order. Previously this list was rebuilt
# for every single recommended movie inside the loop.
idx_to_movie_id = list(movie_to_idx)

for uid in sampled_users:
    u_idx = user_to_idx[uid]
    user_vector = user_item_matrix[u_idx, :] # Ratings (0 = not rated in train)

    # Select top-N liked movies only (positive ratings)
    liked_indices = np.where(user_vector >= liked_threshold)[0]
    if len(liked_indices) == 0:
        continue  # no liked movie in train -> nothing to base similarity on

    # argsort is ascending, so the last `top_n_input` entries are the highest rated
    top_indices = np.argsort(user_vector[liked_indices])[-top_n_input:]
    top_indices = liked_indices[top_indices]

    # Find relevant users who rated at least one of these top-N movies
    rel_users = np.any(user_item_matrix[:, top_indices] > 0, axis=1)
    rel_users[u_idx] = False  # never compare the user with themselves
    relevant_users_matrix = user_item_matrix[rel_users, :]

    if relevant_users_matrix.shape[0] == 0:
        continue

    # Cosine similarity on top-N liked movies
    sims = cosine_similarity([user_vector[top_indices]], relevant_users_matrix[:, top_indices])[0]

    # Similarity-weighted score for every movie the user has not rated in train.
    # One matrix-vector product replaces the Python loop over unseen movies.
    # NOTE(review): unrated entries count as rating 0 and the denominator sums
    # the similarities of all relevant users, so the score favours movies that
    # many similar users rated; this is the original scoring rule, kept as is.
    unseen = np.where(user_vector == 0)[0]
    weighted_scores = (sims @ relevant_users_matrix[:, unseen]) / (np.sum(sims) + 1e-8)

    # Top recommendations
    top_recs_idx = np.argsort(weighted_scores)[::-1][:K] # Top K
    recommended_movie_ids = [idx_to_movie_id[unseen[i]] for i in top_recs_idx]

    # Test set: held-out ratings of this user, split by the liked threshold
    test_all = test_ratings[test_ratings['userId'] == uid]
    test_liked = set(test_all[test_all['rating'] >= liked_threshold]['movieId'])
    test_not_liked = set(test_all[test_all['rating'] < liked_threshold]['movieId'])

    user_metrics.append({
        'recommended': recommended_movie_ids,
        'test_liked': test_liked,
        'test_not_liked': test_not_liked
    })


In [None]:
#Evaluation

all_movie_ids = set(filtered_data['movieId'].unique())
n_all_movies = len(all_movie_ids)
K = 10  # top-K recommendations
liked_threshold = 4.0

counts_per_user = []
hit_rates, precisions, recalls = [], [], []

# Users without any liked movie in their test set are kept and score 0.

for um in user_metrics:
    # Unpack the information per user (um)
    recommended = set(um['recommended'])
    test_liked = um['test_liked']
    test_not_liked = um['test_not_liked']
    test_movies = test_liked | test_not_liked

    # Metrics for table
    rec_watched_liked = len(recommended & test_liked)
    rec_watched_not_liked = len(recommended & test_not_liked)
    rec_not_watched = len(recommended - test_movies)

    # "Not recommended" counts, equal to intersecting with
    # (all_movie_ids - recommended) but without building a catalogue-sized
    # set for every user: only the small per-user sets are touched.
    nonrec_watched_liked = len((test_liked - recommended) & all_movie_ids)
    nonrec_watched_not_liked = len((test_not_liked - recommended) & all_movie_ids)
    nonrec_not_watched = n_all_movies - len((recommended | test_movies) & all_movie_ids)

    counts_per_user.append({
        'Recommended_Watched+Liked': rec_watched_liked,
        'Recommended_Watched+NotLiked': rec_watched_not_liked,
        'Recommended_NotWatched': rec_not_watched,
        'NotRecommended_Watched+Liked': nonrec_watched_liked,
        'NotRecommended_Watched+NotLiked': nonrec_watched_not_liked,
        'NotRecommended_NotWatched': nonrec_not_watched
    })

    # "Hit rate" here is the fraction of the user's liked test movies that were
    # recommended (not the binary "at least one hit"), so it equals recall.
    hit = rec_watched_liked / len(test_liked) if len(test_liked) > 0 else 0

    # Precision and Recall K
    precision = rec_watched_liked / len(recommended) if len(recommended) > 0 else 0
    recall = rec_watched_liked / len(test_liked) if len(test_liked) > 0 else 0

    hit_rates.append(hit)
    precisions.append(precision)
    recalls.append(recall)

# Table with averages (mean count per evaluated user)
counts_df = pd.DataFrame(counts_per_user)
average_counts = counts_df.mean()

table = pd.DataFrame({
    'Recommended': [
        average_counts['Recommended_Watched+Liked'],
        average_counts['Recommended_Watched+NotLiked'],
        average_counts['Recommended_NotWatched']
    ],
    'Not Recommended': [
        average_counts['NotRecommended_Watched+Liked'],
        average_counts['NotRecommended_Watched+NotLiked'],
        average_counts['NotRecommended_NotWatched']
    ]
}, index=['Liked', 'Not Liked', 'Not Watched'])

# Add totals column
table['Total'] = table['Recommended'] + table['Not Recommended']

print("Recommendations based on user similarity of liked movies:")
print(table)

avg_hit_rate = np.mean(hit_rates)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)

print("\nAverage metrics:")
print(f"Fraction of liked movies recommended in {K} recs: {avg_hit_rate:.3f}")
print(f"Precision in {K} recs: {avg_precision:.3f}")
print(f"Recall in {K} recs: {avg_recall:.3f}")


Recommendations based on user similarity of liked movies:
             Recommended  Not Recommended         Total
Liked           1.130435        13.684783     14.815217
Not Liked       0.543478        15.358696     15.902174
Not Watched     8.326087     81781.956522  81790.282609

Average metrics:
Fraction of liked movies recommended in 10 recs: 0.094
Precision in 10 recs: 0.113
Recall in 10 recs: 0.094


## Recommending based on top 10 rated movies overall

In [None]:
import pandas as pd
import numpy as np

K = 10
min_ratings_per_movie = 10000  # threshold applied on the ENTIRE dataset

# Mean rating and number of ratings for every movie
movie_stats_all = (
    filtered_data
    .groupby('movieId')['rating']
    .agg(['mean', 'count'])
    .reset_index()
)

# Only movies with enough ratings are eligible for the ranking
has_enough_ratings = movie_stats_all['count'] >= min_ratings_per_movie
eligible_movies = movie_stats_all.loc[has_enough_ratings]

# Highest average rating first, keep the best K
top_movies_by_avg_all = eligible_movies.sort_values('mean', ascending=False).head(K)

# Attach titles, give the statistics readable names and fix the column order
top_movies_summary = (
    top_movies_by_avg_all
    .merge(movies[['movieId', 'title']], on='movieId', how='left')
    .rename(columns={'mean': 'average_rating', 'count': 'num_ratings'})
    .loc[:, ['movieId', 'title', 'average_rating', 'num_ratings']]
)

# Display top movies
print(f"Top-{K} Movies in the Entire Dataset (min {min_ratings_per_movie} ratings):")
print(top_movies_summary)

# Movie IDs reused by the recommendation block that follows
top_movie_ids = top_movies_summary['movieId'].tolist()


Top-10 Movies in the Entire Dataset (min 10000 ratings):
   movieId                                              title  average_rating  \
0      318                   Shawshank Redemption, The (1994)        4.416791   
1   202439                                    Parasite (2019)        4.329973   
2      858                              Godfather, The (1972)        4.326600   
3     1221                     Godfather: Part II, The (1974)        4.269505   
4       50                         Usual Suspects, The (1995)        4.267869   
5     1203                                12 Angry Men (1957)        4.267126   
6     2019        Seven Samurai (Shichinin no samurai) (1954)        4.250774   
7      527                            Schindler's List (1993)        4.242334   
8     2959                                  Fight Club (1999)        4.236028   
9     5618  Spirited Away (Sen to Chihiro no kamikakushi) ...        4.226042   

   num_ratings  
0       122295  
1        12398  


In [42]:
import numpy as np
import pandas as pd

# --------------------------
# Parameters
# --------------------------
sample_user_count = 1000          # Change this number to test more users
liked_threshold = 4.0
test_fraction = 0.2              # 20% test split per user
K = len(top_movie_ids)           # ensure consistency with block 1
np.random.seed(123)

all_movie_ids = set(filtered_data['movieId'].unique())
n_all_movies = len(all_movie_ids)

# Sample users without replacement, capped at the number of available users
candidate_users = filtered_data['userId'].unique()
sampled_users = np.random.choice(candidate_users,
                                 size=min(sample_user_count, len(candidate_users)),
                                 replace=False)

# --------------------------
# Split train/test per sampled user
# --------------------------
# Restrict to the sampled users once so the per-user selection below scans only
# their rows rather than the whole ratings table. Users are still processed in
# `sampled_users` order with unchanged row order, so the seeded random draws
# (and therefore the splits) are the same as with a full-table scan.
sampled_data = filtered_data[filtered_data['userId'].isin(sampled_users)]

train_parts = []
test_parts = []

for uid in sampled_users:
    group = sampled_data[sampled_data['userId'] == uid]
    n_test = max(1, int(len(group) * test_fraction))  # at least one test rating
    test_indices = np.random.choice(group.index, size=n_test, replace=False)
    train_indices = group.index.difference(test_indices)

    train_parts.append(sampled_data.loc[train_indices])
    test_parts.append(sampled_data.loc[test_indices])

train_ratings = pd.concat(train_parts)
test_ratings = pd.concat(test_parts)

# Every user receives the same recommendation: the globally top-rated movies.
# NOTE(review): movies the user already rated in the training split are not
# removed from this list, unlike the other recommenders in this notebook;
# such movies can only ever count as "Not Watched" here.
top_movie_set = set(top_movie_ids)

user_metrics_top = []

for uid, group in test_ratings.groupby('userId'):
    recommended = top_movie_set
    test_liked = set(group[group['rating'] >= liked_threshold]['movieId'])
    test_not_liked = set(group[group['rating'] < liked_threshold]['movieId'])

    user_metrics_top.append({
        'recommended': recommended,
        'test_liked': test_liked,
        'test_not_liked': test_not_liked
    })

# --------------------------
# Compute counts and evaluation metrics
# --------------------------
counts_per_user = []
hit_rates, precisions, recalls = [], [], []

for um in user_metrics_top:
    recommended = set(um['recommended'])
    test_liked = um['test_liked']
    test_not_liked = um['test_not_liked']
    test_movies = test_liked | test_not_liked

    rec_watched_liked = len(recommended & test_liked)
    rec_watched_not_liked = len(recommended & test_not_liked)
    rec_not_watched = len(recommended - test_movies)

    # Same values as intersecting with (all_movie_ids - recommended), computed
    # from the small per-user sets so no catalogue-sized set is built per user.
    nonrec_watched_liked = len((test_liked - recommended) & all_movie_ids)
    nonrec_watched_not_liked = len((test_not_liked - recommended) & all_movie_ids)
    nonrec_not_watched = n_all_movies - len((recommended | test_movies) & all_movie_ids)

    counts_per_user.append({
        'Recommended_Watched+Liked': rec_watched_liked,
        'Recommended_Watched+NotLiked': rec_watched_not_liked,
        'Recommended_NotWatched': rec_not_watched,
        'NotRecommended_Watched+Liked': nonrec_watched_liked,
        'NotRecommended_Watched+NotLiked': nonrec_watched_not_liked,
        'NotRecommended_NotWatched': nonrec_not_watched
    })

    # Hit rate, precision, recall.
    # The "hit rate" is the fraction of liked test movies that were recommended,
    # i.e. the same quantity as recall. Precision is guarded against K == 0
    # (empty top_movie_ids) so it yields 0 instead of raising.
    hit_rates.append(rec_watched_liked / len(test_liked) if len(test_liked) > 0 else 0)
    precisions.append(rec_watched_liked / K if K > 0 else 0)
    recalls.append(rec_watched_liked / len(test_liked) if len(test_liked) > 0 else 0)

# --------------------------
# Table
# --------------------------
counts_df = pd.DataFrame(counts_per_user)
avg_counts = counts_df.mean()

table = pd.DataFrame({
    'Recommended': [
        avg_counts['Recommended_Watched+Liked'],
        avg_counts['Recommended_Watched+NotLiked'],
        avg_counts['Recommended_NotWatched']
    ],
    'Not Recommended': [
        avg_counts['NotRecommended_Watched+Liked'],
        avg_counts['NotRecommended_Watched+NotLiked'],
        avg_counts['NotRecommended_NotWatched']
    ]
}, index=['Watched + Liked', 'Watched + Not Liked', 'Not Watched'])

table['Total'] = table['Recommended'] + table['Not Recommended']

avg_hit_rate = np.mean(hit_rates)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)

# Report the number of users actually sampled, which can be lower than
# `sample_user_count` when the dataset holds fewer users.
print(f"Top-{K} Recommendations Validation for {len(sampled_users)} Users:")
print(table)
print("\nAverage metrics:")
print(f"Fraction of liked movies recommended in {K} recs: {avg_hit_rate:.3f}")
print(f"Precision in {K} recs: {avg_precision:.3f}")
print(f"Recall in {K} recs: {avg_recall:.3f}")


Top-10 Recommendations Validation for 1000 Users:
                     Recommended  Not Recommended      Total
Watched + Liked            0.292            9.714     10.006
Watched + Not Liked        0.060            8.859      8.919
Not Watched                9.648        81792.427  81802.075

Average metrics:
Fraction of liked movies recommended in 10 recs: 0.050
Precision in 10 recs: 0.029
Recall in 10 recs: 0.050


In [7]:
# Introducing genres
import pandas as pd

# Per-movie list of genres (the raw column is pipe-separated)
movies['genre_list'] = movies['genres'].str.split('|')

# One-hot encode genres in a single vectorised pass instead of one
# Python-level .apply over all movies per genre. str.get_dummies returns one
# 0/1 integer column per distinct genre, with the columns in sorted order.
genre_dummies = movies['genres'].str.get_dummies(sep='|')
all_genres = list(genre_dummies.columns)

# Attach the indicator columns to `movies` under the genre names
for g in all_genres:
    movies[g] = genre_dummies[g]

# Add the genre indicators to every rating row
filtered_data_genres = filtered_data.merge(movies[['movieId'] + all_genres], on='movieId', how='left')


In [None]:
# Drop rarely rated movies and low-activity users to make the data manageable

min_movie_ratings = 20
min_user_ratings = 5

# Filter movies: keep those with at least `min_movie_ratings` ratings
movie_counts = filtered_data_genres.groupby('movieId')['rating'].count()
movies_to_keep = movie_counts.loc[movie_counts.ge(min_movie_ratings)].index
rated_often_enough = filtered_data_genres['movieId'].isin(movies_to_keep)
filtered_data_genres = filtered_data_genres.loc[rated_often_enough]

# Filter users: counted after the movie filter, keep those with at least
# `min_user_ratings` remaining ratings
user_counts = filtered_data_genres.groupby('userId')['rating'].count()
users_to_keep = user_counts.loc[user_counts.ge(min_user_ratings)].index
active_enough = filtered_data_genres['userId'].isin(users_to_keep)
filtered_data_genres = filtered_data_genres.loc[active_enough]


In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from joblib import Parallel, delayed

# Experiment settings
K = 10
liked_threshold = 4.0
test_fraction = 0.2
n_jobs = -1
n_users_debug = 1000  # set number of users to debug 

# Seeded random subset of users
np.random.seed(123)
all_user_ids = filtered_data_genres['userId'].unique()
n_sampled_users = min(n_users_debug, len(all_user_ids))
sample_user_ids = np.random.choice(all_user_ids, n_sampled_users, replace=False)

# Ratings of the sampled users only (copied so it is independent of the source frame)
in_sample = filtered_data_genres['userId'].isin(sample_user_ids)
data_subset = filtered_data_genres.loc[in_sample].copy()

# --------------------------
# Build movie-genre sparse matrix
# --------------------------
# One row per movie in the full filtered dataset; rows are L2-normalised by
# sklearn's default so a dot product between rows/profiles is a cosine similarity.
genre_columns = all_genres 
unique_movies = filtered_data_genres.drop_duplicates('movieId').set_index('movieId')
movie_ids = unique_movies.index.values
movie_genre_sparse = normalize(csr_matrix(unique_movies[genre_columns].values), axis=1)
# movieId -> row position in movie_genre_sparse
movieid_to_idx = dict(zip(movie_ids, range(len(movie_ids))))

# --------------------------
# Train/test split per user
# --------------------------
# Each user's ratings are shuffled with a fixed seed; the leading
# (1 - test_fraction) share goes to train and the remainder to test.
train_parts = []
test_parts = []
for _, user_rows in data_subset.groupby('userId'):
    shuffled = user_rows.sample(frac=1, random_state=123)
    n_train = int(len(shuffled) * (1 - test_fraction))
    train_parts.append(shuffled.iloc[:n_train])
    test_parts.append(shuffled.iloc[n_train:])

train_data = pd.concat(train_parts)
test_data = pd.concat(test_parts)

# --------------------------
# Build user profiles from training data
# --------------------------
# user_profiles:    userId -> normalised mean genre vector of the user's liked training movies
# user_rated_train: userId -> set of all movieIds the user rated in training
user_profiles = {}
user_rated_train = {}
# Not filled in this section; kept so the names remain defined.
user_test_liked = {}
user_test_not_liked = {}

for user_id, group in train_data.groupby('userId'):
    # Only liked movies contribute to the profile (the unused selection of
    # not-liked rows that used to be computed for every user was removed).
    liked = group[group['rating'] >= liked_threshold]

    if liked.empty:
        continue  # no liked movie in train -> no profile, user is skipped

    liked_indices = [movieid_to_idx[mid] for mid in liked['movieId'] if mid in movieid_to_idx]
    if not liked_indices:
        continue

    # Mean of the liked movies' genre rows, then re-normalised
    profile_vector = movie_genre_sparse[liked_indices].mean(axis=0)
    profile_vector = np.asarray(profile_vector)  # convert from np.matrix
    profile_vector = normalize(profile_vector)  # normalize
    user_profiles[user_id] = profile_vector.ravel()
    user_rated_train[user_id] = set(group['movieId'])

# --------------------------
# Recommendation function
# --------------------------
def recommend_for_user(user_id, profile_vector, user_rated_train, movie_ids, movie_genre_sparse, K=10):
    """Recommend the K movies most similar to a user's genre profile.

    Movies the user already rated in training are excluded. Each remaining
    movie is scored by the dot product of its genre row with the profile.

    Parameters
    ----------
    user_id : hashable
        Key used to look up the user's rated movies.
    profile_vector : 1-D array
        The user's genre profile; its length must equal the number of columns
        of ``movie_genre_sparse``.
    user_rated_train : dict
        Maps user id to the set of movie ids rated in training. Users missing
        from the dict are treated as having rated nothing.
    movie_ids : array-like
        Movie ids, where ``movie_ids[i]`` is the movie described by row ``i``
        of ``movie_genre_sparse``.
    movie_genre_sparse : sparse matrix
        One genre row per movie, in the order of ``movie_ids``.
    K : int, default 10
        Maximum number of recommendations.

    Returns
    -------
    dict
        ``{'userId': user_id, 'recommended': set_of_movie_ids}``; the set is
        empty when no unrated movie is left.
    """
    rated_movies = user_rated_train.get(user_id, set())
    movie_ids = np.asarray(movie_ids)

    # Row positions of the candidate movies. Positions are taken directly from
    # `movie_ids`, so the function no longer depends on a global id->row dict,
    # and the membership test is vectorised instead of looping over the catalogue.
    already_rated = np.isin(movie_ids, list(rated_movies))
    candidate_indices = np.flatnonzero(~already_rated)
    if candidate_indices.size == 0:
        return {'userId': user_id, 'recommended': set()}

    sims = np.asarray(movie_genre_sparse[candidate_indices].dot(profile_vector.T)).ravel()
    # argsort on the negated scores puts the highest similarity first;
    # slicing with [:K] already handles fewer than K candidates.
    top_idx = np.argsort(-sims)[:K]
    recommended = set(movie_ids[candidate_indices[top_idx]])
    return {'userId': user_id, 'recommended': recommended}

# Compute recommendations for every user that has a profile, in parallel
results = Parallel(n_jobs=n_jobs)(
    delayed(recommend_for_user)(
        user_id, profile, user_rated_train, movie_ids, movie_genre_sparse, K
    )
    for user_id, profile in user_profiles.items()
)

counts_per_user = []
hit_rates, precisions, recalls = [], [], []

# Group the held-out ratings by user once instead of filtering the whole test
# frame for every user; users without test rows get an empty frame.
test_by_user = {uid: rows for uid, rows in test_data.groupby('userId')}
no_test_rows = test_data.iloc[0:0]

# The movie catalogue is the same for all users, so build the set once.
catalogue = set(movie_ids)
n_catalogue = len(catalogue)

for res in results:
    user_id = res['userId']
    recommended = res['recommended']

    user_test = test_by_user.get(user_id, no_test_rows)
    test_liked = set(user_test[user_test['rating'] >= liked_threshold]['movieId'])
    test_not_liked = set(user_test[user_test['rating'] < liked_threshold]['movieId'])
    test_movies = test_liked | test_not_liked

    rec_watched_liked = len(recommended & test_liked)
    rec_watched_not_liked = len(recommended & test_not_liked)
    rec_not_watched = len(recommended - test_movies)

    # Same values as intersecting with (catalogue - recommended), computed from
    # the small per-user sets only.
    nonrec_watched_liked = len((test_liked - recommended) & catalogue)
    nonrec_watched_not_liked = len((test_not_liked - recommended) & catalogue)
    nonrec_not_watched = n_catalogue - len((recommended | test_movies) & catalogue)

    counts_per_user.append({
        'Recommended_Watched+Liked': rec_watched_liked,
        'Recommended_Watched+NotLiked': rec_watched_not_liked,
        'Recommended_NotWatched': rec_not_watched,
        'NotRecommended_Watched+Liked': nonrec_watched_liked,
        'NotRecommended_Watched+NotLiked': nonrec_watched_not_liked,
        'NotRecommended_NotWatched': nonrec_not_watched
    })

    # The "hit rate" is the fraction of liked test movies that were
    # recommended, i.e. the same quantity as recall.
    hit_rates.append(rec_watched_liked / len(test_liked) if len(test_liked) > 0 else 0)
    precisions.append(rec_watched_liked / len(recommended) if len(recommended) > 0 else 0)
    recalls.append(rec_watched_liked / len(test_liked) if len(test_liked) > 0 else 0)

# Table of mean counts per evaluated user
counts_df = pd.DataFrame(counts_per_user)
average_counts = counts_df.mean()
table = pd.DataFrame({
    'Recommended': [
        average_counts['Recommended_Watched+Liked'],
        average_counts['Recommended_Watched+NotLiked'],
        average_counts['Recommended_NotWatched']
    ],
    'Not Recommended': [
        average_counts['NotRecommended_Watched+Liked'],
        average_counts['NotRecommended_Watched+NotLiked'],
        average_counts['NotRecommended_NotWatched']
    ]
}, index=['Liked', 'Not Liked', 'Not Watched'])
table['Total'] = table['Recommended'] + table['Not Recommended']

# The split is 80% train / 20% test (test_fraction = 0.2); the label
# previously had the two numbers swapped.
print("Genre-based Recommendations (80/20 train/test split):")
print(table)

print("\nAverage metrics:")
print(f"Fraction of liked movies recommended in {K} recs: {np.mean(hit_rates):.3f}")
print(f"Precision in {K} recs: {np.mean(precisions):.3f}")
print(f"Recall in {K} recs: {np.mean(recalls):.3f}")


Genre-based Recommendations (20/80 train/test split):
             Recommended  Not Recommended         Total
Liked           0.029442        11.292386     11.321827
Not Liked       0.023350         9.823350      9.846701
Not Watched     9.947208     23122.884264  23132.831472

Average metrics:
Fraction of liked movies recommended in 10 recs: 0.004
Precision in 10 recs: 0.003
Recall in 10 recs: 0.004


Some thoughts after this
- Genre-based recommendation seems to be the worst, both in the absolute number of liked movies and in the ratio of liked to not liked.
- The top 10 movies used for recommendation have a good ratio of liked to not liked, but this is expected from the highest-rated movies in the dataset.
- User similarity has the highest absolute number of liked movies among the recommendations, indicating that it is often the case that people actually watch the movies that were recommended to them.
- Movies that are unwatched make it more difficult to interpret recommendations, as it is unknown if the user would like this recommendation or not.