In [1]:
# Data from https://grouplens.org/datasets/movielens/latest/
# Put the downloaded csv files in the "Downloads" folder

import os

# Later cells open the data files by bare file name, so the working directory
# has to be the data folder.
DATA_DIR = "Downloads"

# Only change directory when we are not already inside the data folder, so that
# re-running this cell does not try to enter "Downloads/Downloads" and fail.
if os.path.basename(os.getcwd()) != DATA_DIR:
    os.chdir(DATA_DIR)
print(os.getcwd())


c:\Users\vonks\Documents\GitHub\Movie_Recs\Downloads


In [2]:
import pandas as pd

# Load the two raw tables from the current working directory.
ratings = pd.read_csv("ratings.csv")
movies = pd.read_csv("movies.csv")

# Left join on movieId: every rating row is kept and gains the columns of the
# matching row in `movies`.
data = pd.merge(ratings, movies, on="movieId", how="left")

In [3]:
# Removing users with more than 1,000 ratings as these are likely to be bots or outliers
# (the cutoff applied by the filter below is 1,000 ratings per user, not 10,000)
# See Data_Exploration.ipynb for more details about cleaning and outlier detection

# Users with strictly more ratings than this are dropped entirely.
MAX_RATINGS_PER_USER = 1000

# Number of rating rows per user, indexed by userId
ratings_per_user = data.groupby('userId').size()
outliers = ratings_per_user[ratings_per_user > MAX_RATINGS_PER_USER]
filtered_data = data[~data['userId'].isin(outliers.index)]

# Table of removed users (columns: userId, num_ratings)
removed_users_table = outliers.rename('num_ratings').reset_index()

print(removed_users_table)


      userId  num_ratings
0        149         1507
1        198         1255
2        305         1406
3        461         1692
4        487         2164
...      ...          ...
3646  330517         2304
3647  330535         2791
3648  330687         1535
3649  330842         1288
3650  330914         1404

[3651 rows x 2 columns]


In [4]:
from scipy.sparse import csr_matrix

# Map user and movie IDs to contiguous indices (numbered in order of first appearance)
user_ids = filtered_data['userId'].unique()
movie_ids = filtered_data['movieId'].unique()
user_to_idx = {uid: i for i, uid in enumerate(user_ids)}
movie_to_idx = {mid: i for i, mid in enumerate(movie_ids)}

# Create sparse matrix: one row per user, one column per movie, values are the ratings
rows = filtered_data['userId'].map(user_to_idx)
cols = filtered_data['movieId'].map(movie_to_idx)
# Deliberately not called `ratings`: that name holds the ratings DataFrame loaded
# earlier and should not be silently rebound to a NumPy array.
rating_values = filtered_data['rating'].values
user_item_matrix = csr_matrix(
    (rating_values, (rows, cols)),
    shape=(len(user_ids), len(movie_ids)),
)


In [5]:
# This block can be used to look for movies in the dataset
# Example search string
search_string = "whiplash"

# Case-insensitive *literal* substring match on the title column.
# regex=False keeps characters such as "(", ")" or "." in the search string from
# being interpreted as a regular expression (which could raise or match the wrong
# titles); na=False makes rows with a missing title count as "no match".
title_hits = movies['title'].str.lower().str.contains(
    search_string.lower(), regex=False, na=False
)
matches = movies.loc[title_hits, 'title'].tolist()

if matches:
    print("Matching movies found:")
    for m in matches:
        print(" -", m)
else:
    print("No matches found.")


Matching movies found:
 - Whiplash (2014)
 - Whiplash (1948)
 - Whiplash (2013)


In [6]:
import numpy as np
import pandas as pd

# Personal ratings, keyed by the exact title string used in the movies table
my_favorites = {
    "Fight Club (1999)": 5,
    "Interstellar (2014)": 5,
    "Spirited Away (Sen to Chihiro no kamikakushi) (2001)": 5,
    "Whiplash (2014)": 5,
}

# Lookup: title -> movieId
movie_title_to_id = dict(zip(movies['title'], movies['movieId']))

# Lookup: movieId -> column position in the user-item matrix
movie_to_idx = dict(zip(movie_ids, range(len(movie_ids))))

# My ratings as a dense vector with one slot per matrix column; unrated slots stay 0
my_vector = np.zeros(len(movie_ids))

for title, rating in my_favorites.items():
    mid = movie_title_to_id.get(title)
    col = movie_to_idx.get(mid)
    # Favorites whose title or movieId cannot be resolved are skipped
    if col is not None:
        my_vector[col] = rating



In [7]:
# Per-movie statistics over the filtered ratings: mean rating and number of ratings
movie_stats = filtered_data.groupby('movieId')['rating'].agg(['mean', 'count'])

print("\nYour favorite movies with dataset stats:")
for title in my_favorites:
    mid = movie_title_to_id.get(title)

    # Title string does not occur in the title lookup at all
    if mid is None:
        print(f"'{title}' is NOT in the dataset.")
        continue

    # Title is known, but no rating rows for it are present in the filtered data
    if mid not in movie_stats.index:
        print(f"'{title}' exists but has no ratings in the dataset.")
        continue

    avg_rating = movie_stats.at[mid, 'mean']
    n_ratings = movie_stats.at[mid, 'count']
    print(f"'{title}', Avg = {avg_rating:.2f}, # Ratings = {n_ratings}")



Your favorite movies with dataset stats:
'Fight Club (1999)', Avg = 4.24, # Ratings = 82858
'Interstellar (2014)', Avg = 4.16, # Ratings = 38565
'Spirited Away (Sen to Chihiro no kamikakushi) (2001)', Avg = 4.24, # Ratings = 33151
'Whiplash (2014)', Avg = 4.17, # Ratings = 19446


In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# Matrix column indices of the favorites that could be resolved to a column
my_rated_movies = []
for title in my_favorites:
    movie_id = movie_title_to_id.get(title)
    if movie_id in movie_to_idx:
        my_rated_movies.append(movie_to_idx[movie_id])

# Keep only users with a positive rating on at least one of those columns
rated_any_favorite = user_item_matrix[:, my_rated_movies].toarray() > 0
mask = rated_any_favorite.any(axis=1)
relevant_users_matrix = user_item_matrix[mask, :]

# Cosine similarity between my rating vector (as a single row) and every kept user row
similarities = cosine_similarity(my_vector.reshape(1, -1), relevant_users_matrix).ravel()

# Attach the original user IDs and rank users from most to least similar
relevant_user_ids = user_ids[mask]
similar_users_df = pd.DataFrame(
    {'userId': relevant_user_ids, 'similarity': similarities}
).sort_values(by='similarity', ascending=False)

print("Top 10 most similar users:")
print(similar_users_df.head(10))


Top 10 most similar users:
        userId  similarity
91849   287289    0.707107
95629   298958    0.707107
7669     23827    0.706129
37065   116239    0.706129
15977    49700    0.702782
65456   204175    0.670820
35946   112579    0.670820
100077  312811    0.670820
25411    79619    0.636396
103402  323307    0.615457


In [9]:
# Recommendation

# Number of most-similar users whose ratings feed the recommendations
TOP_N_USERS = 25
top_users = similar_users_df.head(TOP_N_USERS)['userId'].values

top_users_ratings = filtered_data[filtered_data['userId'].isin(top_users)]

# Remove movies already rated
my_rated_movie_ids = [movie_title_to_id[title] for title in my_favorites if title in movie_title_to_id]
top_users_ratings = top_users_ratings[~top_users_ratings['movieId'].isin(my_rated_movie_ids)]

# Attach each neighbour's similarity to every one of their rating rows
top_users_ratings = top_users_ratings.merge(similar_users_df[['userId', 'similarity']], on='userId', how='left')

# Compute weighted rating: rating * similarity
top_users_ratings['weighted_rating'] = top_users_ratings['rating'] * top_users_ratings['similarity']

# Aggregate by movieId: sum(weighted_rating) / sum(similarity).
# Grouped sums replace groupby(...).apply(lambda ...): same formula, but no
# Python-level call per movie and no pandas warning about apply operating on
# the grouping column.
score_parts = top_users_ratings.groupby('movieId')[['weighted_rating', 'similarity']].sum()
recommendation_scores = (
    score_parts['weighted_rating'] / score_parts['similarity']
).sort_values(ascending=False)
# NOTE(review): a movie rated by a single neighbour scores exactly that
# neighbour's rating, so one 5-star rating is enough to reach the top of the list.
# Consider requiring a minimum number of raters.

# Map back to movie titles. Naming the series up front avoids relying on the
# implicit integer column label 0 after reset_index().
recommendations = (
    recommendation_scores
    .rename('score out of 5')
    .round(2)
    .reset_index()
    .merge(movies[['movieId', 'title']], on='movieId')
    [['title', 'score out of 5']]
)

# Fix titles (The, A, An)
def fix_title(title):
    """Move a trailing article back to the front of a movie title.

    Titles that store the article after the name, e.g.
    ``"Shawshank Redemption, The (1994)"``, are rewritten as
    ``"The Shawshank Redemption (1994)"``. Handles "The", "A" and "An".

    Only the first ``", <article>"`` that is directly followed by `` (`` or by
    the end of the string is moved. A comma followed by a word that merely
    starts with "A" (``", And ..."``, ``", An"`` when looking for "A") is left
    alone, and titles without such a suffix are returned unchanged.

    Parameters
    ----------
    title : str
        Title string to normalise.

    Returns
    -------
    str
        The title with the article moved to the front, or the input unchanged.
    """
    import re  # local import: `re` is not imported elsewhere in this notebook

    # Group 1: shortest leading text before ", <article>"; group 2: the article.
    # The lookahead requires the article to be a whole word ending the name part.
    return re.sub(r'^(.+?), (The|An|A)(?= \(|$)', r'\2 \1', title, count=1)

# Rewrite every title so a trailing article is moved back to the front
recommendations['title'] = recommendations['title'].map(fix_title)

print("Top 25 recommended movies for you:")
print(recommendations.iloc[:25])


Top 25 recommended movies for you:
                                                title  score out of 5
0   Howl's Moving Castle (Hauru no ugoku shiro) (2...            5.00
1                                    Inception (2010)            5.00
2                                     Parasite (2019)            5.00
3                                      Shrek 2 (2004)            5.00
4                            Good Will Hunting (1997)            4.76
5                     The Shawshank Redemption (1994)            4.63
6                             Schindler's List (1993)            4.52
7                                 Forrest Gump (1994)            4.50
8   Kiki's Delivery Service (Majo no takkyûbin) (1...            4.50
9                                The Big Short (2015)            4.50
10                                  The Batman (2022)            4.50
11                                Pulp Fiction (1994)            4.48
12                                  The Matrix (1999)  

  recommendation_scores = top_users_ratings.groupby('movieId').apply(


In [10]:
import pandas as pd

# Read CSV, loading only the columns that are kept (the rest is never parsed)
RATING_COLUMNS = ['userId', 'movieId', 'rating']
ratings = pd.read_csv("ratings.csv", usecols=RATING_COLUMNS)[RATING_COLUMNS]

# Optional: convert category columns to string.
# Done on the freshly loaded frame *before* slicing, so no column is ever
# assigned on an iloc slice of another DataFrame (chained-assignment hazard).
for col in ratings.select_dtypes(['category']).columns:
    ratings[col] = ratings[col].astype(str)

# Split in half by row position
mid_index = len(ratings) // 2
ratings1 = ratings.iloc[:mid_index]
ratings2 = ratings.iloc[mid_index:]

# Save as separate Parquet files
ratings1.to_parquet("ratings1.parquet", engine="fastparquet", index=False)
ratings2.to_parquet("ratings2.parquet", engine="fastparquet", index=False)

print("Ratings split and saved successfully!")


Ratings split and saved successfully!


In [22]:
import os
# Load the merged table from the current working directory.
# NOTE(review): no cell visible in this notebook writes "merged.parquet" (only
# ratings1.parquet / ratings2.parquet are saved) — confirm where this file is
# produced so the notebook can be re-run from a fresh kernel.
parquet_path = "merged.parquet"
data_par = pd.read_parquet(path=parquet_path, engine="fastparquet")

In [23]:
data_par.head(n=5)  # Preview the first five rows to check the load worked

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,1,110,4.0,Braveheart (1995)
2,1,158,4.0,Casper (1995)
3,1,260,4.5,Star Wars: Episode IV - A New Hope (1977)
4,1,356,5.0,Forrest Gump (1994)
