In [None]:
#Library Imports
import numpy as np
import pandas as pd
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

DATA PREPROCESSING



In [None]:
#Load the datasets
def _read_double_colon_table(path, column_names):
    """Read a header-less '::'-separated file and discard rows with missing values."""
    #engine='python' is requested explicitly because the separator is longer than one character
    table = pd.read_csv(path, sep='::', engine='python', header=None, names=column_names)
    return table.dropna()

datasetMovies = _read_double_colon_table('movies.csv', ['MovieID', 'Title', 'Genres'])
datasetRatings = _read_double_colon_table('ratings.csv', ['UserID', 'MovieID', 'Rating', 'Timestamp'])
datasetUsers = _read_double_colon_table('users.csv', ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])

Movie information is in the file "movies.csv" and is originally in the following format:

MovieID::Title::Genres

- Titles are identical to titles provided by the IMDB (including
year of release)
- Genres are pipe-separated and are selected from the following genres:

	* Action
	* Adventure
	* Animation
	* Children's
	* Comedy
	* Crime
	* Documentary
	* Drama
	* Fantasy
	* Film-Noir
	* Horror
	* Musical
	* Mystery
	* Romance
	* Sci-Fi
	* Thriller
	* War
	* Western

- Some MovieIDs do not correspond to a movie due to accidental duplicate
entries and/or test entries
- Movies are mostly entered by hand, so errors and inconsistencies may exist

In [None]:
#Report the table dimensions, then preview the first 5 rows of movies.csv
print("movies.csv shape:", datasetMovies.shape)
datasetMovies.head(n=5)

All ratings are contained in the file "ratings.csv" and are originally in the
following format:

UserID::MovieID::Rating::Timestamp

- UserIDs range between 1 and 6040
- MovieIDs range between 1 and 3952
- Ratings are made on a 5-star scale (whole-star ratings only)
- Timestamp is represented in seconds since the epoch as returned by time(2)
- Each user has at least 20 ratings

In [None]:
#Report the table dimensions, then preview the first 5 rows of ratings.csv
print("ratings.csv shape:", datasetRatings.shape)
datasetRatings.head(n=5)

User information is in the file "users.csv" and is originally in the following format:

UserID::Gender::Age::Occupation::Zip-code

All demographic information is provided voluntarily by the users and is
not checked for accuracy.  Only users who have provided some demographic
information are included in this data set.

- Gender is denoted by a "M" for male and "F" for female
- Age is chosen from the following ranges:

	*  1:  "Under 18"
	* 18:  "18-24"
	* 25:  "25-34"
	* 35:  "35-44"
	* 45:  "45-49"
	* 50:  "50-55"
	* 56:  "56+"

- Occupation is chosen from the following choices:

	*  0:  "other" or not specified
	*  1:  "academic/educator"
	*  2:  "artist"
	*  3:  "clerical/admin"
	*  4:  "college/grad student"
	*  5:  "customer service"
	*  6:  "doctor/health care"
	*  7:  "executive/managerial"
	*  8:  "farmer"
	*  9:  "homemaker"
	* 10:  "K-12 student"
	* 11:  "lawyer"
	* 12:  "programmer"
	* 13:  "retired"
	* 14:  "sales/marketing"
	* 15:  "scientist"
	* 16:  "self-employed"
	* 17:  "technician/engineer"
	* 18:  "tradesman/craftsman"
	* 19:  "unemployed"
	* 20:  "writer"

In [None]:
#Report the table dimensions, then preview the first 5 rows of users.csv
print("users.csv shape:", datasetUsers.shape)
datasetUsers.head(n=5)

In [None]:
#Give every movies column its intended dtype in a single pass
datasetMovies = datasetMovies.astype({'MovieID': 'int32', 'Title': 'str', 'Genres': 'str'})
print("movies.csv shape:", datasetMovies.shape)
#Preview the first 5 rows of movies.csv after the conversion
datasetMovies.head(n=5)

In [None]:
#Drop the unused Timestamp column and cast the remaining columns to compact dtypes.
#errors='ignore' keeps this cell re-runnable: running it a second time no longer fails
#because 'Timestamp' has already been removed.
datasetRatings = (
    datasetRatings
    .drop(columns='Timestamp', errors='ignore')
    .astype({'UserID': 'int32', 'MovieID': 'int32', 'Rating': 'float32'})
)
print("ratings.csv shape:", datasetRatings.shape)
datasetRatings.head() #display the first 5 rows of ratings.csv

In [None]:
#Drop the unused Zip-code column and cast the remaining columns to compact dtypes.
#errors='ignore' keeps this cell re-runnable: running it a second time no longer fails
#because 'Zip-code' has already been removed.
datasetUsers = (
    datasetUsers
    .drop(columns='Zip-code', errors='ignore')
    .astype({'UserID': 'int32', 'Gender': 'str', 'Age': 'int32', 'Occupation': 'int32'})
)
print("users.csv shape:", datasetUsers.shape)
datasetUsers.head() #display the first 5 rows of users.csv

In [None]:
#Attach the movie details (title, genres) to every rating.
#The join key is MovieID; rows with any missing value are dropped afterwards.
movieRatingsMerged = datasetMovies.merge(datasetRatings, on = 'MovieID')
movieRatingsMerged = movieRatingsMerged.dropna()
print("merged dataset shape:", movieRatingsMerged.shape)
#Preview the first 5 rows of the merged dataset
movieRatingsMerged.head(n=5)

In [None]:
#Attach the demographic columns of the rating's author to every movie/rating row.
#The join key is UserID; rows with any missing value are dropped afterwards.
datasetMerged = movieRatingsMerged.merge(datasetUsers, on = 'UserID')
datasetMerged = datasetMerged.dropna()
print("final merged dataset shape:", datasetMerged.shape)
#Display the first 5 rows of the final merged dataset
datasetMerged.head(n=5)

Dataset size reduction

In [None]:
#Need to reduce the size of the dataset - kernel will keep crashing otherwise
#Lets only include reviews from active users
#Minimum number of ratings a user must have submitted to be kept; lower it to keep more users
MIN_RATINGS_PER_USER = 700
user_count = datasetMerged['UserID'].value_counts() #number of ratings submitted by each user
active_users = user_count[user_count >= MIN_RATINGS_PER_USER].index
datasetReduced = datasetMerged[datasetMerged['UserID'].isin(active_users)]
datasetReduced = datasetReduced.dropna()
print("reduced dataset shape:", datasetReduced.shape)
datasetReduced.head()

Create a mapping from MovieID to title (to print titles in later outputs)

In [None]:
#Create a mapping from MovieID to title (to print titles in later outputs)
movie_mapping = dict(zip(datasetMovies['MovieID'], datasetMovies['Title']))
#Show only the size and a short preview; printing the whole dictionary floods the cell output
print(f"movie_mapping holds {len(movie_mapping)} titles; first 5 entries:")
print(dict(list(movie_mapping.items())[:5]))

CONTENT-BASED FILTERING

Working with Genres

In [None]:
#Goal - prepare Genres for vectorisation (WORKING WITH datasetMovies ONLY in order to avoid duplicate movies from merged dataset)
#Turn the pipe-separated genre list into a space-separated one so each genre becomes its own token.
#regex=False makes '|' an explicit literal character (as a regular expression '|' means alternation),
#so the result does not depend on the pandas version's default for `regex`.
datasetMovies['Genres'] = datasetMovies['Genres'].str.replace('|', ' ', regex=False)
print("movies dataset shape:", datasetMovies.shape)
datasetMovies.head() #display the first 5 rows of the movies dataset

TF-IDF Vectorization

In [None]:
#TF-IDF Vectorization
#Convert each movie's genres into a numeric vector.
#use_idf=False switches the IDF weighting off, so these are term-frequency vectors
#(normalised by the vectoriser's default norm) rather than full TF-IDF weights.
#token_pattern treats every run of characters other than whitespace or '|' as ONE token, so
#genres such as "Sci-Fi", "Film-Noir" and "Children's" stay single features instead of being
#split into fragments ("Sci" + "Fi", "Children" + "s") that would count those genres twice.
tfidf = TfidfVectorizer(token_pattern=r"[^\s|]+", lowercase=False, use_idf=False)
tfidf_matrix = tfidf.fit_transform(datasetMovies['Genres'])
print(pd.DataFrame(tfidf_matrix.toarray(), index=datasetMovies['Title'], columns=tfidf.get_feature_names_out()))

In [None]:
#Compute the cosine similarity between every pair of movie genre vectors.
#tfidf_matrix has one row per row of datasetMovies, so cosine_sim[i][j] compares the
#i-th and j-th movies by row position.
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
#Content-Based Filtering
def _select_movie_title(datasetMovies, num_reviews):
    """Ask which movie the user wants to rate and return its title, or None if none was obtained."""
    if num_reviews == 0:
        choice = input("You haven't reviewed any movies yet. Would you like to search for a movie to watch? (yes/no): ").strip().lower()
        if choice == 'no':
            print("Recommending a random movie for you...")
            #unique() keeps the file order, so the pick is reproducible under a fixed random seed
            eligible_titles = list(datasetMovies['Title'].unique())
            if not eligible_titles:  #Safety check
                print("❌ No movies are available to recommend.")
                return None
            movie_title = random.choice(eligible_titles)
            print(f"\nYour first recommended movie is: {movie_title} (Random Pick!)")
            return movie_title
        if choice != 'yes':
            print("Invalid input. Please try again.")
            return None

    #Either a returning user, or a first-time user who chose to search
    movie_title = input("Enter the name of the movie that you'd like to watch: ")
    if movie_title not in datasetMovies['Title'].values:
        print("❌ Movie not found in the dataset.")
        return None
    return movie_title


def _store_rating(user_id, movie_title, rating, datasetMovies, datasetReduced):
    """Update the user's rating for the movie if one exists, otherwise append it; return the frame."""
    #The MovieID comes from the full catalogue: datasetReduced only holds movies that somebody
    #in it has rated, so a lookup there fails for any other valid title.
    movie_rows = datasetMovies.loc[datasetMovies['Title'] == movie_title]
    movie_id = movie_rows['MovieID'].values[0]

    mask = (datasetReduced['UserID'] == user_id) & (datasetReduced['MovieID'] == movie_id)
    if mask.any():
        #User has already rated this movie, so we update the rating
        datasetReduced.loc[mask, 'Rating'] = rating
        print(f"\n✅ Your rating for '{movie_title}' has been updated to {rating}/5.0")
        return datasetReduced

    #Prefer the Genres text already stored in datasetReduced so the column stays in one format;
    #fall back to the catalogue when the movie is new to datasetReduced.
    stored_genres = datasetReduced.loc[datasetReduced['MovieID'] == movie_id, 'Genres']
    genres = stored_genres.values[0] if not stored_genres.empty else movie_rows['Genres'].values[0]

    #User has not rated the movie yet, so we append a new entry.
    #Columns of datasetReduced that are not listed here are left empty (NaN) for the new row.
    new_entry = pd.DataFrame({
        'UserID': [user_id],
        'MovieID': [movie_id],
        'Title': [movie_title],
        'Genres': [genres],
        'Rating': [rating]
    })
    datasetReduced = pd.concat([datasetReduced, new_entry], ignore_index=True)
    print(f"\n✅ Your rating has been recorded successfully!\nUser {user_id} rated '{movie_title}' with a score of {rating}/5.0")
    return datasetReduced


def _print_similar_movies(movie_title, datasetMovies, cosine_sim, top_n):
    """Print the top_n movies most similar to movie_title, plus an occasional random bonus pick."""
    titles = datasetMovies['Title'].tolist()
    #Row POSITION of the movie, not its index label: this assumes row i of cosine_sim belongs to
    #the i-th row of datasetMovies, and labels stop matching positions once rows were dropped.
    idx = titles.index(movie_title)
    #Sort movies by similarity score (descending); ties keep their original order
    sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    #Exclude the selected movie, then take only the top N recommendations
    sim_scores = [pair for pair in sim_scores if titles[pair[0]] != movie_title][:top_n]
    recommended_movies = [titles[position] for position, _ in sim_scores]
    similarity_scores = [score for _, score in sim_scores]

    #Display recommendations to the user
    print("\nHere are the top {0} recommendations for {1}:".format(top_n, movie_title))
    for i, (movie, score) in enumerate(zip(recommended_movies, similarity_scores), start=1):
        print(f"{i}. {movie}: {score:.4f}")

    #🎲 10% chance to show an extra random recommendation
    if random.random() < 0.1:
        excluded_titles = set(recommended_movies + [movie_title])
        eligible_titles = [title for title in dict.fromkeys(titles) if title not in excluded_titles]
        if eligible_titles:  #Safety check
            bonus_movie = random.choice(eligible_titles)
            print(f"\n🎲 Bonus Recommendation (Random Pick): {bonus_movie}")


def content_based_model(user_id, datasetMovies, datasetReduced, cosine_sim, num_reviews, top_n=10):
    """Interactively record one rating and print genre-based recommendations for that movie.

    A user with no reviews (num_reviews == 0) may either search for a title or accept a random
    one; any other user is asked for a title directly. The chosen movie is then rated and the
    movies most similar to it according to cosine_sim are printed.

    Parameters:
        user_id: ID stored in the 'UserID' column for the rating.
        datasetMovies: movie catalogue with 'MovieID', 'Title' and 'Genres' columns.
        datasetReduced: ratings frame with at least 'UserID', 'MovieID', 'Title', 'Genres'
            and 'Rating' columns. An existing rating is updated in place.
        cosine_sim: square similarity matrix; row/column i is assumed to describe the i-th
            row of datasetMovies.
        num_reviews: number of ratings the user has already submitted.
        top_n: how many similar movies to print.

    Returns:
        The ratings frame: a new frame when a rating was appended, otherwise the frame that
        was passed in. The frame is returned unchanged when no valid title was obtained or
        the rating was outside 0.0-5.0.
    """
    movie_title = _select_movie_title(datasetMovies, num_reviews)
    if movie_title is None:
        return datasetReduced

    #Ask the user to rate this movie
    try:
        rating = float(input(f"Enter your rating for '{movie_title}' (0.0-5.0): "))
    except ValueError:
        #A non-numeric answer is not stored, but the recommendations below are still shown
        print("❌ Invalid input. Please enter a number.")
        rating = None

    if rating is not None:
        #Written as a range test so that NaN (which passes `< 0 or > 5`) is rejected too
        if not 0 <= rating <= 5:
            print("❌ Invalid rating. Please enter a number between 0.0 and 5.0.")
            return datasetReduced
        datasetReduced = _store_rating(user_id, movie_title, rating, datasetMovies, datasetReduced)

    #Get recommendations based on content similarity
    _print_similar_movies(movie_title, datasetMovies, cosine_sim, top_n)

    return datasetReduced  #Return updated dataframe

COLLABORATIVE-BASED FILTERING

In [None]:
def collaborative_based_model(user_id, datasetReduced, movie_mapping, n_neighbors=10):
    """Recommend movies rated highly by the users most similar to user_id and record one rating.

    Ratings are pivoted into a user x movie matrix, centred on each user's mean rating, and the
    nearest users by cosine distance are looked up. Movies the user has not rated are ranked by
    their mean rating among those neighbours (printed as the "Predicted Rating"). The user then
    picks one of the listed movies and rates it.

    Parameters:
        user_id: ID of the user to recommend for; must appear in datasetReduced['UserID'].
        datasetReduced: ratings frame with at least 'UserID', 'MovieID', 'Title', 'Genres' and
            'Rating' columns, holding at most one row per (UserID, MovieID) pair.
            An existing rating is updated in place.
        movie_mapping: dict mapping MovieID to title.
        n_neighbors: maximum number of similar users to use.

    Returns:
        The ratings frame: a new frame when a rating was appended, otherwise the frame that was
        passed in. This includes the unknown-user case, because callers assign the return value
        back to their ratings frame.
    """
    #Check the user first: nothing below is meaningful without their ratings, and returning
    #anything other than the frame would make the caller lose its data.
    if user_id not in datasetReduced['UserID'].values:
        print(f"User {user_id} is not found in the dataset.")
        return datasetReduced

    #User-Item Interaction Matrix, centred per user; unrated movies become 0 after centring
    user_item_matrix = datasetReduced.pivot(index='UserID', columns='MovieID', values='Rating')
    mean_user_rating = user_item_matrix.mean(axis=1)
    normalized_user_item_matrix = user_item_matrix.sub(mean_user_rating, axis=0).fillna(0)

    #Fit KNN model
    model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
    model_knn.fit(normalized_user_item_matrix)

    #One extra neighbour is requested because the user's own row is normally returned as well.
    #The request is capped at the number of users, since kneighbors rejects larger values.
    neighbors_to_query = min(n_neighbors + 1, len(normalized_user_item_matrix))
    input_vector = normalized_user_item_matrix.loc[user_id].values.reshape(1, -1)
    distances, indices = model_knn.kneighbors(input_vector, n_neighbors=neighbors_to_query)

    #Remove the user by ID instead of assuming they are the first result
    candidate_ids = normalized_user_item_matrix.index[indices.flatten()]
    candidate_distances = distances.flatten()
    is_other_user = candidate_ids != user_id
    similar_user_ids = candidate_ids[is_other_user][:n_neighbors]
    similar_distances = candidate_distances[is_other_user][:n_neighbors]

    print(f"Most similar users to User {user_id} based on cosine similarity:")
    for neighbor_id, distance in zip(similar_user_ids, similar_distances):
        print(f"User ID: {neighbor_id}, Similarity Distance: {distance:.4f}")

    #Rank movies by their mean rating among the similar users
    recommended_movies = datasetReduced[datasetReduced['UserID'].isin(similar_user_ids)]
    top_movies = recommended_movies.groupby('MovieID')['Rating'].mean().sort_values(ascending=False)

    already_rated = datasetReduced[datasetReduced['UserID'] == user_id]['MovieID'].values
    top_unseen_movies = top_movies.loc[~top_movies.index.isin(already_rated)].head(10)

    recommendations = [(movie_mapping.get(movie_id, "Unknown Movie"), rating, movie_id) for movie_id, rating in top_unseen_movies.items()]

    print(f"\nRecommendations for User {user_id} based on similar users:\n")
    for i, (title, rating, _) in enumerate(recommendations, start=1):
        print(f"{i}: {title} (Predicted Rating: {rating:.2f})")

    #🎲 10% chance of showing a bonus recommendation
    bonus_movie = None
    if random.random() <= 0.1:
        all_movie_ids = datasetReduced['MovieID'].unique()
        #sorted() gives a stable order so the pick is reproducible under a fixed random seed
        unrated_movie_ids = sorted(set(all_movie_ids) - set(already_rated))
        if unrated_movie_ids:
            bonus_movie_id = random.choice(unrated_movie_ids)
            bonus_title = movie_mapping.get(bonus_movie_id, "Unknown Movie")
            bonus_movie = (bonus_title, bonus_movie_id)
            print(f"\n🎲 Bonus Recommendation (Random Pick): {bonus_title}")
            print(f"{len(recommendations)+1}: {bonus_title} (Bonus)")

    #Let the user choose which movie to rate
    total_choices = len(recommendations) + (1 if bonus_movie else 0)
    if total_choices == 0:
        #Nothing to pick from, so do not prompt for a number in the empty range "1-0"
        print("No recommendations are available for this user yet.")
        return datasetReduced
    movie_choice = input(f"\nSelect a movie to watch by entering its number (1-{total_choices}): ")

    try:
        movie_choice = int(movie_choice) - 1
        if movie_choice < 0 or movie_choice >= total_choices:
            print("❌ Invalid choice. Please select a valid number.")
            return datasetReduced

        #Determine if it's a bonus movie
        if bonus_movie and movie_choice == len(recommendations):
            selected_movie_title, movie_id = bonus_movie
        else:
            selected_movie_title, _, movie_id = recommendations[movie_choice]

        #Prompt for rating
        rating = float(input(f"Enter your rating for '{selected_movie_title}' (0.0 - 5.0): "))
        #Written as a range test so that NaN (which passes `< 0 or > 5`) is rejected too
        if not 0 <= rating <= 5:
            print("❌ Invalid rating. Please enter a number between 0.0 and 5.0.")
            return datasetReduced

        #🔁 Update rating if exists, otherwise append
        mask = (datasetReduced['UserID'] == user_id) & (datasetReduced['MovieID'] == movie_id)
        if datasetReduced[mask].empty:
            #Columns of datasetReduced that are not listed here are left empty (NaN) for the new row
            new_entry = pd.DataFrame({
                'UserID': [user_id],
                'MovieID': [movie_id],
                'Title': [selected_movie_title],
                'Genres': [datasetReduced[datasetReduced['MovieID'] == movie_id]['Genres'].values[0]],
                'Rating': [rating]
            })
            datasetReduced = pd.concat([datasetReduced, new_entry], ignore_index=True)
        else:
            datasetReduced.loc[mask, 'Rating'] = rating

        print(f"\n✅ Your rating has been recorded successfully!\nUser {user_id} rated '{selected_movie_title}' with a score of {rating}/5.0")

    except ValueError:
        #Raised by int()/float() when the typed choice or rating is not a number
        print("❌ Invalid input. Please enter a valid number.")

    return datasetReduced #Return updated dataframe

USER INTERFACE

In [None]:
def get_recommendations_for_user(user_id, datasetReduced, content_based_model, collaborative_based_model):
    """Route the user to one of the two recommenders based on how many ratings they have.

    Users with fewer than 5 rows in datasetReduced go to content_based_model; everyone else goes
    to collaborative_based_model. The frame returned by the chosen model is returned as-is.
    The notebook-level names datasetMovies, cosine_sim and movie_mapping are read here and
    passed on to the models.
    """
    #Count the reviews this user has submitted so far
    num_reviews = len(datasetReduced.loc[datasetReduced['UserID'] == user_id])

    if num_reviews >= 5:
        print("Using Collaborative-Based Filtering...")
        return collaborative_based_model(user_id, datasetReduced, movie_mapping)

    print("Using Content-Based Filtering...")
    return content_based_model(user_id, datasetMovies, datasetReduced, cosine_sim, num_reviews)


In [None]:
#Asks for your user ID, if you don't have one, it creates a new user ID for you
#Ask the user for their user ID
raw_user_id = input("Enter your User ID (or press Enter if new): ").strip()

#Parse the answer as an integer so that `user_id` always has the same numeric type as the
#UserID column. Keeping the typed text would make every later
#`datasetReduced['UserID'] == user_id` comparison fail, so a returning user would be
#treated as having no ratings at all.
try:
    entered_id = int(raw_user_id)
except ValueError:
    entered_id = None  #empty or non-numeric answer -> treated as a new user

#Check if the user ID exists in the dataframe
if entered_id is not None and datasetReduced['UserID'].isin([entered_id]).any():
    user_id = np.int32(entered_id)
    print(f"✅ Welcome back, User {user_id}!")
else:
    #Generate a new user ID: one past the largest ID currently in datasetReduced
    new_user_id = datasetReduced['UserID'].max() + 1
    user_id = np.int32(new_user_id)
    print(f"🆕 New user detected. Assigning User ID: {user_id}")

GET RECOMMENDATIONS

In [None]:
#Run the recommender for the current user and keep the returned frame,
#which carries any rating entered during this run.
datasetReduced = get_recommendations_for_user(user_id, datasetReduced, content_based_model, collaborative_based_model)

TESTING AND DEBUGGING

In [None]:
#For testing: newly recorded ratings are appended at the end of datasetReduced,
#so the tail shows whether the latest rating was stored.
print(datasetReduced.tail(10))  #View the last 10 rows
#Optional checks - uncomment the one you need:
#print(datasetReduced[datasetReduced["UserID"] == user_id]) #Check to see the movies rated by the current user
#print(datasetReduced[datasetReduced["Title"] == "Toy Story (1995)"]) #Check to see if a specific movie was added
#print(datasetReduced.info())  #Shows column types and number of entries

In [None]:
#For Testing
#Lookup User Ratings by UserID
def get_user_ratings(user_id, datasetReduced):
    """Return the 'MovieID' and 'Rating' columns of every row submitted by user_id."""
    belongs_to_user = datasetReduced['UserID'] == user_id
    wanted_columns = ['MovieID', 'Rating']
    #Row filter and column selection in a single .loc lookup
    return datasetReduced.loc[belongs_to_user, wanted_columns]

In [None]:
#For Testing (same as above but returns Titles instead of Movie IDs)
def get_user_ratings_with_titles(user_id, datasetReduced, movie_mapping):
    """Return the 'Title' and 'Rating' of every row submitted by user_id.

    Titles are looked up in movie_mapping by MovieID; a MovieID missing from the mapping gets
    a missing (NaN) title. Two debugging lines are printed: whether the user was found and
    how many ratings received a title. datasetReduced itself is not modified.
    """
    #.copy() detaches the filtered rows from datasetReduced, so writing the 'Title' column
    #below neither triggers SettingWithCopyWarning nor risks touching the caller's frame.
    user_ratings = datasetReduced[datasetReduced['UserID'] == user_id].copy()

    #Debugging: Check if the UserID exists in the dataset
    print(f"UserID {user_id} found in dataset: {not user_ratings.empty}")

    #Map the MovieID to movie titles using the movie_mapping dictionary
    user_ratings['Title'] = user_ratings['MovieID'].map(movie_mapping)

    #Debugging: Check if any MovieID couldn't be mapped
    print(f"Number of ratings with valid titles: {user_ratings['Title'].notna().sum()}")

    #Return the user ratings with movie titles instead of MovieID
    return user_ratings[['Title', 'Rating']]

In [None]:
#For Testing the above 2 functions
#A dedicated variable is used so that this cell does not overwrite the session's `user_id`,
#which the GET RECOMMENDATIONS cell reads whenever it is re-run.
debug_user_id = 6037  #Replace with the UserID that you want to look up (e.g. 4510)
#user_ratings = get_user_ratings(debug_user_id, datasetReduced)
user_ratings = get_user_ratings_with_titles(debug_user_id, datasetReduced, movie_mapping)
#Print the user's ratings for each movie
print(user_ratings)

In [None]:
#For Testing
#Looking up all the common rated movies between our user and one of his similar neighbors
def get_common_rated_movies(target_user_id, lookup_user_id, datasetReduced, movie_mapping):
    """Return the movies rated by both users, with each user's rating side by side.

    The result has the columns 'Title' (looked up in movie_mapping by MovieID),
    'Your_Rating' (target user) and 'Similar_Users_Rating' (lookup user).
    """
    rating_columns = ['MovieID', 'Rating']

    #Each user's ratings, reduced to the two columns needed for the comparison
    is_target = datasetReduced['UserID'] == target_user_id
    is_lookup = datasetReduced['UserID'] == lookup_user_id
    target_user_ratings = datasetReduced[is_target].copy()[rating_columns]
    lookup_user_ratings = datasetReduced[is_lookup].copy()[rating_columns]

    #Inner join keeps only the movies present on both sides
    common_movies = target_user_ratings.merge(
        lookup_user_ratings,
        on='MovieID',
        how='inner',
        suffixes=('_target', '_lookup'),
    )

    #Replace the numeric MovieID with a readable title
    common_movies['Title'] = common_movies['MovieID'].map(movie_mapping)

    #Give the two rating columns reader-friendly names
    readable_names = {
        'Rating_target': 'Your_Rating',
        'Rating_lookup': 'Similar_Users_Rating'
    }
    common_movies = common_movies.rename(columns=readable_names)

    return common_movies[['Title', 'Your_Rating', 'Similar_Users_Rating']]

In [None]:
#For Testing the above function
target_user_id = 6037  #Replace with the target UserID
lookup_user_id = 4725  #Replace with the UserID that you are looking up

#Arguments are passed by name so the two user IDs cannot be swapped by accident
common_movies = get_common_rated_movies(
    target_user_id=target_user_id,
    lookup_user_id=lookup_user_id,
    datasetReduced=datasetReduced,
    movie_mapping=movie_mapping,
)

#Print the common movies with their ratings
print(common_movies)