In [1]:
# Load the libraries
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore") 

# Data Acquisition and Loading

In [2]:
DATA_DIR = "data/"

# read a csv file and create a dataframe
ratings_df = pd.read_csv(DATA_DIR+"ratings.csv")
movies_df = pd.read_csv(DATA_DIR+"movies.csv")

In [4]:
# Display the top 5 rows of a dataframe
print("First five rows of the dataset:")
ratings_df.head()

First five rows of the dataset:


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
print("Dimensions of the ratings data frame:", ratings_df.shape)

Dimensions of the ratings data frame: (100836, 4)


In [6]:
# Display the top 5 rows of a dataframe
print("First five rows of the dataset:")
movies_df.head()

First five rows of the dataset:


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
print("Dimensions of the movies data frame:", movies_df.shape)

Dimensions of the movies data frame: (9742, 3)


# Approach 1 - Item-Based Collaborative Filtering

Item-based collaborative filtering recommends movies based on the similarity between items(movies). The idea is that if a user likes a particular movie, they will also like similar movies.

### Item-User Matrix:
- Created a matrix where each row represents a movie and each column represents a user. The values in the matrix are the ratings given by users to movies. Missing ratings are filled with zeros.

### Item Similarity:
- Calculated the cosine similarity between movies based on user ratings. This gives a measure of how similar each pair of movies is.
    
### Recommendation:
- For a given movie, found the most similar moivies based on the similarity scores. The movies with the highest similarity scores are recommended.


In [9]:
# Create an item-user matrix
item_user_matrix = ratings_df.pivot(index = "movieId",
                                   columns = "userId",
                                   values = "rating").fillna(0)

In [10]:
item_user_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Calculate item similarity matrix
item_similarity = cosine_similarity(item_user_matrix)

In [12]:
# FUnction to recommend movies based on item similarity
def recommend_movies_item_based(movie_title, movies_df, item_user_matrix, item_similarity, n_recommendations = 10):
    
    # Get the index of the movie that matches the title
    movie_idx = movies_df[movies_df["title"] == movie_title].index[0]
    
    # Get the similarity scores for the movie
    sim_scores = item_similarity[movie_idx]
    
    # Get the indices of the most similar movies
    similar_movies = np.argsort(sim_scores)[::-1][1:n_recommendations+1]
    
    # Return the titles of the top n most similar movies
    return movies_df["title"].iloc[similar_movies].tolist()

In [14]:
def get_movie_input():
    """Gets movie input from the user, handling potential errors."""
    while True:
        try:
            input_movie = input("\n\nEnter a movie title (or 'exit' to quit): ")
            if input_movie.lower() == 'exit':
                return None  # Signal to exit the loop
            return input_movie
        except Exception as e:  # Catch potential errors (e.g., incorrect input)
            print(f"Invalid input: {e}")


while True:
    input_movie = get_movie_input()

    if input_movie is None:  # User entered 'exit'
        break

    try:
        recommended_movies_item_based = recommend_movies_item_based(input_movie, movies_df, item_user_matrix, item_similarity)

        if recommended_movies_item_based:  # Check if recommendations were returned
            print(f"Item-based recommended Top 10 movies for '{input_movie}':")
            print("\n".join(recommended_movies_item_based))
        else:
            print(f"No recommendations found for '{input_movie}'.  Perhaps the movie title is incorrect or not in the dataset.")

    except Exception as e:  # Catch potential errors during recommendation calculation
        print(f"An error occurred during recommendation: {e}")



Enter a movie title (or 'exit' to quit): Jurassic Park (1993)
Item-based recommended Top 10 movies for 'Jurassic Park (1993)':
Terminator 2: Judgment Day (1991)
Forrest Gump (1994)
Braveheart (1995)
Fugitive, The (1993)
Speed (1994)
Batman (1989)
Independence Day (a.k.a. ID4) (1996)
Apollo 13 (1995)
True Lies (1994)
Lion King, The (1994)


Enter a movie title (or 'exit' to quit): Father of the Bride Part II (1995)
Item-based recommended Top 10 movies for 'Father of the Bride Part II (1995)':
Sabrina (1995)
Juror, The (1996)
Striptease (1996)
Mr. Holland's Opus (1995)
Grumpier Old Men (1995)
Miracle on 34th Street (1994)
Sgt. Bilko (1996)
Twister (1996)
Willy Wonka & the Chocolate Factory (1971)
Tin Cup (1996)


Enter a movie title (or 'exit' to quit): Darkest Hour (2017)
Item-based recommended Top 10 movies for 'Darkest Hour (2017)':
Lynne Koplitz: Hormonal Beast (2017)
Ernest & Célestine (Ernest et Célestine) (2012)
Darkest Hour (2017)
Mudbound (2017)
English Vinglish (2012)
Ghost i

# Approach 2 - Combining Collaborative Filtering with Content-Based Features

To improve the recommendation system, combined collaborative filtering with content-based features like genres. This hypbrid approach can provide more nuanced recommendations by considering both user preferences and movie content.

### Item-User Matrix:
- Created a matrix where each row represents a movie and each column represents a user. The values in the matrix are the ratings given by users to movies. Missing ratings are filled with zeros.

### Genre Vectorization:
- Used TF-IDF to convert the genres into a numerical format, capturing the importance of each genre.

### Combined Similarity:
- Calculated two similarity matrices: One based on user ratings and another based on genres. Then combine these matrices by averaging them.

### Recommendation:
- For a given movie, found the most similar movies based on the combined simlilarity scores.

This hybrid approach leverages both collaborating filtering and content-based filtering, providing more comprehensive recommendations by considering both user preferences and movie content.

In [16]:
# Align the data
common_movie_ids = set(ratings_df['movieId']).intersection(set(movies_df['movieId']))
aligned_ratings_df = ratings_df[ratings_df['movieId'].isin(common_movie_ids)]
aligned_movies_df = movies_df[movies_df['movieId'].isin(common_movie_ids)]

In [17]:
# Create an item-user matrix
item_user_matrix_v2 = aligned_ratings_df.pivot(index = "movieId",
                                   columns = "userId",
                                   values = "rating").fillna(0)

In [18]:
# Calculate item similarity matrix based on user ratings
item_similarity_ratings = cosine_similarity(item_user_matrix_v2)

In [19]:
# Vectorize the genres
tfidf = TfidfVectorizer(token_pattern=r'[^|]+')
tfidf_matrix = tfidf.fit_transform(aligned_movies_df['genres'])

In [20]:
# Calculate item similarity matrix based on user ratings
item_similarity_genres = cosine_similarity(tfidf_matrix)

In [21]:
# Combine the two similarity matrices
combined_similarity = (item_similarity_ratings + item_similarity_genres) / 2

In [22]:
# Function to recommend movies based on combined similarity
def recommend_movies_combined(movie_title, movies_df, combined_similarity, n_recommendations = 10):
    
    # Get the index of the movie that matches the title
    movie_idx = movies_df[movies_df["title"] == movie_title].index[0]
    
    # Get the similarity scores for the movie
    sim_scores = combined_similarity[movie_idx]
    
    # Get the indices of the most similar movies
    similar_movies = np.argsort(sim_scores)[::-1][1:n_recommendations+1]
    
    # Return the titles of the top n most similar movies
    return movies_df["title"].iloc[similar_movies].tolist()

In [23]:
while True:
    input_movie = get_movie_input()

    if input_movie is None:  # User entered 'exit'
        break

    try:
        recommended_movies_combined = recommend_movies_combined(input_movie, aligned_movies_df, combined_similarity)

        if recommended_movies_item_based:  # Check if recommendations were returned
            print(f"Item-based recommended Top 10 movies for '{input_movie}':")
            print("\n".join(recommended_movies_combined))
        else:
            print(f"No recommendations found for '{input_movie}'.  Perhaps the movie title is incorrect or not in the dataset.")

    except Exception as e:  # Catch potential errors during recommendation calculation
        print(f"An error occurred during recommendation: {e}")



Enter a movie title (or 'exit' to quit): Jurassic Park (1993)
Item-based recommended Top 10 movies for 'Jurassic Park (1993)':
Independence Day (a.k.a. ID4) (1996)
Star Wars: Episode VI - Return of the Jedi (1983)
Total Recall (1990)
Stargate (1994)
Star Wars: Episode V - The Empire Strikes Back (1980)
Star Wars: Episode IV - A New Hope (1977)
Terminator 2: Judgment Day (1991)
Waterworld (1995)
Lost World: Jurassic Park, The (1997)
Spider-Man (2002)


Enter a movie title (or 'exit' to quit): Father of the Bride Part II (1995)
Item-based recommended Top 10 movies for 'Father of the Bride Part II (1995)':
Sgt. Bilko (1996)
Birdcage, The (1996)
Multiplicity (1996)
Happy Gilmore (1996)
Ace Ventura: Pet Detective (1994)
Ace Ventura: When Nature Calls (1995)
Bio-Dome (1996)
Down Periscope (1996)
Father of the Bride (1991)
Billy Madison (1995)


Enter a movie title (or 'exit' to quit): Darkest Hour (2017)
Item-based recommended Top 10 movies for 'Darkest Hour (2017)':
Bill Burr: Why Do I Do

# References

#### 1. Collaborative Filtering:
- https://arxiv.org/abs/1912.08932


#### 2. Content-Based Filtering:
- https://www.ibm.com/think/topics/content-based-filtering

#### 3. Hybrid Recommender Systems:
- https://link.springer.com/article/10.1023/A:1021240730564
