Step 1: Install required libraries

pip install scikit-surprise

pip install pandas


Step 2: Import required libraries and load the dataset


In [25]:
# Import relevant libraries for creating and evaluating recommendation system models using Surprise package 
import pandas as pd                          
from surprise import Dataset, Reader, SVD         
from surprise.model_selection import cross_validate   

# Read the three CSV tables (movie metadata, user ratings, external-id links)
# in the same order as before and bind each one to its own DataFrame
movies, ratings, links = (
    pd.read_csv(csv_path)
    for csv_path in ('data/movies.csv', 'data/ratings.csv', 'data/links.csv')
)


Step 3: Prepare the data for collaborative filtering

Using the scikit-surprise library, we'll wrap the ratings in a Dataset object. No explicit train/test split is made here; the data is split later by the 5-fold cross-validation in Step 4.



In [2]:
# Tell Surprise the bounds of the rating scale: 0.5 (lowest) to 5 (highest)
rating_scale = (0.5, 5)
reader = Reader(rating_scale=rating_scale)

# Build the Surprise Dataset from the ratings DataFrame, handing
# load_from_df() the user id, movie id and rating columns (in that order)
rating_columns = ['userId', 'movieId', 'rating']
data = Dataset.load_from_df(ratings[rating_columns], reader)


Merge the data sets

In [None]:
# Drop rows without a 'tmdbId' and store the remaining ids as integers.
# assign() builds a new DataFrame instead of writing a column into the frame
# returned by dropna(), which can raise pandas' SettingWithCopyWarning.
links = (
    links.dropna(subset=['tmdbId'])
         .assign(tmdbId=lambda frame: frame['tmdbId'].astype(int))
)

# Attach the link columns to the movie metadata. merge() defaults to an inner
# join, so movies without a remaining row in 'links' are dropped here.
# NOTE: this rebinds 'movies'; running the cell a second time merges 'links'
# again and yields suffixed duplicate columns (e.g. tmdbId_x / tmdbId_y).
movies = movies.merge(links, on='movieId')

# One row per rating, carrying the metadata of the rated movie
movies_with_ratings = movies.merge(ratings, on='movieId')


Step 4: Build a collaborative filtering model

We'll use the SVD algorithm to make movie recommendations based on collaborative filtering.

In [3]:
# Create the SVD model with the library's default hyper-parameters.
svd = SVD()

# Evaluate the model with 5-fold cross-validation.
# RMSE (Root Mean Squared Error) and MAE (Mean Absolute Error) are reported,
# and verbose=True prints the per-fold table while it runs.
cv_results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# cross_validate() is an evaluation tool: it only ever fits on the training
# part of each fold. The later svd.predict() calls need a model trained on
# every available rating, so fit it explicitly on the full training set.
svd.fit(data.build_full_trainset())

# Keep the results as the cell's last expression so a notebook still shows them
cv_results


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7777  0.7779  0.7773  0.7778  0.7777  0.7777  0.0002  
MAE (testset)     0.5866  0.5869  0.5865  0.5869  0.5865  0.5867  0.0002  
Fit time          294.01  5609.21 501.17  4167.51 3482.13 2810.81 2087.63 
Test time         952.00  4490.50 608.83  3210.54 6241.48 3100.67 2127.65 


{'test_rmse': array([0.77766929, 0.77790203, 0.77733974, 0.77776941, 0.77765096]),
 'test_mae': array([0.5865782 , 0.58685871, 0.58649324, 0.58687172, 0.58646588]),
 'fit_time': (294.00930404663086,
  5609.209953069687,
  501.1718888282776,
  4167.514079093933,
  3482.1269397735596),
 'test_time': (952.0028901100159,
  4490.501410961151,
  608.8305280208588,
  3210.5446338653564,
  6241.480406761169)}

Step 5: Prepare the data for content-based filtering
To perform content-based filtering, we need to transform the movie genres into a feature vector using the TF-IDF approach.


In [4]:
# Import the necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Treat a missing genre entry as empty text so every row can be vectorised
genre_text = movies['genres'].fillna('')
movies['genres'] = genre_text

# TF-IDF vectorizer that discards common English stop words ('the', 'and', ...)
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the genre text and encode each movie as one row
# of a sparse TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(genre_text)

# Pairwise genre similarity between all movies. TfidfVectorizer L2-normalises
# its rows by default, so this plain dot product is the cosine similarity.
# NOTE: the result is a dense (movies x movies) array - watch memory on large
# catalogues.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


Step 6: Create a function for content-based recommendations
This function takes a movie title as input and returns the top n similar movies based on the genre.

In [5]:
# Map each movie title to the row *position* of that movie in 'movies'.
# Positions (not index labels) are stored because the lookup result is used
# to pick a row of 'cosine_sim' and with 'movies[...].iloc'.
indices = pd.Series(range(len(movies)), index=movies['title'])

# Keep only the first row for a repeated title so a lookup always yields a
# single position. (Series.drop_duplicates() would compare the values - the
# already-unique positions - and therefore never remove a repeated title.)
indices = indices[~indices.index.duplicated(keep='first')]

# Define a function for generating content-based recommendations
def content_based_recommendations(title, n=10):
    """Return the titles of the ``n`` movies most similar in genre to ``title``.

    Uses the module-level ``indices`` (title -> row position), ``cosine_sim``
    (pairwise similarity matrix) and ``movies`` objects built in earlier steps.

    Args:
        title: Movie title exactly as it appears in ``movies['title']``.
        n: Maximum number of recommendations to return (default 10).
            Values below 1 give an empty result.

    Returns:
        A pandas Series of movie titles ordered from most to least similar.
        The row that was looked up for ``title`` is never part of the result.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    # Obtain the row position of the movie that matches the provided title
    index = indices[title]

    # A title shared by several movies makes the lookup return a Series of
    # positions instead of a single value; fall back to the first match.
    if isinstance(index, pd.Series):
        index = index.iloc[0]

    # Pair every *other* movie with its similarity to the query movie. The
    # query is skipped by position rather than by assuming it sorts first:
    # movies with identical genres tie with it at the top score, so it is not
    # guaranteed to be the first element after sorting.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[index])
        if position != index
    ]

    # Highest similarity first (the sort is stable, so ties keep row order)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the best 'n' matches; max() guards against negative n
    movie_indices = [position for position, _ in sim_scores[:max(n, 0)]]

    # Return the titles of the selected movies using their positions
    return movies['title'].iloc[movie_indices]


Step 7: Combine collaborative and content-based filtering
Create a hybrid recommendation function that takes a user ID and movie title as input, and outputs a list of top n recommended movies.

In [35]:
# Define a hybrid recommendation function that combines content-based and collaborative filtering approaches
def hybrid_recommendations(user_id, title, n=10, candidate_pool=None):
    """Recommend movies similar to ``title``, ranked by the user's predicted rating.

    Candidates come from ``content_based_recommendations``; each candidate
    found in ``movies_with_ratings`` is then scored with ``svd.predict`` for
    ``user_id`` and the best-scoring titles are returned.

    Args:
        user_id: Raw user id passed to ``svd.predict``.
        title: Movie title the content-based candidates are derived from.
        n: Maximum number of titles to return (default 10).
        candidate_pool: Number of content-based candidates to score. ``None``
            (the default) scores exactly ``n`` candidates, so the predicted
            ratings only reorder them; a larger value lets the predicted
            ratings choose the best ``n`` out of a wider pool.

    Returns:
        A pandas Series with up to ``n`` movie titles, highest predicted
        rating first. Fewer titles are returned when some candidates do not
        appear in ``movies_with_ratings``. The index labels carry no meaning.
    """
    # Never score fewer candidates than the number of titles requested
    pool_size = n if candidate_pool is None else max(candidate_pool, n)

    # Content-based candidates, one entry per distinct title
    candidate_titles = content_based_recommendations(title, pool_size).drop_duplicates()

    # Look up one movieId per candidate title. Filtering and de-duplicating
    # *before* the join avoids materialising one row per rating of every
    # candidate movie just to throw those rows away again.
    is_candidate = movies_with_ratings['title'].isin(candidate_titles)
    title_to_movie = (
        movies_with_ratings.loc[is_candidate, ['title', 'movieId']]
        .drop_duplicates(subset=['title'], keep='first')
    )

    # Inner join: candidates missing from movies_with_ratings are dropped
    content_based = candidate_titles.to_frame(name='title').merge(title_to_movie, on='title')

    # Estimated rating of each candidate for this user. assign() returns a new
    # frame, which avoids writing a column into a possibly-copied frame.
    content_based = content_based.assign(
        est=content_based['movieId'].apply(lambda movie_id: svd.predict(user_id, movie_id).est)
    )

    # Sort the movies by their estimated ratings in descending order
    content_based = content_based.sort_values('est', ascending=False)

    # Return the top 'n' movie titles with the highest estimated ratings
    return content_based.head(n)['title']



Step 8: Testing the recommendation system

In [37]:
# Demo inputs: the target user, the seed movie, and how many titles to request
user_id = 1
title = 'Toy Story (1995)'
n = 10

# Ask the hybrid recommender for up to 'n' titles for this user and seed movie
recommendations = hybrid_recommendations(user_id, title, n)

# Materialise the returned titles as a plain Python list
recommendations_list = list(recommendations)

# Header line naming the user and the movie the suggestions are based on
print(f"Top {n} recommendations for User {user_id} who likes '{title}':")

# One numbered line per recommended title, with ranks counted from 1
for i, movie_title in enumerate(recommendations_list, 1):
    print(f"{i}. {movie_title}")


Top 10 recommendations for User 1 who likes 'Toy Story (1995)':
1. Traffic (2000)
2. Mighty, The (1998)
3. Godzilla vs. Destroyah (Gojira vs. Desutoroiâ) (1995) 
4. Daddy Long Legs (1919)
5. Stagecoach (1966)
6. Down in the Valley (2005)
7. Hell Up in Harlem (1973)
8. Hatchet for the Honeymoon (Rosso segno della follia, Il) (1970)
9. Airborne (1993)
10. The Alphabet Killer (2008)
