In [1]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import zipfile
import os
%matplotlib inline

In [2]:
# Define the path to the zip file and the directory to extract to
zip_file = 'datasets.zip'

# Extract the contents of the zip file into the extraction directory
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall('.')

print('Extraction complete.')

Extraction complete.


In [3]:
# Storing the movie information into a pandas dataframe
movies_df = pd.read_csv("datasets/movies.csv")

# Storing the user information into a pandas dataframe
ratings_df = pd.read_csv("datasets/ratings.csv")

print(movies_df.shape)
movies_df.head()

(34208, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Remove year from title column and store it in a new column

# Using regular expressions to find a year stored between parentheses
# We specify the parantheses so we don't conflict with movies that have years in their titles
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))',expand=False)

# Removing the parentheses from year column
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)',expand=False)

# Removing the years from the 'title' column
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '', regex=True)

# Applying the strip function to get rid of any ending whitespace characters that may have appeared
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [5]:
# Apply one-hot encoding to the 'genres' column
one_hot_genres_df = movies_df['genres'].str.get_dummies(sep='|')
one_hot_genres_df.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# Copy the original DataFrame to avoid modifying it directly
moviesWithGenres_df = movies_df.copy()

# Join DataFrames based on the corresponding indices
moviesWithGenres_df = moviesWithGenres_df.join(one_hot_genres_df)

# Drop the original 'genres' column
moviesWithGenres_df.drop(columns=['genres'], inplace=True)

moviesWithGenres_df.head()

Unnamed: 0,movieId,title,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,1995,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# let's look at the ratings dataframe.
print(ratings_df.shape)
ratings_df.head()

(22884377, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [8]:
# Drop review timestamp column as we don't need that information
ratings_df.drop(columns=['timestamp'], inplace=True)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


# Content-Based recommendation system
Now, let's take a look at how to implement Content-Based or Item-Item recommendation systems. This technique attempts to figure out what a user's favourite aspects of an item is, and then recommends items that present those aspects. In our case, we're going to try to figure out the input's favorite genres from the movies and ratings given.

In [9]:
# Let's begin by creating an input user to recommend movies to:

# Notice: To add more movies, simply increase the amount of elements in the userInput. Just be sure to write it in with capital letters and if a movie starts with a "The", like "The Matrix" then write it in like this: 'Matrix, The' .

userInput = [
            {'title':'Breakfast Club, The', 'rating':5},
            {'title':'Toy Story', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5}
         ] 
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [10]:
# Add movieId to input user - extract the input movies' ID's from the movies dataframe and add them into it movieId

# We can achieve this by first filtering out the rows that contain the input movies' title and then merging this subset with the input dataframe. 
# We also drop unnecessary columns for the input to save memory space.

# Filtering out the movies by title
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]

# Then merging it so we can get the movieId. It's merging based on title.
inputMovies = inputMovies.merge(inputId, on='title')

inputMovies.head()

Unnamed: 0,title,rating,movieId,genres,year
0,"Breakfast Club, The",5.0,1968,Comedy|Drama,1985
1,Toy Story,3.5,1,Adventure|Animation|Children|Comedy|Fantasy,1995
2,Jumanji,2.0,2,Adventure|Children|Fantasy,1995
3,Pulp Fiction,5.0,296,Comedy|Crime|Drama|Thriller,1994
4,Akira,4.5,1274,Action|Adventure|Animation|Sci-Fi,1988


In [11]:
# We're going to start by learning the input user's preferences, so let's get the subset of movies that the input user has watched from the Dataframe containing genres defined with binary values.

# Filtering out the movies from the input
userMovies = moviesWithGenres_df[moviesWithGenres_df['movieId'].isin(inputMovies['movieId'].tolist())]
userMovies.head()

Unnamed: 0,movieId,title,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
293,296,Pulp Fiction,1994,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
1246,1274,Akira,1988,0,1,1,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1885,1968,"Breakfast Club, The",1985,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# We'll only need the actual genre table, so let's clean this up a bit by resetting the index and dropping the movieId, title, and year columns.

# Resetting the index to avoid future issues
userMovies = userMovies.reset_index(drop=True)

# Dropping unnecessary issues due to save memory and to avoid issues
userGenreTable = userMovies.drop(columns=['year', 'movieId', 'title'])
userGenreTable.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0
3,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [13]:
# Now we're ready to start learning the input user's preferences!

# To do this, we're going to turn each genre into weights. We can do this by using the input's reviews and multiplying them into the input's genre table and then summing up the resulting table by column. This operation is actually a dot product between a matrix and a vector, so we can simply accomplish by calling Pandas' "dot" function.

inputMovies['rating']

0    5.0
1    3.5
2    2.0
3    5.0
4    4.5
Name: rating, dtype: float64

In [14]:
# Dot product to get weights
userProfile = userGenreTable.transpose().dot(inputMovies['rating'])

# The user profile
userProfile

(no genres listed)     0.0
Action                 5.0
Adventure             13.5
Animation             10.0
Children               8.5
Comedy                11.5
Crime                  2.0
Documentary            0.0
Drama                  6.5
Fantasy                8.5
Film-Noir              0.0
Horror                 0.0
IMAX                   0.0
Musical                0.0
Mystery                0.0
Romance                0.0
Sci-Fi                 5.0
Thriller               2.0
War                    0.0
Western                0.0
dtype: float64

In [15]:
# Now, we have the weights for every of the user's preferences. This is known as the User Profile. 
# Using this, we can recommend movies that satisfy the user's preferences.
# Let's start by extracting the genre table from the original dataframe:

# Let's get the genres of every movie in our original dataframe
genreTable = moviesWithGenres_df.set_index(moviesWithGenres_df['movieId'])

# And drop the unnecessary information
genreTable = genreTable.drop(columns=['year', 'movieId', 'title'])
print(genreTable.shape)
genreTable.head()

(34208, 20)


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
# With the input's profile and the complete list of movies and their genres in hand, we're going to take the weighted average of every movie based on the input profile and recommend the top twenty movies that most satisfy it.

# Multiply the one hot encoded genres by the weights and then take the weighted average
recommendationTable_df = ((genreTable*userProfile).sum(axis=1))/(userProfile.sum())
recommendationTable_df.head()

movieId
1    0.717241
2    0.420690
3    0.158621
4    0.248276
5    0.158621
dtype: float64

In [17]:
# Sort our recommendations in descending order
recommendationTable_df = recommendationTable_df.sort_values(ascending=False)
recommendationTable_df.head()

movieId
26093     0.806897
673       0.786207
130520    0.786207
108932    0.786207
32031     0.786207
dtype: float64

In [18]:
# The final recommendation table!
movies_df.loc[movies_df['movieId'].isin(recommendationTable_df.head(20).keys())]

Unnamed: 0,movieId,title,genres,year
542,546,Super Mario Bros.,Action|Adventure|Children|Comedy|Fantasy|Sci-Fi,1993
664,673,Space Jam,Adventure|Animation|Children|Comedy|Fantasy|Sc...,1996
2902,2987,Who Framed Roger Rabbit?,Adventure|Animation|Children|Comedy|Crime|Fant...,1988
3028,3114,Toy Story 2,Adventure|Animation|Children|Comedy|Fantasy,1999
3664,3754,"Adventures of Rocky and Bullwinkle, The",Adventure|Animation|Children|Comedy|Fantasy,2000
8605,26093,"Wonderful World of the Brothers Grimm, The",Adventure|Animation|Children|Comedy|Drama|Fant...,1962
8783,26340,"Twelve Tasks of Asterix, The (Les douze travau...",Action|Adventure|Animation|Children|Comedy|Fan...,1976
9296,27344,Revolutionary Girl Utena: Adolescence of Utena...,Action|Adventure|Animation|Comedy|Drama|Fantas...,1999
9825,32031,Robots,Adventure|Animation|Children|Comedy|Fantasy|Sc...,2005
10575,40339,Chicken Little,Action|Adventure|Animation|Children|Comedy|Sci-Fi,2005


## Advantages and Disadvantages of Content-Based Filtering

### Advantages
* Learns user's preferences
* Highly personalized for the user

### Disadvantages
* Doesn't take into account what others think of the item, so low quality item recommendations might happen
* Extracting data is not always intuitive
* Determining what characteristics of the item the user dislikes or likes is not always obvious