In [2]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import zipfile
import os
%matplotlib inline

In [3]:
# Define the path to the zip file and the directory to extract to
zip_file = 'datasets.zip'

# Extract the contents of the zip file into the extraction directory
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall('.')

print('Extraction complete.')

Extraction complete.


In [4]:
# Storing the movie information into a pandas dataframe
movies_df = pd.read_csv("datasets/movies.csv")

# Storing the user information into a pandas dataframe
ratings_df = pd.read_csv("datasets/ratings.csv")

print(movies_df.shape)
movies_df.head()

(34208, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Remove year from title column and store it in a new column

# Using regular expressions to find a year stored between parentheses
# We specify the parantheses so we don't conflict with movies that have years in their titles
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))',expand=False)

# Removing the parentheses from year column
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)',expand=False)

# Removing the years from the 'title' column
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '', regex=True)

# Applying the strip function to get rid of any ending whitespace characters that may have appeared
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [6]:
# Drop the genres column since we won't need it for collaboration based recommendation system.
movies_df.drop(columns=['genres'], inplace=True)
movies_df.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [7]:
print(ratings_df.shape)
ratings_df.head()

(22884377, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [8]:
# Drop review timestamp column as we don't need that information
ratings_df.drop(columns=['timestamp'], inplace=True)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


Collaborative Filtering

This technique we're going to take a look at is called Collaborative Filtering, which is also known as User-User Filtering. This technique uses other users to recommend items to the input user. It attempts to find users that have similar preferences and opinions as the input and then recommends items that they have liked to the input. There are several methods of finding similar users (Even some making use of Machine Learning), and the one we will be using here is going to be based on the Pearson Correlation Function.

The process for creating a User Based recommendation system is as follows:

1. Select a user with the movies the user has watched
2. Based on the user's rating to movies, find the top X neighbours
3. Get the watched movie record of the user for each neighbour.
4. Calculate a similarity score using a formula
5. Recommend the items with the highest score

In [9]:
# Let's begin by creating an input user to recommend movies to:
userInput = [
            {'title':'Breakfast Club, The', 'rating':5},
            {'title':'Toy Story', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5}
         ] 
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [10]:
# Add movieId to input user
# extract the input movies's ID's from the movies dataframe and add them into it movieId

# We can achieve this by first filtering out the rows that contain the input movies' title and then merging this subset with the input dataframe. 
# We also drop unnecessary columns for the input to save memory space.

# Filtering out the movies by title
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]

# Then merging it so we can get the movieId. It's merging it based on title.
inputMovies = inputMovies.merge(inputId, on='title')

# Dropping information we won't use from the input dataframe
inputMovies.drop(columns=['year'], inplace=True)

inputMovies

Unnamed: 0,title,rating,movieId
0,"Breakfast Club, The",5.0,1968
1,Toy Story,3.5,1
2,Jumanji,2.0,2
3,Pulp Fiction,5.0,296
4,Akira,4.5,1274


In [11]:
# The users who has seen the same movies
# Now with the movie ID's in our input, we can now get the subset of users that have watched and reviewed the movies in our input

# Filtering out users that have watched movies that the input has watched and storing it
userRatingSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
userRatingSubset.head(10)

Unnamed: 0,userId,movieId,rating
19,4,296,4.0
441,12,1968,3.0
479,13,2,2.0
531,13,1274,5.0
681,14,296,2.0
749,15,1,4.0
776,15,296,3.0
911,15,1968,3.0
1247,17,1,5.0
1248,17,2,3.0


In [12]:
# We now group up the rows by user ID.

# Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userRatingSubsetGroup = userRatingSubset.groupby(['userId'])

# let's look at one of the users, e.g. the one with userID=1130
print(userRatingSubsetGroup.get_group(1130))
print(userRatingSubsetGroup.get_group(1131))

        userId  movieId  rating
104167    1130        1     0.5
104168    1130        2     4.0
104214    1130      296     4.0
104363    1130     1274     4.5
104443    1130     1968     4.5
        userId  movieId  rating
105169    1131      296     4.0


In [13]:
# Let's also sort these groups so the users that share the most movies in common with the input have higher priority. 
# This provides a richer recommendation since we won't go through every single user.

# Sorting it so users with movie most in common with the input will have priority
# Iterating over a groupby object will give tuples of size 2, first index is the grouped key, 2nd index is the dataframe associated with that grouped key
userRatingSubsetGroup = sorted(userRatingSubsetGroup,  key=lambda x: len(x[1]), reverse=True)
userRatingSubsetGroup[0:4]

[(75,
        userId  movieId  rating
  7507      75        1     5.0
  7508      75        2     3.5
  7540      75      296     5.0
  7633      75     1274     4.5
  7673      75     1968     5.0),
 (106,
        userId  movieId  rating
  9083     106        1     2.5
  9084     106        2     3.0
  9115     106      296     3.5
  9198     106     1274     3.0
  9238     106     1968     3.5),
 (686,
         userId  movieId  rating
  61336     686        1     4.0
  61337     686        2     3.0
  61377     686      296     4.0
  61478     686     1274     4.0
  61569     686     1968     5.0),
 (815,
         userId  movieId  rating
  73747     815        1     4.5
  73748     815        2     3.0
  73922     815      296     5.0
  74362     815     1274     3.0
  74678     815     1968     4.5)]

Similarity of users to input user

Next, we are going to compare all users (not really all !!!) to our specified user and find the one that is most similar.
We're going to find out how similar each user is to the input through the Pearson Correlation Coefficient. 
It is used to measure the strength of a linear association between two variables.

Pearson's correlation coefficient = covariance(X, Y) / (std_dev(X) * std_dev(Y))

cov(X, Y) = (sum (x - mean(X)) * (y - mean(Y)) ) /(n-1)

Why Pearson Correlation?

Pearson correlation is invariant to scaling, i.e. multiplying all elements by a nonzero constant or adding any constant to all elements. For example, if you have two vectors X and Y,then, pearson(X, Y) == pearson(X, 2 * Y + 3). This is a pretty important property in recommendation systems because for example two users might rate two series of items totally different in terms of absolute rates, but they would be similar users (i.e. with similar ideas) with similar rates in various scales .

The values given by the formula vary from r = -1 to r = 1, where 1 forms a direct correlation between the two entities (it means a perfect positive correlation) and -1 forms a perfect negative correlation.

In our case, a 1 means that the two users have similar tastes while a -1 means the opposite.

We will select a subset of users to iterate through. This limit is imposed because we don't want to waste too much time going through every single user.

In [14]:
userRatingSubsetGroup = userRatingSubsetGroup[0:100]

In [15]:
from scipy.stats import pearsonr

# Sort inputMovies by movieId
inputMovies = inputMovies.sort_values(by='movieId')

# Initialize pearsonCorrelationDict
pearsonCorrelationDict = {}

# For every user group in our subset
for userId, group in userRatingSubsetGroup:
    # Sort the group by movieId
    group = group.sort_values(by='movieId')
    
    # Get common movies between inputMovies and current user group
    common_input_movies = inputMovies[inputMovies['movieId'].isin(group['movieId'])]
    
    # Get rating vectors for common movies
    input_ratings = common_input_movies['rating'].values
    group_ratings = group[group['movieId'].isin(common_input_movies['movieId'])]['rating'].values
    
    # Calculate Pearson correlation coefficient
    correlation_coefficient, _ = pearsonr(input_ratings, group_ratings)
    
    # Store Pearson correlation coefficient in pearsonCorrelationDict
    pearsonCorrelationDict[userId] = correlation_coefficient

In [16]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.827278,75
1,0.586009,106
2,0.83205,686
3,0.576557,815
4,0.943456,1040


In [17]:
# The top X similar users to input user
# Now let's get the top 50 users that are most similar to the input.

topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False).iloc[:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
64,0.961678,12325
55,0.961538,10707
34,0.961538,6207
67,0.960769,13053
4,0.943456,1040


In [18]:
# Now, let's start recommending movies to the input user.

# Rating of selected users to all movies

# We're going to do this by taking the weighted average of the ratings of the movies using the Pearson Correlation as the weight. 

# To do this, we first need to get the movies watched by the users in our pearsonDF from the ratings dataframe and then store their correlation in a new column called similarityIndex". This is achieved below by merging of these two tables.

topUsersRating = topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')

# Now all we need to do is simply multiply the movie rating by its weight (The similarity index), then sum up the new ratings and divide it by the sum of the weights.
# We can easily do this by simply multiplying two columns, then grouping up the dataframe by movieId and then dividing two columns:
# It shows the idea of all similar users to candidate movies for the input user:

# Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex'] * topUsersRating['rating']

topUsersRating.head(10)

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,0.961678,12325,1,3.5,3.365874
1,0.961678,12325,2,1.5,1.442517
2,0.961678,12325,3,3.0,2.885035
3,0.961678,12325,5,0.5,0.480839
4,0.961678,12325,6,2.5,2.404196
5,0.961678,12325,7,3.0,2.885035
6,0.961678,12325,10,3.0,2.885035
7,0.961678,12325,11,2.5,2.404196
8,0.961678,12325,17,4.0,3.846713
9,0.961678,12325,19,1.0,0.961678


In [19]:
# Group by movieId and sum up the weighted ratings and similarity indices
tempTopUsersRating = topUsersRating.groupby('movieId').agg({'similarityIndex': 'sum', 'weightedRating': 'sum'})
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']

# Calculate the weighted average recommendation score
recommendation_df = pd.DataFrame()
recommendation_df['weighted_avg_recommendation_score'] = tempTopUsersRating['sum_weightedRating'] / tempTopUsersRating['sum_similarityIndex']
recommendation_df.head()

Unnamed: 0_level_0,weighted_avg_recommendation_score
movieId,Unnamed: 1_level_1
1,3.668955
2,2.518658
3,2.657941
4,3.0
5,2.316058


In [20]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

# Now we take the weighted average
recommendation_df['weighted_avg_recommendation_score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted_avg_recommendation_score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.668955,1
2,2.518658,2
3,2.657941,3
4,3.0,4
5,2.316058,5


In [21]:
# Now let's sort it and see the top 10 movies that the algorithm recommended!

recommendation_df = recommendation_df.sort_values(by='weighted_avg_recommendation_score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted_avg_recommendation_score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
27266,5.0,27266
3067,5.0,3067
26801,5.0,26801
3406,5.0,3406
1902,5.0,1902
6660,5.0,6660
6668,5.0,6668
121,5.0,121
3851,5.0,3851
90531,5.0,90531


In [22]:
movies_df.loc[movies_df['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title,year
119,121,"Boys of St. Vincent, The",1992
1819,1902,Dream for an Insomniac,1996
2981,3067,Women on the Verge of a Nervous Breakdown (Muj...,1988
3319,3406,Captain Horatio Hornblower R.N.,1951
3759,3851,I'm the One That I Want,2000
6551,6660,"Red Shoes, The",1948
6559,6668,"Road Home, The (Wo de fu qin mu qin)",1999
9064,26801,Dragon Inn (Sun lung moon hak chan),1992
9260,27266,2046,2004
18106,90531,Shame,2011


## Advantages and Disadvantages of Collaborative Filtering

### Advantages
* Takes other user's ratings into consideration
* Doesn't need to study or extract information from the recommended item
* Adapts to the user's interests which might change over time

### Disadvantages
* Approximation function can be slow
* There might be a low of amount of users to approximate
* Privacy issues when trying to learn the user's preferences