In [1]:
! pip install pandas
import numpy as np # linear algebra
import pandas as pd # data processing
from math import sqrt
import matplotlib.pyplot as plt
%matplotlib inline



In [31]:
#Load the two CSV files this notebook works from: the user ratings and the movie metadata.
rating = pd.read_csv('ratings_small.csv')
movie = pd.read_csv('movie.csv')

In [32]:
#Preview the first five rows of the movie table
movie.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
#Preview the first five rows of the ratings table
rating.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [6]:
#First we create the new 'year' column in the movie dataframe: capture the four digits
#inside the first "(YYYY)" group of the title (NaN when a title carries no such group).
#The pattern is a raw string so that "\(" and "\d" reach the regex engine unchanged instead
#of being flagged as invalid string escapes by the Python compiler.
movie['year'] = movie.title.str.extract(r'\((\d{4})\)', expand=False)

In [7]:
#Now removing the "(YYYY)" year from the 'title' column.
#regex=True must be passed explicitly: pandas >= 2.0 treats the pattern as a literal string
#by default, which would silently leave the year in every title.
#The raw string keeps "\(" and "\d" from being read as (invalid) string escapes.
movie['title'] = movie.title.str.replace(r'\(\d{4}\)', '', regex=True)
#Strip the whitespace left behind by the removal; .str.strip() also tolerates missing titles,
#unlike calling x.strip() on every element.
movie['title'] = movie['title'].str.strip()

  movie['title'] = movie.title.str.replace('(\(\d\d\d\d\))', '')


In [8]:
#Check the result: first five rows with the cleaned title and the new year column
movie.head(5)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [9]:
#Collaborative filtering doesn't recommend based on the features of the movie.
#The recommendation is based on the likes and dislikes (ratings) of the neighbours, i.e. other users.
#So we drop the genres column, since it is not used.
#Reassigning with errors='ignore' (instead of an in-place drop) keeps this cell re-runnable:
#a second run finds no 'genres' column and simply leaves the frame unchanged rather than raising KeyError.
movie = movie.drop(columns=['genres'], errors='ignore')
movie.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [10]:
#Now, coming to the ratings dataframe: it shares the movieId column with the movie dataframe,
#and each user has given multiple ratings for different movies.
#The timestamp column is not needed for the recommendation system, so we drop it.
#Reassigning with errors='ignore' (instead of an in-place drop) keeps this cell re-runnable:
#a second run finds no 'timestamp' column and leaves the frame unchanged rather than raising KeyError.
rating = rating.drop(columns=['timestamp'], errors='ignore')
rating.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [11]:
#A user-based recommender starts from the ratings of the person we recommend for.
#Each entry pairs a movie title with the rating this input user gave it.
user = [
    dict(title=movie_title, rating=given_rating)
    for movie_title, given_rating in (
        ('Breakfast Club, The', 4),
        ('Toy Story', 2.5),
        ('Jumanji', 3),
        ('Pulp Fiction', 4.5),
        ('Akira', 5),
    )
]
inputMovie = pd.DataFrame(user, columns=['title', 'rating'])
inputMovie

Unnamed: 0,title,rating
0,"Breakfast Club, The",4.0
1,Toy Story,2.5
2,Jumanji,3.0
3,Pulp Fiction,4.5
4,Akira,5.0


In [12]:
#Filtering out the movies by title
Id = movie[movie['title'].isin(inputMovie['title'])]
#Then merging so each input rating gets its movieId. With no `on=` argument pandas joins on
#the columns the two frames have in common (here: 'title').
#NOTE(review): titles may not be unique once the year has been stripped, in which case one
#input title would match several movieIds -- confirm against the data.
inputMovie = pd.merge(Id, inputMovie)
#Dropping information we won't use from the input dataframe.
#The keyword form is required: the positional axis argument of drop('year', 1) was removed
#in pandas 2.0 and raises TypeError there.
inputMovie = inputMovie.drop(columns='year')
inputMovie

  inputMovie = inputMovie.drop('year', 1)


Unnamed: 0,movieId,title,rating
0,1,Toy Story,2.5
1,2,Jumanji,3.0
2,296,Pulp Fiction,4.5
3,1274,Akira,5.0
4,1968,"Breakfast Club, The",4.0


In [13]:
#With the movieIds of the input in hand, keep only the rating rows that refer to one of
#those movies: this is the subset of users who have rated at least one of the input's movies.
users = rating.loc[rating['movieId'].isin(inputMovie['movieId'])]
users.head()

Unnamed: 0,userId,movieId,rating
49,2,296,4.0
100,3,296,4.5
156,4,296,5.0
255,4,1968,5.0
389,5,1968,4.0


In [14]:
#Same filter as the previous cell, recomputed: keep the rating rows whose movieId is one of
#the movies the input user has rated.
users = rating[rating['movieId'].isin(list(inputMovie['movieId']))]
users.head()

Unnamed: 0,userId,movieId,rating
49,2,296,4.0
100,3,296,4.5
156,4,296,5.0
255,4,1968,5.0
389,5,1968,4.0


In [15]:
#(number of rows, number of columns) of the filtered ratings subset
users.shape

(829, 3)

In [16]:
#Groupby creates one sub dataframe per distinct value of the column given as the parameter.
#The column name is passed as a plain string, not as the one-element list ['userId']: with a
#list, newer pandas versions yield 1-tuple keys such as (73,) when the groups are iterated,
#and those tuples would end up as the "userId" values in the later similarity table.
userSubsetGroup = users.groupby('userId')

In [17]:
#Showing one such group as an example: all the rating rows of a particular userId
userSubsetGroup.get_group(110)

Unnamed: 0,userId,movieId,rating
16910,110,296,5.0


In [18]:
#Order the (userId, rows) pairs so that users sharing the most movies with the input come first.
userSubsetGroup = list(userSubsetGroup)
userSubsetGroup.sort(key=lambda pair: len(pair[1]), reverse=True)

  userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)


In [19]:
#Look at the first three (userId, rows) pairs
userSubsetGroup[:3]

[(73,
         userId  movieId  rating
  10214      73        1     5.0
  10215      73        2     2.5
  10281      73      296     5.0
  10450      73     1274     4.5
  10557      73     1968     4.5),
 (285,
         userId  movieId  rating
  38828     285        1     4.0
  38829     285        2     3.0
  38870     285      296     4.0
  38982     285     1274     3.0
  39071     285     1968     3.0),
 (561,
         userId  movieId  rating
  82117     561        1     3.0
  82118     561        2     3.0
  82140     561      296     4.5
  82201     561     1274     3.5
  82234     561     1968     4.0)]

In [20]:
#Similarity of users to the input user: next, every candidate user is compared with the
#input user through the Pearson Correlation Coefficient, which measures the strength of
#the linear association between two variables.
#Only the first 100 entries of the list are kept for that comparison.
userSubsetGroup = userSubsetGroup[:100]

In [21]:
def _pearson(xs, ys):
    """Return the Pearson correlation of two paired rating lists, or 0.0 when undefined.

    xs -- the input user's ratings for the shared movies
    ys -- the other user's ratings for the same movies, in the same order

    Uses the computational form Sxy / sqrt(Sxx * Syy), where n is the number of ratings
    in ys. The correlation is undefined when either list has no variance (or is empty);
    0.0 is returned in that case.
    """
    n = float(len(ys))
    if n == 0:
        return 0.0
    sum_x, sum_y = sum(xs), sum(ys)
    Sxx = sum(x**2 for x in xs) - sum_x**2/n
    Syy = sum(y**2 for y in ys) - sum_y**2/n
    Sxy = sum(x*y for x, y in zip(xs, ys)) - sum_x*sum_y/n
    #Test with "> 0" rather than "!= 0": floating point rounding can leave a tiny negative
    #residue where the exact value is zero, and that would make sqrt() fail or return a
    #meaningless coefficient.
    if Sxx > 0 and Syy > 0:
        return Sxy/sqrt(Sxx*Syy)
    return 0.0

#Sort the input once, before the loop, so its ratings line up by movieId with each sorted group
inputMovie = inputMovie.sort_values(by='movieId')

#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:
    #Sort the current user's rows the same way as the input so the values aren't mixed up
    group = group.sort_values(by='movieId')
    #Get the input's review scores for the movies that they both have in common
    #NOTE(review): the pairing assumes one input row and one group row per shared movieId -- confirm
    temp = inputMovie[inputMovie['movieId'].isin(group['movieId'])]
    #Correlate the input's ratings (x) with the current user's ratings (y)
    pearsonCorDict[name] = _pearson(temp['rating'].tolist(), group['rating'].tolist())

In [22]:
#Inspect the stored (userId, correlation coefficient) pairs
pearsonCorDict.items()

dict_items([(73, 0.3023255813953484), (285, -0.26413527189768443), (561, 0.6842472173461585), (564, 0.8364283610093444), (580, 0.5114957546028552), (15, 0.9737289911202953), (19, 0.8043996665398437), (30, 0.5803810000880093), (48, 0.07312724241271307), (69, -0.48686449556014766), (77, -0.5669467095138409), (119, 0.9899494936611665), (134, 0.5720775535473553), (149, 0.6446583712203042), (157, 0.6931032800836721), (177, 0), (185, -0.6859943405700353), (187, 0.6414269805898185), (212, 0.9486832980505138), (262, 0.8436614877321075), (268, 0.36514837167011077), (292, 0.8520128672302585), (306, 0.7302967433402214), (355, 0.8868440532177395), (428, 0.3651483716701107), (442, 0.7627700713964739), (466, -0.38138503569823695), (468, 0.0), (472, -0.5345224838248488), (480, -0.9486832980505138), (518, 0.3651483716701107), (607, 0.3), (624, 0.0), (654, 0.2893456933022473), (47, -0.9707253433941461), (56, 0.2773500981126172), (72, 0.5765566601970545), (78, 0.7857142857142853), (88, 0), (92, -0.24019

In [23]:
#Turn the {userId: coefficient} dictionary into a two-column table with a plain 0..n-1 index
pearsonDF = pd.DataFrame.from_dict(pearsonCorDict, orient='index', columns=['similarityIndex'])
pearsonDF = pearsonDF.assign(userId=pearsonDF.index).reset_index(drop=True)
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.302326,73
1,-0.264135,285
2,0.684247,561
3,0.836428,564
4,0.511496,580


In [24]:
#Keep the 50 rows with the highest similarity to the input user
topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False).iloc[:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
99,1.0,59
95,1.0,23
11,0.989949,119
78,0.981981,514
71,0.980316,457


In [25]:
#Attach every rating given by the selected users (all movies, not only the input's)
topUsersRating = pd.merge(topUsers, rating, on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,1.0,59,11,2.5
1,1.0,59,32,4.5
2,1.0,59,50,5.0
3,1.0,59,111,2.5
4,1.0,59,150,4.0


In [26]:
#Weight each rating by how similar its author is to the input user
topUsersRating['weightedRating'] = topUsersRating['similarityIndex'].mul(topUsersRating['rating'])
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,59,11,2.5,2.5
1,1.0,59,32,4.5,4.5
2,1.0,59,50,5.0,5.0
3,1.0,59,111,2.5,2.5
4,1.0,59,150,4.0,4.0


In [27]:
#Group the weighted ratings by movieId and, per movie, add up the similarity indexes and
#the weighted ratings of the users who rated it
tempTopUsersRating = (
    topUsersRating
    .groupby('movieId')[['similarityIndex', 'weightedRating']]
    .sum()
    .rename(columns={'similarityIndex': 'sum_similarityIndex',
                     'weightedRating': 'sum_weightedRating'})
)
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,36.721208,120.227544
2,22.416929,70.126219
3,9.8133,21.846258
4,2.140273,4.234435
5,10.76654,26.189378


In [28]:
#Weighted average per movie: summed weighted ratings divided by summed similarity indexes.
#The result stays indexed by movieId; the id is also copied into a regular column.
recommendation_df = (
    tempTopUsersRating['sum_weightedRating']
    .div(tempTopUsersRating['sum_similarityIndex'])
    .to_frame(name='weighted average recommendation score')
)
recommendation_df['movieId'] = recommendation_df.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.274063,1
2,3.128271,2
3,2.226189,3
4,1.978455,4
5,2.432479,5


In [29]:
#Best-scoring movies first, then show the ten highest
recommendation_df = recommendation_df.sort_values(
    'weighted average recommendation score',
    ascending=False,
)
recommendation_df.iloc[:10]

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
3837,5.0,3837
2475,5.0,2475
2131,5.0,2131
3807,5.0,3807
2177,5.0,2177
3840,5.0,3840
3851,5.0,3851
46,5.0,46
49,5.0,49
341,5.0,341


In [30]:
#Look up title and year of the ten top-scoring movieIds (rows come back in the movie table's own order)
movie[movie['movieId'].isin(recommendation_df['movieId'].head(10))]

Unnamed: 0,movieId,title,year
45,46,How to Make an American Quilt,1995
48,49,When Night Is Falling,1995
337,341,Double Happiness,1994
2047,2131,Autumn Sonata (Höstsonaten),1978
2093,2177,Family Plot,1976
2390,2475,52 Pick-Up,1986
3716,3807,Sinbad and the Eye of the Tiger,1977
3745,3837,Phantasm II,1988
3748,3840,Pumpkinhead,1988
3758,3851,I'm the One That I Want,2000
