In [13]:
import pandas as pd
from math import sqrt
import numpy as np

In [14]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
print(movies.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None


In [15]:
userInput = [{'title':'Toy Story (1995)', 'rating':4},
             {'title':'Misérables, Les (1995)', 'rating':5},
             {'title':'Pulp Fiction (1994)', 'rating':4},
             {'title':'Star Trek: Generations (1994)', 'rating':2},
             {'title':'Jurassic Park (1993)', 'rating':3}]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                           title  rating
0               Toy Story (1995)       4
1         Misérables, Les (1995)       5
2            Pulp Fiction (1994)       4
3  Star Trek: Generations (1994)       2
4           Jurassic Park (1993)       3


In [16]:
inputId = movies[movies['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies[['movieId','title','rating']]
print(inputMovies)

   movieId                          title  rating
0        1               Toy Story (1995)       4
1       73         Misérables, Les (1995)       5
2      296            Pulp Fiction (1994)       4
3      329  Star Trek: Generations (1994)       2
4      480           Jurassic Park (1993)       3


In [17]:
userSubset = ratings[ratings['movieId'].isin(inputMovies['movieId'].tolist())]
print(userSubset.groupby('movieId').count())

         userId  rating  timestamp
movieId                           
1           215     215        215
73           13      13         13
296         307     307        307
329         108     108        108
480         238     238        238


In [18]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])
    

#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[(288,        userId  movieId  rating   timestamp
42114     288        1     4.5  1054568869
42128     288       73     3.0   979163671
42150     288      296     5.0   978647227
42153     288      329     3.5  1054569431
42176     288      480     2.0   975691429), (304,        userId  movieId  rating  timestamp
45728     304        1     5.0  881428344
45742     304       73     4.0  920884193
45768     304      296     5.0  891173194
45772     304      329     4.0  914132642
45785     304      480     4.0  891173581), (476,        userId  movieId  rating  timestamp
75355     476        1     4.0  835021447
75364     476       73     4.0  835022035
75380     476      296     3.0  835021274
75386     476      329     3.0  835021384
75409     476      480     4.0  835021636), (599,        userId  movieId  rating   timestamp
92623     599        1     3.0  1498524204
92660     599       73     3.0  1519421396
92742     599      296     5.0  1498456867
92754     599      329     4.0  149

  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)


In [19]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
   
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
   
    
    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [20]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  userId
0         0.220416     288
1         0.320256     304
2         0.480384     476
3        -0.366900     599
4         0.318073     600


In [21]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

    similarityIndex  userId
79         1.000000     177
50         1.000000      63
48         1.000000      40
70         1.000000     141
60         0.981981     102


In [22]:
topUsersRating=topUsers.merge(ratings, left_on='userId', right_on='userId', how='inner')
print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating   timestamp
0               1.0     177        1     5.0  1435533535
1               1.0     177        2     3.5  1435534109
2               1.0     177        7     1.0  1435534432
3               1.0     177       11     3.0  1435890660
4               1.0     177       16     3.0  1435890664
..              ...     ...      ...     ...         ...
95              1.0     177      912     5.0  1435536836
96              1.0     177      914     4.0  1435534752
97              1.0     177      915     5.0  1435535580
98              1.0     177      916     5.0  1435535343
99              1.0     177      918     4.0  1435719333

[100 rows x 5 columns]


In [23]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating   timestamp  weightedRating
0              1.0     177        1     5.0  1435533535             5.0
1              1.0     177        2     3.5  1435534109             3.5
2              1.0     177        7     1.0  1435534432             1.0
3              1.0     177       11     3.0  1435890660             3.0
4              1.0     177       16     3.0  1435890664             3.0


In [24]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                  32.656926          132.858725
2                  14.980024           52.934357
3                   8.931688           32.119712
4                   0.870388            1.740777
5                   7.323978           22.585927


In [25]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                     4.068317        1
2                                     3.533663        2
3                                     3.596152        3
4                                     2.000000        4
5                                     3.083833        5
6                                     3.672136        6
7                                     2.625757        7
8                                     3.000000        8
9                                     3.000000        9
10                                    3.192804       10


In [26]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)

         weighted average recommendation score  movieId
movieId                                                
4021                                       5.0     4021
85                                         5.0       85
4055                                       5.0     4055
2314                                       5.0     2314
2648                                       5.0     2648
...                                        ...      ...
2534                                       0.5     2534
8136                                       0.5     8136
77427                                      0.5    77427
4051                                       0.5     4051
6827                                       0.5     6827

[5759 rows x 2 columns]


In [27]:
recommended_movie=movies.loc[movies['movieId'].isin(recommendation_df['movieId'])]

#we don't want to recommend the same movie
recommended_movie=recommended_movie.loc[~recommended_movie.movieId.isin(userSubset['movieId'])]

print(recommended_movie)

      movieId                                      title  \
1           2                             Jumanji (1995)   
2           3                    Grumpier Old Men (1995)   
3           4                   Waiting to Exhale (1995)   
4           5         Father of the Bride Part II (1995)   
5           6                                Heat (1995)   
...       ...                                        ...   
9692   184471                         Tomb Raider (2018)   
9695   184791  Fred Armisen: Standup for Drummers (2018)   
9709   187593                          Deadpool 2 (2018)   
9710   187595             Solo: A Star Wars Story (2018)   
9713   188301                Ant-Man and the Wasp (2018)   

                                      genres  
1                 Adventure|Children|Fantasy  
2                             Comedy|Romance  
3                       Comedy|Drama|Romance  
4                                     Comedy  
5                      Action|Crime|Thriller