In [1]:
import pandas as pd
from math import sqrt
import numpy as np

In [2]:
#Storing the movie information into a pandas dataframe
movies_df = pd.read_csv('movies.csv')
#Storing the user information into a pandas dataframe
ratings_df = pd.read_csv('ratings.csv')

In [3]:
#Using regular expressions to find a year stored between parentheses
#We specify the parantheses so we don't conflict with movies that have years in their titles
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))',expand=False)
#Removing the parentheses
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)',expand=False)
#Removing the years from the 'title' column
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')
#Applying the strip function to get rid of any ending whitespace characters that may have appeared
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())

In [4]:
movies_df = movies_df.drop('genres', 1)

In [5]:
ratings_df = ratings_df.drop('timestamp', 1)

In [7]:
inputMovies = ratings_df[ratings_df["userId"] == 1988]

In [11]:
inputMovies = inputMovies.merge(movies_df, how='inner')[['movieId', 'title', 'rating']]

#### The users who has seen the same movies

Now with the movie ID's in our input, we can now get the subset of users that have watched and reviewed the movies in our input.


In [12]:
#Filtering out users that have watched movies that the input has watched and storing it
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
userSubset.head()

Unnamed: 0,userId,movieId,rating
38,4,778,4.0
69,4,1393,2.0
238,7,260,4.0
295,10,832,3.0
318,11,3,1.5


In [14]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

In [17]:
#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)

In [23]:
userSubsetGroup = userSubsetGroup[1:100]

#### Similarity of users to input user

In [24]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:
    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    #Get the N for the formula
    nRatings = len(group)
    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    #Now let's calculate the pearson correlation between two users, so called, x and y
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0


In [26]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,-0.316057,8583
1,0.195269,22652
2,-0.045273,46750
3,-0.037708,74348
4,0.089351,77231


In [27]:
# top 50 users similar to the input user

topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
16,0.456252,186831
15,0.338928,154083
80,0.299282,117638
70,0.270888,56395
74,0.241136,88682


In [28]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,0.456252,186831,1,5.0
1,0.456252,186831,3,1.0
2,0.456252,186831,5,3.0
3,0.456252,186831,6,1.0
4,0.456252,186831,7,3.0


In [29]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,0.456252,186831,1,5.0,2.281261
1,0.456252,186831,3,1.0,0.456252
2,0.456252,186831,5,3.0,1.368757
3,0.456252,186831,6,1.0,0.456252
4,0.456252,186831,7,3.0,1.368757


In [30]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,5.898746,23.886716
2,3.412892,9.633242
3,5.856321,16.36864
4,2.194534,5.142915
5,5.771207,14.276938


In [31]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.049457,1
2,2.822604,2
3,2.795038,3
4,2.343511,4
5,2.473822,5


In [32]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(20)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
43652,5.162333,43652
129401,5.129867,129401
95654,5.129867,95654
81156,5.129867,81156
77154,5.129867,77154
100617,5.129867,100617
124273,5.129867,124273
130518,5.129867,130518
95311,5.112549,95311
103141,5.0974,103141


In [36]:
movies_df.loc[movies_df['movieId'].isin(recommendation_df.head(20)['movieId'].tolist())]

Unnamed: 0,movieId,title,year
802,816,Two Deaths,1995
7837,8478,"Distant Voices, Still Lives",1988
8515,25960,"Terrible Kids (Enfants terribles, Les) (Strang...",1950
10372,36363,Kin-Dza-Dza!,1986
10831,43652,"Comedians of Comedy, The",2005
13981,69957,Sink or Swim,1990
14509,72583,Brightness (Yeelen),1987
14533,72647,Zorn's Lemma,1970
14949,74727,Gentlemen of Fortune (Dzhentlmeny udachi),1972
15153,77154,Waking Sleeping Beauty,2009
