# Collaborative Filtering of Movies

In [59]:
#Dataframe manipulation library
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [60]:
# Read the movie catalogue and the user ratings from CSV files in the working directory
movies_df, ratings_df = (pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv'))

In [32]:
# Peek at the first five rows of each raw table
print(movies_df.head(5))
print(ratings_df.head(5))

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating   timestamp
0       1      169     2.5  1204927694
1       1     2471     3.0  1204927438
2       1    48516     5.0  1204927435
3       2     2571     3.5  1436165433
4       2   109487     4.0  1436165496


In [33]:
# Move the release year out of the title and into its own 'year' column.
# Raw strings keep the regex escapes (\( and \d) from being parsed as invalid
# string escapes; the single capture group returns just the four digits of the
# first "(dddd)" occurrence, replacing the original two-pass extract.
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)
# regex=True is spelled out because the default of str.replace changed to a
# literal match in pandas 2.0, which would leave "(1995)" in every title.
# .str.strip() then removes the whitespace left behind where the year was.
movies_df['title'] = (
    movies_df['title']
    .str.replace(r'\(\d{4}\)', '', regex=True)
    .str.strip()
)

  movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')


In [34]:
# Check that the title is clean and the year landed in its own column
movies_df.head(5)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [35]:
# Genres are not used by the collaborative filter, so drop that column.
# `columns=` replaces the positional axis argument (`drop('genres', 1)`), which
# is deprecated and raises a TypeError from pandas 2.0 onwards.
movies_df = movies_df.drop(columns='genres')

In [36]:
# Confirm that only movieId, title and year remain
movies_df.head(5)

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [37]:
# Look at the first five rows of the ratings table
ratings_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [38]:
# The rating timestamp is not used below, so drop that column.
# `columns=` replaces the positional axis argument (`drop('timestamp', 1)`),
# which is deprecated and raises a TypeError from pandas 2.0 onwards.
ratings_df = ratings_df.drop(columns='timestamp')

In [39]:
# Confirm that only userId, movieId and rating remain
ratings_df.head(5)

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


In [41]:
# The input user's profile: a handful of movies with the ratings I would give them.
# Titles are written the way they appear in movies_df after the year is stripped.
my_favs = [
    {'title': fav_title, 'rating': fav_rating}
    for fav_title, fav_rating in (
        ('To Die For', 5),
        ('Juror, The', 4),
        ('Bridges of Madison County, The', 5),
        ("Birdcage, The", 5),
        ('Dolores Claiborne', 4.5),
    )
]
inputMovies = pd.DataFrame(my_favs)
inputMovies

Unnamed: 0,title,rating
0,To Die For,5.0
1,"Juror, The",4.0
2,"Bridges of Madison County, The",5.0
3,"Birdcage, The",5.0
4,Dolores Claiborne,4.5


In [42]:
# Look up the movieId of every input title in the movie catalogue.
inputId = movies_df[movies_df['title'].isin(inputMovies['title'])]
# With no explicit key, pd.merge joins on the columns the two frames share
# (the title here), attaching movieId and year to each input rating.
# NOTE(review): this presumes a title maps to a single movieId once the year
# is stripped - confirm there are no same-titled remakes among the inputs.
inputMovies = pd.merge(inputId, inputMovies)
# The year is not needed for the similarity computation.
# `columns=` replaces the positional axis argument (`drop('year', 1)`), which
# is deprecated and raises a TypeError from pandas 2.0 onwards.
inputMovies = inputMovies.drop(columns='year')

print(inputMovies)

   movieId                           title  rating
0       45                      To Die For     5.0
1       79                      Juror, The     4.0
2      105  Bridges of Madison County, The     5.0
3      141                   Birdcage, The     5.0
4      230               Dolores Claiborne     4.5


In [43]:
# Keep only the ratings that other users gave to the movies in the input profile
rated_input_movie = ratings_df['movieId'].isin(inputMovies['movieId'])
userSubset = ratings_df.loc[rated_input_movie]
userSubset.head()

Unnamed: 0,userId,movieId,rating
13,4,45,4.0
644,14,45,3.0
648,14,79,4.0
655,14,141,3.0
673,14,230,3.0


In [44]:
# Group the overlapping ratings by user.
# Grouping on the bare column name (instead of the one-element list ['userId'])
# keeps every group key a plain userId scalar. With a list, pandas >= 2.0 yields
# 1-tuples such as (2108,) when the groups are iterated, and those tuples would
# end up as the keys of the similarity table built from this grouping.
userSubsetGroup = userSubset.groupby('userId')
# Spot-check the overlapping ratings of one user
userSubsetGroup.get_group(1130)

Unnamed: 0,userId,movieId,rating
104204,1130,230,2.0


In [45]:
# Order the (userId, ratings) pairs so that users sharing the most movies
# with the input profile come first
userSubsetGroup = list(userSubsetGroup)
userSubsetGroup.sort(key=lambda user_and_ratings: len(user_and_ratings[1]), reverse=True)

In [46]:
# Show the three users with the largest overlap
userSubsetGroup[:3]

[(2108,
          userId  movieId  rating
  188765    2108       45     4.0
  188771    2108       79     3.0
  188772    2108      105     3.0
  188776    2108      141     2.5
  188784    2108      230     3.5),
 (2849,
          userId  movieId  rating
  260946    2849       45     3.0
  260957    2849       79     3.0
  260964    2849      105     4.0
  260969    2849      141     4.0
  260993    2849      230     3.0),
 (3410,
          userId  movieId  rating
  315307    3410       45     3.0
  315319    3410       79     3.0
  315329    3410      105     4.0
  315336    3410      141     5.0
  315377    3410      230     4.0)]

In [47]:
# Only the first 100 (highest-overlap) users are compared against the input user
userSubsetGroup = userSubsetGroup[:100]

In [50]:
# Pearson correlation between the input user's ratings and each candidate
# user's ratings over the movies they share, keyed by the group name.
pearsonCorrelationDict = {}

# Loop-invariant: order the input ratings by movieId once, so they line up
# position-by-position with each group after it is sorted the same way below.
inputMovies = inputMovies.sort_values(by='movieId')

for name, group in userSubsetGroup:
    group = group.sort_values(by='movieId')
    nRatings = len(group)
    # Input ratings restricted to the movies this user has also rated
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'])]
    tempRatingList = temp_df['rating'].tolist()
    tempGroupList = group['rating'].tolist()

    # Computational form of the sums of squares / cross-products about the mean
    sumX = sum(tempRatingList)
    sumY = sum(tempGroupList)
    Sxx = sum(i**2 for i in tempRatingList) - sumX**2 / nRatings
    Syy = sum(j**2 for j in tempGroupList) - sumY**2 / nRatings
    Sxy = sum(i*j for i, j in zip(tempRatingList, tempGroupList)) - sumX*sumY / nRatings

    # The correlation is undefined when either side has no variance (for example
    # a single shared movie); record 0 in that case. Testing `> 0` rather than
    # `!= 0` also keeps a rounding-induced negative value away from sqrt().
    if Sxx > 0 and Syy > 0:
        pearsonCorrelationDict[name] = Sxy / sqrt(Sxx * Syy)
    else:
        pearsonCorrelationDict[name] = 0


In [51]:
# Turn the {userId: correlation} mapping into a two-column table with a fresh 0..n-1 index
pearsonDF = (
    pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
    .set_axis(['similarityIndex'], axis=1)
    .assign(userId=lambda frame: frame.index)
    .reset_index(drop=True)
)
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.049029,2108
1,0.612372,2849
2,0.467707,3410
3,0.801784,4208
4,0.559017,4666


In [52]:
# Keep the 40 users whose ratings correlate most strongly with the input profile
topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False).head(40)
topUsers.head()

Unnamed: 0,similarityIndex,userId
69,0.918559,60838
90,0.918559,77231
65,0.912871,56395
17,0.912871,15466
45,0.875,35887


In [53]:
# Attach every rating made by the top users to their similarity score; the
# similarity is used as the weight of each rating in the following cells.
topUsersRating = pd.merge(topUsers, ratings_df, on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,0.918559,60838,2,3.0
1,0.918559,60838,3,3.0
2,0.918559,60838,4,3.0
3,0.918559,60838,5,3.0
4,0.918559,60838,6,4.0


In [54]:
# Weight each rating by the similarity of the user who gave it
topUsersRating['weightedRating'] = topUsersRating['similarityIndex'].mul(topUsersRating['rating'])
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,0.918559,60838,2,3.0,2.755676
1,0.918559,60838,3,3.0,2.755676
2,0.918559,60838,4,3.0,2.755676
3,0.918559,60838,5,3.0,2.755676
4,0.918559,60838,6,4.0,3.674235


In [55]:
# Per movie, total the similarity weights and the similarity-weighted ratings
tempTopUsersRating = (
    topUsersRating
    .groupby('movieId')[['similarityIndex', 'weightedRating']]
    .sum()
    .rename(columns={
        'similarityIndex': 'sum_similarityIndex',
        'weightedRating': 'sum_weightedRating',
    })
)
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,20.004013,80.487773
2,19.89923,57.355364
3,20.904688,58.800923
4,13.043575,33.840399
5,19.819072,53.323673


In [56]:
# Weighted average per movie: summed weighted ratings divided by summed weights.
# The movieId is kept both as the index and as a regular column.
recommendation_df = (
    tempTopUsersRating['sum_weightedRating']
    .div(tempTopUsersRating['sum_similarityIndex'])
    .to_frame('weighted average recommendation score')
    .assign(movieId=lambda frame: frame.index)
)
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.023581,1
2,2.882291,2
3,2.81281,3
4,2.594411,4
5,2.690523,5


In [57]:
# Rank the movies from highest to lowest score and show the ten best
recommendation_df = recommendation_df.sort_values(
    by='weighted average recommendation score',
    ascending=False,
)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
127021,5.0,127021
86061,5.0,86061
77154,5.0,77154
7074,5.0,7074
78235,5.0,78235
79311,5.0,79311
6453,5.0,6453
79333,5.0,79333
33779,5.0,33779
7056,5.0,7056


In [58]:
# Look up the catalogue rows (title and year) of the ten highest-scoring movies;
# the rows come back in movies_df order, not in score order.
movies_df[movies_df['movieId'].isin(recommendation_df['movieId'].head(10).tolist())]

Unnamed: 0,movieId,title,year
6344,6453,Man of Aran,1934
6945,7056,"Public Enemy, The",1931
6963,7074,"Navigator, The",1924
10173,33779,Eddie Izzard: Dress to Kill,1999
15153,77154,Waking Sleeping Beauty,2009
15375,78235,Betrayal,1983
15604,79311,Hamster Factor and Other Tales of Twelve Monke...,1997
15609,79333,Watch Out for the Automobile (Beregis avtomobi...,1966
17017,86061,"Question of Silence, A (De stilte rond Christi...",1982
27366,127021,Rewind This!,2013
