In [16]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [17]:
moviesdf = pd.read_csv('movies.csv')
ratingsdf = pd.read_csv('ratings.csv')
moviesdf.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [18]:
# removing the year from the title column and placing it in its own year column using extract
moviesdf['year'] = moviesdf.title.str.extract('(\(\d\d\d\d\))',expand = False)
moviesdf['year'] = moviesdf.year.str.extract('(\d\d\d\d)',expand = False)
moviesdf['title'] = moviesdf.title.str.replace('(\(\d\d\d\d\))', '')
moviesdf['title'] = moviesdf['title'].apply(lambda x: x.strip())
moviesdf.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [19]:
moviesdf = moviesdf.drop('genres', 1)
moviesdf.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [20]:
ratingsdf.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [21]:
ratingsdf = ratingsdf.drop('timestamp', 1)
ratingsdf.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


### Collaborative Filtering

Also called User-User Filtering which uses other users to recommend items to the input user. It finds users that have similar preferences and opinions as the input and then recommends items that they have liked  to the input. Persona correlation is one of the methods that can be used to find similar users. The process is as follows :


- Select a user with the movies the user has watched
- Based on his rating to movies, find the top X neighbors
- Get the watched movie record of the user for each neighbor
- Calculate the similarity score using some method
- Recommend the items with the highest score

In [22]:
userInput = [
            {'title':'Breakfast Club, The', 'rating':5},
            {'title':'Toy Story', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5}
         ] 
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [23]:
inputId = moviesdf[moviesdf['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies.drop('year', 1)
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [24]:
# filtering out users that have watched movies that the input has watched and storing it
userSubset = ratingsdf[ratingsdf['movieId'].isin(inputMovies['movieId'].tolist())]
userSubset.head()

Unnamed: 0,userId,movieId,rating
19,4,296,4.0
441,12,1968,3.0
479,13,2,2.0
531,13,1274,5.0
681,14,296,2.0


In [25]:
userSubsetGroup = userSubset.groupby(['userId'])
userSubsetGroup.get_group(1130)

Unnamed: 0,userId,movieId,rating
104167,1130,1,0.5
104168,1130,2,4.0
104214,1130,296,4.0
104363,1130,1274,4.5
104443,1130,1968,4.5


In [26]:
# sorting these groups so that users that share the most movies in common with the input have higher priority which provides richer recommendations
userSubsetGroup = sorted(userSubsetGroup, key = lambda x:  len(x[1]), reverse = True)
userSubsetGroup[0:3]

[(75,
        userId  movieId  rating
  7507      75        1     5.0
  7508      75        2     3.5
  7540      75      296     5.0
  7633      75     1274     4.5
  7673      75     1968     5.0),
 (106,
        userId  movieId  rating
  9083     106        1     2.5
  9084     106        2     3.0
  9115     106      296     3.5
  9198     106     1274     3.0
  9238     106     1968     3.5),
 (686,
         userId  movieId  rating
  61336     686        1     4.0
  61337     686        2     3.0
  61377     686      296     4.0
  61478     686     1274     4.0
  61569     686     1968     5.0)]

### Similarity of users

How similar each user is to the input can be found using the Pearson Correlation Coefficient which is used to measure the strength of a linear association between two variables. The formula for finding this coefficient between sets X and Y with N values can be given as:


![alt text](https://wikimedia.org/api/rest_v1/media/math/render/svg/bd1ccc2979b0fd1c1aec96e386f686ae874f9ec0 "Pearson Correlation")


Pearson Correlation is invariant to scaling i.e. multiplying all elements by a nonzero constant or adding any constant to all elements. For example, if there are two vectors X and Y, then pearson(X, Y) == pearson(X, 2 * Y + 3). This is important for recommendation systems because for example two users might rate two series of items totally different in terms of absolute rates, but they would be similar users (i.e with simialar ideas) with similar rates in various scales.


The values given by the formula vary from r = -1 to  r = 1, where 1 forms a direct correlation between the two entities(perfect positive correlation) and -1 forms a perfect negative correlation. 1 means that the two users have similar tastes while a -1 means the opposite.

In [27]:
userSubsetGroup = userSubsetGroup[0:100]

# store the pearson correlation in a dictionary where the key is the user ID and the value is the coefficient
pearsonDict = {}

# for every user group in our subset
for name, group in userSubsetGroup:
    group = group.sort_values(by = 'movieId')
    # N for the formula
    nRatings = len(group)
    # get review scores for the  movies that they both have in common
    tempdf = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    # store the review scores in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = tempdf['rating'].tolist()
    tempGroupList = group['rating'].tolist()
    # calculating the pearson correlation between two users
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList), 2) / float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList), 2) / float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList) * sum(tempGroupList) / float(nRatings)
    
    # if the denominator is different than zero, then divide else  0 correlation
    if Sxx != 0 and Syy != 0:
        pearsonDict[name] = Sxy / sqrt(Sxx * Syy)
    else:
        pearsonDict[name] = 0
        
pearsonDict.items()

dict_items([(75, 0.8272781516947562), (106, 0.5860090386731182), (686, 0.8320502943378437), (815, 0.5765566601970551), (1040, 0.9434563530497265), (1130, 0.2891574659831201), (1502, 0.8770580193070299), (1599, 0.4385290096535153), (1625, 0.716114874039432), (1950, 0.179028718509858), (2065, 0.4385290096535153), (2128, 0.5860090386731196), (2432, 0.1386750490563073), (2791, 0.8770580193070299), (2839, 0.8204126541423674), (2948, -0.11720180773462392), (3025, 0.45124262819713973), (3040, 0.89514359254929), (3186, 0.6784622064861935), (3271, 0.26989594817970664), (3429, 0.0), (3734, -0.15041420939904673), (4099, 0.05860090386731196), (4208, 0.29417420270727607), (4282, -0.4385290096535115), (4292, 0.6564386345361464), (4415, -0.11183835382312353), (4586, -0.9024852563942795), (4725, -0.08006407690254357), (4818, 0.4885967564883424), (5104, 0.7674257668936507), (5165, -0.4385290096535153), (5547, 0.17200522903844556), (6082, -0.04728779924109591), (6207, 0.9615384615384616), (6366, 0.65779

In [28]:
pearsonDf = pd.DataFrame.from_dict(pearsonDict, orient = 'index')
pearsonDf.columns = ['similarityIndex']
pearsonDf['userId'] = pearsonDf.index
pearsonDf.index = range(len(pearsonDf))
pearsonDf.head()

Unnamed: 0,similarityIndex,userId
0,0.827278,75
1,0.586009,106
2,0.83205,686
3,0.576557,815
4,0.943456,1040


In [29]:
# top 50 users that are most similar to the input
topUsers = pearsonDf.sort_values(by = 'similarityIndex', ascending = False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
64,0.961678,12325
34,0.961538,6207
55,0.961538,10707
67,0.960769,13053
4,0.943456,1040


Movies will be recommended by taking the weighted average of the ratings of the movies using Pearson Correlation as the weight. First we get the movies watched by the users in the pearsonDf from the ratings dataframe and then store their correlation in a new column called _similarityIndex achieved by merging the two tables

In [30]:
topUserRating = topUsers.merge(ratingsdf, left_on = 'userId', right_on = 'userId', how = 'inner')
topUserRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,0.961678,12325,1,3.5
1,0.961678,12325,2,1.5
2,0.961678,12325,3,3.0
3,0.961678,12325,5,0.5
4,0.961678,12325,6,2.5


In [31]:
# multiply the movie rating by its weight(similarity index) then sum up the new ratings and divide it by the sum of the weights
# multiply the similarity by the user's rating
topUserRating['weightedRating'] = topUserRating['similarityIndex'] * topUserRating['rating']
topUserRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,0.961678,12325,1,3.5,3.365874
1,0.961678,12325,2,1.5,1.442517
2,0.961678,12325,3,3.0,2.885035
3,0.961678,12325,5,0.5,0.480839
4,0.961678,12325,6,2.5,2.404196


In [32]:
# applies a sum to the topUsers after grouping it by user ID
tempTopUserRating = topUserRating.groupby('movieId').sum()[['similarityIndex', 'weightedRating']]
tempTopUserRating.columns = ['sumSimilarityIndex', 'sumWeightedRating']
tempTopUserRating.head()

Unnamed: 0_level_0,sumSimilarityIndex,sumWeightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,38.376281,140.800834
2,38.376281,96.656745
3,10.253981,27.254477
4,0.929294,2.787882
5,11.723262,27.151751


In [33]:
# creates an empty dataframe
recommendationDf = pd.DataFrame()
# taking the weighted average
recommendationDf['weighted average recommendation score'] = tempTopUserRating['sumWeightedRating'] / tempTopUserRating['sumSimilarityIndex']
recommendationDf['movieId'] = tempTopUserRating.index
recommendationDf.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.668955,1
2,2.518658,2
3,2.657941,3
4,3.0,4
5,2.316058,5


In [34]:
# top 20 movies recommended by the algorithm
recommendationDf  =recommendationDf.sort_values(by = 'weighted average recommendation score', ascending = False)
recommendationDf.head(20)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
5073,5.0,5073
3329,5.0,3329
2284,5.0,2284
26801,5.0,26801
6776,5.0,6776
6672,5.0,6672
3759,5.0,3759
3769,5.0,3769
3775,5.0,3775
90531,5.0,90531


In [36]:
moviesdf.loc[moviesdf['movieId'].isin(recommendationDf.head(20)['movieId'].tolist())]

Unnamed: 0,movieId,title,year
97,99,Heidi Fleiss: Hollywood Madam,1995
119,121,"Boys of St. Vincent, The",1992
2200,2284,Bandit Queen,1994
3243,3329,"Year My Voice Broke, The",1987
3449,3539,"Filth and the Fury, The",2000
3669,3759,Fun and Fancy Free,1947
3679,3769,Thunderbolt and Lightfoot,1974
3685,3775,Make Mine Music,1946
3686,3776,Melody Time,1948
3759,3851,I'm the One That I Want,2000
