In [1]:
# For downloading the dataset required for this program
# Link for downloading the dataset for this program: https://grouplens.org/datasets/movielens/25m/ 

In [2]:
# Importing required libraries
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the movie catalogue and the user ratings into DataFrames
movies_df, ratings_df = (pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv'))
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Separate the release year from the title column and place it in a new 'year' column.
# Raw strings keep the regex escapes (\( and \d) from being read as string escapes.
movies_df['year'] = movies_df.title.str.extract(r'(\(\d\d\d\d\))', expand=False)
movies_df['year'] = movies_df.year.str.extract(r'(\d\d\d\d)', expand=False)
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal string by default,
# which would silently leave the "(YYYY)" suffix in every title.
movies_df['title'] = movies_df.title.str.replace(r'(\(\d\d\d\d\))', '', regex=True)

# Remove any leading/trailing whitespace left behind once the year is stripped
movies_df['title'] = movies_df['title'].str.strip()

# Convert the pipe-delimited genres column into a list of genres
movies_df['genres'] = movies_df.genres.str.split('|')
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [5]:
# Work on a copy so movies_df keeps its list-valued genres column unchanged
moviesWithGenres_df = movies_df.copy()

# Create one indicator column per genre and set it to 1 for every movie in that genre
for index, genres in movies_df['genres'].items():
    for genre in genres:
        moviesWithGenres_df.at[index, genre] = 1

# Fill NaN with 0 only in the genre indicator columns, so missing values in the
# original columns (e.g. a title that had no year) are not overwritten with 0.
genreColumns = [col for col in moviesWithGenres_df.columns if col not in movies_df.columns]
moviesWithGenres_df[genreColumns] = moviesWithGenres_df[genreColumns].fillna(0)
moviesWithGenres_df.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Preview the first rows of the ratings data
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [7]:
# Remove the timestamp column since it is not required.
# The columns= keyword replaces the positional axis argument, which pandas 2.0 removed.
ratings_df = ratings_df.drop(columns='timestamp')
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


Now applying a collaborative filtering recommendation system

In [8]:
# Titles must match the dataset exactly (including capitalisation) for the lookup to succeed.
ratedTitles = [
    ('Breakfast Club, The', 5),
    ('Toy Story', 3.5),
    ('Jumanji', 2),
    ("Pulp Fiction", 5),
    ('Akira', 4.5),
]
# One {'title', 'rating'} record per movie the input user has rated
userInput = [{'title': title, 'rating': rating} for title, rating in ratedTitles]
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,rating,title
0,5.0,"Breakfast Club, The"
1,3.5,Toy Story
2,2.0,Jumanji
3,5.0,Pulp Fiction
4,4.5,Akira


In [9]:
# Look up the catalogue rows for every title the user rated
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]

# Join on the title, selecting only the needed columns on each side before merging.
# This avoids the positional-axis drop(..., 1) calls (removed in pandas 2.0) and keeps
# the cell re-runnable, since inputMovies always ends up as movieId/title/rating.
inputMovies = pd.merge(
    inputId[['movieId', 'title']],
    inputMovies[['title', 'rating']],
    on='title',
)

# NOTE: titles are not unique in the catalogue, so one input title can map to
# several movieIds; every match receives the user's rating for that title.
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,164600,Akira,4.5
5,1968,"Breakfast Club, The",5.0


In [10]:
# Keep only the ratings given to movies that the input user has also rated
ratedMovieIds = inputMovies['movieId'].tolist()
userSubset = ratings_df[ratings_df['movieId'].isin(ratedMovieIds)]
userSubset.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
70,2,1,3.5
141,2,1968,1.0
254,3,1,4.0
264,3,296,5.0


In [12]:
# Group the matching ratings by user.
# Grouping by the scalar key (not a one-element list) keeps each group name a plain
# userId; with a list key, newer pandas yields 1-tuples as names and expects a tuple
# in get_group, which would break the lookup below and the userId keys used later.
userSubsetGroup = userSubset.groupby('userId')
userSubsetGroup.get_group(130)

Unnamed: 0,userId,movieId,rating
16412,130,1,3.0


In [14]:
# Order the (userId, ratings) pairs so users sharing the most movies with the input come first
userSubsetGroup = list(userSubsetGroup)
userSubsetGroup.sort(key=lambda pair: len(pair[1]), reverse=True)
userSubsetGroup[0:5]

[(43,       userId  movieId  rating
  5858      43        1     4.0
  5859      43        2     3.5
  5884      43      296     5.0
  5974      43     1274     4.5
  6018      43     1968     4.5), (171,        userId  movieId  rating
  20794     171        1     4.5
  20795     171        2     4.0
  20833     171      296     5.0
  20973     171     1274     4.0
  21047     171     1968     4.0), (440,        userId  movieId  rating
  54640     440        1     3.5
  54641     440        2     2.0
  54669     440      296     5.0
  54756     440     1274     5.0
  54798     440     1968     4.5), (597,        userId  movieId  rating
  77094     597        1     4.0
  77095     597        2     3.0
  77145     597      296     3.0
  77280     597     1274     4.0
  77370     597     1968     5.0), (695,        userId  movieId  rating
  90765     695        1     4.0
  90766     695        2     3.0
  90866     695      296     4.0
  91100     695     1274     4.0
  91266     695     1

In [15]:
# Keep just the first 100 users (those with the most overlap) to avoid processing unnecessary data
userSubsetGroup = userSubsetGroup[:100]

In [16]:
# Maps each candidate userId to its Pearson correlation with the input user
pearsonCorrelationDict = {}

# The input user's ratings; selected once here instead of re-sorting inside the loop
inputRatings = inputMovies[['movieId', 'rating']]

# For every user group in our subset
for name, group in userSubsetGroup:
    # Newer pandas yields 1-tuples as group names when the groupby key was a list
    userId = name[0] if isinstance(name, tuple) else name

    # Pair both users' ratings by movieId so x[i] and y[i] always refer to the same movie
    paired = group.merge(inputRatings, on='movieId', suffixes=('_user', '_input'))
    x = paired['rating_input'].tolist()
    y = paired['rating_user'].tolist()
    nRatings = len(x)

    if nRatings == 0:
        # No movies in common: correlation is undefined, treat as 0
        pearsonCorrelationDict[userId] = 0
        continue

    # Mean-centred sums of squares/products; Sxx and Syy are sums of squares,
    # so they can never be negative and sqrt below is always well-defined.
    meanX = sum(x) / float(nRatings)
    meanY = sum(y) / float(nRatings)
    Sxx = sum((i - meanX) ** 2 for i in x)
    Syy = sum((j - meanY) ** 2 for j in y)
    Sxy = sum((i - meanX) * (j - meanY) for i, j in zip(x, y))

    # If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[userId] = Sxy / sqrt(Sxx * Syy)
    else:
        pearsonCorrelationDict[userId] = 0

# One row per user, indexed by userId, with the coefficient in a single column
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.head()

Unnamed: 0,0
43,0.946029
171,0.328897
440,0.961538
597,0.468807
695,0.877058


In [17]:
# Name the coefficient column, copy the userId out of the index into its own
# column, and replace the index with a plain 0..n-1 range
pearsonDF.columns = ['similarityIndex']
pearsonDF = pearsonDF.assign(userId=pearsonDF.index).reset_index(drop=True)
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.946029,43
1,0.328897,171
2,0.961538,440
3,0.468807,597
4,0.877058,695


In [18]:
# Keep the 50 users with the highest similarity to the input user
topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False).head(50)
topUsers.head()

Unnamed: 0,similarityIndex,userId
99,0.983092,11495
64,0.970725,7723
55,0.962435,6550
94,0.961678,10960
63,0.961678,7571


In [19]:
# Attach every rating made by the selected users to their similarity score
topUsersRating = pd.merge(topUsers, ratings_df, on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,0.983092,11495,1,3.0
1,0.983092,11495,2,1.0
2,0.983092,11495,6,4.5
3,0.983092,11495,10,4.5
4,0.983092,11495,16,4.5


In [20]:
# Weight each rating by how similar the rating user is to the input user
similarity = topUsersRating['similarityIndex']
topUsersRating['weightedRating'] = similarity * topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,0.983092,11495,1,3.0,2.949275
1,0.983092,11495,2,1.0,0.983092
2,0.983092,11495,6,4.5,4.423913
3,0.983092,11495,10,4.5,4.423913
4,0.983092,11495,16,4.5,4.423913


In [21]:
# Per movie, total the similarity scores and the weighted ratings of the users who rated it
summedColumns = ['similarityIndex', 'weightedRating']
tempTopUsersRating = topUsersRating.groupby('movieId')[summedColumns].sum()
tempTopUsersRating.columns = ['sum_similarityIndex', 'sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,39.973168,152.887748
2,39.973168,98.535608
3,12.129644,31.955678
4,2.447945,5.662576
5,11.35565,23.548374


In [22]:
# Weighted average score per movie: summed weighted ratings divided by summed similarity
weightedAverage = tempTopUsersRating['sum_weightedRating'] / tempTopUsersRating['sum_similarityIndex']
recommendation_df = weightedAverage.to_frame('weighted average score')
# Expose the movieId (currently the index) as a regular column as well
recommendation_df['movieId'] = recommendation_df.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.824759,1
2,2.465044,2
3,2.634511,3
4,2.313196,4
5,2.073714,5


In [23]:
# Rank movies from the highest to the lowest weighted average score
recommendation_df = recommendation_df.sort_values('weighted average score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
70927,5.0,70927
111235,5.0,111235
2931,5.0,2931
6660,5.0,6660
1169,5.0,1169
299,5.0,299
96606,5.0,96606
6583,5.0,6583
320,5.0,320
84952,5.0,84952


Required Recommendation Table

In [24]:
# Show the catalogue entries for the ten highest-scoring movies (listed in catalogue order)
topMovieIds = recommendation_df['movieId'].head(10).tolist()
movies_df[movies_df['movieId'].isin(topMovieIds)]

Unnamed: 0,movieId,title,genres,year
295,299,Priest,[Drama],1994
316,320,Suture,"[Film-Noir, Thriller]",1993
1141,1169,American Dream,[Documentary],1990
2839,2931,Time of the Gypsies (Dom za vesanje),"[Comedy, Crime, Drama, Fantasy]",1989
6460,6583,"Blood of Heroes, The (Salute of the Jugger, The)","[Action, Sci-Fi]",1989
6537,6660,"Red Shoes, The","[Drama, Fantasy, Musical, Romance]",1948
13693,70927,To Each His Own Cinema (Chacun son cinéma ou C...,"[Comedy, Drama]",2007
16097,84952,Confessions (Kokuhaku),"[Drama, Horror]",2010
18452,96606,Samsara,[Documentary],2011
21575,111235,Jodorowsky's Dune,"[Documentary, Sci-Fi]",2013
