FoDS User-based Movie Recommender System

Priscilla Abigail Munthe (2602109883)

Dataset used: http://grouplens.org/datasets/movielens/ - MovieLens 25M Dataset

Steps:
1. Select a user with the movies the user has watched
2. Based on his rating to movies, find the top x neighbours
3. Get the watched movie record of the user for each neighbour.
4. Calculate a similarity score using some formula
5. Recommend the items with the highest score



In [77]:
import pandas as pd
import numpy as np
from math import sqrt

In [78]:
moviesDF = pd.read_csv('movies.csv')
ratingsDF = pd.read_csv('ratings.csv')
print(moviesDF.info())
print(ratingsDF.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17879053 entries, 0 to 17879052
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 545.6 MB
None


In [79]:
userInput = [{'title':'Divergent (2014)', 'rating':1},
             {'title':'Letters to Juliet (2010)', 'rating':5},
             {'title':'Parent Trap, The (1961)', 'rating':4},
             {'title':'13 Going on 30 (2004)', 'rating':4.5},
             {'title':'Split (2017)', 'rating':1}]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                      title  rating
0          Divergent (2014)     1.0
1  Letters to Juliet (2010)     5.0
2   Parent Trap, The (1961)     4.0
3     13 Going on 30 (2004)     4.5
4              Split (2017)     1.0


In [80]:
inputId = moviesDF[moviesDF['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies.drop('genres', 1)
inputMovies = inputMovies[['movieId','title','rating']]
print(inputMovies)

   movieId                     title  rating
0     1013   Parent Trap, The (1961)     4.0
1     7444     13 Going on 30 (2004)     4.5
2    78316  Letters to Juliet (2010)     5.0
3   108190          Divergent (2014)     1.0
4   166534              Split (2017)     1.0


  inputMovies = inputMovies.drop('genres', 1)


In [81]:
userSubset = ratingsDF[ratingsDF['movieId'].isin(inputMovies['movieId'].tolist())]
print(userSubset.groupby('movieId').count())

         userId  rating  timestamp
movieId                           
1013       3096    3096       3096
7444       3741    3741       3741
78316       750     750        750
108190     2803    2803       2803
166534     1992    1992       1992


In [82]:
userSubsetGroup = userSubset.groupby(['userId'])

def take_5_elem(x):
  return len(x[1])

userSubsetGroup = sorted(userSubsetGroup, key= take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

  userSubsetGroup = sorted(userSubsetGroup, key= take_5_elem, reverse=True)


[(15560,          userId  movieId  rating   timestamp
2336579   15560     1013     3.0  1328039036
2337191   15560     7444     3.0  1328036052
2337443   15560    78316     1.5  1329173120
2337570   15560   108190     3.5  1427655833
2337648   15560   166534     3.5  1485733899), (53015,          userId  movieId  rating   timestamp
8146191   53015     1013     3.0  1208735685
8146626   53015     7444     3.5  1205205619
8147174   53015    78316     3.0  1284584578
8147466   53015   108190     2.5  1411855903
8147692   53015   166534     2.0  1550959877), (57548,          userId  movieId  rating   timestamp
8824325   57548     1013     0.5  1480865175
8826466   57548     7444     1.0  1480694310
8827424   57548    78316     0.5  1482956655
8827956   57548   108190     0.5  1452415184
8828589   57548   166534     0.5  1487843584), (72315,           userId  movieId  rating   timestamp
11119932   72315     1013     3.5  1566577446
11124661   72315     7444     3.0  1535602352
11130276   72

In [83]:
pearsonCorrelationDict = {}

for name, group in userSubsetGroup:

  group = group.sort_values(by='movieId')
  inputMovies = inputMovies.sort_values(by='movieId')

  nRatings = len(group)

  tempDF = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]

  tempRatings = tempDF['rating'].tolist()

  tempGroups = group['rating'].tolist()

  Sxx = sum([i**2 for i in tempRatings]) - pow(sum(tempRatings),2)/float(nRatings)
  Syy = sum([i**2 for i in tempGroups]) - pow(sum(tempGroups),2)/float(nRatings)
  Sxy = sum( i*j for i, j in zip(tempRatings, tempGroups)) - sum(tempRatings)*sum(tempGroups)/float(nRatings)

  if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
  else:
        pearsonCorrelationDict[name] = 0

In [85]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  userId
0        -0.772683   15560
1         0.866098   53015
2         0.401478   57548
3         0.258093   72315
4        -0.181369   89464


In [86]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

    similarityIndex  userId
67         0.994135   79381
43         0.992207   44364
48         0.973852   49000
60         0.908739   74963
49         0.877220   52123


In [87]:
topUsersRating=topUsers.merge(ratingsDF, left_on='userId', right_on='userId', how='inner')
print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating   timestamp
0          0.994135   79381        1     4.0  1371334767
1          0.994135   79381        5     3.0  1442179733
2          0.994135   79381        6     2.0  1371564031
3          0.994135   79381        7     2.5  1442611730
4          0.994135   79381       11     4.0  1371395905
..              ...     ...      ...     ...         ...
95         0.994135   79381      377     4.0  1371251848
96         0.994135   79381      380     3.5  1371917498
97         0.994135   79381      437     2.0  1442178067
98         0.994135   79381      440     4.5  1371396654
99         0.994135   79381      442     3.0  1392498075

[100 rows x 5 columns]


In [88]:
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating   timestamp  weightedRating
0         0.994135   79381        1     4.0  1371334767        3.976539
1         0.994135   79381        5     3.0  1442179733        2.982405
2         0.994135   79381        6     2.0  1371564031        1.988270
3         0.994135   79381        7     2.5  1442611730        2.485337
4         0.994135   79381       11     4.0  1371395905        3.976539


In [89]:
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                  16.745271           63.312993
2                  13.908934           49.300677
3                   4.057297           11.954743
4                   3.465587            7.621289
5                   9.498582           30.592048


In [90]:
recommendation_df = pd.DataFrame()

recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                     3.780948        1
2                                     3.544533        2
3                                     2.946480        3
4                                     2.199134        4
5                                     3.220696        5
6                                     2.955963        6
7                                     3.244352        7
8                                     2.515735        8
9                                     1.126591        9
10                                    2.634261       10


In [91]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)

         weighted average recommendation score  movieId
movieId                                                
203513                               21.553128   203513
41889                                 6.110354    41889
70670                                 5.692387    70670
8153                                  5.692387     8153
6427                                  5.692387     6427
...                                        ...      ...
208096                                     NaN   208096
208172                                     NaN   208172
208385                                     NaN   208385
208747                                     NaN   208747
208939                                     NaN   208939

[35441 rows x 2 columns]


In [92]:
recommended_movie=moviesDF.loc[moviesDF['movieId'].isin(recommendation_df['movieId'])]

recommended_movie=recommended_movie.loc[~recommended_movie.movieId.isin(userSubset['movieId'])]

print(recommended_movie)

       movieId                               title  \
0            1                    Toy Story (1995)   
1            2                      Jumanji (1995)   
2            3             Grumpier Old Men (1995)   
3            4            Waiting to Exhale (1995)   
4            5  Father of the Bride Part II (1995)   
...        ...                                 ...   
62226   208385          Holiday in the Wild (2019)   
62313   208715                  Let It Snow (2019)   
62323   208747                The Good Liar (2019)   
62346   208813                       Noelle (2019)   
62374   208939                        Klaus (2019)   

                                            genres  
0      Adventure|Animation|Children|Comedy|Fantasy  
1                       Adventure|Children|Fantasy  
2                                   Comedy|Romance  
3                             Comedy|Drama|Romance  
4                                           Comedy  
...                              