In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the two input tables from CSV files given by relative path.
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [41]:
# Show only the first row of `movies` to inspect its columns.
movies.head(1)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [3]:
# Show the first rows of `ratings` to inspect its columns.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# (rows, columns) of each table, one per line.
print(movies.shape, ratings.shape, sep='\n')

(9742, 3)
(100836, 4)


In [5]:
# Missing values per column of `movies`.
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [6]:
# Distinct userId values, in order of first appearance in `ratings`.
unique_users = pd.unique(ratings['userId'])


In [7]:
# One-column frame holding every distinct userId.
users = pd.DataFrame({'userId': unique_users})


In [8]:
# Display the `users` frame built from the distinct userIds in `ratings`.
users

Unnamed: 0,userId
0,1
1,2
2,3
3,4
4,5
...,...
605,606
606,607
607,608
608,609


In [9]:
# Missing values per column of `users`.
users.isna().sum()

userId    0
dtype: int64

In [10]:
# Missing values per column of `ratings`.
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [11]:
# Number of rows in `movies` that repeat an earlier row across all columns.
movies.duplicated().sum()

0

In [12]:
# Number of rows in `ratings` that repeat an earlier row across all columns.
ratings.duplicated().sum()

0

In [13]:
# Number of rows in `users` that repeat an earlier row.
users.duplicated().sum()

0

## Popularity-Based Recommender System

In [14]:
# Attach title/genres to every rating row (inner join on movieId, the merge default).
ratings_with_name = pd.merge(ratings, movies, on='movieId')

In [15]:
# Display the merged ratings + movie metadata frame.
ratings_with_name

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller


In [16]:
# Number of ratings recorded for each title.
num_rating_df = (
    ratings_with_name.groupby('title')['rating']
    .count()
    .reset_index()
    .rename(columns={'rating': 'num_ratings'})
)
num_rating_df

Unnamed: 0,title,num_ratings
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2
...,...,...
9714,eXistenZ (1999),22
9715,xXx (2002),24
9716,xXx: State of the Union (2005),5
9717,¡Three Amigos! (1986),26


In [17]:
# Mean rating for each title.
# Select `rating` BEFORE aggregating: calling .mean() on the whole grouped
# frame also tries to average the string columns (e.g. genres), which emits
# a FutureWarning on pandas 1.x and raises TypeError on pandas >= 2.0.
avg_rating_df = (
    ratings_with_name.groupby('title')['rating']
    .mean()
    .reset_index()
    .rename(columns={'rating': 'avg_rating'})
)
avg_rating_df

  avg_rating_df = ratings_with_name.groupby('title').mean()['rating'].reset_index()


Unnamed: 0,title,avg_rating
0,'71 (2014),4.000000
1,'Hellboy': The Seeds of Creation (2004),4.000000
2,'Round Midnight (1986),3.500000
3,'Salem's Lot (2004),5.000000
4,'Til There Was You (1997),4.000000
...,...,...
9714,eXistenZ (1999),3.863636
9715,xXx (2002),2.770833
9716,xXx: State of the Union (2005),2.000000
9717,¡Three Amigos! (1986),3.134615


In [18]:
# Combine the per-title rating count and mean rating into one frame.
popular_df = pd.merge(num_rating_df, avg_rating_df, on='title')
popular_df

Unnamed: 0,title,num_ratings,avg_rating
0,'71 (2014),1,4.000000
1,'Hellboy': The Seeds of Creation (2004),1,4.000000
2,'Round Midnight (1986),2,3.500000
3,'Salem's Lot (2004),1,5.000000
4,'Til There Was You (1997),2,4.000000
...,...,...,...
9714,eXistenZ (1999),22,3.863636
9715,xXx (2002),24,2.770833
9716,xXx: State of the Union (2005),5,2.000000
9717,¡Three Amigos! (1986),26,3.134615


In [19]:
# Keep titles with at least 250 ratings, order them by mean rating
# (best first) and retain at most the top 50.
popular_df = (
    popular_df
    .loc[lambda frame: frame['num_ratings'] >= 250]
    .sort_values(by='avg_rating', ascending=False)
    .head(50)
)

In [20]:
# Attach movieId and genres from `movies`, keeping a single row per title
# (presumably because a title can map to more than one movieId - verify).
# The left side is restricted to its three aggregate columns so the cell is
# safe to re-run: without this, a second execution would merge the already
# enriched frame again and produce movieId_x/movieId_y, genres_x/genres_y.
popular_df = (
    popular_df[['title', 'num_ratings', 'avg_rating']]
    .merge(movies, on='title')
    .drop_duplicates('title')
)

In [21]:
# Display the popularity ranking (titles with >= 250 ratings, best mean rating first).
popular_df

Unnamed: 0,title,num_ratings,avg_rating,movieId,genres
0,"Shawshank Redemption, The (1994)",317,4.429022,318,Crime|Drama
1,Star Wars: Episode IV - A New Hope (1977),251,4.231076,260,Action|Adventure|Sci-Fi
2,Pulp Fiction (1994),307,4.197068,296,Comedy|Crime|Drama|Thriller
3,"Matrix, The (1999)",278,4.192446,2571,Action|Sci-Fi|Thriller
4,Forrest Gump (1994),329,4.164134,356,Comedy|Drama|Romance|War
5,"Silence of the Lambs, The (1991)",279,4.16129,593,Crime|Horror|Thriller


## Collaborative-Filtering-Based Recommender System

In [22]:
 # Per-user flag: True when the user has more than 100 ratings.
 ratings_with_name.groupby('userId')['rating'].count() > 100

userId
1       True
2      False
3      False
4       True
5      False
       ...  
606     True
607     True
608     True
609    False
610     True
Name: rating, Length: 610, dtype: bool

In [23]:
# Boolean mask indexed by userId: True when the user has more than 100 ratings.
x = ratings_with_name.groupby('userId')['rating'].count() > 100
# userIds for which the mask is True.
movies_nerds = x.loc[x].index

In [24]:
# Display the per-user "more than 100 ratings" mask.
x

userId
1       True
2      False
3      False
4       True
5      False
       ...  
606     True
607     True
608     True
609    False
610     True
Name: rating, Length: 610, dtype: bool

In [25]:
# Display the userIds that passed the "more than 100 ratings" filter.
movies_nerds

Int64Index([  1,   4,   6,   7,  10,  15,  17,  18,  19,  20,
            ...
            599, 600, 601, 602, 603, 605, 606, 607, 608, 610],
           dtype='int64', name='userId', length=245)

In [26]:
# Keep only the rating rows written by the selected (heavy-rating) users.
filtered_rating = ratings_with_name.loc[
    ratings_with_name['userId'].isin(movies_nerds)
]

In [27]:
# Display the ratings restricted to the selected users.
filtered_rating

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
5,18,1,3.5,1455209816,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller


In [28]:
# Boolean mask indexed by title: True when the title has at least 20 ratings
# among the selected users.
y = filtered_rating.groupby('title')['rating'].count() >= 20
# Titles for which the mask is True.
famous_movies = y.loc[y].index

In [29]:
# Display the titles that have at least 20 ratings from the selected users.
famous_movies

Index(['(500) Days of Summer (2009)', '10 Things I Hate About You (1999)',
       '101 Dalmatians (1996)',
       '101 Dalmatians (One Hundred and One Dalmatians) (1961)',
       '12 Angry Men (1957)', '13th Warrior, The (1999)', '1408 (2007)',
       '2001: A Space Odyssey (1968)', '21 Grams (2003)',
       '21 Jump Street (2012)',
       ...
       'You've Got Mail (1998)', 'Young Frankenstein (1974)',
       'Young Guns (1988)', 'Zack and Miri Make a Porno (2008)',
       'Zodiac (2007)', 'Zombieland (2009)', 'Zoolander (2001)',
       'Zootopia (2016)', 'xXx (2002)', '¡Three Amigos! (1986)'],
      dtype='object', name='title', length=1141)

In [30]:
# Keep only the rating rows whose title passed the ">= 20 ratings" filter.
final_ratings = filtered_rating.loc[
    filtered_rating['title'].isin(famous_movies)
]

In [31]:
# Display the ratings restricted to both the selected users and the selected titles.
final_ratings

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
5,18,1,3.5,1455209816,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
91661,594,2986,4.5,1108975379,RoboCop 2 (1990),Action|Crime|Sci-Fi|Thriller
91662,599,2986,4.0,1498499208,RoboCop 2 (1990),Action|Crime|Sci-Fi|Thriller
91663,603,2986,2.0,963177931,RoboCop 2 (1990),Action|Crime|Sci-Fi|Thriller
91664,608,2986,1.5,1117674822,RoboCop 2 (1990),Action|Crime|Sci-Fi|Thriller


In [32]:
# Title x user matrix of ratings. pivot_table aggregates with the mean by
# default, so several ratings landing in the same (title, userId) cell are averaged.
pt = pd.pivot_table(
    final_ratings,
    values='rating',
    index='title',
    columns='userId',
)

In [33]:
# Display the title x user rating matrix (NaN where there is no rating).
pt

userId,1,4,6,7,10,15,17,18,19,20,...,599,600,601,602,603,605,606,607,608,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer (2009),,,,,,4.0,,4.0,,,...,2.5,,,,,,,,,3.5
10 Things I Hate About You (1999),,,,,,,,,3.0,,...,3.0,3.0,,,3.0,5.0,,,,
101 Dalmatians (1996),,,,,,,,,1.0,3.0,...,,2.0,,,4.0,3.0,,,,
101 Dalmatians (One Hundred and One Dalmatians) (1961),,,,,,1.5,,,,4.0,...,,3.0,,,,,,,,
12 Angry Men (1957),,5.0,,,,,,5.0,,,...,,,5.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zombieland (2009),,,,,,,,,,,...,3.0,,,,,,,,,3.5
Zoolander (2001),,,,,,,,3.0,,3.5,...,,4.5,,,,,,,3.0,4.0
Zootopia (2016),,,,,,3.0,,,,,...,,,4.5,,,,,,,4.0
xXx (2002),,,,,,,,,,0.5,...,,,,,,,,,3.5,2.0


In [34]:
# Replace missing (title, user) ratings with 0 so every row is a dense numeric vector.
pt = pt.fillna(0)

In [35]:
# Display the rating matrix after missing values were filled with 0.
pt

userId,1,4,6,7,10,15,17,18,19,20,...,599,600,601,602,603,605,606,607,608,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,3.0,3.0,0.0,0.0,3.0,5.0,0.0,0.0,0.0,0.0
101 Dalmatians (1996),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,...,0.0,2.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,0.0
101 Dalmatians (One Hundred and One Dalmatians) (1961),0.0,0.0,0.0,0.0,0.0,1.5,0.0,0.0,0.0,4.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zombieland (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
Zoolander (2001),0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.5,...,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0
Zootopia (2016),0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,4.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,2.0


In [36]:
from sklearn.metrics.pairwise import cosine_similarity

In [37]:
# Pairwise cosine similarity between the rows of `pt` (one row per title),
# so entry [i, j] compares the rating vectors of title i and title j.
similarity_scores = cosine_similarity(pt)

In [38]:
# Sanity check: the similarity matrix should be square, one row/column per title in `pt`.
similarity_scores.shape

(1141, 1141)

In [39]:
def recommend(title, n=9):
    """Print the titles most similar to ``title``.

    Looks up ``title`` in the index of the module-level pivot table ``pt``
    and ranks every other row by its score in the matching row of the
    module-level ``similarity_scores`` matrix, highest first.

    Parameters
    ----------
    title : str
        Exact title as it appears in ``pt.index``.
    n : int, optional
        Maximum number of titles to print (non-negative). Defaults to 9,
        the number the original ``[1:10]`` slice produced.

    Raises
    ------
    IndexError
        If ``title`` is not present in ``pt.index``. The exception type is
        the one the bare ``[0][0]`` lookup used to raise; it now carries an
        explanatory message.
    """
    matches = np.flatnonzero(pt.index == title)
    if matches.size == 0:
        raise IndexError(
            f"title {title!r} not found in pt.index; "
            "it may have been removed by the user/title filters"
        )
    index = matches[0]
    scores = similarity_scores[index]

    # Exclude the query row by position rather than assuming it sorts first:
    # another title with the same top score could otherwise take slot 0,
    # making the query itself appear in the output and hiding a real neighbour.
    # sorted() is stable, so equal scores keep ascending row order.
    candidates = (pos for pos in range(len(scores)) if pos != index)
    ranked = sorted(candidates, key=lambda pos: scores[pos], reverse=True)

    for pos in ranked[:n]:
        print(pt.index[pos])
    


In [40]:
# Example call: the title must match an entry of `pt.index` exactly.
recommend('¡Three Amigos! (1986)')


Money Pit, The (1986)
Good Morning, Vietnam (1987)
Spaceballs (1987)
Cocoon (1985)
Rocketeer, The (1991)
Flight of the Navigator (1986)
Waterboy, The (1998)
Risky Business (1983)
Splash (1984)
