# Case Study Recommender System

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from ipywidgets import *

In [3]:
movies = pd.read_csv("movies.csv")

In [4]:
ratings=pd.read_csv("ratings.csv")

In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [7]:
movies.shape

(10329, 3)

In [8]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  10329 non-null  int64 
 1   title    10329 non-null  object
 2   genres   10329 non-null  object
dtypes: int64(1), object(2)
memory usage: 242.2+ KB


In [9]:
ratings.shape

(105339, 4)

In [10]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105339 entries, 0 to 105338
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     105339 non-null  int64  
 1   movieId    105339 non-null  int64  
 2   rating     105339 non-null  float64
 3   timestamp  105339 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.2 MB


In [11]:
ratings["userId"].nunique()

668

In [12]:
ratings.movieId.nunique()

10325

In [13]:
movies.movieId.nunique()

10329

In [14]:
# 668 users
# 10325 movie ids in ratings
# 10329 movie ida in movies

In [15]:
movies["genres"]=movies["genres"].str.split("|")
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]
...,...,...,...
10324,146684,Cosmic Scrat-tastrophe (2015),"[Animation, Children, Comedy]"
10325,146878,Le Grand Restaurant (1966),[Comedy]
10326,148238,A Very Murray Christmas (2015),[Comedy]
10327,148626,The Big Short (2015),[Drama]


In [16]:
movies2=movies.explode("genres")
movies2

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy
...,...,...,...
10324,146684,Cosmic Scrat-tastrophe (2015),Comedy
10325,146878,Le Grand Restaurant (1966),Comedy
10326,148238,A Very Murray Christmas (2015),Comedy
10327,148626,The Big Short (2015),Drama


In [17]:
movies2["genres"].unique()

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'IMAX', 'War', 'Musical', 'Documentary',
       'Western', 'Film-Noir', '(no genres listed)'], dtype=object)

In [18]:
movies2 = movies2[movies2["genres"]!='(no genres listed)']

In [19]:
movies2.genres.nunique()

19

In [20]:
merged_info=pd.merge(ratings,movies2,on=["movieId"],how="inner")
merged_info

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,16,4.0,1217897793,Casino (1995),Crime
1,1,16,4.0,1217897793,Casino (1995),Drama
2,9,16,4.0,842686699,Casino (1995),Crime
3,9,16,4.0,842686699,Casino (1995),Drama
4,12,16,1.5,1144396284,Casino (1995),Crime
...,...,...,...,...,...,...
281892,668,140098,2.5,1450415424,Runoff (2015),Drama
281893,668,140816,2.5,1443288791,Tangerine (2015),Comedy
281894,668,140816,2.5,1443288791,Tangerine (2015),Drama
281895,668,142488,4.0,1451535844,Spotlight (2015),Thriller


In [21]:
popularity_df = merged_info.groupby(["genres","title"]).agg({"rating":["mean","size"]}).reset_index()

In [22]:
popularity_df.columns=["genre","title","avg_ratings","no. of ratings"]

In [23]:
popularity_df

Unnamed: 0,genre,title,avg_ratings,no. of ratings
0,Action,'71 (2014),3.500000,1
1,Action,'Hellboy': The Seeds of Creation (2004),3.000000,1
2,Action,10 to Midnight (1983),2.500000,1
3,Action,12 Rounds (2009),2.875000,4
4,Action,13 Assassins (Jûsan-nin no shikaku) (2010),3.500000,5
...,...,...,...,...
23093,Western,Wyatt Earp (1994),3.200000,30
23094,Western,Young Guns (1988),3.375000,36
23095,Western,Young Guns II (1990),3.083333,12
23096,Western,Young Ones (2014),2.000000,1


In [24]:
# genre = "adventure"
# rating threshold = 20
# top n=5

In [31]:
popularity_df[(popularity_df["genre"]=="Adventure")&(popularity_df["no. of ratings"]>20)]

Unnamed: 0,genre,title,avg_ratings,no. of ratings
1739,Adventure,101 Dalmatians (1996),3.071429,42
1740,Adventure,101 Dalmatians (One Hundred and One Dalmatians...,3.635135,37
1743,Adventure,"13th Warrior, The (1999)",3.317073,41
1745,Adventure,"20,000 Leagues Under the Sea (1954)",3.645833,24
1746,Adventure,2001: A Space Odyssey (1968),3.960317,126
...,...,...,...,...
2882,Adventure,"World Is Not Enough, The (1999)",3.455556,45
2884,Adventure,X-Men (2000),3.557692,130
2886,Adventure,X-Men: First Class (2011),3.660714,28
2887,Adventure,X2: X-Men United (2003),3.567073,82


In [26]:
# Popularity Based Recommender System

def TopNPopularMovies(genre,num_threshold,topN):
    #popularity df
    popularity_df = merged_info.groupby(["genres","title"]).agg({"rating":["mean","size"]}).reset_index()
    popularity_df.columns=["genre","title","avg_ratings","no. of ratings"]
    
    #filter data
    filter_pop = popularity_df[(popularity_df["genre"]==genre)&(popularity_df["no. of ratings"]>num_threshold)].sort_values(by='avg_ratings',ascending=False).head(topN)
    return(filter_pop)

In [27]:
TopNPopularMovies("Action",15,5)

Unnamed: 0,genre,title,avg_ratings,no. of ratings
1179,Action,Princess Mononoke (Mononoke-hime) (1997),4.384615,52
1076,Action,North by Northwest (1959),4.273973,73
1183,Action,"Professional, The (Le professionnel) (1981)",4.272727,22
680,Action,Henry V (1989),4.272727,22
975,Action,"Matrix, The (1999)",4.264368,261


In [28]:
def TopNPopularMovies(genre,num_threshold,topN):
    filter_pop = popularity_df[(popularity_df["genre"]==genre)&(popularity_df["no. of ratings"]>num_threshold)].sort_values(by='avg_ratings',ascending=False).head(topN).reset_index()
    return(filter_pop.drop(['index','genre'],axis=1))

In [29]:
TopNPopularMovies("Action",15,5)

Unnamed: 0,title,avg_ratings,no. of ratings
0,Princess Mononoke (Mononoke-hime) (1997),4.384615,52
1,North by Northwest (1959),4.273973,73
2,"Professional, The (Le professionnel) (1981)",4.272727,22
3,Henry V (1989),4.272727,22
4,"Matrix, The (1999)",4.264368,261


# Content Based Recommender

In [30]:
movies2

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy
...,...,...,...
10324,146684,Cosmic Scrat-tastrophe (2015),Children
10324,146684,Cosmic Scrat-tastrophe (2015),Comedy
10325,146878,Le Grand Restaurant (1966),Comedy
10326,148238,A Very Murray Christmas (2015),Comedy


In [32]:
x=["ADVENTURE","COMEDY","FANTASY"]
" ".join(x)

'ADVENTURE COMEDY FANTASY'

In [36]:
movies3 = movies2.groupby("title").agg({"genres":lambda x:" ".join(list(x))}).reset_index()
movies3

Unnamed: 0,title,genres
0,'71 (2014),Action Drama Thriller War
1,'Hellboy': The Seeds of Creation (2004),Action Adventure Comedy Documentary Fantasy
2,'Round Midnight (1986),Drama Musical
3,'Til There Was You (1997),Drama Romance
4,"'burbs, The (1989)",Comedy
...,...,...
10315,loudQUIETloud: A Film About the Pixies (2006),Documentary
10316,xXx (2002),Action Crime Thriller
10317,xXx: State of the Union (2005),Action Crime Thriller
10318,¡Three Amigos! (1986),Comedy Western


In [39]:
tf = TfidfVectorizer(analyzer="word",ngram_range=(1,3),stop_words='english',min_df=0)

In [42]:
tf_matrix = tf.fit_transform(movies3["genres"])

In [43]:
tf_matrix

<10320x597 sparse matrix of type '<class 'numpy.float64'>'
	with 44518 stored elements in Compressed Sparse Row format>

# Cosine Similarity

In [46]:
cosine_sim = cosine_similarity(tf_matrix,tf_matrix)
cosine_sim

array([[1.        , 0.02677945, 0.02931913, ..., 0.10229517, 0.        ,
        0.        ],
       [0.02677945, 1.        , 0.        , ..., 0.03626651, 0.02411583,
        0.02863994],
       [0.02931913, 0.        , 1.        , ..., 0.        , 0.        ,
        0.35526663],
       ...,
       [0.10229517, 0.03626651, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.02411583, 0.        , ..., 0.        , 1.        ,
        0.07090711],
       [0.        , 0.02863994, 0.35526663, ..., 0.        , 0.07090711,
        1.        ]])

In [55]:
indices=pd.Series(movies3['title'].index,index=movies3['title'])
indices

title
'71 (2014)                                           0
'Hellboy': The Seeds of Creation (2004)              1
'Round Midnight (1986)                               2
'Til There Was You (1997)                            3
'burbs, The (1989)                                   4
                                                 ...  
loudQUIETloud: A Film About the Pixies (2006)    10315
xXx (2002)                                       10316
xXx: State of the Union (2005)                   10317
¡Three Amigos! (1986)                            10318
À nous la liberté (Freedom for Us) (1931)        10319
Length: 10320, dtype: int64

In [75]:
index=indices["Toy Story (1995)"]

In [76]:
sim_scores=cosine_sim[index]
l=list(enumerate(sim_scores))
l

[(0, 0.0),
 (1, 0.08523498248083886),
 (2, 0.0),
 (3, 0.0),
 (4, 0.13129724316432967),
 (5, 0.0),
 (6, 0.03567220208744299),
 (7, 0.4318316465143841),
 (8, 0.0),
 (9, 0.050235901098503954),
 (10, 0.03567220208744299),
 (11, 0.050235901098503954),
 (12, 0.057966124016070465),
 (13, 0.04750827819431324),
 (14, 0.050235901098503954),
 (15, 0.2917769253786178),
 (16, 0.6964846697441013),
 (17, 0.03567220208744299),
 (18, 0.40369483869702805),
 (19, 0.0),
 (20, 0.29918796180849927),
 (21, 0.0),
 (22, 0.019206402336461165),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.06291308317909093),
 (30, 0.016951812643688488),
 (31, 0.0),
 (32, 0.0),
 (33, 0.0),
 (34, 0.2372685497183724),
 (35, 0.0),
 (36, 0.13983002091003893),
 (37, 0.0),
 (38, 0.10762846621009421),
 (39, 0.0),
 (40, 0.0),
 (41, 0.06296617662722356),
 (42, 0.07574565870959844),
 (43, 0.0),
 (44, 0.0),
 (45, 0.03207591544507681),
 (46, 0.0),
 (47, 0.0),
 (48, 0.13129724316432967),
 (49, 0.035672202087

In [77]:
sort_sc=sorted(l,key=lambda x:x[1],reverse=True)

In [78]:
sort_sc

[(242, 1.0000000000000002),
 (550, 1.0000000000000002),
 (635, 1.0000000000000002),
 (1332, 1.0000000000000002),
 (2752, 1.0000000000000002),
 (2883, 1.0000000000000002),
 (6133, 1.0000000000000002),
 (8208, 1.0000000000000002),
 (8890, 1.0000000000000002),
 (9377, 1.0000000000000002),
 (9378, 1.0000000000000002),
 (9490, 1.0000000000000002),
 (10064, 1.0000000000000002),
 (8862, 0.8937209964057728),
 (9088, 0.8937209964057728),
 (3674, 0.8645951852315636),
 (8205, 0.8645951852315636),
 (1933, 0.8610003252639898),
 (6912, 0.8610003252639898),
 (7220, 0.8610003252639898),
 (438, 0.8403193635914471),
 (637, 0.8403193635914471),
 (1504, 0.8403193635914471),
 (3210, 0.8403193635914471),
 (4128, 0.8403193635914471),
 (4269, 0.8403193635914471),
 (4486, 0.8403193635914471),
 (4487, 0.8403193635914471),
 (4776, 0.8403193635914471),
 (4846, 0.8403193635914471),
 (5656, 0.8403193635914471),
 (6044, 0.8403193635914471),
 (6828, 0.8403193635914471),
 (6968, 0.8403193635914471),
 (7066, 0.84031936

In [79]:
im = [i[0] for i in sort_sc]
im

[242,
 550,
 635,
 1332,
 2752,
 2883,
 6133,
 8208,
 8890,
 9377,
 9378,
 9490,
 10064,
 8862,
 9088,
 3674,
 8205,
 1933,
 6912,
 7220,
 438,
 637,
 1504,
 3210,
 4128,
 4269,
 4486,
 4487,
 4776,
 4846,
 5656,
 6044,
 6828,
 6968,
 7066,
 7089,
 7646,
 7647,
 7783,
 8520,
 9810,
 541,
 8207,
 9379,
 4225,
 8463,
 9669,
 2890,
 1273,
 4596,
 16,
 786,
 1470,
 2543,
 2551,
 3881,
 7558,
 8135,
 7700,
 3098,
 187,
 218,
 219,
 220,
 2863,
 3282,
 3293,
 3394,
 3395,
 3762,
 4345,
 4774,
 4985,
 5394,
 5875,
 7076,
 8713,
 304,
 4115,
 6676,
 7687,
 910,
 1505,
 1662,
 1756,
 1789,
 2090,
 3331,
 3538,
 3956,
 4248,
 4253,
 4295,
 4638,
 4754,
 5509,
 7784,
 7785,
 7864,
 8128,
 8351,
 8795,
 9811,
 4489,
 1663,
 3965,
 5657,
 6749,
 3784,
 7105,
 305,
 8206,
 653,
 1121,
 1689,
 5190,
 5518,
 6702,
 7027,
 7139,
 7168,
 8266,
 9890,
 7435,
 1368,
 636,
 1308,
 5316,
 6493,
 8416,
 5141,
 5658,
 1788,
 3399,
 5934,
 6241,
 614,
 8010,
 8933,
 9639,
 6856,
 7888,
 10017,
 3759,
 5170,
 4

In [83]:
movies3.iloc[8208]

title                          Shrek the Third (2007)
genres    Adventure Animation Children Comedy Fantasy
Name: 8208, dtype: object