In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from ipywidgets import *

In [2]:
movies=pd.read_csv('movies.csv')
ratings=pd.read_csv('ratings.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
#checking Null values present in the dataset

In [6]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB


In [7]:
movies.shape

(27278, 3)

In [8]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [9]:
ratings.shape

(100836, 4)

In [10]:
#No Null present in the dataset


In [11]:
ratings.describe()  #0-5 and avg=3.5

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [12]:
#different genres available
movies['genres']=movies['genres'].str.split("|")

In [13]:
movies2=movies.explode('genres')

In [14]:
movies2.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy


In [15]:
#unique genres
movies2['genres'].unique()

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'IMAX', 'Documentary', 'War', 'Musical',
       'Western', 'Film-Noir', '(no genres listed)'], dtype=object)

In [16]:
movies2['genres'].nunique()

20

In [17]:
#remove no genres listed
movies2=movies2[movies2['genres']!='(no genres listed)']

In [18]:
movies2['genres'].unique()

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'IMAX', 'Documentary', 'War', 'Musical',
       'Western', 'Film-Noir'], dtype=object)

In [19]:
movies2['genres'].nunique()

19

In [20]:
#group the dataframe based on genres and we will get the average rating for each genre
#merege movies2 and rating df
merged_data=pd.merge(ratings,movies2,on=['movieId'],how='inner')
merged_data

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure
1,1,1,4.0,964982703,Toy Story (1995),Animation
2,1,1,4.0,964982703,Toy Story (1995),Children
3,1,1,4.0,964982703,Toy Story (1995),Comedy
4,1,1,4.0,964982703,Toy Story (1995),Fantasy
...,...,...,...,...,...,...
267362,610,130052,3.5,1493848925,Clown (2014),Drama
267363,610,130052,3.5,1493848925,Clown (2014),Horror
267364,610,130840,4.5,1493848621,Spring (2015),Horror
267365,610,130840,4.5,1493848621,Spring (2015),Romance


In [21]:
popularity=merged_data.groupby(['genres','title']).agg({'rating':['mean','size']}).reset_index()
popularity.columns=['genres','Title','Average_Ratings',"Number_of_Ratings"]
popularity

Unnamed: 0,genres,Title,Average_Ratings,Number_of_Ratings
0,Action,'71 (2014),4.000000,1
1,Action,'Hellboy': The Seeds of Creation (2004),4.000000,1
2,Action,"...All the Marbles (California Dolls, The) (1981)",2.000000,1
3,Action,"10th Victim, The (La decima vittima) (1965)",4.000000,1
4,Action,12 Rounds (2009),3.000000,3
...,...,...,...,...
19953,Western,Winds of the Wasteland (1936),3.500000,1
19954,Western,Wyatt Earp (1994),3.095238,21
19955,Western,Young Guns (1988),3.100000,25
19956,Western,Young Guns II (1990),3.000000,11


In [22]:
#genres=action
#threshold=50
#top=7
popularity[(popularity['genres']=='Action')&(popularity['Number_of_Ratings']>=50)].sort_values(by='Average_Ratings',ascending=False).head(7)

Unnamed: 0,genres,Title,Average_Ratings,Number_of_Ratings
511,Action,Fight Club (1999),4.272936,218
344,Action,"Dark Knight, The (2008)",4.238255,149
1134,Action,"Princess Bride, The (1987)",4.232394,142
1359,Action,Star Wars: Episode IV - A New Hope (1977),4.231076,251
87,Action,Apocalypse Now (1979),4.219626,107
1360,Action,Star Wars: Episode V - The Empire Strikes Back...,4.21564,211
1158,Action,Raiders of the Lost Ark (Indiana Jones and the...,4.2075,200


In [23]:
#popularity  recommender system
def TopNPopularMovies(genres,threshold,topN):
  popularity=merged_data.groupby(['genres','title']).agg({'rating':['mean','size']}).reset_index()
  popularity.columns=['genres','Title','Average_Ratings',"Number_of_Ratings"]

  #filter the data
  topNrecommendations=popularity[(popularity['genres']==genres)&(popularity['Number_of_Ratings']>=threshold)].sort_values(by='Average_Ratings',ascending=False).head(topN)
  #output
  topNrecommendations['Sno.']=range(1,len(topNrecommendations)+1)
  topNrecommendations.index=range(0,len(topNrecommendations))
  #Assign the new column names using a list
  topNrecommendations.columns = ['Genres','Movie Title','Average Movie Rating','Number of Reviews','Sno.']
  return topNrecommendations[['Sno.','Movie Title','Average Movie Rating','Number of Reviews']]

In [24]:
#test case1
genre='Adventure'
threshold=50
topN=8
TopNPopularMovies(genres=genre,threshold=threshold,topN=topN)

Unnamed: 0,Sno.,Movie Title,Average Movie Rating,Number of Reviews
0,1,"Princess Bride, The (1987)",4.232394,142
1,2,Star Wars: Episode IV - A New Hope (1977),4.231076,251
2,3,Star Wars: Episode V - The Empire Strikes Back...,4.21564,211
3,4,Raiders of the Lost Ark (Indiana Jones and the...,4.2075,200
4,5,North by Northwest (1959),4.184211,57
5,6,Monty Python and the Holy Grail (1975),4.161765,136
6,7,Spirited Away (Sen to Chihiro no kamikakushi) ...,4.155172,87
7,8,City of God (Cidade de Deus) (2002),4.146667,75


In [25]:
#content Based recommender system

In [26]:
movies2.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy


In [27]:
movies3=movies2.groupby('title').agg({"genres":lambda x:" ".join(list(x))}).reset_index()

In [28]:
movies3.head()

Unnamed: 0,title,genres
0,"""Great Performances"" Cats (1998)",Musical
1,#chicagoGirl: The Social Network Takes on a Di...,Documentary
2,$ (Dollars) (1971),Comedy Crime Drama
3,$5 a Day (2008),Comedy Drama
4,$9.99 (2008),Animation


In [29]:
#instance of tfidvectorizer>>>>.used to apply the function upon genres
tf=TfidfVectorizer(analyzer='word',ngram_range=(1,3),stop_words='english')
tf

In [30]:
tf_matrix=tf.fit_transform(movies3['genres'])

In [31]:
cosine_sim=cosine_similarity(tf_matrix,tf_matrix)

In [32]:
cosine_sim

array([[1.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 1.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 1.       , ..., 0.2309757, 0.2309757,
        0.       ],
       ...,
       [0.       , 0.       , 0.2309757, ..., 1.       , 1.       ,
        0.       ],
       [0.       , 0.       , 0.2309757, ..., 1.       , 1.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        1.       ]])

In [33]:
def recommendation_genre(movie_df, similarity_matrix, movie_title, topN):
    # Normalize titles
    movie_df['title'] = movie_df['title'].str.strip().str.lower()
    movie_title = movie_title.strip().lower()

    # Create indices for all movies
    indices = pd.Series(movie_df.index, index=movie_df['title'])

    # Check if the movie title exists in the dataset
    if movie_title not in indices:
        raise ValueError(f"The movie title '{movie_title}' was not found in the dataset.")

    # Index of the target movie
    index = indices[movie_title]

    # Cosine similarity scores
    cosine_scores = list(enumerate(similarity_matrix[index]))
    cosine_scores = sorted(cosine_scores, key=lambda x: x[1], reverse=True)[1:topN+2]

    # Extract matching movies
    matched = [i[0] for i in cosine_scores]
    matching_df = movie_df.iloc[matched]

    # Filter out the target movie
    matching_df = matching_df[matching_df['title'] != movie_title]

    # Output
    matching_df.rename(columns={'title': 'Movie Title'}, inplace=True)
    matching_df['Sno.'] = range(1, len(matching_df) + 1)
    matching_df.index = range(0, len(matching_df))
    return matching_df[['Sno.', 'Movie Title']].head(topN)


In [34]:
#test case
recommendation_genre(movie_df=movies3, similarity_matrix=cosine_sim, movie_title='Toy Story (1995)', topN=10)


Unnamed: 0,Sno.,Movie Title
0,1,antz (1998)
1,2,asterix and the vikings (astérix et les viking...
2,3,"boxtrolls, the (2014)"
3,4,brother bear 2 (2006)
4,5,ducktales: the movie - treasure of the lost la...
5,6,"emperor's new groove, the (2000)"
6,7,"monsters, inc. (2001)"
7,8,shrek the third (2007)
8,9,"tale of despereaux, the (2008)"
9,10,the magic crystal (2011)


In [35]:
#interactive widgets

In [36]:
#popularity
#inputs
genres=Dropdown(options=list(set(movies2['genres'])),description='Genres',style={"description_width":'initial'})
num_reviews=IntText(description="Minimum Reviews",style={"description_width":'initial'})
num_recommendations_1=IntText(description="Number of Recommendations",style={"description_width":'initial'})

#tabs
b1=Button(description="RECOMMEND ME",style={"description_width":'initial'})
h1=HBox([num_reviews,num_recommendations_1])
popularity_tab=VBox([genres,h1,b1])

#content base system
title=Textarea(description="Movie Title",style={"description_width":'initial'})
num_recommendations_2=IntText(description="Number of Recommendations",style={"description_width":'initial'})

#tabs
h2=HBox([title,num_recommendations_2])
b2=Button(description="RECOMMEND ME",style={"description_width":'initial'})

content_tab=VBox([h2,b2])

#creating final tabs
tabs=[popularity_tab,content_tab]
wid=widgets.Tab(tabs)

#set titles to the tabs
names=['Popularity Based Recommendation','Content Based Recommendation']
[wid.set_title(i,title) for i,title in enumerate(names)]

display(wid)

Tab(children=(VBox(children=(Dropdown(description='Genres', options=('Comedy', 'Thriller', 'Romance', 'War', '…

In [37]:
#setting up events to respond when clicked upon

#popularity
def b1_clicked(b):
  global output
  output=TopNPopularMovies(genres=genres.value,threshold=num_reviews.value,topN=num_recommendations_1.value)
b1.on_click(b1_clicked)

#content
def b2_clicked(b):
  global output
  result=recommendation_genre(movie_df=movies3,similarity_matrix=cosine_sim,movie_title=title.value,topN=num_recommendations_2.value)
  output=result
b2.on_click(b2_clicked)

In [38]:
display(wid)

Tab(children=(VBox(children=(Dropdown(description='Genres', options=('Comedy', 'Thriller', 'Romance', 'War', '…

In [41]:
output

Unnamed: 0,Sno.,Movie Title
0,1,antz (1998)
1,2,asterix and the vikings (astérix et les viking...
2,3,brother bear 2 (2006)
3,4,ducktales: the movie - treasure of the lost la...
4,5,"emperor's new groove, the (2000)"
