In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets
from IPython.display import display

In [4]:
# Locations of the two input CSV files; named once so the error message below
# always reports the paths that were actually tried.
MOVIES_CSV = '/content/movies (1).csv'
RATINGS_CSV = '/content/ratings.csv'

try:
    movies_df = pd.read_csv(MOVIES_CSV)
    ratings_df = pd.read_csv(RATINGS_CSV)
except FileNotFoundError as err:
    print(f"Could not load the datasets ({err}).")
    print(f"Please make sure '{MOVIES_CSV}' and '{RATINGS_CSV}' exist.")
    # Fall back to empty frames with the expected columns so that later cells
    # can still reference these names.
    movies_df = pd.DataFrame(columns=['movieId', 'title', 'genres'])
    ratings_df = pd.DataFrame(columns=['userId', 'movieId', 'rating', 'timestamp'])
else:
    # Only report success when both files were actually read.
    print("Datasets loaded")

print("\nMovies:")
print(movies_df.head())
print("\nRatings:")
print(ratings_df.head())

Datasets loaded

Movies:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings:
   userId  movieId  rating   timestamp
0       1       16     4.0  1217897793
1       1       24     1.5  1217895807
2       1       32     4.0  1217896246
3       1       47     4.0  1217896556
4       1       50     4.0  1217896523


In [5]:
# Report (rows, columns) for both raw tables.
print("\n".join(
    f"Shape of {label} data: {frame.shape}"
    for label, frame in (("movies", movies_df), ("ratings", ratings_df))
))

Shape of movies data: (10329, 3)
Shape of ratings data: (105339, 4)


In [6]:
# Attach title and genres to every rating row by joining on the shared movieId key.
df = ratings_df.merge(movies_df, on='movieId')
print("\nMerged DataFrame Head:", df.head(), sep="\n")


Merged DataFrame Head:
   userId  movieId  rating   timestamp  \
0       1       16     4.0  1217897793   
1       1       24     1.5  1217895807   
2       1       32     4.0  1217896246   
3       1       47     4.0  1217896556   
4       1       50     4.0  1217896523   

                                       title                   genres  
0                              Casino (1995)              Crime|Drama  
1                              Powder (1995)             Drama|Sci-Fi  
2  Twelve Monkeys (a.k.a. 12 Monkeys) (1995)  Mystery|Sci-Fi|Thriller  
3                Seven (a.k.a. Se7en) (1995)         Mystery|Thriller  
4                 Usual Suspects, The (1995)   Crime|Mystery|Thriller  


In [7]:
# Distinct users and distinct movies present in the merged ratings table.
n_users, n_movies = (df[column].nunique() for column in ('userId', 'movieId'))
print("\nNumber of unique users: {}".format(n_users))
print("Number of unique movies: {}".format(n_movies))


Number of unique users: 668
Number of unique movies: 10325


In [8]:
# Summary statistics of the rating column.
rating_summary = df['rating'].describe()
print("\nRating Distribution:", rating_summary, sep="\n")


Rating Distribution:
count    105339.000000
mean          3.516850
std           1.044872
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64


In [9]:
# One indicator column per genre, obtained by splitting the '|'-separated genre string.
genres_df = movies_df['genres'].str.get_dummies(sep='|')
unique_genres = genres_df.columns.tolist()
print("\nFound {} unique genres.".format(len(unique_genres)))


Found 20 unique genres.


In [10]:
# Column sums of the indicator matrix give the number of movies tagged with each genre.
movies_per_genre = genres_df.sum().sort_values(ascending=False)
print("\nTotal movies per genre:", movies_per_genre, sep="\n")


Total movies per genre:
Drama                 5220
Comedy                3515
Thriller              2187
Romance               1788
Action                1737
Crime                 1440
Adventure             1164
Horror                1001
Sci-Fi                 860
Mystery                675
Fantasy                670
Children               540
War                    503
Documentary            415
Musical                409
Animation              401
Western                235
Film-Noir              195
IMAX                   152
(no genres listed)       7
dtype: int64


In [11]:
# Per-rating genre indicators: keep the remaining (rating) column(s) of the merged
# table and append one 0/1 column per genre in place of the raw genre string.
genre_flags = df['genres'].str.get_dummies(sep='|')
genre_ratings = (
    df.drop(columns=['movieId', 'userId', 'timestamp', 'title', 'genres'])
    .join(genre_flags)
)

In [12]:
print("\nAverage rating per genre:")
# For each genre, average the ratings of the rows flagged with that genre.
avg_genre_rating = {
    genre_name: genre_ratings.loc[genre_ratings[genre_name] == 1, 'rating'].mean()
    for genre_name in unique_genres
}

avg_rating_df = pd.Series(avg_genre_rating).sort_values(ascending=False)
print(avg_rating_df)


Average rating per genre:
Film-Noir             3.913636
War                   3.783202
Mystery               3.652043
Drama                 3.650266
Documentary           3.643035
Crime                 3.642392
IMAX                  3.641821
Animation             3.635350
Musical               3.571962
Western               3.565687
Romance               3.544255
Adventure             3.518027
Fantasy               3.500459
Thriller              3.495561
Sci-Fi                3.454481
Action                3.451450
Children              3.439429
Comedy                3.420996
Horror                3.281097
(no genres listed)    3.071429
dtype: float64


In [13]:
def popularity_recommender(genre, min_reviews_threshold, N):
    """Display the N best-rated movies of a genre that have enough reviews.

    Reads the module-level merged ratings frame ``df`` (columns ``title``,
    ``genres`` and ``rating`` are used).

    Args:
        genre: Genre label to look for inside the ``genres`` string
            (case-insensitive substring match).
        min_reviews_threshold: Minimum number of ratings a movie needs to qualify.
        N: Maximum number of movies to show.

    Returns:
        None. The result table is shown with ``display``; a message is printed
        instead when no movie matches the genre or meets the threshold.
    """
    # regex=False: the genre is a literal label, not a pattern. Labels such as
    # '(no genres listed)' contain regex metacharacters and would otherwise be
    # parsed as a capture group (pandas warns about that).
    genre_df = df[df['genres'].str.contains(genre, case=False, na=False, regex=False)]
    if genre_df.empty:
        print(f"No movies found for the genre: {genre}")
        return
    movie_stats = genre_df.groupby('title').agg(
        Num_Reviews=('rating', 'count'),
        Average_Movie_Rating=('rating', 'mean')
    ).reset_index()
    qualified_movies = movie_stats[movie_stats['Num_Reviews'] >= min_reviews_threshold]
    if qualified_movies.empty:
        # Say why nothing is shown instead of displaying an empty table.
        print(f"No movies in the genre '{genre}' have at least {min_reviews_threshold} reviews.")
        return
    top_n_movies = qualified_movies.sort_values(by='Average_Movie_Rating', ascending=False).head(N)
    # Present a 1-based serial number instead of the groupby row positions.
    top_n_movies = top_n_movies.reset_index(drop=True)
    top_n_movies.index = top_n_movies.index + 1
    top_n_movies.index.name = 'S.No'
    display(top_n_movies[['title', 'Average_Movie_Rating', 'Num_Reviews']].rename(columns={'title': 'Movie Title'}))

In [14]:
# Example run of the popularity recommender with fixed arguments.
comedy_demo_args = {'genre': 'Comedy', 'min_reviews_threshold': 100, 'N': 5}
print("Top 5 Popular Movies in 'Comedy' Genre (with at least 100 reviews)")
popularity_recommender(**comedy_demo_args)

Top 5 Popular Movies in 'Comedy' Genre (with at least 100 reviews)


Unnamed: 0_level_0,Movie Title,Average_Movie_Rating,Num_Reviews
S.No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Monty Python and the Holy Grail (1975),4.301948,154
2,Fargo (1996),4.271144,201
3,"Princess Bride, The (1987)",4.163743,171
4,Pulp Fiction (1994),4.16,325
5,Forrest Gump (1994),4.138264,311


In [15]:
# TF-IDF encode every movie's genre string and compute the pairwise cosine
# similarity between all movies (one row/column per row of movies_df).
tfidf = TfidfVectorizer(stop_words='english')
movies_df['genres'] = movies_df['genres'].fillna('')
tfidf_matrix = tfidf.fit_transform(movies_df['genres'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Title -> row lookup used by the content-based recommender.
# Series.drop_duplicates() compares the *values* (the row labels, which are all
# distinct), so it never removed repeated titles; a repeated title then made
# indices[title] return a Series instead of a single row. De-duplicate on the
# title index instead, keeping the first occurrence.
# NOTE(review): this assumes movies_df still has its default 0..n-1 index, so
# that labels equal row positions in cosine_sim - confirm if the frame is filtered.
title_to_row = pd.Series(movies_df.index, index=movies_df['title'])
indices = title_to_row[~title_to_row.index.duplicated(keep='first')]

In [16]:
def content_based_recommender(movie_title, N):
    """Display the N movies whose genres are most similar to ``movie_title``.

    Reads the module-level ``indices`` (title -> row lookup), ``cosine_sim``
    (pairwise similarity matrix) and ``movies_df``.

    Args:
        movie_title: Exact title to look up in ``indices``.
        N: Number of similar movies to show.

    Returns:
        None. The result table is shown with ``display``; a message is printed
        when the title is unknown.
    """
    if movie_title not in indices:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return
    idx = indices[movie_title]
    if isinstance(idx, pd.Series):
        # The title occurs more than once in the lookup; use its first row.
        idx = idx.iloc[0]
    scores = np.asarray(cosine_sim[idx])
    # Stable descending order: ties keep ascending row order, exactly like
    # sorted(..., reverse=True) on the enumerated scores.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query movie by identity rather than assuming it sorts first:
    # any earlier row with identical genres ties with it at the top, which used
    # to make the movie recommend itself and drop a genuine match.
    movie_indices = ranked[ranked != idx][:N]
    recommended_movies = movies_df['title'].iloc[movie_indices].to_frame()
    recommended_movies = recommended_movies.reset_index(drop=True)
    recommended_movies.index = recommended_movies.index + 1
    recommended_movies.index.name = 'Sl.No'
    display(recommended_movies.rename(columns={'title': 'Movie Title'}))

In [18]:
# Example run of the content-based recommender with fixed arguments.
toy_story_demo_args = {'movie_title': 'Toy Story (1995)', 'N': 5}
print("\nTop 5 Movies Similar to 'Toy Story (1995)' Based on Genre")
content_based_recommender(**toy_story_demo_args)


Top 5 Movies Similar to 'Toy Story (1995)' Based on Genre


Unnamed: 0_level_0,Movie Title
Sl.No,Unnamed: 1_level_1
1,Antz (1998)
2,Toy Story 2 (1999)
3,"Adventures of Rocky and Bullwinkle, The (2000)"
4,"Emperor's New Groove, The (2000)"
5,"Monsters, Inc. (2001)"


In [19]:
# Users x titles rating table; cells with no rating become 0.
ratings_by_user = df.pivot_table(index='userId', columns='title', values='rating')
user_movie_matrix = ratings_by_user.fillna(0)

# Pairwise cosine similarity between the users' rating rows, labelled by userId
# on both axes.
user_similarity = cosine_similarity(user_movie_matrix)
user_ids = user_movie_matrix.index
user_similarity_df = pd.DataFrame(user_similarity, index=user_ids, columns=user_ids)

In [20]:
def collaborative_recommender(user_id, k, N):
    """Display N movies for ``user_id`` based on the k most similar users.

    Reads the module-level ``user_movie_matrix`` (users x titles, 0 = unrated)
    and ``user_similarity_df`` (user x user similarity). Each unseen movie is
    scored with the similarity-weighted mean of the neighbours' ratings.

    Args:
        user_id: Row label of the target user in ``user_movie_matrix``.
        k: Number of most similar users to consult.
        N: Number of movies to show.

    Returns:
        None. The result table is shown with ``display``; a message is printed
        when the user is unknown or nothing can be recommended.
    """
    if user_id not in user_movie_matrix.index:
        print(f"User ID {user_id} not found.")
        return
    # Remove the target user explicitly instead of assuming it sorts first.
    similar_users = (
        user_similarity_df[user_id]
        .drop(user_id)
        .sort_values(ascending=False)
        .head(k)
    )
    # Neighbours with zero similarity carry no weight; keeping them produced
    # 0/0 = NaN predictions for movies only they had rated.
    similar_users = similar_users[similar_users > 0]
    target_ratings = user_movie_matrix.loc[user_id]
    user_watched_movies = target_ratings[target_ratings > 0].index
    weighted_rating_sum = {}
    similarity_sum = {}
    for similar_user_id, similarity_score in similar_users.items():
        neighbour_ratings = user_movie_matrix.loc[similar_user_id]
        rated = neighbour_ratings[neighbour_ratings > 0]
        # Movies this neighbour rated that the target user has not.
        unseen = rated.loc[rated.index.difference(user_watched_movies)]
        for movie, rating in unseen.items():
            weighted_rating_sum[movie] = weighted_rating_sum.get(movie, 0.0) + similarity_score * rating
            similarity_sum[movie] = similarity_sum.get(movie, 0.0) + similarity_score
    if not weighted_rating_sum:
        print("Could not find any new movies to recommend based on the given users.")
        return
    predicted_ratings = {
        movie: weighted_rating_sum[movie] / similarity_sum[movie]
        for movie in weighted_rating_sum
    }
    top_n_movies = sorted(predicted_ratings.items(), key=lambda item: item[1], reverse=True)[:N]
    rec_df = pd.DataFrame(top_n_movies, columns=['Movie Title', 'Predicted Rating'])
    # 1-based serial number column for presentation.
    rec_df.insert(0, 'S.No', range(1, len(rec_df) + 1))
    display(rec_df[['S.No', 'Movie Title']])

In [21]:
# Example run of the collaborative recommender with fixed arguments.
user_one_demo_args = {'user_id': 1, 'k': 100, 'N': 5}
print("\nTop 5 Movie Recommendations for User ID 1 (based on 100 similar users)")
collaborative_recommender(**user_one_demo_args)


Top 5 Movie Recommendations for User ID 1 (based on 100 similar users)


Unnamed: 0,S.No,Movie Title
0,1,Waking Life (2001)
1,2,Sanjuro (Tsubaki Sanjûrô) (1962)
2,3,Crash (1996)
3,4,Kundun (1997)
4,5,"Nasty Girl, The (schreckliche Mädchen, Das) (1..."


In [22]:
# Shared style dict passed to every widget below.
style = {'description_width': 'initial'}


def _num_recommendations_slider():
    """Return a fresh 1-10 slider (default 5) for the N argument of a recommender."""
    return widgets.IntSlider(
        min=1, max=10, step=1, value=5,
        description='Num Recommendations:', style=style,
    )


# --- Popularity-based recommender: genre + minimum review count + N ---
print("\n### Popularity-Based Recommender GUI ###")
genre_dropdown = widgets.Dropdown(
    options=sorted(unique_genres), description='Select Genre:', style=style
)
min_reviews_slider = widgets.IntSlider(
    min=50, max=500, step=10, value=100,
    description='Min Reviews Threshold:', style=style,
)
popularity_ui = widgets.interactive(
    popularity_recommender,
    genre=genre_dropdown,
    min_reviews_threshold=min_reviews_slider,
    N=_num_recommendations_slider(),
)
display(popularity_ui)

# --- Content-based recommender: movie title + N ---
print("\n### Content-Based Recommender GUI ###")
movie_dropdown = widgets.Dropdown(
    options=sorted(movies_df['title'].tolist()), description='Select Movie:', style=style
)
content_ui = widgets.interactive(
    content_based_recommender,
    movie_title=movie_dropdown,
    N=_num_recommendations_slider(),
)
display(content_ui)

# --- Collaborative filtering recommender: user id + neighbour count + N ---
print("\n### Collaborative Filtering Recommender GUI ###")
user_dropdown = widgets.Dropdown(
    options=sorted(ratings_df['userId'].unique().tolist()),
    description='Select UserID:', style=style,
)
neighbour_slider = widgets.IntSlider(
    min=10, max=200, step=10, value=100,
    description='Num Similar Users (k):', style=style,
)
collab_ui = widgets.interactive(
    collaborative_recommender,
    user_id=user_dropdown,
    k=neighbour_slider,
    N=_num_recommendations_slider(),
)
display(collab_ui)


### Popularity-Based Recommender GUI ###


interactive(children=(Dropdown(description='Select Genre:', options=('(no genres listed)', 'Action', 'Adventur…


### Content-Based Recommender GUI ###


interactive(children=(Dropdown(description='Select Movie:', options=("'71 (2014)", "'Hellboy': The Seeds of Cr…


### Collaborative Filtering Recommender GUI ###


interactive(children=(Dropdown(description='Select UserID:', options=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1…