# Build Recommendation Systems

# Popularity-based recommendations

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import statistics


# You can also use this section to suppress warnings generated by your code:
def warn(*args, **kwargs):
    pass

import warnings

warnings.warn = warn
warnings.filterwarnings('ignore')

In [2]:
movie_df = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/BxZuF3FrO7Bdw6McwsBaBw/movies.csv')
rating_df = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/R-bYYyyf7s3IUE5rsssmMw/ratings.csv')
tag_df = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/UZKHhXSl7Ft7t9mfUFZJPQ/tags.csv')

In [3]:
movie_df.sample(5)

Unnamed: 0,movieId,title,genres
7797,92094,Einstein and Eddington (2008),Drama
4363,6380,Capturing the Friedmans (2003),Documentary
2607,3484,"Skulls, The (2000)",Thriller
4928,7390,Prey for Rock & Roll (2003),Drama|Musical
972,1273,Down by Law (1986),Comedy|Drama|Film-Noir


In [4]:
tag_df.sample(5)

Unnamed: 0,userId,movieId,tag,timestamp
3170,567,95558,Beautiful,1525287504
3499,599,296,notable soundtrack,1498456393
1741,474,3101,adultery,1138032312
1270,474,1041,In Netflix queue,1137201463
3382,599,296,achronological,1498456475


In [5]:
rating_df.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
52703,345,5900,3.0,1342828981
72537,469,933,4.0,965425407
14178,91,329,2.0,1112713258
89268,579,1573,5.0,958881915
25215,177,2805,3.0,1435535696


In [6]:
# We will merge the three dataframes to create a single dataframe that contains all the information we need.
user_movie_df = movie_df.merge(rating_df, on = 'movieId', how = 'inner')
df = user_movie_df.merge(tag_df, on = ['movieId', 'userId'], how = 'inner')
df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp_x,tag,timestamp_y
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,1122227329,pixar,1139045764
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,978575760,pixar,1137206825
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,1525286001,fun,1525286013
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,1528843890,fantasy,1528843929
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,1528843890,magic board game,1528843932
...,...,...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,1528934550,star wars,1528934552
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,1537098554,anime,1537098582
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,1537098554,comedy,1537098587
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,1537098554,gintama,1537098603


In [7]:
# Here, we will drop the timestamp columns as they are not needed for our analysis.
df.drop(columns = ['timestamp_x', 'timestamp_y'], inplace = True)
df

Unnamed: 0,movieId,title,genres,userId,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,pixar
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,magic board game
...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,star wars
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,anime
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,comedy
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,gintama


In [8]:
print('Number of rows: ' , df.shape[0])
print('Number of columns: ' , df.shape[1])

Number of rows:  3476
Number of columns:  6


In [9]:
df.dtypes

movieId      int64
title       object
genres      object
userId       int64
rating     float64
tag         object
dtype: object

In [10]:
# Deal with null values
df.isnull().any()

movieId    False
title      False
genres     False
userId     False
rating     False
tag        False
dtype: bool

In [13]:
df_1 = df
df_1.head()

Unnamed: 0,movieId,title,genres,userId,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,pixar
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,magic board game


In [14]:
num_votes = df_1.groupby('movieId').size().reset_index(name='numVotes')

# Merge the numVotes back into the original DataFrame
df_1 = pd.merge(df_1, num_votes, on='movieId')

df_1.head()

Unnamed: 0,movieId,title,genres,userId,rating,tag,numVotes
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar,3
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,pixar,3
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,fun,3
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy,4
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,magic board game,4


In [15]:
avg_ratings = df_1.groupby('movieId')['rating'].mean().reset_index(name='avgRating')

# Merge the avgRating back into the original DataFrame
df_1 = pd.merge(df_1, avg_ratings, on='movieId')

In [17]:
df_1.drop_duplicates(subset = ['movieId', 'title', 'avgRating', 'numVotes'], inplace = True)
df_1.head()

Unnamed: 0,movieId,title,genres,userId,rating,tag,numVotes,avgRating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar,3,3.833333
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy,4,3.75
7,3,Grumpier Old Men (1995),Comedy|Romance,289,2.5,moldy,2,2.5
9,5,Father of the Bride Part II (1995),Comedy,474,1.5,pregnancy,2,1.5
11,7,Sabrina (1995),Comedy|Romance,474,3.0,remake,1,3.0


In [18]:
import statistics

# Define the function to calculate the weighted score
def calculate_weighted_score(avgRating, num_votes, C, m):
    return (num_votes * avgRating + m * C) / (num_votes + m)

# Calculate the global average rating (C)
average_rating = statistics.mean(df_1['avgRating'])
print('The average rating across all movies is:', average_rating)

# Calculate the average number of votes (m)
avg_num_votes = statistics.mean(df_1['numVotes'])  # Use the average number of votes for threshold
print('The average number of votes is:', avg_num_votes)

# Create a new column 'score' for the weighted average rating using 'avgRating' and 'numVotes'
df_1['score'] = df_1.apply(lambda row: calculate_weighted_score(row['avgRating'], row['numVotes'], average_rating, avg_num_votes), axis=1)

# Display the DataFrame with the calculated weighted score
df_1[['movieId', 'title', 'avgRating', 'numVotes', 'score']].head()

The average rating across all movies is: 3.7323364168313313
The average number of votes is: 2.3743169398907105


Unnamed: 0,movieId,title,avgRating,numVotes,score
0,1,Toy Story (1995),3.833333,3,3.788714
3,2,Jumanji (1995),3.75,4,3.743421
7,3,Grumpier Old Men (1995),2.5,2,3.168895
9,5,Father of the Bride Part II (1995),1.5,2,2.71168
11,7,Sabrina (1995),3.0,1,3.515304


In [19]:
df_1.head()

Unnamed: 0,movieId,title,genres,userId,rating,tag,numVotes,avgRating,score
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar,3,3.833333,3.788714
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy,4,3.75,3.743421
7,3,Grumpier Old Men (1995),Comedy|Romance,289,2.5,moldy,2,2.5,3.168895
9,5,Father of the Bride Part II (1995),Comedy,474,1.5,pregnancy,2,1.5,2.71168
11,7,Sabrina (1995),Comedy|Romance,474,3.0,remake,1,3.0,3.515304


In [20]:
# filtering out the top 5 suggestions
top_5_movies = df_1.sort_values(by = 'score', ascending = False).head(5)[['title', 'genres', 'tag', 'score']]
print('Top 5 movies:')
top_5_movies

Top 5 movies:


Unnamed: 0,title,genres,tag,score
199,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,good dialogue,4.967226
1337,Fight Club (1999),Action|Crime|Drama|Thriller,dark comedy,4.893394
604,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi,Hal,4.884498
998,"Big Lebowski, The (1998)",Comedy|Crime,Coen Brothers,4.868802
164,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller,assassin,4.852577


# Content-based recommendation

In [22]:
# We will now create a new DataFrame that contains only the columns we need for our analysis.
df_2 = df_1[['movieId', 'title', 'userId', 'avgRating', 'numVotes', 'score', 'genres', 'tag']].copy()
df_2.reset_index(drop=True, inplace=True)
df_2.head()

Unnamed: 0,movieId,title,userId,avgRating,numVotes,score,genres,tag
0,1,Toy Story (1995),336,3.833333,3,3.788714,Adventure|Animation|Children|Comedy|Fantasy,pixar
1,2,Jumanji (1995),62,3.75,4,3.743421,Adventure|Children|Fantasy,fantasy
2,3,Grumpier Old Men (1995),289,2.5,2,3.168895,Comedy|Romance,moldy
3,5,Father of the Bride Part II (1995),474,1.5,2,2.71168,Comedy,pregnancy
4,7,Sabrina (1995),474,3.0,1,3.515304,Comedy|Romance,remake


In [24]:
# Replace '|' with spaces in 'genres' and combine it with 'tag' using a space
df_2['features'] = df_2['genres'].str.replace('|', ' ') + ' ' + df_2['tag'].fillna('')

df_2.head()

Unnamed: 0,movieId,title,userId,avgRating,numVotes,score,genres,tag,features
0,1,Toy Story (1995),336,3.833333,3,3.788714,Adventure|Animation|Children|Comedy|Fantasy,pixar,Adventure Animation Children Comedy Fantasy pixar
1,2,Jumanji (1995),62,3.75,4,3.743421,Adventure|Children|Fantasy,fantasy,Adventure Children Fantasy fantasy
2,3,Grumpier Old Men (1995),289,2.5,2,3.168895,Comedy|Romance,moldy,Comedy Romance moldy
3,5,Father of the Bride Part II (1995),474,1.5,2,2.71168,Comedy,pregnancy,Comedy pregnancy
4,7,Sabrina (1995),474,3.0,1,3.515304,Comedy|Romance,remake,Comedy Romance remake


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the 'features' column to create TF-IDF vectors
X = vectorizer.fit_transform(df_2['features'])

In [26]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate Cosine Similarity
similarity = cosine_similarity(X)

# Recommendation function (including itself as first result)
def recommendation(title, df, similarity, top_n=3):
    try:
        # Get the index of the movie that matches the title
        idx = df[df['title'] == title].index[0]
    except IndexError:
        print(f"Movie '{title}' not found in the dataset.")
        return

    # Get the similarity scores for the given movie
    sim_scores = list(enumerate(similarity[idx]))

    # Sort the movies based on similarity scores in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Print the top_n most similar movies (including itself)
    print(f"Movies similar to '{title}' (First movie is itself):")
    for i, (index, score) in enumerate(sim_scores[:top_n+1]):
        movie = df.iloc[index]
        print(f"{i}. {movie['title']} (Similarity Score: {score:.3f})")
        print(f"   Genres: {movie['genres']}")
        print(f"   Tag: {movie['tag']}\n")

# Test the recommendation function
recommendation("Toy Story (1995)", df_2, similarity)

Movies similar to 'Toy Story (1995)' (First movie is itself):
0. Toy Story (1995) (Similarity Score: 1.000)
   Genres: Adventure|Animation|Children|Comedy|Fantasy
   Tag: pixar

1. Bug's Life, A (1998) (Similarity Score: 0.939)
   Genres: Adventure|Animation|Children|Comedy
   Tag: Pixar

2. Toy Story 2 (1999) (Similarity Score: 0.675)
   Genres: Adventure|Animation|Children|Comedy|Fantasy
   Tag: animation

3. Sintel (2010) (Similarity Score: 0.583)
   Genres: Animation|Fantasy
   Tag: adventure



In [27]:
recommendation("Toy Story 2 (1999)", df_2, similarity)

Movies similar to 'Toy Story 2 (1999)' (First movie is itself):
0. Toy Story 2 (1999) (Similarity Score: 1.000)
   Genres: Adventure|Animation|Children|Comedy|Fantasy
   Tag: animation

1. Croods, The (2013) (Similarity Score: 0.856)
   Genres: Adventure|Animation|Comedy
   Tag: animation

2. Sintel (2010) (Similarity Score: 0.853)
   Genres: Animation|Fantasy
   Tag: adventure

3. Invincible Iron Man, The (2007) (Similarity Score: 0.775)
   Genres: Animation
   Tag: animation



# Collaborative filtering

In [28]:
# Pivot user-item matrix from ratings
user_rating_matrix = rating_df.pivot(index="movieId", columns="userId", values="rating")

# fill na with 0
user_rating_matrix = user_rating_matrix.fillna(0)

user_rating_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
from sklearn.neighbors import NearestNeighbors

rec = NearestNeighbors(metric = 'cosine')
rec.fit(user_rating_matrix)

0,1,2
,"n_neighbors  n_neighbors: int, default=5 Number of neighbors to use by default for :meth:`kneighbors` queries.",5
,"radius  radius: float, default=1.0 Range of parameter space to use by default for :meth:`radius_neighbors` queries.",1.0
,"algorithm  algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' Algorithm used to compute the nearest neighbors: - 'ball_tree' will use :class:`BallTree` - 'kd_tree' will use :class:`KDTree` - 'brute' will use a brute-force search. - 'auto' will attempt to decide the most appropriate algorithm  based on the values passed to :meth:`fit` method. Note: fitting on sparse input will override the setting of this parameter, using brute force.",'auto'
,"leaf_size  leaf_size: int, default=30 Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem.",30
,"metric  metric: str or callable, default='minkowski' Metric to use for distance computation. Default is ""minkowski"", which results in the standard Euclidean distance when p = 2. See the documentation of `scipy.spatial.distance `_ and the metrics listed in :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric values. If metric is ""precomputed"", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only ""nonzero"" elements may be considered neighbors. If metric is a callable function, it takes two arrays representing 1D vectors as inputs and must return one value indicating the distance between those vectors. This works for Scipy's metrics, but is less efficient than passing the metric name as a string.",'cosine'
,"p  p: float (positive), default=2 Parameter for the Minkowski metric from sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.",2
,"metric_params  metric_params: dict, default=None Additional keyword arguments for the metric function.",
,"n_jobs  n_jobs: int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",


In [30]:
# Function to get movie recommendations based on a title
def get_recommendations(title):
    # Get movie details
    movie = df_2[df_2['title'] == title]
    
    if movie.empty:
        print(f"Movie '{title}' not found in dataset.")
        return None
    
    movie_id = int(movie['movieId'])
    
    # Get the index of the movie in the user-item matrix
    try:
        user_index = user_rating_matrix.index.get_loc(movie_id)
    except KeyError:
        print(f"Movie ID {movie_id} not found in the user rating matrix.")
        return None
    
    # Get the user ratings for the movie
    user_ratings = user_rating_matrix.iloc[user_index]
    
    # Reshape the ratings to be a single sample (1, -1)
    reshaped_df = user_ratings.values.reshape(1, -1)
    
    # Find the nearest neighbors (similar movies)
    distances, indices = rec.kneighbors(reshaped_df, n_neighbors=15)
    
    # Get the movieIds of the nearest neighbors (excluding the first, which is the queried movie itself)
    nearest_idx = user_rating_matrix.iloc[indices[0]].index[1:]
    
    # Get the movie details for the nearest neighbors
    nearest_neighbors = pd.DataFrame({'movieId': nearest_idx})
    result = pd.merge(nearest_neighbors, df_2, on='movieId', how='left')
    
    # Return the top recommendations
    return result[['title', 'avgRating', 'genres']].head()

# Test the recommendation function
get_recommendations('Toy Story (1995)')

Unnamed: 0,title,avgRating,genres
0,Toy Story 2 (1999),3.125,Adventure|Animation|Children|Comedy|Fantasy
1,Jurassic Park (1993),4.5,Action|Adventure|Sci-Fi|Thriller
2,Independence Day (a.k.a. ID4) (1996),4.0,Action|Adventure|Sci-Fi|Thriller
3,Star Wars: Episode IV - A New Hope (1977),4.527778,Action|Adventure|Sci-Fi
4,Forrest Gump (1994),3.666667,Comedy|Drama|Romance|War


In [31]:
get_recommendations('Jurassic Park (1993)')

Unnamed: 0,title,avgRating,genres
0,Terminator 2: Judgment Day (1991),2.625,Action|Sci-Fi
1,Forrest Gump (1994),3.666667,Comedy|Drama|Romance|War
2,Braveheart (1995),4.35,Action|Drama|War
3,"Fugitive, The (1993)",5.0,Thriller
4,Speed (1994),4.0,Action|Romance|Thriller
