# Practice PS06: Recommendations engines (interactions-based)

Author: <font color="blue">Rubén Vera</font>

E-mail: <font color="blue">ruben.vera01@estudiant.upf.edu</font>

Date: <font color="blue">9/11/2022</font>

# 1. The Movies dataset

## 1.1. Load the input files

In [1]:
# Leave this code as-is

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from math import*
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Leave this code as-is

# Input CSV files, given as relative paths (resolved against the working directory).
FILENAME_MOVIES = "movies-2000s.csv"
FILENAME_RATINGS = "ratings-2000s.csv"
# NOTE(review): the tags file is not read anywhere in the visible part of this notebook.
FILENAME_TAGS = "tags-2000s.csv"

In [3]:
# Leave this code as-is

# Load the movie catalogue; column names are supplied explicitly via `names`.
# NOTE(review): titles printed further down show mojibake (e.g. "AmÃ©lie"),
# which suggests the files are UTF-8 rather than latin-1 -- confirm the file
# encoding before changing the `encoding` argument.
movies = pd.read_csv(FILENAME_MOVIES, 
                    sep=',', 
                    engine='python', 
                    encoding='latin-1',
                    names=['movie_id', 'title', 'genres'])
display(movies.head(5))

# Load the raw ratings (one row per user/movie pair) with the same parser settings.
ratings_raw = pd.read_csv(FILENAME_RATINGS, 
                    sep=',', 
                    encoding='latin-1',
                    engine='python',
                    names=['user_id', 'movie_id', 'rating'])
display(ratings_raw.head(5))

Unnamed: 0,movie_id,title,genres
0,2769,"Yards, The (2000)",Crime|Drama
1,3177,Next Friday (2000),Comedy
2,3190,Supernova (2000),Adventure|Sci-Fi|Thriller
3,3225,Down to You (2000),Comedy|Romance
4,3228,Wirey Spindell (2000),Comedy


Unnamed: 0,user_id,movie_id,rating
0,4,1,3.0
1,4,260,3.5
2,4,296,4.0
3,4,541,4.5
4,4,589,4.0


## 1.2. Merge the data into a single dataframe

In [4]:
# Inner join on movie_id: keep only the ratings whose movie also appears in
# `movies`, attaching that movie's title and genres to each rating row.
ratings = ratings_raw.merge(movies, on='movie_id', how='inner')
display(ratings.head(n=5))

Unnamed: 0,user_id,movie_id,rating,title,genres
0,4,3624,2.5,Shanghai Noon (2000),Action|Adventure|Comedy|Western
1,152,3624,3.0,Shanghai Noon (2000),Action|Adventure|Comedy|Western
2,171,3624,3.5,Shanghai Noon (2000),Action|Adventure|Comedy|Western
3,276,3624,4.0,Shanghai Noon (2000),Action|Adventure|Comedy|Western
4,494,3624,3.5,Shanghai Noon (2000),Action|Adventure|Comedy|Western


In [5]:
def find_movies(text, movies):
    """Print the movie_id of every movie whose title contains ``text``.

    Args:
        text: Substring to look for in the titles (literal, case-sensitive).
        movies: DataFrame with at least the columns 'title' and 'movie_id'.

    Returns:
        None. Each matching movie_id is printed on its own line, in the
        row order of ``movies``.
    """
    # Walk both columns in lockstep instead of looking rows up by label
    # (movies["title"][i]); the label lookup only works when the index is
    # exactly 0..n-1 and raises KeyError on a filtered or re-indexed frame.
    for title, movie_id in zip(movies["title"], movies["movie_id"]):
        if text in title:
            print(movie_id)

In [6]:
# LEAVE AS-IS

# For testing, this should print '59784'
find_movies("Kung Fu Panda (2008)", movies)

59784


In [7]:
# LEAVE AS-IS

def get_title(movie_id, movies):
    """Return the title of the first row in ``movies`` with the given movie_id.

    Raises IndexError when no row has that movie_id.
    """
    is_match = movies['movie_id'] == movie_id
    matching_titles = movies.loc[is_match, 'title']
    return matching_titles.iloc[0]

In [8]:
# LEAVE AS-IS

# For testing, should print "Kung Fu Panda (2008)")
print(get_title(59784, movies))

Kung Fu Panda (2008)


## 1.3. Count unique registers

In [9]:
# Distinct users and movies in the merged ratings, then distinct movies in the catalogue.
print("Number of users who have rated a movie: ", ratings['user_id'].unique().size)
print("Number of movies that have been rated: ", ratings['movie_id'].unique().size)
print("Total number of movies: ", movies['movie_id'].unique().size)

Number of users who have rated a movie:  12676
Number of movies that have been rated:  2049
Total number of movies:  33168


# 2. Item-based Collaborative Filtering

## 2.1. Data pre-processing

In [10]:
# Ratings without the 'genres' column (not needed for collaborative filtering).
rated_movies = ratings.drop('genres', axis='columns')
display(rated_movies.head(n=5))

Unnamed: 0,user_id,movie_id,rating,title
0,4,3624,2.5,Shanghai Noon (2000)
1,152,3624,3.0,Shanghai Noon (2000)
2,171,3624,3.5,Shanghai Noon (2000)
3,276,3624,4.0,Shanghai Noon (2000)
4,494,3624,3.5,Shanghai Noon (2000)


In [11]:
# Per-movie statistics: average rating and number of ratings.
ratings_mean = rated_movies.groupby('movie_id')['rating'].mean()
ratings_count = rated_movies.groupby('movie_id')['rating'].count()

# One row per movie: take the first row of each group, drop the per-rating
# columns, and attach the two statistics (aligned on the movie_id index).
ratings_summary = (
    rated_movies.groupby('movie_id')
    .first()
    .drop(columns=['user_id', 'rating'])
    .assign(ratings_mean=ratings_mean, ratings_count=ratings_count)
)
display(ratings_summary.head(5))

Unnamed: 0_level_0,title,ratings_mean,ratings_count
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2769,"Yards, The (2000)",3.122549,102
3177,Next Friday (2000),2.824,125
3190,Supernova (2000),2.395683,139
3225,Down to You (2000),2.577273,110
3228,Wirey Spindell (2000),2.5,2


In [12]:
# Movies with at least 2000 ratings, highest average rating first.
top_rated = (
    ratings_summary.loc[ratings_summary['ratings_count'] >= 2000]
    .sort_values(by='ratings_mean', ascending=False)
)
display(top_rated.head(10))

Unnamed: 0_level_0,title,ratings_mean,ratings_count
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5618,Spirited Away (Sen to Chihiro no kamikakushi) ...,4.215216,2458
6016,City of God (Cidade de Deus) (2002),4.186592,2133
4226,Memento (2000),4.158512,4476
4973,"Amelie (Fabuleux destin d'AmÃ©lie Poulain, Le)...",4.097234,3687
4993,"Lord of the Rings: The Fellowship of the Ring,...",4.09253,5944
7153,"Lord of the Rings: The Return of the King, The...",4.08396,5449
5952,"Lord of the Rings: The Two Towers, The (2002)",4.083869,5449
7361,Eternal Sunshine of the Spotless Mind (2004),4.074589,3593
4011,Snatch (2000),4.034259,2700
3949,Requiem for a Dream (2000),3.955145,2352


In [13]:
# Movies with at least 3 ratings, highest average rating first.
top_rated = (
    ratings_summary.loc[ratings_summary['ratings_count'] >= 3]
    .sort_values(by='ratings_mean', ascending=False)
)
display(top_rated.head(10))

Unnamed: 0_level_0,title,ratings_mean,ratings_count
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5082,"Rumor of Angels, A (2000)",4.666667,6
27764,2LDK (2003),4.5,3
31954,Beautiful City (Shah-re ziba) (2004),4.4,5
5224,Promises (2001),4.388889,18
6775,Life and Debt (2001),4.333333,3
31856,Surplus: Terrorized Into Being Consumers (2003),4.333333,3
6672,War Photographer (2001),4.229167,24
5618,Spirited Away (Sen to Chihiro no kamikakushi) ...,4.215216,2458
7563,"Discovery of Heaven, The (2001)",4.2,5
6016,City of God (Cidade de Deus) (2002),4.186592,2133


The fewer ratings a movie has, the less reliable its average rating is.

## 2.2. Compute the user-movie matrix

In [14]:
# User-movie matrix: one row per user_id, one column per movie_id; each cell
# holds the rating that user gave that movie (NaN when there is no rating).
user_movie = pd.pivot_table(
    rated_movies,
    values='rating',
    index='user_id',
    columns='movie_id',
)
display(user_movie.head(n=10))

movie_id,2769,3177,3190,3225,3228,3239,3273,3275,3276,3279,...,33138,33145,33148,33150,33152,33154,33158,33162,33164,33166
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,,,,,,,,,,,...,,,,,,,,,,
33,,,,,,,,,,,...,,,,,,,,,,
62,,,,,,,,4.5,,,...,,,,,,,,,,3.5
63,,,,,,,,,,,...,,,,,,,,,,
95,,,,,,,,3.5,,,...,,,,,,,,,,
131,,,,,,,,,,,...,,,,,,,,,,
132,,,,,,,,,,,...,,,,,,,,,,
152,,,,,,,,,,,...,,,,,,,,,,
162,,,,,,,,,,,...,,,,,,,,,,
170,,,,,,,,,,,...,,,,,,,,,,


This could be for two reasons:
First, users do not rate every film they watch.
Second, each user watches only a small fraction of all films, so the rating for most user-movie pairs is NaN.

A matrix like this, in which most entries are missing, is called a sparse matrix.

## 2.3. Explore some correlations in the user-movie matrix

In [15]:
# Build `ratings3`: the ratings of the users who rated all three of the
# movies below (one pivot movie and two movies to compare it against).
# The ids are resolved by exact title match; [0] raises IndexError if a
# title is not present in `movies`.
id_pivot = movies.loc[movies['title'] == 'Monsters, Inc. (2001)']['movie_id'].to_list()[0]
id_m1 = movies.loc[movies['title'] == 'Finding Nemo (2003)']['movie_id'].to_list()[0]
id_m2 = movies.loc[movies['title'] == 'Talk to Her (Hable con Ella) (2002)']['movie_id'].to_list()[0]
# Ratings of each movie, restricted to the users who actually rated it
s1 = user_movie[id_pivot].dropna()
s2 = user_movie[id_m1].dropna()
s3 = user_movie[id_m2].dropna()
# Align the three series on user_id and keep only users present in all three.
# The axis is passed by keyword: pandas 2.0 no longer accepts it positionally
# (`dropna(0)` raises TypeError there).
ratings3 = pd.concat([s1, s2, s3], axis=1).dropna(axis=0)
display(ratings3.head(10))

Unnamed: 0_level_0,4886,6377,5878
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
859,4.5,4.0,5.0
1229,4.0,4.0,4.5
1281,2.5,2.5,3.0
1722,4.5,4.5,4.0
2004,4.0,3.0,3.5
2360,5.0,4.0,5.0
3726,5.0,5.0,2.5
3798,4.5,4.5,5.0
4590,4.5,4.0,2.0
5052,4.0,4.0,4.0


In [16]:
# Correlation between the ratings of each pair of the three selected movies
print(
    "Similarity between 'Monsters, Inc. (2001)' and 'Finding Nemo (2003)': ",
    ratings3.loc[:, id_pivot].corr(ratings3.loc[:, id_m1]),
)
print(
    "Similarity between 'Monsters, Inc. (2001)' and 'Talk to Her (Hable con Ella) (2002)': ",
    ratings3.loc[:, id_pivot].corr(ratings3.loc[:, id_m2]),
)
print(
    "Similarity between 'Finding Nemo (2003)' and 'Talk to Her (Hable con Ella) (2002)': ",
    ratings3.loc[:, id_m1].corr(ratings3.loc[:, id_m2]),
)

Similarity between 'Monsters, Inc. (2001)' and 'Finding Nemo (2003)':  0.55517746382533
Similarity between 'Monsters, Inc. (2001)' and 'Talk to Her (Hable con Ella) (2002)':  0.16921368851474122
Similarity between 'Finding Nemo (2003)' and 'Talk to Her (Hable con Ella) (2002)':  0.241570123828288


In my opinion, Monsters, Inc. and Finding Nemo are similar films because both have the same target audience, so 0.55 is an adequate correlation. It is not higher because they do not share all their genres and the ratings they obtained are not exactly equal. The correlation of these two films with Talk to Her is lower because they share neither target audience nor genres. The only thing that can make the similarity vary is the mean of the ratings given by the users.

In [17]:
# Correlation of every movie's ratings with those of the pivot movie;
# movies for which no correlation could be computed (NaN) are dropped.
similar_to_pivot = user_movie.corrwith(user_movie.loc[:, id_pivot]).dropna()
display(similar_to_pivot.head(n=10))

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


movie_id
2769   -0.082710
3177    0.210484
3190    0.039882
3225    0.162232
3239   -0.438086
3273    0.236845
3275    0.122747
3276    0.694492
3279    1.000000
3285    0.006660
dtype: float64

In [18]:
# Attach title / mean / count to each correlation, keep only movies with
# more than 500 ratings, and show the ten most correlated with the pivot.
corr_with_pivot = similar_to_pivot.to_frame(name='corr').join(ratings_summary)
corr_with_pivot = corr_with_pivot.loc[corr_with_pivot['ratings_count'] > 500]
corr_with_pivot.sort_values(by='corr', ascending=False).head(10)

Unnamed: 0_level_0,corr,title,ratings_mean,ratings_count
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4886,1.0,"Monsters, Inc. (2001)",3.850066,3775
6377,0.639308,Finding Nemo (2003),3.862284,3765
8961,0.546784,"Incredibles, The (2004)",3.867771,3320
4306,0.471673,Shrek (2001),3.768787,4591
5218,0.465032,Ice Age (2002),3.5464,1972
5444,0.424311,Lilo & Stitch (2002),3.576974,760
3751,0.423167,Chicken Run (2000),3.476384,2096
8360,0.404075,Shrek 2 (2004),3.520989,2144
4016,0.402423,"Emperor's New Groove, The (2000)",3.573889,900
3624,0.37385,Shanghai Noon (2000),3.297443,1017


In my opinion all of these films are very similar, so the system seems accurate: they share target audience and genres, so I would recommend the same kind of films.

If you set the ratings_count threshold to a much larger value, you may lose films that are similar simply because not many people rated them, so the threshold should be neither too large nor too small. If you set it to a much smaller value, you may take into account films that only very few people have seen; their ratings are not supported by many users and may not be accurate.

## 2.4. Implement the item-based recommendations

In [19]:
# Pearson correlation between the rating columns of every pair of movies
item_similarity = user_movie.corr(method='pearson')
display(item_similarity.head(n=5))

movie_id,2769,3177,3190,3225,3228,3239,3273,3275,3276,3279,...,33138,33145,33148,33150,33152,33154,33158,33162,33164,33166
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2769,1.0,0.115068,0.033721,-0.232268,,-0.5,0.197011,0.199514,0.250873,,...,0.37998,0.87831,,,,0.248126,0.180609,-0.08557,-0.408248,0.105671
3177,0.115068,1.0,0.30382,0.559533,,,0.331191,0.167918,1.0,,...,0.546119,0.735767,-1.0,,,-0.221382,0.317475,0.014735,0.661989,0.185654
3190,0.033721,0.30382,1.0,0.636361,,-0.014315,0.146042,0.394293,-0.290397,,...,0.246183,0.632026,,,,0.378181,0.170926,0.022444,-0.07336,-0.054114
3225,-0.232268,0.559533,0.636361,1.0,,0.578414,0.347716,0.263671,-0.250313,,...,-0.300376,0.318377,,,,0.480173,0.750306,0.536828,0.753141,0.098748
3228,,,,,1.0,,,,,,...,,,,,,,,,,


In [20]:
# Same pairwise correlation, but a pair of movies only gets a value when at
# least 100 users rated both of them; otherwise the entry is NaN.
item_similarity_min_ratings = user_movie.corr(method='pearson', min_periods=100)
display(item_similarity_min_ratings.head(n=5))

movie_id,2769,3177,3190,3225,3228,3239,3273,3275,3276,3279,...,33138,33145,33148,33150,33152,33154,33158,33162,33164,33166
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2769,1.0,,,,,,,,,,...,,,,,,,,,,
3177,,1.0,,,,,,,,,...,,,,,,,,,,
3190,,,1.0,,,,,,,,...,,,,,,,,,,
3225,,,,1.0,,,,,,,...,,,,,,,,,,
3228,,,,,,,,,,,...,,,,,,,,,,


In [21]:
def _first_user_rating_all_above(user_movie, movie_ids, threshold=4.5):
    """Return the id of the first user who rated every given movie above ``threshold``.

    Args:
        user_movie: User-movie matrix (rows are user_ids, columns are movie_ids).
        movie_ids: Movies that must all have been rated above ``threshold``.
        threshold: Exclusive lower bound on each rating.

    Returns:
        The index label (user_id) of the first matching row, in row order.

    Raises:
        KeyError: If one of ``movie_ids`` is not a column of ``user_movie``.
        ValueError: If no user rated all the movies above ``threshold``.
    """
    # A missing rating (NaN) compares as False, so such users never match.
    rated_all_above = (user_movie[movie_ids] > threshold).all(axis=1)
    matching_users = user_movie.index[rated_all_above]
    if len(matching_users) == 0:
        # Fail here with a clear message instead of leaving the variable
        # unbound and hitting a NameError in a later cell.
        raise ValueError(
            "No user rated all of the movies %s above %s" % (list(movie_ids), threshold)
        )
    return matching_users[0]


# A user who rated movies 5349, 3793 and 6534 above 4.5
user_id_super = _first_user_rating_all_above(user_movie, [5349, 3793, 6534])

# A user who rated movies 6870, 5995 and 3555 above 4.5
user_id_drama = _first_user_rating_all_above(user_movie, [6870, 5995, 3555])

In [22]:
# Leave this code as-is

def get_watched_movies(user_id, user_movie):
    """Return the movie_ids rated by ``user_id``, highest rating first."""
    user_ratings = user_movie.loc[user_id]
    rated_only = user_ratings.dropna()
    best_first = rated_only.sort_values(ascending=False)
    return best_first.index.tolist()
    
def get_rating(user_id, movie_id, user_movie):
    """Return the rating ``user_id`` gave to ``movie_id`` (NaN if not rated)."""
    movie_ratings = user_movie[movie_id]
    return movie_ratings.loc[user_id]

def print_watched_movies(user_id, user_movie, movies):
    """Print one line per movie rated by ``user_id``: id, rating and title.

    Movies are listed in the order returned by get_watched_movies
    (highest rating first).
    """
    for movie_id in get_watched_movies(user_id, user_movie):
        rating = get_rating(user_id, movie_id, user_movie)
        title = get_title(movie_id, movies)
        print("%d %.1f %s " % (movie_id, rating, title))


In [23]:
# LEAVE AS-IS (TESTING CODE)

print_watched_movies(user_id_super, user_movie, movies)

5502 5.0 Signs (2002) 
5445 5.0 Minority Report (2002) 
6156 5.0 Shanghai Knights (2003) 
5952 5.0 Lord of the Rings: The Two Towers, The (2002) 
5944 5.0 Star Trek: Nemesis (2002) 
5816 5.0 Harry Potter and the Chamber of Secrets (2002) 
5618 5.0 Spirited Away (Sen to Chihiro no kamikakushi) (2001) 
5524 5.0 Blue Crush (2002) 
5480 5.0 Stuart Little 2 (2002) 
5459 5.0 Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (2002) 
5420 5.0 Windtalkers (2002) 
4388 5.0 Scary Movie 2 (2001) 
5389 5.0 Spirit: Stallion of the Cimarron (2002) 
5349 5.0 Spider-Man (2002) 
5218 5.0 Ice Age (2002) 
5064 5.0 The Count of Monte Cristo (2002) 
4993 5.0 Lord of the Rings: The Fellowship of the Ring, The (2001) 
4973 5.0 Amelie (Fabuleux destin d'AmÃ©lie Poulain, Le) (2001) 
4896 5.0 Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001) 
4886 5.0 Monsters, Inc. (2001) 
6186 5.0 Gods and Generals (2003) 
6333 5.0 X2: X-Men United (2003) 
6377 5.0 Finding Nemo (2003) 
6

In [24]:
# LEAVE AS-IS (TESTING CODE)

print_watched_movies(user_id_drama, user_movie, movies)

3967 5.0 Billy Elliot (2000) 
4014 5.0 Chocolat (2000) 
4034 5.0 Traffic (2000) 
5995 5.0 Pianist, The (2002) 
7147 5.0 Big Fish (2003) 
4995 5.0 Beautiful Mind, A (2001) 
3555 5.0 U-571 (2000) 
6870 5.0 Mystic River (2003) 
5991 5.0 Chicago (2002) 
8464 5.0 Super Size Me (2004) 
5669 5.0 Bowling for Columbine (2002) 
8622 5.0 Fahrenheit 9/11 (2004) 
30707 5.0 Million Dollar Baby (2004) 
6953 4.5 21 Grams (2003) 
5015 4.5 Monster's Ball (2001) 
5464 4.5 Road to Perdition (2002) 
3510 4.5 Frequency (2000) 
5989 4.5 Catch Me If You Can (2002) 
4022 4.0 Cast Away (2000) 
5010 4.0 Black Hawk Down (2001) 
5299 4.0 My Big Fat Greek Wedding (2002) 
3897 4.0 Almost Famous (2000) 
3755 4.0 Perfect Storm, The (2000) 
4308 4.0 Moulin Rouge (2001) 
4447 3.5 Legally Blonde (2001) 
4246 3.5 Bridget Jones's Diary (2001) 
4975 3.5 Vanilla Sky (2001) 
4019 3.5 Finding Forrester (2000) 
5377 3.5 About a Boy (2002) 
3948 3.5 Meet the Parents (2000) 
5956 3.0 Gangs of New York (2002) 
6281 3.0 Phone Booth

In [25]:
def get_movies_relevance(user_id, user_movie, item_similarity_matrix):
    """Score every movie by its similarity to the movies a user has rated.

    The relevance of a movie is the sum, over the movies rated by the user,
    of (rating given) * (similarity between the rated movie and this movie).
    Missing similarities (NaN) contribute nothing to the sum.

    Args:
        user_id: Row label of the user in ``user_movie``.
        user_movie: User-movie matrix (rows are user_ids, columns are
            movie_ids, NaN where the user has not rated the movie).
        item_similarity_matrix: Square movie-by-movie similarity DataFrame.

    Returns:
        DataFrame with one row per movie of ``item_similarity_matrix``,
        sorted by movie id, with columns 'relevance' and 'movie_id'.
        The index holds the movie ids and has no name.

    Raises:
        KeyError: If ``user_id`` is not in ``user_movie`` or a rated movie
            is not a column of ``item_similarity_matrix``.
    """
    # Only the movies the user actually rated matter: a NaN rating would
    # yield an all-NaN term that the sum ignores anyway.
    ratings_given = user_movie.loc[user_id].dropna()

    # One column of similarities per rated movie
    similarities = item_similarity_matrix.loc[:, ratings_given.index]

    # Scale each column by the rating given to that movie, then add up the
    # terms of each row. This replaces the repeated Series.append calls,
    # which were quadratic and no longer exist in pandas 2.0.
    weighted_similarities = similarities.mul(ratings_given, axis='columns')
    movies_relevance = weighted_similarities.sum(axis='columns').sort_index()

    # Drop the index name: the frame also gets a 'movie_id' column, and an
    # index level with the same name would make merge(on='movie_id') ambiguous.
    movies_relevance = movies_relevance.rename_axis(None)

    # Convert to a dataframe
    movies_relevance_df = movies_relevance.to_frame(name='relevance')
    movies_relevance_df['movie_id'] = movies_relevance_df.index

    return movies_relevance_df


In [27]:
# Superhero fan: score every movie, attach title/genres, most relevant first
relevance_hero = get_movies_relevance(user_id_super, user_movie, item_similarity_min_ratings)
movies_recommended_hero = (
    relevance_hero
    .merge(movies, how='inner', on='movie_id')
    .sort_values(by='relevance', ascending=False)
)
display(movies_recommended_hero.head(10))

# Drama fan: same pipeline
relevance_drama = get_movies_relevance(user_id_drama, user_movie, item_similarity_min_ratings)
movies_recommended_drama = (
    relevance_drama
    .merge(movies, how='inner', on='movie_id')
    .sort_values(by='relevance', ascending=False)
)
display(movies_recommended_drama.head(10))

Unnamed: 0,relevance,movie_id,title,genres
1472,189.170085,8644,"I, Robot (2004)",Action|Adventure|Sci-Fi|Thriller
663,181.63812,5459,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Action|Comedy|Sci-Fi
85,176.650945,3753,"Patriot, The (2000)",Action|Drama|War
1414,172.899804,8361,"Day After Tomorrow, The (2004)",Action|Adventure|Drama|Sci-Fi|Thriller
310,172.700877,4310,Pearl Harbor (2001),Action|Drama|Romance|War
297,172.301301,4270,"Mummy Returns, The (2001)",Action|Adventure|Comedy|Thriller
325,169.123776,4367,Lara Croft: Tomb Raider (2001),Action|Adventure
1003,168.960164,6373,Bruce Almighty (2003),Comedy|Drama|Fantasy|Romance
1586,168.783883,8972,National Treasure (2004),Action|Adventure|Drama|Mystery|Thriller
996,166.866641,6365,"Matrix Reloaded, The (2003)",Action|Adventure|Sci-Fi|Thriller|IMAX


Unnamed: 0,relevance,movie_id,title,genres
1572,65.46137,8958,Ray (2004),Drama
195,63.007635,4019,Finding Forrester (2000),Drama
1055,61.354376,6565,Seabiscuit (2003),Drama
501,61.21305,4995,"Beautiful Mind, A (2001)",Drama|Romance
508,61.209632,5014,I Am Sam (2001),Drama
1239,60.751048,7143,"Last Samurai, The (2003)",Action|Adventure|Drama|War
1472,60.700299,8644,"I, Robot (2004)",Action|Adventure|Sci-Fi|Thriller
1168,60.611768,6870,Mystic River (2003),Crime|Drama|Mystery
1313,59.820898,7325,Starsky & Hutch (2004),Action|Comedy|Crime|Thriller
1432,59.438079,8464,Super Size Me (2004),Comedy|Documentary|Drama


Super user accuracy:
yes
yes
yes
no
yes
no
yes
yes
no
yes
So, I would recommend 7 of the 10 suggested films to a superhero fan. I decided whether to recommend each film by looking at its genres and synopsis. For example, I would recommend films about powers, action, thriller or sci-fi if, after reading their synopsis, I can relate them to superheroes.

Drama user accuracy:
yes
yes
yes
yes
yes
yes
no
yes
no
no
I would recommend 7 of the 10 films suggested by the system to a drama fan. I based my decision on the same criteria as before. This time all the drama/romance films passed the genre filter; after filtering, I decided using the synopsis.

In [28]:
def get_recommended_movies(user_id, user_movie, item_similarity):
    """Return the relevance of every movie the user has not rated yet.

    The result is indexed by movie_id and has a single 'relevance' column;
    rows are not sorted by relevance.
    """
    already_watched = get_watched_movies(user_id, user_movie)
    relevance_by_movie = get_movies_relevance(
        user_id, user_movie, item_similarity
    ).set_index('movie_id')
    return relevance_by_movie.drop(index=already_watched)

In [29]:
# Unwatched movies for the superhero fan, most relevant first
relevant_hero_movies = get_recommended_movies(
    user_id_super, user_movie, item_similarity_min_ratings
).sort_values(by='relevance', ascending=False)
display(relevant_hero_movies.head(n=10))

# Unwatched movies for the drama fan, most relevant first
relevant_drama_movies = get_recommended_movies(
    user_id_drama, user_movie, item_similarity_min_ratings
).sort_values(by='relevance', ascending=False)
display(relevant_drama_movies.head(n=10))

Unnamed: 0_level_0,relevance
movie_id,Unnamed: 1_level_1
6365,166.866641
4018,165.338077
4025,163.032765
5507,161.080324
6378,155.293219
31685,154.993274
3948,150.570934
4369,148.949754
6934,148.394158
4963,148.251901


Unnamed: 0_level_0,relevance
movie_id,Unnamed: 1_level_1
8958,65.46137
6565,61.354376
5014,61.209632
7325,59.820898
7149,59.294621
4448,58.968024
7445,58.192646
5152,58.004447
3753,57.920754
4223,57.482846


Given my previous comments, I can say that these recommendations are very close to the ones I would actually make to these users, so I think they are highly relevant. In this case we no longer see movies the user has already watched, which is good because it makes no sense to recommend a film that has already been seen. However, I think that after removing the watched movies we should re-calculate the relevances, because fewer movies are then available to recommend. I would say the values are not comparable: the relevance shown is computed over all movies rather than only over the movies not yet seen.

<font size="+2" color="#003300">I hereby declare that, except for the code provided by the course instructors, all of my code, report, and figures were produced by myself.</font>