In [94]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tests as t
from scipy.sparse import csr_matrix
from IPython.display import HTML


%matplotlib inline

# Read in the datasets
movies = pd.read_csv('movies_clean.csv')
reviews = pd.read_csv('reviews_clean.csv')

del movies['Unnamed: 0']
del reviews['Unnamed: 0']

In [2]:
user_items = reviews[['user_id', 'movie_id', 'rating']]
user_items.head()

Unnamed: 0,user_id,movie_id,rating
0,1,68646,10
1,1,113277,10
2,2,422720,8
3,2,454876,8
4,2,790636,7


In [3]:
user_by_movie = user_items.groupby(['user_id','movie_id'])['rating'].max().unstack()

In [4]:
assert movies.shape[0] == user_by_movie.shape[1], "Oh no! Your matrix should have {} columns, and yours has {}!".format(movies.shape[0], user_by_movie.shape[1])
assert reviews.user_id.nunique() == user_by_movie.shape[0], "Oh no! Your matrix should have {} rows, and yours has {}!".format(reviews.user_id.nunique(), user_by_movie.shape[0])
print("Looks like you are all set! Proceed!")
HTML('<img src="images/greatjob.webp">')

Looks like you are all set! Proceed!


In [43]:
#list(user_by_movie.iloc[0][user_by_movie.iloc[0].isnull() == False].index.values)
user_by_movie.index.values.shape

(53968,)

In [45]:
def movies_watched(user_id):
    '''
    INPUT:
    user_id - the user_id of an individual as int
    OUTPUT:
    movies - an array of movies the user has watched
    '''
    movies = list(user_by_movie.iloc[user_id-1][user_by_movie.iloc[user_id-1].isnull() == False].index.values)

    return movies


def create_user_movie_dict():
    '''
    INPUT: None
    OUTPUT: movies_seen - a dictionary where each key is a user_id and the value is an array of movie_ids
    
    Creates the movies_seen dictionary
    '''
    movies_seen = dict()

    for eachUser in list(user_by_movie.index.values):
        movies_seen[eachUser] = movies_watched(eachUser)
    
    return movies_seen


# Use your function to return dictionary
movies_seen = create_user_movie_dict()

In [47]:
len(movies_seen)

53968

In [50]:
# Remove individuals who have watched 2 or fewer movies - don't have enough data #to make recs

def create_movies_to_analyze(movies_seen, lower_bound=2):
    '''
    INPUT:  
    movies_seen - a dictionary where each key is a user_id and the value is an array of movie_ids
    lower_bound - (an int) a user must have more movies seen than the lower bound to be added to the movies_to_analyze dictionary

    OUTPUT: 
    movies_to_analyze - a dictionary where each key is a user_id and the value is an array of movie_ids
    
    The movies_seen and movies_to_analyze dictionaries should be the same except that the output dictionary has removed 
    
    '''
    movies_to_analyze = dict()

    # Do things to create updated dictionary
    for user_id,movies_list in movies_seen.items():
        if(len(movies_list)>lower_bound):
            movies_to_analyze[user_id] = movies_list

    return movies_to_analyze


# Use your function to return your updated dictionary
movies_to_analyze = create_movies_to_analyze(movies_seen)

In [52]:
#movies_to_analyze

In [53]:
assert len(movies_to_analyze) == 23512, "Oops!  It doesn't look like your dictionary has the right number of individuals."
assert len(movies_to_analyze[2]) == 23, "Oops!  User 2 didn't match the number of movies we thought they would have."
assert len(movies_to_analyze[7])  == 3, "Oops!  User 7 didn't match the number of movies we thought they would have."
print("If this is all you see, you are good to go!")

If this is all you see, you are good to go!


In [72]:
def compute_correlation(user1, user2):
    '''
    INPUT
    user1 - int user_id
    user2 - int user_id
    OUTPUT
    the correlation between the matching ratings between the two users
    '''
    movies_user1 = movies_to_analyze[user1]
    movies_user2 = movies_to_analyze[user2]

    sim_movies = np.intersect1d(movies_user1,movies_user2)

    temp_df = user_by_movie.loc[(user1,user2),sim_movies]

    corr = temp_df.T.corr().iloc[0,1]
  
    return corr,sim_movies #return the correlation

In [68]:
#sample_df = user_by_movie.loc[(1,3),[8,10,12]]
#sample_df.T.corr().iloc[0,1]

user_id,1,3
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,,
3,,


In [71]:
# Test your function against the solution
assert compute_correlation(2,2) == 1.0, "Oops!  The correlation between a user and itself should be 1.0."
assert round(compute_correlation(2,66), 2) == 0.76, "Oops!  The correlation between user 2 and 66 should be about 0.76."
assert np.isnan(compute_correlation(2,104)), "Oops!  The correlation between user 2 and 104 should be a NaN."

print("If this is all you see, then it looks like your function passed all of our tests!")

If this is all you see, then it looks like your function passed all of our tests!


In [73]:
corr,movies = compute_correlation(2,104)
print(corr)
print(movies)

nan
[ 454876  816711 1454468 1535109]


In [74]:
user_by_movie.loc[(2,104),movies]

movie_id,454876,816711,1454468,1535109
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,8.0,8.0,8.0,8.0
104,9.0,7.0,7.0,9.0


In [76]:
def compute_euclidean_dist(user1, user2):
    '''
    INPUT
    user1 - int user_id
    user2 - int user_id
    OUTPUT
    the euclidean distance between user1 and user2
    '''
    movies_user1 = movies_to_analyze[user1]
    movies_user2 = movies_to_analyze[user2]

    sim_movies = np.intersect1d(movies_user1,movies_user2)

    temp_df = user_by_movie.loc[(user1,user2),sim_movies]

    dist = np.linalg.norm(temp_df.iloc[0].values-temp_df.iloc[1].values)

    return dist #return the euclidean distance

In [78]:
import pickle
df_dists = pd.read_pickle("dists.p")

In [79]:
assert compute_euclidean_dist(2,2) == df_dists.query("user1 == 2 and user2 == 2")['eucl_dist'][0], "Oops!  The distance between a user and itself should be 0.0."
assert round(compute_euclidean_dist(2,66), 2) == round(df_dists.query("user1 == 2 and user2 == 66")['eucl_dist'][1], 2), "Oops!  The distance between user 2 and 66 should be about 2.24."
assert np.isnan(compute_euclidean_dist(2,104)) == np.isnan(df_dists.query("user1 == 2 and user2 == 104")['eucl_dist'][4]), "Oops!  The distance between user 2 and 104 should be 2."

print("If this is all you see, then it looks like your function passed all of our tests!")

If this is all you see, then it looks like your function passed all of our tests!


In [83]:
df_dists.head(5)

Unnamed: 0,user1,user2,eucl_dist
0,2,2,0.0
1,2,66,2.236068
2,2,90,5.385165
3,2,99,2.828427
4,2,104,2.0


In [90]:
df_dists[df_dists['user1']==2][1:].sort_values(by='eucl_dist').user2.values

array([22915, 34706, 33207, ..., 36807, 32494, 52737], dtype=int64)

In [117]:
def find_closest_neighbors(user):
    '''
    INPUT:
        user - (int) the user_id of the individual you want to find the closest users
    OUTPUT:
        closest_neighbors - an array of the id's of the users sorted from closest to farthest away
    '''
    # I treated ties as arbitrary and just kept whichever was easiest to keep using the head method
    # You might choose to do something less hand wavy - order the neighbors
    
    closest_neighbors = df_dists[df_dists['user1']==user][1:].sort_values(by='eucl_dist').user2.values
    
    return np.array(closest_neighbors)
    
    
    
def movies_liked(user_id, min_rating=7):
    '''
    INPUT:
    user_id - the user_id of an individual as int
    min_rating - the minimum rating considered while still a movie is still a "like" and not a "dislike"
    OUTPUT:
    movies_liked - an array of movies the user has watched and liked
    '''
    
    movies_liked = np.array(
        user_items.query("user_id == @user_id and rating >= @min_rating")       ['movie_id'])
    
    return movies_liked


def movie_names(movie_ids):
    '''
    INPUT
    movie_ids - a list of movie_ids
    OUTPUT
    movies - a list of movie names associated with the movie_ids
    
    '''
    movie_lst = list(movies[movies['movie_id'].isin(movie_ids)]['movie'])
   
    return movie_lst

def make_recommendations(user, num_recs=10):
    '''
    INPUT:
        user - (int) a user_id of the individual you want to make recommendations for
        num_recs - (int) number of movies to return
    OUTPUT:
        recommendations - a list of movies - if there are "num_recs" recommendations return this many
                          otherwise return the total number of recommendations available for the "user"
                          which may just be an empty list
    '''
    movies_seen = movies_watched(user)
    neighbours = find_closest_neighbors(user)

    recs = np.array([])

    for neighbour in neighbours:
        neighbour_movies = movies_liked(neighbour)
        new_recs = np.setdiff1d(neighbour_movies,movies_seen)
        recs = np.unique(np.concatenate([new_recs,recs],axis=0))
        
        if(len(recs)>num_recs-1):
            break
    
    recommendations = movie_names(recs)
    
    return recommendations

def all_recommendations(num_recs=10):
    '''
    INPUT 
        num_recs (int) the (max) number of recommendations for each user
    OUTPUT
        all_recs - a dictionary where each key is a user_id and the value is an array of recommended movie titles
    '''
    all_recs = dict()

    # Make the recommendations for each user
    for user_id in list(user_by_movie.index.values):
        recomm = make_recommendations(user_id,num_recs)
        all_recs[user_id] = recomm

    return all_recs



In [118]:
all_recs = all_recommendations(10)

In [98]:
all_recs_sol = pd.read_pickle("all_recs.p")

In [119]:
assert all_recs[2] == make_recommendations(2), "Oops!  Your recommendations for user 2 didn't match ours."
assert all_recs[26] == make_recommendations(26), "Oops!  It actually wasn't possible to make any recommendations for user 26."
assert all_recs[1503] == make_recommendations(1503), "Oops! Looks like your solution for user 1503 didn't match ours."
print("If you made it here, you now have recommendations for many users using collaborative filtering!")
HTML('<img src="images/greatjob.webp">')

If you made it here, you now have recommendations for many users using collaborative filtering!


In [129]:
all_recs[2]

['His Majesty, the Scarecrow of Oz (1914)',
 'Three Ages (1923)',
 'The Navigator (1924)',
 'The Sea Hawk (1924)',
 'Sherlock Jr. (1924)',
 'The Thief of Bagdad (1924)',
 'The Cocoanuts (1929)',
 'Frau im Mond (1929)',
 'The Mummy (1932)',
 'The Black Cat (1934)',
 'The Lady Vanishes (1938)',
 'The Green Hornet (1940)',
 'The Mark of Zorro (1940)',
 "The Mummy's Hand (1940)",
 'The Ghost of Frankenstein (1942)',
 'Lifeboat (1944)',
 "The Mummy's Ghost (1944)",
 'Along Came Jones (1945)',
 'Miracle on 34th Street (1947)',
 'On an Island with You (1948)',
 'Destination Moon (1950)',
 'The Man from Planet X (1951)',
 'The Lusty Men (1952)',
 "Pluto's Christmas Tree (1952)",
 'Dial M for Murder (1954)',
 'Gojira (1954)',
 'The Curse of Frankenstein (1957)',
 'Taste of Fear (1961)',
 'Dr. No (1962)',
 'First Men in the Moon (1964)',
 'The Gorgon (1964)',
 'In the Heat of the Night (1967)',
 'Kingu Kongu no gyakushû (1967)',
 'The Devil Rides Out (1968)',
 'The Lost Continent (1968)',
 'The 

In [130]:
all_recs_sol[2]

['Philadelphia (1993)',
 'Training Day (2001)',
 'About Schmidt (2002)',
 'Insomnia (2002)',
 'The United States of Leland (2003)',
 'Shattered Glass (2003)',
 'Man on Fire (2004)',
 'Flipped (2010)',
 'Silver Linings Playbook (2012)',
 'Lawless (2012)',
 '50/50 (2011)',
 'Crazy, Stupid, Love. (2011)',
 'The Perks of Being a Wallflower (2012)',
 'Before I Go to Sleep (2014)',
 'Zero Dark Thirty (2012)',
 'American Hustle (2013)',
 'Django Unchained (2012)',
 'Side Effects (2013)',
 'Gone Girl (2014)',
 'Enough Said (2013)',
 'Nightcrawler (2014)']

In [125]:
# Check your understanding of the results by correctly filling in the dictionary below
a = "pearson's correlation and spearman's correlation"
b = 'item based collaborative filtering'
c = "there were too many ratings to get a stable metric"
d = 'user based collaborative filtering'
e = "euclidean distance and pearson's correlation coefficient"
f = "manhattan distance and euclidean distance"
g = "spearman's correlation and euclidean distance"
h = "the spread in some ratings was zero"
i = 'content based recommendation'

sol_dict = {
    'The type of recommendation system implemented here was a ...': d,
    'The two methods used to estimate user similarity were: ': e,
    'There was an issue with using the correlation coefficient.  What was it?': h
}

t.test_recs(sol_dict)

"That's right! All of your solutions look good!"

In [None]:
a = 567
b = 1503
c = 1319
d = 1325
e = 2526710
f = 0
g = 'Use another method to make recommendations - content based, knowledge based, or model based collaborative filtering'

sol_dict2 = {
    'For how many pairs of users were we not able to obtain a measure of similarity using correlation?': # letter here,
    'For how many pairs of users were we not able to obtain a measure of similarity using euclidean distance?': # letter here,
    'For how many users were we unable to make any recommendations for using collaborative filtering?': # letter here,
    'For how many users were we unable to make 10 recommendations for using collaborative filtering?': # letter here,
    'What might be a way for us to get 10 recommendations for every user?': # letter here   
}

t.test_recs2(sol_dict2)

In [126]:
no_rec = []

for user_id,movie_list in all_recs.items():
    if(len(movie_list)==0):
        no_rec.append(user_id)

print(len(no_rec))

32032


In [127]:
len(all_recs)

53968