In [1]:
import pandas as pd

In [2]:
movie_file = "dataset/movielens.csv"
movie_data = pd.read_csv(movie_file, usecols = [0, 1])
movie_data.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [3]:
ratings_file = "dataset/movielens_ratings.csv"
ratings_info = pd.read_csv(ratings_file, usecols = [0, 1, 2])
ratings_info.head()

Unnamed: 0,userId,movieId,rating
0,1,16,4.0
1,1,24,1.5
2,1,32,4.0
3,1,47,4.0
4,1,50,4.0


In [4]:
movie_info = pd.merge(movie_data, ratings_info, left_on = 'movieId', right_on = 'movieId')
movie_info.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),2,5.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),8,5.0
3,1,Toy Story (1995),11,4.0
4,1,Toy Story (1995),14,4.0


In [5]:
movie_info.loc[0:10, ['userId']]
movie_info[movie_info.title == "Toy Story (1995)"].head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),2,5.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),8,5.0
3,1,Toy Story (1995),11,4.0
4,1,Toy Story (1995),14,4.0


In [6]:
movie_info = pd.DataFrame.sort_values(movie_info, ['userId', 'movieId'], ascending = [0, 1])
movie_info.head()

Unnamed: 0,movieId,title,userId,rating
231,1,Toy Story (1995),668,3.0
323,2,Jumanji (1995),668,3.0
381,3,Grumpier Old Men (1995),668,2.0
454,5,Father of the Bride Part II (1995),668,2.5
569,6,Heat (1995),668,5.0


In [7]:
num_users = max(movie_info.userId)
num_movies = max(movie_info.movieId)
print(num_users)
print(num_movies)

668
149532


In [8]:
movie_per_user = movie_info.userId.value_counts()
movie_per_user.head()

668    5678
575    2837
458    2086
232    1421
310    1287
Name: userId, dtype: int64

In [9]:
users_per_movie = movie_info.title.value_counts()
users_per_movie.head()

Pulp Fiction (1994)                 325
Forrest Gump (1994)                 311
Shawshank Redemption, The (1994)    308
Jurassic Park (1993)                294
Silence of the Lambs, The (1991)    290
Name: title, dtype: int64

In [10]:
def fav_movies(current_user, N):
    fav_movies = pd.DataFrame.sort_values(movie_info[movie_info.userId == current_user], 
                                          ['rating'], ascending = [0]) [:N]
    return list(fav_movies.title)
print(fav_movies(5, 3))

['Tangled (2010)', 'Night at the Museum: Battle of the Smithsonian (2009)', 'Importance of Being Earnest, The (2002)']


In [11]:
user_movie_rating_matrix = pd.pivot_table(movie_info, values = 'rating', index=['userId'], columns=['movieId'])
user_movie_rating_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,144482,144656,144976,146344,146656,146684,146878,148238,148626,149532
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,5.0,,2.0,,3.0,,,,,,...,,,,,,,,,,
3,,,,,3.0,,3.0,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [12]:
from scipy.spatial.distance import correlation
import numpy as np
def similarity(user1, user2): 
    user1 = np.array(user1) - np.nanmean(user1) 
    user2 = np.array(user2) - np.nanmean(user2)
    common_movie_ids = [i for i in range(len(user1)) if user1[i] > 0 and user2[i] > 0]
    if(len(common_movie_ids) == 0):
        return 0
    else:
        user1 = np.array([user1[i] for i in common_movie_ids])
        user2 = np.array([user2[i] for i in common_movie_ids])
        return correlation(user1, user2)

In [13]:
def nearest_neighbour_ratings(current_user, K):
    similarity_matrix = pd.DataFrame(index = user_movie_rating_matrix.index, 
                                    columns = ['similarity'])
    for i in user_movie_rating_matrix.index:
        similarity_matrix.loc[i] = similarity(user_movie_rating_matrix.loc[current_user],
                                             user_movie_rating_matrix.loc[i])
    similarity_matrix = pd.DataFrame.sort_values(similarity_matrix,
                                                ['similarity'], ascending= [0])
    nearest_neighbours = similarity_matrix[:K]
    neighbour_movie_ratings = user_movie_rating_matrix.loc[nearest_neighbours.index]
    predicted_movie_rating = pd.DataFrame(index = user_movie_rating_matrix.columns, columns = ['rating'])
    for i in user_movie_rating_matrix.columns:
        predicted_rating = np.nanmean(user_movie_rating_matrix.loc[current_user])
        for j in neighbour_movie_ratings.index:
            if(user_movie_rating_matrix.loc[j,i] > 0):
                predicted_rating += ((user_movie_rating_matrix.loc[j,i] -np.nanmean(user_movie_rating_matrix.loc[j])) *
                                                    nearest_neighbours.loc[j, 'similarity']) / nearest_neighbours['similarity'].sum()
        predicted_movie_rating.loc[i, 'rating'] = predicted_rating
    return predicted_movie_rating

In [15]:
def top_n_recommendations(current_user, N):
    predicted_movie_rating = nearest_neighbour_ratings(current_user, 10)
    movies_already_watched = list(user_movie_rating_matrix.loc[current_user]
                                  .loc[user_movie_rating_matrix.loc[current_user] > 0].index)
    predicted_movie_rating = predicted_movie_rating.drop(movies_already_watched)
    top_n_recommendations = pd.DataFrame.sort_values(predicted_movie_rating, ['rating'], ascending=[0])[:N]
    top_n_recommendation_titles = movie_data.loc[movie_data.movieId.isin(top_n_recommendations.index)]
    return list(top_n_recommendation_titles.title)

In [16]:
current_user = 140
print("User's favorite movies are : ", fav_movies(current_user, 5),"\nUser's top recommendations are: ", top_n_recommendations(current_user, 3))

  dist = 1.0 - uv / np.sqrt(uu * vv)


User's favorite movies are :  ['Shawshank Redemption, The (1994)', 'Return of Martin Guerre, The (Retour de Martin Guerre, Le) (1982)', 'Godfather, The (1972)', 'Secrets & Lies (1996)', 'Misérables, Les (1995)'] 
User's top recommendations are:  ['Being John Malkovich (1999)', 'Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000)', 'Donnie Darko (2001)']
