In [1]:
import sklearn
import numpy as np
import pandas as pd
import random
import math
import pickle



In [2]:
class LoadFiles:
    def load(self, filename):
        try:
            file = open('datasets/' + filename + '.pkl', 'rb')
        except:
            err = 'Error: {0}, {1}'.format(exc_info()[0], exc_info()[1])
            print(err)
            file.close()
            return [False, err]
        else:
            data = pickle.load(file)
            file.close()
            return data
    def loadClusterMoviesDataset(self):
        return self.load('clusters_movies_dataset')

    def loadUsersClusters(self):
        return self.load('users_clusters')
  


In [3]:
# Load users_clusters
loader = LoadFiles()
users_cluster = loader.loadUsersClusters()

In [4]:
# Find a random valid user
test_user = random.choice(users_cluster['userId'])
print("Test user: ",test_user)
test_user_cluster = (users_cluster[users_cluster['userId'] == test_user]['Cluster']).astype(int)
i = test_user_cluster.to_string(index=False).replace(" ", "")
file_name = "cluster_"+ str(i) +".csv"

Test user:  95821


In [5]:
# Read average genre ratings of each user from the clustered dataset
cluster_genre_ratings = pd.read_csv("./datasets/"+file_name, index_col = 0)
cluster_genre_ratings = cluster_genre_ratings.reset_index(drop = True)

In [6]:
cluster_genre_ratings

Unnamed: 0,IMAX,Western,Animation,Action,Horror,Documentary,Sci-Fi,Fantasy,Comedy,Drama,...,Film-Noir,Mystery,Musical,Crime,War,Thriller,Children,Romance,userId,cluster
0,0.0,3.5,0.0,3.75,0.0,0.0,4.375,4.0,3.642857,3.900000,...,0.0,0.0,4.0,3.833333,0.0,4.500000,4.0,3.750000,75251,3
1,0.0,0.0,0.0,0.00,0.0,0.0,5.000,0.0,4.666667,4.714286,...,0.0,0.0,0.0,4.000000,0.0,4.333333,0.0,4.666667,90742,3
2,0.0,0.0,0.0,5.00,0.0,0.0,0.000,0.0,5.000000,5.000000,...,0.0,0.0,0.0,5.000000,5.0,0.000000,0.0,5.000000,79676,3
3,0.0,0.0,0.0,0.00,0.0,0.0,0.000,0.0,5.000000,5.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,5.000000,66794,3
4,0.0,0.0,0.0,5.00,0.0,0.0,5.000,0.0,4.750000,4.333333,...,0.0,0.0,0.0,4.500000,4.0,5.000000,0.0,4.500000,66916,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19451,0.0,0.0,0.0,0.00,0.0,4.0,0.000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,46844,3
19452,0.0,0.0,0.0,0.00,0.0,0.0,0.000,0.0,0.000000,4.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,56828,3
19453,0.0,0.0,0.0,0.00,0.0,0.0,0.000,0.0,0.000000,4.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,43787,3
19454,0.0,0.0,0.0,0.00,0.0,0.0,0.000,0.0,4.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,61651,3


In [7]:
users_test_cluster = users_cluster[users_cluster['Cluster'] == int(i)]
#users_test_cluster

In [8]:
movie_ratings = pd.read_csv(".\datasets\movie_ratings.csv", index_col = 0)
ratings_test_cluster = movie_ratings.merge(users_test_cluster, on='userId', how = 'inner')
ratings_test_cluster

Unnamed: 0,userId,movieId,rating,title,genres,year,Cluster
0,75251,1277,4.0,Cyrano de Bergerac (1990),Comedy|Drama|Romance,1990,3
1,75251,1097,4.0,E.T. the Extra-Terrestrial (1982),Children|Drama|Sci-Fi,1982,3
2,75251,1206,4.5,"Clockwork Orange, A (1971)",Crime|Drama|Sci-Fi|Thriller,1971,3
3,75251,919,4.0,"Wizard of Oz, The (1939)",Adventure|Children|Fantasy|Musical,1939,3
4,75251,1968,3.5,"Breakfast Club, The (1985)",Comedy|Drama,1985,3
...,...,...,...,...,...,...,...
71127,46844,791,4.0,"Last Klezmer: Leopold Kozlowski, His Life and ...",Documentary,1994,3
71128,56828,770,4.0,Costa Brava (1946),Drama,1946,3
71129,43787,770,4.0,Costa Brava (1946),Drama,1946,3
71130,61651,1311,4.0,Santa with Muscles (1996),Comedy,1996,3


In [9]:
        
def get_user_movies(user_id, ratings):
    return list(ratings[ratings['userId'] == user_id]['movieId'])

In [10]:
# Movies rated by the test user
user_movies = get_user_movies(test_user, ratings_test_cluster)
print("Movies rated by the user ", test_user, " ", user_movies)

Movies rated by the user  95821   [1231, 1208, 1345, 3671, 3683, 1278, 1207]


# Cosine similarity between users

In [11]:
# User-to-User similarity within a cluster
from sklearn.metrics.pairwise import cosine_similarity

# compute the cosine similarity
cos_sim = cosine_similarity(cluster_genre_ratings)
#cos_sim
#cos_sim.shape

# Top most similar users

In [12]:
# Top 25 similar users indices

# Find index of test user
idx = cluster_genre_ratings[cluster_genre_ratings['userId'] == test_user].index[0]
#print("Test user: ",test_user, " index : ", idx)
sim_scores = pd.Series(cos_sim[idx]).sort_values(ascending = False)
top_25_similar_users_indexes = list(sim_scores.iloc[1:26].index)
#print(top_25_similar_users_indexes)


In [13]:
# Top 25 similar user ids
similar_users = cluster_genre_ratings[cluster_genre_ratings.index.isin(top_25_similar_users_indexes)]['userId']
#similar_users

In [14]:
# Movie Ids of movies liked by similar users in the cluster
liked_movies = ratings_test_cluster[ratings_test_cluster.userId.isin(similar_users)]['movieId']
liked_movies = set(liked_movies.tolist())
rec_movies = list(liked_movies)

In [15]:
# Removing movies liked by test user
for movie in list(rec_movies):
    if movie in user_movies:
        rec_movies.remove(movie)
#print("Movies for recommendation")
#print(rec_movies)

# Movie Recommendation

In [16]:
movies = pd.read_csv(".\datasets\movies.csv")

In [17]:
# Recommending 15 movies from the list of movies for recommendation
for movie in rec_movies[:15]:
    title = list(movies.loc[movies['movieId'] == movie]['title'])
    genres = list(movies.loc[movies['movieId'] == movie]['genres'])
    if title != []:
        print( title, '\t\t ', genres)

['Matrix, The (1999)'] 		  ['Action|Sci-Fi|Thriller']
['All About My Mother (Todo sobre mi madre) (1999)'] 		  ['Drama']
['Batman & Robin (1997)'] 		  ['Action|Adventure|Fantasy|Thriller']
['Contact (1997)'] 		  ['Drama|Sci-Fi']
["On Her Majesty's Secret Service (1969)"] 		  ['Action|Adventure|Romance|Thriller']
['Sleeper (1973)'] 		  ['Comedy|Sci-Fi']
['Mummy, The (1999)'] 		  ['Action|Adventure|Comedy|Fantasy|Horror|Thriller']
['Dial M for Murder (1954)'] 		  ['Crime|Mystery|Thriller']
['Reservoir Dogs (1992)'] 		  ['Crime|Mystery|Thriller']
['Indiana Jones and the Temple of Doom (1984)'] 		  ['Action|Adventure|Fantasy']
['Streetcar Named Desire, A (1951)'] 		  ['Drama']
['Rush Hour 2 (2001)'] 		  ['Action|Comedy']
['Invasion of the Body Snatchers (1956)'] 		  ['Horror|Sci-Fi|Thriller']
['Rope (1948)'] 		  ['Crime|Drama|Thriller']
['Strangers on a Train (1951)'] 		  ['Crime|Drama|Film-Noir|Thriller']
