# Content-based recommendation

In [1]:
%pylab inline

import numpy as np
from sklearn.neighbors import NearestNeighbors

from evaluator import Evaluator
from dataset_handler import DatasetHandler

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Relative paths of the dataset directories handed to DatasetHandler.
# NOTE(review): dataset100k ends with "/" while dataset1M does not — confirm
# DatasetHandler accepts both forms before switching datasets.
dataset100k = "datasets/ml-latest-small/"
dataset1M = "datasets/ml-1m"

In [3]:
# Build the handler for the directory named by dataset100k and load the ratings.
# presumably user_ratings maps user id -> {movieId: rating}; verify against DatasetHandler.
dataset_handler = DatasetHandler(dataset100k)
user_ratings = dataset_handler.load_users_ratings()

In [4]:
class ContentBasedRecommender(object):
    """Content-based recommender using cosine nearest neighbours over movie vectors.

    A *user profile* is the 2-tuple ``(profile_vector, ratings)`` returned by
    :meth:`create_user_profile`: the rating-weighted average of the vectors of
    the movies the user rated, and the ratings mapping itself.
    """

    def __init__(self, dataset_handler):
        """Keep the handler and load the movie vectors from it.

        :param dataset_handler: object providing ``load_movies``, ``id2index``,
            ``indices2ids``, ``feature_index2genre``, ``movie_vector2genres``
            and ``id_to_title`` (all used by this class).
        """
        self.dataset_handler = dataset_handler
        self.movies_vectors = self.dataset_handler.load_movies()

    def train(self, train_set):
        """Do nothing; the recommender only uses the movie vectors loaded in ``__init__``."""
        pass

    def top(self, user_profile, topN):
        """Return the ``topN`` movies whose vectors are closest to the profile vector.

        :param user_profile: ``(profile_vector, ratings)`` tuple.
        :param topN: number of neighbours to request.
        :return: the result of ``dataset_handler.indices2ids`` for the neighbour rows.
        """
        return self._cosineKNN_all_movies(user_profile[0], topN)

    def predict_rating(self, user_profile, movieId, k=5):
        """Predict a rating as the mean rating of the most similar rated movies.

        :param user_profile: ``(profile_vector, ratings)`` tuple; only ``ratings``
            (a mapping movie id -> rating) is used here.
        :param movieId: id of the movie to predict a rating for.
        :param k: how many of the user's rated movies to average over (default 5,
            the value that used to be hard-coded). Fewer are used when the user
            has rated fewer than ``k`` movies.
        :raises ValueError: if the profile contains no ratings.
        """
        ratings = user_profile[1]
        if len(ratings) == 0:
            raise ValueError("Cannot predict a rating for a user profile without any ratings")
        nearest_watched_movies = self._cosineKNN_movies_subset(list(ratings.keys()), movieId, k)
        return np.average(np.array([ratings[movie] for movie in nearest_watched_movies]))

    def create_user_profile(self, user_ratings):
        """Build a user profile from a mapping movie id -> rating.

        :param user_ratings: mapping of movie id to rating.
        :return: ``(profile_vector, user_ratings)`` where ``profile_vector`` is the
            rating-weighted average of the rated movies' vectors.
        """
        # Materialise the items once so vectors and weights are guaranteed to be
        # aligned, and so a Python 3 dict view is never handed to numpy directly.
        rated = list(user_ratings.items())
        rated_vectors = np.array([
            self.movies_vectors[self.dataset_handler.id2index(movie)]
            for (movie, _) in rated
        ])
        weights = np.array([rating for (_, rating) in rated])
        return (np.average(rated_vectors, weights=weights, axis=0), user_ratings)

    def present_user_profile(self, user_profile):
        """Print the genre of the largest profile component and every rated movie."""
        # Single-argument print(...) calls emit the same text on Python 2 and 3.
        print("User favourite genre: {}".format(
            self.dataset_handler.feature_index2genre(np.argmax(user_profile[0]))
        ))
        print("User ratings:")
        for (movieId, rating) in user_profile[1].items():
            movie_vector = self.movies_vectors[self.dataset_handler.id2index(movieId)]
            print("{} {}: {}".format(
                self.dataset_handler.id_to_title[movieId],
                self.dataset_handler.movie_vector2genres(movie_vector),
                rating
            ))

    def present_recommendations(self, recommendations):
        """Print the title and genres of each recommended movie id."""
        print("Recommended movies:")
        for movieId in recommendations:
            movie_vector = self.movies_vectors[self.dataset_handler.id2index(movieId)]
            print("{} {}".format(
                self.dataset_handler.id_to_title[movieId],
                self.dataset_handler.movie_vector2genres(movie_vector)
            ))

    def _cosineKNN_all_movies(self, user_profile, k):
        """Return ids of the ``k`` movies nearest (cosine distance) to a profile vector.

        :param user_profile: the profile *vector* (not the profile tuple).
        :param k: number of neighbours to request.
        """
        nbrs = NearestNeighbors(metric='cosine', algorithm='brute')
        nbrs.fit(self.movies_vectors)
        neighbour_rows = nbrs.kneighbors(
            np.array([user_profile]), n_neighbors=k, return_distance=False
        )[0]
        return self.dataset_handler.indices2ids(neighbour_rows)

    def _cosineKNN_movies_subset(self, movies_subset, movieId, k):
        """Return the ids from ``movies_subset`` most similar to ``movieId``.

        :param movies_subset: iterable of movie ids to search within.
        :param movieId: id of the query movie.
        :param k: maximum number of neighbours; clamped to the subset size because
            scikit-learn rejects ``n_neighbors`` larger than the number of samples.
        :return: list of ids taken from ``movies_subset``, nearest first. The ids
            are returned exactly as given (they are no longer passed through a
            float matrix), so they can be used directly as dictionary keys.
        """
        subset_ids = list(movies_subset)
        subset_vectors = np.array([
            self.movies_vectors[self.dataset_handler.id2index(watched_movie)]
            for watched_movie in subset_ids
        ])
        # n_neighbors is passed by keyword: recent scikit-learn releases do not
        # accept it positionally.
        nbrs = NearestNeighbors(
            n_neighbors=min(k, len(subset_ids)), metric='cosine', algorithm='brute'
        )
        nbrs.fit(subset_vectors)
        query = np.array([self.movies_vectors[self.dataset_handler.id2index(movieId)]])
        neighbour_rows = nbrs.kneighbors(query, return_distance=False)[0]
        return [subset_ids[row] for row in neighbour_rows]

In [5]:
# Build a recommender, derive the profile for user_ratings[1] (the rating-weighted
# average of that user's movie vectors plus the ratings themselves), and print it.
recommender = ContentBasedRecommender(dataset_handler)
user_profile = recommender.create_user_profile(user_ratings[1])
recommender.present_user_profile(user_profile)

User favourite genre: Adventure
User ratings:
French Connection, The (1971) ['Action', 'Crime', 'Thriller']: 4.0
Dracula (Bram Stoker's Dracula) (1992) ['Fantasy', 'Horror', 'Romance', 'Thriller']: 3.5
Sleepers (1996) ['Thriller']: 3.0
Gods Must Be Crazy, The (1980) ['Adventure', 'Comedy']: 3.0
Ben-Hur (1959) ['Action', 'Adventure', 'Drama']: 2.0
Willow (1988) ['Action', 'Adventure', 'Fantasy']: 2.0
Escape from New York (1981) ['Action', 'Adventure', 'Sci-Fi', 'Thriller']: 2.0
Blazing Saddles (1974) ['Comedy', 'Western']: 3.0
Gandhi (1982) ['Drama']: 2.0
Deer Hunter, The (1978) ['Drama', 'War']: 2.0
Cape Fear (1991) ['Thriller']: 2.0
Cinema Paradiso (Nuovo cinema Paradiso) (1989) ['Drama']: 4.0
Antz (1998) ['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']: 2.0
Fly, The (1986) ['Drama', 'Horror', 'Sci-Fi', 'Thriller']: 2.5
Time Bandits (1981) ['Adventure', 'Comedy', 'Fantasy', 'Sci-Fi']: 1.0
Tron (1982) ['Action', 'Adventure', 'Sci-Fi']: 4.0
Star Trek: The Motion Picture (1979

In [6]:
# Request the 5 movies whose vectors are nearest (cosine distance) to the
# profile vector, then print their titles and genres.
top = recommender.top(user_profile, topN=5)
recommender.present_recommendations(top)

Recommended movies:
Jumper (2008) ['Action', 'Adventure', 'Drama', 'Sci-Fi', 'Thriller']
Jurassic World (2015) ['Action', 'Adventure', 'Drama', 'Sci-Fi', 'Thriller']
The Hunger Games (2012) ['Action', 'Adventure', 'Drama', 'Sci-Fi', 'Thriller']
Children of Men (2006) ['Action', 'Adventure', 'Drama', 'Sci-Fi', 'Thriller']
Day After Tomorrow, The (2004) ['Action', 'Adventure', 'Drama', 'Sci-Fi', 'Thriller']


In [7]:
# Evaluator is already imported in the notebook's first cell, so it is not
# re-imported here. A fresh recommender instance is handed to the evaluator.
# NOTE(review): computeMAP presumably reports mean average precision — confirm in evaluator.py.
evaluator = Evaluator(ContentBasedRecommender(dataset_handler))
evaluator.computeMAP()

0.01083168480052753

In [14]:
# NOTE(review): presumably the root-mean-square error of predict_rating — confirm in evaluator.py.
# NOTE(review): the saved execution count jumps from 7 to 14; re-run with
# Restart & Run All to confirm the reported value.
evaluator.computeRMSE(dataset_handler)

0.9913712686567373