In [1]:
import utils.data_loader as data_loader
from pathlib import Path
import utils.model_loader as model_loader

In [3]:
data_raw = data_loader.get_data('ml-100k')

In [2]:
# data_dir = Path(Path.cwd().parents[0], 'data', 'movielens', 'ml-latest-small')
data_raw = data_loader.get_data('ratings.csv', False)

## Model Pipeline

In [5]:
from surprise.model_selection import train_test_split
from surprise import SVD, NMF
from surprise.prediction_algorithms.knns import KNNBasic
from surprise.prediction_algorithms.algo_base import AlgoBase

In [6]:
def train_and_evalute_model_pipeline(model_class: AlgoBase, dataset: str = 'ml-100k', 
                                     from_surprise: bool = True, 
                                     test_size: float = 0.2,
                                     model_kwargs: dict = {}) -> (AlgoBase, dict):
    data = data_loader.get_data(dataset, from_surprise)
    train_set, test_set = train_test_split(data, test_size, random_state=42)
    model = model_loader.get_trained_model(model_class, train_set, model_kwargs)
    metrics_dict = model_loader.evaluate_model(model, test_set)
    return model, metrics_dict

In [7]:
my_model, metrics_dict = train_and_evalute_model_pipeline(KNNBasic)
metrics_dict

Computing the msd similarity matrix...
Done computing similarity matrix.


{'RMSE': 0.980150596704479, 'MAE': 0.980150596704479}

In [21]:
model_kwargs = {'sim_options': {'user_based': False, 'name': 'pearson'}}
my_model, metrics_dict = train_and_evalute_model_pipeline(KNNBasic, model_kwargs=model_kwargs)
metrics_dict

Computing the pearson similarity matrix...
Done computing similarity matrix.


{'RMSE': 1.041104054968961, 'MAE': 1.041104054968961}

In [18]:
my_model, metrics_dict = train_and_evalute_model_pipeline(SVD)
metrics_dict

{'RMSE': 0.9360802939362804, 'MAE': 0.9360802939362804}

## Benchmarking

In [24]:
benchmark_dict = {}


model_dict_list = [
    {
        'model_name' : 'KNN user based with cosine similarity',
        'model_class' : KNNBasic,
        'model_kwargs' : {'user_based': True, 'name': 'cosine'}
    },
    {
        'model_name' : 'KNN user based with pearson similarity',
        'model_class' : KNNBasic,
        'model_kwargs' : {'user_based': True, 'name': 'pearson'}
    },
    {
        'model_name' : 'KNN item based with cosine similarity',
        'model_class' : KNNBasic,
        'model_kwargs' : {'user_based': False, 'name': 'cosine'}
    },
    {
        'model_name' : 'KNN item based with pearson similarity',
        'model_class' : KNNBasic,
        'model_kwargs' : {'user_based': False, 'name': 'pearson'}
    },
    {
        'model_name' : 'SVD',
        'model_class' : SVD
    },
    {
        'model_name' : 'NMF',
        'model_class' : NMF
    },
    
]

for model_dict in model_dict_list:
    model, metrics_dict = train_and_evalute_model_pipeline(
        model_dict['model_class'], model_kwargs = model_dict.get('model_kwargs', {}))
    benchmark_dict[model_dict['model_name']] = metrics_dict
    model_dict['fitted_model'] = model
    

benchmark_dict

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'KNN user based with cosine similarity': {'RMSE': 0.980150596704479,
  'MAE': 0.980150596704479},
 'KNN user based with pearson similarity': {'RMSE': 0.980150596704479,
  'MAE': 0.980150596704479},
 'KNN item based with cosine similarity': {'RMSE': 0.980150596704479,
  'MAE': 0.980150596704479},
 'KNN item based with pearson similarity': {'RMSE': 0.980150596704479,
  'MAE': 0.980150596704479},
 'SVD': {'RMSE': 0.9345987396157499, 'MAE': 0.9345987396157499},
 'NMF': {'RMSE': 0.9624799306524018, 'MAE': 0.9624799306524018}}

## User Recommendation

In [27]:
import pandas as pd
from collections import defaultdict
from surprise.dump import dump, load

data_dir = Path(Path.cwd().parents[0], 'data', 'movielens', 'ml-latest-small')
def get_movies_data(data_dir):
    movies = pd.read_csv(Path(data_dir, 'movies.csv'))
    return movies

# def get_predictions(model, user, movies, k):
#     movies['user'] = user
#     preds = movies.apply(lambda x: model.predict(x[0], x[-1]), 1, result_type='expand')
#     idx = preds[3].argsort()[:k]
#     ids = preds.iloc[idx, 0]
#     mvs = movies.movieId.isin(ids)
#     return movies.loc[mvs, ['title', 'genres']]

def get_movie_details(preds, movies):
#     movies['user'] = user
#     preds = movies.apply(lambda x: model.predict(x[0], x[-1]), 1, result_type='expand')
#     idx = preds[3].argsort()[:k]
    ids = [int(x[0]) for x in preds]
    mvs = movies.movieId.isin(ids)
    return movies.loc[mvs, ['movieId', 'title', 'genres']]

def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n
    

    
def get_user_recommendation(model: AlgoBase, user_id: int, k: int, data
                           ) -> pd.DataFrame:
    """Makes movie recommendations a user.
    
    Parameters
    ----------
        model : AlgoBase
            A trained surprise model
        user_id : int
            The user for whom the recommendation will be done.
        k : int
            The number of items to recommend.
        data : FIXME
            The data needed to do the recommendation.
        movies : pandas.DataFrame
            The dataframe containing the movies metadata (title, genre, etc)
        
    Returns
    -------
    pandas.Dataframe
        A dataframe with the k movies that will be recommended the user. The dataframe should have the following
        columns (movie_name : str, movie_genre : str, predicted_rating : float, true_rating : float)
        
    Notes
    -----
    - You should create other functions that are used in this one and not put all the code in the same function.
        For example to create the final dataframe, instead of implemented all the code
        in this function (get_user_recommendation), you can create a new one (create_recommendation_dataframe)
        that will be called in this function.
    - You can add other arguments to the function if you need to.
    """
    try:
        predictions, model = load('model.prod')
    except:
        trainset = data.build_full_trainset()
        testset = trainset.build_anti_testset()
        model = model_loader.get_trained_model(model, trainset)
        predictions = model.test(testset)
        dump('model.prod', predictions, model)
    top_n = get_top_n(predictions, n=k)
    movies = get_movies_data(data_dir)
#     details = get_movie_details(top_n[user_id], movies)
#     df = pd.DataFrame(top_n, columns=['Id', 'Rating'])
    
    return get_movie_details(top_n[user_id], movies)

recommendations = get_user_recommendation(SVD, '196', 10, data_raw)
display(recommendations)

success


Unnamed: 0,movieId,title,genres
150,178,Love & Human Remains (1993),Comedy|Drama
158,187,Party Girl (1995),Comedy
277,318,"Shawshank Redemption, The (1994)",Crime|Drama
353,408,8 Seconds (1994),Drama
454,519,RoboCop 3 (1993),Action|Crime|Drama|Sci-Fi|Thriller
705,923,Citizen Kane (1941),Drama|Mystery
