# Information

In this notebook I will test [surprise library](https://surprise.readthedocs.io/en/stable/index.html) and choose the best algorithm for recommendation

In [11]:
from surprise import Dataset, Reader, SVD, SVDpp, KNNBasic, KNNWithMeans, NMF
import pandas as pd
from collections import defaultdict
from surprise.model_selection import GridSearchCV, KFold
import numpy as np
from sklearn.metrics import ndcg_score
from scipy import sparse

In [12]:
ratings = pd.read_csv('../data/interim/u.data', index_col=0)[["user", "item", "rating"]]

In [13]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings, reader)

In [14]:
def precision_recall_at_k(predictions, k=10,
                          threshold=4):  # code taken from https://surprise.readthedocs.io/en/stable/FAQ.html
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in user_ratings[:k]
        )

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls


def get_ndcg(predictions, k_highest_scores=None):
    """
    Calculates the NDCG from algorithm predictions
    :param predictions: list of predictions
    :param k_highest_scores: only consider the highest k scores in the ranking. If None, use all. 
    :return: float in [0., 1.]: The averaged NDCG scores over all recommendations
    """

    uids = [int(p.uid) for p in predictions]
    iids = [int(p.iid) for p in predictions]
    r_uis = [p.r_ui for p in predictions]
    ests = [p.est for p in predictions]

    assert (len(uids) == len(iids) == len(r_uis) == len(ests))

    sparse_preds = sparse.coo_matrix((ests, (uids, iids)))
    sparse_vals = sparse.coo_matrix((r_uis, (uids, iids)))

    dense_preds = sparse_preds.toarray()
    dense_vals = sparse_vals.toarray()

    return ndcg_score(y_true=dense_vals, y_score=dense_preds, k=k_highest_scores)

In [15]:
def test_algo(model, k=5, random_state=42):
    kf = KFold(n_splits=5, random_state=random_state)
    precisions_list = []
    recalls_list = []
    ndcgs_list = []

    for trainset, testset in kf.split(data):
        model.fit(trainset)
        predictions = model.test(testset)
        precisions, recalls = precision_recall_at_k(predictions, k=k)
        ndcg = get_ndcg(predictions)

        # Precision and recall can then be averaged over all users
        precisions_list.append(sum(prec for prec in precisions.values()) / len(precisions))
        recalls_list.append(sum(rec for rec in recalls.values()) / len(recalls))
        ndcgs_list.append(ndcg)

    result = {
        'Precision@K': [np.mean(precisions_list)],
        'Recall@K': [np.mean(recalls_list)],
        'NDCG@K': [np.mean(ndcgs_list)],
    }
    return pd.DataFrame(data=result, index=[str(model.__class__).split('.')[-1][:-2]])

In [16]:
models_list = [SVD(), SVDpp(), KNNBasic(), KNNWithMeans(), NMF()]

In [17]:
result = []

for model in models_list:
    result.append(test_algo(model))

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


In [18]:
pd.concat(result)

Unnamed: 0,Precision@K,Recall@K,NDCG@K
SVD,0.630388,0.238925,0.955183
SVDpp,0.643185,0.250266,0.956909
KNNBasic,0.689412,0.25531,0.956816
KNNWithMeans,0.567798,0.230601,0.95554
NMF,0.614016,0.238655,0.951269


Based on the result, `KNNBasic` shows the best performance and now I will search the best parameters for it

In [19]:
param_grid = {'k': [i for i in range(30, 50, 2)], 'min_k': [1, 2, 3, 4], 'verbose': [False]}
gs = GridSearchCV(KNNBasic, param_grid, measures=["mae"], cv=5)

gs.fit(data)

In [20]:
best_model = gs.best_estimator['mae']
test_algo(best_model)

Unnamed: 0,Precision@K,Recall@K,NDCG@K
KNNBasic,0.687496,0.258572,0.956339


This approach outperforms baseline solution and solution with encoder for movies, so I will use it for final solution.