# Collaborative Filtering Models

### User-Based, Item-Based, SVD

In [1]:
import random
import pandas as pd

from surprise import Dataset
from surprise import Reader
from surprise import SVD, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import accuracy
from surprise.model_selection import KFold
from surprise.model_selection import GridSearchCV

from collections import defaultdict
from itertools import islice

### Read in Data in Surprise Format

Must have three attributes: User ID, Item ID, Rating - in that order

In [2]:
data_ya = pd.read_pickle("dataframes/Humor.pkl")
data_ya = data_ya[['user_id','book_id','rating']]
data_ya.head()

Unnamed: 0,user_id,book_id,rating
0,31,743,4
1,75,743,3
2,245,743,5
3,228,743,5
4,325,743,3


In [3]:
data_ya['rating'].value_counts().to_dict()

{4: 5661, 5: 5378, 3: 3609, 2: 776, 1: 214}

In [4]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data_ya[['user_id', 'book_id', 'rating']], reader)

### Split into training and test sets and run model

In [6]:
trainset = data.build_full_trainset()
algo_svd = SVD(n_factors=50, lr_all=0.005, reg_all=0.04, random_state=12345)
algo_svd.fit(trainset)

# predict all the cells without values
testset = trainset.build_anti_testset()
predictions_svd = algo_svd.test(testset)

### Get predictions

In [18]:

def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

top_n = get_top_n(predictions_svd, n=10)

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))
n_items = take(5, top_n.items())

# Print the recommended items for each user
for uid, user_ratings in n_items:
    print(uid, [iid for (iid, _) in user_ratings])

31 [8978, 9580, 1331, 2579, 9495, 5788, 5866, 8396, 8430, 2696]
75 [8978, 2696, 1331, 8430, 5866, 9580, 8775, 5520, 6919, 9495]
245 [8978, 5866, 2579, 1331, 9580, 6919, 8396, 5520, 2696, 9495]
228 [8978, 9580, 5520, 2696, 5788, 6919, 1331, 2579, 9495, 8396]
325 [8978, 5866, 1331, 9580, 9495, 2579, 5520, 6919, 2696, 7186]


### Get the model accuracy

Precision and Recall

In [24]:
def precision_recall_at_k(predictions, k=10, threshold=4):
    '''Return precision and recall at k metrics for each user.'''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

({31: 0.0,
  75: 0.0,
  245: 0.0,
  228: 0.0,
  325: 0.0,
  361: 0.0,
  287: 0.0,
  399: 0.0,
  311: 0.0,
  446: 0.0,
  332: 0.0,
  550: 0.0,
  543: 0.0,
  585: 0.0,
  603: 0.0,
  643: 0.0,
  650: 0.0,
  682: 0.0,
  738: 0.0,
  18: 0.0,
  810: 0.0,
  871: 0.0,
  895: 0.0,
  992: 0.0,
  642: 0.0,
  606: 0.0,
  1300: 0.0,
  1306: 0.0,
  1323: 0.0,
  1590: 0.0,
  1619: 0.0,
  1315: 0.0,
  774: 0.0,
  1787: 0.0,
  978: 0.0,
  815: 0.0,
  1263: 0.0,
  2085: 0.0,
  2177: 0.0,
  1666: 0.0,
  2210: 0.0,
  2204: 0.0,
  2276: 0.0,
  952: 0.0,
  1851: 0.0,
  2534: 0.0,
  2585: 0.0,
  2639: 0.0,
  2641: 0.0,
  2648: 0.0,
  2670: 0.0,
  2710: 0.0,
  2825: 0.0,
  2922: 0.0,
  3023: 0.0,
  3022: 0.0,
  2853: 0.0,
  2162: 0.0,
  3082: 0.0,
  3106: 0.0,
  1235: 0.0,
  3167: 0.0,
  3278: 0.0,
  3254: 0.0,
  3352: 0.0,
  1390: 0.0,
  3365: 0.0,
  3369: 0.0,
  3147: 0.0,
  3411: 0.0,
  3412: 0.0,
  1832: 0.0,
  3679: 0.0,
  3377: 0.0,
  3259: 0.0,
  3693: 0.0,
  3768: 0.0,
  3705: 0.0,
  3842: 0.0,
  3922

In [26]:

kf = KFold(n_splits=5)
algo = SVD()

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=5)

    # Precision and recall can then be averaged over all users
    print(sum(prec for prec in precisions.values()) / len(precisions))
    print(sum(rec for rec in recalls.values()) / len(recalls))

1.0
0.6341059602649006
1.0
0.6374487131667288
1.0
0.6286888307807247
1.0
0.6365220615498702
1.0
0.6394106813996316
