In [106]:
# conda install -c conda-forge scikit-surprise

import pandas as pd
import numpy as np
from collections import defaultdict
from surprise import Reader
from surprise import KNNBasic
from surprise import Dataset
from surprise.model_selection import cross_validate

# Show codes: BB=Breaking Bad, WD=Walking Dead, RD=Riverdale, MF=Modern Family

# One (user, show, rating) triple per line, so every single rating can be
# read and checked without counting positions in three parallel lists.
rating_rows = [
    ('Mary', 'BB', 1),
    ('Mary', 'WD', 1),
    ('Mary', 'RD', 5),
    ('Mary', 'MF', 5),
    ('Tim', 'BB', 5),
    ('Tim', 'WD', 5),
    ('Tim', 'RD', 2),
    ('Tim', 'MF', 2),
    ('Ahmed', 'BB', 5),
    ('Leo', 'RD', 5),
    ('Leo', 'WD', 1),
    ('Anna', 'BB', 2),
    ('Anna', 'MF', 4),
    ('Sarah', 'BB', 1),
    ('Sarah', 'WD', 1),
    ('Sarah', 'RD', 5),
]

# Pivot the rows into the column-oriented mapping handed to pandas.
ratings = {
    'itemid': [show for _user, show, _stars in rating_rows],
    'userid': [user for user, _show, _stars in rating_rows],
    'rating': [stars for _user, _show, stars in rating_rows],
}

df = pd.DataFrame(ratings)
df




Unnamed: 0,itemid,rating,userid
0,BB,1,Mary
1,WD,1,Mary
2,RD,5,Mary
3,MF,5,Mary
4,BB,5,Tim
5,WD,5,Tim
6,RD,2,Tim
7,MF,2,Tim
8,BB,5,Ahmed
9,RD,5,Leo


In [107]:

# Summary statistics of the ratings; in the output below only the numeric
# `rating` column is summarised.
df.describe()

Unnamed: 0,rating
count,16.0
mean,3.125
std,1.857418
min,1.0
25%,1.0
50%,3.0
75%,5.0
max,5.0


In [108]:
# Construct a reader that declares the rating scale; the ratings in `df`
# range from 1 (min) to 5 (max).
reader = Reader(rating_scale=(1, 5))

# Generate the Surprise Dataset. The columns are selected explicitly in
# (user, item, rating) order, which is the layout load_from_df expects
# according to the Surprise documentation.
data = Dataset.load_from_df(df[['userid', 'itemid', 'rating']], reader)

In [109]:
# Use every available rating for training (no hold-out split).
trainset = data.build_full_trainset()

# Build and train a basic k-nearest-neighbours algorithm.

sim_options = {
               'user_based': True  # compute similarities between users (False would compare items)
}

# No similarity measure is named in sim_options; the fit log reports that the
# msd similarity matrix is computed.
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x243fbce5320>

In [110]:
# Sample prediction: estimate how Leo would rate Modern Family, a show that
# has no rating from him in the data.
user_id, item_id = 'Leo', 'MF'

pred = algo.predict(user_id, item_id, verbose=True)

user: Leo        item: MF         r_ui = None   est = 4.79   {'actual_k': 2, 'was_impossible': False}


In [111]:
# Then predict ratings for all pairs (u, i) that are NOT in the training set.
# These pairs have no real rating; in the printed output every r_ui is 3.125,
# which equals the mean rating of the data set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

# Example output: the raw list of Prediction objects
print(predictions)


[Prediction(uid='Ahmed', iid='WD', r_ui=3.125, est=4.5789473684210522, details={'actual_k': 3, 'was_impossible': False}), Prediction(uid='Ahmed', iid='RD', r_ui=3.125, est=2.3157894736842102, details={'actual_k': 3, 'was_impossible': False}), Prediction(uid='Ahmed', iid='MF', r_ui=3.125, est=2.3248730964467001, details={'actual_k': 3, 'was_impossible': False}), Prediction(uid='Leo', iid='BB', r_ui=3.125, est=1.1428571428571428, details={'actual_k': 3, 'was_impossible': False}), Prediction(uid='Leo', iid='MF', r_ui=3.125, est=4.7931034482758621, details={'actual_k': 2, 'was_impossible': False}), Prediction(uid='Anna', iid='WD', r_ui=3.125, est=1.4705882352941175, details={'actual_k': 3, 'was_impossible': False}), Prediction(uid='Anna', iid='RD', r_ui=3.125, est=4.6470588235294121, details={'actual_k': 3, 'was_impossible': False}), Prediction(uid='Sarah', iid='MF', r_ui=3.125, est=4.5507246376811601, details={'actual_k': 3, 'was_impossible': False})]


In [112]:
# This block copied from Surprise documentation at
# http://surprise.readthedocs.io/en/stable/FAQ.html#how-to-get-the-top-n-recommendations-for-each-user

def get_top_n(predictions, n=10):
    '''Group predictions by user and keep each user's n best-estimated items.

    Args:
        predictions(list of Prediction objects): Predictions as returned by
            the test method of an algorithm. Each one unpacks into five
            fields, of which the user id, the item id and the estimated
            rating are used.
        n(int): The maximum number of recommendations kept for each user.
            Default is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...], ordered by decreasing
        estimation and holding at most n entries.
    '''

    # Bucket every (item, estimate) pair under the user it was predicted for.
    top_n = defaultdict(list)
    for prediction in predictions:
        user, item, _, estimate, _ = prediction
        top_n[user].append((item, estimate))

    # Rank each bucket from highest to lowest estimate, then truncate it.
    # Only existing keys are reassigned, so iterating the dict is safe.
    for user in top_n:
        ranked = sorted(top_n[user], key=lambda pair: pair[1], reverse=True)
        top_n[user] = ranked[:n]

    return top_n

# Keep up to 10 recommendations per user. In the output below no user has
# more than three candidate shows, so nothing is actually cut off.
top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user, highest estimate first
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

Ahmed ['WD', 'MF', 'RD']
Leo ['MF', 'BB']
Anna ['RD', 'WD']
Sarah ['MF']


In [113]:
# Evaluate accuracy with 3-fold cross-validation
# MAE = mean absolute error, RMSE = root mean squared error
#
# The folds are drawn at random, and with only 16 ratings the scores swing
# widely from one split to the next. Seeding NumPy's global RNG, which
# Surprise uses for shuffling when no random_state is given (see the
# "reproducible experiments" entry of the Surprise FAQ), makes the reported
# numbers repeatable under Restart & Run All.
np.random.seed(0)

# NOTE(review): cross_validate presumably refits `algo` on each fold, so after
# this cell `algo` may no longer be the model trained on the full trainset.
# Confirm before reusing it for predictions.
cross_validate(algo, data, measures=['MAE', 'RMSE'], cv=3, verbose=True)


Evaluating MAE, RMSE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
MAE (testset)     2.1212  0.9788  2.5081  1.8694  0.6492  
RMSE (testset)    2.5385  1.4347  2.8296  2.2676  0.6008  
Fit time          0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    


{'fit_time': (0.0004985332489013672, 0.0005009174346923828, 0.0),
 'test_mae': array([ 2.12121212,  0.97878788,  2.50805195]),
 'test_rmse': array([ 2.53854582,  1.43472928,  2.82956323]),
 'test_time': (0.0004999637603759766, 0.0, 0.0010008811950683594)}