### Import libraries

In [1]:
import heapq
from collections import defaultdict

from surprise import KNNBaseline
from surprise import Dataset
#from surprise.model_selection import cross_validate
#from surprise.model_selection import train_test_split

### Download data

In [2]:
# Load the built-in 'ml-100k' ratings dataset through Surprise's Dataset helper.
# NOTE(review): presumably this downloads the data on first use (the section
# heading says "Download") — confirm whether it prompts for confirmation.
data = Dataset.load_builtin('ml-100k')

### Create the training set

In [3]:
# Build the trainset from the whole dataset; no ratings are held out here.
trainset = data.build_full_trainset()

# Disabled alternative: sample a random trainset and testset instead,
# where the test set is made of 30% of the ratings.
#trainset, testset = train_test_split(data, test_size=.3)

### Define the model

We estimate the baselines with stochastic gradient descent (SGD) using learning rate $\eta=0.00005$, and the similarities with the Pearson correlation coefficient. We use item-item collaborative filtering, where each prediction takes into account between 1 and 40 neighbors (the `KNNBaseline` defaults, since no neighborhood size is passed explicitly).

In [4]:
# Baseline estimates: fitted with SGD at a learning rate of 0.00005.
bsl_options = dict(method='sgd', learning_rate=.00005)

# Similarities: Pearson correlation, computed between items rather than users
# (item-item collaborative filtering).
sim_options = dict(name='pearson', user_based=False)

# No neighbourhood size is passed, so the library defaults apply.
algo = KNNBaseline(sim_options=sim_options, bsl_options=bsl_options)

### Fit the model

In [5]:
# Train on the full trainset. fit() is the last expression of the cell, so the
# fitted algorithm object it returns is what the cell displays.
algo.fit(trainset)
# Disabled alternative: 5-fold cross-validation on the whole dataset.
#cross_validate(algo, data, cv=5, verbose=True)

Estimating biases using sgd...
Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7fb7059f4a90>

### Get the baselines

In [6]:
# Retrieve the baseline estimates from the fitted algorithm.
# NOTE(review): presumably a (user biases, item biases) pair — confirm the
# return shape against the Surprise documentation before indexing into it.
baselines = algo.compute_baselines()

### Get the similarity matrix

For each pair of items $i, j$ we have the similarity $w_{ij}$ as computed by the Pearson correlation coefficient (the model is item-based, so the similarities are between items, not users).

In [7]:
# Build the similarity matrix explicitly. This runs the computation a second
# time (see the log output below), even though fit() already computed it once.
# NOTE(review): with user_based=False the entries should be item-item
# similarities; the fitted model presumably already holds the same matrix
# (e.g. as `algo.sim`) — confirm, and consider reusing it instead.
w = algo.compute_similarities()

Computing the pearson similarity matrix...
Done computing similarity matrix.


### Predict ratings for all user-item pairs $(u, i)$ that are NOT in the training set.

In [8]:
# Per the section heading: the (user, item) pairs that have no rating in the
# trainset, i.e. the candidates for recommendation.
testset = trainset.build_anti_testset()
# Estimate a rating for every one of those pairs with the fitted model.
predictions = algo.test(testset)

### Get the top-10 recommended items for each user.

For each user $u$, the 10 items with the highest predicted ratings, $\mathcal{L}_{u}$, are stored in `top_n`.

In [9]:
def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm. Each element must
            unpack into five values: (uid, iid, true_r, est, details).
        n(int): The number of recommendation to output for each user. Default
            is 10. Must be non-negative.

    Returns:
        A dict (a ``defaultdict(list)``) where keys are user (raw) ids and
        values are lists of tuples [(raw item id, rating estimation), ...]
        holding at most n entries, sorted by decreasing estimation. Items
        with equal estimations keep the order they have in ``predictions``.

    Raises:
        ValueError: If n is negative.
    '''
    # A negative n would otherwise be interpreted as "drop the last |n| items"
    # by slicing, which is never what a caller asking for a top-N wants.
    if n < 0:
        raise ValueError('n must be non-negative, got {!r}'.format(n))

    # First map the predictions to each user.
    per_user = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        per_user[uid].append((iid, est))

    # Then keep only the n highest estimations for each user. nlargest is
    # equivalent to sorted(..., reverse=True)[:n] (ties included) but avoids
    # fully sorting every user's candidate list. The result goes into a fresh
    # mapping so nothing is reassigned while it is being iterated.
    top_n = defaultdict(list)
    for uid, user_ratings in per_user.items():
        top_n[uid] = heapq.nlargest(n, user_ratings, key=lambda pair: pair[1])

    return top_n

# Keep, for every user, the 10 candidate items with the highest estimates.
top_n = get_top_n(predictions, n=10)

### Print the recommended items for each user

In [10]:
#for uid, user_ratings in top_n.items():
#    print(uid, [iid for (iid, _) in user_ratings])