# Determine the top-10 recommendations for user #100 using user-user collaborative filtering with Pearson correlation and cosine similarity.

In [8]:
# Use suprise for abstraction, load dataset and KNNBasic.
from surprise import KNNBasic
from surprise import Dataset

# Load the movielens-100k dataset.
data = Dataset.load_builtin('ml-100k')

# Retrieve the trainset.
trainset = data.build_full_trainset()

# Build an algorithm, and train it.
algo = KNNBasic()
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x11473c290>

In [20]:
uid = str(100)  # raw user id (as in the ratings file). They are **strings**!

predictions = []

# get a prediction for specific users and items.
for movie in range(1, 9000):
    pred = algo.predict(uid, str(movie), r_ui=4, verbose=False)
    predictions.append(pred)

In [44]:
# Sort good results for top recommendation, gets impossible past 1683.
results = []

for prediction in predictions:
    if prediction[4] == True:
        continue
    else:
        results.append((prediction.est,prediction.iid,))
        
results.sort(reverse=True)

In [48]:
# Print out the top results
for count, topResult in enumerate(results[0:10]):
    print("#{}: Estimate: {}, Movie ID: {}".format(count + 1, topResult[0], topResult[1]))

#1: Estimate: 5, Movie ID: 814
#2: Estimate: 5, Movie ID: 1653
#3: Estimate: 5, Movie ID: 1599
#4: Estimate: 5, Movie ID: 1536
#5: Estimate: 5, Movie ID: 1293
#6: Estimate: 5, Movie ID: 1201
#7: Estimate: 5, Movie ID: 1189
#8: Estimate: 5, Movie ID: 1122
#9: Estimate: 4.999999999999999, Movie ID: 1500
#10: Estimate: 4.999999999999999, Movie ID: 1467


In [55]:
# Or I could use the packages method which I just found now of course
from collections import defaultdict

from surprise import SVD
from surprise import Dataset


def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


# First train an SVD algorithm on the movielens dataset.
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

# Than predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    if uid == 100:
        print(uid, [iid for (iid, _) in user_ratings])

# Evaluate your recommendation system using Precision and Recall at 10 and neighborhood size from 2 to 50.

## Here is with recall at 10 , neighborhood size 2

In [56]:
from collections import defaultdict

from surprise import Dataset
from surprise.model_selection import KFold


def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.'''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls


data = Dataset.load_builtin('ml-100k')
kf = KFold(n_splits=5)
algo = KNNBasic(k=2)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=4)

    # Precision and recall can then be averaged over all users
    print(sum(prec for prec in precisions.values()) / len(precisions))
    print(sum(rec for rec in recalls.values()) / len(recalls))

Computing the msd similarity matrix...
Done computing similarity matrix.
0.6955268490374873
0.4700381187105888
Computing the msd similarity matrix...
Done computing similarity matrix.
0.6974402442521468
0.46978343882498513
Computing the msd similarity matrix...
Done computing similarity matrix.
0.7058202465383312
0.4684097867860941
Computing the msd similarity matrix...
Done computing similarity matrix.
0.6889188618709896
0.48223750100610624
Computing the msd similarity matrix...
Done computing similarity matrix.
0.6848758388694488
0.4635279339983386


## Here is with recall at 10 and neighbors at 50

In [58]:
data = Dataset.load_builtin('ml-100k')
kf = KFold(n_splits=5)
algo = KNNBasic(k=50)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=4)

    # Precision and recall can then be averaged over all users
    print(sum(prec for prec in precisions.values()) / len(precisions))
    print(sum(rec for rec in recalls.values()) / len(recalls))

Computing the msd similarity matrix...
Done computing similarity matrix.
0.8398917058853291
0.3181896118169244
Computing the msd similarity matrix...
Done computing similarity matrix.
0.8275332862024268
0.3138428459632695
Computing the msd similarity matrix...
Done computing similarity matrix.
0.8383212139574805
0.33578373344271184
Computing the msd similarity matrix...
Done computing similarity matrix.
0.8467886693684564
0.3232230615263422
Computing the msd similarity matrix...
Done computing similarity matrix.
0.8298899673103491
0.32518367526955416


# Determine the top-10 similar movies to (Toy Story) and (Batman Forever).

## First we do Toy Story

In [59]:
import io  # needed because of weird encoding of u.item file

from surprise import KNNBaseline
from surprise import Dataset
from surprise import get_dataset_dir


def read_item_names():
    """Read the u.item file from MovieLens 100-k dataset and return two
    mappings to convert raw ids into movie names and movie names into raw ids.
    """

    file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid


# First, train the algortihm to compute the similarities between items
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

# Read the mappings raw id <-> movie name
rid_to_name, name_to_rid = read_item_names()

# Retrieve inner id of the movie Toy Story
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Retrieve inner ids of the nearest neighbors of Toy Story.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# Convert inner ids of the neighbors into names.
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid]
                       for rid in toy_story_neighbors)

print()
print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

The 10 nearest neighbors of Toy Story are:
Beauty and the Beast (1991)
Raiders of the Lost Ark (1981)
That Thing You Do! (1996)
Lion King, The (1994)
Craft, The (1996)
Liar Liar (1997)
Aladdin (1992)
Cool Hand Luke (1967)
Winnie the Pooh and the Blustery Day (1968)
Indiana Jones and the Last Crusade (1989)


## Then we do Batman Forever 

In [63]:
# Retrieve inner id of the movie Toy Story
batman_raw_id = name_to_rid['Batman Forever (1995)']
batman_inner_id = algo.trainset.to_inner_iid(batman_raw_id)

# Retrieve inner ids of the nearest neighbors of Toy Story.
batman_neighbors = algo.get_neighbors(batman_inner_id, k=10)

# Convert inner ids of the neighbors into names.
batman_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in batman_neighbors)
batman_neighbors = (rid_to_name[rid]
                       for rid in batman_neighbors)

print()
print('The 10 nearest neighbors of Batman Forever are:')
for movie in batman_neighbors:
    print(movie)


The 10 nearest neighbors of Batman Forever are:
Net, The (1995)
Cape Fear (1991)
Liar Liar (1997)
Mission: Impossible (1996)
Interview with the Vampire (1994)
Batman & Robin (1997)
Casper (1995)
Batman Returns (1992)
Conspiracy Theory (1997)
Junior (1994)
