In [2]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt
from surprise import Dataset, KNNBasic, evaluate, Reader, NormalPredictor
from surprise.model_selection import cross_validate, train_test_split

In [3]:
# df_to_read = pd.read_csv('data/to_read.csv')
# df_tag_names = pd.read_csv('data/tags.csv')
# df_tags = pd.read_csv('data/book_tags.csv')
# df_books = pd.read_csv('data/books.csv', usecols=['book_id', 'original_title'])
df = pd.read_csv('data/ratings.csv')
# df_books.set_index('book_id', inplace=True)
# df_books.head(5)

In [12]:
# use a smaller sample df
small_df = df.sample(1000)
small_df.head(5)

Unnamed: 0,user_id,book_id,rating
2628783,29206,218,5
4962031,42984,8645,5
17041,669,1111,2
709148,11810,916,4
243371,3832,804,3


In [13]:
reader = Reader(rating_scale=(1,5)) # rating scale 1 to 5
data = Dataset.load_from_df(small_df[['user_id', 'book_id', 'rating']], reader) #user id, item id and ratings (in that order)

In [14]:
sim_options = {'name': 'cosine', 'user-based': False}
knn = KNNBasic(sim_options=sim_options)

In [44]:
# cross_validate(knn, data, cv=5, verbose=True)

In [15]:
trainingSet = data.build_full_trainset()
# trainingSet, testSet = train_test_split(data, test_size=.25)

In [16]:
knn.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fd41d85f2b0>

In [17]:
knn.predict(2877, 111, 4) # predict example

Prediction(uid=2877, iid=111, r_ui=4, est=3.951, details={'was_impossible': True, 'reason': 'User and/or item is unkown.'})

In [18]:
testSet = trainingSet.build_anti_testset()
predictions = knn.test(testSet)

In [19]:
predictions

[Prediction(uid=29206, iid=8645, r_ui=3.951, est=3.951, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=29206, iid=1111, r_ui=3.951, est=3.951, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=29206, iid=916, r_ui=3.951, est=3.951, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=29206, iid=804, r_ui=3.951, est=3.951, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=29206, iid=15, r_ui=3.951, est=3.951, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=29206, iid=950, r_ui=3.951, est=3.951, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=29206, iid=1069, r_ui=3.951, est=3.951, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=29206, iid=2513, r_ui=3.951, est=3.951, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}

In [20]:
# get top 3 recommendations
from collections import defaultdict

def get_top_rec(predictions, top=3):
    top_recs = defaultdict(list)
    for uid, iid, r, est, _ in predictions:
        top_recs[uid].append((iid, est))
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_recs[uid] = user_ratings[:top]
        
    return top_recs

In [21]:
top_rec = get_top_rec(predictions) # get top rec
top_rec

defaultdict(list,
            {82: [(218, 3.951), (8645, 3.951), (1111, 3.951)],
             84: [(218, 3.951), (8645, 3.951), (1111, 3.951)],
             139: [(218, 3.951), (8645, 3.951), (1111, 3.951)],
             189: [(218, 3.951), (8645, 3.951), (1111, 3.951)],
             289: [(218, 3.951), (8645, 3.951), (1111, 3.951)],
             316: [(218, 3.951), (8645, 3.951), (1111, 3.951)],
             387: [(218, 3.951), (8645, 3.951), (1111, 3.951)],
             429: [(218, 3.951), (8645, 3.951), (1111, 3.951)],
             479: [(218, 3.951), (8645, 3.951), (1111, 3.951)],
             562: [(218, 3.951), (8645, 3.951), (1111, 3.951)],
             600: [(218, 3.951), (8645, 3.951), (1111, 3.951)],
             669: [(218, 3.951), (8645, 3.951), (916, 3.951)],
             682: [(218, 3.951), (8645, 3.951), (1111, 3.951)],
             697: [(218, 3.951), (8645, 3.951), (1111, 3.951)],
             701: [(218, 3.951), (8645, 3.951), (1111, 3.951)],
             703: [(218, 

In [24]:
top_rec[82] # get user's rec

[(218, 3.951), (8645, 3.951), (1111, 3.951)]

In [49]:
def get_book_name():
    
    return name

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3
5,2,26,4
6,2,315,3
7,2,33,4
8,2,301,5
9,2,2686,5


In [61]:
for uid, user_ratings in top_rec.items():
    print(uid, [rid_to_name[iid] for (iid, _) in user_ratings])

53424

10000

5976479

(5976479, 3)