In [1]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt
from surprise import Dataset, KNNBasic, evaluate, Reader, NormalPredictor
from surprise.model_selection import cross_validate, train_test_split

In [2]:
df = pd.read_csv('data/cleaned_ratings.csv', usecols=['user_id', 'book_id', 'rating'])
print(df.shape)
df.head()

(3421894, 3)


Unnamed: 0,user_id,book_id,rating
0,1,437,4
1,1,143,5
2,1,421,5
3,1,3294,5
4,1,262,3


In [3]:
# # use a smaller sample df
# small_df = df.sample(1000)
# small_df.head(5)
df1 = df.iloc[:30000,:]
print(df1.shape)
df1.head()

(30000, 3)


Unnamed: 0,user_id,book_id,rating
0,1,437,4
1,1,143,5
2,1,421,5
3,1,3294,5
4,1,262,3


In [4]:
reader = Reader(rating_scale=(1,5)) # rating scale 1 to 5
data = Dataset.load_from_df(df1[['user_id', 'book_id', 'rating']], reader) #user id, item id and ratings (in that order)

In [5]:
sim_options = {'name': 'cosine', 'user-based': False}
knn = KNNBasic(sim_options=sim_options)

In [6]:
# trainingSet, testSet = train_test_split(data, test_size=.99)
trainingSet = data.build_full_trainset()
knn.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7efcf81c8d68>

In [7]:
testSet = trainingSet.build_anti_testset()
predictions = knn.test(testSet)
predictions

[Prediction(uid=1, iid=8855, r_ui=3.7110666666666665, est=4.336774245175747, details={'was_impossible': False, 'actual_k': 3}),
 Prediction(uid=1, iid=932, r_ui=3.7110666666666665, est=4.381288541523024, details={'was_impossible': False, 'actual_k': 8}),
 Prediction(uid=1, iid=630, r_ui=3.7110666666666665, est=3.9305937531497013, details={'was_impossible': False, 'actual_k': 31}),
 Prediction(uid=1, iid=3122, r_ui=3.7110666666666665, est=4.325475080716344, details={'was_impossible': False, 'actual_k': 3}),
 Prediction(uid=1, iid=7233, r_ui=3.7110666666666665, est=3.999828000296919, details={'was_impossible': False, 'actual_k': 4}),
 Prediction(uid=1, iid=790, r_ui=3.7110666666666665, est=3.820325137109933, details={'was_impossible': False, 'actual_k': 6}),
 Prediction(uid=1, iid=8014, r_ui=3.7110666666666665, est=5, details={'was_impossible': False, 'actual_k': 1}),
 Prediction(uid=1, iid=6384, r_ui=3.7110666666666665, est=3.677030150204276, details={'was_impossible': False, 'actual_k'

In [8]:
# get top 3 recommendations
from collections import defaultdict

def get_top_rec(predictions, top=3):
    top_recs = defaultdict(list)
    for uid, iid, r, est, _ in predictions:
        top_recs[uid].append((iid, est))
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_recs[uid] = user_ratings[:top]
        
    return top_recs

In [9]:
top_rec = get_top_rec(predictions) # get top rec
top_rec

defaultdict(list,
            {1: [(8014, 5), (2686, 5), (1446, 5)],
             2: [(8492, 5), (6745, 5), (1237, 5)],
             3: [(7233, 5), (3545, 5), (5369, 5)],
             4: [(3294, 5), (1310, 5), (1180, 5)],
             5: [(8492, 5), (2241, 5), (2693, 5)],
             6: [(8014, 5), (1446, 5), (7402, 5)],
             7: [(6745, 5), (7591, 5), (4947, 5)],
             8: [(128, 5), (1180, 5), (8519, 5)],
             9: [(8464, 5), (4947, 5), (4344, 5)],
             10: [(8014, 5), (1446, 5), (9296, 5)],
             11: [(3122, 5), (2139, 5), (4947, 5)],
             12: [(2139, 5), (3378, 5), (7591, 5)],
             13: [(4947, 5), (7891, 5), (6404, 5)],
             14: [(8014, 5), (1446, 5), (9296, 5)],
             15: [(2139, 5), (3691, 5), (4344, 5)],
             16: [(1180, 5), (920, 5), (3469, 5)],
             17: [(8014, 5), (2686, 5), (1446, 5)],
             18: [(3469, 5), (3378, 5), (4947, 5)],
             19: [(8855, 5), (8014, 5), (1446, 5)],
     

In [25]:
df_books = pd.read_csv('data/cleaned_books_data.csv', usecols=['book_id', 'original_title'])

In [64]:
df11 = pd.read_csv('data/books.csv')
df11.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [26]:
df_books.set_index('book_id', inplace=True)
df_books.head()

Unnamed: 0_level_0,original_title
book_id,Unnamed: 1_level_1
1,The Hunger Games
2,Harry Potter and the Philosopher's Stone
3,Twilight
4,To Kill a Mockingbird
5,The Great Gatsby


In [59]:
df_books.shape

(5764, 1)

In [53]:
def name(toprec):
    names = {}
    for rec in range(3):
        names[rec] = df_books['original_title'].loc[toprec[rec][0]]
    return names
print(name(top_rec[3])) # recommend to user number 3

{0: 'How to Talk So Teens Will Listen and Listen So Teens Will Talk', 1: 'Gulp: Adventures on the Alimentary Canal', 2: 'Erinnerungen, Träume, Gedanken von C.G. Jung'}


In [60]:
df_b = df_books.reset_index()
df_b

Unnamed: 0,book_id,original_title
0,1,The Hunger Games
1,2,Harry Potter and the Philosopher's Stone
2,3,Twilight
3,4,To Kill a Mockingbird
4,5,The Great Gatsby
5,6,The Fault in Our Stars
6,7,The Hobbit or There and Back Again
7,8,The Catcher in the Rye
8,9,Angels & Demons
9,10,Pride and Prejudice


In [54]:
top_rec[1] # get user's rec

[(8014, 5), (2686, 5), (1446, 5)]

In [58]:
df_books['original_title'].iloc[8014]

IndexError: single positional indexer is out-of-bounds

In [62]:
df.loc[df['book_id'] == 8014]

Unnamed: 0,user_id,book_id,rating
41,2,8014,5
284451,4966,8014,2
313206,5408,8014,4
342400,5882,8014,4
401080,6803,8014,2
667661,11090,8014,3
686399,11381,8014,4
708604,11730,8014,5
765816,12599,8014,4
829055,13549,8014,5
