In [9]:
import pandas as pd

from surprise import Reader, Dataset, KNNBasic, accuracy
from surprise.model_selection import train_test_split

dataset_dir = '/Users/daniellee/Desktop/University/Data Mining/goodbooks/data/ratings.csv'
book_data_dir = '/Users/daniellee/Desktop/University/Data Mining/goodbooks/data/books.csv'

# import as pandas to shorten the dataset
df = pd.read_csv(dataset_dir, header=None, names=['userID','itemID','rating'])

print("Data entries: %d" % df.shape[0])

# shorten the data to 5000 entries
df = df.iloc[:5000,:]
print("Reduce data entries to: %d" % df.shape[0])

df.head()

Data entries: 5976479
Reduce data entries to: 5000


Unnamed: 0,userID,itemID,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [10]:
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

data.split(n_folds=5)

sim_options = {'name': 'cosine', 'user_based': True}
algo = KNNBasic(sim_options=sim_options)

trainset, testset = train_test_split(data, test_size=.25)

algo.fit(trainset)

predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0629


1.062882409018608

In [11]:
predictions

[Prediction(uid=136, iid=1797, r_ui=3.0, est=4.0191955694800168, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid=171, iid=86, r_ui=2.0, est=3.6600636858885309, details={'actual_k': 3, 'was_impossible': False}),
 Prediction(uid=246, iid=1096, r_ui=3.0, est=4.0, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=212, iid=2052, r_ui=4.0, est=4.0146924466080343, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid=66, iid=7683, r_ui=5.0, est=3.8421333333333334, details={'was_impossible': True, 'reason': 'User and/or item is unkown.'}),
 Prediction(uid=206, iid=2145, r_ui=4.0, est=3.8421333333333334, details={'was_impossible': True, 'reason': 'User and/or item is unkown.'}),
 Prediction(uid=73, iid=3933, r_ui=3.0, est=3.8421333333333334, details={'was_impossible': True, 'reason': 'User and/or item is unkown.'}),
 Prediction(uid=66, iid=80, r_ui=5.0, est=4.6536555211168666, details={'actual_k': 15, 'was_impossible': False}),
 Prediction(uid=12

In [12]:
# get top 3 recommendations
from collections import defaultdict

def get_top_rec(predictions, top=3):
    top_recs = defaultdict(list)
    for uid, iid, r, est, _ in predictions:
        top_recs[uid].append((iid, est))
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_recs[uid] = user_ratings[:top]
        
    return top_recs

In [28]:
top_rec = get_top_rec(predictions, top=3)
top_rec

defaultdict(list,
            {1: [(1796, 3.8421333333333334),
              (5556, 3.8421333333333334),
              (2738, 3.8421333333333334)],
             2: [(4081, 3.8421333333333334),
              (33, 3.7142857142857144),
              (301, 2.0)],
             4: [(70, 4.7454035968410961),
              (87, 4.4325019204205702),
              (27, 4.1121386929322465)],
             8: [(1364, 5), (177, 4.0), (1432, 3.8421333333333334)],
             9: [(664, 5), (494, 4.3097174442703619), (3417, 4.0)],
             10: [(37, 4.6705862976247206),
              (19, 4.4007121550245181),
              (87, 4.3458074879502337)],
             11: [(258, 5),
              (115, 4.271181374206531),
              (574, 3.6710965711522556)],
             15: [(337, 4.4961284343969945),
              (101, 4.1561196907505407),
              (71, 4.0046062371796225)],
             18: [(289, 4.5026432663444957),
              (121, 4.2148220697259475),
              (11, 4.0380464960

In [14]:
# get books data to get book title names
book_df = pd.read_csv(book_data_dir)
book_df.head()
    

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [45]:
top_rec = get_top_rec(predictions, top=3)
book_rec_for_users = {}

# iterate over dictionary of recommendations
for key,books in top_rec.items():
        
    # list of books to recommend
    books_to_rec = []
    
    # iterate over all top 3 recommendations
    for book in books:
        # get book id for each recommendation
        book_id = book[0]
        book_rating = book[1]

        # find respective book title
        book_title = book_df.loc[book_df['book_id'] == book_id, 'original_title'].item()
        books_to_rec.append((book_title, book_rating))
        
    book_rec_for_users[key] = books_to_rec

book_rec_for_users

{1: [('Gilead', 3.8421333333333334),
  ('The Sea', 3.8421333333333334),
  ('Balzac et la Petite Tailleuse chinoise', 3.8421333333333334)],
 2: [(nan, 3.8421333333333334),
  ('Memoirs of a Geisha', 3.7142857142857144),
  ('Heart of Darkness', 2.0)],
 4: [("Ender's Game", 4.7454035968410961),
  ('Un di Velt Hot Geshvign', 4.4325019204205702),
  ('Harry Potter and the Half-Blood Prince', 4.1121386929322465)],
 8: [('Записки из подполья', 5),
  ('Преступление и наказание', 4.0),
  ('Manifest der Kommunistischen Partei', 3.8421333333333334)],
 9: [("Breakfast at Tiffany's", 5),
  ('The Secret History', 4.3097174442703619),
  ('Motherless Brooklyn', 4.0)],
 10: [('The Lion, the Witch and the Wardrobe', 4.6705862976247206),
  (' The Fellowship of the Ring', 4.4007121550245181),
  ('Un di Velt Hot Geshvign', 4.3458074879502337)],
 11: [('La sombra del viento', 5),
  ('Middlesex', 4.271181374206531),
  ('Kitchen Confidential: Adventures in the Culinary Underbelly',
   3.6710965711522556)],
 15: