In [1]:
from surprise import SVD, NMF, Dataset, Reader, accuracy, KNNBaseline
from surprise.model_selection import cross_validate, train_test_split
import pandas as pd
import numpy as np
from main import get_top_n

In [2]:
df = pd.read_csv('ratings.csv')
df.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [3]:
df.shape

(5976479, 3)

In [4]:
df.rating.describe()

count    5.976479e+06
mean     3.919866e+00
std      9.910868e-01
min      1.000000e+00
25%      3.000000e+00
50%      4.000000e+00
75%      5.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [5]:
reader = Reader(rating_scale=(1.0,5.0))

In [6]:
data = Dataset.load_from_df(df[['user_id', 'book_id', 'rating']], reader)

In [7]:
trainset, testset = train_test_split(data, test_size=0.25)

In [8]:
nmf = NMF()

In [9]:
nmf.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x1e180017978>

In [10]:
predictions = nmf.test(testset)

## What is the RMSE for the Training set?

In [11]:
accuracy.rmse(predictions)

RMSE: 0.8649


0.8648916225567647

In [12]:
predictions[:5]

[Prediction(uid=36664, iid=129, r_ui=5.0, est=4.603201603919418, details={'was_impossible': False}),
 Prediction(uid=11851, iid=2412, r_ui=3.0, est=3.6890847672954195, details={'was_impossible': False}),
 Prediction(uid=33099, iid=2594, r_ui=5.0, est=4.459455124311024, details={'was_impossible': False}),
 Prediction(uid=45519, iid=1905, r_ui=4.0, est=4.719946326559952, details={'was_impossible': False}),
 Prediction(uid=52717, iid=4875, r_ui=4.0, est=3.7929859486716504, details={'was_impossible': False})]

## Which book had the most ratings? The fewest ratings? (The actual name not the ID)

In [41]:
high_id = df.groupby('book_id').count().sort_values(by = 'rating', ascending = False)[:5].index.tolist()
high_id

[1, 2, 4, 3, 5]

In [29]:
low_id = df.groupby('book_id').count().sort_values(by = 'rating', ascending = True)[:5].index.tolist()
low_id

[7803, 9345, 9486, 1935, 9315]

In [37]:
df_book = pd.read_csv('books.csv')
df_book[df_book.book_id.isin(low_id)]['title']

1934                       Kindle Paperwhite User's Guide
7802                                  Kindle User's Guide
9314                                     The King's Agent
9344                                      Diary ng Panget
9485    Fifty Shades Duo: Fifty Shades Darker / Fifty ...
Name: title, dtype: object

In [42]:
df_book[df_book.book_id.isin(high_id)]['title']

0              The Hunger Games (The Hunger Games, #1)
1    Harry Potter and the Sorcerer's Stone (Harry P...
2                              Twilight (Twilight, #1)
3                                To Kill a Mockingbird
4                                     The Great Gatsby
Name: title, dtype: object

## average number of books read across all users?

In [46]:
df.groupby('user_id').count().mean()[0]

111.86880428271938

## How many books were published between 2000 and 2010?

In [52]:
df_book[(df_book.original_publication_year >= 2000) & (df_book.original_publication_year <= 2010)].shape[0]

3594

## What are the top 10 most similar books to "The Great Gatsby"? You will have to use a KNN-based model to answer this? Print out the actual book names, not their IDs.

In [53]:
knn = KNNBaseline(sim_options={'name':'cosine', 'user_based':False})
knn.fit(trainset)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x1e22fbcd438>

In [60]:
df_book[df_book.book_id.isin(knn.get_neighbors(int(df_book[df_book.title == 'The Great Gatsby']['book_id']), 10))]['title']

12                                                  1984
21                                      The Lovely Bones
26     Harry Potter and the Half-Blood Prince (Harry ...
63                                    My Sister's Keeper
82                                  A Tale of Two Cities
99                                  The Poisonwood Bible
106                                   A Walk to Remember
123                                                 Room
135               Divine Secrets of the Ya-Ya Sisterhood
162            The Lost Hero (The Heroes of Olympus, #1)
Name: title, dtype: object

## What are the top 5 books you would recommend to User #37? (The actual book names, not IDs)

In [88]:
top_n = get_top_n(predictions, n = 5)
for uid, ur in top_n.items():
    if uid == 37:
        print(uid, [df_book[df_book.book_id == iid]['title'].values[0] for iid,_ in ur])

37 ['number9dream', 'Shantaram', 'The Martian', 'Shutter Island', 'Turn Right at Machu Picchu: Rediscovering the Lost City One Step at a Time']
