In [80]:
%matplotlib inline
from surprise import SVD, NMF, Dataset, Reader, accuracy, KNNBaseline
from surprise.model_selection import cross_validate, train_test_split
import pandas as pd
import numpy as np
from main import get_top_n


In [2]:
ratings = pd.read_csv( 'ratings.csv' )
to_read = pd.read_csv( 'to_read.csv' )
books = pd.read_csv( 'books.csv' )

tags = pd.read_csv( 'tags.csv' )
booktags = pd.read_csv( 'book_tags.csv')

In [3]:
ratings.head(3)

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5


In [11]:
ratings.rating.describe()

count    5.976479e+06
mean     3.919866e+00
std      9.910868e-01
min      1.000000e+00
25%      3.000000e+00
50%      4.000000e+00
75%      5.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [12]:
to_read.head(3)

Unnamed: 0,user_id,book_id
0,9,8
1,15,398
2,15,275


In [13]:
books.head(3)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...


In [14]:
#Rating scale
reader = Reader(rating_scale=(1, 5.0))

In [16]:
data = Dataset.load_from_df(ratings[['user_id', 'book_id', 'rating']], reader)

In [17]:
trainset, testset = train_test_split(data, test_size=0.30)

In [18]:
nmf = NMF()

In [20]:
nmf.fit(trainset)

In [22]:
predictions = nmf.test(testset)

In [23]:
accuracy.rmse(predictions)

RMSE: 0.8661


0.8660511320164065

In [26]:
ratings.book_id.value_counts().max()

22806

# Book Reviews

In [57]:
#book_ratings_inner = pd.merge(ratings, books, on='book_id', how='inner')

#book_ratings_inner.head(4)

In [63]:
reviews_per_book = ratings.groupby('book_id').book_id.apply(lambda x: len( x ))


In [68]:
#Fewest ratings
sort_df = reviews_per_book.sort_values()
#sort_df.columns.values[1]='bkcount'
sort_df.head(1)
#book_fewest_rt


book_id
7803    8
Name: book_id, dtype: int64

In [69]:
#Most ratings
sort_df.sort_values().tail(1)

book_id
1    22806
Name: book_id, dtype: int64

# User Reviews

In [73]:
reviews_per_user = ratings.groupby('user_id').user_id.apply( lambda x: len( x ))
reviews_per_user.head()


user_id
1    117
2     65
3     91
4    134
5    100
Name: user_id, dtype: int64

In [74]:
reviews_per_user.mean()

111.86880428271938

In [78]:
# Count of books between 2000 and 2010
df = books[(books.original_publication_year >= 2000) & (books.original_publication_year <= 2010)]
df.original_publication_year.count()

3594

In [87]:
#top 5 books you would recommend for user 37
top_n = get_top_n(predictions, n=5)
for uid, ur in top_n.items():
    if uid == 37:
        print(uid, [books[books.book_id == iid].original_title for iid, _ in ur])
        break

37 [1471    An Ember in the Ashes
Name: original_title, dtype: object, 134    A Storm of Swords
Name: original_title, dtype: object, 2124    The Book of Three
Name: original_title, dtype: object, 2229    The Black Cauldron
Name: original_title, dtype: object, 16    Catching Fire
Name: original_title, dtype: object]


In [89]:
#Top 10 most similar books to "The Great Gatsby"
knn = KNNBaseline(sim_options={'name': 'cosine', 'user_based':False})

In [90]:
knn.fit(trainset)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x18335eaada0>

In [92]:
lst = knn.get_neighbors(5, 10)

In [95]:
for id in lst:
    print(books[books.book_id == id].original_title)

8    Angels & Demons 
Name: original_title, dtype: object
10    The Kite Runner 
Name: original_title, dtype: object
29    Gone Girl
Name: original_title, dtype: object
37    The Time Traveler's Wife
Name: original_title, dtype: object
74    NaN
Name: original_title, dtype: object
149    The Red Tent
Name: original_title, dtype: object
151    Dear John
Name: original_title, dtype: object
192    Outliers: The Story of Success
Name: original_title, dtype: object
195    Fight Club
Name: original_title, dtype: object
208    The Silence of the Lambs
Name: original_title, dtype: object
