# Book Recommendation System

#### Credit for making a sparse table comes from this article:
https://towardsdatascience.com/how-did-we-build-book-recommender-systems-in-an-hour-part-2-k-nearest-neighbors-and-matrix-c04b3c2ef55c

## Load Libraries and Pre-processing Data

In [1]:
import json
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import sklearn

In [3]:
data = []

for line in open('reviews_Books_5.json','r'): 
    if len(data) < 1000000:
        data.append(json.loads(line))    

In [6]:
data[0]

{'reviewerID': 'A10000012B7CGYKOMPQ4L',
 'asin': '000100039X',
 'reviewerName': 'Adam',
 'helpful': [0, 0],
 'reviewText': 'Spiritually and mentally inspiring! A book that allows you to question your morals and will help you discover who you really are!',
 'overall': 5.0,
 'summary': 'Wonderful!',
 'unixReviewTime': 1355616000,
 'reviewTime': '12 16, 2012'}

In [None]:
# skip header info
data = data[1:]

In [7]:
# load json data into pandas Dataframe
df = pd.DataFrame(data)
df.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,000100039X,"[0, 0]",5.0,Spiritually and mentally inspiring! A book tha...,"12 16, 2012",A10000012B7CGYKOMPQ4L,Adam,Wonderful!,1355616000
1,000100039X,"[0, 2]",5.0,This is one my must have books. It is a master...,"12 11, 2003",A2S166WSCFIFP5,"adead_poet@hotmail.com ""adead_poet@hotmail.com""",close to god,1071100800
2,000100039X,"[0, 0]",5.0,This book provides a reflection that you can a...,"01 18, 2014",A1BM81XB4QHOA3,"Ahoro Blethends ""Seriously""",Must Read for Life Afficianados,1390003200
3,000100039X,"[0, 0]",5.0,I first read THE PROPHET in college back in th...,"09 27, 2011",A1MOSTXNIO5MPJ,Alan Krug,Timeless for every good and bad time in your l...,1317081600
4,000100039X,"[7, 9]",5.0,A timeless classic. It is a very demanding an...,"10 7, 2002",A2XQ5LZHTD4AFT,Alaturka,A Modern Rumi,1033948800


In [8]:
# save dataframe to csv for easier loading 
df.to_csv("books.csv")

In [9]:
# load data from csv for EDA
df = pd.read_csv('books.csv')

## Processes the CSV file and creates the DF needed to make a pivot table

In [36]:
def top_reviewed_books (file, threshold):
    # function loads the csv file to use, drops columns, finds reviews that are above a threshold count
    # returns a dataframe with only review informations for the most reviewed books
    df = pd.read_csv(file)
    rating_df = df[['asin', 'reviewerID', 'overall']]
    pop_rating = rating_df.groupby(['asin']).count().rename(columns = {'overall':'review_count'})
    rating_df['review_count'] = rating_df['asin'].apply(lambda x: pop_rating['review_count'][x])
    good_rating_df = rating_df[rating_df['review_count']>threshold].copy()
    good_rating_df.drop(columns = ['review_count'], inplace=True)
    
    return good_rating_df

rating_pivot = top_reviewed_books('books.csv', 359)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


### Pivot Table

In [37]:
rating_pivot_x = rating_pivot.pivot(index = 'asin', columns = 'reviewerID', values = 'overall')

In [19]:
import sys
sys.getsizeof(rating_pivot_x)

303185676

In [38]:
rating_pivot_x = rating_pivot_x.fillna(0)

### Utility (Sparse) Matrix

In [16]:
util_matrix = csr_matrix(rating_pivot_x.values)

### KNN Modeling

In [27]:
from sklearn.neighbors import NearestNeighbors

In [28]:
model_knn = NearestNeighbors(metric='cosine', algorithm = 'brute')
model_knn.fit(util_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

### Setting up Random tests for Recommendations

In [30]:
query_index = np.random.choice(util_matrix.shape[0])

In [49]:
distance, indices = model_knn.kneighbors(util_matrix.toarray()[query_index, :].reshape(1,-1), n_neighbors=6)

### Finding the Book name as data is in ISBN -10

In [50]:
import requests
import re

def get_response(url):
    response = requests.get(url)
    # print('Acquired following info about the book {}'.format(response.text))
    json_data = json.loads(response.text)
    return json_data                              
                    
    
def get_title_of_book(text):
    title_ =[]
    for x in text.keys():
        # print(x)
        title_.append(text[x]['title'])
    return title_
    
    
def form_url(ISBN=None):
    """Formulate an url that will be sent to openlibrary.org/api'.

    Parameters
    ----------
    ISBN : string
        a string of book's ISBN.

    Returns
    -------
    string
        An url that will be sent to openlibrary/org/api.

    """
    url = 'https://openlibrary.org/api/books?bibkeys=ISBN:' + \
        ISBN + '&jscmd=data&format=json'

    return url

def get_book_names():
    book_names = []
    for i in range(len(distance.flatten())):
        ISBN = rating_pivot_x.index[indices.flatten()[i]]
        url = form_url(ISBN)
        text = get_response(url)
        name_of_book = get_title_of_book(text)
        if len(name_of_book) > 1:
            book_names.append(name_of_book)
        else:
            book_names.append(ISBN)
    return(book_names)

## Following up with KNN and Book Names

In [51]:
book_names = []

for i in range(len(distance.flatten())):
    ISBN = rating_pivot_x.index[indices.flatten()[i]]
    url = form_url(ISBN)
    text = get_response(url)
    name_of_book = get_title_of_book(text)
    if len(name_of_book) > 0:
        book_names.append(name_of_book)
    else:
        book_names.append(ISBN)

for i in range(len(distance.flatten())):
    if i==0:
        print('Recommendations for {0}:\n'.format(book_names[i]))
    else:
        print("{0}:{1} with distances of {2}".format(i, book_names[i], 
                                                     distance.flatten()[i]))

Recommendations for ['Marley & Me']:

1:["The Memory Keeper's Daughter"] with distances of 0.9387498300776762
2:['Night'] with distances of 0.9437722045086124
3:['Water for Elephants'] with distances of 0.9485648623090394
4:['The curious incident of the dog in the night-time'] with distances of 0.9509469360942313
5:['The art of racing in the rain'] with distances of 0.9559398303278569
