# Book Recommendation System

#### Credit for the approach to building the sparse table goes to this article:
https://towardsdatascience.com/how-did-we-build-book-recommender-systems-in-an-hour-part-2-k-nearest-neighbors-and-matrix-c04b3c2ef55c

## Load Libraries and Pre-processing Data

In [1]:
import json
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import sklearn

In [2]:
data = []

# Parse at most 1,000,000 reviews; the file holds one JSON object per line.
# The context manager guarantees the file handle is closed, and the explicit
# encoding keeps the read independent of the platform default.
with open('reviews_Books_5.json', 'r', encoding='utf-8') as reviews_file:
    for line in reviews_file:
        if len(data) >= 1000000:
            # Stop as soon as the cap is reached instead of scanning the
            # remainder of the file without using it.
            break
        data.append(json.loads(line))

KeyboardInterrupt: 

In [None]:
# Skip header info: drop the first parsed record.
# NOTE(review): this assumes the first line of the file is a header rather
# than a regular review; if it is a review, that record is lost - confirm.
data = data[1:]

In [None]:
# Load the parsed JSON records into a pandas DataFrame.
df = pd.DataFrame(data)
# Preview the first rows (shown when this is the last expression of a
# notebook cell).
df.head()

In [None]:
# Save the DataFrame to CSV for easier loading in later cells.
# NOTE: index=False is not passed, so the row index is also written to the
# file as an unnamed first column.
df.to_csv("books.csv")

In [2]:
# Load the saved CSV for exploratory data analysis (EDA).
df = pd.read_csv('books.csv')

## Process the CSV file and create the DataFrame needed to make a pivot table

In [3]:
def top_reviewed_books (file, threshold):
    """Return the ratings of books that have more than `threshold` reviews.

    Parameters
    ----------
    file : string or file-like object
        CSV source passed to ``pandas.read_csv``; it must contain the
        columns 'asin', 'reviewerID' and 'overall'.
    threshold : number
        Only books whose review count is strictly greater than this value
        are kept.

    Returns
    -------
    pandas.DataFrame
        The 'asin', 'reviewerID' and 'overall' columns of the rows that
        belong to the most reviewed books, with the original row index.

    """
    df = pd.read_csv(file)
    rating_df = df[['asin', 'reviewerID', 'overall']]

    # Count the non-null ratings per book and broadcast that count back onto
    # every row. Using the counts as a boolean mask avoids writing a helper
    # column into a slice of `df` (the source of the SettingWithCopyWarning)
    # and replaces a per-row Python lookup with one vectorised operation.
    review_count = rating_df.groupby('asin')['overall'].transform('count')

    # .copy() hands the caller an independent frame that is safe to modify.
    return rating_df[review_count > threshold].copy()

# Keep only the ratings of books that have more than 359 reviews.
rating_pivot = top_reviewed_books('books.csv', 359)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


### Pivot Table

In [14]:
# Reshape to one row per book (asin) and one column per reviewer, with the
# rating ('overall') as the cell value; book/reviewer pairs that have no
# review become NaN.
rating_pivot_x = rating_pivot.pivot(index = 'asin', columns = 'reviewerID', values = 'overall')

In [15]:
# Replace the NaNs left by the pivot with 0, so 0 stands for "no rating".
rating_pivot_x = rating_pivot_x.fillna(0)

### Utility (Sparse) Matrix

In [16]:
# Convert the filled rating table into a SciPy compressed-sparse-row matrix;
# this is the matrix the KNN model is fitted on.
util_matrix = csr_matrix(rating_pivot_x.values)

### KNN Modeling

In [17]:
from sklearn.neighbors import NearestNeighbors

In [18]:
# Brute-force nearest-neighbour search using cosine distance between the
# rows (books) of the utility matrix.
model_knn = NearestNeighbors(metric='cosine', algorithm = 'brute')
model_knn.fit(util_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

### Setting up Random tests for Recommendations

In [19]:
# Pick a random row position of the pivot table (i.e. a random book) to use
# as the query.
query_index = np.random.choice(rating_pivot_x.shape[0])

In [20]:
# Ask for the 6 nearest neighbours of the query book's rating vector
# (reshaped to a single-row 2-D array, as kneighbors expects).
# The query row is part of the fitted data, so presumably it comes back as
# result 0; the printing code below treats result 0 as the query book and
# the remaining 5 as recommendations.
distance, indices = model_knn.kneighbors(rating_pivot_x.iloc[query_index, :].values.reshape(1,-1), n_neighbors=6)

### Finding the book title, since the data identifies books by ISBN-10

In [21]:
import requests
import re

def get_response(url, timeout=10):
    """Send a GET request to `url` and return the JSON-decoded response body.

    Parameters
    ----------
    url : string
        Address the GET request is sent to.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        a stalled connection would block the caller indefinitely.

    Returns
    -------
    object
        The decoded JSON payload (whatever ``json.loads`` produces for the
        response text).

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or the server answers with an HTTP
        error status.
    ValueError
        If the response body is not valid JSON.

    """
    response = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error rather than a confusing JSON decoding
    # error when the server returns an error page.
    response.raise_for_status()
    return json.loads(response.text)
                    
    
def get_title_of_book(text):
    """Collect the 'title' field of every entry in a decoded API response.

    Parameters
    ----------
    text : dict
        Mapping whose values are themselves mappings with a 'title' key.

    Returns
    -------
    list
        The titles, in the iteration order of `text`.

    """
    return [entry['title'] for entry in text.values()]
    
    
def form_url(ISBN=None):
    """Formulate a URL that will be sent to openlibrary.org/api.

    Parameters
    ----------
    ISBN : string
        A string of the book's ISBN.

    Returns
    -------
    string
        A URL that will be sent to openlibrary.org/api.

    Raises
    ------
    TypeError
        If `ISBN` is not a string (including the default ``None``).

    """
    # The None default would otherwise fail inside the concatenation with an
    # error that does not mention the ISBN; report the real problem instead.
    if not isinstance(ISBN, str):
        raise TypeError(
            'ISBN must be a string, got {0}'.format(type(ISBN).__name__))

    return ('https://openlibrary.org/api/books?bibkeys=ISBN:'
            + ISBN + '&jscmd=data&format=json')

def get_book_names():
    """Look up a title for every neighbour returned by the KNN query.

    Reads the module-level `indices` (neighbour row positions from
    ``kneighbors``) and `rating_pivot_x` (whose index holds the ISBNs), and
    queries Open Library once per neighbour.

    Returns
    -------
    list
        One entry per neighbour, in neighbour order: the list of titles
        returned for that ISBN, or the ISBN itself when no title was found.

    """
    book_names = []
    for row in indices.flatten():
        ISBN = rating_pivot_x.index[row]
        titles = get_title_of_book(get_response(form_url(ISBN)))
        # Accept any non-empty result. The previous `> 1` check discarded
        # every single-title answer, which is what a lookup for one ISBN
        # normally returns, so only ISBNs were ever reported.
        book_names.append(titles if titles else ISBN)
    return book_names

## Following up with KNN and Book Names

In [22]:
# Resolve each neighbour's row position to an ISBN and then to a title list,
# keeping the bare ISBN when Open Library returns no title.
neighbour_rows = indices.flatten()
neighbour_distances = distance.flatten()

book_names = []
for row in neighbour_rows:
    ISBN = rating_pivot_x.index[row]
    found_titles = get_title_of_book(get_response(form_url(ISBN)))
    book_names.append(found_titles if len(found_titles) > 0 else ISBN)

# Entry 0 is reported as the query book; the rest are listed with their
# cosine distance to it.
for rank, (label, gap) in enumerate(zip(book_names, neighbour_distances)):
    if rank == 0:
        print('Recommendations for {0}:\n'.format(label))
        continue
    print("{0}:{1} with distances of {2}".format(rank, label, gap))

Recommendations for ['We Need to Talk About Kevin']:

1:['Extremely Loud and Incredibly Close'] with distances of 0.9625451566142782
2:["The Time Traveler's Wife"] with distances of 0.9675885059661646
3:0143170090 with distances of 0.9689686505290422
4:['The sense of an ending'] with distances of 0.9704010005099658
5:['People of the Book'] with distances of 0.9705262575587029
