# Book Recommendation System

#### Credit for making a sparse table comes from this article:
https://towardsdatascience.com/how-did-we-build-book-recommender-systems-in-an-hour-part-2-k-nearest-neighbors-and-matrix-c04b3c2ef55c

## Load Libraries and Pre-processing Data

In [2]:
import json
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import sklearn

In [None]:
data = []

for line in open('reviews_Books_5.json','r'): 
    if len(data) < 1000000:
        data.append(json.loads(line))  

In [None]:
# skip header info
data = data[1:]

In [None]:
# load json data into pandas Dataframe
df = pd.DataFrame(data)
df.head()

In [None]:
# save dataframe to csv for easier loading 
df.to_csv("books.csv")

In [3]:
# load data from csv for EDA
df = pd.read_csv('books.csv')

## Processes the CSV file and creates the DF needed to make a pivot table

In [3]:
def top_reviewed_books (file, threshold):
    # function loads the csv file to use, drops columns, finds reviews that are above a threshold count
    # returns a dataframe with only review informations for the most reviewed books
    df = pd.read_csv(file)
    rating_df = df[['asin', 'reviewerID', 'overall']].copy()
    pop_rating = rating_df.groupby(['asin']).count().rename(columns = {'overall':'review_count'})
    rating_df['review_count'] = rating_df['asin'].apply(lambda x: pop_rating['review_count'][x])
    good_rating_df = rating_df[rating_df['review_count']>threshold].copy()
    good_rating_df.drop(columns = ['review_count'], inplace=True)
    
    return good_rating_df

## Generate Training and Testing Data

In [4]:
rating_pivot = top_reviewed_books('books.csv', 359)

In [5]:
# only use if index must be int and cannot be strings
#rating_pivot['asin'] = rating_pivot['asin'].apply(lambda x: x.replace('X', '10'))

In [6]:
rating_pivot.head()

Unnamed: 0,asin,reviewerID,overall
314,2007770,A35VAQCQ4U28V7,2.0
315,2007770,APBA7Y4SINS0H,1.0
316,2007770,A3FVQCS5Q09W9N,5.0
317,2007770,A22019GDU7OVFG,5.0
318,2007770,A1X3ASRHM5ZF8K,3.0


## Here is where we manually setting the train and test splits

In [7]:
len(rating_pivot) * .33

71482.29000000001

#### Chose and then store random index values to be target values

In [8]:
test_index = np.random.randint(len(rating_pivot), size=int(len(rating_pivot) * .33))

In [10]:
test_df = rating_pivot.iloc[test_index]

In [11]:
test_values = rating_pivot.iloc[test_index]['overall']

In [12]:
test_values.head()

225000    4.0
912943    4.0
306791    5.0
21282     2.0
938663    5.0
Name: overall, dtype: float64

In [13]:
len(test_values)

71482

#### Once the target index is found,  set, and stored, we change those rating values to 0 in order to use the utility matrix to predict values

In [108]:
rating_pivot.iloc[test_index]['overall'].apply(lambda x: 0);

### Pivot Table

In [15]:
rating_pivot_x = rating_pivot.pivot(index = 'asin', columns = 'reviewerID', values = 'overall')

In [16]:
rating_pivot_x = rating_pivot_x.fillna(0)

### Utility (Sparse) Matrix

In [18]:
util_matrix = csr_matrix(rating_pivot_x.values)

### KNN Modeling

In [19]:
from sklearn.neighbors import NearestNeighbors

In [20]:
model_knn = NearestNeighbors(metric='cosine', algorithm = 'brute')
model_knn.fit(util_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

### Using Testing Data to determine K-5 neighbors averages

In [22]:
# FYI  rating_pivot_x uses .loc NOT iloc 
  

### Here is where we find K_5 neighbors, average any reviews greater than zero, mean those 5 scores, and return that as the predictive value

In [110]:
mean_ratings = []
    
for index in test_df.values[:100]:
       
    distance, indices = model_knn.kneighbors(rating_pivot_x.loc[index[0], :].values.reshape(1,-1), n_neighbors=5)
    
    knn_of_ratings = []
    for x in indices[0]:
        
        knn_of_ratings.append(sum(rating_pivot_x.iloc[x, :][rating_pivot_x.iloc[x, :].values >0].values)\
                              /len(rating_pivot_x.iloc[x, :][rating_pivot_x.iloc[x, :].values >0].values))
       
    mean = sum(knn_of_ratings)/len(knn_of_ratings)
    mean_ratings.append((index, mean))
       


### Here is where I manually find the RMSE from target and predicted ratings

In [111]:
import math 

mean_sq = []
for x in mean_ratings:
    test_pred = round(x[1], 2)
    target = test_df[(test_df['asin']==x[0][0]) & (test_df['reviewerID']==x[0][1])]['overall'].values
    target = sum(target)/len(target)
    mean_sq_e = (target - test_pred)**2
    mean_sq.append(mean_sq_e)
    
print("RMSE is ", math.sqrt(sum(mean_sq)/len(mean_ratings)))

RMSE is  1.1763587888055242


### Setting up Random tests for Recommendations

In [65]:
query_index = np.random.choice(rating_pivot_x.shape[0])

In [20]:
distance, indices = model_knn.kneighbors(rating_pivot_x.iloc[query_index, :].values.reshape(1,-1), n_neighbors=6)

### Finding the Book name as data is in ISBN -10

In [21]:
import requests
import re

def get_response(url):
    response = requests.get(url)
    # print('Acquired following info about the book {}'.format(response.text))
    json_data = json.loads(response.text)
    return json_data                              
                    
    
def get_title_of_book(text):
    title_ =[]
    for x in text.keys():
        # print(x)
        title_.append(text[x]['title'])
    return title_
    
    
def form_url(ISBN=None):
    """Formulate an url that will be sent to openlibrary.org/api'.

    Parameters
    ----------
    ISBN : string
        a string of book's ISBN.

    Returns
    -------
    string
        An url that will be sent to openlibrary/org/api.

    """
    url = 'https://openlibrary.org/api/books?bibkeys=ISBN:' + \
        ISBN + '&jscmd=data&format=json'

    return url

def get_book_names():
    book_names = []
    for i in range(len(distance.flatten())):
        ISBN = rating_pivot_x.index[indices.flatten()[i]]
        url = form_url(ISBN)
        text = get_response(url)
        name_of_book = get_title_of_book(text)
        if len(name_of_book) > 1:
            book_names.append(name_of_book)
        else:
            book_names.append(ISBN)
    return(book_names)

## Following up with KNN and Book Names

In [22]:
book_names = []

for i in range(len(distance.flatten())):
    ISBN = rating_pivot_x.index[indices.flatten()[i]]
    url = form_url(ISBN)
    text = get_response(url)
    name_of_book = get_title_of_book(text)
    if len(name_of_book) > 0:
        book_names.append(name_of_book)
    else:
        book_names.append(ISBN)

for i in range(len(distance.flatten())):
    if i==0:
        print('Recommendations for {0}:\n'.format(book_names[i]))
    else:
        print("{0}:{1} with distances of {2}".format(i, book_names[i], 
                                                     distance.flatten()[i]))

Recommendations for 0142419400:

1:['Unearthly'] with distances of 0.7954196113339824
2:014242899X with distances of 0.8041996810448896
3:['Delirium'] with distances of 0.831780796243394
4:['Shatter me'] with distances of 0.8459274032182215
5:['Before I Fall'] with distances of 0.8559573456414674
