# Book Recommendation System

#### Credit for making a sparse table comes from this article:
https://towardsdatascience.com/how-did-we-build-book-recommender-systems-in-an-hour-part-2-k-nearest-neighbors-and-matrix-c04b3c2ef55c

## Load Libraries and pre-process Data

In [4]:
import json
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import sklearn

In [2]:
def load_json_data(file):
    data = []
    for line in open(file,'r'): 
    if len(data) < 1000000:
        data.append(json.loads(line))    
    return data[1:]

def make_df_and_csv_from_json(data):
    df = pd.DataFrame(data)
    rating_df = df[['asin', 'reviewerID', 'overall']]
    rating_df.to_csv("books_ratings.csv")
    return df


data = load_json_data('reviews_Books_5.json')
df = pd.make_df_and_csv_from_json(data)

KeyboardInterrupt: 

In [2]:
df = pd.read_csv('books.csv')

In [3]:
rating_df = df[['asin', 'reviewerID', 'overall']]
rating_df.to_csv("books_ratings.csv")

## Processes the CSV file and creates the DF needed to make a pivot table

In [5]:
def top_reviewed_books (file, threshold):
    # function loads the csv file to use, drops columns, finds reviews that are above a threshold count
    # returns a dataframe with only review informations for the most reviewed books
    rating_df = pd.read_csv(file)
    pop_rating = rating_df.groupby(['asin']).count().rename(columns = {'overall':'review_count'})
    rating_df['review_count'] = rating_df['asin'].apply(lambda x: pop_rating['review_count'][x])
    good_rating_df = rating_df[rating_df['review_count']>threshold].copy()
    good_rating_df.drop(columns = ['review_count', 'Unnamed: 0'], inplace=True)
    
    return good_rating_df

rating_pivot = top_reviewed_books('books_ratings.csv', 359)

In [6]:
rating_pivot.head()

Unnamed: 0,asin,reviewerID,overall
314,2007770,A35VAQCQ4U28V7,2.0
315,2007770,APBA7Y4SINS0H,1.0
316,2007770,A3FVQCS5Q09W9N,5.0
317,2007770,A22019GDU7OVFG,5.0
318,2007770,A1X3ASRHM5ZF8K,3.0


In [7]:
target= rating_pivot['overall']

In [8]:
X_ = rating_pivot.drop(columns = ['overall'])

In [9]:
X_.head()

Unnamed: 0,asin,reviewerID
314,2007770,A35VAQCQ4U28V7
315,2007770,APBA7Y4SINS0H
316,2007770,A3FVQCS5Q09W9N
317,2007770,A22019GDU7OVFG
318,2007770,A1X3ASRHM5ZF8K


In [10]:
unique_ids={}
for no, rev in enumerate((X_['reviewerID'].unique())):
    unique_ids[rev] = no

In [11]:
X_['reviews'] = X_['reviewerID'].map(unique_ids)

In [12]:
X_.head()

Unnamed: 0,asin,reviewerID,reviews
314,2007770,A35VAQCQ4U28V7,0
315,2007770,APBA7Y4SINS0H,1
316,2007770,A3FVQCS5Q09W9N,2
317,2007770,A22019GDU7OVFG,3
318,2007770,A1X3ASRHM5ZF8K,4


In [13]:
X_.drop(columns=['reviewerID'], inplace=True)

In [14]:
find_X = lambda x: x.replace('X', '10')

In [15]:
X_['asin'] = X_['asin'].apply(lambda x: find_X(x) if 'X' in x else x)

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_, target, test_size=0.33, random_state=42)

In [17]:
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

In [18]:
model_knn = KNeighborsClassifier(n_neighbors = 4, metric='cosine', algorithm='brute')
model_knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='cosine',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='uniform')

In [1]:
 pred_reviews = model_knn.predict(X_test)

NameError: name 'model_knn' is not defined

In [None]:
model_knn.score(X_test, y_test)

###neighbors = 5, metric mink
model_knn.score(X_test, y_test)
0.47

### Pivot Table

In [6]:
rating_pivot_x = rating_pivot.pivot(index = 'asin', columns = 'reviewerID', values = 'overall')

In [5]:
rating_pivot_x = rating_pivot_x.fillna(0)

### Utility (Sparse) Matrix

In [6]:
util_matrix = csr_matrix(rating_pivot_x.values)

### KNN Modeling

In [7]:
from sklearn.neighbors import NearestNeighbors

In [8]:
model_knn = NearestNeighbors(metric='cosine', algorithm = 'brute')
model_knn.fit(util_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

### Setting up Random tests for Recommendations

In [92]:
query_index = np.random.choice(rating_pivot_x.shape[0])

In [93]:
distance, indices = model_knn.kneighbors(rating_pivot_x.iloc[query_index, :].values.reshape(1,-1), n_neighbors=6)

### Finding the Book name as data is in ISBN -10

In [95]:
import requests
import re

def get_response(url):
    response = requests.get(url)
    # print('Acquired following info about the book {}'.format(response.text))
    json_data = json.loads(response.text)
    return json_data                              
                    
    
def get_title_of_book(text):
    title_ =[]
    for x in text.keys():
        # print(x)
        title_.append(text[x]['title'])
    return title_
    
    
def form_url(ISBN=None):
    """Formulate an url that will be sent to openlibrary.org/api'.

    Parameters
    ----------
    ISBN : string
        a string of book's ISBN.

    Returns
    -------
    string
        An url that will be sent to openlibrary/org/api.

    """
    url = 'https://openlibrary.org/api/books?bibkeys=ISBN:' + \
        ISBN + '&jscmd=data&format=json'

    return url

def get_book_name(ISBN):
        url = form_url(ISBN)
        text = get_response(url)
        name_of_book = get_title_of_book(text)
        
        return(name_of_book)


## Following up with KNN and Book Names

In [106]:
book_names = []

for i in range(len(distance.flatten())):
    ISBN = rating_pivot_x.index[indices.flatten()[i]]
    print(ISBN)
    name_of_book = get_book_name(ISBN)
    if name_of_book:
        book_names.append(name_of_book)
    else:
        book_names.append(ISBN)

for i in range(len(distance.flatten())):
    if i==0:
        print('Recommendations for {0}:\n'.format(book_names[i]))
        matrix_fact_knn_check_isbn = rating_pivot_x.index[indices.flatten()[i]]
    else:
        print("{0}:{1} with distances of {2}".format(i, book_names[i], 
                                                     distance.flatten()[i]))

0060573775
0060890096
0140012486
014118776X
0060850523
0062314254
Recommendations for ["Slaughterhouse-Five (or The Children's Crusade: A Duty Dance with Death)"]:

1:['Catch-22 CD'] with distances of 0.8788552653328394
2:['The Catcher in the Rye (Modern Classics)'] with distances of 0.8992043362015576
3:['Nineteen Eighty-four (Penguin Modern Classics)'] with distances of 0.9010827065366002
4:['Brave New World'] with distances of 0.9104149605988603
5:0062314254 with distances of 0.9227942205673346


## Matrix Factorization

In [19]:
rating_pivot_mat_fac = rating_pivot.pivot(index = 'reviewerID', columns = 'asin', values = 'overall')

In [20]:
# fill null values with zero
rating_pivot_mat_fac = rating_pivot_mat_fac.fillna(0)

In [66]:
util_matrix = rating_pivot_mat_fac.values.T

In [67]:
from sklearn.decomposition import TruncatedSVD

SVD = TruncatedSVD(n_components = 12, random_state = 17)

In [68]:
matrix = SVD.fit_transform(util_matrix)
matrix.shape

(292, 12)

In [69]:
corr = np.corrcoef(matrix)
corr.shape

(292, 292)

In [70]:
book_isbn10 = rating_pivot_mat_fac.columns

In [71]:
isbn_list = list(book_isbn10)

In [98]:
# ISBN or 'asin' number of the queried index above in KNN
ISBN = matrix_fact_knn_check_isbn

In [99]:
compare_recs = isbn_list.index(ISBN)

In [100]:
print(compare_recs)

32


In [101]:
corr_compare_isbn = corr[compare_recs]

In [102]:
recommendations = list(book_isbn10[(corr_compare_isbn < 1.0) & (corr_compare_isbn > 0.9)])

In [103]:
print(recommendations)

['0007205236', '0020519109', '0025853503', '0026009102', '0060098902', '0060194995', '0060573775', '0060652926', '0060652934', '006073132X', '0060738170', '0060740450', '0060755334', '0060794410', '0060838582', '0060850523', '0060890096', '0061122416', '0062008110', '0062314254', '0091883768', '0099244721', '0099297701', '0099908506', '0140012486', '0140049975', '0140058893', '0140060286', '014014773X', '0140177396', '0140186409', '0140264078', '0140283331', '0140306013', '0140390839', '014044615X', '0140621199', '0140621806', '0141182571', '014118776X', '014143984X', '0141884819', '0142000280', '0142800821', '0143036556', '0143038583', '0143143743', '0195014766', '020530902X', '0307237699', '0307265773', '0307275639', '0307277674', '0307353133', '0307387178', '0307475255']


In [109]:
matrix_fact_knn_check_isbn in recommendations

True

In [113]:
corr_compare_isbn[isbn_list.index('0007205236')]

0.9403461326476087

In [127]:
book_names = []
book_names.append(get_book_name(matrix_fact_knn_check_isbn))

for x in recommendations:
    name_of_book = get_book_name(x)
    if name_of_book:
        book_names.append(name_of_book)
    else:
        book_names.append(x)


In [128]:
corr_compare_isbn[isbn_list.index('0007205236')]

0.9403461326476087

In [130]:
for i in range(len(book_names)):
    if i==0:
        print('If you like {0}:\n'.format(book_names[i]))
    else:
        print("{0}:We recommend reading {1} with a distance of {2}".format(i, book_names[i], \
              corr_compare_isbn[isbn_list.index(recommendations[i])]))

If you like ["Slaughterhouse-Five (or The Children's Crusade: A Duty Dance with Death)"]:

1:We recommend reading ["Angela's Ashes"] with a distance of 0.9922459475954086
2:We recommend reading ['The Old Man and the Sea (A Scribner Classic)'] with a distance of 0.9140587125150821
3:We recommend reading ['Gone With the Wind'] with a distance of 0.9722917414648534
4:We recommend reading ['The fountainhead'] with a distance of 0.9378229529291554
5:We recommend reading ['The Great Gatsby'] with a distance of 0.9883941780641975
6:We recommend reading 0060194995 with a distance of 0.9999999999999999
7:We recommend reading ["Slaughterhouse-Five (or The Children's Crusade: A Duty Dance with Death)"] with a distance of 0.9449399347044792
8:We recommend reading ['Mere Christianity'] with a distance of 0.9049965188474759
9:We recommend reading ['The Screwtape Letters'] with a distance of 0.9486854192345556
10:We recommend reading ['Freakonomics'] with a distance of 0.9411111641545542
11:We recomm

IndexError: list index out of range