# Book Recommendation System

### importing relevant libraries

In [70]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import sklearn
from sklearn.decomposition import TruncatedSVD
import warnings
warnings.filterwarnings('ignore',category=RuntimeWarning)

### importing the datasets

In [2]:
book=pd.read_csv('BX-Books.csv',sep=';',error_bad_lines=False,encoding='latin-1')
book.columns=['ISBN','bookTitle','author','yearOfPublication','publisher','imageUrlS','imageUrlM','imageUrlL']
book.head()

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'


Unnamed: 0,ISBN,bookTitle,author,yearOfPublication,publisher,imageUrlS,imageUrlM,imageUrlL
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [3]:
user=pd.read_csv('BX-Users.csv',sep=';',error_bad_lines=False,encoding='latin-1')
user.columns=['userID','Location','age']
user.head()

Unnamed: 0,userID,Location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [4]:
rating=pd.read_csv('BX-Book-Ratings.csv',sep=';',error_bad_lines=False,encoding='latin-1')
rating.columns=['userID','ISBN','bookRating']
rating.head()

Unnamed: 0,userID,ISBN,bookRating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


### combining the ratings and book columns

In [5]:
combine_book_rating=pd.merge(rating,book,on='ISBN')
columns=['yearOfPublication','publisher','author','imageUrlS','imageUrlM','imageUrlL']
combine_book_rating.drop(columns,axis=1,inplace=True)

In [6]:
combine_book_rating.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle
0,276725,034545104X,0,Flesh Tones: A Novel
1,2313,034545104X,5,Flesh Tones: A Novel
2,6543,034545104X,0,Flesh Tones: A Novel
3,8680,034545104X,5,Flesh Tones: A Novel
4,10314,034545104X,9,Flesh Tones: A Novel


### new column for total ratings count

In [7]:
combine_book_rating.dropna(axis=0,subset=['bookTitle'],inplace=True)

In [8]:
book_ratingCount=(combine_book_rating.groupby(['bookTitle'])['bookRating'].count().
                  reset_index().
                 rename(columns={'bookRating':'totalRatingCount'})
                 [['bookTitle','totalRatingCount']])

In [9]:
book_ratingCount.head()

Unnamed: 0,bookTitle,totalRatingCount
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1


### combining rating data with total rating count data to filter out lesser known books

In [10]:
ratings_with_totalRatingCount=combine_book_rating.merge(book_ratingCount,left_on='bookTitle',right_on='bookTitle',how='left')
ratings_with_totalRatingCount.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,totalRatingCount
0,276725,034545104X,0,Flesh Tones: A Novel,60
1,2313,034545104X,5,Flesh Tones: A Novel,60
2,6543,034545104X,0,Flesh Tones: A Novel,60
3,8680,034545104X,5,Flesh Tones: A Novel,60
4,10314,034545104X,9,Flesh Tones: A Novel,60


## statistics

In [11]:
pd.set_option('display.float_format',lambda x:'%.3f' % x)
book_ratingCount['totalRatingCount'].describe()

count   241071.000
mean         4.277
std         16.739
min          1.000
25%          1.000
50%          1.000
75%          3.000
max       2502.000
Name: totalRatingCount, dtype: float64

### only 1% of books received 50 or more ratings

In [12]:
threshold=50
rating_popular_book=ratings_with_totalRatingCount.query('totalRatingCount >= @threshold')
rating_popular_book.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,totalRatingCount
0,276725,034545104X,0,Flesh Tones: A Novel,60
1,2313,034545104X,5,Flesh Tones: A Novel,60
2,6543,034545104X,0,Flesh Tones: A Novel,60
3,8680,034545104X,5,Flesh Tones: A Novel,60
4,10314,034545104X,9,Flesh Tones: A Novel,60


### to improve computing speed, we limit to users from US and Canada

In [13]:
combined=rating_popular_book.merge(user,left_on='userID',right_on='userID',how='left')

us_canada_user_rating=combined[combined['Location'].str.contains('usa|canada')]
us_canada_user_rating.drop('age',axis=1,inplace=True)
us_canada_user_rating.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,totalRatingCount,Location
0,276725,034545104X,0,Flesh Tones: A Novel,60,"tyler, texas, usa"
1,2313,034545104X,5,Flesh Tones: A Novel,60,"cincinnati, ohio, usa"
2,6543,034545104X,0,Flesh Tones: A Novel,60,"strafford, missouri, usa"
3,8680,034545104X,5,Flesh Tones: A Novel,60,"st. charles county, missouri, usa"
4,10314,034545104X,9,Flesh Tones: A Novel,60,"beaverton, oregon, usa"


### checking and removing rows with same 'userID' and 'bookTitle'

In [17]:
if not us_canada_user_rating[us_canada_user_rating.duplicated(['userID','bookTitle'])].empty:
    initial_rows=us_canada_user_rating.shape[0]
    
    print( 'Initial dataframe shape{0}'.format(us_canada_user_rating.shape))
    us_canada_user_rating=us_canada_user_rating.drop_duplicates(['userID','bookTitle'])
    current_rows=us_canada_user_rating.shape[0]
    print ('New dataframe shape {0}'.format(us_canada_user_rating.shape))
    print( 'removed {0} rows '.format(initial_rows-current_rows))

Initial dataframe shape(251615, 6)
New dataframe shape (248949, 6)
removed 2666 rows 


## Implementing kNN

In [18]:
user_rating_pivot=us_canada_user_rating.pivot(index='bookTitle',columns='userID',values='bookRating').fillna(0)
rating_matrix=csr_matrix(user_rating_pivot.values)

In [19]:
from sklearn.neighbors import NearestNeighbors

knn=NearestNeighbors(metric='cosine',algorithm='brute')
knn.fit(rating_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [29]:
us_canada_user_rating.bookTitle.unique()

array(['Flesh Tones: A Novel', 'The Notebook', 'A Painted House', ...,
       '10 Lb. Penalty', 'Interest of Justice', 'Love in Another Town'],
      dtype=object)

In [30]:
user_rating_pivot

userID,8,9,14,16,17,23,26,32,39,42,...,278820,278824,278828,278832,278836,278843,278844,278846,278851,278854
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
16 Lighthouse Road,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
1984,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
1st to Die: A Novel,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
2010: Odyssey Two,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
Zoya,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
"\O\"" Is for Outlaw""",0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
"\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000


## Testing our model and making predictions 

In [66]:
user_rating_pivot_2=us_canada_user_rating.pivot(index='bookTitle',columns='userID',values='bookRating').fillna(0)

In [67]:
user_rating_pivot_2.head()

userID,8,9,14,16,17,23,26,32,39,42,...,278820,278824,278828,278832,278836,278843,278844,278846,278851,278854
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
user_rating_pivot_2.shape

(2442, 40017)

In [87]:
us_canada_book_title=user_rating_pivot_2.T.columns
us_canada_book_list=list(us_canada_book_title)
us_canada_book_list

['10 Lb. Penalty',
 '16 Lighthouse Road',
 '1984',
 '1st to Die: A Novel',
 '2010: Odyssey Two',
 '204 Rosewood Lane',
 '2061: Odyssey Three',
 '24 Hours',
 '2nd Chance',
 '3rd Degree',
 '4 Blondes',
 '50 Simple Things You Can Do to Save the Earth',
 '84 Charing Cross Road',
 'A 2nd Helping of Chicken Soup for the Soul (Chicken Soup for the Soul Series (Paper))',
 'A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash',
 'A Bend in the Road',
 'A Case of Need',
 'A Child Called \\It\\": One Child\'s Courage to Survive"',
 'A Civil Action',
 'A Clockwork Orange (Norton Paperback Fiction)',
 'A Cold Heart (Alex Delaware Novels (Paperback))',
 'A Cold Heart: An Alex Delaware Novel',
 'A Confederacy of Dunces',
 'A Confederacy of Dunces (Evergreen Book)',
 'A Cry In The Night',
 "A Cup of Tea (Ballantine Reader's Circle)",
 'A Dangerous Fortune',
 'A Darkness More Than Night',
 'A Day Late and a Dollar Short',
 'A Density of Souls',
 'A Fine Balance',
 'A Game of T

In [74]:
us_canada_book_list

['10 Lb. Penalty',
 '16 Lighthouse Road',
 '1984',
 '1st to Die: A Novel',
 '2010: Odyssey Two',
 '204 Rosewood Lane',
 '2061: Odyssey Three',
 '24 Hours',
 '2nd Chance',
 '3rd Degree',
 '4 Blondes',
 '50 Simple Things You Can Do to Save the Earth',
 '84 Charing Cross Road',
 'A 2nd Helping of Chicken Soup for the Soul (Chicken Soup for the Soul Series (Paper))',
 'A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash',
 'A Bend in the Road',
 'A Case of Need',
 'A Child Called \\It\\": One Child\'s Courage to Survive"',
 'A Civil Action',
 'A Clockwork Orange (Norton Paperback Fiction)',
 'A Cold Heart (Alex Delaware Novels (Paperback))',
 'A Cold Heart: An Alex Delaware Novel',
 'A Confederacy of Dunces',
 'A Confederacy of Dunces (Evergreen Book)',
 'A Cry In The Night',
 "A Cup of Tea (Ballantine Reader's Circle)",
 'A Dangerous Fortune',
 'A Darkness More Than Night',
 'A Day Late and a Dollar Short',
 'A Density of Souls',
 'A Fine Balance',
 'A Game of T

In [92]:
query_index=np.where(user_rating_pivot.index==input('Enter book: '))[0][0]
#np.random.choice(user_rating_pivot.shape[0])

distances,indices= knn.kneighbors(user_rating_pivot.iloc[query_index,:].values.reshape(1,-1),n_neighbors=12)

for i in range(0,len(distances.flatten())):
    if i==0:
        print('Recommendations for {0}:\n'.format(user_rating_pivot.index[query_index]))
    else:
        print('{0}:{1},  with distance of {2} :'.format(i,user_rating_pivot.index[indices.flatten()[i]],distances.flatten()[i]))

Enter book: Catch 22
Recommendations for Catch 22:

1:Invisible Man,  with distance of 0.8888152461605668 :
2:Holidays on Ice : Stories,  with distance of 0.9118698959191662 :
3:The Artist's Way : A Spiritual Path to Higher Creativity,  with distance of 0.9189990240409717 :
4:One Hundred Years of Solitude,  with distance of 0.9201287870570409 :
5:Eats, Shoots &amp; Leaves: The Zero Tolerance Approach to Punctuation,  with distance of 0.9302981509434869 :
6:To the Lighthouse,  with distance of 0.9354511420510541 :
7:Their Eyes Were Watching God,  with distance of 0.9360035644012074 :
8:A Portrait of the Artist As a Young Man,  with distance of 0.9371513860683763 :
9:The Martian Chronicles,  with distance of 0.9375667854372998 :
10:Tricky Business,  with distance of 0.9384885408523949 :
11:Cat's Cradle,  with distance of 0.9398782677671512 :


In [1]:
from fuzzywuzzy import fuzz

def print_book_recommendations(query_book, rating_matrix, knn_model, k):
    """
    Inputs:
    query_book: query artist name
    book_matrix: book user dataframe (not the sparse one, the pandas dataframe)
    knn_model: our previously fitted sklearn knn model
    k: the number of nearest neighbors.
    
    Prints: book recommendations for the query book
    Returns: None
    """
    query_index = None
    ratio_tuples = []
    
    for i in rating_matrix.index:
        ratio = fuzz.ratio(i.lower(), query_book.lower())
        if ratio >= 75:
            current_query_index = rating_matrix.index.tolist().index(i)
            ratio_tuples.append((i, ratio, current_query_index))
    
    print ('Possible matches: {0}\n'.format([(x[0], x[1]) for x in ratio_tuples]))
    
    try:
        query_index = max(ratio_tuples, key = lambda x: x[1])[2] # get the index of the best book match in the data
    except:
        print ('Your book didn\'t match any books in the data. Try again')
        return None
    
    distances, indices = knn_model.kneighbors(rating_matrix.iloc[query_index, :].values.reshape(1, -1), n_neighbors = k + 1)

    for i in range(0, len(distances.flatten())):
        if i == 0:
            print ('Recommendations for {0}:\n'.format(rating_matrix.index[query_index]))
        else:
            print ('{0}: {1}, with distance of {2}:'.format(i, rating_matrix.index[indices.flatten()[i]], distances.flatten()[i]))

    return None



In [108]:
print_book_recommendations(input('Enter book name: '),user_rating_pivot,knn,10)

Enter book name: Dark Paradise
Possible matches: [('Dark Paradise', 100), ('Paradise', 76)]

Recommendations for Dark Paradise:

1: Night Sins, with distance of 0.8408167005092462:
2: Self-Defense (Alex Delaware Novels (Paperback)), with distance of 0.8484915543939107:
3: Guilty as Sin, with distance of 0.8543831319701152:
4: Night Prey, with distance of 0.8564257684533574:
5: A Thin Dark Line (Mysteries &amp; Horror), with distance of 0.8591449772755426:
6: Interest of Justice, with distance of 0.8794130046127359:
7: Where the Heart Is, with distance of 0.8801683687355407:
8: Dark Lady, with distance of 0.8858044120355206:
9: Family Album, with distance of 0.8893295897810378:
10: Cry Wolf, with distance of 0.8898834327825457:


In [107]:
import pickle

with open('knn.pickle','wb') as f:
    pickle.dump(knn,f)
    
with open('dataset.pickle','wb') as f:
    pickle.dump(us_canada_user_rating,f)
    
with open('matrix.pickle','wb') as f:
    pickle.dump(user_rating_pivot,f)