In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [26]:
books = pd.read_csv('BX_Books.csv', sep = ';')
books.drop( axis=1,  columns=['Image-URL-S', 'Image-URL-M','Image-URL-L'], inplace=True)
books.columns = ['ISBN','BookTitle','BookAuthor','PubTime','Publisher']
books.head()

Unnamed: 0,ISBN,BookTitle,BookAuthor,PubTime,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company


In [27]:
users = pd.read_csv('BX-Users.csv', sep = ';')
users.columns = ['UserId','Loc','Age']
users.head()

Unnamed: 0,UserId,Loc,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [28]:
ratings = pd.read_csv('BX-Book-Ratings.csv', sep = ';')
ratings.columns = ['UserId','ISBN','Rating']
ratings.head()

Unnamed: 0,UserId,ISBN,Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


To ensure statistical significance, users with less than 200 ratings, and books with less than 100 ratings are excluded.

In [42]:
counts1 = ratings['UserId'].value_counts()
ratings = ratings[ratings['UserId'].isin(counts1[counts1 >= 200].index)]

In [53]:
counts2 = ratings['Rating'].value_counts()
ratings = ratings[ratings['Rating'].isin(counts2[counts2 > 100].index)]

KNN

In [58]:
combinedTable = pd.merge(ratings, books, on= 'ISBN')
combinedTable.drop(axis = 1, columns = ['BookAuthor', 'PubTime','Publisher'], inplace = True)
combine_book_rating = combinedTable.dropna(axis = 0, subset = ['BookTitle'])
combine_book_rating

Unnamed: 0,UserId,ISBN,Rating,BookTitle
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...
...,...,...,...,...
488751,275970,1892145022,0,Here Is New York
488752,275970,1931868123,0,There's a Porcupine in My Outhouse: Misadventu...
488753,275970,3411086211,10,Die Biene.
488754,275970,3829021860,0,The Penis Book


In [127]:


book_ratingCount = (combine_book_rating.
     groupby(by = ['BookTitle'])['Rating'].
     count().
     reset_index().
     rename(columns = {'Rating': 'totalRatingCount'})
     [['BookTitle', 'totalRatingCount']]
    )
print(book_ratingCount.head())

                                           BookTitle  totalRatingCount
0   A Light in the Storm: The Civil War Diary of ...                 2
1                              Always Have Popsicles                 1
2               Apple Magic (The Collector's series)                 1
3   Beyond IBM: Leadership Marketing and Finance ...                 1
4   Clifford Visita El Hospital (Clifford El Gran...                 1


In [128]:
rating_with_totalRatingCount = combine_book_rating.merge(book_ratingCount, 
                                                        left_on = 'BookTitle', 
                                                        right_on = 'BookTitle', 
                                                        how = 'left')
print(rating_with_totalRatingCount.head())

   UserId        ISBN  Rating  \
0  277427  002542730X      10   
1    3363  002542730X       0   
2   11676  002542730X       6   
3   12538  002542730X      10   
4   13552  002542730X       0   

                                           BookTitle  totalRatingCount  
0  Politically Correct Bedtime Stories: Modern Ta...                82  
1  Politically Correct Bedtime Stories: Modern Ta...                82  
2  Politically Correct Bedtime Stories: Modern Ta...                82  
3  Politically Correct Bedtime Stories: Modern Ta...                82  
4  Politically Correct Bedtime Stories: Modern Ta...                82  


In [129]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(book_ratingCount['totalRatingCount'].describe())

count   160587.000
mean         3.044
std          7.428
min          1.000
25%          1.000
50%          1.000
75%          2.000
max        365.000
Name: totalRatingCount, dtype: float64


In [130]:
popularity_threshold = 50
rating_popular_book = rating_with_totalRatingCount.query('totalRatingCount >= @popularity_threshold')
print(rating_popular_book.head())

   UserId        ISBN  Rating  \
0  277427  002542730X      10   
1    3363  002542730X       0   
2   11676  002542730X       6   
3   12538  002542730X      10   
4   13552  002542730X       0   

                                           BookTitle  totalRatingCount  
0  Politically Correct Bedtime Stories: Modern Ta...                82  
1  Politically Correct Bedtime Stories: Modern Ta...                82  
2  Politically Correct Bedtime Stories: Modern Ta...                82  
3  Politically Correct Bedtime Stories: Modern Ta...                82  
4  Politically Correct Bedtime Stories: Modern Ta...                82  


In [136]:
from scipy.sparse import csr_matrix
us_canada_user_rating = rating_popular_book.drop_duplicates(['UserId', 'BookTitle'])
us_canada_user_rating_pivot = us_canada_user_rating.pivot(index = 'BookTitle', columns = 'UserId', values = 'Rating').fillna(0)
us_canada_user_rating_matrix = csr_matrix(us_canada_user_rating_pivot.values)

from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(us_canada_user_rating_matrix)
print(model_knn)

NearestNeighbors(algorithm='brute', metric='cosine')


In [138]:
query_index = np.random.choice(us_canada_user_rating_pivot.shape[0])
distances, indices = model_knn.kneighbors(us_canada_user_rating_pivot.iloc[query_index,:].values.reshape(1, -1), n_neighbors = 6)
us_canada_user_rating_pivot.index[query_index]

for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(us_canada_user_rating_pivot.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, us_canada_user_rating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Reap the Wind:

1: The Ugly Duckling, with distance of 0.7327592803775523:
2: Body of Lies, with distance of 0.7343097618954784:
3: Envy, with distance of 0.7482275597841389:
4: Touching Evil, with distance of 0.779630146338416:
5: Welcome to Temptation, with distance of 0.8007133056992567:
