In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
books = pd.read_csv("C:\\Users\\BISWAJIT\\Documents\\Books.csv",low_memory=False)

In [3]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [4]:
books=books.drop(columns = ['Image-URL-S', 'Image-URL-M','Image-URL-L'])

In [5]:
print(books.shape)
print(list(books.columns))

(271360, 5)
['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher']


In [6]:
# For data cleansing in column
for column in books.columns:
    print(pd.api.types.infer_dtype(books[column]))

string
string
string
string
string


In [7]:
# Converting mixed-integer to int type

books['Year-Of-Publication'] = pd.to_numeric(books['Year-Of-Publication'], errors='coerce').astype('Int64')

In [8]:
ratings = pd.read_csv('C:\\Users\\BISWAJIT\\Documents\\Ratings.csv')

In [9]:
print(ratings.shape)
print(list(ratings.columns))

(1149780, 3)
['User-ID', 'ISBN', 'Book-Rating']


In [10]:
users=pd.read_csv('C:\\Users\\BISWAJIT\\Documents\\Users.csv')

In [11]:
print(users.shape)
print(list(users.columns))

(278858, 3)
['User-ID', 'Location', 'Age']


# Collaborative Filtering Using k-Nearest Neighbors (kNN)

In [12]:
combine_book_rating = pd.merge(ratings, books, on='ISBN')
columns = ['Year-Of-Publication', 'Publisher', 'Book-Author']
print(combine_book_rating.columns)

Index(['User-ID', 'ISBN', 'Book-Rating', 'Book-Title', 'Book-Author',
       'Year-Of-Publication', 'Publisher'],
      dtype='object')


In [13]:
# Rearranging the columns

combine_book_rating.drop_duplicates(['User-ID', 'Book-Title'])

combine_book_rating.head().iloc[:,[0,1,3,2,5,4,6]]

Unnamed: 0,User-ID,ISBN,Book-Title,Book-Rating,Year-Of-Publication,Book-Author,Publisher
0,276725,034545104X,Flesh Tones: A Novel,0,2002,M. J. Rose,Ballantine Books
1,2313,034545104X,Flesh Tones: A Novel,5,2002,M. J. Rose,Ballantine Books
2,6543,034545104X,Flesh Tones: A Novel,0,2002,M. J. Rose,Ballantine Books
3,8680,034545104X,Flesh Tones: A Novel,5,2002,M. J. Rose,Ballantine Books
4,10314,034545104X,Flesh Tones: A Novel,9,2002,M. J. Rose,Ballantine Books


In [14]:
combine_book_rating.shape



(1031136, 7)

In [15]:
 combine_book_rating.count(axis=0, level=None, numeric_only=False)

User-ID                1031136
ISBN                   1031136
Book-Rating            1031136
Book-Title             1031136
Book-Author            1031135
Year-Of-Publication    1031132
Publisher              1031134
dtype: int64

In [16]:
combine_book_rating = combine_book_rating.dropna(axis = 0, subset = ['Book-Title'])

book_ratingCount = (combine_book_rating.
     groupby(by = ['Book-Title'])['Book-Rating'].
     count().
     reset_index().
     rename(columns = {'Book-Rating': 'totalRatingCount'})
     [['Book-Title', 'totalRatingCount']]
    )
book_ratingCount.head()

Unnamed: 0,Book-Title,totalRatingCount
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1


In [17]:
rating_with_totalRatingCount = combine_book_rating.merge(book_ratingCount, left_on = 'Book-Title', right_on = 'Book-Title', how = 'left')
rating_with_totalRatingCount.head()
rating_with_totalRatingCount.shape

(1031136, 8)

In [18]:
# The statistics of total rating count

pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(book_ratingCount['totalRatingCount'].describe())

count   241071.000
mean         4.277
std         16.739
min          1.000
25%          1.000
50%          1.000
75%          3.000
max       2502.000
Name: totalRatingCount, dtype: float64


In [19]:
# The median book has been rated only once.The top of the distribution:

print(book_ratingCount['totalRatingCount'].quantile(np.arange(.9, 1, .01)))

0.900    7.000
0.910    8.000
0.920    9.000
0.930   10.000
0.940   11.000
0.950   13.000
0.960   16.000
0.970   20.000
0.980   29.000
0.990   50.000
Name: totalRatingCount, dtype: float64


In [20]:
# About 1% of the books received 50 or more ratings.
# Because we have so many books in our data, we will limit it to the top 1%

popularity_threshold = 50
rating_popular_book = rating_with_totalRatingCount.query('totalRatingCount >= @popularity_threshold')
print(list(rating_popular_book.columns))
rating_popular_book.head()

['User-ID', 'ISBN', 'Book-Rating', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'totalRatingCount']


Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,totalRatingCount
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,60
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,60
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,60
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,60
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,60


In [21]:
rating_popular_book.shape

(288740, 8)

# Filter to users in US and Canada only
In order to improve computing speed, and not run into the “MemoryError” issue, I will limit our user data to those in the US and Canada. And then combine user data with the rating data and total rating count data.

In [22]:
combined = rating_popular_book.merge(users, left_on = 'User-ID', right_on = 'User-ID', how = 'left')
us_canada_user_rating = combined[combined['Location'].str.contains("usa|canada")]
us_canada_user_rating=us_canada_user_rating.drop('Age', axis=1)
us_canada_user_rating.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,totalRatingCount,Location
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,60,"tyler, texas, usa"
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,60,"cincinnati, ohio, usa"
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,60,"strafford, missouri, usa"
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,60,"st. charles county, missouri, usa"
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,60,"beaverton, oregon, usa"


# Implementing kNN
We convert our table to a 2D matrix, and fill the missing values with zeros (since we will calculate distances between rating vectors). We then transform the values(ratings) of the matrix dataframe into a scipy sparse matrix for more efficient calculations.
# Finding the Nearest Neighbors
We use unsupervised algorithms with sklearn.neighbors. The algorithm we use to compute the nearest neighbors is “brute”, and we specify “metric=cosine” so that the algorithm will calculate the cosine similarity between rating vectors. Finally, we fit the model.

In [23]:
from scipy.sparse import csr_matrix
us_canada_user_rating = us_canada_user_rating.drop_duplicates(['User-ID', 'Book-Title'])
us_canada_user_rating_pivot = us_canada_user_rating.pivot(index = 'Book-Title', columns = 'User-ID', values = 'Book-Rating').fillna(0)
us_canada_user_rating_matrix = csr_matrix(us_canada_user_rating_pivot.values)



In [24]:
us_canada_user_rating_pivot.head()

User-ID,8,9,14,16,17,23,26,32,39,42,...,278820,278824,278828,278832,278836,278843,278844,278846,278851,278854
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
us_canada_user_rating_pivot.index

Index(['10 Lb. Penalty', '16 Lighthouse Road', '1984', '1st to Die: A Novel',
       '2010: Odyssey Two', '204 Rosewood Lane', '2061: Odyssey Three',
       '24 Hours', '2nd Chance', '3rd Degree',
       ...
       'YOU BELONG TO ME', 'Year of Wonders', 'You Belong To Me',
       'You Shall Know Our Velocity', 'Young Wives',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"',
       '\Surely You're Joking, Mr. Feynman!\": Adventures of a Curious Character"',
       'stardust'],
      dtype='object', name='Book-Title', length=2442)

In [26]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(us_canada_user_rating_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [27]:
us_canada_user_rating_pivot.iloc[:].values.reshape(1,-1)

array([[0., 0., 0., ..., 0., 0., 0.]])

In [28]:
us_canada_user_rating_pivot.shape

(2442, 40017)

In [29]:
query_index = np.random.choice(us_canada_user_rating_pivot.shape[0])
distances, indices = model_knn.kneighbors(us_canada_user_rating_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 6)


In [30]:
us_canada_user_rating_pivot.index[query_index]

'The Ritual Bath (Peter Decker &amp; Rina Lazarus Novels (Paperback))'

In [31]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(us_canada_user_rating_pivot.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, us_canada_user_rating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for The Ritual Bath (Peter Decker &amp; Rina Lazarus Novels (Paperback)):

1: Sacred and Profane (Peter Decker &amp; Rina Lazarus Novels (Paperback)), with distance of 0.5914450268567838:
2: Silent Partner (Alex Delaware Novels (Paperback)), with distance of 0.8086968101472588:
3: Time Bomb (Alex Delaware Novels (Paperback)), with distance of 0.8139545385945312:
4: The Clinic (Alex Delaware Novels (Paperback)), with distance of 0.8249137044410277:
5: Survival of the Fittest: An Alex Delaware Novel, with distance of 0.8356221150601558:
