In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import seaborn
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
 

In [2]:
book=pd.read_csv("Books.csv",low_memory=False)
print(book.shape)
print(list(book.columns))
user=pd.read_csv("Users.csv",low_memory=False)
print(user.shape)
print(list(user.columns))
rating=pd.read_csv("Ratings.csv",low_memory=False)    
print(rating.shape)
print(list(rating.columns))

(271360, 8)
['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L']
(278858, 3)
['User-ID', 'Location', 'Age']
(1149780, 3)
['User-ID', 'ISBN', 'Book-Rating']


In [3]:
combine_book_rating=pd.merge(book,rating,on="ISBN")
columns=['Year-Of-Publication','Publisher','Book-Author']
combine_book_rating=combine_book_rating.drop(columns,axis=1)
combine_book_rating.head()
 

Unnamed: 0,ISBN,Book-Title,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
0,195153448,Classical Mythology,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,2,0
1,2005018,Clara Callan,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,5
2,2005018,Clara Callan,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11400,0
3,2005018,Clara Callan,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11676,8
4,2005018,Clara Callan,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,41385,0


In [4]:
combine_book_rating=combine_book_rating.dropna(axis=0,subset=['Book-Title'])
combine_book_rating.head()

Unnamed: 0,ISBN,Book-Title,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
0,195153448,Classical Mythology,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,2,0
1,2005018,Clara Callan,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,5
2,2005018,Clara Callan,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11400,0
3,2005018,Clara Callan,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11676,8
4,2005018,Clara Callan,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,41385,0


In [5]:
book_ratingCount=(combine_book_rating.
                  groupby(by=['Book-Title'])['Book-Rating'].
                  count().
                  reset_index().
                  rename(columns={'Book-Rating':'totalRatingCount'})
                  [['Book-Title','totalRatingCount']]
                 )
book_ratingCount.head()

Unnamed: 0,Book-Title,totalRatingCount
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1


In [6]:
book_ratingCount.info()
book_ratingCount.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241071 entries, 0 to 241070
Data columns (total 2 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Book-Title        241071 non-null  object
 1   totalRatingCount  241071 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.7+ MB


Unnamed: 0,totalRatingCount
count,241071.0
mean,4.277312
std,16.738685
min,1.0
25%,1.0
50%,1.0
75%,3.0
max,2502.0


In [7]:
rating_with_totalRatingCount=combine_book_rating.merge(book_ratingCount,left_on='Book-Title',right_on='Book-Title',how="inner")
rating_with_totalRatingCount.head()
rating_with_totalRatingCount.info()
rating_with_totalRatingCount.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1031136 entries, 0 to 1031135
Data columns (total 8 columns):
 #   Column            Non-Null Count    Dtype 
---  ------            --------------    ----- 
 0   ISBN              1031136 non-null  object
 1   Book-Title        1031136 non-null  object
 2   Image-URL-S       1031136 non-null  object
 3   Image-URL-M       1031136 non-null  object
 4   Image-URL-L       1031132 non-null  object
 5   User-ID           1031136 non-null  int64 
 6   Book-Rating       1031136 non-null  int64 
 7   totalRatingCount  1031136 non-null  int64 
dtypes: int64(3), object(5)
memory usage: 70.8+ MB


Unnamed: 0,User-ID,Book-Rating,totalRatingCount
count,1031136.0,1031136.0,1031136.0
mean,140594.5,2.839051,69.78162
std,80524.66,3.854157,175.3381
min,2.0,0.0,1.0
25%,70415.0,0.0,3.0
50%,141210.0,0.0,13.0
75%,211426.0,7.0,61.0
max,278854.0,10.0,2502.0


In [8]:
rating_with_totalRatingCount['totalRatingCount'].count()

1031136

In [9]:
popularity_threshold=50
#rating_popular_book=rating_with_totalRatingCount.query('totalRatingCount>=@popularity_threshold')

In [10]:
rating_popular_book=rating_with_totalRatingCount[rating_with_totalRatingCount['totalRatingCount']>popularity_threshold]
rating_popular_book.head()
 

Unnamed: 0,ISBN,Book-Title,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating,totalRatingCount
31,399135782,The Kitchen God's Wife,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,8,0,311
32,399135782,The Kitchen God's Wife,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,11676,9,311
33,399135782,The Kitchen God's Wife,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,29526,9,311
34,399135782,The Kitchen God's Wife,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,36836,0,311
35,399135782,The Kitchen God's Wife,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,46398,9,311


In [11]:
#I am filtering the users data to only US and Canada only
combined=rating_popular_book.merge(user,left_on='User-ID',right_on='User-ID',how="inner")
combined.head()
 

Unnamed: 0,ISBN,Book-Title,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating,totalRatingCount,Location,Age
0,0399135782,The Kitchen God's Wife,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,8,0,311,"timmins, ontario, canada",
1,1558746218,A Second Chicken Soup for the Woman's Soul (Ch...,http://images.amazon.com/images/P/1558746218.0...,http://images.amazon.com/images/P/1558746218.0...,http://images.amazon.com/images/P/1558746218.0...,8,0,56,"timmins, ontario, canada",
2,0399135782,The Kitchen God's Wife,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,11676,9,311,"n/a, n/a, n/a",
3,080410753X,The Kitchen God's Wife,http://images.amazon.com/images/P/080410753X.0...,http://images.amazon.com/images/P/080410753X.0...,http://images.amazon.com/images/P/080410753X.0...,11676,8,311,"n/a, n/a, n/a",
4,1558746226,A Second Chicken Soup for the Woman's Soul (Ch...,http://images.amazon.com/images/P/1558746226.0...,http://images.amazon.com/images/P/1558746226.0...,http://images.amazon.com/images/P/1558746226.0...,11676,9,56,"n/a, n/a, n/a",


In [12]:
us_canada_user_rating = combined[combined['Location'].str.contains("usa|canada")]
us_canada_user_rating.head()

Unnamed: 0,ISBN,Book-Title,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating,totalRatingCount,Location,Age
0,399135782,The Kitchen God's Wife,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,8,0,311,"timmins, ontario, canada",
1,1558746218,A Second Chicken Soup for the Woman's Soul (Ch...,http://images.amazon.com/images/P/1558746218.0...,http://images.amazon.com/images/P/1558746218.0...,http://images.amazon.com/images/P/1558746218.0...,8,0,56,"timmins, ontario, canada",
2216,399135782,The Kitchen God's Wife,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,29526,9,311,"knoxville, tennessee, usa",26.0
2217,679444815,Timeline,http://images.amazon.com/images/P/0679444815.0...,http://images.amazon.com/images/P/0679444815.0...,http://images.amazon.com/images/P/0679444815.0...,29526,10,552,"knoxville, tennessee, usa",26.0
2218,312261594,Female Intelligence,http://images.amazon.com/images/P/0312261594.0...,http://images.amazon.com/images/P/0312261594.0...,http://images.amazon.com/images/P/0312261594.0...,29526,8,76,"knoxville, tennessee, usa",26.0


In [13]:
us_canada_user_rating=us_canada_user_rating.drop(['Age'],axis=1)
us_canada_user_rating.head()
us_canada_user_rating.describe()
 

Unnamed: 0,User-ID,Book-Rating,totalRatingCount
count,248816.0,248816.0,248816.0
mean,140613.923767,3.029307,215.15591
std,79913.350862,3.941275,249.714586
min,8.0,0.0,51.0
25%,70062.5,0.0,81.0
50%,139742.0,0.0,136.0
75%,210959.0,7.0,258.0
max,278854.0,10.0,2502.0


In [14]:
us_canada_user_rating=us_canada_user_rating.drop_duplicates(['User-ID','Book-Title'])
us_canada_user_rating.info()
 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 246169 entries, 0 to 285589
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   ISBN              246169 non-null  object
 1   Book-Title        246169 non-null  object
 2   Image-URL-S       246169 non-null  object
 3   Image-URL-M       246169 non-null  object
 4   Image-URL-L       246169 non-null  object
 5   User-ID           246169 non-null  int64 
 6   Book-Rating       246169 non-null  int64 
 7   totalRatingCount  246169 non-null  int64 
 8   Location          246169 non-null  object
dtypes: int64(3), object(6)
memory usage: 18.8+ MB


In [15]:
# Similar to kNN, we convert our USA Canada user rating table into a 2D matrix (called a utility matrix here)
# and fill the missing values with zeros.

In [16]:
us_canada_user_rating_pivot2=us_canada_user_rating.pivot(index="User-ID",columns="Book-Title",values="Book-Rating").fillna(0)
us_canada_user_rating_pivot2
 

Book-Title,10 Lb. Penalty,16 Lighthouse Road,1984,1st to Die: A Novel,2010: Odyssey Two,204 Rosewood Lane,2061: Odyssey Three,24 Hours,2nd Chance,3rd Degree,...,YOU BELONG TO ME,Year of Wonders,You Belong To Me,You Shall Know Our Velocity,Young Wives,Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,Zoya,"\O\"" Is for Outlaw""","\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",stardust
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
us_canada_user_rating_pivot2.shape
X=us_canada_user_rating_pivot2.values.T  #Through this transposing our userID now becomes Columns and bookTitle now becomes Rows
X.shape
import sklearn
from sklearn.decomposition import TruncatedSVD
SVD=TruncatedSVD(n_components=12,random_state=17)  #Through this we are doing Compression.
matrix=SVD.fit_transform(X)
matrix.shape

(2379, 12)

In [18]:
import warnings
warnings.filterwarnings("ignore",category=RuntimeWarning)
corr=np.corrcoef(matrix)
corr.shape  
(2379, 2379)

(2379, 2379)

In [19]:
us_canada_book_title=us_canada_user_rating_pivot2.columns
#Now, Convert into the list
us_canada_book_list=list(us_canada_book_title)
us_canada_book_list  
query=us_canada_book_list.index("Godplayer")

In [20]:
#THEN AT LAST ->
queryans=corr[query]
print(queryans)
# array([0.51495888, 0.22421261, 0.38573675, ..., 0.54759568, 0.16899023,  0.11424015])
#We have it ->
list(us_canada_book_title[(queryans<1.0) & (queryans>0.9)])

[ 0.49196951  0.15348131  0.32173025 ...  0.54824856  0.14300504
 -0.00999896]


['A Case of Need',
 'Acceptable Risk',
 'Airframe',
 'Chromosome 6',
 'Disclosure',
 'Godplayer',
 'Invasion',
 'Masquerade',
 'Mindbend',
 "Pretend You Don't See Her",
 'Remember',
 'The Cat Who Robbed a Bank (Cat Who... (Paperback))',
 'The Gold Coast',
 'The Presence',
 'The Right Hand of Evil',
 'The Terminal Man',
 "Tom Clancy's Op-Center: Mirror Image (Tom Clancy's Op Center (Paperback))"]