In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import sklearn
from sklearn.decomposition import TruncatedSVD

In [25]:
# Parser settings shared by the three Book-Crossing CSV dumps: ';'-separated,
# latin-1 encoded, and malformed rows are skipped instead of raising.
# NOTE(review): `error_bad_lines` is deprecated in newer pandas releases in
# favour of `on_bad_lines` -- confirm the installed version before upgrading.
_bx_read_options = dict(sep=';', error_bad_lines=False, encoding="latin-1")

# Book catalogue; columns are renamed positionally.
book = pd.read_csv('Data/BX-Books.csv', **_bx_read_options)
book.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication',
                'publisher', 'imageUr1S', 'imageUr1M', 'imageUr1L']

# User profiles.
user = pd.read_csv('Data/BX-Users.csv', **_bx_read_options)
user.columns = ['userID', 'Location', 'Age']

# One row per (user, book) rating.
rating = pd.read_csv('Data/BX-Book-Ratings.csv', **_bx_read_options)
rating.columns = ['userID', 'ISBN', 'bookRating']

Skipping line 6451: expected 8 fields, saw 9
Skipping line 43666: expected 8 fields, saw 10
Skipping line 51750: expected 8 fields, saw 9

Skipping line 92037: expected 8 fields, saw 9
Skipping line 104318: expected 8 fields, saw 9
Skipping line 121767: expected 8 fields, saw 9

Skipping line 144057: expected 8 fields, saw 9
Skipping line 150788: expected 8 fields, saw 9
Skipping line 157127: expected 8 fields, saw 9
Skipping line 180188: expected 8 fields, saw 9
Skipping line 185737: expected 8 fields, saw 9

Skipping line 209387: expected 8 fields, saw 9
Skipping line 220625: expected 8 fields, saw 9
Skipping line 227932: expected 8 fields, saw 11
Skipping line 228956: expected 8 fields, saw 10
Skipping line 245932: expected 8 fields, saw 9
Skipping line 251295: expected 8 fields, saw 9
Skipping line 259940: expected 8 fields, saw 9
Skipping line 261528: expected 8 fields, saw 9



In [5]:
# Preview the first rows of the ratings table (userID, ISBN, bookRating).
rating.head()

Unnamed: 0,userID,ISBN,bookRating
0,276726,0155061224,5
1,276727,0446520802,0
2,276729,052165615X,3
3,276729,0521795028,6
4,276733,2080674722,0


In [6]:
# Preview the first rows of the users table.
user.head()

Unnamed: 0,1,"nyc, new york, usa",\N
0,2,"stockton, california, usa",18
1,3,"moscow, yukon territory, russia",\N
2,4,"porto, v.n.gaia, portugal",17
3,5,"farnborough, hants, united kingdom",\N
4,6,"santa monica, california, usa",61


In [7]:
# Preview the first rows of the book catalogue.
book.head()

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher,imageUr1S,imageUr1M,imageUr1L
0,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
1,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
2,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
3,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
4,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...


In [8]:
# Book metadata columns that are not needed once the title has been attached.
columns = ['yearOfPublication', 'publisher', 'bookAuthor',
           'imageUr1S', 'imageUr1M', 'imageUr1L']
# Join every rating to its book record on ISBN, then discard the extra
# metadata so that only userID, ISBN, bookRating and bookTitle remain.
combine_book_rating = rating.merge(book, on='ISBN').drop(columns, axis=1)
combine_book_rating.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle
0,276726,155061224,5,Rites of Passage
1,159181,155061224,0,Rites of Passage
2,276727,446520802,0,The Notebook
3,278418,446520802,0,The Notebook
4,638,446520802,0,The Notebook


### Filter to only popular books
Remove rows where book title is missing

In [10]:
# Keep only the rows that actually carry a book title.
has_title = combine_book_rating['bookTitle'].notnull()
combine_book_rating = combine_book_rating[has_title]

In [11]:
# Count how many ratings each title received: one row per bookTitle with the
# tally stored in `totalRatingCount`.
book_ratingCount = (
    combine_book_rating
    .groupby('bookTitle')['bookRating']
    .count()
    .reset_index(name='totalRatingCount')
)
book_ratingCount.head()

Unnamed: 0,bookTitle,totalRatingCount
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1


#### Now we can merge the total rating count data into the rating data, giving us exactly what we need to filter out the lesser-known books.

In [16]:
# Left-join the per-title counts back onto every rating row so each rating
# knows how popular its book is.
rating_with_totalRatingCount = pd.merge(
    combine_book_rating,
    book_ratingCount,
    on='bookTitle',
    how='left',
)
rating_with_totalRatingCount.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,totalRatingCount
0,276726,155061224,5,Rites of Passage,14
1,159181,155061224,0,Rites of Passage,14
2,276727,446520802,0,The Notebook,650
3,278418,446520802,0,The Notebook,650
4,638,446520802,0,The Notebook,650


In [17]:
# Render floats with three decimals in every pandas display from here on.
pd.set_option('display.float_format', '{:.3f}'.format)
# Summary statistics of the number of ratings per title.
rating_count_summary = book_ratingCount['totalRatingCount'].describe()
print(rating_count_summary)

count   241071.000
mean         4.277
std         16.739
min          1.000
25%          1.000
50%          1.000
75%          3.000
max       2502.000
Name: totalRatingCount, dtype: float64


#### The median book has only been rated one time. Let's take a look at the top of the distribution.

In [18]:
# Inspect the upper tail of the distribution: quantiles from 0.90 upward in
# steps of 0.01.
upper_quantiles = np.arange(.9, 1, .01)
print(book_ratingCount['totalRatingCount'].quantile(upper_quantiles))

0.900    7.000
0.910    8.000
0.920    9.000
0.930   10.000
0.940   11.000
0.950   13.000
0.960   16.000
0.970   20.000
0.980   29.000
0.990   50.000
Name: totalRatingCount, dtype: float64


#### So about 1% of books have 50 or more ratings, and about 2% have 29 or more. Since we have so many books in our data, we will limit it to the top 1%, which will give us 2713 different books.

In [19]:
# Minimum number of ratings a title needs to count as "popular".
popularity_threshold = 50
# Keep only the ratings whose title meets the threshold.
is_popular = rating_with_totalRatingCount['totalRatingCount'] >= popularity_threshold
rating_popular_book = rating_with_totalRatingCount[is_popular]
rating_popular_book.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,totalRatingCount
2,276727,446520802,0,The Notebook,650
3,278418,446520802,0,The Notebook,650
4,638,446520802,0,The Notebook,650
5,3363,446520802,0,The Notebook,650
6,7158,446520802,10,The Notebook,650


#### Filtering to users in the US and Canada only

In [24]:
# Look at the users table again before joining it onto the ratings.
user.head()

Unnamed: 0,1,"nyc, new york, usa",\N
0,2,"stockton, california, usa",18
1,3,"moscow, yukon territory, russia",\N
2,4,"porto, v.n.gaia, portugal",17
3,5,"farnborough, hants, united kingdom",\N
4,6,"santa monica, california, usa",61


In [29]:
# Attach each rater's profile. The left join keeps ratings whose userID has no
# match in `user`; those rows end up with a missing (NaN) Location.
combined = rating_popular_book.merge(user, on='userID', how='left')
# Keep only rows whose Location mentions "usa" or "canada". `na=False` makes a
# missing Location count as "no match", so the mask is strictly boolean;
# without it NaN entries in the mask make the boolean indexing below raise.
in_us_or_canada = combined['Location'].str.contains("usa|canada", na=False)
# Age is not used in the rest of the analysis.
us_canada_user_rating = combined[in_us_or_canada].drop('Age', axis=1)
us_canada_user_rating.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,totalRatingCount,Location
1,278418,446520802,0,The Notebook,650,"omaha, nebraska, usa"
2,638,446520802,0,The Notebook,650,"san diego, california, usa"
3,3363,446520802,0,The Notebook,650,"knoxville, tennessee, usa"
4,7158,446520802,10,The Notebook,650,"omaha, nebraska, usa"
5,8253,446520802,10,The Notebook,650,"tulsa, oklahoma, usa"


In [30]:
# A (userID, bookTitle) pair may occur more than once. When that happens, keep
# only the first row for each pair and report how many rows were dropped.
dedupe_keys = ['userID', 'bookTitle']
if us_canada_user_rating.duplicated(dedupe_keys).any():
    initial_rows = len(us_canada_user_rating)
    print('Initial dataframe shape {0}'.format(us_canada_user_rating.shape))
    us_canada_user_rating = us_canada_user_rating.drop_duplicates(dedupe_keys)
    current_rows = len(us_canada_user_rating)
    print('New dataframe shape{0}'.format(us_canada_user_rating.shape))
    print('Removed {0} rows'.format(initial_rows - current_rows))

Initial dataframe shape (251614, 6)
New dataframe shape(248948, 6)
Removed 2666 rows


In [31]:
# Item-user matrix: one row per book title, one column per user, with 0
# standing in for "this user did not rate this book".
us_canada_user_rating_pivot = (
    us_canada_user_rating
    .pivot(index='bookTitle', columns='userID', values='bookRating')
    .fillna(0)
)
# Compressed sparse row copy of the same values.
us_canada_user_rating_matrix = csr_matrix(us_canada_user_rating_pivot.values)

In [33]:
from sklearn.neighbors import NearestNeighbors
# Brute-force nearest-neighbour search using cosine distance between the rows
# (book titles) of the sparse rating matrix.
model_knn=NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(us_canada_user_rating_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [35]:
# Pick one book (a row of the title x user matrix) at random as the query.
# NOTE(review): no random seed is set in this notebook, so a different book is
# drawn on every run.
query_index = np.random.choice(us_canada_user_rating_pivot.shape[0])

# `Series.reshape` was deprecated in pandas 0.19 and removed in later
# releases, so reshape the underlying NumPy array instead; this works on both
# old and new pandas and yields the same (1, n_users) query row.
query_vector = us_canada_user_rating_pivot.iloc[query_index, :].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=6)

print('Recommendation for {0}:\n'.format(us_canada_user_rating_pivot.index[query_index]))
# The first neighbour is skipped, as in the original loop -- presumably it is
# the query book itself; the remaining five are listed with their distances.
neighbours = zip(indices.flatten()[1:], distances.flatten()[1:])
for i, (book_pos, distance) in enumerate(neighbours, 1):
    print('{0}:{1}, with distance of {2}:'.format(i, us_canada_user_rating_pivot.index[book_pos], distance))

Recommendation for Their Eyes Were Watching God: A Novel:

1:A Natural History of the Senses, with distance of 0.882007219778:
2:The Red Room, with distance of 0.892106990628:
3:Plague Dogs, with distance of 0.900317370223:
4:Invisible Man, with distance of 0.901310440746:
5:Oranges Are Not the Only Fruit, with distance of 0.902584301165:


  


In [36]:
# Same ratings in the transposed layout: one row per user, one column per
# book title, unrated cells filled with 0.
us_canada_user_rating_pivot2 = (
    us_canada_user_rating
    .pivot(index='userID', columns='bookTitle', values='bookRating')
    .fillna(0)
)

In [37]:
# Preview the first rows of the user x title matrix.
us_canada_user_rating_pivot2.head()

bookTitle,10 Lb. Penalty,16 Lighthouse Road,1984,1st to Die: A Novel,2010: Odyssey Two,204 Rosewood Lane,2061: Odyssey Three,24 Hours,2nd Chance,3rd Degree,...,YOU BELONG TO ME,Year of Wonders,You Belong To Me,You Shall Know Our Velocity,Young Wives,Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,Zoya,"\O\"" Is for Outlaw""","\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",stardust
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# (number of users, number of book titles)
us_canada_user_rating_pivot2.shape

(40016, 2442)

In [41]:
# Transpose the user x title array so that books are rows and users are
# columns.
X = np.transpose(us_canada_user_rating_pivot2.values)
X.shape

(2442, 40016)

In [42]:
import sklearn
from sklearn.decomposition import TruncatedSVD

# Reduce each book's row of X to 12 components with truncated SVD;
# random_state is pinned to 17.
SVD =TruncatedSVD(n_components=12, random_state=17)
# `matrix` holds one 12-component row per book.
matrix =SVD.fit_transform(X)
matrix.shape

(2442, 12)

In [43]:
import warnings

# Silence every RuntimeWarning from here on (process-wide filter).
# NOTE(review): presumably aimed at warnings np.corrcoef emits for rows with
# zero variance -- confirm.
warnings.simplefilter("ignore", RuntimeWarning)
# Pairwise Pearson correlation between the rows of `matrix` (one row per book).
corr = np.corrcoef(matrix)
corr.shape

(2442, 2442)

In [44]:
# Book titles in the column order of the user x title pivot, which is also the
# row order of X, `matrix` and `corr`.
us_canada_book_title = us_canada_user_rating_pivot2.columns
us_canada_book_list = us_canada_book_title.tolist()
# Position of the query book within that ordering.
coffey_hands = us_canada_book_list.index(
    "The Green Mile: Coffey's Hands (Green Mile Series)"
)
print(coffey_hands)

1906


In [45]:
# Row of the correlation matrix for the query book: its correlation with
# every book, in title order.
corr_coffey_hands = corr[coffey_hands, :]

In [46]:
# Titles whose latent-space correlation with the query book exceeds 0.9.
similar_mask = corr_coffey_hands > 0.9
# Exclude the query book by position instead of testing `< 1.0`: the
# self-correlation is a float that may come out marginally below 1 (which
# would list the query as its own recommendation), and another title that
# correlates at exactly 1.0 should not be dropped.
similar_mask[coffey_hands] = False
list(us_canada_book_title[similar_mask])

[u'Cold Fire',
 u'Desperation',
 u'Needful Things',
 u'Rose Madder',
 u'The Bachman Books: Rage, the Long Walk, Roadwork, the Running Man',
 u'The Dark Half',
 u'The Dead Zone',
 u'The Green Mile: Coffey on the Mile (Green Mile Series)',
 u'The Green Mile: Night Journey (Green Mile Series)',
 u'The Green Mile: The Bad Death of Eduard Delacroix (Green Mile Series)',
 u'The Green Mile: The Mouse on the Mile (Green Mile Series)',
 u'The Shining',
 u'The Two Dead Girls (Green Mile Series)']