# Recommender Systems 2 - Collaborative Model-based Filtering

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD

### Load (cleaned) data

In [2]:
# Load books
books = pd.read_csv('data/BX-Books_cleaned.csv', encoding="latin-1", low_memory=False)

# Load users
users = pd.read_csv('data/BX-Users_cleaned.csv', encoding="latin-1", low_memory=False)

# Load ratings
ratings = pd.read_csv('data/BX-Book-Ratings_cleaned.csv', encoding="latin-1", low_memory=False)

### Examine data

In [3]:
pd.set_option('display.max_colwidth', 100)

#### books (cleaned)

In [4]:
books.head()

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


#### users (cleaned)

In [5]:
users.head()

Unnamed: 0,userID,Location,Age,country
0,1,"nyc, new york, usa",34,usa
1,2,"stockton, california, usa",18,usa
2,4,"porto, v.n.gaia, portugal",17,portugal
3,5,"farnborough, hants, united kingdom",34,united kingdom
4,6,"santa monica, california, usa",61,usa


#### ratings (cleaned)

In [6]:
ratings.head()

Unnamed: 0,userID,ISBN,bookRating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276733,2080674722,0
4,276744,038550120X,7


### Generate rating statistics

In [7]:
# Calculate the average rating for each book
ratings_stats = pd.DataFrame(ratings.groupby('ISBN')['bookRating'].mean())

# Calculate the number of ratings (including bookRating=0) for each book
ratings_stats['num_ratings'] = pd.DataFrame(ratings.groupby('ISBN')['bookRating'].count())

# Rename column and reset index
ratings_stats.rename(columns={"bookRating" : "avg_rating"}, inplace=True)
ratings_stats.reset_index(inplace=True)

ratings_stats.head()

Unnamed: 0,ISBN,avg_rating,num_ratings
0,0000913154,8.0,1
1,0001010565,0.0,2
2,0001046438,9.0,1
3,0001046713,0.0,1
4,000104687X,6.0,1


### Filter to include only the most popular books (by number of ratings)

In [8]:
# Combine ratings_stats and books
ratings_books_counts = pd.merge(ratings_stats, books, on='ISBN')

# Drop unnecessary columns
columns = ['yearOfPublication', 'publisher', 'bookAuthor']
ratings_books_counts.drop(columns, axis=1, inplace=True)
ratings_books_counts.sample(5)

Unnamed: 0,ISBN,avg_rating,num_ratings,bookTitle
112832,553141430,0.0,1,White Shaman
126042,590921568,5.5,4,The Magic School Bus Out of This World: A Book About Space Rocks (Magic School Bus)
94555,449210014,0.0,3,Monday the Rabbi Took Off (Rabbi Small Mysteries)
69457,385500904,3.0,2,"The Disastrous Mrs. Weldon: The Life, Loves, and Lawsuits of a Legendary Victorian"
154095,743202457,0.0,2,"Bookends : Two Women, One Enduring Friendship"


In [9]:
ratings_books_counts['num_ratings'].describe()

count    264373.000000
mean          3.694920
std          13.200533
min           1.000000
25%           1.000000
50%           1.000000
75%           3.000000
max        2322.000000
Name: num_ratings, dtype: float64

We have over 260,000 books, but the median (50th percentile) book has only 1 rating.  
Let’s review the top 10% of the distribution.

In [10]:
ratings_books_counts['num_ratings'].quantile(np.arange(.9, 1, .01))

0.90     6.0
0.91     7.0
0.92     8.0
0.93     9.0
0.94    10.0
0.95    11.0
0.96    14.0
0.97    17.0
0.98    24.0
0.99    40.0
Name: num_ratings, dtype: float64

Only 4% of books have 14 or more ratings.  
Since we have so many books AND to speed up computation, let's limit our book recommendations to be based on the top 4% of books.

In [11]:
popularity_threshold = 14     # Set number of ratings threshold
most_popular_books = ratings_books_counts.query('num_ratings >= @popularity_threshold')
most_popular_books.shape

(10629, 4)

We still have over 10,000 different popular books

In [12]:
most_popular_books.sample(5)

Unnamed: 0,ISBN,avg_rating,num_ratings,bookTitle
121492,058351149X,1.318182,22,Worst Fears
1143,000716226X,2.9375,16,The Bride Stripped Bare : A Novel
165337,0786015233,0.8125,16,The Jasmine Trade
98999,0451206673,2.763636,55,Pen Pals
119322,0553574639,3.206349,63,The Main Corpse (Culinary Mysteries (Paperback))


In [13]:
# Merge most popular books with the individual ratings for each of these books
most_popular_books_ratings = most_popular_books.merge(ratings, \
                                                      left_on = 'ISBN', \
                                                      right_on = 'ISBN', \
                                                      how = 'left')

most_popular_books_ratings.shape

(418105, 6)

In [14]:
most_popular_books_ratings.head()

Unnamed: 0,ISBN,avg_rating,num_ratings,bookTitle,userID,bookRating
0,2558122,4.111111,18,Angelas Ashes,278474,0
1,2558122,4.111111,18,Angelas Ashes,27782,0
2,2558122,4.111111,18,Angelas Ashes,59616,0
3,2558122,4.111111,18,Angelas Ashes,71837,10
4,2558122,4.111111,18,Angelas Ashes,75187,0


In [15]:
# Remove duplicate ratings of userID and bookTitle
initial_rows = most_popular_books_ratings.shape[0]
most_popular_books_ratings = most_popular_books_ratings.drop_duplicates(['userID', 'bookTitle'])
current_rows = most_popular_books_ratings.shape[0]
print('Removed {0} rows'.format(initial_rows - current_rows))

Removed 1929 rows


### Recommendations based on NearestNeighbors model
#### Create user-item interactions matrix

In [16]:
%%time

# For the most popular books, create a pivot table with bookTitle as the index
most_popular_books_ratings_pivot = most_popular_books_ratings.pivot(index = 'bookTitle', \
                                                                    columns = 'userID', \
                                                                    values = 'bookRating').fillna(0)

most_popular_books_ratings_pivot.shape

CPU times: user 6.59 s, sys: 11.3 s, total: 17.9 s
Wall time: 15.4 s


(9426, 56342)

In [17]:
most_popular_books_ratings_pivot.head()

userID,8,9,10,14,16,17,23,26,32,39,...,278831,278832,278836,278838,278843,278844,278846,278849,278851,278854
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Q-Space (Star Trek The Next Generation, Book 47)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01-01-00: The Novel of the Millennium,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"1,000 Places to See Before You Die",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Prepare data for NearestNeighbors model

In [18]:
%%time

# Convert dataframe to a compressed sparse row (CSR) matrix
most_popular_books_ratings_matrix = csr_matrix(most_popular_books_ratings_pivot.values)

type(most_popular_books_ratings_matrix)

CPU times: user 4.55 s, sys: 634 ms, total: 5.19 s
Wall time: 5.36 s


scipy.sparse._csr.csr_matrix

#### Train NearestNeighbors model

In [19]:
model_knn = NearestNeighbors(n_neighbors=7, metric='cosine', algorithm='brute')
model_knn.fit(most_popular_books_ratings_matrix)

In [20]:
# Select a book at random
query_index = np.random.choice(most_popular_books_ratings_pivot.shape[0])
print('query index = {0}, book title = {1}'.format(query_index, \
                                                   most_popular_books_ratings_pivot.index[query_index]))

query index = 7620, book title = The League of Extraordinary Gentlemen, Vol. 1


In [21]:
# Look up the random book's 5 nearest neighbors
distances, indices = model_knn.kneighbors(most_popular_books_ratings_pivot.iloc[query_index, :] \
                                                                          .values.reshape(1, -1), \
                                          n_neighbors=6)

for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(most_popular_books_ratings_pivot.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, most_popular_books_ratings_pivot. \
                                                          index[indices.flatten()[i]], \
                                                       distances.flatten()[i]))

Recommendations for The League of Extraordinary Gentlemen, Vol. 1:

1: Foundation (Foundation Series ; Bk 1), with distance of 0.7972382300701215:
2: Native Son: And How Bigger Was Born, with distance of 0.8035295206324354:
3: Preludes and Nocturnes (Sandman, Book 1), with distance of 0.8424617091959463:
4: Mattimeo (Redwall, Book 3), with distance of 0.844829520860003:
5: The Fermata, with distance of 0.8535857032695022:


### Recommendations based on TruncatedSVD model
#### Create user-item interactions matrix

In [22]:
%%time

# For the most popular books, create a pivot table with bookTitle as the columns
most_popular_books_ratings_pivot_2 = most_popular_books_ratings.pivot(index = 'userID', \
                                                                      columns = 'bookTitle', \
                                                                      values = 'bookRating').fillna(0)

most_popular_books_ratings_pivot_2.shape

CPU times: user 6.9 s, sys: 11.7 s, total: 18.6 s
Wall time: 14.9 s


(56342, 9426)

In [23]:
most_popular_books_ratings_pivot_2.head()

bookTitle,"Q-Space (Star Trek The Next Generation, Book 47)",'Salem's Lot,01-01-00: The Novel of the Millennium,"1,000 Places to See Before You Die",10 Lb. Penalty,1001 Ways to Be Romantic,101 Dalmatians,"14,000 Things to Be Happy About",16 Lighthouse Road,1916: A Novel of the Irish Rebellion (Irish Century),...,ZwÃ?ÃÂ¶lf.,"\ Lamb to the Slaughter and Other Stories (Penguin 60s S.)""","\A\"" is for Alibi : A Kinsey Millhone Mystery (A Kinsey Millhone Mystery)""","\Girls' Night Out\""/\""Boys' Night In\""""","\O\"" Is for Outlaw""","\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",e,iI Paradiso Degli Orchi,one hundred years of solitude,stardust
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Transpose pivot table for TruncatedSVD model training
X = most_popular_books_ratings_pivot_2.values.T
type(X)

numpy.ndarray

In [25]:
X.shape

(9426, 56342)

In [26]:
X[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

#### Train TruncatedSVD model selecting the 10 most important features

In [27]:
%%time

SVD = TruncatedSVD(n_components=10, random_state=42)
matrix = SVD.fit_transform(X)

CPU times: user 53.2 s, sys: 8.65 s, total: 1min 1s
Wall time: 25 s


In [28]:
type(matrix)

numpy.ndarray

In [29]:
matrix.shape

(9426, 10)

In [30]:
matrix[:5]

array([[ 0.38349165,  0.1525891 , -0.11319077, -0.0265984 , -0.09070456,
        -0.27559351,  0.2731215 , -0.33185272, -0.15560028,  0.0258307 ],
       [ 1.57621271,  0.56101198,  0.75164228, -1.64425993, -1.77106732,
         1.57340537,  3.00200246, -0.76725623,  0.54596591,  0.67262568],
       [ 0.25638425, -0.08060614,  0.03058778, -0.35003939, -0.14680605,
         0.07656018,  0.21961271,  0.12170965,  0.05408222, -0.08809896],
       [ 0.37387606, -0.13082815,  0.02378199, -0.34225383,  0.11299526,
        -0.08401051, -0.14051341,  0.13170381,  0.37164311, -0.52551389],
       [ 1.60655375, -0.17401017,  0.37490947, -0.01812349, -1.3256422 ,
         0.63161141, -1.22272657,  0.15261682, -1.87031211,  0.03157141]])

#### Generate Pearson correlation coefficients

In [31]:
%%time

corr = np.corrcoef(matrix)
corr.shape

CPU times: user 1.39 s, sys: 271 ms, total: 1.66 s
Wall time: 1.21 s


(9426, 9426)

In [32]:
# Get bookTitles for the most popular books
most_popular_books_ratings_titles = most_popular_books_ratings_pivot_2.columns
most_popular_books_ratings_titles_list = list(most_popular_books_ratings_titles)

#### Make recommendations using trained TruncatedSVD model for '1984'

In [33]:
# Get the index for book: '1984'
book_index = most_popular_books_ratings_titles_list.index("1984")
print(book_index)

10


In [34]:
# Get all the Pearson correlations for this book
book_correlations = corr[book_index]
book_correlations

array([0.94584215, 0.64591854, 0.44220136, ..., 0.45164203, 0.69089957,
       0.74970953])

In [35]:
# Get only those books with Pearson correlations >0.95 AND excluding this book itself
most_popular_books_ratings_titles[(book_correlations<1.0) & (book_correlations>0.95)]

Index(['American Hero', 'Animal Farm', 'Brave New World', 'Cancer Schmancer',
       'Comment je suis devenu stupide',
       'Dave Barry's Bad Habits a 100% Fact-Free Book', 'Delta Of Venus',
       'Der Campus.', 'Dreams Underfoot',
       'Driving Mr. Albert: A Trip Across America with Einstein's Brain',
       'Dune (Remembering Tomorrow)', 'EXQUISITE CORPSE', 'Emma',
       'Ender's Game (Ender Wiggins Saga (Paperback))',
       'Extension Du Domain De La Lutte', 'FIREBRAND',
       'Flowers for Algernon (Bantam Classic)',
       'From the Dust Returned: A Novel', 'Hocus Pocus', 'I, Robot',
       'In the Forests of the Night (Laurel Leaf Books)', 'Lord of the Flies',
       'Mostly Harmless (Hitchhiker's Trilogy, No 5)',
       'Notes from a Small Island', 'Post Office',
       'Pygmalion : A Romance in Five Acts',
       'Rise of a Merchant Prince (Serpentwar Saga)',
       'Slaughterhouse Five or the Children's Crusade: A Duty Dance With Death',
       'Something Wicked This Wa

#### Make recommendations using trained TruncatedSVD model for 'The Shining'

In [36]:
# Get the index for book: 'The Shining'
book_index = most_popular_books_ratings_titles_list.index("The Shining")

# Get all the Pearson correlations for this book
book_correlations = corr[book_index]

# Get only those books with Pearson correlations >0.95 AND excluding this book itself
most_popular_books_ratings_titles[(book_correlations<1.0) & (book_correlations>0.95)]

Index(['Christine', 'Firestarter', 'It', 'Needful Things',
       'The Green Mile: The Bad Death of Eduard Delacroix (Green Mile Series)'],
      dtype='object', name='bookTitle')

#### Make recommendations using trained TruncatedSVD model for 'The Rainmaker'

In [37]:
# Get the index for book: 'The Rainmaker'
book_index = most_popular_books_ratings_titles_list.index("The Rainmaker")
print(book_index)

# Get all the Pearson correlations for this book
book_correlations = corr[book_index]

# Get only those books with Pearson correlations >0.95 AND excluding this book itself
most_popular_books_ratings_titles[(book_correlations<1.0) & (book_correlations>0.95)]

8008


Index(['Across Five Aprils', 'Beaches', 'Criminal Justice',
       'Desperate Measures (Barbara Holloway Novels (Paperback))',
       'Guilt by Association', 'Hart's War',
       'IN THE MEANTIME : Finding Yourself and the Love You Want',
       'Lie Down With Lions (Signet)', 'Now or Never',
       'SILENT PASSAGE: MENOPAUSE : SILENT PASSAGE: MENOPAUSE',
       'Shadow Watch (Tom Clancy's Power Plays (Paperback))', 'Snow Angel',
       'Spring Collection', 'The Chamber', 'The Deceiver',
       'The Eleventh Commandment', 'The Falls (Inspector Rebus S.)',
       'The Firm',
       'The Higher Taste: A Guide to Gourmet Vegetarian Cooking and a Karma-Free Diet',
       'The Partner', 'The Pelican Brief', 'The Runaway Jury',
       'The Street Lawyer',
       'Tick Tock, You're Dead! (Give Yourself Goosebumps, No 2)',
       'Tom Clancy's Op-Center Balance of Power (Tom Clancy's Op Center (Paperback))',
       'Tom Clancy's Op-Center: Line of Control (Tom Clancy's Op Center (Paperback))',