# Recommender Systems 1 - Popularity and Collaborative Memory-based Filtering

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

### Load (cleaned) Data

In [2]:
# Load books
books = pd.read_csv('data/BX-Books_cleaned.csv', encoding="latin-1", low_memory=False)

# Load users
users = pd.read_csv('data/BX-Users_cleaned.csv', encoding="latin-1", low_memory=False)

# Load ratings
ratings = pd.read_csv('data/BX-Book-Ratings_cleaned.csv', encoding="latin-1", low_memory=False)

### Examine data

In [3]:
# Change display setting to display full text in columns
pd.set_option('display.max_colwidth', None)

#### books (cleaned)

In [4]:
books.head()

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


#### users (cleaned)

In [5]:
users.head()

Unnamed: 0,userID,Location,Age,country
0,1,"nyc, new york, usa",34,usa
1,2,"stockton, california, usa",18,usa
2,4,"porto, v.n.gaia, portugal",17,portugal
3,5,"farnborough, hants, united kingdom",34,united kingdom
4,6,"santa monica, california, usa",61,usa


#### ratings (cleaned)

In [6]:
ratings.head()

Unnamed: 0,userID,ISBN,bookRating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276733,2080674722,0
4,276744,038550120X,7


### Generate rating statistics

In [7]:
# Calculate the average rating for each book
ratings_stats = pd.DataFrame(ratings.groupby('ISBN')['bookRating'].mean())

# Calculate the number of ratings (including bookRating=0) for each book
ratings_stats['num_ratings'] = pd.DataFrame(ratings.groupby('ISBN')['bookRating'].count())
ratings_stats.rename(columns={"bookRating" : "avg_rating"}, inplace=True)

ratings_stats.sample(10)

Unnamed: 0_level_0,avg_rating,num_ratings
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
345244168,8.0,2
708905307,0.0,1
451406184,2.5,2
671725505,0.888889,9
1558740708,8.0,1
2070403785,8.0,1
307102467,0.0,1
395730996,9.0,1
61067393,0.0,1
72124903,0.0,3


## Part 1: Recommendations based on popularity

### Recommendations based on top 100 books (by number of ratings)

In [8]:
# Get ISBNs for the 100 books with the most ratings ordered by number of ratings per book
most_rated_books_list = ratings_stats.sort_values('num_ratings', ascending=False).head(100).index

# Create dataframe with ISBNs for the most rated books
most_rated_books_df = pd.DataFrame(most_rated_books_list, columns = ['ISBN'])

# Merge most rated books with rating stats
most_rated_books_df = most_rated_books_df.join(ratings_stats, on='ISBN', how='inner')

# Merge most rated books with books
most_rated_books_df = pd.merge(most_rated_books_df, books, on='ISBN')
most_rated_books_df.shape

(100, 7)

In [9]:
most_rated_books_df.head()

Unnamed: 0,ISBN,avg_rating,num_ratings,bookTitle,bookAuthor,yearOfPublication,publisher
0,971880107,0.979759,2322,Wild Animus,Rich Shapero,2004,Too Far
1,316666343,4.458716,1199,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown"
2,385504209,4.624113,846,The Da Vinci Code,Dan Brown,2003,Doubleday
3,60928336,3.454161,709,Divine Secrets of the Ya-Ya Sisterhood: A Novel,Rebecca Wells,1997,Perennial
4,312195516,4.318182,682,The Red Tent (Bestselling Backlist),Anita Diamant,1998,Picador USA


#### Synopsis
The book that received the most ratings in this data set is Rich Shapero's "Wild Animus". Something in common among these five most rated books - they are fictions or novels. This recommendation system suggests that novels and fictions are popular and likely receive more ratings. And if someone likes "Wild Animus", we should probably recommend "The Lovely Bones: A Novel" to them.
#### Observation
Though "Wild Animus" received the most ratings. it is not highly rated at all.

### Recommendations based on top 100 books (by highest average ratings)

In [10]:
# Get ISBNs for the 100 books with the highest average rating ordered by rating
highest_rated_books_list = ratings_stats.sort_values('avg_rating', ascending=False).head(100).index

# Create dataframe with ISBNs for the highest rated books
highest_rated_books_df = pd.DataFrame(highest_rated_books_list, columns = ['ISBN'])

# Merge most rated books with rating stats
highest_rated_books_df = highest_rated_books_df.join(ratings_stats, on='ISBN', how='inner')

# Merge most rated books with books
highest_rated_books_df = pd.merge(highest_rated_books_df, books, on='ISBN')
highest_rated_books_df.shape

(100, 7)

In [11]:
highest_rated_books_df.head()

Unnamed: 0,ISBN,avg_rating,num_ratings,bookTitle,bookAuthor,yearOfPublication,publisher
0,1410732746,10.0,1,"Precious and Few: Volume I in the \Polly's Heartsongs\"" Trilogy""",Bonnie Sue Bradshaw,2003,Authorhouse
1,1550377310,10.0,1,Generals Die in Bed: A Story from the Trenches,Charles Yale Harrison,2002,Annick Press
2,534526772,10.0,1,"Sight, Sound, Motion: Applied Media Aesthetics",Herbert Zettl,1998,Wadsworth Publishing
3,534528252,10.0,1,The Place of Mind,Brian Cooney,1999,Wadsworth Publishing
4,534534600,10.0,1,Communication Between Cultures With Infotrac,Larry A. Samovar,2000,Thomson Learning


#### Synopsis
The highest rated books are different from the most rated books.
#### Observation
Some of the highest rated books have only 1 rating.

## Part 2: Recommendations based on Collaborative Memory-based filtering

#### To ensure statistical significance, EXCLUDE users with < 200 ratings and books with < 100 ratings

In [12]:
# Keep ONLY ratings from users with 200 or more ratings
counts_1 = ratings['userID'].value_counts()
ratings_2 = ratings[ratings['userID'].isin(counts_1[counts_1 >= 200].index)]

# Keep ONLY ratings for books with 100 or more ratings
counts_2 = ratings_2['bookRating'].value_counts()
ratings_2 = ratings_2[ratings_2['bookRating'].isin(counts_2[counts_2 >= 100].index)]
ratings_2.shape

(456460, 3)

In [13]:
ratings_2.head()

Unnamed: 0,userID,ISBN,bookRating
1133,277427,002542730X,10
1134,277427,0026217457,0
1135,277427,003008685X,8
1136,277427,0030615321,0
1137,277427,0060002050,0


#### Create user-item interactions matrix
Convert ratings dataframe into a 2D matrix.  
The matrix will be very sparse because not every user rates every book.

In [14]:
%%time

# Create pivot table with userID as the index, ISBNs as the columns and bookRating in the cells
ratings_2_pivot = ratings_2.pivot(index='userID', columns='ISBN').bookRating
ratings_2_pivot.shape

CPU times: user 753 ms, sys: 450 ms, total: 1.2 s
Wall time: 1.45 s


(797, 169543)

In [15]:
ratings_2_pivot.head()

ISBN,0000913154,0001010565,0001046438,000104687X,0001047213,0001047663,0001047868,0001047973,000104799X,0001048082,...,B0001FZGRQ,B0001FZGTO,B0001GDNCK,B0001GMSV2,B0001I1JII,B0001I1KOG,B0001PIOX4,B000234N3A,B000234N76,B000234NC6
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
254,,,,,,,,,,,...,,,,,,,,,,
2276,,,,,,,,,,,...,,,,,,,,,,
2766,,,,,,,,,,,...,,,,,,,,,,
2977,,,,,,,,,,,...,,,,,,,,,,
3363,,,,,,,,,,,...,,,,,,,,,,


#### Find which books are correlated with the 2nd most rated book "The Lovely Bones: A Novel".

In [16]:
# Get bookRatings for the 2nd most rated book "The Lovely Bones: A Novel"
ratings_selected_book = ratings_2_pivot['0316666343']
ratings_selected_book.head()

userID
254     NaN
2276    NaN
2766    9.0
2977    NaN
3363    0.0
Name: 0316666343, dtype: float64

In [17]:
%%time

# Generate Pearson correlation coefficients for the selected book
#  coefficients range between -1 and 1; 0 means no correlation
similar_books = ratings_2_pivot.corrwith(ratings_selected_book)     # default method='pearson'

# Convert results to a dataframe
similar_books_df = pd.DataFrame(similar_books, columns=['pearson'])
similar_books_df.head()

CPU times: user 25 s, sys: 216 ms, total: 25.2 s
Wall time: 25.6 s


Unnamed: 0_level_0,pearson
ISBN,Unnamed: 1_level_1
0000913154,
0001010565,
0001046438,
000104687X,
0001047213,


In [18]:
similar_books_df.shape

(169543, 1)

In [19]:
# Drop rows with nulls
similar_books_df.dropna(inplace=True)
similar_books_df.shape

(12363, 1)

In [20]:
# Join similar books with ratings_stats
similar_books_ratings_df = similar_books_df.join(ratings_stats['num_ratings'])
similar_books_ratings_df.head()

Unnamed: 0_level_0,pearson,num_ratings
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
0006157629,-1.0,7
0006485936,0.57735,10
0006492347,-1.0,5
000649840X,0.201809,81
000651202x,1.0,2


In [21]:
# Extract top 5 correlated books results, restrict to books with 300 or more ratings
most_similar_books = similar_books_ratings_df[similar_books_ratings_df['num_ratings']>=300]. \
                                              sort_values('pearson', ascending=False).head()
most_similar_books.reset_index(inplace=True)

# Display top 5 correlated books results
pd.merge(most_similar_books, books, on='ISBN')

Unnamed: 0,ISBN,pearson,num_ratings,bookTitle,bookAuthor,yearOfPublication,publisher
0,316666343,1.0,1199,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown"
1,312291639,0.479272,346,The Nanny Diaries: A Novel,Emma McLaughlin,2003,St. Martin's Griffin
2,316601950,0.430278,528,The Pilot's Wife : A Novel,Anita Shreve,1999,Back Bay Books
3,446610038,0.421779,381,1st to Die: A Novel,James Patterson,2002,Warner Vision
4,446672211,0.412331,547,Where the Heart Is (Oprah's Book Club (Paperback)),Billie Letts,1998,Warner Books


These results appear to be somewhat correlated with "The Lovely Bones".

#### Find which books are correlated with the 3rd most rated book "The Da Vinci Code".

In [22]:
%%time

# Get bookRatings for the 2nd most rated book "The Da Vinci Code"
ratings_selected_book = ratings_2_pivot['0385504209']

# Generate Pearson correlation coefficients for the selected book
similar_books = ratings_2_pivot.corrwith(ratings_selected_book)     # default method='pearson'
similar_books_df = pd.DataFrame(similar_books, columns=['pearson'])

# Drop rows with nulls
similar_books_df.dropna(inplace=True)

# Join similar books with ratings_stats
similar_books_ratings_df = similar_books_df.join(ratings_stats['num_ratings'])

# Extract top 5 correlated books results, restrict to books with 300 or more ratings
most_similar_books = similar_books_ratings_df[similar_books_ratings_df['num_ratings']>=300]. \
                                              sort_values('pearson', ascending=False).head()
most_similar_books.reset_index(inplace=True)

# Display top 5 correlated books results
pd.merge(most_similar_books, books, on='ISBN')

CPU times: user 23.5 s, sys: 209 ms, total: 23.7 s
Wall time: 24 s


Unnamed: 0,ISBN,pearson,num_ratings,bookTitle,bookAuthor,yearOfPublication,publisher
0,0385504209,1.0,846,The Da Vinci Code,Dan Brown,2003,Doubleday
1,044651652X,0.495333,352,The Bridges of Madison County,Robert James Waller,1992,Warner Books
2,044022165X,0.463824,373,The Rainmaker,JOHN GRISHAM,1996,Dell
3,0671021001,0.360846,442,She's Come Undone (Oprah's Book Club),Wally Lamb,1998,Pocket
4,0440222656,0.356361,404,The Horse Whisperer,Nicholas Evans,1996,Dell


These results do not appear as good.