In [1]:
import numpy as np
import pandas as pd
import math
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import matplotlib as mpl
from bokeh.plotting import figure, output_file, show
from bokeh.palettes import magma
import pandas as pd

In [2]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2024-09-06 16:43:34--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 172.67.70.149, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2024-09-06 16:43:35 (31.8 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [3]:
# import csv data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})


In [4]:
# 1- Find all the books ISBN with same title and author
foo = df_books.groupby(['title', 'author']).agg(copies=('isbn', list), count=('isbn', 'count'))

# 2- Update all the ratings to point to the 1st ISBN found for that book
isbn_list = foo.query('count > 3')['copies'].tolist();
for ids in isbn_list:
  anchor, to_replace_list = ids[0], ids[1:]
  df_ratings.loc[df_ratings['isbn'].isin(to_replace_list), 'isbn'] = anchor

In [5]:
# CLEAN UP THE DATA

duplicate_books = df_books.groupby(['title', 'author']).title.agg(['count']).reset_index().query('count > 1')
duplicates_books_count = duplicate_books['count'].sum() - len(duplicate_books)

duplicate_ratings = df_ratings.groupby(['isbn', 'user']).isbn.agg(['count']).reset_index().query('count > 1')
duplicates_ratings_count = duplicate_ratings['count'].sum() - len(duplicate_ratings)

## Modify the DF to drop duplicate or irrelevant rows
df_books = df_books.drop_duplicates(subset=['title', 'author'])
df_ratings = df_ratings.drop_duplicates(subset=['isbn', 'user'])

print("Found and removed {:,} duplicate copies of books and {:,} duplicate copies of ratings".format(duplicates_books_count, duplicates_ratings_count))

## Books
books_count_before = len(df_books)
books_with_ratings = df_books.merge(df_ratings, on='isbn')
grouped_by_isbn = books_with_ratings.groupby(['isbn', 'title']).rating.agg(['count', 'mean']).reset_index()
books_min_count = 100
acceptable_books = grouped_by_isbn.query('count >= {}'.format(books_min_count))['isbn'].tolist()
grouped_by_isbn = grouped_by_isbn[grouped_by_isbn['isbn'].isin(acceptable_books)]
df_books = df_books[df_books['isbn'].isin(acceptable_books)]
books_count_after = len(df_books)
b_percent_change = round((books_count_before-books_count_after)/books_count_before*100, 2)
print('Removed {:,} rows ({}%) of books with less than {} reviews'.format(books_count_before - books_count_after, b_percent_change, books_min_count))

## Users
users_count_before = len(df_ratings)
ratings_min_count = 200;
df_ratings = df_ratings[df_ratings['isbn'].isin(acceptable_books)]
acceptable_users = df_ratings.groupby(['user']).rating.agg(['count']).reset_index().query('count >= {}'.format(ratings_min_count))['user'].tolist()
df_ratings = df_ratings[df_ratings['user'].isin(acceptable_users)]
users_count_after = len(df_ratings)
u_percent_change = round((users_count_before-users_count_after)/users_count_before*100,2)
print('Removed {:,} rows ({}%) of user ratings with less than {} reviews per account or invalid books'.format(users_count_before - users_count_after, u_percent_change, ratings_min_count))

Found and removed 20,175 duplicate copies of books and 1,385 duplicate copies of ratings
Removed 250,496 rows (99.72%) of books with less than 100 reviews
Removed 1,142,576 rows (99.49%) of user ratings with less than 200 reviews per account or invalid books


In [6]:
### REVIEWS AND USER BEHAVIOR GRAPHS AND VISUALS

grouped_by_user = df_ratings.groupby(['user'])
user_ratings = grouped_by_user['rating'].agg(['sum', 'count', 'mean']).reset_index()

print('--- BASIC STATS\n')
user_count = len(grouped_by_user['user'])
review_count = len(df_ratings)
most_active_list = user_ratings.sort_values(by='count')
most_active_user = most_active_list.iloc[-1];
print('There is over {:,} reviews in the database written by {:,} users'.format(review_count, user_count))
print('The most active user (ID: #{}) has written {:,} reviews\n'.format(most_active_user.name, int(most_active_user['count'])))


--- BASIC STATS

There is over 5,819 reviews in the database written by 22 users
The most active user (ID: #0) has written 635 reviews


--- WHERE DO MOST OF OUR REVIEWS COME FROM?



In [7]:
# FORMAT THE LAST DATA BEFORE CREATING MODAL
# import csv data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

df = df_ratings
counts1 = df['user'].value_counts()
counts2 = df['isbn'].value_counts()

df = df[~df['user'].isin(counts1[counts1 < 200].index)]
df = df[~df['isbn'].isin(counts2[counts2 < 100].index)]


merged_df = pd.merge(right=df, left = df_books, on="isbn")
merged_df = merged_df.drop_duplicates(subset=["title", "user"])

books_features_pivot = merged_df.pivot(
  index='title',
  columns='user',
  values='rating'
).fillna(0)

mat_books_features = csr_matrix(books_features_pivot.values)

In [8]:
## function to return recommended books - this will be tested
def get_recommends(book = "", n = 5):
  """
  make top n books recommendations
  Parameters
  ----------
  book: str, name of user input book
  n: int, top n recommendations
  """
  # Prepare for model
  pivot = books_features_pivot
  titles = list(pivot.index.values)
  data = pivot.values

  def title_2_index(title):
    ind = titles.index(title)
    return data[ind,:]

  def index_2_title(ind):
    return titles[ind]


  # Build model
  model = NearestNeighbors(metric="cosine",algorithm="brute", p=2)
  model.fit(data)

  # Run model to get recommendations
  idx = title_2_index(book)
  distances, indices = model.kneighbors(
    np.reshape(idx,[1,-1]),
    n_neighbors=n+1
  )

  raw_recommends = sorted(
    list(
      zip(
        indices.squeeze().tolist(),
        distances.squeeze().tolist()
      )
    ),
    key=lambda x: x[1]
  )[1:]

  # print results
  recommended_books = []
  print('Recommendations for {}:'.format(book))
  for i, (idx, dist) in enumerate(raw_recommends):
      dist = dist
      recommended_books.append([index_2_title(idx), dist])
      print('{0}: {1}, with distance of {2:,.2f}'.format(i+1, index_2_title(idx), dist))
  print('-----------------')
  return [book, recommended_books]

In [11]:
def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("The Queen of the Damned (Vampire Chronicles (Paperback))", 10)
  if recommends[0] != "The Queen of the Damned (Vampire Chronicles (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True', 'The Lovely Bones: A Novel']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77, 0.72]
  recommended_books.reverse()
  recommended_books_dist.reverse()

  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

Recommendations for The Queen of the Damned (Vampire Chronicles (Paperback)):
1: The Vampire Lestat (Vampire Chronicles, Book II), with distance of 0.52
2: The Tale of the Body Thief (Vampire Chronicles (Paperback)), with distance of 0.54
3: Interview with the Vampire, with distance of 0.73
4: The Witching Hour (Lives of the Mayfair Witches), with distance of 0.74
5: Catch 22, with distance of 0.79
6: Lasher: Lives of the Mayfair Witches (Lives of the Mayfair Witches), with distance of 0.80
7: The Gunslinger (The Dark Tower, Book 1), with distance of 0.81
8: Neuromancer (Remembering Tomorrow), with distance of 0.81
9: The Search, with distance of 0.83
10: Purity in Death, with distance of 0.84
-----------------
You haven't passed yet. Keep trying!
