In [2]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

## initialization

In [3]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2025-02-28 11:19:05--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.3.33, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip.2’


2025-02-28 11:19:05 (174 MB/s) - ‘book-crossings.zip.2’ saved [26085508/26085508]

Archive:  book-crossings.zip
replace BX-Book-Ratings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [4]:
# import csv data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [5]:
# quick check of the columns for books
print(df_books.head())

         isbn                                              title  \
0  0195153448                                Classical Mythology   
1  0002005018                                       Clara Callan   
2  0060973129                               Decision in Normandy   
3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
4  0393045218                             The Mummies of Urumchi   

                 author  
0    Mark P. O. Morford  
1  Richard Bruce Wright  
2          Carlo D'Este  
3      Gina Bari Kolata  
4       E. J. W. Barber  


In [6]:
# quick check of the columns for reviews
print(df_ratings.head())

     user        isbn  rating
0  276725  034545104X     0.0
1  276726  0155061224     5.0
2  276727  0446520802     0.0
3  276729  052165615X     3.0
4  276729  0521795028     6.0


## user filtering

In [65]:
ratings = df_ratings['user'].value_counts() # amount of ratings per user

# now we get a distribution in groups to check data distribution (adjusting bins)
ratings_users_bins = pd.cut(ratings,
                            bins=[0, 10, 100, 200, 500, 1000, 5000, 10000],
                            labels=['0-10', '11-100', '101-200', '201-500', '501-1000', '1001-5000', '5001+'],
                            right=False)
ratings_users_distribution = ratings_users_bins.value_counts().sort_index()

print(ratings_users_distribution)

count
0-10         92186
11-100       11250
101-200        942
201-500        618
501-1000       170
1001-5000      112
5001+            4
Name: count, dtype: int64


Given there are many users with few reviews (and therefore examples to train), we can keep only those with a certain amount of reviews. This bar is adjustable.

In [68]:
users_bar_limit = 1000

In [75]:
users_filtered = ratings[ratings >= users_bar_limit].index  # obtain the indexes of the users with more than user_bar_limit reviews

print('users before filter:',ratings.shape[0])
print('users after filter:',len(users_filtered))

users before filter: 105283
users after filter: 117


In [92]:
df_ratings_filtered = df_ratings[df_ratings['user'].isin(users_filtered)]   # filter with those indexes

## book filtering

In [107]:
book_review_counts = df_ratings['isbn'].value_counts()

# now we get a distribution in groups to check data distribution (adjusting bins)
ratings_books_bins = pd.cut(ratings,
                            bins=[0, 10, 50, 100, 200, 500, 1000, 2000],
                            labels=['0-10', '11-50', '51-100', '101-200', '201-500', '501-1000', '1001+'],
                            right=False)
ratings_books_distribution = ratings_books_bins.value_counts().sort_index()

print(ratings_books_distribution)

count
0-10        92186
11-50        9670
51-100       1580
101-200       942
201-500       618
501-1000      170
1001+          88
Name: count, dtype: int64


In [136]:
books_bar_limit = 1000

In [137]:
books_filtered = book_review_counts[book_review_counts >= books_bar_limit].index  # obtain the indexes of the books with more than books_bar_limit reviews

print('books before filter:',df_books.shape[0])
print('books after filter:',len(books_filtered))

books before filter: 271379
books after filter: 2


In [138]:
df_books_filtered = df_ratings[df_ratings['isbn'].isin(books_filtered)]   # filter with those indexes
print(len(df_books_filtered))

3797


In [139]:
df_books_filtered = df_books_filtered[df_books_filtered.isbn.isin(df_books.isbn)]   # remove books not in df_books
print(len(df_books_filtered))

3797


## merge after filtering

In [140]:
df_merge_filtered = df_ratings_filtered.merge(df_books_filtered, on="isbn", how="left")
df_merge_filtered = df_merge_filtered.drop(columns=['user_y', 'rating_y'])

print(df_merge_filtered.head())

   user_x        isbn  rating_x
0  278418  0006128831       0.0
1  278418  0006542808       5.0
2  278418  0020209606       0.0
3  278418  0020418809       0.0
4  278418  0020420900       0.0


In [141]:
print(f'N. of books: {df_merge_filtered.isbn.nunique()}',
      f'N. of users: {df_merge_filtered.user_x.nunique()}',
      sep='\n')

N. of books: 120694
N. of users: 117


## model

In [None]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):


  return recommended_books

In [None]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()