<a href="https://colab.research.google.com/github/sofiaesc/fcc/blob/main/Copia_de_fcc_book_recommendation_knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [267]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv' # get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

In [322]:
# import csv data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [323]:
# quick check of the columns for books
print(df_books.head())

         isbn                                              title  \
0  0195153448                                Classical Mythology   
1  0002005018                                       Clara Callan   
2  0060973129                               Decision in Normandy   
3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
4  0393045218                             The Mummies of Urumchi   

                 author  
0    Mark P. O. Morford  
1  Richard Bruce Wright  
2          Carlo D'Este  
3      Gina Bari Kolata  
4       E. J. W. Barber  


In [324]:
# quick check of the columns for reviews
print(df_ratings.head())

     user        isbn  rating
0  276725  034545104X     0.0
1  276726  0155061224     5.0
2  276727  0446520802     0.0
3  276729  052165615X     3.0
4  276729  0521795028     6.0


In [325]:
# merge both datasets
df = df_ratings.merge(df_books,on="isbn",how="left")

## Filtering users

In [326]:
ratings_per_user = df["user"].value_counts()

# now we get a distribution in groups to check data distribution (adjusting bins)
ratings_users_bins = pd.cut(ratings_per_user,
                            bins=[0, 10, 50, 100, 200, 500, 1000, 2000],
                            labels=['0-10', '11-50', '51-100', '101-200', '201-500', '501-1000', '1001+'],
                            right=False)
ratings_users_distribution = ratings_users_bins.value_counts().sort_index()

print(ratings_users_distribution)

count
0-10        92186
11-50        9670
51-100       1580
101-200       942
201-500       618
501-1000      170
1001+          88
Name: count, dtype: int64


In [327]:
ratings_per_book = df["isbn"].value_counts()

# now we get a distribution in groups to check data distribution (adjusting bins)
ratings_books_bins = pd.cut(ratings_per_book,
                            bins=[0, 10, 50, 100, 200, 500, 1000, 2000],
                            labels=['0-10', '11-50', '51-100', '101-200', '201-500', '501-1000', '1001+'],
                            right=False)
ratings_books_distribution = ratings_books_bins.value_counts().sort_index()

print(ratings_books_distribution)

count
0-10        322237
11-50        16134
51-100        1454
101-200        536
201-500        175
501-1000        18
1001+            1
Name: count, dtype: int64


In [368]:
# having seen the distribution of reviews per user and reviews per book, we filter these as to have significant data:
users_filtered = ratings_per_user[ratings_per_user >= 200].index       # filter threshold determined arbitrarily, adjustable along with k
books_filtered = ratings_per_book[ratings_per_book >= 100].index

In [369]:
df_filtered = df.loc[(df["user"].isin(users_filtered.values)) & (df["isbn"].isin(books_filtered.values))] # filter with those user and books only
df_filtered = df_filtered.drop_duplicates(['title', 'user'])  # drop duplicates (review of the same book by the same person)

print(df_filtered.head())

        user        isbn  rating  \
1456  277427  002542730X    10.0   
1469  277427  0060930535     0.0   
1471  277427  0060934417     0.0   
1474  277427  0061009059     9.0   
1484  277427  0140067477     0.0   

                                                  title              author  
1456  Politically Correct Bedtime Stories: Modern Ta...   James Finn Garner  
1469                      The Poisonwood Bible: A Novel  Barbara Kingsolver  
1471                                 Bel Canto: A Novel        Ann Patchett  
1474  One for the Money (Stephanie Plum Novels (Pape...     Janet Evanovich  
1484                                    The Tao of Pooh       Benjamin Hoff  


In [370]:
df_pivot = df_filtered.pivot(index = 'title', columns = 'user', values = 'rating').fillna(0)  # NaN -> 0

# each row represents a book (indexed by 'title'),
# each column represents a user,
# the values are the ratings of the books by users.

In [371]:
df_matrix = csr_matrix(df_pivot.values)

# pivoted dataframe to a sparse matrix for memory efficiency

## Model

In [372]:
neigh = NearestNeighbors(metric='cosine')
neigh.fit(df_matrix)

In [380]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
    dist,idx = neigh.kneighbors([df_pivot.loc[book]], 5)  # with the model we get 5 nearest neighbors, one will be the same book

    recommendation_name = df_pivot.iloc[idx[0][1:]].index.to_list()
    #print(recommendation_name)   # debug print
    recommendation_dist = dist[0][1:].tolist()
    #print(recommendation_dist)   # debug print

    # setup the list to return based on the test case
    recommended_books = []

    for i in range(len(recommendation_name)-1,-1,-1): # list was giving the neighbors in asc order, flipping it gives it desc
      recommended_books.append([recommendation_name[i], recommendation_dist[i]])

    return [book, recommended_books]

# recommended_books = get_recommends(book="The Catcher in the Rye")
# print(recommended_books)

In [381]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [['The Weight of Water', 0.7708583474159241], ['The Surgeon', 0.7699410915374756], ['I Know This Much Is True', 0.7677075266838074], ['The Lovely Bones: A Novel', 0.7234864234924316]]]
You passed the challenge! 🎉🎉🎉🎉🎉
