<a href="https://colab.research.google.com/github/rlancaster243/Machine-Learning-FreeCodeCamp/blob/main/fcc_book_recommendation_knn1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Get data files (Step get data files - from notebook)
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

# 1: Load & Explore Data
# Both files are semicolon-separated ISO-8859-1 text. header=0 together with
# `names` replaces the header row found in the file with the short column
# names used throughout the rest of the notebook.
df_books = pd.read_csv(
    books_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=["isbn", "title", "author"],
    usecols=["isbn", "title", "author"],
    dtype=dict(isbn="str", title="str", author="str"),
)

# Ratings: one row per (user, isbn) pair with a numeric rating.
df_ratings = pd.read_csv(
    ratings_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=["user", "isbn", "rating"],
    usecols=["user", "isbn", "rating"],
    dtype=dict(user="int32", isbn="str", rating="float32"),
)

# Row/column counts straight after loading.
print("Shape of df_books (initial):", df_books.shape)
print("Shape of df_ratings (initial):", df_ratings.shape)

# Column dtypes and non-null counts for the ratings frame.
print("\n--- df_ratings INFO immediately after loading ---")
df_ratings.info()

# First few rating rows as a quick sanity check.
print("\n--- df_ratings HEAD immediately after loading ---")
print(df_ratings.head())

# Row count going into the filtering stage, so the effect of each step below
# can be read off the printed shapes.
print("\nShape of df_ratings BEFORE rating > 0 filter:", df_ratings.shape)


# 2: Remove sparse users/books
# Step 1: discard rows whose rating is zero or negative.
df_ratings = df_ratings.loc[df_ratings['rating'].gt(0)]
print("Shape of df_ratings after removing zero or negative ratings:", df_ratings.shape)

# Step 2: count the remaining ratings per user and keep users with >= 50.
user_rating_counts = df_ratings.groupby('user')['rating'].count()
print("Shape of user_rating_counts:", user_rating_counts.shape)
print("Example of user_rating_counts:\n", user_rating_counts.head())
df_ratings = df_ratings.loc[
    df_ratings['user'].isin(user_rating_counts.loc[user_rating_counts.ge(50)].index)
]
print("Shape of df_ratings after user filtering:", df_ratings.shape)

# Step 3: count ratings per book among the retained users and keep books
# with >= 50. The counts are taken after the user filter above.
book_rating_counts = df_ratings.groupby('isbn')['rating'].count()
print("Shape of book_rating_counts:", book_rating_counts.shape)
print("Example of book_rating_counts:\n", book_rating_counts.head())
df_ratings = df_ratings.loc[
    df_ratings['isbn'].isin(book_rating_counts.loc[book_rating_counts.ge(50)].index)
]
print("Shape of df_ratings after book filtering:", df_ratings.shape)

# 3: Create utility matrix and train KNN
# Attach title/author to every remaining rating. The left join keeps rating
# rows even when their ISBN has no match in df_books.
df_merged = df_ratings.merge(df_books, on='isbn', how='left')
print("Shape of df_merged:", df_merged.shape)

# One row per title, one column per user, cell = rating; (title, user) pairs
# with no rating are filled with 0.
book_user_matrix = pd.pivot_table(
    df_merged,
    values='rating',
    index='title',
    columns='user',
).fillna(0)
print("Shape of book_user_matrix:", book_user_matrix.shape)

# Sparse copy of the same matrix, used for fitting and for queries.
book_user_sparse = csr_matrix(book_user_matrix)
print("Shape of book_user_sparse:", book_user_sparse.shape)

# Brute-force nearest-neighbour search over title rows using cosine distance.
model_knn = NearestNeighbors(algorithm='brute', metric='cosine').fit(book_user_sparse)

# 4: Define get_recommends function
def get_recommends(book_title, num_recommends=5):
    """Return the titles closest to *book_title* under the fitted KNN model.

    Relies on the module-level ``book_user_matrix`` (title-indexed rating
    matrix), ``book_user_sparse`` (its sparse form) and ``model_knn`` (the
    NearestNeighbors model fitted on it).

    Args:
        book_title: Title to look up in ``book_user_matrix.index``.
        num_recommends: Maximum number of recommendations to return.

    Returns:
        ``[book_title, [[title, distance], ...]]`` with neighbours in the
        order returned by ``model_knn.kneighbors``; or
        ``[book_title, "not found in the utility matrix"]`` when the title is
        not a row of the matrix.
    """
    if book_title not in book_user_matrix.index:
        return [book_title, "not found in the utility matrix"]

    idx = book_user_matrix.index.get_loc(book_title)

    # Request one extra neighbour because the query row is normally returned
    # as its own nearest neighbour, but never ask for more rows than the
    # model was fitted on (kneighbors raises ValueError in that case).
    n_neighbors = min(num_recommends + 1, book_user_sparse.shape[0])
    distances, indices = model_knn.kneighbors(
        book_user_sparse[idx].reshape(1, -1),
        n_neighbors=n_neighbors,
    )

    # Drop the query book by row index rather than by position: with tied
    # distances (e.g. identical rating vectors) the query is not guaranteed
    # to come first, and skipping position 0 could recommend the book itself.
    recommended_books = [
        [book_user_matrix.index[neighbor_idx], distance]
        for distance, neighbor_idx in zip(distances[0], indices[0])
        if neighbor_idx != idx
    ]
    # If the query was not among the results, trim the surplus neighbour.
    return [book_title, recommended_books[:num_recommends]]

# 5: Test the function
# Smoke test: look up one title and print whatever get_recommends returns
# (either the neighbour list or the "not found" message).
test_book = "The Queen of the Damned (Vampire Chronicles (Paperback))"
recommendations = get_recommends(book_title=test_book)
print(recommendations)

--2025-02-28 21:36:33--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 172.67.70.149, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2025-02-28 21:36:33 (67.0 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            
Shape of df_books (initial): (271379, 3)
Shape of df_ratings (initial): (1149780, 3)

--- df_ratings INFO immediately after loading ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   user    1149780 non-null  int32  
 1   