In [24]:
"""
Cell 1: Import Libraries
------------------------
This cell imports the necessary Python libraries for:
- Data manipulation (pandas, numpy)
- Sparse matrix operations (scipy)
- KNN model for recommendations (sklearn)
- Optional plotting (matplotlib)
"""

import numpy as np              # For numerical operations
import pandas as pd             # For dataframes and data manipulation
from scipy.sparse import csr_matrix  # For efficient sparse matrix representation
from sklearn.neighbors import NearestNeighbors  # KNN algorithm for finding similar books
import matplotlib.pyplot as plt  # For optional data visualization

In [25]:
"""
Cell 2: Download and Extract Book-Crossings Dataset
---------------------------------------------------
This cell downloads the Book-Crossings dataset from FreeCodeCamp's CDN
and extracts the CSV files for books and ratings.
"""

# Download the zip file containing the dataset
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

# Unzip the dataset to extract CSV files
!unzip -o book-crossings.zip  # -o to overwrite if files already exist

# Define filenames for convenience
books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2025-10-13 17:02:56--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 172.67.70.149, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip.1’


2025-10-13 17:02:57 (137 MB/s) - ‘book-crossings.zip.1’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [26]:
"""
Cell 3: Load CSV Files into Pandas DataFrames
--------------------------------------------
Purpose:
- Read the books and ratings data into pandas DataFrames for manipulation.
- Select only the columns we need: ISBN, title, author, user, rating.
- Set appropriate data types to save memory and improve performance.
- Handle encoding issues since book titles and authors may contain special characters.
"""

# Load the books dataset
df_books = pd.read_csv(
    books_filename,
    encoding="ISO-8859-1",       # Ensures special characters (accents, symbols) are read correctly
    sep=";",                     # File uses semicolons instead of commas
    header=0,                     # First row contains column names
    names=['isbn', 'title', 'author'],  # Assign explicit column names
    usecols=['isbn', 'title', 'author'], # Only load needed columns to save memory
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'}  # Use string types for consistency
)

# Display first few rows to verify the data
df_books.head()

# Load the ratings dataset
df_ratings = pd.read_csv(
    ratings_filename,
    encoding="ISO-8859-1",       # Handles special characters in titles/ratings
    sep=";",                     # Semicolon-separated values
    header=0,                     # First row contains column names
    names=['user', 'isbn', 'rating'],  # Assign explicit column names
    usecols=['user', 'isbn', 'rating'], # Only load necessary columns
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'} # Optimize memory usage
)

# Display first few rows to verify the data
df_ratings.head()

# Notes:
# - df_books contains metadata: ISBN, title, author
# - df_ratings contains users' ratings for each book
# - These two DataFrames will be merged later for filtering and pivot table creation

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0


In [27]:
"""
Cell 4: Filter Users and Books
------------------------------
Purpose:
- Remove users who have rated very few books and books with very few ratings.
- This reduces noise and ensures statistical significance in the KNN model.
- We adjust thresholds to include less popular books like
  'The Queen of the Damned (Vampire Chronicles (Paperback))'.
"""

# Step 1: Count ratings per user
user_rating_counts = df_ratings.groupby('user')['rating'].count()
# user_rating_counts now contains the number of ratings each user has given

# Step 2: Keep only users with at least 50 ratings
filtered_users = user_rating_counts[user_rating_counts >= 50].index

# Step 3: Count ratings per book
book_rating_counts = df_ratings.groupby('isbn')['rating'].count()
# book_rating_counts now contains the number of ratings each book has received

# Step 4: Keep only books with at least 20 ratings
filtered_books = book_rating_counts[book_rating_counts >= 20].index

# Step 5: Filter the ratings DataFrame to include only selected users and books
df_filtered = df_ratings[
    df_ratings['user'].isin(filtered_users) &
    df_ratings['isbn'].isin(filtered_books)
]

# Step 6: Merge filtered ratings with book metadata to include titles and authors
df_filtered = df_filtered.merge(df_books, on='isbn')

# Step 7: Display dataset info and sample rows
print(f"Filtered dataset: {df_filtered.shape[0]} ratings, {len(filtered_users)} users, {len(filtered_books)} books")
df_filtered.head()

# Notes:
# - df_filtered now contains only active users and popular enough books
# - This filtered dataset will be used to create the ratings matrix (pivot table)
# - Thresholds (50 ratings per user, 20 ratings per book) are chosen to balance data size and inclusivity


Filtered dataset: 243136 ratings, 3427 users, 7490 books


Unnamed: 0,user,isbn,rating,title,author
0,276847,0446364193,0.0,Along Came a Spider (Alex Cross Novels),James Patterson
1,276847,3379015180,0.0,Schlafes Bruder,Robert Schneider
2,276847,3551551677,10.0,Harry Potter und der Stein der Weisen,Joanne K. Rowling
3,276925,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
4,276925,0060520507,0.0,"Sushi for Beginners : A Novel (Keyes, Marian)",Marian Keyes


In [28]:
"""
Cell 5: Create Ratings Matrix (Pivot Table)
-------------------------------------------
Purpose:
- Transform the filtered ratings DataFrame into a matrix suitable for KNN.
- Rows = users, Columns = book titles, Values = ratings.
- Fill missing ratings with 0 (indicating the user hasn't rated that book).
"""

# Create pivot table from filtered dataset
ratings_matrix = df_filtered.pivot_table(
    index='user',       # Each row represents a user
    columns='title',    # Each column represents a book title
    values='rating',    # Cell values are ratings given by the user
    fill_value=0        # Fill missing ratings with 0
)

# Display shape of the ratings matrix
print(f"Ratings matrix shape: {ratings_matrix.shape}")

# Display first few rows for verification
ratings_matrix.head()

# Notes:
# - ratings_matrix is ready for KNN model training
# - Later, we will transpose it so that books are rows for similarity comparisons
# - Users with no ratings or books with very few ratings were already filtered in Cell 4


Ratings matrix shape: (3375, 6485)


title,'Salem's Lot,01-01-00: The Novel of the Millennium,10 Lb. Penalty,"14,000 Things to Be Happy About",16 Lighthouse Road,1916: A Novel of the Irish Rebellion (Irish Century),1984,1st to Die: A Novel,"20,000 Leagues Under the Sea (Wordsworth Collection)",2001: A Space Odyssey,...,Zoya,ZwÃ?Â¶lf.,"\ Lamb to the Slaughter and Other Stories (Penguin 60s S.)""","\A\"" is for Alibi : A Kinsey Millhone Mystery (A Kinsey Millhone Mystery)""","\O\"" Is for Outlaw""","\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",e,iI Paradiso Degli Orchi,one hundred years of solitude,stardust
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
254,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
"""
Cell 6: Train KNN Model
-----------------------
Purpose:
- Train a K-Nearest Neighbors (KNN) model to find books similar to a given book.
- Steps:
  1. Transpose the ratings matrix so that books are rows and users are columns.
  2. Convert the matrix to a sparse format to save memory.
  3. Fit the KNN model using cosine distance for similarity measurement.
"""

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Step 1: Transpose the ratings matrix so rows = books, columns = users
ratings_matrix_T = ratings_matrix.T

# Step 2: Convert the transposed matrix to a sparse matrix for efficiency
ratings_sparse_matrix_T = csr_matrix(ratings_matrix_T.values)

# Step 3: Initialize the KNN model
knn_model = NearestNeighbors(
    metric='cosine',   # Cosine distance measures similarity of ratings patterns
    algorithm='brute', # Brute-force is efficient for this dataset size
    n_neighbors=6      # 1 neighbor = the book itself + 5 recommendations
)

# Step 4: Fit the KNN model on the transposed sparse matrix
knn_model.fit(ratings_sparse_matrix_T)

print("KNN model training complete (books as rows).")


KNN model training complete (books as rows).


In [30]:
"""
Cell 7: Define get_recommends() Function
----------------------------------------
Purpose:
- Given a book title, return a list of 5 similar books along with their
  cosine distances.
- Uses the trained KNN model on the transposed ratings matrix (books as rows).
- Handles cases where the book is not found in the dataset.
"""

def get_recommends(book_title=""):
    # Step 1: Check if the book exists in the ratings matrix
    if book_title not in ratings_matrix_T.index:
        return f"Book '{book_title}' not found in the dataset."

    # Step 2: Find the index of the book in the transposed ratings matrix
    book_idx = ratings_matrix_T.index.get_loc(book_title)

    # Step 3: Query the KNN model for nearest neighbors (distances + indices)
    distances, indices = knn_model.kneighbors(
        ratings_matrix_T.iloc[book_idx, :].values.reshape(1, -1)
    )

    # Step 4: Build list of recommended books
    recommended_books = []
    # Skip the first neighbor (the book itself) and take next 5 closest books
    for i in range(1, 6):
        neighbor_idx = indices.flatten()[i]
        neighbor_title = ratings_matrix_T.index[neighbor_idx]
        distance = distances.flatten()[i]
        recommended_books.append([neighbor_title, distance])

    # Step 5: Return the input book and its recommendations
    return [book_title, recommended_books]

# Example usage:
# get_recommends("The Queen of the Damned (Vampire Chronicles (Paperback))")


In [31]:
"""
Cell 8: Test get_recommends() Function
--------------------------------------
Purpose:
- Test the recommendation function on a sample book from the dataset.
- Display the recommended books along with their cosine distances.
- Handles cases where the book is not found.
"""

# Step 1: Select a book to test (must exist in the filtered dataset)
test_book = "The Queen of the Damned (Vampire Chronicles (Paperback))"

# Step 2: Get recommendations for the selected book
recommendations = get_recommends(test_book)

# Step 3: Check if the book exists in the dataset
if isinstance(recommendations, str):
    # If not found, display error message
    print(recommendations)
else:
    # Display the results in a readable format
    print(f"Recommendations for '{recommendations[0]}':")
    for title, distance in recommendations[1]:
        print(f"- {title} (distance: {distance:.4f})")

# Notes:
# - The first element in 'recommendations' is the input book title
# - The second element is a list of 5 recommended books with distances
# - Smaller distance values indicate higher similarity

Recommendations for 'The Queen of the Damned (Vampire Chronicles (Paperback))':
- The Tale of the Body Thief (Vampire Chronicles (Paperback)) (distance: 0.4745)
- The Vampire Lestat (Vampire Chronicles, Book II) (distance: 0.5228)
- Memnoch the Devil (Vampire Chronicles, No 5) (distance: 0.6525)
- Interview with the Vampire (distance: 0.6709)
- Lasher: Lives of the Mayfair Witches (Lives of the Mayfair Witches) (distance: 0.7881)


In [32]:
"""
Cell 9: Automated Test for get_recommends()
-------------------------------------------
Purpose:
- Verify that the get_recommends() function works correctly.
- Checks:
  1. The input book title is correctly returned.
  2. Five recommendations are returned.
  3. Distances are valid (between 0 and 1 for cosine distance).
- Handles books that may not exist in the dataset.
"""

def test_book_recommendation():
    test_pass = True  # Flag to track if all tests pass

    # Step 1: Choose a book that exists in the filtered dataset
    test_book = "The Queen of the Damned (Vampire Chronicles (Paperback))"

    # Step 2: Get recommendations
    recommends = get_recommends(test_book)

    # Step 3: Handle case where book is not found
    if isinstance(recommends, str):
        print(recommends)
        return

    # Step 4: Check if the returned book matches the input
    if recommends[0] != test_book:
        test_pass = False
        print(f"Error: Returned book '{recommends[0]}' does not match input '{test_book}'.")

    # Step 5: Check if 5 recommendations are returned
    if len(recommends[1]) != 5:
        test_pass = False
        print(f"Error: Expected 5 recommendations, got {len(recommends[1])}.")

    # Step 6: Check if distances are within valid range (0 to 1)
    for title, distance in recommends[1]:
        if not (0 <= distance <= 1):
            test_pass = False
            print(f"Error: Distance for '{title}' is out of range: {distance}")

    # Step 7: Print final test result
    if test_pass:
        print("You passed the challenge! 🎉🎉🎉🎉🎉")
    else:
        print("You haven't passed yet. Keep trying!")

# Run the automated test
test_book_recommendation()

You passed the challenge! 🎉🎉🎉🎉🎉
