In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [None]:
# Load the processed catalogue and ratings tables.
# ISBN is parsed as text: letting pandas infer the dtype can turn all-numeric
# ISBNs into integers and silently drop their leading zeros, which corrupts
# the key used to join ratings to books.
books_df = pd.read_csv("../data/processed/books_final.csv", dtype={"ISBN": str})
ratings_df = pd.read_csv("../data/processed/ratings_final.csv", dtype={"ISBN": str})

# Normalise the join key identically in both frames (string type, no
# surrounding whitespace) so lookups between them match exactly.
books_df['ISBN'] = books_df['ISBN'].astype(str).str.strip()
ratings_df['ISBN'] = ratings_df['ISBN'].astype(str).str.strip()

In [3]:
# Build the single text field that the vectoriser consumes: title, author and
# publisher joined by one space, with missing values treated as empty strings.
books_df['text_features'] = books_df['Title'].fillna('').str.cat(
    [books_df['Author'].fillna(''), books_df['Publisher'].fillna('')],
    sep=' ',
)

In [4]:
# Vectorise the combined metadata text.
#   - English stop words are removed,
#   - unigrams and bigrams are both used as terms,
#   - the vocabulary is capped at 5000 terms.
tfidf = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_features=5000,
)
# One row per book in books_df (same row order), one column per term.
tfidf_matrix = tfidf.fit_transform(books_df['text_features'])

In [None]:
# Pairwise item-item similarity: the dot product of every TF-IDF row with
# every other row (omitting the second argument compares the matrix with itself).
# NOTE: the result is a dense (n_books x n_books) array, so memory use grows
# quadratically with the size of the catalogue.
cosine_sim = linear_kernel(tfidf_matrix)

In [6]:
# ISBN -> row position lookup.
# The values are consumed positionally (rows of tfidf_matrix / cosine_sim and
# books_df.iloc), so store explicit positions rather than index labels; the two
# only coincide while books_df carries a default RangeIndex.
indices = pd.Series(np.arange(len(books_df)), index=books_df['ISBN'])
# Keep the first row for any repeated ISBN so a lookup always yields one scalar.
indices = indices[~indices.index.duplicated(keep='first')]

In [7]:
def get_similar_books(isbn, books_df, cosine_sim=cosine_sim, top_n=10, book_indices=None):
    """Return the books most similar to the one identified by ``isbn``.

    Parameters
    ----------
    isbn : str
        ISBN of the query book.
    books_df : pandas.DataFrame
        Catalogue with ``Title``, ``Author`` and ``Publisher`` columns; rows are
        selected from it by position.
    cosine_sim : array-like
        Square item-item similarity matrix. The default is bound to the
        notebook-level ``cosine_sim`` at the time this cell is run.
    top_n : int, default 10
        Maximum number of similar books to return.
    book_indices : mapping or pandas.Series, optional
        ISBN -> row position lookup. Falls back to the notebook-level
        ``indices`` when omitted.

    Returns
    -------
    pandas.DataFrame
        ``Title``/``Author``/``Publisher`` of up to ``top_n`` books, most
        similar first. Empty (same columns) when ``isbn`` is unknown.
    """
    columns = ['Title', 'Author', 'Publisher']
    lookup = indices if book_indices is None else book_indices

    idx = lookup.get(isbn)
    if idx is None:
        # Same return type as the success path, so callers can treat both alike.
        return books_df.iloc[0:0][columns]

    # A lookup with repeated ISBNs yields several positions; use the first.
    idx = int(np.atleast_1d(idx)[0])

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable sort on the negated scores: descending, ties kept in row order.
    ranked = np.argsort(-scores, kind="stable")
    # Drop the query book by position in the catalogue, not by assuming it is
    # ranked first (another row with an identical score may precede it).
    ranked = ranked[ranked != idx][:max(top_n, 0)]

    return books_df.iloc[ranked][columns]

In [8]:
# Spot-check the item-item recommender on one catalogue entry.
# A fixed random_state keeps the chosen book, and therefore the displayed
# output, the same on every Restart & Run All.
random_isbn = books_df['ISBN'].sample(1, random_state=42).iloc[0]
print("Selected book:", books_df.loc[books_df['ISBN'] == random_isbn, 'Title'].iloc[0])

similar_books = get_similar_books(random_isbn, books_df)
display(similar_books)

Selected book: James and the Giant Peach


Unnamed: 0,Title,Author,Publisher
179,James and the Giant Peach,Roald Dahl,Penguin USA (Paper)
4995,"Ah, Sweet Mystery of Life: The Country Stories...",Roald Dahl,Penguin Books
9725,The Twits,Roald Dahl,Penguin USA
1604,Tales of the Unexpected,Roald Dahl,Penguin Putnam~trade
6431,My Uncle Oswald,Roald Dahl,Penguin Books
4076,Switch Bitch,Roald Dahl,Penguin Books
2409,Relatos de Los Inesperado,Roald Dahl,Anagrama
14226,The B. F. G.: Dahl (Puffin Books),Roald Dahl,Penguin USA
11699,"\"" Lamb to the Slaughter and Other Stories (Pe...",Roald Dahl,Penguin Books Ltd
3370,Twits,Roald Dahl,Scholastic Inc


In [12]:
def build_user_profile(user_id, ratings_df, tfidf_matrix, book_indices, min_rating=7):
    """Average the TF-IDF vectors of the books a user rated highly.

    Parameters
    ----------
    user_id
        Value to match against the ``User-ID`` column of ``ratings_df``.
    ratings_df : pandas.DataFrame
        Ratings with ``User-ID``, ``ISBN`` and ``Rating`` columns.
    tfidf_matrix : sparse matrix or ndarray
        Item feature matrix; rows are addressed by the positions stored in
        ``book_indices``.
    book_indices : mapping or pandas.Series
        ISBN -> row position in ``tfidf_matrix``.
    min_rating : int or float, default 7
        A rating at or above this value counts as a "liked" book.

    Returns
    -------
    numpy.ndarray or None
        Array of shape ``(1, n_features)`` holding the mean feature vector, or
        ``None`` when the user has no liked book present in ``book_indices``.
    """
    liked_mask = (ratings_df['User-ID'] == user_id) & (ratings_df['Rating'] >= min_rating)
    liked_isbns = ratings_df.loc[liked_mask, 'ISBN']

    liked_rows = []
    for isbn in liked_isbns:
        if isbn in book_indices:
            # atleast_1d flattens both a single position and the several
            # positions returned when an ISBN occurs more than once in the lookup.
            liked_rows.extend(np.atleast_1d(book_indices[isbn]))

    if not liked_rows:
        return None

    user_vector = tfidf_matrix[liked_rows].mean(axis=0)
    # Always hand back a 2-D row vector, whatever matrix type was supplied.
    return np.asarray(user_vector).reshape(1, -1)


In [13]:
def recommend_books_for_user(user_id, ratings_df, books_df, tfidf_matrix, book_indices, top_n=10):
    """Recommend books the user has not rated, ranked by profile similarity.

    Parameters
    ----------
    user_id
        Value to match against the ``User-ID`` column of ``ratings_df``.
    ratings_df : pandas.DataFrame
        Ratings with ``User-ID``, ``ISBN`` and ``Rating`` columns.
    books_df : pandas.DataFrame
        Catalogue with ``ISBN``, ``Title``, ``Author`` and ``Publisher``
        columns, row-aligned with ``tfidf_matrix``. It is not modified.
    tfidf_matrix : sparse matrix or ndarray
        Item feature matrix, one row per row of ``books_df``.
    book_indices : mapping or pandas.Series
        ISBN -> row position lookup, passed through to ``build_user_profile``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title``, ``Author``, ``Publisher`` and ``similarity_score``,
        highest score first. Empty (same columns) when no profile can be built
        for the user.
    """
    output_columns = ['Title', 'Author', 'Publisher', 'similarity_score']

    user_profile = build_user_profile(user_id, ratings_df, tfidf_matrix, book_indices)
    if user_profile is None:
        # Return an empty frame rather than a list so callers can rely on
        # DataFrame attributes such as ``.empty``.
        return pd.DataFrame(columns=output_columns)

    # Dot product of the profile with every book vector. The reshape guards
    # against a 1-D profile, which linear_kernel would reject.
    profile_row = np.asarray(user_profile).reshape(1, -1)
    scores = linear_kernel(profile_row, tfidf_matrix).ravel()

    # Attach scores to a copy so the caller's catalogue is not mutated.
    scored_books = books_df.assign(similarity_score=scores)

    # Exclude every book the user has rated, whatever the rating.
    seen_isbns = ratings_df.loc[ratings_df['User-ID'] == user_id, 'ISBN']
    unseen_books = scored_books[~scored_books['ISBN'].isin(seen_isbns)]

    ranked = unseen_books.sort_values(by="similarity_score", ascending=False)
    return ranked[output_columns].head(top_n)

In [51]:
# Spot-check the user-profile recommender on one user.
# A fixed random_state keeps the chosen user, and therefore the displayed
# output, the same on every Restart & Run All.
target_user = ratings_df['User-ID'].sample(1, random_state=42).iloc[0]
print("Target User:", target_user)

user_recs = recommend_books_for_user(target_user, ratings_df, books_df, tfidf_matrix, indices)

# len() is valid for both an empty list and an empty DataFrame, so this check
# cannot fail when no profile could be built for the user.
if len(user_recs) == 0:
    print("No recommendations could be generated for this user.")
else:
    display(user_recs)

Target User: 37950


Unnamed: 0,Title,Author,Publisher,similarity_score
12930,It (R),Stephen King,Signet Book,0.100619
1741,Desperation,Stephen King,Signet Book,0.100619
12608,Firestarter (R),Stephen King,Signet Book,0.100619
8097,It,Stephen King,Signet Book,0.100619
2657,The Regulators,Stephen King,Signet Book,0.100619
2158,Firestarter,Stephen King,Signet Book,0.100619
3303,Rose Madder,Stephen King,Signet Book,0.090547
10846,Christine,Stephen King,Signet Book,0.0888
8431,The Shining,Stephen King,Signet Book,0.088748
6152,The Shining,Stephen King,Signet Book,0.088748


## Content-Based Filtering – Final Summary and Evaluation

This notebook implemented a content-based filtering (CBF) system using metadata from the Book-Crossing dataset. The aim was to address limitations identified during collaborative filtering, particularly related to cold-start users and the sparsity of user–item interactions.

### Key Implementation Steps:
- Combined book features (`Title`, `Author`, `Publisher`) into a single text field.
- Applied `TfidfVectorizer` to extract numerical representations from the text.
- Used cosine similarity (computed as a linear kernel, i.e. a dot product, over the L2-normalised TF-IDF vectors) to calculate item–item similarity between books.
- Built user profiles by averaging the TF-IDF vectors of books they rated ≥7.
- Recommended unseen books most similar to the user's profile vector.

### Observed Strengths:
- Eased the cold-start problem for users with limited historical data: a single book rated ≥7 is enough to build a profile. Users with no such rating still receive no recommendations.
- Provided transparent and interpretable recommendation logic based on content similarity.
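- Computationally efficient for medium-sized datasets; note that the dense item–item similarity matrix grows quadratically with the number of books, which limits scalability to larger catalogues.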

### Limitations:
- Repeated recommendations of the same book (or its variants) due to data duplication (e.g., multiple ISBNs for the same title).
- Lack of content richness: the dataset did not include genres, descriptions, or user-generated tags.
- Overemphasis on lexical similarity: books with similar wording in their title, author, or publisher were ranked higher, even when the results were effectively redundant (e.g. different editions of the same work).
- Limited diversity in recommendations: the model often recommended similar books from the same author or series.

### Comparative Context:
Compared to collaborative filtering:
- CBF is superior for new items and for users with very few ratings (cold-start situations), since it needs only item metadata and a handful of liked books.
- CF produces more personalized and diverse recommendations when enough data is available.
- A hybrid system combining both approaches would balance cold-start robustness and long-term personalization.

### Next Steps:
In the next phase, a hybrid recommendation system will be developed by combining the content-based model with the collaborative filtering model implemented earlier. This will aim to optimize both personalization and coverage across different user profiles.
