In [None]:
import os
# Set the token as an environment variable
os.environ.get("HUGGINGFACEHUB_API_TOKEN")

In [158]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

import pandas as pd


In [159]:
#Load Dataset
books = pd.read_csv("books_cleaned.csv")
books["isbn13"] = books["isbn13"].astype(str)

In [160]:
books["isbn13"].dtype

dtype('O')

In [161]:
documents = []
for index, row in books.iterrows():
    if row["tagged_description"]:
        doc = Document(
            page_content=str(row["tagged_description"]), 
            metadata={
                # Store ISBN and other useful fields in metadata for easy and reliable retrieval
                "isbn13": str(row["isbn13"]),
                "title": str(row["title"]),
            }
        )
        documents.append(doc)

In [162]:
# Load text file
#raw_documents = TextLoader("tagged_description.txt", encoding="utf-8").load()
#text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap = 0)
#documents = text_splitter.split_documents(raw_documents)

In [163]:
documents[0]

Document(metadata={'isbn13': '9780002005883', 'title': 'Gilead'}, page_content='9780002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. Itâ€™s 1956 in Gilead, Iowa, towards the end of the Reverend Amesâ€™s life, and he is absorbed in recording his familyâ€™s story, a legacy for the young son he will never see grow up. Haunted by his grandfatherâ€™s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friendâ€™s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Amesâ€™s joyous, rambling voice that finds beauty, humour and truth in the smallest

In [164]:
print(f"Created {len(documents)} documents for embedding.")

Created 5197 documents for embedding.


In [165]:
# Local embeddings
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


In [150]:
# Create Chroma vector DB
print("Creating/Loading Chroma vector store...")
persist_directory = "./chroma_books_fixed"
db_books = Chroma.from_documents(documents, embedding=embeddings, persist_directory=persist_directory)
db_books.persist()
print(f"Chroma DB created and persisted to {persist_directory}")

python-dotenv could not parse statement starting at line 2


Creating/Loading Chroma vector store...
Chroma DB created and persisted to ./chroma_books_fixed


In [151]:
query = "A novel exploring nihilism and the meaning of life"
results = db_books.similarity_search(
    query,
    k=3
)

# --- 4. Retrieval Logic (Uses Metadata) ---
print("\n" + "="*50)
print(f"ðŸŽ¯ Query: **{query}**")
print("="*50)


ðŸŽ¯ Query: **A novel exploring nihilism and the meaning of life**


In [152]:
for i, doc in enumerate(results):
    # Retrieve the crucial information from the metadata
    isbn = doc.metadata.get("isbn13", "N/A")
    title = doc.metadata.get("title", "N/A")
    
    print(f"## Result {i+1}")
    print(f"  **Title:** {title}")
    print(f"  **ISBN13:** {isbn}")
    print(doc.page_content[:400])
    print("-" * 50)

## Result 1
  **Title:** The Will to Power
  **ISBN13:** 9780394704371
9780394704371 Offers a selection from the author's notebooks, chosen by his sister, that reveals his views on nihilism, art, morality, religion, the theory of knowledge, and other subjects
--------------------------------------------------
## Result 2
  **Title:** The Will to Power
  **ISBN13:** 9780394704371
9780394704371 Offers a selection from the author's notebooks, chosen by his sister, that reveals his views on nihilism, art, morality, religion, the theory of knowledge, and other subjects
--------------------------------------------------
## Result 3
  **Title:** The Will to Power
  **ISBN13:** 9780394704371
9780394704371 Offers a selection from the author's notebooks, chosen by his sister, that reveals his views on nihilism, art, morality, religion, the theory of knowledge, and other subjects
--------------------------------------------------


In [153]:
top_isbn = results[0].metadata.get("isbn13")

In [154]:
print(f"\nVerifying top result with ISBN: {top_isbn}")
books[books["isbn13"] == top_isbn]


Verifying top result with ISBN: 9780394704371


Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
1957,9780394704371,394704371,The Will to Power,Friedrich Wilhelm Nietzsche;Walter Arnold Kauf...,Philosophy,http://books.google.com/books/content?id=jRa6g...,Offers a selection from the author's notebooks...,1968.0,4.05,575.0,7692.0,The Will to Power,9780394704371 Offers a selection from the auth...


In [155]:
def retrieve_semantic_recommendation(query: str, top_k: int = 10):
    """
    Performs semantic search and returns results as a DataFrame, extracting 
    the ISBN from the page_content (as shown in the image logic).

    Args:
        query (str): The search query.
        top_k (int): The number of top results to return.

    Returns:
        pandas.DataFrame: A DataFrame containing the recommended books.
    """
    print(f"\n--- Searching for: '{query}' ---")
    
    # Use a large k for the search to ensure a good pool of results
    recs = db_books.similarity_search(query, k=50) 
    
    books_list = []
    
    # Loop through the search results (Document objects)
    for doc in recs:
        # Extract the ISBN from the START of the page_content string
        # We strip quotes, split by space, take the first item, and keep it as a STRING.
        try:
            isbn = doc.page_content.strip('"').split()[0].strip('"')
            books_list.append(isbn)
        except IndexError:
            # Skip if the description is somehow empty or malformed
            continue
            
    # Filter the original DataFrame using the list of ISBNs
    # .isin() finds all rows where 'isbn13' matches an ISBN in books_list
    recommended_df = books[books["isbn13"].isin(books_list)]
    
    # Sort the results to reflect the search order by mapping the ISBNs
    # This step is added to the image's logic to maintain result order
    isbn_to_rank = {isbn: i + 1 for i, isbn in enumerate(books_list)}
    recommended_df['rank'] = recommended_df['isbn13'].map(isbn_to_rank)
    recommended_df = recommended_df.sort_values(by='rank').head(top_k).reset_index(drop=True)

    return recommended_df[['rank', 'title', 'authors', 'average_rating', 'isbn13']]

In [156]:
retrieve_semantic_recommendation("A book to teach children about nature")


--- Searching for: 'A book to teach children about nature' ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommended_df['rank'] = recommended_df['isbn13'].map(isbn_to_rank)


Unnamed: 0,rank,title,authors,average_rating,isbn13
0,4,Baby Einstein: Neighborhood Animals,Marilyn Singer;Julie Aigner-Clark,3.89,9780786808069
1,8,Baby Einstein: Babies,Julie Aigner-Clark,4.03,9780786808380
2,12,Baby Einstein: Dogs,Julie Aigner-Clark,3.81,9780786808397
3,16,Prodigal Summer,Barbara Kingsolver,4.0,9780060959036
4,20,Baby Einstein: Birds,Julie Aigner-Clark,3.78,9780786808373
5,24,The Control of Nature,John McPhee,4.24,9780374522599
6,28,"Baby Einstein: Water, Water Everywhere","Disney Book Group,",3.7,9780786819119
7,32,Racso and the Rats of NIMH,Jane Leslie Conly,3.76,9780064402453
8,36,"R-T, Margaret, and the Rats of NIMH",Jane Leslie Conly,3.52,9780064403870
9,40,The 10 Commandments of Parenting,H. Edwin Young,4.0,9780802431486
