In [None]:
import os

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma


In [None]:
from dotenv import load_dotenv

# Read key/value pairs from a .env file into the process environment.
load_dotenv()

In [None]:
import pandas as pd

# Load the cleaned books table; later cells read its "tagged_description"
# and "isbn13" columns.
books = pd.read_csv('books_cleaned.csv')

In [None]:
# Display the loaded books DataFrame as the cell output.
books

In [None]:
# Display the column that is exported to a text file further down.
books["tagged_description"]

In [None]:
# Filtering: each tagged description begins with the book's ISBN as an identifier, so when the recommendations come back we can split the ISBN off the description and use it to filter the books table.

In [None]:
# LangChain's TextLoader does not work with a pandas DataFrame, so the first thing we do is save the tagged descriptions (and only the tagged descriptions) to a text file.

In [None]:
# Export only the tagged descriptions to a plain-text file: one description per
# line, with no index and no header.
#
# The lines are written directly instead of through DataFrame.to_csv(): CSV
# quoting wraps any description containing a comma or a double quote in double
# quotes and doubles the embedded quotes, and a line break inside a description
# would spread a single book over several lines.
tagged_descriptions = books["tagged_description"].fillna("").astype(str)

with open("tagged_description.txt", "w", encoding="utf-8") as description_file:
    for description in tagged_descriptions:
        # Collapse embedded line breaks so every book stays on exactly one line.
        description_file.write(" ".join(description.splitlines()) + "\n")


In [None]:
# Step 1: Load the text file (explicit UTF-8 so the result does not depend on
# the platform's default encoding).
loader = TextLoader("tagged_description.txt", encoding="utf-8")
raw_documents = loader.load()

# Step 2: Build exactly one Document per line (one book description each).
#
# CharacterTextSplitter first splits on `separator` and then MERGES neighbouring
# pieces back together until `chunk_size` is reached. Calling split_documents()
# on the whole file with a large chunk_size therefore packs many descriptions
# into one chunk, and only the first ISBN of each chunk can be recovered later.
# Splitting the file into lines ourselves and passing each line as its own text
# keeps one description per Document. chunk_size=0 is avoided because, per the
# original note in this notebook, it is not reliable across LangChain versions.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=10000,  # Larger than any single line, so a line is never cut
    chunk_overlap=0
)

line_texts = []
line_metadatas = []
for raw_document in raw_documents:
    for line in raw_document.page_content.split("\n"):
        if line.strip():  # skip blank lines
            line_texts.append(line)
            line_metadatas.append(raw_document.metadata)

documents = text_splitter.create_documents(line_texts, metadatas=line_metadatas)

# Sanity check: no lines were merged or dropped.
assert len(documents) == len(line_texts), "expected one Document per description line"



In [None]:
# Inspect the first document produced by the splitting step.
documents[0]

In [None]:
# Build the vector store `db_books` with Chroma. The documents produced by the
# splitting step are embedded with a HuggingFace sentence-embedding model that
# runs locally (no OpenAI embeddings are involved).
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

persist_directory = "./chroma_db"

# Chroma.from_documents() ADDS documents to whatever is already stored in
# persist_directory, so re-running this cell would insert every description a
# second time and similarity searches would start returning duplicates.
# Reuse the persisted store when it exists; delete ./chroma_db to force a
# rebuild after the descriptions change.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    db_books = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings,
    )
else:
    db_books = Chroma.from_documents(
        documents,
        embedding=embeddings,
        persist_directory=persist_directory  # this is crucial
    )
    # NOTE(review): newer Chroma releases persist automatically and deprecate
    # this call -- confirm against the installed version before removing it.
    db_books.persist()  # this saves the database to disk



In [None]:
# Example query: retrieve the stored descriptions most similar to the query text.
query="A book to teach children about nature"
docs = db_books.similarity_search(query,k=5)  # k = number of results wanted (top 5 similar docs)
docs


In [None]:
# The first whitespace-separated token of the top hit is treated as the ISBN;
# any surrounding double-quote / colon characters are stripped before it is
# converted to an integer.
isbn_str = docs[0].page_content.split()[0].strip('":')  # Remove quotes and colon
isbn = int(isbn_str)
# Select the row(s) of the books table whose isbn13 equals that ISBN.
filtered_books = books[books["isbn13"] == isbn]
filtered_books

In [None]:
# Bundle this functionality into a function so we can run it for any query; it returns all of the recommended books.
def get_all_recommendations(docs, books_df):
    """Return the rows of ``books_df`` for the books referenced by ``docs``.

    The first whitespace-separated token of each document's ``page_content``
    is taken as the ISBN, after stripping any surrounding ``"`` and ``:``
    characters. Documents whose ISBN cannot be extracted or parsed are
    reported with ``print`` and skipped.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string, such as
            the list returned by ``similarity_search``.
        books_df: DataFrame with an ``isbn13`` column.

    Returns:
        The matching rows of ``books_df`` (original columns and index labels
        are kept), ordered like ``docs`` so the best match comes first.
    """
    # Any integer dtype (int64, int32, nullable Int64, ...) needs integer keys;
    # comparing the dtype against the literal "int64" misses the others.
    isbn_is_integer = pd.api.types.is_integer_dtype(books_df["isbn13"])

    isbns = []
    for doc in docs:
        try:
            isbn = doc.page_content.split()[0].strip('":')
            if isbn_is_integer:
                isbn = int(isbn)
        except (AttributeError, IndexError, TypeError, ValueError) as e:
            print("Skipped one doc due to error:", e)
            continue
        isbns.append(isbn)

    # Position of the first occurrence of each ISBN in the ranked results.
    rank_by_isbn = {}
    for position, isbn in enumerate(isbns):
        rank_by_isbn.setdefault(isbn, position)

    matches = books_df[books_df["isbn13"].isin(isbns)]

    # isin() returns rows in table order; re-sort them by result rank. The sort
    # is stable, so rows sharing an ISBN keep their relative table order.
    return matches.sort_values(
        by="isbn13",
        key=lambda column: column.map(rank_by_isbn),
        kind="mergesort",
    )


In [None]:
# Map the hits of the previous query to rows of the books table and display them.
recommended_books = get_all_recommendations(docs, books)
recommended_books

In [None]:
# Second example: run another query through the same search + lookup steps.
query = "crime and punishment"
docs = db_books.similarity_search(query, k=5)  # get the top 5 similar documents
recommended_books = get_all_recommendations(docs, books)
recommended_books.head()  # shows at most the first 5 rows
