In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

from dotenv import load_dotenv

import pandas as pd

In [None]:
# Load environment variables via python-dotenv's load_dotenv() before the later cells run.
load_dotenv()

In [None]:
# Read book_cleaned.csv (relative path) into the `books` DataFrame used by the rest of the notebook.
books = pd.read_csv("book_cleaned.csv")

In [None]:
# Display the loaded DataFrame as the cell output for a quick visual check.
books

In [None]:
# Display the "tagged_description" column; this is the text that is exported and embedded below.
books["tagged_description"]

In [None]:
# Export the "tagged_description" column to a plain-text file with no index
# column and no header row, so the next cell can load it as text
# (intended layout: one description per line).
# NOTE(review): a newline `sep` may be rejected by newer Python `csv` modules —
# confirm on the target Python version.
books["tagged_description"].to_csv(
    "tagged_description.txt",
    header=False,
    index=False,
    sep="\n",
)

In [None]:
# Load the exported descriptions as text. The encoding is passed explicitly
# because pandas writes the file as UTF-8 by default, whereas TextLoader would
# otherwise fall back to the platform's default encoding (not UTF-8 everywhere).
raw_documents = TextLoader("tagged_description.txt", encoding="utf-8").load()

# Split on newlines so every non-empty line becomes its own document.
# chunk_size=1 still prevents any two lines from being merged (every line plus
# the separator already exceeds it), exactly like the previous chunk_size=0,
# but stays valid on langchain-text-splitters releases that reject
# non-positive chunk sizes. Expect "chunk longer than specified" warnings.
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_documents)

In [None]:
# Show the type of the object returned by TextLoader(...).load().
type(raw_documents)

In [None]:
# Inspect the first document produced by the text splitter.
documents[0]

In [None]:
# TODO: create an Embedding class for this model and test it out.
# from sentence_transformers import SentenceTransformer

# model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)

In [None]:
# Embedding model handed to Chroma below; "all-MiniLM-L6-v2" is the model name passed to HuggingFaceEmbeddings.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# Build the Chroma vector store from the split documents, embedding them with `embeddings`.
db_books = Chroma.from_documents(documents=documents, embedding=embeddings)

In [None]:
# Sanity-check the index: request ten documents for a sample query and show
# them as the cell output.
query = "A book to teach children about nature"
docs = db_books.similarity_search(query=query, k=10)
docs

In [None]:
# Look up the catalogue row for the first search hit. The leading token of the
# document text is parsed as the ISBN-13 with the same steps that
# retrieve_semantic_recommendation uses (strip wrapping double quotes, take the
# first whitespace-separated token, drop a trailing colon), so a line that was
# quoted on export no longer breaks the int() conversion.
books[
    books["isbn13"]
    == int(docs[0].page_content.strip('"').split()[0].strip(":"))
]

In [None]:
def retrieve_semantic_recommendation(
    query: str,
    top_k: int = 10,
) -> pd.DataFrame:
    """Return the catalogue rows for the documents most similar to ``query``.

    Runs a similarity search against the notebook-level ``db_books`` vector
    store, parses the leading ISBN token from each hit's text, and looks those
    ISBNs up in the notebook-level ``books`` DataFrame.

    Args:
        query: Free-text description of the kind of book wanted.
        top_k: Number of documents to request from the vector store.

    Returns:
        The rows of ``books`` whose ``isbn13`` matches a retrieved document.
        Rows are ordered by the position of their ISBN in the search results
        (first hit first) instead of by their position in ``books``; columns
        and index labels are unchanged.

    Raises:
        ValueError: If the first token of a hit's text is not an integer.
    """
    recs = db_books.similarity_search(query, k=top_k)

    # Each document is expected to start with the ISBN: drop wrapping double
    # quotes, take the first whitespace-separated token, drop a trailing colon.
    ranked_isbns = [
        int(rec.page_content.strip('"').split()[0].strip(":")) for rec in recs
    ]

    # Position of each ISBN's first appearance in the search results.
    rank_by_isbn = {}
    for position, isbn in enumerate(ranked_isbns):
        rank_by_isbn.setdefault(isbn, position)

    matches = books[books["isbn13"].isin(ranked_isbns)]

    # isin() yields rows in catalogue order, which discards the search ranking;
    # a stable sort on the hit position restores it while keeping catalogue
    # order among rows that share an ISBN.
    return matches.sort_values(
        by="isbn13",
        key=lambda isbn_col: isbn_col.map(rank_by_isbn),
        kind="stable",
    )

In [None]:
# Example call using the default top_k of 10; the returned DataFrame is shown as the cell output.
retrieve_semantic_recommendation("A book for children about nature.")