In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

import pandas as pd

In [None]:
# Load the book table from the CSV file in the working directory.
books = pd.read_csv(filepath_or_buffer="books_cleaned.csv")

In [None]:
# Display the 'tagged_description' column of the book table.
books['tagged_description']

In [None]:
# Write one tagged description per line so that every line can later be
# loaded as its own document.
# Plain file I/O is used rather than a CSV writer with a newline "delimiter":
# a newline is not a valid CSV delimiter (recent Python versions reject it),
# and CSV quoting would wrap/double any description containing a double
# quote, altering the text that gets embedded.
# Rows without a description are skipped: there is nothing to embed for them.
with open("tagged_description.txt", "w", encoding="utf-8") as out_file:
    out_file.writelines(
        f"{description}\n"
        for description in books['tagged_description'].dropna()
    )

In [None]:
# Read the file as UTF-8 explicitly. Without an explicit encoding the
# platform default is used, which can fail on (or garble) non-ASCII
# characters on systems whose default is not UTF-8.
raw_documents = TextLoader("tagged_description.txt", encoding="utf-8").load()

# Split on newlines with no overlap. chunk_size=1 is smaller than any real
# line, which presumably keeps the splitter from merging neighbouring lines,
# so each line should end up as its own document (the splitter may log
# "chunk longer than specified" warnings as a result) -- TODO confirm.
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_documents)

In [None]:
# Inspect the first chunk produced by the splitter.
documents[0]

In [None]:
# Identifier of the embedding model handed to HuggingFaceEmbeddings.
model_name = "all-MiniLM-L6-v2"

# Tell the user what is happening before the embedding wrapper is built.
print(f"Initializing embedding model: {model_name} (this may download the model on first run)...")
embeddings = HuggingFaceEmbeddings(model_name=model_name)

In [None]:
# Build the Chroma vector store from the split documents, using the
# embedding model configured above to vectorise each one.
vectorstore = Chroma.from_documents(documents=documents, embedding=embeddings)

In [None]:
# Sample query: ask the vector store for the 10 closest documents.
query = "A book to teach children about nature"
docs = vectorstore.similarity_search(query=query, k=10)

In [None]:
# Show the first document returned for the sample query.
docs[0]

In [None]:
def retrieve_semantic_recommendations(
    query: str,
    top_k: int = 10
) -> pd.DataFrame:
    """Return the rows of ``books`` that best match ``query``, best match first.

    The query is run against the module-level ``vectorstore``. The first
    whitespace-separated token of every returned document is parsed as an
    integer and matched against ``books['isbn13']``.

    Args:
        query: Free-text description of the kind of book wanted.
        top_k: Maximum number of documents requested from the vector store.

    Returns:
        The matching rows of ``books``, ordered by the position of their
        identifier in the search results (most similar first). Empty when
        nothing matches.

    Raises:
        ValueError: If a returned document does not start with an integer.
    """
    recs = vectorstore.similarity_search(query, k=top_k)

    # Map each identifier to the rank of its first appearance in the results.
    rank_by_isbn = {}
    for rec in recs:
        # Lines may carry surrounding double quotes (e.g. from CSV quoting).
        tokens = rec.page_content.strip().strip('"').split()
        if not tokens:
            continue  # blank chunk: no identifier to look up
        rank_by_isbn.setdefault(int(tokens[0]), len(rank_by_isbn))

    matches = books[books['isbn13'].isin(list(rank_by_isbn))]
    if matches.empty:
        return matches

    # A boolean mask alone would return rows in catalogue order; reorder them
    # positionally so the result follows the similarity ranking. The stable
    # sort keeps catalogue order among rows sharing the same identifier.
    order = matches['isbn13'].map(rank_by_isbn).to_numpy().argsort(kind="stable")
    return matches.iloc[order]

In [None]:
# Run the recommender on a sample query, asking for up to 10 matches.
results = retrieve_semantic_recommendations(
    query="A book to teach children about nature",
    top_k=10,
)

In [None]:
# Number of rows returned for the sample query.
len(results)

In [None]:
# Display the recommended rows.
results