In [None]:
import pandas as pd

# Load the book table. Later cells read these columns from it: description,
# average_rating, ratings_count, num_pages, published_year, title_and_subtitle,
# emotion, refined_category, thumbnail and authors.
data = pd.read_csv("../data/books_refined.csv")

In [2]:
# Three parallel lists, one entry per row of `data`, in row order:
#   documents - the description text that gets embedded
#   ids       - synthetic string ids "id1", "id2", ... (1-based)
#   metadatas - the per-book attributes stored next to each embedding
documents = list(data["description"])
ids = [f"id{position}" for position, _ in enumerate(documents, start=1)]

metadatas = [
    {
        "average_rating": book.average_rating,
        # The count-like columns are cast to plain ints before storage.
        "ratings_count": int(book.ratings_count),
        "num_pages": int(book.num_pages),
        "published_year": int(book.published_year),
        "title": book.title_and_subtitle,
        "emotion": book.emotion,
        "category": book.refined_category,
        "url": book.thumbnail,
        "author": book.authors,
    }
    for book in data.itertuples()
]

In [3]:
# Spot-check the first metadata record to confirm its keys and value types.
metadatas[0]

{'average_rating': 3.85,
 'ratings_count': 361,
 'num_pages': 247,
 'published_year': 2004,
 'title': 'Gilead',
 'emotion': 'joy',
 'category': 'Fiction',
 'url': 'http://books.google.com/books/content?id=KQZCPgAACAAJ&printsec=frontcover&img=1&zoom=1&source=gbs_api',
 'author': 'Marilynne Robinson'}

In [2]:
import chromadb

# Open the on-disk Chroma store under ./database (relative to the notebook's
# working directory) and fetch the "books" collection, creating it on first use.
client = chromadb.PersistentClient(path="./database")

# Collection-level metadata; "hnsw:batch_size" is passed through to Chroma as an
# index setting (NOTE(review): confirm the intended effect for this Chroma version).
collection_metadata = {"hnsw:batch_size": 10000}

collection = client.get_or_create_collection(
    metadata=collection_metadata,
    name="books",
)

In [None]:
import ollama
from tqdm import tqdm

# Embed the book descriptions in batches and store them in the collection.
#
# The cell is safe to re-run and resumable: each batch first asks the
# collection which of its ids are already stored and only embeds/adds the
# missing ones. On an empty collection this behaves exactly like a plain
# embed-and-add loop; on a populated ./database it avoids re-embedding every
# description just to hand Chroma ids it already has.
batch_size = 500

for start in tqdm(range(0, len(documents), batch_size), desc="Processing Batches"):

    batch_slice = slice(start, start + batch_size)
    candidate_ids = ids[batch_slice]

    # include=[] fetches ids only (no documents or metadata) to keep this check cheap.
    already_stored = set(collection.get(ids=candidate_ids, include=[])["ids"])

    pending = [
        (doc_id, document, metadata)
        for doc_id, document, metadata in zip(
            candidate_ids, documents[batch_slice], metadatas[batch_slice]
        )
        if doc_id not in already_stored
    ]
    if not pending:
        # Whole batch is already in the collection; skip the embedding call.
        continue

    batch_ids, batch_documents, batch_metadatas = map(list, zip(*pending))

    batch_embeddings = ollama.embed(
        model="nomic-embed-text",
        input=batch_documents,
    ).embeddings

    collection.add(
        documents=batch_documents,
        embeddings=batch_embeddings,
        metadatas=batch_metadatas,
        ids=batch_ids,
    )

In [6]:
# Embed the free-text query with the same model used for the stored
# descriptions so that query and documents share one vector space.
query = "A book to teach children about nature."
query_embeddings = ollama.embed(
    input=query,
    model="nomic-embed-text",
).embeddings

In [8]:
# Ask the collection for the 3 best matches to the query embedding.
# To restrict hits by metadata, add a filter such as where={"category": "Science"}.
results = collection.query(
    n_results=3,
    query_embeddings=query_embeddings,
)
results

{'ids': [['id3748', 'id3750', 'id4232']],
 'embeddings': None,
 'documents': [['Children will discover the exciting world of their own backyard in this introduction to familiar animals from cats and dogs to bugs and frogs. The combination of photographs, illustrations, and fun facts make this an accessible and delightful learning experience.',
   "Introduce your babies to birds, cats, dogs, and babies through fine art, illustration, and photographs. These books are a rare opportunity to expose little ones to a range of images on a single subject, from simple child's drawings and abstract art to playful photos. A brief text accompanies each image, introducing the baby to some basic -- and sometimes playful -- information about the subjects.",
   'This is a book about how we see: the environment around us (its surfaces, their layout, and their colors and textures); where we are in the environment; whether or not we are moving and, if we are, where we are going; what things are good for; 

In [17]:
from pydantic import BaseModel


class SearchResults(BaseModel):
    """One book returned by a search, flattened into display-friendly fields.

    Despite the plural name, each instance describes a single hit. Instances
    are built from one entry of a collection query result: the stored document
    text plus its metadata record, with some metadata keys renamed.
    """

    # Book title (metadata key "title").
    title: str
    # Description text of the book (the stored document).
    description: str
    # Thumbnail image URL (metadata key "url").
    url: str
    # Category label (metadata key "category").
    category: str
    # Emotion label of the description (metadata key "emotion").
    tone: str
    # Average rating (metadata key "average_rating").
    rating: float
    # Number of ratings (metadata key "ratings_count").
    rating_count: int
    # Publication year (metadata key "published_year").
    date: int
    # Page count (metadata key "num_pages").
    pages: int
    # Author string exactly as stored (metadata key "author").
    author: str

In [19]:
# Turn every hit of the (single) query into a SearchResults object.
#
# results["metadatas"][0] and results["documents"][0] are the parallel hit lists
# for the first query. Iterating over them directly, instead of a fixed
# range(3), keeps this cell correct when the query returns fewer hits (e.g.
# with a `where` filter) or when n_results is changed.
trial = [
    SearchResults(
        title=hit_metadata["title"],
        description=hit_document,
        url=hit_metadata["url"],
        category=hit_metadata["category"],
        tone=hit_metadata["emotion"],
        rating=hit_metadata["average_rating"],
        rating_count=hit_metadata["ratings_count"],
        date=hit_metadata["published_year"],
        pages=hit_metadata["num_pages"],
        author=hit_metadata["author"],
    )
    for hit_metadata, hit_document in zip(
        results["metadatas"][0], results["documents"][0]
    )
]
trial

[SearchResults(title='Baby Einstein: Neighborhood Animals', description='Children will discover the exciting world of their own backyard in this introduction to familiar animals from cats and dogs to bugs and frogs. The combination of photographs, illustrations, and fun facts make this an accessible and delightful learning experience.', url='http://books.google.com/books/content?id=X9a4PAAACAAJ&printsec=frontcover&img=1&zoom=1&source=gbs_api', category='Fiction', tone='joy', rating=3.89, rating_count=180, date=2001, pages=16, author='Marilyn Singer;Julie Aigner-Clark'),
 SearchResults(title='Baby Einstein: Babies', description="Introduce your babies to birds, cats, dogs, and babies through fine art, illustration, and photographs. These books are a rare opportunity to expose little ones to a range of images on a single subject, from simple child's drawings and abstract art to playful photos. A brief text accompanies each image, introducing the baby to some basic -- and sometimes playful