In [1]:
import os
import json
from IPython.display import JSON

from fastembed import TextEmbedding

import weaviate
from weaviate.classes.data import DataObject

from helper import suppress_output

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
COLLECTION_NAME = "Books"  # capitalize the first letter of collection names
BOOK_DESCRIPTION_FOLDER = "include/data"
EMBEDDING_MODEL_NAME = "BAAI/bge-small-en-v1.5"

### Instantiate Embedded Weaviate client

In [3]:
with suppress_output():
    client = weaviate.connect_to_embedded(
        persistence_data_path= "tmp/weaviate",
    )
print("Started new embedded Weaviate instance.")
print(f"Client is ready: {client.is_ready()}")

Started new embedded Weaviate instance.
Client is ready: True


### Create the collection

In [4]:
existing_collections = client.collections.list_all()
existing_collection_names = existing_collections.keys()

if COLLECTION_NAME not in existing_collection_names:
    print(f"Collection {COLLECTION_NAME} does not exist yet. Creating it...")
    collection = client.collections.create(name=COLLECTION_NAME)
    print(f"Collection {COLLECTION_NAME} created successfully.")
else:
    print(f"Collection {COLLECTION_NAME} already exists. No action taken.")
    collection = client.collections.get(COLLECTION_NAME)

Collection Books already exists. No action taken.


### Extract text from local files

In [5]:
# list the book description files
book_description_files = [
    f for f in os.listdir(BOOK_DESCRIPTION_FOLDER)
    if f.endswith('.txt')
]

print(f"The following files with book descriptions were found: {book_description_files}")

The following files with book descriptions were found: ['book_descriptions_2.txt', 'book_descriptions_1.txt']


In [6]:
book_description_files = [
    f for f in os.listdir(BOOK_DESCRIPTION_FOLDER)
    if f.endswith('.txt')
]

list_of_book_data = []

for book_description_file in book_description_files:
    with open(
        os.path.join(BOOK_DESCRIPTION_FOLDER, book_description_file), "r"
    ) as f:
        book_descriptions = f.readlines()
    
    titles = [
        book_description.split(":::")[1].strip()
        for book_description in book_descriptions
    ]
    authors = [
        book_description.split(":::")[2].strip()
        for book_description in book_descriptions
    ]
    book_description_text = [
        book_description.split(":::")[3].strip()
        for book_description in book_descriptions
    ]
    
    book_descriptions = [
        {
            "title": title,
            "author": author,
            "description": description,
        }
        for title, author, description in zip(
            titles, authors, book_description_text
        )
    ]

    list_of_book_data.append(book_descriptions)

In [7]:
JSON(json.dumps(list_of_book_data))



<IPython.core.display.JSON object>

### Create vector embeddings from descriptions

In [8]:
embedding_model = TextEmbedding(EMBEDDING_MODEL_NAME)  

list_of_description_embeddings = []

for book_data in list_of_book_data:
    book_descriptions = [book["description"] for book in book_data]
    description_embeddings = [
        list(embedding_model.embed([desc]))[0] for desc in book_descriptions
    ]

    list_of_description_embeddings.append(description_embeddings)

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 14.35it/s]


### Load embeddings to Weaviate

In [9]:
for book_data_list, emb_list in zip(list_of_book_data, list_of_description_embeddings):
    items = []
    
    for book_data, emb in zip(book_data_list, emb_list):
        item = DataObject(
            properties={
                "title": book_data["title"],
                "author": book_data["author"],
                "description": book_data["description"],
            },
            vector=emb
        )
        items.append(item)
    
    collection.data.insert_many(items)

### Query for a book recommendation using semantic search

In [10]:
query_str = "A philosophical book"

embedding_model = TextEmbedding(EMBEDDING_MODEL_NAME)  
collection = client.collections.get(COLLECTION_NAME)

query_emb = list(embedding_model.embed([query_str]))[0]

results = collection.query.near_vector(
    near_vector=query_emb,
    limit=1,
)
for result in results.objects:
    print(f"You should read: {result.properties['title']} by {result.properties['author']}")
    print("Description:")
    print(result.properties["description"])

You should read: The Idea of the World (2019) by Bernardo Kastrup
Description:
A rigorous case for the primacy of mind in nature, from philosophy to neuroscience, psychology and physics. The Idea of the World offers a grounded alternative to the frenzy of unrestrained abstractions and unexamined assumptions in philosophy and science today. This book examines what can be learned about the nature of reality based on conceptual parsimony, straightforward logic and empirical evidence from fields as diverse as physics and neuroscience. It compiles an overarching case for idealism - the notion that reality is essentially mental - from ten original articles the author has previously published in leading academic journals. The case begins with an exposition of the logical fallacies and internal contradictions of the reigning physicalist ontology and its popular alternatives, such as bottom-up panpsychism. It then advances a compelling formulation of idealism that elegantly makes sense of - and

### Cleanup Resources

In [11]:
## Remove a collection from an existing Weaviate instance

client.collections.delete(COLLECTION_NAME)

In [12]:
## Delete a Weaviate instance
## This cell can take a few seconds to run  

import shutil

client.close()

EMBEDDED_WEAVIATE_PERSISTENCE_PATH = "tmp/weaviate"

if os.path.exists(EMBEDDED_WEAVIATE_PERSISTENCE_PATH):
    shutil.rmtree(EMBEDDED_WEAVIATE_PERSISTENCE_PATH)
    if not os.path.exists(EMBEDDED_WEAVIATE_PERSISTENCE_PATH):
        print(f"Verified: '{EMBEDDED_WEAVIATE_PERSISTENCE_PATH}' no longer exists.")
        print(f"Weaviate embedded data at '{EMBEDDED_WEAVIATE_PERSISTENCE_PATH}' deleted.")

Verified: 'tmp/weaviate' no longer exists.
Weaviate embedded data at 'tmp/weaviate' deleted.
