In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
import weaviate
import weaviate.classes as wvc

In [2]:
def check_class_exists(client, class_name):
    """Report whether a class (collection) named ``class_name`` exists.

    Supports both generations of the Weaviate Python client:

    * clients exposing ``client.schema`` (the v3 API): the schema is fetched
      with ``client.schema.get()`` and its ``'classes'`` list is scanned for
      an entry whose ``'class'`` value equals ``class_name`` exactly;
    * clients without a ``schema`` attribute (such as the v4 client created
      later in this notebook): the lookup is delegated to
      ``client.collections.exists(class_name)``.

    Args:
        client: A connected Weaviate client object.
        class_name: Name of the class/collection to look for.

    Returns:
        bool: True if the class/collection exists, False otherwise.
    """
    legacy_schema = getattr(client, "schema", None)
    if legacy_schema is not None:
        # A schema without a 'classes' entry is treated as an empty schema
        # instead of raising KeyError.
        classes = legacy_schema.get().get("classes") or []
        return any(cls.get("class") == class_name for cls in classes)

    # No schema API available: use the collections API instead.
    return bool(client.collections.exists(class_name))

In [3]:
# Load settings into the environment (python-dotenv). The cells below read
# CSVFILE, WPORT, GRPCPORT, OPENAI_APIKEY, COLLNAME and K_VECTORS from it.
load_dotenv()

True

Load the source data about books, contained in a CSV file, into a list of python dictionaries, with some cleaning of the comments field

In [4]:
# Location of the books CSV export, taken from the CSVFILE environment variable
csv_file_path = os.getenv("CSVFILE")
if not csv_file_path:
    # Fail early with an actionable message instead of letting pandas
    # complain about a None path.
    raise ValueError(
        "Environment variable CSVFILE is not set; it must point to the books CSV file."
    )

columns_to_read = ['authors', 'comments', 'publisher', 'tags', 'title']
# Read only the specified columns from the CSV file
df = pd.read_csv(csv_file_path, usecols=columns_to_read)
# Clean the 'comments' column: replace embedded newlines with spaces
df['comments'] = df['comments'].str.replace('\n', ' ', regex=False)
# Convert the DataFrame into a list of dictionaries. Empty CSV cells are read
# by pandas as float NaN, which does not match the TEXT data type used for
# every property of the collection, so they are turned into None here.
books_dict = [
    {field: (None if pd.isna(value) else value) for field, value in record.items()}
    for record in df.to_dict(orient='records')
]

Create the Weaviate client (v4) with the OpenAI API key, connect and reply timeouts

In [5]:
# Ports of the local Weaviate instance are supplied through the environment
wport = os.getenv("WPORT")
grpcport = os.getenv("GRPCPORT")

# Open the v4 client. The OpenAI key travels as a request header; the
# timeout pair is (connect, reply) according to the notebook text above.
wclient = weaviate.connect_to_local(
    port=int(wport),
    grpc_port=int(grpcport),
    headers={"X-OpenAI-Api-Key": os.environ["OPENAI_APIKEY"]},
    timeout=(2, 5),
)

Define the MyBook schema and create the corresponding collection

In [6]:
# Name of the collection, taken from the COLLNAME environment variable
collname = os.getenv("COLLNAME")

# All book fields are stored as TEXT, so the property list is generated from
# the field names rather than written out one entry at a time.
text_fields = ("authors", "comments", "publisher", "tags", "title")

collection = wclient.collections.create(
    name=collname,
    vectorizer_config=wvc.Configure.Vectorizer.text2vec_openai(),
    generative_config=wvc.Configure.Generative.openai(),
    properties=[
        wvc.Property(name=field, data_type=wvc.DataType.TEXT)
        for field in text_fields
    ],
)

Get the collection created above and batch-insert the book records (this takes around 30 seconds)

In [7]:
# Get a handle to the collection whose name is stored in `collname`
books = wclient.collections.get(collname)
# Insert every record from `books_dict`. The call is the last expression of
# the cell, so its return value (a BatchObjectReturn) is displayed below.
books.data.insert_many(books_dict)  # This uses batching under the hood

BatchObjectReturn(all_responses=[UUID('3e71f65f-da6f-416d-9720-b277de8fe500'), UUID('dd366212-5f65-4c82-a0b1-3ccb621e7fa2'), UUID('063aca44-0dd7-4561-9957-8c36a2951ac9'), UUID('cbe9df85-6ed7-4539-8cf9-f2fe154f706d'), UUID('5be30a2f-3f36-4031-b8d9-2cf325800df6'), UUID('d5b87d19-89e5-42c1-97e8-8f5c1c2b7791'), UUID('0bd24d49-d3a9-4edd-9d72-485ccd80182f'), UUID('31adf5bb-485e-4621-aab9-5487b5d154fe'), UUID('1430f951-ff1d-4f58-8ebc-ef0d63344560'), UUID('126ac14a-c07e-4c38-a30d-0c102b49b1f9'), UUID('d1ac95b9-1773-4282-ad88-b05ad58b49e9'), UUID('cdc2238c-332a-46d9-b5d9-46ee6867623d'), UUID('22279af1-50ad-48f9-9340-256570d33824'), UUID('8e11517b-1d75-4b62-a6c2-dce365e112d8'), UUID('aa8ee84d-a0b1-4e60-b356-56621faa96db'), UUID('9501d48f-f049-4bea-8b03-b8ba194cbe0e'), UUID('d4e854b1-f87e-400c-9e20-664704437093'), UUID('82d8b137-c91c-46ba-8d6e-45f3663ddf61'), UUID('b27c9a58-194c-4a1b-8410-eed3cd6e54a1'), UUID('c018b7d0-5a74-4f8a-b484-1bb33f1a916c'), UUID('3470a875-379b-421f-bf45-0a7d08bb2770'), U

In [8]:
# Maximum number of results requested by each search below, read from the
# K_VECTORS environment variable.
# NOTE(review): os.getenv returns None when K_VECTORS is unset, in which case
# int() raises TypeError.
limit = int(os.getenv("K_VECTORS"))

# Now do some searches

### First a semantic search

In [9]:
# Free-text topic used for the near-text (semantic) query
TOPIC = "exploration and traveling"
response = books.query.near_text(query=TOPIC, limit=limit)

# Show the stored properties of every returned object
for obj in response.objects:
    print(obj.properties)

{'title': 'The Age of Exploration: From Christopher Columbus to Ferdinand Magellan', 'tags': 'Juvenile Nonfiction, Reference, General, Adventure & Adventurers, history, Exploration & Discovery', 'publisher': 'Britanncia Educational Publishing', 'authors': 'Britannica Educational Publishing', 'comments': 'The Age of Exploration, which spanned roughly from 1400 to 1550, was the first time in history that European powers—eyeing new trade routes to the East or seeking to establish empires—began actively looking far past their own borders to gain a better understanding of the world and its many resources. The individuals who set out on behalf of the countries they represented came from a variety of backgrounds, and included master navigators such as Christopher Columbus and Ferdinand Magellan—the latter of whom was the first to circle the globe—as well as the often ruthless conquistadors of the New World such as Francisco Pizarro and Hernan Cortes. The exciting and sometimes tragic lives an

### Semantic search with a Boolean filter. 
This uses the `equal` filter on the `tags` property. The filter is case-insensitive.

In [10]:
# Value the `tags` property has to match
FILTER = "crime"
tag_filter = wvc.Filter(path="tags").equal(FILTER)

# Semantic query restricted to the objects that pass the tag filter
response = books.query.near_text(query="biology", limit=limit, filters=tag_filter)

# Show the stored properties of every returned object
for obj in response.objects:
    print(obj.properties)

{'title': 'The Scientist and the Spy: A True Story of China, the FBI, and Industrial Espionage', 'tags': 'Business & Economics, True Crime, Espionage, Globalization, Political Science, Intelligence & Espionage', 'publisher': 'Penguin', 'authors': 'Mara Hvistendahl', 'comments': 'A riveting true story of industrial espionage in which a Chinese-born scientist is pursued by the U.S. government for trying to steal trade secrets, by a finalist for the Pulitzer Prize in nonfiction. \xa0 In September 2011, sheriff’s deputies in Iowa encountered three ethnic Chinese men near a field where a farmer was growing corn seed under contract with Monsanto. What began as a simple trespassing inquiry mushroomed into a two-year FBI operation in which investigators bugged the men’s rental cars, used a warrant intended for foreign terrorists and spies, and flew surveillance planes over corn country—all in the name of protecting trade secrets of corporate giants Monsanto and DuPont Pioneer. In The Scientist

### Generative search (single prompt)

In [11]:
# The prompt is applied to each result separately; {title} is filled in from
# the object's `title` property.
response = books.generate.near_text(
    query="The history of human evolution",
    single_prompt="Explain {title} in a short paragraph of maximum thirty words.",
    limit=limit,
)

# Print each title, a separator line and the text generated for that object
separator = "-" * 30
for o in response.objects:
    print(o.properties["title"])
    print(separator)
    print(o.generated)
    print("\n")

Sapiens: A Brief History of Humankind
------------------------------
"Sapiens" is a concise exploration of human history, from the emergence of Homo sapiens to the present, examining our species' impact on the world and the forces that shaped our societies.


The Invisible History of the Human Race: How DNA and History Shape Our Identities and Our Futures
------------------------------
The Invisible History of the Human Race explores how our DNA and historical events intertwine to shape our identities and influence our future.


The Evolution of Everything: How New Ideas Emerge
------------------------------
"The Evolution of Everything" explores how complex systems, from language to technology, evolve through the interactions of countless individuals, challenging the notion of top-down control and highlighting the power of spontaneous order.


A Troublesome Inheritance: Genes, Race and Human History
------------------------------
A Troublesome Inheritance explores the controversial to

### Grouped task search
Grouped task search returns a single generated response that covers all of the query results. By default, it uses all object properties in the prompt.
Use `grouped_properties` to choose which object properties go into the prompt; this limits the information sent and reduces the prompt length.

In [19]:
# One prompt that is answered once for the whole result set
task = "What do these books have in common, if anything?"

response = books.generate.near_text(
    query="The universe and it's history",
    limit=limit,
    grouped_task=task,
    # Only these properties are passed to the prompt. The names must match
    # the collection schema, where the description field is "comments"
    # (plural), not "comment".
    grouped_properties=["title", "comments"]
)

# print the generated response
print(response.generated)

These books have the common theme of exploring and providing knowledge about the universe, astrophysics, and related topics.
