In [30]:
import os
from dotenv import load_dotenv
import pandas as pd
import weaviate
import weaviate.classes as wvc

In [31]:
def check_class_exists(client, class_name):
    """Report whether the schema returned by ``client`` contains ``class_name``.

    Args:
        client: Object exposing ``schema.get()``; the returned mapping must have a
            ``'classes'`` entry holding dicts that each carry a ``'class'`` name.
        class_name: Class name to look for (compared with ``==``, so the match
            is exact).

    Returns:
        True if any entry under ``'classes'`` has a ``'class'`` value equal to
        ``class_name``, otherwise False.
    """
    known_classes = client.schema.get()['classes']
    return any(entry['class'] == class_name for entry in known_classes)

In [32]:
# Load settings from a .env file into the process environment; the cells below
# read their configuration through os.getenv / os.environ.
load_dotenv()

True

Load the source data about books from a CSV file into a list of Python dictionaries, replacing newline characters in the `comments` field with spaces.

In [33]:
# Location of the books CSV file, configured through the CSVFILE environment variable.
csv_file_path = os.getenv("CSVFILE")
if not csv_file_path:
    # Fail with an actionable message instead of pandas' generic invalid-path error.
    raise RuntimeError("CSVFILE environment variable is not set; point it at the books CSV file")

columns_to_read = ['authors', 'comments', 'publisher', 'tags', 'title']
# Read only the specified columns from the CSV file. Everything is read as text
# (dtype=str) because all five columns are stored as text properties later on.
df = pd.read_csv(csv_file_path, usecols=columns_to_read, dtype=str)
# Empty cells are parsed as NaN (a float). Replace them with empty strings so the
# records contain only strings and the .str accessor below always works, even
# when a column has no values at all.
df = df.fillna('')
# Clean the 'comments' column: replace '\n' characters with spaces
df['comments'] = df['comments'].str.replace('\n', ' ', regex=False)
# Convert the DataFrame into a list of dictionaries (one dict per book)
books_dict = df.to_dict(orient='records')

Create the Weaviate client (v4), passing the OpenAI API key and the connect and read timeouts.

In [34]:
# connect to client V4
# Ports are configured through the environment. When a variable is unset or empty,
# fall back to the ports connect_to_local itself defaults to (8080 for HTTP,
# 50051 for gRPC) instead of crashing in int() with an unhelpful error.
wport = os.getenv("WPORT") or "8080"
grpcport = os.getenv("GRPCPORT") or "50051"
wclient = weaviate.connect_to_local(
    port=int(wport),
    grpc_port=int(grpcport),
    # The OpenAI key is mandatory for this notebook, so os.environ[...] is used
    # deliberately: a missing key raises KeyError right here.
    headers={"X-OpenAI-Api-Key": os.environ["OPENAI_APIKEY"]},
    timeout=(2, 5),  # connect and read timeouts
)

Define the MyBook schema and create the corresponding collection

In [35]:
# Name of the collection, configured through the COLLNAME environment variable.
collname = os.getenv("COLLNAME")

# Only create and populate the collection the first time; later runs reuse it.
is_new_collection = not wclient.collections.exists(collname)
if is_new_collection:  # if the schema/collection is missing create it
    collection = wclient.collections.create(
        name=collname,
        vectorizer_config=wvc.Configure.Vectorizer.text2vec_openai(),
        generative_config=wvc.Configure.Generative.openai(),
        # Every book field is stored as a TEXT property.
        properties=[
            wvc.Property(name=property_name, data_type=wvc.DataType.TEXT)
            for property_name in ("authors", "comments", "publisher", "tags", "title")
        ],
    )

# Fetch the collection handle unconditionally: the search cells below use `books`,
# and it must be defined even when the collection already existed.
books = wclient.collections.get(collname)

if is_new_collection:
    # Import the books only into a freshly created collection, so re-running the
    # notebook does not insert duplicates.
    uuids = books.data.insert_many(books_dict)  # This uses batching under the hood

# Now do some searches

In [37]:
# Maximum number of objects requested by each search below, read from the
# K_VECTORS environment variable. int() raises TypeError if the variable is unset.
limit = int(os.getenv("K_VECTORS"))

### First a semantic search

In [38]:
TOPIC = "exploration and traveling"
# Semantic (near-text) search: fetch up to `limit` books for the topic.
response = books.query.near_text(query=TOPIC, limit=limit)

# Print every property of each hit, framed by a row of asterisks above and below.
banner = '*' * 50
for hit in response.objects:
    print(banner)
    for key, value in hit.properties.items():
        print(f'{key}: {value}')
    print(banner)

**************************************************
title: The Age of Exploration: From Christopher Columbus to Ferdinand Magellan
tags: Juvenile Nonfiction, Reference, General, Adventure & Adventurers, history, Exploration & Discovery
publisher: Britanncia Educational Publishing
authors: Britannica Educational Publishing
comments: The Age of Exploration, which spanned roughly from 1400 to 1550, was the first time in history that European powers—eyeing new trade routes to the East or seeking to establish empires—began actively looking far past their own borders to gain a better understanding of the world and its many resources. The individuals who set out on behalf of the countries they represented came from a variety of backgrounds, and included master navigators such as Christopher Columbus and Ferdinand Magellan—the latter of whom was the first to circle the globe—as well as the often ruthless conquistadors of the New World such as Francisco Pizarro and Hernan Cortes. The exciting an

### Semantic search with a Boolean filter. 
This uses the `equal` filter on the `tags` property. The filter is case-insensitive.

In [39]:
FILTER = "crime"
# Restrict the semantic search to books whose `tags` property matches FILTER.
tag_filter = wvc.Filter(path="tags").equal(FILTER)
response = books.query.near_text(query="biology", limit=limit, filters=tag_filter)

# Print every property of each hit, framed by a row of asterisks above and below.
banner = '*' * 50
for hit in response.objects:
    print(banner)
    for key, value in hit.properties.items():
        print(f'{key}: {value}')
    print(banner)

**************************************************
title: The Scientist and the Spy: A True Story of China, the FBI, and Industrial Espionage
tags: Business & Economics, True Crime, Espionage, Globalization, Political Science, Intelligence & Espionage
publisher: Penguin
authors: Mara Hvistendahl
comments: A riveting true story of industrial espionage in which a Chinese-born scientist is pursued by the U.S. government for trying to steal trade secrets, by a finalist for the Pulitzer Prize in nonfiction.   In September 2011, sheriff’s deputies in Iowa encountered three ethnic Chinese men near a field where a farmer was growing corn seed under contract with Monsanto. What began as a simple trespassing inquiry mushroomed into a two-year FBI operation in which investigators bugged the men’s rental cars, used a warrant intended for foreign terrorists and spies, and flew surveillance planes over corn country—all in the name of protecting trade secrets of corporate giants Monsanto and DuPont P

### Generative search (single prompt)

In [40]:
# Generative search: the single prompt is applied to each retrieved book
# separately, with {title} filled in from that book's `title` property.
book_prompt = "Explain {title} in a short paragraph of maximum thirty words."
response = books.generate.near_text(
    query="The history of human evolution",
    limit=limit,
    single_prompt=book_prompt,
)

# For each hit, show the title, a dashed rule, the generated text and a blank gap.
rule = "-"*30
for book in response.objects:
    print(book.properties["title"])
    print(rule)
    print(book.generated)
    print("\n")

Sapiens: A Brief History of Humankind
------------------------------
"Sapiens" is a concise exploration of human history, from the emergence of Homo sapiens to the present, examining our species' impact on the world and the forces that shaped our societies.


Sapiens: A Brief History of Humankind
------------------------------
"Sapiens" is a concise exploration of human history, from the emergence of Homo sapiens to the present, examining our species' impact on the world and the forces that shaped our societies.


The Invisible History of the Human Race: How DNA and History Shape Our Identities and Our Futures
------------------------------
The Invisible History of the Human Race explores how our DNA and historical events intertwine to shape our identities and influence our future.


The Invisible History of the Human Race: How DNA and History Shape Our Identities and Our Futures
------------------------------
The Invisible History of the Human Race explores how our DNA and historical 

### Grouped task search
Grouped task search returns a single generated response that covers all of the query results. By default, grouped task search uses all object properties in the prompt.
Use `grouped_properties` to choose which object properties are included in the prompt; this limits the information in the prompt and reduces its length.

In [41]:
# One prompt is sent for the whole result set rather than one per object.
task = "What do these books have in common, if anything?"

response = books.generate.near_text(
    query="The universe and it's history",
    limit=limit,
    grouped_task=task,
    # Only these properties are passed to the prompt. The names must match the
    # collection schema, where the description field is "comments" (plural).
    grouped_properties=["title", "comments"]
)

# print the generated response
print(response.generated)

The books listed have the common theme of exploring and discussing the universe and the Earth. They delve into topics such as the history, knowledge, and age of the universe, as well as the myths and theories surrounding it.
