In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
import weaviate
import weaviate.classes as wvc

In [2]:
def check_class_exists(client, class_name):
    # Get the current schema
    schema = client.schema.get()

    # Check if the class name exists in the schema
    for cls in schema['classes']:
        if cls['class'] == class_name:
            return True
    return False

In [3]:
load_dotenv()

True

Load the source data about books, contained in a CSV file, into a list of python dictionaries, with some cleaning of the comments field

In [4]:
csv_file_path = os.getenv("CSVFILE")
columns_to_read = ['authors', 'comments', 'publisher', 'tags', 'title'] 
# Read only the specified columns from the CSV file
df = pd.read_csv(csv_file_path, usecols=columns_to_read)
# Clean the 'comments' column
# Remove '\n' characters
df['comments'] = df['comments'].str.replace('\n', ' ', regex=False)
# Convert the DataFrame into a list of dictionaries
books_dict = df.to_dict(orient='records')

Create the Weaviate client (v4) with the OpenAI API key, connect and reply timeouts

In [5]:
# connect to client V4
wport = os.getenv("WPORT")
grpcport = os.getenv("GRPCPORT")
wclient = weaviate.connect_to_local(
    port=int(wport), grpc_port=int(grpcport), headers={"X-OpenAI-Api-Key": os.environ["OPENAI_APIKEY"]}, timeout=(2, 5)
)

Define the MyBook schema and create the corresponding collection

In [6]:
collname = os.getenv("COLLNAME")
collection = wclient.collections.create(
    name=collname,
    vectorizer_config=wvc.Configure.Vectorizer.text2vec_openai(),
    generative_config=wvc.Configure.Generative.openai(), 
    properties=[
        wvc.Property(
            name="authors",
            data_type=wvc.DataType.TEXT
        ),
        wvc.Property(
            name="comments",
            data_type=wvc.DataType.TEXT
        ),
        wvc.Property(
            name="publisher",
            data_type=wvc.DataType.TEXT
        ),
        wvc.Property(
            name="tags",
            data_type=wvc.DataType.TEXT
        ),
        wvc.Property(
            name="title",
            data_type=wvc.DataType.TEXT
        ),
    ]
)

Use the MyBooks collection and batch insert the data (will take around 30 seconds)

In [7]:
books = wclient.collections.get(collname)
books.data.insert_many(books_dict)  # This uses batching under the hood

BatchObjectReturn(all_responses=[UUID('45c3bfb5-d974-4f0a-b9c0-71e31d414902'), UUID('47665fd8-f05d-4b19-a8ea-2c99be667053'), UUID('b23c2930-7ace-4713-b124-bee44874bc5b'), UUID('26cca8a9-2d5e-4f05-a587-87bd49025b0a'), UUID('249d51ff-3f6c-4264-9ad5-41512a3f8915'), UUID('dc0d9ea3-9f50-4980-b795-feee5bca1363'), UUID('541ff688-1706-4a46-ac04-5cbf611b85ef'), UUID('44a020f8-6c7a-4c45-bf39-1a622d4f2b68'), UUID('b50d1e9c-d410-466a-a062-4a0ad675bd57'), UUID('73158977-6f59-4168-ba19-0d755ed5df18'), UUID('73a8e0fa-4a95-49c5-ba89-635c4695b5cc'), UUID('923fa7a5-c598-4199-a3e0-b7656ec04a31'), UUID('5ed3d87d-a13a-420e-a5c4-3b0a38d1658c'), UUID('2cbefefc-a75e-43cc-bfc0-f668bacd2b3e'), UUID('63667a73-5724-4654-b624-dd48ce55caad'), UUID('3fd780b3-6fe9-448a-8c9d-315914d7666b'), UUID('178846d0-1e4d-478a-a253-616a81d42668'), UUID('21369f3e-bf8f-4163-9060-68e9bc40afad'), UUID('22f3ec9d-ccc3-4e17-8424-9b3269206186'), UUID('c4a732de-bc8d-46a5-824f-c7380059cf05'), UUID('d61ca0d6-0151-4b9d-98f5-e5ce4693dd3e'), U

In [8]:
limit = int(os.getenv("K_VECTORS"))

# Now do some searches

First a semantic search

In [15]:
TOPIC = "exploration and traveling"
response = books.query.near_text(
    query=TOPIC,
    limit=limit
)

# Loop through each object in the 'objects' list
for obj in response.objects:
    # Print the properties of each object
    print(obj.properties)

{'title': 'The Age of Exploration: From Christopher Columbus to Ferdinand Magellan', 'tags': 'Juvenile Nonfiction, Reference, General, Adventure & Adventurers, history, Exploration & Discovery', 'publisher': 'Britanncia Educational Publishing', 'authors': 'Britannica Educational Publishing', 'comments': 'The Age of Exploration, which spanned roughly from 1400 to 1550, was the first time in history that European powers—eyeing new trade routes to the East or seeking to establish empires—began actively looking far past their own borders to gain a better understanding of the world and its many resources. The individuals who set out on behalf of the countries they represented came from a variety of backgrounds, and included master navigators such as Christopher Columbus and Ferdinand Magellan—the latter of whom was the first to circle the globe—as well as the often ruthless conquistadors of the New World such as Francisco Pizarro and Hernan Cortes. The exciting and sometimes tragic lives an

Semantic search with a Boolean filter. Using the equal filter on the tags property. Filter is case insensitive

In [13]:
FILTER = "crime"
response = books.query.near_text(
    query="biology",
    limit=limit,
    filters=wvc.Filter(path="tags").equal(FILTER)
)

for obj in response.objects:
    # Print the properties of each object
    print(obj.properties)

{'title': 'The Scientist and the Spy: A True Story of China, the FBI, and Industrial Espionage', 'tags': 'Business & Economics, True Crime, Espionage, Globalization, Political Science, Intelligence & Espionage', 'publisher': 'Penguin', 'authors': 'Mara Hvistendahl', 'comments': 'A riveting true story of industrial espionage in which a Chinese-born scientist is pursued by the U.S. government for trying to steal trade secrets, by a finalist for the Pulitzer Prize in nonfiction. \xa0 In September 2011, sheriff’s deputies in Iowa encountered three ethnic Chinese men near a field where a farmer was growing corn seed under contract with Monsanto. What began as a simple trespassing inquiry mushroomed into a two-year FBI operation in which investigators bugged the men’s rental cars, used a warrant intended for foreign terrorists and spies, and flew surveillance planes over corn country—all in the name of protecting trade secrets of corporate giants Monsanto and DuPont Pioneer. In The Scientist

Generative search (single prompt)

In [31]:
response = books.generate.near_text(
    query="The history human evolution",
    limit=limit,
    single_prompt="Explain {title} in a short paragraph of maximum thirty words."
)

#print(response.objects[0].generated)  # Inspect the generated text
for o in response.objects:
    print(o.properties["title"])
    print("-"*30)
    print(o.generated)
    print("\n")

Sapiens: A Brief History of Humankind
------------------------------
"Sapiens" is a concise exploration of human history, from the emergence of Homo sapiens to the present, examining our species' impact on the world and the forces that shaped our societies.


The Invisible History of the Human Race: How DNA and History Shape Our Identities and Our Futures
------------------------------
The Invisible History of the Human Race explores how our DNA and historical events intertwine to shape our identities and influence our future.


A People's History of the World: From the Stone Age to the New Millennium
------------------------------
A People's History of the World is a comprehensive account of human history, focusing on the struggles and achievements of ordinary people throughout different eras and civilizations.


[ Man the Hunted: Primates, Predators, and Human Evolution (Expanded) ] by Hart, Donna L. ( Author ) [ 2008 ) [ Paperback ]
------------------------------
"Man the Hunted: Pr