Connect to Weaviate

In [33]:
import weaviate

client = weaviate.connect_to_local()

client.is_ready()

True

Creating a collection in Weaviate that supports both Image and Text data <br>
Has 3 fields: <br>
"name", "image", "text" <br>
10%, 70%, 20% <br>
"name": name of image file <br>
"image": base64 encoding of image itself <br>
"text": any textual data <br>

In [34]:
# Chunking
# Insert embedded text into weaviate

import weaviate.classes.config as wc

collection_name = "DemoCollection"  # Replace collection name here

# Check if the collection already exists and delete it if it does
if client.collections.exists(collection_name):
    client.collections.delete(collection_name)

client.collections.create(
        name=collection_name,
        properties=[
            wc.Property(name="name", data_type=wc.DataType.TEXT),
            wc.Property(name="image", data_type=wc.DataType.BLOB),
            wc.Property(name="text", data_type=wc.DataType.TEXT),
        ],
        # Define & configure the vectorizer module
        vectorizer_config=wc.Configure.Vectorizer.multi2vec_clip(
            image_fields=[wc.Multi2VecField(name="image", weight=0.7)],    # 70% of the vector is from the image
            text_fields=[wc.Multi2VecField(name="name", weight=0.1),       # 10% of the vector is from the name
                         wc.Multi2VecField(name="text", weight=0.2)],      # 20% of the vector is from the text
        ),
    )

print("Collection created")


Collection created


In [35]:
import base64
import os
import requests

# Helper function to convert file to base64 representation
def to_base64(url_or_path):
    if url_or_path.startswith('http://') or url_or_path.startswith('https://'):
        # Handle URL
        image_response = requests.get(url_or_path)
        content = image_response.content
    elif os.path.exists(url_or_path):
        # Handle local file path
        with open(url_or_path, 'rb') as image_file:
            content = image_file.read()
    else:
        raise ValueError("The provided string is neither a valid URL nor a local file path.")
    
    return base64.b64encode(content).decode("utf-8")

Downloading images using simple image search

In [21]:
import argparse
from simple_image_download import simple_image_download as simp
from PIL import Image
import os

def image_download(query, number=5):
    # Initialize the argument parser
    parser = argparse.ArgumentParser(description="Download images.")
    parser.add_argument('query', type=str, help='Search query for images')
    parser.add_argument('number', type=int, nargs='?', default=5, help='Number of images to download (default: 5)')
    args = parser.parse_args([query, str(number)])

    # Extract arguments
    search_query = args.query
    num_images = args.number

    # Initialize the simple image download instance
    response = simp.simple_image_download

    # Specify the directory where images will be downloaded
    download_directory = 'downloads'

    # Download images for the specified query
    response().download(search_query, num_images)  # Download images for the search query

# Example usage
image_download("sample charts", 10)


  open(os.path.join(path, filename), 'wb').write(r.content)
  open(os.path.join(path, filename), 'wb').write(r.content)
  open(os.path.join(path, filename), 'wb').write(r.content)
  open(os.path.join(path, filename), 'wb').write(r.content)
  open(os.path.join(path, filename), 'wb').write(r.content)
  open(os.path.join(path, filename), 'wb').write(r.content)
  open(os.path.join(path, filename), 'wb').write(r.content)
  open(os.path.join(path, filename), 'wb').write(r.content)
  open(os.path.join(path, filename), 'wb').write(r.content)
  open(os.path.join(path, filename), 'wb').write(r.content)


Insert images into Weaviate

In [37]:
collection = client.collections.get(collection_name)

directory = "./simple_images/cat/"
source = os.listdir(directory)

with collection.batch.dynamic() as batch:
    for src_obj in source:
        path = directory + src_obj
        poster_b64 = to_base64(path)
        weaviate_obj = {
            "name": src_obj,
            "image": poster_b64  # Add the image in base64 encoding
            # "text": # Optional text field for image data
        }

        # The model provider integration will automatically vectorize the object
        batch.add_object(
            properties=weaviate_obj,
        )
print("images added")

  with collection.batch.dynamic() as batch:


images added


Insert text into Weaviate

In [38]:
collection = client.collections.get(collection_name)

text_list = ["Cats in fact do not like dogs because they are enemies",
             "Eggplant is a natural enemy of the cat",
             "Cats love fish",
             "Monsters are enemies of battle cats",             
             ]

with collection.batch.dynamic() as batch:
    for text in text_list:
        weaviate_obj = {
            "text": text
        }

        # The model provider integration will automatically vectorize the object
        batch.add_object(
            properties=weaviate_obj,
            # vector=vector  # Optionally provide a pre-obtained vector
        )

Iterate through collection and print items

In [39]:
collection = client.collections.get(collection_name)

for item in collection.iterator(
    include_vector=True  # If using named vectors, you can specify ones to include e.g. ['title', 'body'], or True to include all
):
    print(item.properties)
    print(item.vector)


{'text': None, 'name': 'cat_10.jpg'}
{'default': [-0.009262467734515667, -0.021669311448931694, -0.02965744398534298, -0.030758971348404884, 0.011381901800632477, -0.04983258619904518, 0.0019207950681447983, -0.049943968653678894, 0.023534145206212997, 0.010973664000630379, 0.022279979661107063, -0.03288796544075012, 0.0836581364274025, -0.04344154894351959, 0.029455261304974556, 0.050731390714645386, 0.07788645476102829, -0.03320949152112007, -0.0105122746899724, 0.007612486835569143, -0.041387904435396194, 0.013101105578243732, 0.05370136350393295, -0.03723573312163353, -0.07332020252943039, 0.006293165031820536, 0.022696783766150475, 0.004078669473528862, -0.013972785323858261, 0.01709846965968609, 0.0021561915054917336, 0.030108686536550522, 0.015353970229625702, 0.028023656457662582, 0.00236251438036561, 0.026085691526532173, 0.030260473489761353, -0.009622653014957905, 0.02653174102306366, 0.1351548433303833, -0.04051363095641136, -0.05076313018798828, -0.0432853028178215, -0.042

Sample nearText query

In [40]:
import weaviate.classes as wvc
from weaviate.classes.query import Move

collection = client.collections.get(collection_name)

response = collection.query.near_text(
    query="What do cats consider as their enemies?",
    distance=0.6,
    # move_to=Move(force=0.85, concepts="enemy"),
    # move_away=Move(force=0.45, concepts="friends"),
    return_metadata=wvc.query.MetadataQuery(distance=True),
    limit=2
)

for o in response.objects:
    print(o.properties)
    print(o.metadata)



{'text': 'Cats in fact do not like dogs because they are enemies', 'name': None}
MetadataReturn(creation_time=None, last_update_time=None, distance=0.029944539070129395, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None)
{'text': 'Monsters are enemies of battle cats', 'name': None}
MetadataReturn(creation_time=None, last_update_time=None, distance=0.05284714698791504, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None)


In [31]:
import weaviate.classes as wvc
from weaviate.classes.query import Move

collection = client.collections.get(collection_name)

response = collection.query.near_text(
    query="pie chart",
    distance=0.6,
    # move_to=Move(force=0.85, concepts="enemy"),
    # move_away=Move(force=0.45, concepts="friends"),
    return_metadata=wvc.query.MetadataQuery(distance=True),
    limit=2
)

for o in response.objects:
    print(o.properties)
    print(o.metadata)

{'text': None, 'name': 'chart2.png'}
MetadataReturn(creation_time=None, last_update_time=None, distance=0.3431311845779419, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None)
{'text': None, 'name': 'chart1.png'}
MetadataReturn(creation_time=None, last_update_time=None, distance=0.37204670906066895, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None)


In [None]:
# todo: 2 collections with different weightages
# API with flag for normal vs pictorial query
# normal query: emphasis on image
# pictorial query: emphasis on caption/text of image