## Building MultiModal Search with Vector Databases 

### Dependencies

    1. The Weaviate Python Client

In [1]:
#! pip install -U "weaviate-client==4.5.4"

## Connect to Weaviate

In [2]:
%env PALM_API_KEY=ya29.a0Ad52N39AI9_3VqOY

env: PALM_API_KEY=ya29.a0Ad52N39AI9_3VqOY


In [3]:
client.close()

NameError: name 'client' is not defined

In [4]:
import weaviate, os

client = weaviate.connect_to_embedded(
    version="1.24.4",
    environment_variables={
        "ENABLE_MODULES": "multi2vec-palm",
    },
    headers={
        "X-PALM-Api-Key": os.getenv("PALM_API_KEY"),
    }
)

client.is_ready()



Binary /Users/esmaeilatashpazgargari/.cache/weaviate-embedded did not exist. Downloading binary from https://github.com/weaviate/weaviate/releases/download/v1.24.4/weaviate-v1.24.4-Darwin-all.zip
Started /Users/esmaeilatashpazgargari/.cache/weaviate-embedded: process ID 94841


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-03-20T23:55:11-07:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-03-20T23:55:11-07:00"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-03-20T23:55:11-07:00"}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50050","time":"2024-03-20T23:55:11-07:00"}
{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:8079","time":"2024-03-20T23:55:11-07:00"}


True

{"action":"telemetry_push","level":"info","msg":"telemetry started","payload":"\u0026{MachineID:a6a14a6a-e496-4ee8-b3d2-fba478729c17 Type:INIT Version:1.24.4 Modules:multi2vec-palm NumObjects:0 OS:darwin Arch:arm64}","time":"2024-03-20T23:55:12-07:00"}


In [5]:
client.get_meta()

{'hostname': 'http://127.0.0.1:8079',
 'modules': {'multi2vec-palm': {'documentationHref': 'https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-multimodal-embeddings',
   'name': 'Google PaLM Multimodal Module'}},
 'version': '1.24.4'}

## Create the `Animals` Collection

In [6]:
from weaviate.classes.config import Configure

if(client.collections.exists("Animals")):
    client.collections.delete("Animals")

client.collections.create(
    name="Animals",
        vectorizer_config=Configure.Vectorizer.multi2vec_palm(
        text_fields=["content"],
        image_fields=["image"],
        video_fields=["video"],

        project_id="semi-random-dev",
        location="us-central1",
        model_id="multimodalembedding@001", # The multimodal embedding model
        dimensions=1408, # default: 1408 available settings: 128, 256, 512, 1408ar - video embeddings require 1408
    )
)

{"level":"info","msg":"Created shard animals_MV0Xkjm3Z6ZD in 2.050709ms","time":"2024-03-20T23:55:17-07:00"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-03-20T23:55:17-07:00","took":78375}


<weaviate.collections.collection.Collection at 0x107658d30>

In [7]:
import base64

# Helper function to convert a file to base64 representation
def toBase64(path):
    with open(path, 'rb') as file:
        return base64.b64encode(file.read()).decode('utf-8')


## Insert Images into Weaviate

> if you get timeout errors, reduce the value from '5' in
> `if (len(items) == 5):`

In [8]:
animals = client.collections.get("Animals")

source = os.listdir("./source/image/")

with animals.batch.rate_limit(requests_per_minute=100) as batch:
    for name in source:
        print(f"Adding {name}")
        
        path = "./source/image/" + name
    
        batch.add_object({
            "name": name,            # name of the file
            "path": path,            # path to the file to display result
            "image": toBase64(path), # this gets vectorized - "image" was configured in vectorizer_config as the property holding images
            "mediaType": "image",    # a label telling us how to display the resource 
        })


Adding dog3.jpg
Adding dog2.jpg
Adding dog1.jpg
Adding cat1.jpg
Adding cat2.jpg
Adding cat3.jpg
Adding meerkat3.jpg
Adding meerkat2.jpg
Adding meerkat1.jpg
Adding .ipynb_checkpoints


IsADirectoryError: [Errno 21] Is a directory: './source/image/.ipynb_checkpoints'

In [None]:
# Check for failed objects
if len(animals.batch.failed_objects) > 0:
    print(f"Failed to import {len(animals.batch.failed_objects)} objects")
    for failed in animals.batch.failed_objects:
        print(f"e.g. Failed to import object with error: {failed.message}")
else:
    print("No errors")

## Check count
> Total count should be 9 (9x image)

In [None]:
#Object count
animals = client.collections.get("Animals")
animals.aggregate.over_all()

## Insert Video Files into Weaviate
> Note. the input video must be at least 4 seconds long

In [None]:
animals = client.collections.get("Animals")

source = os.listdir("./source/video/")

for name in source:
    print(f"Adding {name}")
    path = "./source/video/" + name    

    # insert videos one by one
    animals.data.insert({
        "name": name,
        "path": path,
        "video": toBase64(path),
        "mediaType": "video"
    })

## Check count
> Total count should be 15 (9x image + 6x video)

In [None]:
animals.aggregate.over_all()

In [None]:
agg = animals.aggregate.over_all(
    group_by="mediaType"
)

for group in agg.groups:
    print(group)


## Check all the media files added to the Vector Database

In [None]:
itr = animals.iterator(
    return_properties=["name", "mediaType"],
    # include_vector=True, # in case you want to see the vectors
)

for item in itr:
    print(item.properties)

In [None]:
client.close()