## Building MultiModal Search with Vector Databases 

### Weaviate Setup

Start the Weaviate Docker container (from the directory containing the Docker Compose file) with:

```
docker compose up -d
```

### Dependencies

    1. The Weaviate Python Client

In [None]:
! pip install --pre -I "weaviate-client==4.4.1"

### Connect to Weaviate

In [3]:
import os

import weaviate

# Open a connection to the locally running Weaviate instance.
client = weaviate.connect_to_local()

# Left as the last expression so the notebook displays the readiness flag.
client.is_ready()

True

In [5]:
# Show the server metadata (hostname, enabled modules, Weaviate version).
client.get_meta()

{'hostname': 'http://[::]:8080',
 'modules': {'multi2vec-bind': {'model': 'ImageBindModel', 'version': 1}},
 'version': '1.23.7'}

### Create the `Animals` Collection

In [11]:
import weaviate.classes.config as wc

# Drop any existing "Animals" collection so this cell can be re-run safely.
if client.collections.exists("Animals"):
    client.collections.delete("Animals")

# multi2vec-bind vectorizer: each list names the property that holds the
# media of that type for an object.
bind_vectorizer = wc.Configure.Vectorizer.multi2vec_bind(
    image_fields=["image"],
    audio_fields=["audio"],
    video_fields=["video"],
)

# Left as the last expression so the notebook displays the new collection.
client.collections.create(name="Animals", vectorizer_config=bind_vectorizer)

<weaviate.collections.collection.Collection at 0x11ba480d0>

In [8]:
import base64

def toBase64(path):
    """Return the contents of the file at `path` as a base64 string.

    The file is read in binary mode, base64-encoded, and the encoded bytes
    are decoded as UTF-8 so the result is a plain `str`.
    """
    with open(path, mode="rb") as media_file:
        raw_bytes = media_file.read()

    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")


### Insert Images into Weaviate

> If you get timeout errors, reduce the batch size by lowering the value `5` in
> `if (len(items) == 5):`

In [12]:
animals = client.collections.get("Animals")

image_dir = "./source/image/"
source = os.listdir(image_dir)
items = []

for name in source:
    print(f"Adding {name}")

    path = image_dir + name
    image_object = {
        "name": name,             # name of the file
        "path": path,             # path to the file, used to display results
        "image": toBase64(path),  # vectorized: "image" is the image property in vectorizer_config
        "mediaType": "image",     # label telling us how to display the resource
    }
    items.append(image_object)

    # Send the queued images once a batch of 5 has been collected.
    if (len(items) == 5):
        print("Inserting 5 new image objects.")
        animals.data.insert_many(items)
        items.clear()

# Send whatever is left over from the last, incomplete batch.
if items:
    print(f"Inserting remaining ({len(items)}) items.")
    animals.data.insert_many(items)

Adding dog3.jpg
Adding dog2.jpg
Adding dog1.jpg
Adding cat1.jpg
Adding cat2.jpg
Inserting 5 new image objects.
Adding cat3.jpg
Adding meerkat3.jpg
Adding meerkat2.jpg
Adding meerkat1.jpg
Inserting remaining (4) items.


## Check count
> Total count should be 9 (9x image)

In [2]:
# Object count: fetch the collection handle and aggregate over every object;
# the displayed result carries the total number of stored objects.
animals = client.collections.get("Animals")
animals.aggregate.over_all()

AggregateReturn(properties={}, total_count=9)

### Insert Audio Files into Weaviate

In [14]:
animals = client.collections.get("Animals")

audio_dir = "./source/audio/"
source = os.listdir(audio_dir)
items = []

for name in source:
    print(f"Adding {name}")

    path = audio_dir + name
    audio_object = {
        "name": name,             # name of the file
        "path": path,             # path to the file, used to display results
        "audio": toBase64(path),  # base64 payload stored in the "audio" property
        "mediaType": "audio",     # label telling us how to display the resource
    }
    items.append(audio_object)

    # Send the queued audio files once a batch of 3 has been collected.
    if len(items) == 3:
        print("Inserting 3 new audio objects.")
        animals.data.insert_many(items)
        items.clear()

# Send whatever is left over from the last, incomplete batch.
if items:
    print(f"Inserting remaining ({len(items)}) items.")
    animals.data.insert_many(items)

Adding mixkit-little-birds-singing-in-the-trees-17.wav
Adding mixkit-jungle-ape-sound-2419.wav
Adding mixkit-rooster-crowing-in-the-morning-2462.wav
Inserting 3 new audio objects.
Adding mixkit-dog-barking-twice-1.wav
Adding mixkit-cow-moo-1744.wav
Adding mixkit-cartoon-kitty-begging-meow-92.wav
Inserting 3 new audio objects.


## Check count
> Total count should be 15 (9x image + 6x audio)

In [15]:
# Re-count all objects now that the audio files have been inserted.
animals.aggregate.over_all()

AggregateReturn(properties={}, total_count=15)

### Insert Video Files into Weaviate

In [16]:
animals = client.collections.get("Animals")

video_dir = "./source/video/"
source = os.listdir(video_dir)

for name in source:
    print(f"Adding {name}")

    path = video_dir + name

    # Videos are inserted one object per request instead of in batches.
    animals.data.insert({
        "name": name,             # name of the file
        "path": path,             # path to the file, used to display results
        "video": toBase64(path),  # base64 payload stored in the "video" property
        "mediaType": "video",     # label telling us how to display the resource
    })

Adding dog-high-five.mp4
Adding dog-with-stick.mp4
Adding cat-clean.mp4
Adding meerkat-dig.mp4
Adding cat-play.mp4
Adding meerkat-watch.mp4


## Check count
> Total count should be 21 (9x image + 6x audio + 6x video)

In [4]:
# Re-count all objects now that the video files have been inserted.
animals.aggregate.over_all()

AggregateReturn(properties={}, total_count=21)

In [5]:
# Break the object count down by the "mediaType" label and print one
# group (media type + count) per line.
agg = animals.aggregate.over_all(group_by="mediaType")

for group in agg.groups:
    print(group)


AggregateGroup(grouped_by=GroupedBy(prop='mediaType', value='image'), properties={}, total_count=9)
AggregateGroup(grouped_by=GroupedBy(prop='mediaType', value='audio'), properties={}, total_count=6)
AggregateGroup(grouped_by=GroupedBy(prop='mediaType', value='video'), properties={}, total_count=6)


### Check all the media files added to the Vector Database

In [8]:
# Walk through every object in the collection, fetching only the two
# descriptive properties. Pass include_vector=True to iterator() as well
# in case you want to see the vectors.
itr = animals.iterator(return_properties=["name", "mediaType"])

for item in itr:
    print(item.properties)

{'mediaType': 'audio', 'name': 'mixkit-dog-barking-twice-1.wav'}
{'mediaType': 'audio', 'name': 'mixkit-rooster-crowing-in-the-morning-2462.wav'}
{'mediaType': 'video', 'name': 'dog-with-stick.mp4'}
{'mediaType': 'audio', 'name': 'mixkit-little-birds-singing-in-the-trees-17.wav'}
{'mediaType': 'image', 'name': 'cat1.jpg'}
{'mediaType': 'image', 'name': 'dog1.jpg'}
{'mediaType': 'image', 'name': 'meerkat3.jpg'}
{'mediaType': 'image', 'name': 'cat3.jpg'}
{'mediaType': 'audio', 'name': 'mixkit-cow-moo-1744.wav'}
{'mediaType': 'image', 'name': 'meerkat2.jpg'}
{'mediaType': 'audio', 'name': 'mixkit-cartoon-kitty-begging-meow-92.wav'}
{'mediaType': 'video', 'name': 'cat-clean.mp4'}
{'mediaType': 'image', 'name': 'dog2.jpg'}
{'mediaType': 'image', 'name': 'meerkat1.jpg'}
{'mediaType': 'video', 'name': 'meerkat-watch.mp4'}
{'mediaType': 'audio', 'name': 'mixkit-jungle-ape-sound-2419.wav'}
{'mediaType': 'video', 'name': 'meerkat-dig.mp4'}
{'mediaType': 'image', 'name': 'dog3.jpg'}
{'mediaType':