#### Friday, January 19, 2024

Vectorize the Match Images and then store them to Elastic

In [1]:
# Only create the index once ...
createIndex = False

In [2]:
import torch
import os
import torchvision.transforms as transforms
import json
from PIL import Image
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch, helpers
from tqdm import tqdm

In [3]:

# only target the 4090 ...
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
# Set the directory containing your images
image_dir = '../../NLP4HTML/express/express-match/public/images'

# set index name
index_name = 'match_images'

Establish the connection to Elastic Search

In [5]:
from elasticsearch import Elasticsearch 

# Password for the 'elastic' user generated by Elasticsearch
# esHost = "https://172.19.0.3:9200"
esHost = "https://172.19.0.2:9200"

ELASTIC_PASSWORD = "*l-ncw4tr*UF-qzEybkq"

path2cert = "/elasticsearch/http_ca.crt"

esClient = Elasticsearch(esHost, ca_certs=path2cert, basic_auth=("elastic", ELASTIC_PASSWORD))

esClient.info()

ObjectApiResponse({'name': 'f8bb06ea76ae', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'JtIKr7BlTaWWORxN6-qEEw', 'version': {'number': '8.11.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '76013fa76dcbf144c886990c6290715f5dc2ae20', 'build_date': '2023-12-05T10:03:47.729926671Z', 'build_snapshot': False, 'lucene_version': '9.8.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [6]:
# Download and load the image model
model = SentenceTransformer('clip-ViT-B-32-multilingual-v1')

# Prepare the image transformation function
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    lambda image: image.convert("RGB"),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

Scan the target images folder, then save the image path names into a pandas dataframe.

In [7]:
import os
import pandas as pd

def scan_images(folder_path):
    image_files = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff')):
                image_files.append(os.path.join(root, file))
    return image_files

In [8]:
ImageFiles = scan_images(image_dir)

In [9]:
someMatchImage = ImageFiles[42]
someMatchImage

'../../NLP4HTML/express/express-match/public/images/47uOeLUWYFpDG9gp2yVOhA2/c8008f65-9dde-eb11-910f-a0369ff9ce34.jpeg'

In [10]:
someMatchImage = '../../NLP4HTML/express/express-match/public/images/ko1yxtyDnBYUnshNvpaiAw2/df2d5f53-5683-ed11-90fd-f40343f49558.jpeg'

In [11]:
# # Check if the index exists and kill it if it does
# if esClient.indices.exists(index=index_name):
#     # Create the index with the defined mapping
#     esClient.indices.delete(index=index_name)

In [12]:
def create_mapping_if_new(index_name, es):

    # Define the mapping
    mapping = {
      "mappings": {
        "properties": {
          "image_vector": {
            "type": "dense_vector",
            "dims": 512,
            "index": True,
            "similarity": "cosine"
          } ,
          "filename": {
            "type": "keyword"
          }
      }
    }
  }

    # Check if the index does not exist
    if not es.indices.exists(index=index_name):
        # Create the index with the defined mapping
        es.indices.create(index=index_name, body=mapping)

In [41]:
def embed_image(image_path):
    
    # Open the image file
    with Image.open(image_path) as img:
        
        # Apply the transformations to the image
        image = transform(img).unsqueeze(0)

        # If a GPU is available, move the image to the GPU
        if torch.cuda.is_available():
            image = image.to('cuda')
            model.to('cuda')

        # Generate the image vector using the model
        image_vector = model.encode(image)

        # Check if it's a torch tensor and move to CPU if so
        if isinstance(image_vector, torch.Tensor):
            image_vector = image_vector.cpu().numpy()

        # Convert to list
        image_vector = image_vector.tolist()

        # Return the image vector
        return image_vector

In [14]:
# validate this is going to work ... 
imageVector = embed_image(ImageFiles[0])

In [15]:
if createIndex:
    # Create new Index with correct mapping if index does not exist
    create_mapping_if_new(index_name, esClient)

In [16]:
if createIndex:
    
    # Initialize a dictionary to hold the image filename and vector
    data = {}

    totalImages = len(ImageFiles)

    # For each image file in the directory
    for i in tqdm(range(totalImages)):

        fileName = ImageFiles[i]
        
        # Get image vector
        image_vector = embed_image(fileName)

        # Store it in the data dictionary
        data[fileName] = image_vector[0]

# 5m 30.2s

In [17]:
if createIndex:
    # Index the image vectors to Elasticsearch
    documents = []
    for filename, vector in data.items():

        # Create document
        document = {'_index': index_name,
                    '_source': {"filename": filename,
                                "image_vector": vector
                        }
            }


        documents.append(document)

In [18]:
if createIndex:
  
  from elasticsearch.helpers import BulkIndexError

  # Index document
  try:
    helpers.bulk(esClient, documents)
  except BulkIndexError as e:
    for x in e.errors:
      print(x)

# 13.4s 

### kNN Search

Generate a vector for the search image

In [42]:
# this will only come back one time
someMatchImage = '../../NLP4HTML/express/express-match/public/images/47uOeLUWYFpDG9gp2yVOhA2/c8008f65-9dde-eb11-910f-a0369ff9ce34.jpeg'

In [49]:
# this should come back twice
someMatchImage = '../../NLP4HTML/express/express-match/public/images/ko1yxtyDnBYUnshNvpaiAw2/df2d5f53-5683-ed11-90fd-f40343f49558.jpeg'

In [51]:
from PIL import Image
search_image_vector = embed_image(someMatchImage)

Perform a kNN vector search

In [57]:
knn = {
    "field": "image_vector",
    "query_vector": search_image_vector[0],
    "k": 10,
    "num_candidates": 10
  }
fields = ["filename"]
size = 10
source = False

In [58]:
results = esClient.search(index=index_name,
                    knn=knn,
                    source=source,
                    fields=fields,
                    size=size
                  )

In [59]:
import json
json.dumps(results.body)

'{"took": 9, "timed_out": false, "_shards": {"total": 1, "successful": 1, "skipped": 0, "failed": 0}, "hits": {"total": {"value": 10, "relation": "eq"}, "max_score": 1.0, "hits": [{"_index": "match_images", "_id": "9wtxI40BKwEdzIJQdAZQ", "_score": 1.0, "fields": {"filename": ["../../NLP4HTML/express/express-match/public/images/SomeMadeUpFolder/df2d5f53-5683-ed11-90fd-f40343f49558.jpeg"]}}, {"_index": "match_images", "_id": "hwtxI40BKwEdzIJQlTIR", "_score": 1.0, "fields": {"filename": ["../../NLP4HTML/express/express-match/public/images/ko1yxtyDnBYUnshNvpaiAw2/df2d5f53-5683-ed11-90fd-f40343f49558.jpeg"]}}, {"_index": "match_images", "_id": "TwtxI40BKwEdzIJQcgII", "_score": 0.9998051, "fields": {"filename": ["../../NLP4HTML/express/express-match/public/images/ngc3jRLiTSvujyHkVL1iYw2/2d9074bc-48b3-eb11-910d-a0369ff9d0f4.jpeg"]}}, {"_index": "match_images", "_id": "SwtxI40BKwEdzIJQkzF5", "_score": 0.9997879, "fields": {"filename": ["../../NLP4HTML/express/express-match/public/images/ACB2xG

In [25]:
results['hits']['total']['value']

10

In [26]:
results['hits']['hits'][0]['_score']

1.0

In [27]:
results['hits']['hits'][1]['_score']

1.0

In [55]:
result_filename = results['hits']['hits'][0]['fields']['filename'][0]
result_filename

'../../NLP4HTML/express/express-match/public/images/SomeMadeUpFolder/df2d5f53-5683-ed11-90fd-f40343f49558.jpeg'

Display the top hit

In [None]:
from IPython.display import Image
Image(result_filename, width=400)