Processing pipeline adapted from Taylor Arnold and Lauren Tilton's "Explainable Search and Discovery of Visual Cultural Heritage Collections with Multimodal Large Language Models" https://2024.computational-humanities-research.org/papers/paper28/ 

In [None]:
import os
import tropy
from dotenv import load_dotenv
import numpy as np
import faiss
import base64
from PIL import Image
from typing import List, Tuple
from ollama import chat
from ollama import embeddings
from ollama import ChatResponse

In [None]:
# Configuration -- not currently used
# Set ollama model
# model='llama3.2-vision'

In [None]:
# Initialize a vector database (using FAISS for simplicity).
# DIMENSION is the embedding length and must match the embedding model:
# 3072 for text-embedding-3-large; 1024 for mxbai-embed-large.
DIMENSION = 1024
# Flat FAISS index comparing vectors by L2 distance; process_images() adds
# caption embeddings to it and search_images() queries it.
index = faiss.IndexFlatL2(DIMENSION)

In [None]:
# Metadata for every indexed image. Keys are integers that search_images()
# matches against the positions returned by index.search(); each value is a
# dict with "file_path", "caption" and "item_id" (the Tropy item).
metadata_store = {}

In [None]:
# Function to process an image and generate a caption with ollama
def generate_caption(
    image_path: str,
    *,
    model: str = 'llama3.2-vision',
    prompt: str = None,
) -> str:
    """
    Generate a caption for an image using ollama.

    Args:
        image_path: Path of the image file; handed to ollama unchanged in the
            message's ``images`` list.
        model: Name of the ollama vision model to query. Defaults to the
            model this notebook has always used.
        prompt: Instruction sent along with the image. When omitted, the
            built-in prompt asking for a description of the content and of
            the photographic composition is used.

    Returns:
        The text content of the model's reply, unmodified.
    """
    if prompt is None:
        prompt = "Please provide a detailed plain text description of what this photograph portrays. Also provide a description of its composition as a photograph."

    response = chat(
        model=model,
        messages=[{
            'role': 'user',
            'content': prompt,
            'images': [image_path],
        }],
    )
    return response['message']['content']

In [None]:
# Function to generate embeddings for a text with ollama
def generate_embedding(text: str, *, model: str = 'mxbai-embed-large') -> np.ndarray:
    """
    Generate a text embedding using ollama.

    Args:
        text: The text to embed (a caption or a search query).
        model: Name of the ollama embedding model. The length of the vectors
            it produces must equal DIMENSION, otherwise they cannot be added
            to or searched in the FAISS index.

    Returns:
        A 1-D numpy array built from the "embedding" field of the response.
    """
    response = embeddings(
        model=model,
        prompt=text,
    )

    return np.array(response["embedding"])


In [None]:
# Retrieve all items from Tropy; process_images() iterates over `items`
items = tropy.get_all_items()
# Echo the result so the notebook output shows what was retrieved
print(items)

In [None]:
def process_images():
    """
    Process images, generate captions and embeddings, and store them in the vector database.

    For every entry in the module-level ``items`` the first photo is
    captioned, the caption is embedded, the embedding is appended to the
    FAISS ``index`` and a metadata record is written to ``metadata_store``.

    Items without photos, with photos in an unexpected format, or whose first
    photo is not an existing file are reported and skipped. Any exception
    raised while handling one item is printed and the loop moves on to the
    next item, so one bad image does not abort the whole run.
    """
    for item in items:
        try:
            photos = tropy.get_item_photos(item)
            if not photos:
                print(f"No photos found for item {item}")
                continue

            # Indexing with [0] below is only meaningful on a list
            if not isinstance(photos, list):
                print(f"Unexpected format for photos in item {item}: {photos}")
                continue

            # Get the first photo -- TODO: handle multi-image / multi-page items
            image_path = photos[0]

            if not os.path.isfile(image_path):
                print(f"File not found: {image_path}")
                continue

            # Generate a single-line caption
            caption = generate_caption(image_path).replace("\n", " ")

            # Generate an embedding as a (1, n) float32 matrix, as FAISS expects
            embedding = generate_embedding(caption)
            embedding_array = np.array([embedding], dtype=np.float32)

            # Validate embedding before adding
            if embedding_array.shape[1] != DIMENSION:
                raise ValueError(f"Embedding dimension mismatch. Expected {DIMENSION}, got {embedding_array.shape[1]}")

            # The row id FAISS assigns to the next vector is the current
            # vector count. Keying the metadata by that id (rather than by
            # len(metadata_store)) keeps search results pointing at the right
            # record even if the index and the metadata dict were not
            # created or reset together.
            vector_id = index.ntotal
            index.add(embedding_array)

            # Store metadata
            metadata_store[vector_id] = {
                "file_path": image_path,  # we probably don't need this
                "caption": caption,
                "item_id": item,  # Tropy item ID for traceability back to the application
            }
        except ValueError as ve:
            print(f"ValueError while processing item {item}: {ve}")
        except Exception as e:
            # Best-effort batch run: report the failure and continue
            print(f"Error processing item {item}: {e}")


In [None]:
# Function to search for images
def search_images(query: str, top_k: int = 5) -> List[Tuple[str, str, str]]:
    """
    Search for images using a natural language query.

    Args:
        query: Free-text description of what to look for.
        top_k: Maximum number of results to return. A value of zero or less
            yields an empty list instead of an error from FAISS.

    Returns:
        A list of (file_path, caption, item_id) tuples, closest match first.
        It may hold fewer than ``top_k`` entries, and is empty when nothing
        has been indexed yet.

    Raises:
        ValueError: If the query embedding length differs from the index
            dimension (e.g. the embedding model was changed after indexing).
    """
    # Nothing was asked for: skip the embedding call and the FAISS lookup
    if top_k <= 0:
        return []
    # Nothing has been indexed: no point in embedding the query
    if index.ntotal == 0:
        return []

    query_embedding = generate_embedding(query)
    query_array = np.array([query_embedding], dtype=np.float32)
    if query_array.shape[1] != index.d:
        raise ValueError(f"Embedding dimension mismatch. Expected {index.d}, got {query_array.shape[1]}")

    # Perform a similarity search; the distances are not needed here
    _distances, indices = index.search(query_array, top_k)

    results = []
    for idx in indices[0]:
        # FAISS pads missing neighbours with -1, which is never a key
        entry = metadata_store.get(int(idx))
        if entry is None:
            continue
        results.append((entry["file_path"], entry["caption"], entry["item_id"]))

    return results

In [None]:
# Let's go! Caption, embed and index the first photo of every item in `items`
process_images()

In [None]:
# Perform a search with the default number of results (top_k=5).
# `query` is reused further down as the tag name for the matching items.
query = "Aerospace"
results = search_images(query)

In [None]:
# Print one summary line per search hit
for hit in results:
    file_path, caption, item_id = hit
    print(f"Image: {file_path}, Caption: {caption}, Item ID: {item_id}")

In [None]:
# Keep only the Tropy item id, the third field of each (file_path, caption, item_id) hit
matching_item_ids = [item_id for _file_path, _caption, item_id in results]

In [None]:
# Tag matching items with the search query, using the query string itself
# as the single tag name.
# NOTE(review): presumably the helper creates the tag in Tropy if it does
# not exist yet -- confirm against the tropy module.
for item in matching_item_ids:
    tropy.tag_item_by_tag_name(item, [query])