Processing pipeline adapted from Taylor Arnold and Lauren Tilton's "Explainable Search and Discovery of Visual Cultural Heritage Collections with Multimodal Large Language Models" https://2024.computational-humanities-research.org/papers/paper28/ 

In [1]:
import base64
import io
import mimetypes
import os
from typing import List, Tuple

import faiss
import numpy as np
import tropy
from dotenv import load_dotenv
from openai import OpenAI
from PIL import Image

In [2]:
# Configuration
# load_dotenv() runs first so that OPENAI_API_KEY can be supplied through the
# environment before the client is constructed.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [3]:
# Initialize a vector database (using FAISS for simplicity).
# DIMENSION is the length of every vector stored in the index: process_images()
# rejects embeddings whose length differs, so it has to match the output size
# of the model used by generate_embedding().
DIMENSION = 3072  # Embedding dimension (ensure this matches your embedding model)
index = faiss.IndexFlatL2(DIMENSION)

In [4]:
# Maps an integer key to the metadata of one indexed photo: a dict with the
# keys "file_path", "caption" and "item_id" (the Tropy item ID).
# The key is meant to equal the vector's row number in `index`, because
# search_images() looks results up by the row numbers FAISS returns.
metadata_store = {}

In [5]:
# Function to process an image and generate a caption
def generate_caption(image_path: str, max_tokens: int = 150) -> str:
    """
    Generate a caption for an image using OpenAI's API.

    Files larger than 500KB are downscaled to fit within 800x800 pixels and
    re-encoded as JPEG in memory; smaller files are sent unmodified.

    Args:
        image_path: Path of the image file to describe.
        max_tokens: Upper bound on the length of the generated caption. The
            default of 150 can cut a description off mid-sentence; raise it
            for complete captions.

    Returns:
        The caption text with leading and trailing whitespace removed.

    Raises:
        ValueError: If the model response contains no caption text.
    """
    max_size = 500 * 1024  # 500KB

    if os.path.getsize(image_path) > max_size:
        # Resize in memory instead of going through a fixed-name temp file in
        # the working directory, which could clobber (and then delete) an
        # unrelated file of the same name.
        with Image.open(image_path) as img:
            img.thumbnail((800, 800))  # Resize to fit within 800x800 pixels
            # JPEG cannot store alpha or palette modes; normalise anything
            # other than plain RGB / greyscale before encoding.
            encodable = img if img.mode in ("RGB", "L") else img.convert("RGB")
            buffer = io.BytesIO()
            encodable.save(buffer, "JPEG")
        image_bytes = buffer.getvalue()
        mime_type = "image/jpeg"
    else:
        with open(image_path, "rb") as image_file:
            image_bytes = image_file.read()
        # Label the payload with the file's real type rather than always
        # claiming JPEG; fall back to JPEG when the extension is unknown.
        mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"

    img_b64_str = base64.b64encode(image_bytes).decode("utf-8")

    prompt = "Please provide a detailed plain text description of what this photograph portrays. Also provide a description of its composition as a photograph."

    # Construct the payload: one user message carrying the prompt text and the
    # image as a base64 data URL.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{img_b64_str}"
                        },
                    },
                ],
            }
        ],
        max_tokens=max_tokens,
    )

    content = response.choices[0].message.content
    if content is None:
        # Fail with a clear message instead of an AttributeError on .strip().
        raise ValueError(f"No caption text returned for {image_path}")

    return content.strip()

In [6]:
# Function to generate embeddings for a text
def generate_embedding(text: str) -> np.ndarray:
    """
    Embed a piece of text with OpenAI's "text-embedding-3-large" model.

    Returns the embedding of the first (and only) input as a NumPy array.
    """
    result = client.embeddings.create(
        model="text-embedding-3-large",
        input=text,
    )
    first_item = result.data[0]
    return np.array(first_item.embedding)


In [7]:
# Retrieve all items from Tropy. process_images() iterates over this list and
# passes each entry to tropy.get_item_photos(); the printed output shows the
# values returned for this project.
items = tropy.get_all_items()
print(items)

[1018, 1020, 1022, 1024, 1026, 1028, 1030, 1032, 1034, 1036, 1038, 1040, 1042, 1044, 1046, 1048, 1050, 1052, 1054, 1056, 1058, 1060, 1062, 1064, 1066, 1068, 1070, 1072, 1074, 1076, 1078, 1080, 1082, 1084, 1086, 1088, 1090, 1092, 1094, 1097]


In [8]:
def process_images():
    """
    Process images, generate captions and embeddings, and store them in the vector database.

    For every entry in `items`, the first photo is captioned, the caption is
    embedded, the vector is appended to `index`, and the photo's metadata is
    recorded in `metadata_store` under the vector's row number in the index.

    Items without photos, with an unexpected photo format, or whose file is
    missing are reported and skipped. Any error raised while handling one item
    is printed and does not stop the remaining items from being processed.
    """
    for item in items:
        try:
            photos = tropy.get_item_photos(item)
            if not photos:
                print(f"No photos found for item {item}")
                continue

            # Ensure photos is a list so that indexing below is well defined
            if not isinstance(photos, list):
                print(f"Unexpected format for photos in item {item}: {photos}")
                continue

            # Get the first photo -- TODO: handle multi-image / multi-page items
            image_path = photos[0]

            if not os.path.isfile(image_path):
                print(f"File not found: {image_path}")
                continue

            # Generate a caption, flattened to a single line
            caption = generate_caption(image_path).replace("\n", " ")

            # Generate an embedding as a 1 x N float32 matrix
            embedding = generate_embedding(caption)
            embedding_array = np.array([embedding], dtype=np.float32)

            # Validate embedding before adding
            if embedding_array.shape[1] != DIMENSION:
                raise ValueError(f"Embedding dimension mismatch. Expected {DIMENSION}, got {embedding_array.shape[1]}")

            # The new vector lands at row `index.ntotal`. Keying the metadata
            # by that row number (rather than by len(metadata_store)) keeps
            # the two stores aligned even if the index already holds vectors
            # or the dict was reset on its own.
            row_id = index.ntotal
            index.add(embedding_array)

            # Store metadata
            metadata_store[row_id] = {
                "file_path": image_path,  # we probably don't need this
                "caption": caption,
                "item_id": item,  # Tropy item ID for traceability back to the application
            }
        except ValueError as ve:
            print(f"ValueError while processing item {item}: {ve}")
        except Exception as e:
            print(f"Error processing item {item}: {e}")


In [9]:
# Function to search for images
def search_images(query: str, top_k: int = 5) -> List[Tuple[str, str, str]]:
    """
    Search for images using a natural language query.

    The query is embedded with generate_embedding() and compared against the
    caption vectors stored in `index`.

    Args:
        query: Free-text search string.
        top_k: Maximum number of results to return.

    Returns:
        A list of (file_path, caption, item_id) tuples, closest match first.
        The list is empty when the index holds no vectors or top_k is not
        positive, and may be shorter than top_k when fewer vectors exist.
    """
    # Nothing to search or nothing requested: avoid a needless embedding call.
    if top_k <= 0 or index.ntotal == 0:
        return []

    query_embedding = generate_embedding(query)
    query_array = np.array([query_embedding], dtype=np.float32)

    # Never ask for more neighbours than there are vectors, so the result is
    # not padded with placeholder ids.
    k = min(top_k, index.ntotal)
    _distances, indices = index.search(query_array, k)

    results = []
    for idx in indices[0]:
        # Convert the NumPy integer to a plain int for the dict lookup; rows
        # without metadata are skipped, as before.
        entry = metadata_store.get(int(idx))
        if entry is None:
            continue
        results.append((entry["file_path"], entry["caption"], entry["item_id"]))

    return results

In [10]:
# Let's go! Caption, embed and index the first photo of every Tropy item.
# NOTE: each item triggers one caption request and one embedding request, and
# running this cell again appends the same vectors to the index a second time.
process_images()

In [11]:
# Perform a search: the query text is embedded and matched against the stored
# caption embeddings (search_images defaults to top_k=5).
query = "Aerospace"
results = search_images(query)

In [12]:
# Display results: one line per hit with the photo path, the generated
# caption, and the Tropy item ID.
for file_path, caption, item_id in results:
    print(f"Image: {file_path}, Caption: {caption}, Item ID: {item_id}")

Image: /Users/stakats/Documents/Mason/CHNM/Tropy/Training/Tropy-HAEU/ELDO-4592_03.jpg, Caption: The photograph portrays a black and white image of a rocket on a launch pad. The rocket is vertically positioned and dominates the central portion of the image. It features a sleek, cylindrical design with distinct patterns and markings visible along its body. Near the top, the rocket narrows into a pointed tip, indicating its readiness for launch.  The setting is an open area under a clear sky, which stretches across a significant portion of the frame. The launch pad, visible at the bottom, includes supporting structures and scaffolding necessary for rocket support and maintenance. These structures appear intricate, with an array of beams and platforms.  In terms of composition, the photograph is well-balanced, with the rocket firmly placed in the center, drawing the viewer's eye upward. The, Item ID: 1048
Image: /Users/stakats/Documents/Mason/CHNM/Tropy/Training/Tropy-HAEU/ELDO-4592_01.jpg

In [13]:
# Keep only the Tropy item ID (third field) of each (file_path, caption, item_id) hit
matching_item_ids = [hit_item_id for _hit_path, _hit_caption, hit_item_id in results]

In [14]:
# Tag matching items with the search query; the query string is passed as a
# one-element list of tag names.
# NOTE(review): every returned hit is tagged regardless of its distance to the
# query, so weak matches receive the tag too -- confirm this is intended.
for item in matching_item_ids:
    tropy.tag_item_by_tag_name(item, [query])