# Generate titles, descriptions, and subjects for images in multiple steps while keeping a human in the loop.

## Make sure the virtual environment is active
On macOS:
`source venv/bin/activate`

In [1]:
import os
import glob
from PIL import Image
from tqdm import tqdm
import ipywidgets as widgets
import pandas as pd

import os
import json
from PIL import Image
import base64
from datetime import datetime
import yaml


from io import BytesIO
import base64
import glob


from IPython.display import display, JSON

import re
from collections import Counter


from IPython.display import display
import torch
import torchvision
from transformers import (
    AutoProcessor,
    Qwen2_5_VLForConditionalGeneration,
)

# Set some path strings for later use (both are relative to the notebook's working directory)
img_base = "img/"
output_base = "output/"

In [2]:
# convert all images to standard jpg format

def pre_process_images(input_dir, output_dir=None, target_width=1024, quality=85):
    """
    Convert every supported image under input_dir (and subfolders) to a fixed-width JPG.

    The height is scaled to keep the aspect ratio. All JPGs are written flat into
    output_dir (sub-folder structure is NOT reproduced), so two source files that
    share a base name overwrite each other; a warning is printed when that happens.
    A manifest named ``compressed_files_list.csv`` is written to output_dir with one
    row per successfully converted file. Files that fail to convert are reported on
    stdout and left out of the manifest so later steps only see usable images.

    Args:
        input_dir (str): Directory containing image files (searched recursively).
            Files with a .tif, .tiff, .jpg, .jpeg or .png extension (any case) are used.
        output_dir (str, optional): Directory to save the JPGs and the manifest.
            If None, a sibling directory named ``<input_dir>_jpg`` is used.
        target_width (int): Desired width of the JPGs (default: 1024 pixels).
        quality (int): JPG quality (0-100, default: 85 for good balance of size and quality).

    Returns:
        pandas.DataFrame: The manifest that was written, with columns
        ``original_file_name``, ``relative_path`` (path of the source file as it was
        found, including input_dir), ``compressed_file_path`` and ``key`` (the file
        name without its extension).

    Raises:
        ValueError: If input_dir is not an existing directory.
    """
    # Ensure input directory exists
    if not os.path.isdir(input_dir):
        raise ValueError(f"Input directory does not exist: {input_dir}")

    if not output_dir:
        # normpath drops a trailing separator so the default is a sibling of
        # input_dir rather than a "_jpg" folder nested inside it.
        output_dir = os.path.normpath(input_dir) + "_jpg"
    os.makedirs(output_dir, exist_ok=True)

    # Find all image files recursively; match on the real extension so that
    # ".jpeg" is included and look-alike extensions are not.
    supported_exts = {".tif", ".tiff", ".jpg", ".jpeg", ".png"}
    file_set = sorted(
        path
        for path in glob.glob(os.path.join(input_dir, "**", "*"), recursive=True)
        if os.path.isfile(path) and os.path.splitext(path)[1].lower() in supported_exts
    )
    print(f"Found {len(file_set)} files.")

    results = []
    failures = []
    written_paths = set()

    for file_path in tqdm(file_set, desc="Preprocessing images", unit="file"):
        original_file_name = os.path.basename(file_path)
        base_name, _ = os.path.splitext(original_file_name)
        jpg_path = os.path.join(output_dir, base_name + ".jpg")

        try:
            with Image.open(file_path) as img:
                # JPEG has no alpha/palette support, so normalise the mode first
                rgb_img = img.convert("RGB")

                # Calculate proportional height (at least 1px for extremely wide images)
                original_width, original_height = rgb_img.size
                aspect_ratio = original_height / original_width
                target_height = max(1, int(target_width * aspect_ratio))

                resized = rgb_img.resize((target_width, target_height), Image.LANCZOS)
                resized.save(jpg_path, format="JPEG", quality=quality)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch
            print(f"Error converting {file_path}: {e}")
            failures.append((file_path, str(e)))
            continue

        if jpg_path in written_paths:
            print(f"Warning: {file_path} overwrote an earlier image with the same base name: {jpg_path}")
        written_paths.add(jpg_path)

        results.append((original_file_name, file_path, jpg_path, base_name))

    # Create a pandas DataFrame; every row has exactly these four fields
    columns = ["original_file_name", "relative_path", "compressed_file_path", "key"]
    df = pd.DataFrame(results, columns=columns)

    output_filename = os.path.join(output_dir, "compressed_files_list.csv")

    # save to CSV
    df.to_csv(output_filename, index=False)

    if failures:
        print(f"{len(failures)} of {len(file_set)} files could not be converted and were left out of {output_filename}.")

    return df


# Folder picker for the pre-processing step.
# Offer every sub-directory of img_base except the generated '*_jpgs' output folders.
folder_options = sorted(
    entry
    for entry in os.listdir(img_base)
    if not entry.endswith('_jpgs') and os.path.isdir(os.path.join(img_base, entry))
)

folder_widget = widgets.Dropdown(
    options=folder_options,
    description='Select image folder:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='70%'),
)
display(folder_widget)

Dropdown(description='Select image folder:', layout=Layout(width='70%'), options=('campus_scenes',), style=Des…

In [3]:
# Convert the chosen folder; the JPGs are written to a sibling "<folder>_jpgs/" directory
selected_folder = os.path.join(img_base, folder_widget.value)
jpg_output_dir = f"{img_base}{folder_widget.value}_jpgs/"
pre_process_images(selected_folder, output_dir=jpg_output_dir)

Found 3 files.


Preprocessing images: 100%|██████████| 3/3 [00:00<00:00,  3.38file/s]


In [3]:


# Project folder (relative to img_base) that holds the converted JPGs to be processed
project_folder = f"{folder_widget.value}_jpgs/"

# Persist the folder name in a .env file so the bash scripts can pick it up
env_line = f'PROJECT_FOLDER="{project_folder}"\n'
with open("project_folder.env", "w") as env_file:
    env_file.write(env_line)



## spinup
Create the image and reconciliation services for OpenRefine (this also verifies that the venv is active, but that is no longer necessary)

    bash scripts/spinup.sh



In [5]:
# Report whether PyTorch's MPS (Apple silicon) backend is available; the next cell
# branches on the same check to choose between the MLX and transformers models.
print(torch.backends.mps.is_available())

True


In [6]:

# Choose the inference backend: MLX on Apple ARM when MPS is available,
# otherwise the Hugging Face transformers model on CUDA or CPU.
# NOTE(review): generate, load_image and apply_chat_template are only imported in the
# MLX branch, yet later cells call them unconditionally - confirm the non-MPS path works.
if torch.backends.mps.is_available():
    from mlx_vlm import generate, load
    from mlx_vlm.prompt_utils import apply_chat_template
    from mlx_vlm.utils import load_config, load_image

    model_id = "mlx-community/Qwen2.5-VL-7B-Instruct-4bit"
    torch.set_default_device("mps:0")
    device = "mps:0"

    model, processor = load(model_id)
    config = model.config
else:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_id = "Qwen/Qwen2.5-VL-7B-Instruct"

    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id, torch_dtype="auto", device_map="auto"
    )
    processor = AutoProcessor.from_pretrained(model_id)
    config = None  # only the MLX chat-template helper needs a model config


Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


## Prompt Snippets

In [5]:
# System message shared by every request.
system_prompt = "You are an archivist librarian tasked with generating metadata for image sets."

# Step 1 prompt. This string is sent to the model verbatim (it is never passed through
# str.format), so its JSON examples use single braces. The two templates below ARE
# formatted before use, which is why their literal braces are doubled.
title_only_query = (
    "Analyze the provided image and generate a title caption of 50 words or less. "
    "Return a valid JSON string with the following fields: "
    "{\"title\": \"title caption you generated\"}. "
    "Ensure the output is a clean JSON string with double quotes around keys and string values. "
    "Do not wrap the JSON in backticks, markdown, or any extra text. "
    "Return only the JSON string, e.g., {\"title\": \"example title caption\"}."
)

# Step 2 template. Placeholder: {title}.
description_only_query = (
    "Analyze the provided image with title \"{title}\" generate a description of 500 words or less. "
    "Return a valid JSON string with the following fields: "
    "{{\"title\": \"{title}\", \"description\": \"example description\"}}. "
    "Ensure the output is a clean JSON string with double quotes around keys and string values. "
    "Do not wrap the JSON in backticks, markdown, or any extra text. "
    "Return only the JSON string, e.g., {{\"title\": \"{title}\", \"description\": \"example description\"}}."
)

# Step 3 template. Placeholders: {title} and {description}.
subjects_only_query = (
    "Analyze the provided image with title \"{title}\" and description \"{description}\" to generate LCSH compliant subject headings. "
    "Ensure the subjects are valid LCSH terms from the Library of Congress Authorities (http://authorities.loc.gov) or LC Linked Data Service (http://id.loc.gov). "
    "Use only standardized LCSH terms, avoiding generic or invented terms. "
    "Follow these guidelines: "
    "1. Use standardized LCSH terms "
    "2. Avoid invented or colloquial terms "
    "3. Use hierarchical terms with qualifiers when appropriate (e.g., \"part 1--part 2--YYYY-\"). "
    "4. Base subjects on the key concepts in the title and description, prioritizing semantic relevance. "
    "5. Limit the number of subjects to 10 or fewer, separated by '|'. "
    "Ensure the output is a clean JSON string with double quotes around keys and string values. "
    "Do not wrap the JSON in backticks, markdown, or any extra text. "
    "Return only the JSON string, e.g., {{\"subjects\": \"subject1|subject2|subject3|etc.\"}}. "
)



## Prompt Functions

In [7]:

def resolve_template(template, **kwargs):
    """Return ``template`` with its ``{name}`` placeholders filled from the keyword arguments.

    Uses ``str.format`` semantics, so literal braces in the template must be doubled.
    """
    rendered = template.format(**kwargs)
    return rendered

def get_prompt(query, processor=processor, config=config, system_prompt=system_prompt, template_kwargs=None):
    """
    Build the chat prompt (system message + user message) that is handed to the model.

    Note that the defaults for processor, config and system_prompt are bound when
    this function is defined; re-run this cell after changing any of those globals.

    Args:
        query (str): The user query, or a template string with {placeholders}.
        processor: The processor for the model.
        config: The model config.
        system_prompt (str): The system prompt string.
        template_kwargs (dict, optional): Values for the placeholders in query.
            When missing or empty, query is used verbatim and is NOT formatted.

    Returns:
        The prompt produced by apply_chat_template for the two messages.
    """
    # Only format when there is something to substitute; otherwise send the query as-is
    user_content = resolve_template(query, **template_kwargs) if template_kwargs else query

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]
    return apply_chat_template(processor, config, messages)


def process_path(path):
    """
    Turn a file path into the URI under which the local image service exposes the file.

    Only the file name is kept (the directory part is dropped) and it is
    percent-encoded, so names containing spaces, '#', '?' or non-ASCII characters
    still yield a valid URI.

    Args:
        path (str): The file path to process.

    Returns:
        str: "http://localhost:8000/<encoded file name>", or an empty string when
        path is not a non-empty string (e.g. NaN read back from a blank CSV cell).
    """
    # Local import keeps this cell self-contained; urllib is part of the standard library
    from urllib.parse import quote

    # A missing value must not abort a whole DataFrame.map over the path column
    if not isinstance(path, str) or not path:
        return ""

    return "http://localhost:8000/" + quote(os.path.basename(path))

  

In [8]:
# Create the input selector: list every manifest ("...list.csv") in the project folder
manifest_names = sorted(
    name for name in os.listdir(img_base + project_folder) if name.endswith("list.csv")
)
input_file_widget = widgets.Dropdown(
    options=manifest_names,
    description='Input file:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='70%'),
)
display(input_file_widget)

Dropdown(description='Input file:', layout=Layout(width='70%'), options=('compressed_files_list.csv',), style=…


## Generate Titles



In [9]:
# MULTI-IMAGE step 1: generate a title for every image listed in the selected manifest
TEMP = 0

input_file = img_base+project_folder+input_file_widget.value
df = pd.read_csv(input_file)

print(f"Found {len(df)} images.")

# Add a new column for the LLM-generated title
df["title"] = ""

# Iterate by index label so df.at always writes to the row the path came from
for idx, item in tqdm(df["compressed_file_path"].items(), total=len(df), desc="Processing images", unit="image"):
    # Reset per image so the error handler never reports a previous image's response
    output = None
    try:
        img = load_image(item)
        prompt = get_prompt(title_only_query)

        # Invoke the LLM
        output = generate(
            model,
            processor,
            prompt,
            img,
            max_tokens=1500,
            temperature=TEMP,
            verbose=False
        )
        json_item = json.loads(output)
        df.at[idx, "title"] = json_item.get("title", "")

    except Exception as e:
        # Keep going: record the failure in the row so a human can review it later
        print(f"Error processing {item}: {e}")
        if output is not None:
            # Show what the model actually returned (e.g. JSON that failed to parse)
            print(f"Raw model output: {output!r}")
        df.at[idx, "title"] = f"Error: {str(e)}"


# Build the output file name: <ISO 8601 timestamp without ":" or fractions>_<model>_<temperature>
timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
temp_str = str(TEMP).replace(".", "-")  # Replace "." with "-" for filename compatibility
# Replace "/" with "-" for filename compatibility and drop the MLX community prefix
llm_model_short = model_id.replace('/','-').replace("mlx-community-", "")
output_filename = img_base + project_folder + f"{timestamp}_{llm_model_short}_{temp_str}__results_mlx_Step1.csv"

# Add the image URI column, then save to CSV
df["image"] =  df["compressed_file_path"].map(process_path)
df.to_csv(output_filename, index=False)



Found 3 images.


Processing images: 100%|██████████| 3/3 [00:46<00:00, 15.41s/image]


## Generate Descriptions

In [11]:
# Step 2 widget: choose which Step 1 result file to continue from
step1_result_names = sorted(
    name for name in os.listdir(img_base + project_folder) if name.endswith("Step1.csv")
)
input_file_widget_step2 = widgets.Dropdown(
    options=step1_result_names,
    description='Step 2 Input:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='70%'),
)
display(input_file_widget_step2)

Dropdown(description='Step 2 Input:', layout=Layout(width='70%'), options=('2025-10-01T11-13-42_Qwen2.5-VL-7B-…

In [12]:
# MULTI-IMAGE step 2: generate a description for every image, guided by its (reviewed) title
TEMP = 0
input_file = img_base+project_folder+input_file_widget_step2.value


df = pd.read_csv(input_file)


#df = df.head(2)
# Add a new column for the LLM-generated description
df["description"] = ""

for idx, item in tqdm(df.iterrows(), total=len(df), desc="Processing images", unit="image"):
    try:
        img = load_image(item["compressed_file_path"])
        # A blank title is read back from CSV as NaN; send an empty string, not "nan"
        title = "" if pd.isna(item["title"]) else item["title"]
        prompt = get_prompt(description_only_query, template_kwargs={"title": title})

        # Invoke the LLM
        output = generate(
            model,
            processor,
            prompt,
            img,
            max_tokens=1500,
            temperature=TEMP,
            verbose=False
        )
        json_item = json.loads(output)
        df.at[idx, "description"] = json_item.get("description", "")
    except Exception as e:
        print(f"Error processing {item}: {e}")
        # Record the failure in this step's own column; the title from step 1 must survive
        df.at[idx, "description"] = f"Error: {str(e)}"


# Build the output file name: <ISO 8601 timestamp without ":" or fractions>_<model>_<temperature>
timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
temp_str = str(TEMP).replace(".", "-")  # Replace "." with "-" for filename compatibility
# Replace "/" with "-" for filename compatibility and drop the MLX community prefix
llm_model_short = model_id.replace('/','-').replace("mlx-community-", "")
output_filename = img_base + project_folder + f"{timestamp}_{llm_model_short}_{temp_str}__results_mlx_Step2.csv"

# save to CSV
df.to_csv(output_filename, index=False)



Processing images: 100%|██████████| 3/3 [01:08<00:00, 22.72s/image]


## Generate Subjects - no recommendations

In [13]:
# Step 3 widget: choose which Step 2 result file to continue from
step2_result_names = sorted(
    name for name in os.listdir(img_base + project_folder) if name.endswith("Step2.csv")
)
input_file_widget_step3 = widgets.Dropdown(
    options=step2_result_names,
    description='Step 3 Input:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='70%'),
)
display(input_file_widget_step3)

Dropdown(description='Step 3 Input:', layout=Layout(width='70%'), options=('2025-10-06T09-23-37_Qwen2.5-VL-7B-…

In [14]:
# MULTI-IMAGE step 3 option 1: generate LCSH subject headings from the image, title and description
TEMP = 0

input_file = img_base+project_folder+input_file_widget_step3.value
# read in input
df = pd.read_csv(input_file)

# limit process for testing
#df = df.head(15)

# Add a new column for the LLM-generated subjects (created up front, as in steps 1 and 2,
# so the column exists in the output even when there are no rows to process)
df["subjects"] = ""

for idx, item in tqdm(df.iterrows(), total=len(df), desc="Processing images", unit="image"):
    try:
        img = load_image(item["compressed_file_path"])
        # Blank cells are read back from CSV as NaN; send empty strings, not "nan"
        title = "" if pd.isna(item["title"]) else item["title"]
        description = "" if pd.isna(item["description"]) else item["description"]
        prompt = get_prompt(subjects_only_query, template_kwargs={"title": title, "description": description})

        # Invoke the LLM
        output = generate(
            model,
            processor,
            prompt,
            img,
            max_tokens=1500,
            temperature=TEMP,
            verbose=False
        )
        json_item = json.loads(output)
        df.at[idx, "subjects"] = json_item.get("subjects", "")
    except Exception as e:
        # Keep going: record the failure in the row so a human can review it later
        print(f"Error processing {item}: {e}")
        df.at[idx, "subjects"] = f"Error: {str(e)}"


# Build the output file name: <ISO 8601 timestamp without ":" or fractions>_<model>_<temperature>
timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
temp_str = str(TEMP).replace(".", "-")  # Replace "." with "-" for filename compatibility
# Replace "/" with "-" for filename compatibility and drop the MLX community prefix
llm_model_short = model_id.replace('/','-').replace("mlx-community-", "")
output_filename = img_base + project_folder + f"{timestamp}_{llm_model_short}_{temp_str}__results_mlx_Step3.csv"

# save to CSV
df.to_csv(output_filename, index=False)



Processing images: 100%|██████████| 3/3 [01:09<00:00, 23.32s/image]
