In [1]:
# Install required packages for multimodal image + OCR text embedding
%pip install pillow chromadb tqdm psutil numpy pytesseract

# Note: ChromaDB's OpenCLIPEmbeddingFunction will automatically handle OpenCLIP dependencies
print("📦 Installing packages for multimodal indexing (image + OCR text)...")
print("🎨 OpenCLIP for image embeddings + 📝 Tesseract for OCR text extraction")


Note: you may need to restart the kernel to use updated packages.
📦 Installing packages for multimodal indexing (image + OCR text)...
🎨 OpenCLIP for image embeddings + 📝 Tesseract for OCR text extraction


In [2]:
import os
import glob
from pathlib import Path
from PIL import Image
import numpy as np
import chromadb
from chromadb.utils.embedding_functions.open_clip_embedding_function import OpenCLIPEmbeddingFunction
from tqdm import tqdm
import hashlib
import json
from datetime import datetime
import concurrent.futures
from multiprocessing import cpu_count
import psutil
import pytesseract

# Fix tokenizers parallelism warning when using with pytesseract
os.environ["TOKENIZERS_PARALLELISM"] = "false"

print("✅ Imports loaded successfully!")
print(f"💻 System: {cpu_count()} CPU cores, {psutil.virtual_memory().total // (1024**3):.1f}GB RAM")
print("🎨 OpenCLIP + 📝 OCR integration ready for multimodal indexing!")


✅ Imports loaded successfully!
💻 System: 14 CPU cores, 48.0GB RAM
🎨 OpenCLIP + 📝 OCR integration ready for multimodal indexing!


In [3]:
# Configuration for DUAL indexing (separate image and text collections)
DOCS_PATH = "../output/docs-sm_samples"  # Fixed path relative to notebooks folder
CHROMA_DB_PATH = "../chroma_db_dual"  # Store ChromaDB for dual embeddings
IMAGE_COLLECTION_NAME = "smartdoc_images"
TEXT_COLLECTION_NAME = "smartdoc_texts"

# Supported image extensions
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.tiff', '.bmp'}

print(f"Documents path: {DOCS_PATH}")
print(f"ChromaDB path: {CHROMA_DB_PATH}")
print(f"Image collection: {IMAGE_COLLECTION_NAME}")
print(f"Text collection: {TEXT_COLLECTION_NAME}")
print(f"Supported formats: {', '.join(IMAGE_EXTENSIONS)}")
print("🎯 Mode: DUAL indexing (separate OpenCLIP images + SentenceTransformer text)")


Documents path: ../output/docs-sm_samples
ChromaDB path: ../chroma_db_dual
Image collection: smartdoc_images
Text collection: smartdoc_texts
Supported formats: .bmp, .jpg, .tiff, .jpeg, .png
🎯 Mode: DUAL indexing (separate OpenCLIP images + SentenceTransformer text)


In [4]:
def get_all_image_files(base_path):
    """
    Recursively find all image files in the document folders.
    
    Args:
        base_path (str): Base path to search for image files
        
    Returns:
        list: List of tuples (file_path, document_type)
    """
    image_files = []
    base_path = Path(base_path)
    
    if not base_path.exists():
        print(f"Warning: Path {base_path} does not exist!")
        return image_files
    
    # Iterate through document type folders
    for doc_type_folder in base_path.iterdir():
        if doc_type_folder.is_dir():
            doc_type = doc_type_folder.name
            print(f"Scanning folder: {doc_type}")
            
            # Find all image files in this folder
            for file_path in doc_type_folder.iterdir():
                if file_path.is_file() and file_path.suffix.lower() in IMAGE_EXTENSIONS:
                    image_files.append((str(file_path), doc_type))
    
    print(f"Found {len(image_files)} image files total")
    return image_files

# Get all image files
image_files = get_all_image_files(DOCS_PATH)

# Show some statistics
doc_types = {}
for _, doc_type in image_files:
    doc_types[doc_type] = doc_types.get(doc_type, 0) + 1

print("\nDocument type distribution:")
for doc_type, count in sorted(doc_types.items()):
    print(f"  {doc_type}: {count} files")


Scanning folder: form
Scanning folder: news_article
Scanning folder: handwritten
Scanning folder: resume
Scanning folder: letter
Scanning folder: specification
Scanning folder: questionnaire
Scanning folder: memo
Scanning folder: scientific_report
Scanning folder: scientific_publication
Scanning folder: file_folder
Scanning folder: advertisement
Scanning folder: presentation
Scanning folder: email
Scanning folder: invoice
Scanning folder: budget
Found 3494 image files total

Document type distribution:
  advertisement: 229 files
  budget: 247 files
  email: 203 files
  file_folder: 218 files
  form: 229 files
  handwritten: 226 files
  invoice: 211 files
  letter: 222 files
  memo: 219 files
  news_article: 187 files
  presentation: 223 files
  questionnaire: 217 files
  resume: 217 files
  scientific_publication: 219 files
  scientific_report: 217 files
  specification: 210 files


In [5]:
# Initialize ChromaDB with DUAL indexing approach
print("🎨 Setting up ChromaDB with DUAL indexing approach...")
print("📝 Creating separate collections for IMAGE and TEXT embeddings!")

# Import required embedding functions
from chromadb.utils.embedding_functions.sentence_transformer_embedding_function import SentenceTransformerEmbeddingFunction

# Create separate embedding functions
image_ef = OpenCLIPEmbeddingFunction()
text_ef = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

print("✅ OpenCLIP embedding function initialized for image content")
print("✅ SentenceTransformer embedding function initialized for text content")

# Create ChromaDB client with persistent storage
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

# Create or get IMAGE collection
try:
    image_collection = client.get_collection(name=IMAGE_COLLECTION_NAME)
    print(f"✅ Loaded existing IMAGE collection '{IMAGE_COLLECTION_NAME}' with {image_collection.count()} documents")
except:
    image_collection = client.create_collection(
        name=IMAGE_COLLECTION_NAME,
        embedding_function=image_ef,
        metadata={"description": "SmartDoc image embeddings using OpenCLIP"}
    )
    print(f"✅ Created new IMAGE collection '{IMAGE_COLLECTION_NAME}'")

# Create or get TEXT collection  
try:
    text_collection = client.get_collection(name=TEXT_COLLECTION_NAME)
    print(f"✅ Loaded existing TEXT collection '{TEXT_COLLECTION_NAME}' with {text_collection.count()} documents")
except:
    text_collection = client.create_collection(
        name=TEXT_COLLECTION_NAME,
        embedding_function=text_ef,
        metadata={"description": "SmartDoc text embeddings using SentenceTransformer"}
    )
    print(f"✅ Created new TEXT collection '{TEXT_COLLECTION_NAME}'")

print(f"📁 ChromaDB storage location: {os.path.abspath(CHROMA_DB_PATH)}")
print("🎯 Dual indexing collections ready for separate image and text embeddings!")


🎨 Setting up ChromaDB with DUAL indexing approach...
📝 Creating separate collections for IMAGE and TEXT embeddings!


  from .autonotebook import tqdm as notebook_tqdm


✅ OpenCLIP embedding function initialized for image content
✅ SentenceTransformer embedding function initialized for text content
✅ Created new IMAGE collection 'smartdoc_images'
✅ Created new TEXT collection 'smartdoc_texts'
📁 ChromaDB storage location: /Users/ivan/Workspace/agentai-document-data-extractor/smartdoc/chroma_db_dual
🎯 Dual indexing collections ready for separate image and text embeddings!


In [6]:
def create_document_id(file_path):
    """Create a unique document ID based on file path."""
    return hashlib.md5(file_path.encode()).hexdigest()

def extract_text_from_image(image_path):
    """
    Extract text from an image using OCR (pytesseract).
    Based on the approach from 02_document_indexing.ipynb
    
    Args:
        image_path (str): Path to the image file
        
    Returns:
        str: Extracted text
    """
    try:
        # Open and process the image
        with Image.open(image_path) as img:
            # Convert to RGB if necessary
            if img.mode != 'RGB':
                img = img.convert('RGB')
            
            # Extract text using pytesseract
            text = pytesseract.image_to_string(img, lang='eng')
            
            # Clean up the text
            text = text.strip()
            # Remove excessive whitespace
            text = ' '.join(text.split())
            
            return text
    except Exception as e:
        print(f"⚠️  Error extracting text from {os.path.basename(image_path)}: {str(e)}")
        return ""

def load_image_as_array(image_path):
    """
    Load an image and convert it to a numpy array format expected by ChromaDB.
    
    Args:
        image_path (str): Path to the image file
        
    Returns:
        numpy.ndarray: Image as numpy array, or None if failed
    """
    try:
        with Image.open(image_path) as img:
            # Convert to RGB if necessary
            if img.mode != 'RGB':
                img = img.convert('RGB')
            
            # Convert PIL Image to numpy array
            image_array = np.array(img)
            return image_array
    except Exception as e:
        print(f"⚠️  Error loading image {os.path.basename(image_path)}: {str(e)}")
        return None

def process_image_and_text(image_path):
    """
    Process an image to extract both the image array and OCR text.
    This combines visual and textual content for multimodal embedding.
    
    Args:
        image_path (str): Path to the image file
        
    Returns:
        tuple: (is_valid: bool, image_array: numpy.ndarray or None, extracted_text: str)
    """
    # Load image
    image_array = load_image_as_array(image_path)
    if image_array is None:
        return False, None, ""
    
    # Extract text using OCR
    extracted_text = extract_text_from_image(image_path)
    
    return True, image_array, extracted_text

def process_dual_indexing_batch(image_files, batch_size=25, max_workers=None):
    """
    Process images in batches with DUAL indexing approach.
    Creates separate embeddings for images (OpenCLIP) and text (SentenceTransformer).
    
    Args:
        image_files (list): List of (file_path, doc_type) tuples
        batch_size (int): Number of images to process at once
        max_workers (int): Number of parallel processing workers (None = auto-detect)
    """
    # Auto-detect optimal worker count
    if max_workers is None:
        max_workers = min(cpu_count(), 6)  # Conservative for stability
    
    total_files = len(image_files)
    processed_count = 0
    successful_image_count = 0
    successful_text_count = 0
    failed_count = 0
    
    print(f"🚀 Starting DUAL indexing of {total_files} documents")
    print(f"📊 Configuration: Batch size: {batch_size}, Processing workers: {max_workers}")
    print(f"🎨 Using OpenCLIP for IMAGE embeddings + SentenceTransformer for TEXT embeddings")
    print(f"📝 Creating separate collections for visual and textual understanding!")
    
    for i in tqdm(range(0, total_files, batch_size), desc="Processing dual indexing batches"):
        batch_files = image_files[i:i + batch_size]
        
        # Step 1: Parallel image and OCR processing
        print(f"  🔍 Processing batch {i//batch_size + 1}/{(total_files + batch_size - 1)//batch_size} (image + OCR)")
        
        valid_data = []
        
        # Use parallel processing for image loading and OCR extraction
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit processing tasks (image + OCR)
            future_to_file = {
                executor.submit(process_image_and_text, file_path): (file_path, doc_type)
                for file_path, doc_type in batch_files
            }
            
            # Collect valid data
            for future in concurrent.futures.as_completed(future_to_file):
                file_path, doc_type = future_to_file[future]
                processed_count += 1
                
                try:
                    is_valid, image_array, extracted_text = future.result(timeout=60)  # 60 second timeout for OCR
                    
                    if is_valid and image_array is not None:
                        valid_data.append((file_path, doc_type, image_array, extracted_text))
                    else:
                        failed_count += 1
                        
                except concurrent.futures.TimeoutError:
                    print(f"    ⏰ Timeout processing {os.path.basename(file_path)}")
                    failed_count += 1
                except Exception as e:
                    print(f"    ❌ Error processing {os.path.basename(file_path)}: {str(e)}")
                    failed_count += 1
        
        # Step 2: Add to BOTH collections separately
        if valid_data:
            try:
                print(f"  🎨 Adding {len(valid_data)} documents to DUAL collections")
                
                # Prepare data for IMAGE collection
                batch_images = []
                batch_image_metadatas = []
                batch_image_ids = []
                
                # Prepare data for TEXT collection
                batch_texts = []
                batch_text_metadatas = []
                batch_text_ids = []
                
                for file_path, doc_type, image_array, extracted_text in valid_data:
                    doc_id = create_document_id(file_path)
                    
                    # Prepare IMAGE data
                    batch_images.append(image_array)
                    image_metadata = {
                        "document_type": doc_type,  # TRUE document type as metadata
                        "file_path": file_path,
                        "filename": os.path.basename(file_path),
                        "file_extension": Path(file_path).suffix.lower(),
                        "indexed_at": datetime.now().isoformat(),
                        "modality": "image",
                        "has_text": bool(extracted_text)
                    }
                    batch_image_metadatas.append(image_metadata)
                    batch_image_ids.append(f"img_{doc_id}")
                    
                    # Prepare TEXT data (only if we have extracted text)
                    if extracted_text:
                        batch_texts.append(extracted_text)
                        text_metadata = {
                            "document_type": doc_type,  # TRUE document type as metadata
                            "file_path": file_path,
                            "filename": os.path.basename(file_path),
                            "file_extension": Path(file_path).suffix.lower(),
                            "indexed_at": datetime.now().isoformat(),
                            "modality": "text",
                            "text_length": len(extracted_text),
                            "extracted_text": extracted_text
                        }
                        batch_text_metadatas.append(text_metadata)
                        batch_text_ids.append(f"txt_{doc_id}")
                
                # Add to IMAGE collection
                image_collection.add(
                    images=batch_images,
                    metadatas=batch_image_metadatas,
                    ids=batch_image_ids
                )
                successful_image_count += len(batch_images)
                
                # Add to TEXT collection (only if we have text data)
                if batch_texts:
                    text_collection.add(
                        documents=batch_texts,
                        metadatas=batch_text_metadatas,
                        ids=batch_text_ids
                    )
                    successful_text_count += len(batch_texts)
                
                print(f"  ✅ Batch {i//batch_size + 1} completed:")
                print(f"     📸 {len(batch_images)} image embeddings added")
                print(f"     📝 {len(batch_texts)} text embeddings added")
                
            except Exception as e:
                print(f"  ❌ Error processing batch {i//batch_size + 1}: {str(e)}")
                failed_count += len(valid_data)
                continue
        else:
            print(f"  ⚠️  No valid documents in batch {i//batch_size + 1}")
    
    print(f"\n🎉 DUAL indexing complete!")
    print(f"📈 Total files processed: {processed_count}")
    print(f"✅ Image embeddings created: {successful_image_count}")
    print(f"✅ Text embeddings created: {successful_text_count}")
    print(f"❌ Failed: {failed_count}")
    print(f"📚 IMAGE collection now contains: {image_collection.count()} embeddings")
    print(f"📚 TEXT collection now contains: {text_collection.count()} embeddings")
    print(f"🎯 Dual indexing enables confidence-based classification!")

print("🛠️  Multimodal processing functions ready!")
print("📝 Ready to index documents with both visual and textual understanding!")


🛠️  Multimodal processing functions ready!
📝 Ready to index documents with both visual and textual understanding!


In [7]:
# System optimization settings
def get_optimal_settings():
    """Get optimal processing settings based on system resources for multimodal processing."""
    cpu_count_total = cpu_count()
    memory_gb = psutil.virtual_memory().total // (1024**3)
    
    # Conservative settings for multimodal processing (image + OCR)
    # OCR requires more CPU, so slightly smaller batches and workers
    if memory_gb >= 16:
        recommended_workers = min(4, cpu_count_total - 2)  # Reduced for OCR stability
        recommended_batch_size = 15  # Smaller batches for OCR + image arrays
    elif memory_gb >= 8:
        recommended_workers = min(3, cpu_count_total - 1)
        recommended_batch_size = 10
    else:
        recommended_workers = min(2, cpu_count_total - 1)
        recommended_batch_size = 8
    
    print(f"🎯 Recommended settings for multimodal processing:")
    print(f"   CPU cores: {cpu_count_total}")
    print(f"   Memory: {memory_gb}GB")
    print(f"   Recommended workers: {recommended_workers}")
    print(f"   Recommended batch size: {recommended_batch_size}")
    print(f"   📝 Optimized for: Image processing + OCR extraction")
    
    return recommended_workers, recommended_batch_size

def monitor_system_resources():
    """Monitor CPU and memory usage."""
    cpu_percent = psutil.cpu_percent(interval=1)
    memory = psutil.virtual_memory()
    
    print(f"💻 System Status:")
    print(f"   CPU Usage: {cpu_percent:.1f}%")
    print(f"   Memory Usage: {memory.percent:.1f}% ({memory.used // (1024**3):.1f}GB / {memory.total // (1024**3):.1f}GB)")
    
    if cpu_percent > 80:
        print("   ⚠️  High CPU usage - consider reducing max_workers")
    if memory.percent > 85:
        print("   ⚠️  High memory usage - consider reducing batch_size")
    
    return cpu_percent, memory.percent

# Get system recommendations
recommended_workers, recommended_batch_size = get_optimal_settings()
# For testing purposes
recommended_workers = 16
recommended_batch_size = 16 * 10
monitor_system_resources()


🎯 Recommended settings for multimodal processing:
   CPU cores: 14
   Memory: 48GB
   Recommended workers: 4
   Recommended batch size: 15
   📝 Optimized for: Image processing + OCR extraction
💻 System Status:
   CPU Usage: 37.8%
   Memory Usage: 57.2% (24.0GB / 48.0GB)


(37.8, 57.2)

In [8]:
"""
# Process documents with multimodal embeddings (image + OCR text)
print("🚀 Choose your multimodal processing approach:\n")

print("Option A: Test with small subset (recommended for first run)")
print("- Process first 50 documents to test multimodal system")
print("- Combines visual analysis + OCR text extraction")
print("- Uses conservative settings to ensure stability")
print()

print("Option B: Process all documents with optimal settings")
print("- Uses system-optimized settings for full dataset")
print("- Full multimodal indexing (image + OCR text)")
print("- Recommended after testing with Option A")
print()

print("Option C: Custom processing")
print("- Customize batch size and worker count for your needs")
print("- Full control over multimodal processing parameters")
print()

print("💡 About Multimodal Indexing:")
print("- Each document gets embedded using BOTH visual content AND extracted text")
print("- OpenCLIP understands relationships between images and text")
print("- This significantly improves search accuracy and relevance")
print()

# Uncomment ONE of the options below:

# Option A: Test run (recommended first)
print("🔬 Running test with first 50 documents (multimodal)...")
process_multimodal_batch(
    image_files[:50],  # Test with first 50 documents
    batch_size=8,  # Smaller batches for OCR processing
    max_workers=recommended_workers // 2  # Use half recommended workers for safety
)
"""

# Option B: Full processing with optimal settings (uncomment to use)

print("🚀 Processing all documents with multimodal embeddings...")
print("📝 Combining image analysis + OCR text extraction for each document")
import time

start_time = time.time()
initial_cpu, initial_memory = monitor_system_resources()

process_dual_indexing_batch(
    image_files,
    batch_size=recommended_batch_size,
    max_workers=recommended_workers
)

end_time = time.time()
final_cpu, final_memory = monitor_system_resources()

print(f"⏱️  Total processing time: {end_time - start_time:.2f} seconds")
print(f"📊 Average processing speed: {len(image_files) / (end_time - start_time):.2f} documents/second")
print(f"🎯 Each document now has multimodal embeddings for better search accuracy!")


# Option C: Custom processing (uncomment and customize)
"""
CUSTOM_BATCH_SIZE = 10
CUSTOM_MAX_WORKERS = 2
CUSTOM_SUBSET = image_files[:100]  # Process specific range

print(f"🛠️  Running custom multimodal processing...")
print(f"   Batch size: {CUSTOM_BATCH_SIZE}")
print(f"   Workers: {CUSTOM_MAX_WORKERS}")
print(f"   Processing {len(CUSTOM_SUBSET)} documents")

process_multimodal_batch(
    CUSTOM_SUBSET,
    batch_size=CUSTOM_BATCH_SIZE,
    max_workers=CUSTOM_MAX_WORKERS
)
"""


🚀 Processing all documents with multimodal embeddings...
📝 Combining image analysis + OCR text extraction for each document
💻 System Status:
   CPU Usage: 39.5%
   Memory Usage: 57.2% (23.0GB / 48.0GB)
🚀 Starting DUAL indexing of 3494 documents
📊 Configuration: Batch size: 160, Processing workers: 16
🎨 Using OpenCLIP for IMAGE embeddings + SentenceTransformer for TEXT embeddings
📝 Creating separate collections for visual and textual understanding!


Processing dual indexing batches:   0%|          | 0/22 [00:00<?, ?it/s]

  🔍 Processing batch 1/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:   5%|▍         | 1/22 [00:11<04:06, 11.73s/it]

  ✅ Batch 1 completed:
     📸 160 image embeddings added
     📝 160 text embeddings added
  🔍 Processing batch 2/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:   9%|▉         | 2/22 [00:26<04:28, 13.44s/it]

  ✅ Batch 2 completed:
     📸 160 image embeddings added
     📝 157 text embeddings added
  🔍 Processing batch 3/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:  14%|█▎        | 3/22 [00:40<04:17, 13.55s/it]

  ✅ Batch 3 completed:
     📸 160 image embeddings added
     📝 154 text embeddings added
  🔍 Processing batch 4/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:  18%|█▊        | 4/22 [00:51<03:48, 12.70s/it]

  ✅ Batch 4 completed:
     📸 160 image embeddings added
     📝 142 text embeddings added
  🔍 Processing batch 5/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:  23%|██▎       | 5/22 [01:08<04:04, 14.36s/it]

  ✅ Batch 5 completed:
     📸 160 image embeddings added
     📝 159 text embeddings added
  🔍 Processing batch 6/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:  27%|██▋       | 6/22 [01:23<03:53, 14.57s/it]

  ✅ Batch 6 completed:
     📸 160 image embeddings added
     📝 159 text embeddings added
  🔍 Processing batch 7/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:  32%|███▏      | 7/22 [01:36<03:31, 14.11s/it]

  ✅ Batch 7 completed:
     📸 160 image embeddings added
     📝 158 text embeddings added
  🔍 Processing batch 8/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:  36%|███▋      | 8/22 [01:51<03:21, 14.36s/it]

  ✅ Batch 8 completed:
     📸 160 image embeddings added
     📝 160 text embeddings added
  🔍 Processing batch 9/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:  41%|████      | 9/22 [02:06<03:07, 14.41s/it]

  ✅ Batch 9 completed:
     📸 160 image embeddings added
     📝 160 text embeddings added
  🔍 Processing batch 10/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:  45%|████▌     | 10/22 [02:20<02:51, 14.32s/it]

  ✅ Batch 10 completed:
     📸 160 image embeddings added
     📝 158 text embeddings added
  🔍 Processing batch 11/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:  50%|█████     | 11/22 [02:34<02:38, 14.39s/it]

  ✅ Batch 11 completed:
     📸 160 image embeddings added
     📝 157 text embeddings added
  🔍 Processing batch 12/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:  55%|█████▍    | 12/22 [02:49<02:23, 14.32s/it]

  ✅ Batch 12 completed:
     📸 160 image embeddings added
     📝 158 text embeddings added
  🔍 Processing batch 13/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:  59%|█████▉    | 13/22 [03:12<02:34, 17.11s/it]

  ✅ Batch 13 completed:
     📸 160 image embeddings added
     📝 158 text embeddings added
  🔍 Processing batch 14/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:  64%|██████▎   | 14/22 [03:28<02:14, 16.84s/it]

  ✅ Batch 14 completed:
     📸 160 image embeddings added
     📝 141 text embeddings added
  🔍 Processing batch 15/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:  68%|██████▊   | 15/22 [03:39<01:44, 14.98s/it]

  ✅ Batch 15 completed:
     📸 160 image embeddings added
     📝 121 text embeddings added
  🔍 Processing batch 16/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:  73%|███████▎  | 16/22 [03:51<01:23, 13.99s/it]

  ✅ Batch 16 completed:
     📸 160 image embeddings added
     📝 141 text embeddings added
  🔍 Processing batch 17/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:  77%|███████▋  | 17/22 [04:04<01:09, 13.82s/it]

  ✅ Batch 17 completed:
     📸 160 image embeddings added
     📝 155 text embeddings added
  🔍 Processing batch 18/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:  82%|████████▏ | 18/22 [04:17<00:54, 13.66s/it]

  ✅ Batch 18 completed:
     📸 160 image embeddings added
     📝 160 text embeddings added
  🔍 Processing batch 19/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:  86%|████████▋ | 19/22 [04:30<00:40, 13.46s/it]

  ✅ Batch 19 completed:
     📸 160 image embeddings added
     📝 160 text embeddings added
  🔍 Processing batch 20/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:  91%|█████████ | 20/22 [04:44<00:26, 13.49s/it]

  ✅ Batch 20 completed:
     📸 160 image embeddings added
     📝 159 text embeddings added
  🔍 Processing batch 21/22 (image + OCR)
  🎨 Adding 160 documents to DUAL collections


Processing dual indexing batches:  95%|█████████▌| 21/22 [04:58<00:13, 13.53s/it]

  ✅ Batch 21 completed:
     📸 160 image embeddings added
     📝 155 text embeddings added
  🔍 Processing batch 22/22 (image + OCR)
  🎨 Adding 134 documents to DUAL collections


Processing dual indexing batches: 100%|██████████| 22/22 [05:09<00:00, 14.08s/it]

  ✅ Batch 22 completed:
     📸 134 image embeddings added
     📝 134 text embeddings added

🎉 DUAL indexing complete!
📈 Total files processed: 3494
✅ Image embeddings created: 3494
✅ Text embeddings created: 3366
❌ Failed: 0
📚 IMAGE collection now contains: 3494 embeddings
📚 TEXT collection now contains: 3366 embeddings
🎯 Dual indexing enables confidence-based classification!





💻 System Status:
   CPU Usage: 38.2%
   Memory Usage: 57.1% (22.0GB / 48.0GB)
⏱️  Total processing time: 310.86 seconds
📊 Average processing speed: 11.24 documents/second
🎯 Each document now has multimodal embeddings for better search accuracy!


'\nCUSTOM_BATCH_SIZE = 10\nCUSTOM_MAX_WORKERS = 2\nCUSTOM_SUBSET = image_files[:100]  # Process specific range\n\nprint(f"🛠️  Running custom multimodal processing...")\nprint(f"   Batch size: {CUSTOM_BATCH_SIZE}")\nprint(f"   Workers: {CUSTOM_MAX_WORKERS}")\nprint(f"   Processing {len(CUSTOM_SUBSET)} documents")\n\nprocess_multimodal_batch(\n    CUSTOM_SUBSET,\n    batch_size=CUSTOM_BATCH_SIZE,\n    max_workers=CUSTOM_MAX_WORKERS\n)\n'

In [10]:
# Verify the DUAL indexing results
print("=== DUAL Indexing Results ===")
print(f"📸 IMAGE collection: {image_collection.count()} documents")
print(f"📝 TEXT collection: {text_collection.count()} documents")

# Get some statistics from both collections
image_results = image_collection.get(limit=5, include=["metadatas"])
text_results = text_collection.get(limit=5, include=["metadatas"])

if image_results['metadatas']:
    print(f"\n📊 Sample from IMAGE collection:")
    for i, metadata in enumerate(image_results['metadatas'][:3]):
        print(f"  {i+1}. {metadata['filename']} (type: {metadata['document_type']})")

if text_results['metadatas']:
    print(f"\n📊 Sample from TEXT collection:")  
    for i, metadata in enumerate(text_results['metadatas'][:3]):
        text_preview = metadata.get('extracted_text', '')[:100]
        print(f"  {i+1}. {metadata['filename']} - \"{text_preview}...\"")

print(f"\n✅ Dual indexing complete! Ready for confidence-based classification.")


=== DUAL Indexing Results ===
📸 IMAGE collection: 3494 documents
📝 TEXT collection: 3366 documents

📊 Sample from IMAGE collection:
  1. 92600841_0845.jpg (type: form)
  2. 2050755264.jpg (type: form)
  3. 508881563+-1563.jpg (type: form)

📊 Sample from TEXT collection:
  1. 92600841_0845.jpg - "DATE: LORILLARD NAME/LIST PULL REQUEST To: S. R. Benson xc: C. Humphrey A. Pasheluk V. Lindsley E, D..."
  2. 2050755264.jpg - "Ip, Vee. {¥yo Lorsc —— Feinmwhay OBR VIVES PHILIP MORRIS CORPORATE SERVICES INC. Tharp lS waked AS W..."
  3. 508881563+-1563.jpg - "Beltrage zur Tabakforschung International Copy for anor Reviewer's Comments* Title of manuscript: Th..."

✅ Dual indexing complete! Ready for confidence-based classification.


In [12]:
# DUAL search and confidence-based classification functionality
def search_with_dual_confidence(query_image_path, n_results=10):
    """
    Search using DUAL indexing approach with confidence-based classification.
    
    This function:
    1. Searches the image collection using the query image
    2. Extracts OCR text from the query image and searches the text collection
    3. Analyzes classification confidence from both modalities
    4. Returns final classification based on higher confidence
    
    Args:
        query_image_path (str): Path to query image
        n_results (int): Number of results to return from each collection
        
    Returns:
        dict: Combined results with confidence-based classification
    """
    print(f"🔍 DUAL confidence search for: {os.path.basename(query_image_path)}")
    
    # Step 1: Load and process query image
    query_image_array = load_image_as_array(query_image_path)
    if query_image_array is None:
        print(f"❌ Failed to load query image: {query_image_path}")
        return None
    
    # Step 2: Extract OCR text from query image
    query_text = extract_text_from_image(query_image_path)
    print(f"📝 Extracted OCR text: {'Yes' if query_text else 'No'} ({len(query_text)} chars)")
    
    # Step 3: Search IMAGE collection
    try:
        print(f"📸 Searching IMAGE collection...")
        image_results = image_collection.query(
            query_images=[query_image_array],
            n_results=n_results,
            include=["metadatas", "distances"]
        )
        image_classification, image_confidence = analyze_classification_confidence(
            image_results, "image"
        )
    except Exception as e:
        print(f"❌ Error searching image collection: {str(e)}")
        image_classification, image_confidence = None, 0.0
    
    # Step 4: Search TEXT collection (if we have text)
    text_classification, text_confidence = None, 0.0
    if query_text:
        try:
            print(f"📝 Searching TEXT collection...")
            text_results = text_collection.query(
                query_texts=[query_text],
                n_results=n_results,
                include=["metadatas", "distances"]
            )
            text_classification, text_confidence = analyze_classification_confidence(
                text_results, "text"
            )
        except Exception as e:
            print(f"❌ Error searching text collection: {str(e)}")
    
    # Step 5: Confidence-based final classification
    final_classification, final_confidence, winning_modality = determine_final_classification(
        image_classification, image_confidence,
        text_classification, text_confidence
    )
    
    # Step 6: Create combined results
    results = {
        "query_image": query_image_path,
        "query_text": query_text,
        "image_classification": image_classification,
        "image_confidence": image_confidence,
        "text_classification": text_classification, 
        "text_confidence": text_confidence,
        "final_classification": final_classification,
        "final_confidence": final_confidence,
        "winning_modality": winning_modality,
        "image_results": image_results if 'image_results' in locals() else None,
        "text_results": text_results if 'text_results' in locals() else None
    }
    
    return results

def analyze_classification_confidence(search_results, modality):
    """
    Analyze classification confidence based on search results.
    Uses distance-based voting with confidence scoring.
    
    Args:
        search_results: ChromaDB search results
        modality (str): "image" or "text"
        
    Returns:
        tuple: (predicted_class, confidence_score)
    """
    if not search_results or not search_results['metadatas'][0]:
        return None, 0.0
    
    # Count document types weighted by similarity
    doc_type_scores = {}
    total_weight = 0
    
    for metadata, distance in zip(search_results['metadatas'][0], search_results['distances'][0]):
        doc_type = metadata['document_type']
        # Convert distance to similarity (closer = higher similarity)
        similarity = 1 - distance
        # Weight by similarity (higher similarity = more influence)
        weight = max(0, similarity)  # Ensure non-negative
        
        if doc_type not in doc_type_scores:
            doc_type_scores[doc_type] = 0
        doc_type_scores[doc_type] += weight
        total_weight += weight
    
    if total_weight == 0:
        return None, 0.0
    
    # Normalize scores to get confidence percentages
    for doc_type in doc_type_scores:
        doc_type_scores[doc_type] /= total_weight
    
    # Get the top prediction and its confidence
    predicted_class = max(doc_type_scores, key=doc_type_scores.get)
    confidence = doc_type_scores[predicted_class]
    
    print(f"  {modality.upper()} classification: {predicted_class} (confidence: {confidence:.1%})")
    
    return predicted_class, confidence

def determine_final_classification(image_class, image_conf, text_class, text_conf):
    """
    Determine final classification based on confidence levels from both modalities.
    
    Args:
        image_class: Image-based classification
        image_conf: Image-based confidence
        text_class: Text-based classification  
        text_conf: Text-based confidence
        
    Returns:
        tuple: (final_class, final_confidence, winning_modality)
    """
    # If only one modality has results
    if image_class and not text_class:
        return image_class, image_conf, "image_only"
    elif text_class and not image_class:
        return text_class, text_conf, "text_only"
    elif not image_class and not text_class:
        return None, 0.0, "no_results"
    
    # Both modalities have results - compare confidence
    print(f"\n🎯 CONFIDENCE COMPARISON:")
    print(f"   📸 Image: {image_class} ({image_conf:.1%})")
    print(f"   📝 Text:  {text_class} ({text_conf:.1%})")
    
    # Choose based on higher confidence
    if image_conf > text_conf:
        winning_modality = "image"
        final_class = image_class
        final_confidence = image_conf
    elif text_conf > image_conf:
        winning_modality = "text"
        final_class = text_class
        final_confidence = text_conf
    else:
        # Tie - prefer image (or could use other tie-breaking logic)
        winning_modality = "image_tie"
        final_class = image_class
        final_confidence = image_conf
    
    print(f"   🏆 Winner: {winning_modality.upper()} - {final_class} ({final_confidence:.1%})")
    
    return final_class, final_confidence, winning_modality

def get_true_document_type(query_image_path):
    """
    Get the true document type for a query image from the collections.
    
    Args:
        query_image_path (str): Path to the query image
        
    Returns:
        str: True document type, or None if not found
    """
    try:
        # Try to find the document in the image collection first
        doc_id = create_document_id(query_image_path)
        image_id = f"img_{doc_id}"
        
        # Get from image collection
        result = image_collection.get(
            ids=[image_id],
            include=["metadatas"]
        )
        
        if result['metadatas'] and len(result['metadatas']) > 0:
            return result['metadatas'][0]['document_type']
        
        # If not found in image collection, try text collection
        text_id = f"txt_{doc_id}"
        result = text_collection.get(
            ids=[text_id],
            include=["metadatas"]
        )
        
        if result['metadatas'] and len(result['metadatas']) > 0:
            return result['metadatas'][0]['document_type']
            
        return None
        
    except Exception as e:
        print(f"⚠️  Error getting true document type: {str(e)}")
        return None

def display_dual_search_results(results):
    """Display results from dual confidence search in a readable format."""
    if not results:
        print("❌ No results to display")
        return
    
    # Get the true document type from the collections
    true_doc_type = get_true_document_type(results['query_image'])
    
    print(f"\n{'='*60}")
    print(f"🎯 DUAL CONFIDENCE CLASSIFICATION RESULTS")
    print(f"{'='*60}")
    print(f"📁 Query Image: {os.path.basename(results['query_image'])}")
    print(f"📝 OCR Text Available: {'Yes' if results['query_text'] else 'No'}")
    if results['query_text']:
        print(f"   First 100 chars: {results['query_text'][:100]}...")
    
    # Display true document type
    if true_doc_type:
        print(f"\n📋 TRUE DOCUMENT TYPE: {true_doc_type}")
        
        # Check if prediction is correct
        is_correct = results['final_classification'] == true_doc_type
        accuracy_symbol = "✅" if is_correct else "❌"
        print(f"🎯 PREDICTION ACCURACY: {accuracy_symbol} {'CORRECT' if is_correct else 'INCORRECT'}")
    else:
        print(f"\n📋 TRUE DOCUMENT TYPE: Unknown (not found in collections)")
    
    print(f"\n🔍 INDIVIDUAL MODALITY RESULTS:")
    image_correct = results['image_classification'] == true_doc_type if true_doc_type else False
    text_correct = results['text_classification'] == true_doc_type if true_doc_type else False
    
    image_symbol = "✅" if image_correct else "❌" if true_doc_type else "❓"
    text_symbol = "✅" if text_correct else "❌" if true_doc_type else "❓"
    
    print(f"   📸 Image Classification: {results['image_classification']} ({results['image_confidence']:.1%}) {image_symbol}")
    print(f"   📝 Text Classification:  {results['text_classification']} ({results['text_confidence']:.1%}) {text_symbol}")
    
    print(f"\n🏆 FINAL CLASSIFICATION:")
    print(f"   📋 Predicted Type: {results['final_classification']}")
    print(f"   🎯 Confidence: {results['final_confidence']:.1%}")
    print(f"   🏅 Winning Modality: {results['winning_modality']}")
    
    if true_doc_type:
        is_correct = results['final_classification'] == true_doc_type
        print(f"   🎯 Overall Accuracy: {'✅ CORRECT' if is_correct else '❌ INCORRECT'}")
        
        # Additional insights
        if not is_correct:
            print(f"   💡 Note: True type was '{true_doc_type}', predicted '{results['final_classification']}'")
    
    print(f"\n{'='*60}")

# Example searches using DUAL confidence approach
if image_collection.count() > 0 and text_collection.count() > 0:
    print("=== Testing DUAL Confidence Classification ===")
    print("🎯 Testing confidence-based classification using separate image and text collections!")
    
    # Get some sample documents from different types for testing
    sample_types = ["invoice", "form", "handwritten", "budget", "email"]
    test_samples = []
    
    for doc_type in sample_types:
        # Try to get a sample from this document type
        type_results = image_collection.get(
            where={"document_type": doc_type},
            limit=1,
            include=["metadatas"]
        )
        if type_results['metadatas']:
            test_samples.append(type_results['metadatas'][0]['file_path'])
    
    # If we don't have enough samples, just get some random ones
    if len(test_samples) < 3:
        random_results = image_collection.get(limit=5, include=["metadatas"])
        for metadata in random_results['metadatas']:
            if metadata['file_path'] not in test_samples:
                test_samples.append(metadata['file_path'])
                if len(test_samples) >= 5:
                    break
    
    print(f"\n🧪 Testing {len(test_samples)} sample documents with dual confidence classification:")
    
    for i, sample_path in enumerate(test_samples[:3]):  # Test first 3 samples
        print(f"\n{'='*80}")
        print(f"🧪 TEST {i+1}/{min(len(test_samples), 3)}")
        print(f"{'='*80}")
        
        # Run dual confidence search
        results = search_with_dual_confidence(sample_path, n_results=10)
        if results:
            display_dual_search_results(results)
    
    print(f"\n{'='*80}")
    print("✅ DUAL confidence classification system is ready!")
    print("\n📋 Available methods:")
    print("  1. search_with_dual_confidence(query_image_path) - Main classification function")
    print("  2. Automatically searches both IMAGE and TEXT collections")
    print("  3. Returns confidence-based final classification")
    print("  4. Shows which modality (image or text) won the classification")
    print(f"\n📊 Current collection status:")
    print(f"   📸 IMAGE collection: {image_collection.count()} embeddings")
    print(f"   📝 TEXT collection: {text_collection.count()} embeddings")
    print(f"🎯 Ready for confidence-based document classification!")
    
elif image_collection.count() > 0:
    print("⚠️  Only IMAGE collection has data. Run the dual indexing processing to populate TEXT collection.")
    print(f"📸 IMAGE collection: {image_collection.count()} embeddings")
    print(f"📝 TEXT collection: {text_collection.count()} embeddings")
else:
    print("⚠️  No documents indexed yet. Run the processing cell first.")


=== Testing DUAL Confidence Classification ===
🎯 Testing confidence-based classification using separate image and text collections!

🧪 Testing 5 sample documents with dual confidence classification:

🧪 TEST 1/3
🔍 DUAL confidence search for: ti31689101.jpg
📝 Extracted OCR text: Yes (405 chars)
📸 Searching IMAGE collection...
  IMAGE classification: invoice (confidence: 61.1%)
📝 Searching TEXT collection...
  TEXT classification: invoice (confidence: 47.3%)

🎯 CONFIDENCE COMPARISON:
   📸 Image: invoice (61.1%)
   📝 Text:  invoice (47.3%)
   🏆 Winner: IMAGE - invoice (61.1%)

🎯 DUAL CONFIDENCE CLASSIFICATION RESULTS
📁 Query Image: ti31689101.jpg
📝 OCR Text Available: Yes
   First 100 chars: INVOICE Fla FANNON.LUERS ASSOCIATES INC. 5352 a6th Ave, Hyatievile, Ma 20761 + G01) SONaTTE TOBACCO ...

📋 TRUE DOCUMENT TYPE: invoice
🎯 PREDICTION ACCURACY: ✅ CORRECT

🔍 INDIVIDUAL MODALITY RESULTS:
   📸 Image Classification: invoice (61.1%) ✅
   📝 Text Classification:  invoice (47.3%) ✅

🏆 FINAL CLAS