# In [None]:
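# Before running this script, install the required third-party libraries
# (concurrent.futures ships with Python 3, so the "futures" backport is not needed).
# Run this in a terminal, or prefix it with "%" inside a notebook cell:
#   pip install pdf2image pytesseract Pillow numpy face_recognition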

# Import necessary libraries
import os
from pdf2image import convert_from_path
import face_recognition
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
import pytesseract
from PIL import Image

# Function to extract text from an image using OCR (Optical Character Recognition)
def extract_text_from_image(image):
    """Run OCR on ``image`` with pytesseract and return the recognized text.

    Any exception raised while performing OCR is reported on stdout and an
    empty string is returned instead of propagating the error.
    """
    try:
        return pytesseract.image_to_string(image)
    except Exception as ocr_error:
        print(f"Error extracting text from image: {ocr_error}")
    return ""

# Function to extract images and text from a PDF
def extract_images_and_text_from_pdf(pdf_path):
    """Render every page of a PDF to an image and OCR each one.

    Returns a list of ``(page_index, page_image, page_text)`` tuples in page
    order, with ``page_index`` starting at 0. If anything goes wrong, the
    error is printed and an empty list is returned.
    """
    try:
        rendered_pages = convert_from_path(pdf_path)
        return [
            (index, page_image, extract_text_from_image(page_image))
            for index, page_image in enumerate(rendered_pages)
        ]
    except Exception as pdf_error:
        print(f"Error extracting images and text from PDF: {pdf_error}")
        return []

# Function to search for a name in extracted text
def search_name_in_text(name, text):
    """Return True if ``name`` occurs in ``text``, ignoring case.

    The comparison uses ``str.casefold`` rather than ``str.lower`` so that
    case variants which ``lower`` leaves distinct (for example "ß" and "SS")
    are still treated as equal.

    An empty ``name`` never matches: a plain substring test would report it
    as present in every text, which would flag every page as a hit.
    """
    if not name:
        return False
    return name.casefold() in text.casefold()

# Function to process each page for a specific name
def process_page_for_name(name, image, text, page_number):
    """Check one page's text for ``name``.

    Returns ``(page_number, image)`` when ``search_name_in_text`` reports a
    match, otherwise ``(None, None)``.
    """
    found = search_name_in_text(name, text)
    return (page_number, image) if found else (None, None)

# Function to find images containing a specific name in a PDF
def find_images_by_name_in_pdf(name, pdf_path, output_image_dir):
    """Save and report the first page of a PDF whose OCR text contains ``name``.

    Pages are examined in document order, so the result is always the
    earliest matching page and is the same on every run.

    Args:
        name: Text to look for; matching is delegated to
            ``process_page_for_name``.
        pdf_path: Path of the PDF to scan.
        output_image_dir: Directory the matching page image is written to.
            It is created if it does not exist.

    Returns:
        ``(page_number, output_image_path)`` for the first matching page,
        where ``page_number`` is the zero-based index assigned by
        ``extract_images_and_text_from_pdf``. Returns ``(None, None)`` when
        no page matches or when any error occurs (the error is printed).
    """
    try:
        images_with_text = extract_images_and_text_from_pdf(pdf_path)

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(output_image_dir, exist_ok=True)

        # The per-page check is only a substring test, so a thread pool buys
        # nothing here. Collecting results with as_completed() also made the
        # reported page depend on thread scheduling; a sequential scan in page
        # order is deterministic.
        for page_number, image, text in images_with_text:
            matched_page, matching_image = process_page_for_name(name, image, text, page_number)
            if matched_page is None:
                continue
            output_image_path = os.path.join(output_image_dir, f"matching_image_page_{matched_page}.png")
            matching_image.save(output_image_path)  # Save matching image
            return matched_page, output_image_path

        return None, None  # No page contained the name
    except Exception as e:
        # Best-effort boundary: report the problem and signal "no result".
        print(f"Error finding images by name in PDF: {e}")
        return None, None

# Example usage
pdf_path = "/path/example.pdf"  # Location of the PDF file to scan
name_to_search = "anything"  # Text to look for in each page's OCR output
output_image_dir = "/path/images"  # Folder that receives the matching page image

# Run the search; both values come back as None when nothing matched or an error occurred.
page_number, output_image_path = find_images_by_name_in_pdf(
    name_to_search, pdf_path, output_image_dir
)

# Report the outcome
if page_number is None:
    print(f"No images related to {name_to_search} found in the PDF.")
else:
    print(f"Images related to {name_to_search} found on page {page_number}. Image saved to {output_image_path}.")