Jupyter notebook for a document anonymizer application

In [None]:
import cv2
import pytesseract
import os
from PIL import Image
from pdf2image import convert_from_path

In [None]:
# Location of the Tesseract executable (default Windows install path).
TESSERACT_PATH = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Update this path for your system

# Only override pytesseract's command when the configured binary actually exists.
# Forcing a missing Windows path would make every OCR call fail on machines
# (Linux/macOS, custom installs) where Tesseract is reachable some other way.
# NOTE(review): pytesseract presumably falls back to looking up `tesseract` on
# PATH when tesseract_cmd is left untouched -- confirm against its docs.
if os.path.isfile(TESSERACT_PATH):
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH


def get_llm_redaction_list(text_content):
    """
    Step 3: Return the PII strings that should be redacted from the document text.

    This is currently a stand-in for a real LLM call: no request is sent and
    `text_content` is not inspected. A fixed list of sample PII strings is
    returned so the rest of the pipeline can be exercised.

    A real implementation would send `text_content` to the model and ask for a
    structured JSON reply (e.g. via a response schema) listing the PII strings
    exactly as they appear in the text, so they can be matched back to the
    OCR word coordinates.

    Args:
        text_content: The OCR text of one page (unused by the mock).

    Returns:
        A list of PII strings.
    """
    # Canned reply in the shape the real model response is expected to have.
    canned_reply = dict(
        pii_items=[
            "John Smith",
            "555-123-4567",
            "123 Fictional Ln",
        ],
    )

    print("--- Sending text to LLM for PII analysis ---")

    # The model only ever sees text, so what comes back are plain strings;
    # mapping them to image coordinates is the caller's job.
    detected_strings = canned_reply['pii_items']
    return detected_strings


def get_ocr_data(image_path):
    """
    Step 2: Performs OCR and extracts word-level bounding box data.

    Args:
        image_path: Path to an image file that PIL can open.

    Returns:
        A list of dicts, one per non-blank OCR entry, in the order Tesseract
        reported them. Each dict has:
            'text': the recognized text with surrounding whitespace removed
            'bbox': a (left, top, width, height) tuple
    """
    print(f"--- Performing OCR on {image_path} ---")

    # Open the image in a context manager so the file handle is released as
    # soon as OCR finishes (an open handle can block later deletion of the
    # file, e.g. temporary page images on Windows).
    with Image.open(image_path) as image:
        # 'data' output includes bounding box information. Columns are typically:
        # level, page_num, block_num, par_num, line_num, word_num,
        # left, top, width, height, conf, text
        data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

    # Walk the parallel columns together and keep only entries with real text.
    ocr_words = []
    columns = zip(data['text'], data['left'], data['top'], data['width'], data['height'])
    for raw_text, left, top, width, height in columns:
        text = str(raw_text).strip()
        if text:
            ocr_words.append({'text': text, 'bbox': (left, top, width, height)})

    return ocr_words


# Punctuation that OCR commonly glues onto the edge of a word ("Smith,", "(555)").
_TOKEN_EDGE_PUNCTUATION = '.,;:!?()[]{}<>"\''


def _normalize_token(token):
    """Strip edge punctuation for comparison; tokens that are only punctuation are kept as-is."""
    stripped = token.strip(_TOKEN_EDGE_PUNCTUATION)
    return stripped or token


def redact_image(image_path, pii_list, ocr_words, output_dir="redacted_output"):
    """
    Step 4: Finds the coordinates for PII and redacts the image using OpenCV.

    Each PII string is split on whitespace and searched for as a run of
    consecutive OCR words (compared after stripping edge punctuation, so
    "Smith," matches "Smith"). Every word of every matching run is covered
    with a filled black rectangle. Empty or non-string PII entries are ignored.

    Args:
        image_path: Path of the image to redact.
        pii_list: Iterable of PII strings to look for.
        ocr_words: Word list as produced by get_ocr_data: dicts with
            'text' and 'bbox' (x, y, w, h).
        output_dir: Directory the redacted image is written to (created if
            missing).

    Returns:
        The path of the saved image ("redacted_<original file name>" inside
        output_dir), or None if the image could not be loaded or saved.
    """
    print(f"--- Redacting image: {image_path} ---")

    # Load the image using OpenCV
    img = cv2.imread(image_path)
    if img is None:
        print(f"Error: Could not load image {image_path}")
        return None

    # Set the redaction color (black)
    REDACTION_COLOR = (0, 0, 0)  # BGR format

    # Normalize the OCR words once; they are compared against every phrase.
    ocr_tokens = [_normalize_token(word['text']) for word in ocr_words]

    # Indices of OCR words to black out. A set keeps overlapping matches
    # (e.g. the same word hit by two PII strings) from being counted twice.
    indices_to_redact = set()
    for pii_text in pii_list:
        if not isinstance(pii_text, str):
            continue
        phrase = [_normalize_token(part) for part in pii_text.split()]
        if not phrase:
            # Empty / whitespace-only entry: nothing to match.
            continue

        # Slide a window over the OCR words looking for the whole phrase, so
        # that every word of the PII is covered -- not just the first one.
        span = len(phrase)
        for start in range(len(ocr_tokens) - span + 1):
            if ocr_tokens[start:start + span] == phrase:
                indices_to_redact.update(range(start, start + span))

    for index in sorted(indices_to_redact):
        x, y, w, h = ocr_words[index]['bbox']
        # Draw a filled rectangle (the redaction box)
        cv2.rectangle(img, (x, y), (x + w, y + h), REDACTION_COLOR, -1)

    redaction_count = len(indices_to_redact)
    print(f"Successfully redacted {redaction_count} potential words/segments.")

    # Save the redacted image
    os.makedirs(output_dir, exist_ok=True)
    base_name = os.path.basename(image_path)
    output_path = os.path.join(output_dir, f"redacted_{base_name}")
    # imwrite reports failure through its return value rather than raising;
    # do not hand back a path to a file that was never written.
    if not cv2.imwrite(output_path, img):
        print(f"Error: Could not save redacted image {output_path}")
        return None

    return output_path


def _redact_pages(page_paths, keep_clean_pages):
    """Run OCR -> PII detection -> redaction on each page image; return the redacted image paths."""
    redacted_images = []

    for img_path in page_paths:
        # Step 2: OCR Extraction
        ocr_data = get_ocr_data(img_path)

        # Combine all extracted text into one string for the LLM
        full_text = " ".join(d['text'] for d in ocr_data)

        # Step 3: LLM Analysis (get list of PII strings)
        pii_to_redact = get_llm_redaction_list(full_text)

        # Pages without PII still have to be written out when they belong to a
        # multi-page document, otherwise they would vanish from the result.
        if pii_to_redact or keep_clean_pages:
            # Step 4: Redaction
            redacted_path = redact_image(img_path, pii_to_redact or [], ocr_data)
            if redacted_path:
                redacted_images.append(redacted_path)
            else:
                print(f"Warning: {img_path} could not be redacted and is omitted from the output.")

    return redacted_images


def anonymize_document(file_path):
    """
    Main function to handle PDF or Image input.

    A PDF (detected by its '.pdf' extension, case-insensitive) is rendered to
    one PNG per page in a temporary directory, every page is redacted, and
    the pages are reassembled into 'redacted_output/redacted_document.pdf'.
    Any other file is treated as a single image; a redacted copy is written
    only if PII was reported for it.

    Args:
        file_path: Path to the PDF or image to anonymize.

    Returns:
        None. Results are written to the 'redacted_output' directory and
        progress is reported via print.
    """
    import tempfile  # local import: only this function needs it

    print(f"\n--- Starting Anonymization for {file_path} ---")

    is_pdf = file_path.lower().endswith('.pdf')

    # Step 1: Handle Input (Convert PDF to images if necessary)
    if is_pdf:
        # A private temporary directory avoids clobbering files in the working
        # directory and is removed even if a later step raises.
        with tempfile.TemporaryDirectory() as temp_dir:
            page_paths = []
            # This requires Poppler
            for i, page in enumerate(convert_from_path(file_path)):
                img_path = os.path.join(temp_dir, f"temp_page_{i}.png")
                page.save(img_path, 'PNG')
                page_paths.append(img_path)

            redacted_images = _redact_pages(page_paths, keep_clean_pages=True)
    else:
        redacted_images = _redact_pages([file_path], keep_clean_pages=False)

    if not redacted_images:
        return

    # Step 5: Final Output
    if is_pdf:
        # Convert redacted images back to a single PDF
        output_pdf_path = os.path.join("redacted_output", "redacted_document.pdf")

        # Requires PIL/Pillow
        redacted_pages = [Image.open(img) for img in redacted_images]
        try:
            # NOTE(review): pages are presumably rendered at pdf2image's default
            # DPI; if that differs from resolution=100.0 the physical page size
            # of the output PDF will not match the input -- confirm.
            redacted_pages[0].save(
                output_pdf_path,
                "PDF",
                resolution=100.0,
                save_all=True,
                append_images=redacted_pages[1:],
            )
        finally:
            # Release the image file handles whether or not the save succeeded.
            for page_image in redacted_pages:
                page_image.close()
        print(f"\n--- SUCCESS! Redacted PDF saved to: {output_pdf_path} ---")
    else:
        print(f"\n--- SUCCESS! Redacted Image saved to: {redacted_images[0]} ---")


# --- Example Usage (requires a real input file, e.g. 'sample_document.pdf' or a 'sample.png' image) ---
# if __name__ == '__main__':
#     # NOTE: You must provide a real file path here for testing
#     # anonymize_document('sample_document.pdf')
#     pass