In [1]:
import pdfplumber
import pytesseract
from pytesseract import Output
import os
from PIL import Image
from PyPDF2 import PdfReader, PdfWriter
from reportlab.pdfgen import canvas
from presidio_analyzer import AnalyzerEngineProvider
from presidio_anonymizer import AnonymizerEngine
from collections import defaultdict

# Language-code lookup: for each supported language name, the code expected by
# Presidio ("PRESIDIO") and the code expected by Tesseract ("PYTESSERACT").
LANGUAGES = {
    language_name: {"PRESIDIO": presidio_code, "PYTESSERACT": tesseract_code}
    for language_name, presidio_code, tesseract_code in (
        ("ENGLISH", "en", "eng"),
        ("FRENCH", "fr", "fra"),
        ("DUTCH", "nl", "nld"),
    )
}

def pdf_to_images(pdf_path, output_folder="temp_images", resolution=500):
    """Render every page of a PDF to a PNG file.

    Args:
        pdf_path: Path of the PDF to render.
        output_folder: Directory that receives the PNG files; created if it
            does not exist yet.
        resolution: Rendering resolution passed to pdfplumber's
            ``page.to_image`` (defaults to 500, the value previously
            hard-coded here).

    Returns:
        A list with one tuple per page, in page order:
        ``(image_path, page_number, page_width, page_height)`` where
        ``page_number`` starts at 1 and the width/height are the values
        reported by pdfplumber for that page.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    image_paths = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            image_path = os.path.join(output_folder, f"page_{page_number}.png")
            page.to_image(resolution=resolution).save(image_path, format="PNG")
            image_paths.append((image_path, page_number, page.width, page.height))

    return image_paths

def extract_text_and_coordinates_from_image(image_path, page_number, page_width, page_height, lang="fra"):
    """OCR one page image and return each recognised word with its bounding box.

    Args:
        image_path: Path of the rendered page image.
        page_number: Page number stored on every returned word record.
        page_width: Page width in PDF units, used to rescale pixel x-coordinates.
        page_height: Page height in PDF units, used to rescale pixel y-coordinates.
        lang: Tesseract language code for the OCR pass. Defaults to ``"fra"``,
            the value that used to be hard-coded here.

    Returns:
        A list of dicts with keys ``text``, ``start_x``, ``start_y``,
        ``end_x``, ``end_y`` and ``page_number``. Coordinates are the
        Tesseract pixel boxes scaled by page size / image size; the y axis is
        measured from the top of the image, as Tesseract reports it.
    """
    # Context manager closes the image file handle once OCR is done.
    with Image.open(image_path) as image:
        data = pytesseract.image_to_data(image, output_type=Output.DICT, lang=lang)
        # Pixel -> PDF-unit scale factors for this page.
        x_scale = page_width / image.width
        y_scale = page_height / image.height

    words_info = []
    boxes = zip(data['text'], data['left'], data['top'], data['width'], data['height'])
    for text, left, top, width, height in boxes:
        # Tesseract emits empty/whitespace entries for structural rows; skip them.
        if not text.strip():
            continue
        words_info.append({
            'text': text,
            'start_x': left * x_scale,
            'start_y': top * y_scale,
            'end_x': (left + width) * x_scale,
            'end_y': (top + height) * y_scale,
            'page_number': page_number,
        })

    return words_info


def extract_text_with_coordinates_pdfplumber(pdf_path, tesseract_lang):
    """OCR a whole PDF, returning its text and per-word bounding boxes.

    Each page is rendered to an image with ``pdf_to_images`` and then OCR'd
    twice: once for word boxes and once for the page text. Both passes use
    ``tesseract_lang`` so the words in the boxes come from the same language
    model as the text that is later analysed.

    Args:
        pdf_path: Path of the PDF to process.
        tesseract_lang: Tesseract language code (e.g. ``"eng"``, ``"fra"``,
            ``"nld"``).

    Returns:
        A tuple ``(all_text, word_data)``: the page texts joined with a single
        space, and the concatenated word records of all pages.
    """
    custom_config = r'--oem 3 --psm 6'
    word_data = []
    pages_data = []

    for image_path, page_number, page_width, page_height in pdf_to_images(pdf_path):
        word_data.extend(
            extract_text_and_coordinates_from_image(
                image_path, page_number, page_width, page_height, lang=tesseract_lang
            )
        )

        # Extract the page text using Tesseract OCR; close the image afterwards.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image, config=custom_config, lang=tesseract_lang)
        pages_data.append(text)

    # Concatenate all pages' text into a single string
    all_text = " ".join(pages_data)
    return all_text, word_data

def analyzer_anoniymzer_pipeline(text, text_output, presidio_lang, analyzer_conf_file=None):
    """Detect PII in ``text`` with Presidio, anonymize it and save the result.

    Args:
        text: Text to analyse.
        text_output: Path of the file that receives the anonymized text
            (written as UTF-8, overwriting any existing file).
        presidio_lang: Language code passed to the Presidio analyzer
            (e.g. ``"en"``, ``"fr"``, ``"nl"``).
        analyzer_conf_file: Optional path of the analyzer engine YAML
            configuration. When omitted, the original hard-coded location is
            used so existing callers keep working.

    Returns:
        The anonymized text.
    """
    if analyzer_conf_file is None:
        # Historical default; machine-specific, so prefer passing the path in.
        analyzer_conf_file = r"C:\Users\CR29QG\OneDrive - ING\Documents\DDM\analyzer_config\all-config.yaml"

    provider = AnalyzerEngineProvider(
        analyzer_engine_conf_file=analyzer_conf_file,
    )
    analyzer = provider.create_engine()
    analyzer_result = analyzer.analyze(text=text, language=presidio_lang)

    anonymizer = AnonymizerEngine()
    result = anonymizer.anonymize(text=text, analyzer_results=analyzer_result)

    with open(text_output, "w", encoding="utf-8") as file:
        file.write(result.text)
    return result.text


def find_differing_words(original_word_data, new_text, old_text):
    """Return the word records whose text was present before but not after.

    A word "differs" when it appears as a whitespace-separated token in
    ``old_text`` but not in ``new_text``. Every record in
    ``original_word_data`` whose ``'text'`` equals such a token is returned.

    Args:
        original_word_data: Iterable of word dicts, each with a ``'text'`` key.
        new_text: Text after processing (e.g. anonymized text).
        old_text: Text before processing.

    Returns:
        A list of the matching word dicts, in the same order as they appear
        in ``original_word_data`` (so the result is deterministic rather than
        depending on set iteration order).
    """
    differing_words = set(old_text.split()) - set(new_text.split())
    if not differing_words:
        return []

    # Single ordered pass over the records; set membership keeps it O(n).
    return [
        word_data
        for word_data in original_word_data
        if word_data['text'] in differing_words
    ]

def redact_differing_words_in_pdf(input_pdf_path, output_pdf_path, differing_word_data):
    """Write a copy of a PDF with black rectangles drawn over the given words.

    An overlay PDF with one page per input page is drawn with reportlab and
    merged onto the input pages with PyPDF2. The overlay is built in memory,
    so no temporary file is left behind in the working directory.

    Args:
        input_pdf_path: Path of the PDF to redact.
        output_pdf_path: Path of the redacted PDF to write.
        differing_word_data: Iterable of word dicts with keys ``start_x``,
            ``start_y``, ``end_x``, ``end_y`` (top-origin coordinates in PDF
            units) and ``page_number`` (1-based).

    NOTE(review): the boxes are merged on top of the existing page content;
    nothing here removes text already embedded in the input PDF. Confirm that
    this is sufficient for the intended redaction guarantees.
    """
    import io  # local import: only needed for the in-memory overlay buffer

    # Group the words by page once instead of rescanning the list per page.
    words_by_page = defaultdict(list)
    for word_data in differing_word_data:
        words_by_page[word_data['page_number']].append(word_data)

    # Step 1: draw the overlay PDF, one page per input page.
    overlay_buffer = io.BytesIO()
    with pdfplumber.open(input_pdf_path) as pdf:
        overlay = canvas.Canvas(overlay_buffer)
        for page_num, page in enumerate(pdf.pages, start=1):
            page_height = page.height
            overlay.setPageSize((page.width, page_height))

            page_words = words_by_page.get(page_num, ())
            if page_words:
                # Drawing state is per page, so set it before the first box.
                overlay.setFillColor('black')
                overlay.setStrokeColor('black')
                overlay.setLineWidth(0.5)
            for word_data in page_words:
                x0, y0 = word_data['start_x'], word_data['start_y']
                x1, y1 = word_data['end_x'], word_data['end_y']
                # Word boxes are measured from the top of the page, while the
                # canvas origin is bottom-left: flip y using the page height.
                overlay.rect(x0, page_height - y1, x1 - x0, y1 - y0, fill=1)

            # Always emit the page so overlay and input page indices line up.
            overlay.showPage()

        overlay.save()

    # Step 2: merge each overlay page onto the matching input page.
    overlay_buffer.seek(0)
    input_pdf = PdfReader(input_pdf_path)
    overlay_pdf = PdfReader(overlay_buffer)
    overlay_page_count = len(overlay_pdf.pages)

    writer = PdfWriter()
    for i, page in enumerate(input_pdf.pages):
        if i < overlay_page_count:
            page.merge_page(overlay_pdf.pages[i])
        writer.add_page(page)

    with open(output_pdf_path, "wb") as output_file:
        writer.write(output_file)

ModuleNotFoundError: No module named 'reportlab'