# IDML Document Migration with AI Analysis

## Dependencies

In [2]:
from PIL import Image
from pathlib import Path
import google.genai as genai
import xml.etree.ElementTree as ET
import fitz
import json
import shutil
import mimetypes
import zipfile
import os
import re

In [3]:
API_KEY = 'AIzaSyBBtO2UJmUute1PJ7w3JUSEj6OLD7g7ECQ'

In [4]:
base_path = Path("data")
old_dir = base_path / "old"
new_template_dir = base_path / "new"
pdf_old_path = base_path / "antigo.pdf"
pdf_new_path = base_path / "new.pdf"

# Diretório de saída para o documento migrado
migrated_dir = base_path / "migrated"
idml_final_path = base_path / "migrated.idml"

## PDF Document Analysis

### PDF Page to Image conversion

In [5]:
def pdf_page_to_image(page) -> Image.Image:
    """Converte a primeira página de um PDF em um objeto de imagem."""
    pix = page.get_pixmap()
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    return img


### Text Extraction

In [6]:
def pdf_text_extraction(page) -> list[str]:
    """Extrai todos os blocos de texto da primeira página de um PDF."""
    # Extrai texto em blocos para manter a coesão
    text_blocks = page.get_text("blocks")
    # Retorna uma lista de textos limpos
    return [block[4].replace('\n', ' ').strip() for block in text_blocks if block[4].strip()]

def pdf_text_extraction(page) -> list[str]:
    """Extrai todos os blocos de texto da primeira página de um PDF, excluindo o conteúdo de tabelas."""
    text_blocks = page.get_text("blocks")
    cleaned_texts = []
    i = 0
    while i < len(text_blocks):
        current_block_text = text_blocks[i][4].strip()
        # Check if the next block starts with "Tabelle"
        if i + 1 < len(text_blocks) and text_blocks[i + 1][4].strip().startswith("Tabelle"):
            # Skip the current block and the next block (the table caption)
            i += 2
        else:
            if current_block_text:
                cleaned_texts.append(current_block_text)
            i += 1
    return cleaned_texts


In [7]:
def extract_text_from_idml_story(story_file_path):
    """Extracts text content from a single IDML story XML file."""
    try:
        tree = ET.parse(story_file_path)
        root = tree.getroot()

        # print(f"Debugging IDML extraction for: {story_file_path}") # Debug print
        # print(f"Root element tag: {root.tag}") # Debug print

        # --- Namespace Handling (Adjust URI if needed) ---
        # Extract the namespace URI from the root element
        # The root tag will often be in the format {namespace_uri}local_name
        namespace_uri = None
        if root.tag.startswith('{') and '}' in root.tag:
            namespace_uri = root.tag.split('}')[0][1:]
        # -----------------------------------------------

        # print(f"Deduced Namespace URI: {namespace_uri}") # Debug print

        text_content = ""
        # Iterate through elements and extract text, using namespace if found
        for elem in root.iter(): # Iterate through all elements in the tree
            # Construct the tag name with namespace if available
            tag_to_match = elem.tag
            if namespace_uri and tag_to_match.startswith(f'{{{namespace_uri}}}'):
                 tag_to_match = tag_to_match.replace(f'{{{namespace_uri}}}', '') # Get local name


            # You might need to add other tag names here if your text is in different elements
            if tag_to_match in ['Content', 'Br']: # Consider tags likely to contain text or represent breaks
                 if elem.text:
                    text_content += elem.text
                 if tag_to_match == 'Br': # Explicitly add newline for <Br> tags
                     text_content += "\n"
                 if elem.tail: # Add tail text as it's also content
                     text_content += elem.tail


        # print(f"Extracted raw text content: '{text_content}'") # Debug print

        return text_content
    except Exception as e:
        print(f"Error extracting text from IDML story {story_file_path}: {e}")
        return None

In [8]:
def extract_sections(doc):
    sections = {}
    current_section_title = None
    current_section_text = ""
    section_pattern = re.compile(r'^(\d+\.\d+(\.\d+)*)') # Regex to find section numbers

    # for page_num in range(doc.page_count):
    page = doc.load_page(1)
    text = page.get_text()
    lines = text.splitlines()

    for line in lines:
        section_match = section_pattern.match(line)

        if section_match:
            # If a new section is found, save the previous one
            if current_section_title is not None:
                sections[current_section_title] = current_section_text.strip()

            current_section_title = section_match.group(1)
            current_section_text = line[section_match.end():] # Start new section text after the title
        else:
            # If no new section title, append the line to the current section's text
            current_section_text += "\n" + line

    # Save the last section after the loop
    if current_section_title is not None:
        sections[current_section_title] = current_section_text.strip()

    return sections


### Image Extraction

In [9]:
def extract_region_from_pdf(pdf_path: str, bbox: tuple[float, float, float, float], output_folder: str):
    """Extracts an image from a specific bounding box region of each page of a PDF."""
    Path(output_folder).mkdir(parents=True, exist_ok=True)

    try:
        doc = fitz.open(pdf_path)
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            pix = page.get_pixmap(clip=bbox)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            img.save(Path(output_folder) / f"page_{page_num}_region.png")
        doc.close()
    except Exception as e:
        print(f"Error extracting region from {pdf_path}: {e}")


In [10]:
def extract_images_from_pdf(pdf_path: str, page_num: int, output_folder: str):
    """
    Extracts all images from a specific page of a PDF and saves them to a specified folder,
    returning a list of the saved image paths.

    Args:
        pdf_path: The path to the input PDF file.
        page_num: The page number to extract images from (0-indexed).
        output_folder: The path to the folder where extracted images will be saved.

    Returns:
        A list of paths to the saved image files, or an empty list if no images were found
        or an error occurred.
    """
    extracted_image_paths = []
    try:
        # Open the PDF document
        doc = fitz.open(pdf_path)

        # Create the output folder if it doesn't exist
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)

        page = doc.load_page(page_num)

        # Get a list of images on the page
        image_list = page.get_images()

        # Iterate through each image on the page
        for img_index, img_info in enumerate(image_list):
            xref = img_info[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"] # Get the image extension

            # Create the output filename
            output_filename = os.path.join(output_folder, f"page_{page_num}_img_{img_index}.{image_ext}")

            # Save the image
            with open(output_filename, "wb") as img_file:
                img_file.write(image_bytes)

            print(f"Extracted image page {page_num}, image {img_index} to {output_filename}")
            extracted_image_paths.append(output_filename) # Add the saved path to the list

        doc.close()
        print(f"Image extraction complete. Images saved to {output_folder}")
        return extracted_image_paths

    except FileNotFoundError:
        print(f"Error: PDF file not found at {pdf_path}")
        return []
    except Exception as e:
        print(f"An error occurred during image extraction: {e}")
        return []


## Story Analysis

### Story Text Extraction

In [11]:
# todo integrar essa função a análise dos textos
def story_text_extraction(story_path) -> str:
    """Extrai todo o conteúdo de texto de um arquivo de Story do IDML."""
    try:
        tree = ET.parse(story_path)
        root = tree.getroot()
        content = ""
        # Itera sobre todos os elementos de conteúdo no XML
        for content_elem in root.findall('.//Content'):
            if content_elem.text:
                content += content_elem.text
        return content.strip()
    except (ET.ParseError, FileNotFoundError):
        return ""

## Text Analysis with AI

In [18]:
# v1
def gemini_doc_analysis(pdf_path: Path) -> dict:
    response = {}
    try:
        # 1. Preparar os dados para a análise multimodal
        doc = fitz.open(pdf_path)
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            textos_pdf = pdf_text_extraction(page)
            imagem_pdf = pdf_page_to_image(page)
        
            # 2. Construir o prompt para a API Gemini
            # todo especificar labels
            # todo especificar possíveis repetições
            prompt_parts = [
                "Analise a imagem desta página de documento e os textos fornecidos.",
                "Atue como um especialista em editoração eletrônica (Desktop Publishing).",
                "Para cada texto na lista, identifique seu type semântico (ex: 'title', 'subtitle', 'text_body', 'author', 'header', 'captions', 'link', 'highlighted_text', 'title_level_1', 'title_level_2', 'section_title') apenas, não analise tabelas nesse momento.",
                "Não existe números de página, pois os documentos possui navegação dinâmica com links internos.", 
                "Considere também que o título do documento, seu tipo e números de referência estão presentes em todos os slides, como parte do design, logo, só há o título de seção em todos, exceto no slide de capa.", 
                "Retorne um único objeto JSON com uma chave 'mapeamento_semantico', que contém uma lista de objetos. Cada objeto deve ter duas chaves: 'text' com o conteúdo original e 'semantic_type' com a classificação que você determinou.",
                "Seja preciso e baseie-se no layout visual (tamanho da fonte, posição, peso).",
                "\n--- IMAGEM DA PÁGINA ---",
                imagem_pdf,
                "\n--- TEXTOS EXTRAÍDOS DA PÁGINA ---",
                "\n".join(textos_pdf)
            ]

            # 3. Chamar a API Gemini
            print(f"Analisando o documento {pdf_path} ({page_num + 1}/{doc.page_count})...")
            ai = genai.Client(api_key=API_KEY)
            generation_config = {
                "temperature": 0.1,
                "response_mime_type": "application/json",
            }
            generated_content = ai.models.generate_content(contents=prompt_parts, model='gemini-1.5-flash', config=generation_config)
            response[int(page_num)] = json.loads(generated_content.text)

            print(f"\n>>> Resultado da Análise Semântica (Documento {pdf_path.name} ({page_num + 1}/{doc.page_count})):")
            print(response[int(page_num)])
            print("\n")
        doc.close()
        return response # json.loads(response)

    except Exception as e:
        print(f"❌ Ocorreu um erro ao analisar {pdf_path.name}: {e}")
        return None

In [12]:
# v2
def gemini_doc_analysis(pdf_old_path, pdf_new_path, idml_target_path) -> dict:
    response = {}
    try:
        old_doc = fitz.open(pdf_old_path)
        new_doc = fitz.open(pdf_new_path)

        ai = genai.Client(api_key=API_KEY)
        generation_config = {
            "temperature": 0.5,
            "response_mime_type": "application/json",
        }

        # 1. Extract text from the first page of both PDFs
        first_page_old_text = ""
        if old_doc.page_count > 0:
            first_page_old = old_doc.load_page(0)
            first_page_old_text = first_page_old.get_text()
            # print("Text from first page of old PDF:", first_page_old_text) # Debug print
            old_image_pdf = pdf_page_to_image(first_page_old)

        first_page_new_text = ""
        if new_doc.page_count > 0:
            first_page_new = new_doc.load_page(0)
            first_page_new_text = first_page_new.get_text()
            # print("Text from first page of new PDF:", first_page_new_text) # Debug print
            new_image_pdf = pdf_page_to_image(first_page_new)

        stories_dir_path = Path(idml_target_path) / 'Stories' # Assuming 'Stories' is the directory within the IDML package

        # 3. Proceed with the rest of the analysis on the potentially modified IDML files

        max_pages = max(old_doc.page_count, new_doc.page_count)

        # Extract sections from the second page onwards (using the modified extract_sections)
        old_sections = extract_sections(old_doc)
        # print("Old PDF Sections (from page 2 onwards):", old_sections) # Debug print
        new_sections = extract_sections(new_doc)
        # print("New PDF Sections (from page 2 onwards):", new_sections) # Debug print

        # Extract text from the potentially modified target IDML stories
        idml_stories_text = {}
        # Re-read the IDML files to get the updated content after replacement
        for story_file in stories_dir_path.glob("*.xml"):
            story_text = extract_text_from_idml_story(story_file)
            if story_text:
                idml_stories_text[story_file.name] = story_text
        # print("Target IDML Stories Text (after potential replacement based on first PDF page):", idml_stories_text) # Debug print

        # 4. Construir o prompt para a API Gemini with the potentially modified IDML text
        prompt_parts_first_page = [
            "You are an expert Desktop Publishing Specialist analyzing the FIRST PAGE (cover/title page) of PDF documents.",
            "Focus on identifying structural and metadata elements typical of document covers and title pages.",
            
            "FIRST PAGE ANALYSIS REQUIREMENTS:",
            "1. Focus on cover/title page elements:",
            "   - Document title and main headings",
            "   - Author information and credentials", 
            "   - Publication metadata (date, reference numbers, version)",
            "   - Institutional information (logos, affiliations)",
            "   - Abstract or summary (if present)",
            "   - Cover page formatting elements",
            
            "2. Semantic classification for first pages:",
            "   'document_title', 'subtitle', 'author', 'date', 'reference_number',",
            "   'institution', 'abstract', 'version', 'logo_text', 'cover_header',",
            "   'cover_footer', 'publication_info', 'classification_level'",
            "",
            
            "3. SPECIAL INSTRUCTION for reference_number extraction:",
            "   - Look for publication_info content that contains reference numbers",
            "   - Extract ONLY the reference number from publication info using this pattern:",
            "   - Pattern: xxx/xx/xxxx (first of sequence of 3 numbers separated by slashes)",
            "   - Example: if publication_info contains '123/45/6789 - Document Series - 2024'",
            "   - Then reference_number should be: '123'",
            "   - Always separate reference_number from other publication_info content",
            
            "4. Match criteria:",
            "   - Elements match if they serve the same structural purpose on the cover",
            "   - Focus on document identification and metadata correspondence",
            "   - Consider visual hierarchy typical of title pages",
            "   - For reference_number: extract the xxx/xx/xxxx pattern from publication_info",
            
            "OUTPUT FORMAT:",
            "Return JSON with 'mapeamento_semantico' containing objects with:",
            "- 'semantic_type': specific semantic classification",
            "- 'old_text': content from original PDF (null if not found)",
            "- 'new_text': content from template PDF (null if not found)",
            "- 'idml_text': relevant IDML content (null if not found)", 
            "- 'match': boolean indicating semantic correspondence",
            
            "CONTENT TO ANALYZE:",
            "\\n--- ORIGINAL PDF PAGE IMAGE ---\\n",
            old_image_pdf if old_image_pdf else "No image available",
            "\\n--- ORIGINAL PDF TEXT ---\\n",
            json.dumps(first_page_old_text, ensure_ascii=False) if first_page_old_text else "No text found",
            "\\n--- TEMPLATE PDF PAGE IMAGE ---\\n", 
            new_image_pdf if new_image_pdf else "No image available",
            "\\n--- TEMPLATE PDF TEXT ---\\n",
            json.dumps(first_page_new_text, ensure_ascii=False) if first_page_new_text else "No text found",
            "\\n--- IDML CONTENT ---\\n",
            json.dumps(idml_stories_text, ensure_ascii=False) if idml_stories_text else "No IDML content"
        ]

        # 5. Chamar a API Gemini
        print(f"Analisando os documentos (Página 0)...")
        
        try:
            generated_content = ai.models.generate_content(contents=prompt_parts_first_page, model='gemini-2.0-flash', config=generation_config)
            response[0] = json.loads(generated_content.text)
            print(f">>> Resultado da Análise Semântica (Página 0):")
            print(json.dumps(response[0], indent=2, ensure_ascii=False))
        except Exception as e:
            print(f"❌ Ocorreu um erro ao analisar a página 0: {e}")
            response[0] = {"error": str(e)}

        for page_num in range(1, max_pages):
            page_from_old = None
            page_from_new = None
            old_image_pdf = None
            new_image_pdf = None
            old_page_text = ""
            new_page_text = ""

            if page_num < old_doc.page_count:
                page_from_old = old_doc.load_page(page_num)
                old_image_pdf = pdf_page_to_image(page_from_old)
                old_page_text = page_from_old.get_text()

            if page_num < new_doc.page_count:
                page_from_new = new_doc.load_page(page_num)
                new_image_pdf = pdf_page_to_image(page_from_new)
                new_page_text = page_from_new.get_text()

            # 4. Construir o prompt para a API Gemini with the potentially modified IDML text
            prompt_parts = [
                "Acting as a Desktop Publishing Specialist, analyze the provided images, text from two PDF documents, and text content from target IDML files.",
                "Compare the content and semantic structure of corresponding pages in the two PDF documents, focusing on the provided section context (which starts from the second page). Also consider the text content from the target IDML files (which may have been modified based on the first page of the PDFs) for additional context and comparison.",
                "For each text element, identify its semantic type (e.g: 'title', 'subtitle', 'text_body', 'author', 'header', 'captions', 'link', 'highlighted_text', 'title_level_1', 'title_level_2', 'section_title'). Do not analyze tables at this time.",
                "In addition to identifying semantic types and matches, please analyze and identify text elements that appear to be repeated or very similar across the old and new PDF documents on this page, even if they are not exact matches or don't require substitution. Label these with a 'repeated_or_similar' key with a boolean value (true if repeated/similar, false otherwise).",
                "There are no page numbers, as the documents have dynamic navigation with internal links. The PDF document is page 0 based.",
                "Also consider standardizations such as repetition of terms across pages, such as the document title in a smaller font, its type in a stronger font, and reference number.",
                "Return a single JSON object with a main key 'mapeamento_semantico', containing a list of objects.",
                "Each object in the list should represent a semantic element and have the following keys: 'semantic_type', 'old_text' (content from the original PDF document section, or null if not found in the section), 'new_text' (content from the template PDF document section, or null if not found in the section), 'idml_text' (relevant content from the target IDML files, or null), 'match' (booleano indicating if the semantic type exists in both PDF documents within the corresponding section and is consistent with the IDML text, even if the text is completely different, always prioritizing the semantic type), and 'repeated_or_similar' (boolean indicating if the text element appears to be repeated or very similar across the old and new PDF documents on this page).",
                "If any of the extractions return a null value for old_text, new_text, or idml_text for a given semantic type, mark the match as false.",
                "CONTENT TO ANALYZE:",
                "\\n--- ORIGINAL PDF PAGE IMAGE ---\\n",
                old_image_pdf if old_image_pdf else "No image available",
                "\\n--- ORIGINAL PDF FULL TEXT ---\\n",
                json.dumps(old_page_text, ensure_ascii=False) if old_page_text else "No text found",
                "\\n--- ORIGINAL PDF SECTIONS (context) ---\\n",
                json.dumps(old_sections, ensure_ascii=False) if old_sections else "No sections found",
                "\\n--- TEMPLATE PDF PAGE IMAGE ---\\n",
                new_image_pdf if new_image_pdf else "No image available", 
                "\\n--- TEMPLATE PDF FULL TEXT ---\\n",
                json.dumps(new_page_text, ensure_ascii=False) if new_page_text else "No text found",
                "\\n--- TEMPLATE PDF SECTIONS (context) ---\\n",
                json.dumps(new_sections, ensure_ascii=False) if new_sections else "No sections found",
                "\\n--- IDML CONTENT (for reference) ---\\n",
                json.dumps(idml_stories_text, ensure_ascii=False) if idml_stories_text else "No IDML content"
            ]

            # 5. Chamar a API Gemini
            print(f"Analisando os documentos (Página {page_num})...")
            
            try:
                generated_content = ai.models.generate_content(contents=prompt_parts, model='gemini-2.0-flash', config=generation_config)
                response[page_num] = json.loads(generated_content.text)
                response[page_num]["mapeamento_semantico"].extend(response[0]["mapeamento_semantico"])
                print(f">>> Resultado da Análise Semântica (Página {page_num}):")
                print(json.dumps(response[page_num], indent=2, ensure_ascii=False))
            except Exception as e:
                print(f"❌ Ocorreu um erro ao analisar a página {page_num}: {e}")
                response[page_num] = {"error": str(e)}

        old_doc.close()
        new_doc.close()
        return response

    except Exception as e:
        print(f"❌ Ocorreu um erro ao realizar a análise: {e}")
        return None


## Image Analysis with AI

In [13]:
def ai_doc_image_analysis(pdf_old_path: str, pdf_new_path: str) -> dict:
    """
    Analyzes and compares images on each page of two PDF documents using Gemini.

    Args:
        pdf_old_path: The path to the original PDF file.
        pdf_new_path: The path to the new PDF file.

    Returns:
        A dictionary containing the image analysis results for each page,
        or None if an error occurred.
    """
    response = {}
    try:
        # Open the PDF documents
        old_doc = fitz.open(pdf_old_path)
        new_doc = fitz.open(pdf_new_path)

        # Ensure both documents have the same number of pages for comparison
        if old_doc.page_count != new_doc.page_count:
            print("Warning: Documents have different page counts. Comparison will be limited.")
            # Decide how to handle this: compare up to the minimum number of pages, or raise an error
            # For now, we'll compare up to the minimum number of pages
            num_pages = min(old_doc.page_count, new_doc.page_count)
        else:
            num_pages = old_doc.page_count

        for page_num in range(num_pages):
            page_from_old = old_doc.load_page(page_num)
            page_from_new = new_doc.load_page(page_num)

            old_image_pdf = pdf_page_to_image(page_from_old)
            new_image_pdf = pdf_page_to_image(page_from_new)

            # Build the prompt for the Gemini API
            prompt_parts = [
                "Act as an image analysis expert. Compare the images provided from two PDF documents, page by page.",
                "Identify and describe the images present on each page.",
                "Compare the images on the corresponding pages of the two documents.",
                "Note any differences or similarities in the images, including their content, position, size, or style.",
                "For each page, provide a summary of the image comparison.",
                "Return a single JSON object with a key 'image_analysis' and the value is an object containing:",
                "'old_images': A list of descriptions of images found on the old document's page, or null if no images.",
                "'new_images': A list of descriptions of images found on the new document's page, or null if no images.",
                "'comparison_summary': A text summary of the comparison between the images on this page.",
                "'action': 'migrate' when the description is similar enough to replace in the new design and 'non-migrate' when it is not necessary to replace"
                "\n--- IMAGE FROM OLD DOCUMENT PAGE ---",
                old_image_pdf if old_image_pdf else "No image for old document on this page.",
                "\n--- IMAGE FROM NEW DOCUMENT PAGE ---",
                new_image_pdf if new_image_pdf else "No image for new document on this page."
            ]

            # Call the Gemini API
            print(f"Analyzing images on page {page_num}...")
            ai = genai.Client(api_key=API_KEY)
            generation_config = {
                "temperature": 0.1,
                "response_mime_type": "application/json",
            }
            try:
                generated_content = ai.models.generate_content(
                    contents=prompt_parts,
                    model='gemini-2.0-flash', # Or another suitable model
                    config=generation_config
                )
                response[page_num] = json.loads(generated_content.text)
                print(f">>> Image Analysis Result (Page {page_num + 1}):")
                print(json.dumps(response[page_num], indent=2, ensure_ascii=False))
            except Exception as e:
                print(f"❌ An error occurred while analyzing images on page {page_num + 1}: {e}")
                response[page_num] = {"error": str(e)}

        old_doc.close()
        new_doc.close()
        return response

    except FileNotFoundError:
        print("Error: One or both PDF files not found.")
        return None
    except Exception as e:
        print(f"❌ An error occurred during image analysis: {e}")
        return None

## Execution

### Text

In [87]:
analysis_result = gemini_doc_analysis(pdf_old_path, pdf_new_path, new_template_dir)

Analisando os documentos (Página 0)...
>>> Resultado da Análise Semântica (Página 0):
{
  "mapeamento_semantico": [
    {
      "semantic_type": "document_title",
      "old_text": "CryoPure Röhren",
      "new_text": "Kleinvolumige S-Monovette® 1,6 ml Serum CAT / Lithium-Heparin LH",
      "idml_text": "Kleinvolumige S-Monovette\n\t\t\t®\n\t\t\t 1,6 ml  Serum CAT / Lithium-Heparin LH\n\t\t\t",
      "match": true
    },
    {
      "semantic_type": "subtitle",
      "old_text": "Produktinformation 958 / 72 / 0823",
      "new_text": "Produktinformation",
      "idml_text": "Produktinformation\n\t\t\t",
      "match": true
    },
    {
      "semantic_type": "author",
      "old_text": "Susanne Bäß, Produktmanagement Life Science",
      "new_text": "Dr. Christa Seipelt, Produktmanagerin Präanalytik",
      "idml_text": null,
      "match": true
    },
    {
      "semantic_type": "date",
      "old_text": null,
      "new_text": "März 2025",
      "idml_text": "März 2025\n\t\t\t",
   

In [88]:
migration_plan = {}

In [89]:
# Iterate through the pages in the analysis result

migration_plan = {}
for page_num, page_data in analysis_result.items():
    migration_plan[page_num] = []  # Initialize migration plan for this page

    # Get the semantic mapping for the current page
    semantic_map_page = page_data.get('mapeamento_semantico', [])

    # Iterate through the semantic elements on the page
    for item in semantic_map_page:
        if item.get('match') and not item.get('repeated_or_similar'):
            migration_plan[page_num].append({
                "type": item.get('semantic_type'),
                "conteudo_a_migrar": item.get('old_text'),
                "placeholder": item.get('new_text'),
                "match": item.get('match'),
                "notes": f"Match status: {item.get('match')}" # Add a note about the match status
            })

# Example of how to use the migration_plan:
for page_num, page_migration_items in migration_plan.items():
    print(f"Migration plan for page {page_num + 1}:")
    for item in page_migration_items:
        print(f"Tipo: [{item['type']}] - Substituir '{item['placeholder']}' por '{item['conteudo_a_migrar']}'")
    print()


Migration plan for page 1:
Tipo: [document_title] - Substituir 'Kleinvolumige S-Monovette® 1,6 ml Serum CAT / Lithium-Heparin LH' por 'CryoPure Röhren'
Tipo: [subtitle] - Substituir 'Produktinformation' por 'Produktinformation 958 / 72 / 0823'
Tipo: [author] - Substituir 'Dr. Christa Seipelt, Produktmanagerin Präanalytik' por 'Susanne Bäß, Produktmanagement Life Science'
Tipo: [institution] - Substituir 'None' por 'www.sarstedt.com'
Tipo: [logo_text] - Substituir 'SARSTEDT' por 'SARSTEDT'
Tipo: [cover_footer] - Substituir 'Nur zum internen Gebrauch' por '+++ Nur zum internen Gebrauch +++'

Migration plan for page 2:
Tipo: [title] - Substituir 'Produktinformation' por 'Produktänderung CryoPure Röhren'
Tipo: [title_level_1] - Substituir 'Kundenkreis' por 'Übersicht Kundenkreis und 
	
Schlagwörter'
Tipo: [document_title] - Substituir 'Kleinvolumige S-Monovette® 1,6 ml Serum CAT / Lithium-Heparin LH' por 'CryoPure Röhren'
Tipo: [subtitle] - Substituir 'Produktinformation' por 'Produktinfor

### Image

In [104]:
# Example usage:
analysis_results = ai_doc_image_analysis(pdf_old_path, pdf_new_path)
if analysis_results:
    # Process the analysis_results dictionary
    print("Image analysis completed successfully.")


Analyzing images on page 0...
>>> Image Analysis Result (Page 1):
{
  "image_analysis": {
    "old_images": [
      "The image shows a product advertisement for CryoPure Röhren. It features several test tubes with different colored caps, arranged in a row. The tubes appear to contain a red liquid. The Sarstedt logo is visible in the bottom right corner."
    ],
    "new_images": [
      "The image shows a product advertisement for Kleinvolumige S-Monovette® 1,6 ml Serum CAT / Lithium-Heparin LH. It features several test tubes with different colored caps, arranged in a row. The tubes appear to contain a red liquid. The Sarstedt logo is visible in the top left corner."
    ],
    "comparison_summary": "The old image shows CryoPure Röhren test tubes, while the new image shows Kleinvolumige S-Monovette® test tubes. Both images feature a row of test tubes with different colored caps and a red liquid inside. The Sarstedt logo is present in both images, but in different locations. The overall

In [23]:
# Example Usage:
# Assuming 'analysis_result' is the output from ai_doc_image_analysis
image_migration_plan = build_image_migration_plan(analysis_results)
print(image_migration_plan)

# Print the generated image migration plan
for page_num, migration_items in image_migration_plan.items():
    print(f"Image Migration Plan for Page {page_num + 1}:")
    if migration_items:
        for item in migration_items:
            print(f"- Action: {item.get('action')}")
            print(f"  Old Image: {item.get('old_image_description')}")
            if item.get('action') == 'replace_image':
                print(f"  New Image Target: {item.get('new_image_description')}")
            elif item.get('action') == 'insert_image':
                 print(f" Target Location Notes: {item.get('target_location_notes')}")
            print(f"  Notes: {item.get('notes')}")
    else:
        print("  No image migration needed for this page.")
    print()

{}


## Content Replacement

### Text Replacement

In [90]:
def get_content_text_from_element(element):
    text = ""
    for sub_element in element.iter():
        if sub_element.tag == 'Content' and sub_element.text:
            text += sub_element.text
    return text

In [91]:
def replace_and_consolidate_content(placeholder: str, novo_conteudo: str, stories_path: Path) -> bool:
    """Encontra o arquivo de story que contém o placeholder e substitui seu conteúdo."""
    for story_file in stories_path.glob("*.xml"):
        try:
            # Usar um parser que lida com namespaces é uma boa prática
            parser = ET.XMLParser(encoding="utf-8")
            tree = ET.parse(story_file, parser=parser)
            root = tree.getroot()

            # Flag para indicar se houve modificação no arquivo
            arquivo_modificado = False

            # Encontra todos os ParagraphStyleRange elements
            for paragraph_style_range in root.findall('.//ParagraphStyleRange'):
                
                # Usa a função para obter texto combinado de tags <Content> no parágrafo
                combined_content_text = get_content_text_from_element(paragraph_style_range)

                # Verifica se o texto combinado contém o placeholder
                if placeholder in combined_content_text:
                    print(f"   -> Encontrado placeholder '{placeholder}' em '{story_file.name}'. Substituindo...")
                    
                    # --- Lógica de Substituição e Consolidação ---
                    
                    # Encontra todos os CharacterStyleRange elements no parágrafo
                    char_style_ranges_in_paragraph = paragraph_style_range.findall('.//CharacterStyleRange')
                    
                    # Encontra a sequência de CharacterStyleRange que contém o placeholder
                    sequence_found = False
                    sequence_start_index = -1
                    sequence_end_index = -1
                    current_combined_sequence_text = ""
                    elements_to_remove = []

                    for i, char_style_range in enumerate(char_style_ranges_in_paragraph):
                        content_text_in_char_style = get_content_text_from_element(char_style_range)
                        
                        # Verifica se estamos construindo a sequência correta
                        if (placeholder.startswith(current_combined_sequence_text + content_text_in_char_style) or 
                            current_combined_sequence_text == ""):
                            
                            if sequence_start_index == -1:
                                sequence_start_index = i
                            current_combined_sequence_text += content_text_in_char_style
                            
                            # Se encontramos o placeholder completo na sequência
                            if placeholder in current_combined_sequence_text:
                                sequence_end_index = i
                                sequence_found = True
                                break
                        else:
                            # Reset se a sequência não corresponde
                            current_combined_sequence_text = ""
                            sequence_start_index = -1
                            sequence_end_index = -1

                    if sequence_found:
                        # Identifica os elementos para remover
                        elements_to_remove = char_style_ranges_in_paragraph[sequence_start_index:sequence_end_index + 1]
                        insertion_point_index = sequence_start_index

                        # Lista dos filhos diretos do paragraph_style_range
                        children_to_process = list(paragraph_style_range)
                        removed_count = 0

                        # Remove os elementos identificados
                        for child in children_to_process:
                            if child in elements_to_remove:
                                paragraph_style_range.remove(child)
                                removed_count += 1

                        # Ajusta o ponto de inserção baseado nos elementos removidos
                        insertion_point_index = max(0, insertion_point_index - removed_count)

                        # Cria o novo CharacterStyleRange consolidado
                        new_char_style_range = ET.Element('CharacterStyleRange')
                        
                        # Copia os atributos do primeiro elemento removido (se existir)
                        if elements_to_remove:
                            for attr, value in elements_to_remove[0].items():
                                new_char_style_range.set(attr, value)

                        # Adiciona o novo conteúdo, tratando quebras de linha
                        linhas = novo_conteudo.split('\n')
                        for i, linha in enumerate(linhas):
                            if i > 0:
                                ET.SubElement(new_char_style_range, 'Br')
                            content_tag = ET.SubElement(new_char_style_range, 'Content')
                            content_tag.text = linha

                        # Insere o novo elemento no ponto calculado
                        paragraph_style_range.insert(insertion_point_index, new_char_style_range)
                        
                        arquivo_modificado = True
                        print(f"    * Substituído e consolidado: '{placeholder}' -> '{novo_conteudo}'")
                    else:
                        print(f"    # Não foi possível encontrar a sequência de CharacterStyleRange contendo '{placeholder}'")

            # Salva as alterações se o arquivo foi modificado
            if arquivo_modificado:
                tree.write(story_file, encoding="UTF-8", xml_declaration=True)
                return True  # Modificação bem-sucedida

        except Exception as e:
            print(f"   -> Erro ao processar o arquivo {story_file.name}: {e}")
    
    return False  # Placeholder não encontrado em nenhuma story

In [33]:
# antigo
def encontrar_e_substituir_na_story(placeholder: str, novo_conteudo: str, stories_path: Path) -> bool:
    """Encontra o arquivo de story que contém o placeholder e substitui seu conteúdo."""
    for story_file in stories_path.glob("*.xml"):
        try:
            # Usar um parser que lida com namespaces é uma boa prática
            parser = ET.XMLParser(encoding="utf-8")
            tree = ET.parse(story_file, parser=parser)
            root = tree.getroot()

            # Precisamos verificar se o placeholder está no arquivo antes de modificar
            raw_text = ET.tostring(root, encoding='unicode')
            if placeholder not in raw_text:
                continue

            # Encontra o elemento de parágrafo/estilo que contém o placeholder
            # todo ajustar para quando houver quebra de linha
            for elem in root.findall('.//ParagraphStyleRange//CharacterStyleRange'):
                content_tags = elem.findall('Content')
                # Recria o texto dentro da tag de estilo para uma busca precisa
                current_text = "".join([c.text for c in content_tags if c.text])
                
                if placeholder in current_text:
                    print(f"   -> Encontrado placeholder em '{story_file.name}'. Substituindo...")
                    
                    # Limpa o conteúdo antigo (tags <Content> e <Br />)
                    for sub_elem in list(elem):
                        if sub_elem.tag in ['Content', 'Br']:
                            elem.remove(sub_elem)
                    
                    # Adiciona o novo conteúdo, tratando quebras de linha
                    linhas = novo_conteudo.split('\n')
                    for i, linha in enumerate(linhas):
                        if i > 0:
                            ET.SubElement(elem, 'Br')
                        content_tag = ET.SubElement(elem, 'Content')
                        content_tag.text = linha
                    
                    # Salva as alterações no arquivo XML
                    tree.write(story_file, encoding="UTF-8", xml_declaration=True)
                    return True # Modificação bem-sucedida

        except Exception as e:
            print(f"   -> Erro ao processar o arquivo {story_file.name}: {e}")
    
    return False # Placeholder não encontrado em nenhuma story

In [107]:
# 6. Executar a migração
# Primeiro, crie uma cópia limpa do template para modificação
if os.path.exists(migrated_dir):
    print(f"🧹 Limpando diretório de migração anterior: '{migrated_dir}'")
    shutil.rmtree(migrated_dir)

print(f"©️ Copiando template de '{new_template_dir}' para '{migrated_dir}'...")
shutil.copytree(new_template_dir, migrated_dir)

# Agora, aplique as modificações na cópia
print("✍️  Iniciando a substituição de conteúdo nos arquivos de Story...")
stories_path_migrado = migrated_dir / "Stories"
sucessos = 0
for index in migration_plan.keys():
    for item in migration_plan[index]:
        if replace_and_consolidate_content(item['placeholder'], item['conteudo_a_migrar'], stories_path_migrado):
            sucessos += 1

if sucessos > 0:
    print(f"✅ {sucessos} substituições de conteúdo realizadas com sucesso.")
else:
    print("⚠️ Nenhuma substituição de conteúdo foi realizada. Verifique os placeholders.")


©️ Copiando template de 'data/new' para 'data/migrated'...
✍️  Iniciando a substituição de conteúdo nos arquivos de Story...
   -> Encontrado placeholder 'Kleinvolumige S-Monovette® 1,6 ml Serum CAT / Lithium-Heparin LH' em 'Story_u172.xml'. Substituindo...
    * Substituído e consolidado: 'Kleinvolumige S-Monovette® 1,6 ml Serum CAT / Lithium-Heparin LH' -> 'CryoPure Röhren'
   -> Encontrado placeholder 'Produktinformation' em 'Story_u67d.xml'. Substituindo...
    # Não foi possível encontrar a sequência de CharacterStyleRange contendo 'Produktinformation'
   -> Encontrado placeholder 'Produktinformation' em 'Story_u15c.xml'. Substituindo...
    * Substituído e consolidado: 'Produktinformation' -> 'Produktinformation 958 / 72 / 0823'
   -> Encontrado placeholder 'Dr. Christa Seipelt, Produktmanagerin Präanalytik' em 'Story_u1b0.xml'. Substituindo...
    * Substituído e consolidado: 'Dr. Christa Seipelt, Produktmanagerin Präanalytik' -> 'Susanne Bäß, Produktmanagement Life Science'
   

### Image Replacement

In [None]:
# Processamento de PDF
import fitz  # PyMuPDF - já em uso
# import PyPDF2  # Alternativa para metadados

# Processamento de imagens
from PIL import Image
import cv2  # Para transformações avançadas

# IDML
import xml.etree.ElementTree as ET
import zipfile  # IDML são arquivos ZIP
import uuid  # Para IDs únicos

# IA e análise
import google.generativeai as genai  # Já em uso
import json

# Para trabalhar com IDML de forma mais robusta
# pip install idml-tools
from idml import IDML

# Para análise de layout mais avançada
# pip install layoutparser
import layoutparser as lp

In [97]:
def enhanced_image_analysis(pdf_old_path: str, pdf_new_path: str) -> dict:
    # Adicionar às informações retornadas:
    return {
        'page_num': page_num,
        'old_images': [...],
        'new_images': [...],
        'comparison_summary': "...",
        'action': 'migrate|non-migrate',
        # NOVOS CAMPOS:
        'image_coordinates': {
            'old': [(x, y, width, height), ...],
            'new': [(x, y, width, height), ...]
        },
        'image_properties': {
            'old': [{'dpi': 300, 'format': 'jpeg', 'size_kb': 150}, ...],
            'new': [{'dpi': 300, 'format': 'png', 'size_mb': 2.1}, ...]
        },
        'migration_mapping': [
            {'old_index': 0, 'new_index': 1, 'confidence': 0.85}
        ]
    }

In [98]:
def extract_pdf_structure(pdf_path: str) -> dict:
    """Extrai estrutura completa do PDF incluindo textos, imagens e layout"""
    return {
        'pages': [
            {
                'page_num': 0,
                'dimensions': (width, height),
                'images': [
                    {
                        'bbox': (x0, y0, x1, y1),
                        'image_data': base64_data,
                        'properties': {...}
                    }
                ],
                'text_blocks': [
                    {
                        'bbox': (x0, y0, x1, y1),
                        'text': "conteúdo",
                        'font': "Arial",
                        'size': 12
                    }
                ]
            }
        ]
    }

In [99]:
def process_migration_images(analysis_result: dict) -> dict:
    """Processa imagebs marcadas para migração"""
    processed_images = {}
    
    for page_num, page_data in analysis_result.items():
        if page_data.get('action') == 'migrate':
            # Extrair imagens do PDF antigo
            # Otimizar qualidade/formato se necessário
            # Preparar para inserção no IDML
            processed_images[page_num] = {
                'source_images': extract_images_from_old_pdf(),
                'target_positions': get_new_pdf_positions(),
                'transformations': calculate_scaling_rotation()
            }
    
    return processed_images

In [101]:
def prepare_idml_structure(new_template_dir: str, migrated_dir: str) -> str:
    """
    Prepara estrutura IDML copiando template e limpando migração anterior
    Similar ao processo atual para textos, mas adaptado para imagens
    """
    import shutil
    from pathlib import Path
    
    migrated_path = Path(migrated_dir)
    new_template_path = Path(new_template_dir)
    
    # Limpeza do diretório anterior (igual ao processo atual)
    if migrated_path.exists():
        print(f"🧹 Limpando diretório de migração anterior: '{migrated_dir}'")
        shutil.rmtree(migrated_path)
    
    # Cópia do template (igual ao processo atual)
    print(f"©️ Copiando template de '{new_template_dir}' para '{migrated_dir}'...")
    shutil.copytree(new_template_path, migrated_path)
    
    # Verificar se é um IDML válido (descompactar se necessário)
    if new_template_path.suffix.lower() == '.idml':
        # Se o template é um arquivo IDML, descompactar
        idml_extracted = migrated_path / "idml_extracted"
        with zipfile.ZipFile(new_template_path, 'r') as zip_ref:
            zip_ref.extractall(idml_extracted)
        return str(idml_extracted)
    else:
        # Se já é um diretório descompactado
        return str(migrated_path)

def create_idml_structure(template_path: str, output_dir: str) -> dict:
    """
    Cria estrutura IDML baseada em template existente
    Retorna caminhos para os diretórios principais
    """
    # Preparar estrutura baseada no template
    working_dir = prepare_idml_structure(template_path, output_dir)
    
    return {
        'working_directory': working_dir,
        'spreads_dir': os.path.join(working_dir, 'Spreads'),
        'stories_dir': os.path.join(working_dir, 'Stories'),
        'resources_dir': os.path.join(working_dir, 'Resources'),
        'links_dir': os.path.join(working_dir, 'Links'),  # Para imagens externas
        'designmap_path': os.path.join(working_dir, 'designmap.xml'),
        'preferences_path': os.path.join(working_dir, 'preferences.xml')
    }

In [102]:
def migrate_images_to_idml_template(idml_structure: dict, processed_images: dict, image_placeholders: dict) -> int:
    """
    Migra imagens para template IDML seguindo padrão similar ao replace_and_consolidate_content
    
    Args:
        idml_structure: Estrutura de diretórios do IDML
        processed_images: Imagens processadas para migração  
        image_placeholders: Dicionário com placeholders de imagem no template
        
    Returns:
        Número de imagens migradas com sucesso
    """
    spreads_dir = idml_structure['spreads_dir']
    links_dir = idml_structure['links_dir']
    
    # Criar diretório Links se não existir (para imagens externas)
    os.makedirs(links_dir, exist_ok=True)
    
    sucessos = 0
    print("🖼️  Iniciando a migração de imagens para o template IDML...")
    
    for page_num, image_data in processed_images.items():
        if image_data.get('action') == 'migrate':
            for image_info in image_data.get('images_to_migrate', []):
                placeholder_id = image_info.get('placeholder_id')  # ex: "IMAGE_PLACEHOLDER_001"
                image_content = image_info.get('image_data')
                image_filename = image_info.get('filename', f'migrated_image_{page_num}_{sucessos}.jpg')
                
                # Salvar imagem no diretório Links
                image_path = os.path.join(links_dir, image_filename)
                save_image_to_file(image_content, image_path)
                
                # Substituir placeholder nos arquivos Spread
                if replace_image_placeholder(placeholder_id, image_filename, spreads_dir):
                    sucessos += 1
                    print(f"✅ Imagem migrada: {image_filename} -> {placeholder_id}")
                else:
                    print(f"⚠️ Falha ao migrar imagem: {placeholder_id}")
    
    return sucessos

def replace_image_placeholder(placeholder_id: str, image_filename: str, spreads_dir: str) -> bool:
    """
    Substitui placeholder de imagem nos arquivos XML do Spread
    Similar ao replace_and_consolidate_content mas para imagens
    """
    try:
        # Percorrer todos os arquivos .xml na pasta Spreads
        for spread_file in Path(spreads_dir).glob("*.xml"):
            tree = ET.parse(spread_file)
            root = tree.getroot()
            
            # Procurar por elementos Image ou Rectangle com o placeholder
            for elem in root.iter():
                # Verificar se elemento contém o placeholder
                if placeholder_id in str(elem.attrib.get('href', '')):
                    # Atualizar referência para nova imagem
                    elem.set('href', f'file://Links/{image_filename}')
                    
                    # Salvar alterações
                    tree.write(spread_file, encoding='utf-8', xml_declaration=True)
                    return True
                    
        return False
    except Exception as e:
        print(f"❌ Erro ao substituir placeholder {placeholder_id}: {e}")
        return False

def save_image_to_file(image_data, output_path: str):
    """Salva dados de imagem em arquivo"""
    if isinstance(image_data, str):  # Base64
        import base64
        with open(output_path, 'wb') as f:
            f.write(base64.b64decode(image_data))
    else:  # Dados binários diretos
        with open(output_path, 'wb') as f:
            f.write(image_data)

In [103]:
def migrate_pdf_images_to_idml(pdf_old_path: str, pdf_new_path: str, 
                               new_template_path: str, migrated_output_path: str):
    """
    Função principal que orquestra todo o processo seguindo o padrão atual de migração
    """
    
    # Etapa 1: Análise de imagens (sua função atual)
    print("🔍 Iniciando análise de imagens entre PDFs...")
    analysis_result = ai_doc_image_analysis(pdf_old_path, pdf_new_path)
    
    if not analysis_result:
        print("❌ Falha na análise de imagens.")
        return
    
    # Etapa 2: Preparação da estrutura IDML (baseada no seu processo atual)
    print("📁 Preparando estrutura IDML...")
    idml_structure = create_idml_structure(new_template_path, migrated_output_path)
    
    # Etapa 3: Processamento de imagens para migração
    print("🖼️  Processando imagens para migração...")
    processed_images = process_migration_images(analysis_result, pdf_old_path)
    
    # Etapa 4: Migração das imagens (similar ao replace_and_consolidate_content)
    print("✍️  Iniciando a migração de imagens para o template...")
    image_placeholders = detect_image_placeholders(idml_structure)
    sucessos = migrate_images_to_idml_template(idml_structure, processed_images, image_placeholders)
    
    # Etapa 5: Finalização (similar ao seu processo atual)
    if sucessos > 0:
        print(f"✅ {sucessos} imagens migradas com sucesso.")
        
        # Compactar de volta para IDML
        final_idml_path = f"{migrated_output_path}.idml"
        compress_to_idml(idml_structure['working_directory'], final_idml_path)
        print(f"📦 Arquivo IDML final gerado: {final_idml_path}")
        
    else:
        print("⚠️ Nenhuma imagem foi migrada. Verifique os placeholders de imagem.")

def compress_to_idml(working_dir: str, output_idml_path: str):
    """Compacta diretório de trabalho de volta para arquivo IDML"""
    with zipfile.ZipFile(output_idml_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(working_dir):
            for file in files:
                file_path = os.path.join(root, file)
                arc_name = os.path.relpath(file_path, working_dir)
                zipf.write(file_path, arc_name)

def detect_image_placeholders(idml_structure: dict) -> dict:
    """
    Detecta placeholders de imagem no template IDML
    Retorna mapeamento de placeholders encontrados
    """
    placeholders = {}
    spreads_dir = idml_structure['spreads_dir']
    
    for spread_file in Path(spreads_dir).glob("*.xml"):
        tree = ET.parse(spread_file)
        root = tree.getroot()
        
        # Procurar por elementos com referências que parecem placeholders
        for elem in root.iter():
            href = elem.attrib.get('href', '')
            if 'PLACEHOLDER' in href.upper() or 'IMAGE_' in href.upper():
                placeholders[href] = spread_file.name
                
    return placeholders

In [106]:
migrate_pdf_images_to_idml(
    pdf_old_path=pdf_old_path,
    pdf_new_path=pdf_new_path,
    new_template_path=f"./{idml_final_path}",  # ou diretório descompactado
    migrated_output_path=migrated_dir  # será criado automaticamente
)

🔍 Iniciando análise de imagens entre PDFs...
Analyzing images on page 0...
>>> Image Analysis Result (Page 1):
{
  "image_analysis": {
    "old_images": [
      "The image displays a product information page for 'CryoPure Röhren' (CryoPure Tubes). It features several vials with different colored caps and varying levels of red liquid inside, arranged in a row. The vials are the main focus, and the background is plain white. Text elements include the product name, a product information number (958/72/0823), and the name of a product manager (Susanne Baß). The Sarstedt logo and website address are also present."
    ],
    "new_images": [
      "The image displays a product information page featuring a row of 'Kleinvolumige S-Monovette® 1,6 ml Serum CAT / Lithium-Heparin LH' tubes. These tubes have different colored caps and labels, and they contain varying levels of liquid. The background is a light gradient. Text elements include the product name, the Sarstedt logo, a product informatio

NotADirectoryError: [Errno 20] Not a directory: 'data/migrated.idml'

## IDML Export

In [93]:
def criar_pacote_idml(source_dir: Path, output_filename: str):
    """Cria um arquivo .idml (zip) a partir de um diretório de origem."""
    print(f"📦 Empacotando o resultado em '{output_filename}'...")
    with zipfile.ZipFile(output_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(source_dir):
            for file in files:
                file_path = Path(root) / file
                # O caminho no arquivo zip deve ser relativo ao diretório de origem
                archive_name = file_path.relative_to(source_dir)
                zipf.write(file_path, archive_name)
    print(f"Pacote '{output_filename}' criado com sucesso!")

In [94]:
# 7. Empacotar o resultado final em um novo arquivo .idml
criar_pacote_idml(migrated_dir, idml_final_path)

print("\n🎉 Processo de migração concluído!")
print(f"Seu novo documento está pronto em: ./{idml_final_path}")
print("Abra este arquivo no Adobe InDesign para verificar e exportar como PDF.")

📦 Empacotando o resultado em 'data/migrated.idml'...
Pacote 'data/migrated.idml' criado com sucesso!

🎉 Processo de migração concluído!
Seu novo documento está pronto em: ./data/migrated.idml
Abra este arquivo no Adobe InDesign para verificar e exportar como PDF.
