In [10]:
import fitz  # PyMuPDF
import os
import glob

# === Step 1: Locate the latest Boletim_da_PI_-_YYYY-MM-DD.pdf ===
pdf_pattern = "Boletim_da_PI_-_*.pdf"
pdf_files = glob.glob(pdf_pattern)

if not pdf_files:
    print("No Boletim_da_PI PDF file found in current directory.")
    exit(1)

# Sort by modified time descending to get the latest
pdf_files.sort(key=os.path.getmtime, reverse=True)
latest_pdf = pdf_files[0]
print(f"Latest PDF found: {latest_pdf}")

# === Step 2: Define output folder ===
output_folder = "extracted_images"

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# === Step 3: Open the PDF ===
pdf_file = fitz.open(latest_pdf)

image_count = 0

# === Step 4: Iterate through each page and extract images ===
for page_num in range(len(pdf_file)):
    page = pdf_file[page_num]
    images = page.get_images(full=True)

    for img_index, img in enumerate(images):
        xref = img[0]
        base_image = pdf_file.extract_image(xref)
        image_bytes = base_image["image"]
        image_ext = base_image["ext"]

        # === Step 5: Save image ===
        image_filename = f"page{page_num + 1}_img{img_index + 1}.{image_ext}"
        image_filepath = os.path.join(output_folder, image_filename)

        with open(image_filepath, "wb") as img_file:
            img_file.write(image_bytes)

        image_count += 1

print(f"Extraction completed: {image_count} images saved in '{output_folder}' folder.")


Latest PDF found: Boletim_da_PI_-_2025-07-11_split_columns.pdf
Extraction completed: 104 images saved in 'extracted_images' folder.


In [15]:
import fitz  # PyMuPDF
import os
import glob
import re

# === Step 1: Locate the latest Boletim_da_PI_-_YYYY-MM-DD.pdf ===
pdf_pattern = "Boletim_da_PI_-_*.pdf"
pdf_files = glob.glob(pdf_pattern)

if not pdf_files:
    print("No Boletim_da_PI PDF file found in current directory.")
    exit(1)

# Sort by modified time descending to get the latest
pdf_files.sort(key=os.path.getmtime, reverse=True)
latest_pdf = pdf_files[0]
print(f"Latest PDF found: {latest_pdf}")

# === Step 2: Define output folder ===
output_folder = "extracted_images_by_210"

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# === Step 3: Open the PDF ===
pdf_file = fitz.open(latest_pdf)

image_count = 0

# === Step 4: Iterate through each page ===
for page_num in range(len(pdf_file)):
    page = pdf_file[page_num]
    text = page.get_text("text")
    
    # === Step 4.1: Extract the (210) code ===
    match_210 = re.search(r"\(210\)\s*(\S+)", text)
    code_210 = match_210.group(1) if match_210 else f"page{page_num+1}"
    
    # === Step 4.2: Check if (540) exists on this page ===
    has_540 = "(540)" in text

    if has_540:
        images = page.get_images(full=True)
        
        # === Step 4.3: Extract each image and name with (210) code ===
        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = pdf_file.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            image_filename = f"{code_210}_img{img_index + 1}.{image_ext}"
            image_filepath = os.path.join(output_folder, image_filename)

            with open(image_filepath, "wb") as img_file:
                img_file.write(image_bytes)

            image_count += 1
            print(f"Saved {image_filename}")

print(f"\nExtraction completed: {image_count} images saved in '{output_folder}' folder.")


Latest PDF found: Boletim_da_PI_-_2025-07-11_split_columns.pdf
Saved 747351_img1.jpeg
Saved 747351_img2.jpeg
Saved 747351_img3.jpeg
Saved 747471_img1.jpeg
Saved 747471_img2.jpeg
Saved 747471_img3.jpeg
Saved 748157_img1.jpeg
Saved 748157_img2.jpeg
Saved 748217_img1.jpeg
Saved 748217_img2.jpeg
Saved 748226_img1.jpeg
Saved 748226_img2.jpeg
Saved 748243_img1.jpeg
Saved 748243_img2.jpeg
Saved 748243_img3.jpeg
Saved 748244_img1.jpeg
Saved 748244_img2.jpeg
Saved 748244_img3.jpeg
Saved 748247_img1.jpeg
Saved 748247_img2.jpeg
Saved 748247_img3.jpeg
Saved 748247_img4.jpeg
Saved 748251_img1.jpeg
Saved 748251_img2.jpeg
Saved 748251_img3.jpeg
Saved 748251_img4.jpeg
Saved 748285_img1.jpeg
Saved 748285_img2.jpeg
Saved 748285_img3.jpeg
Saved 748288_img1.jpeg
Saved 748288_img2.jpeg
Saved 748288_img3.jpeg
Saved 748293_img1.jpeg
Saved 748293_img2.jpeg
Saved 748293_img3.jpeg
Saved 748306_img1.jpeg
Saved 748306_img2.jpeg
Saved 748306_img3.jpeg
Saved 748307_img1.jpeg
Saved 748307_img2.jpeg
Saved 748311_img1

New try 

In [20]:
import glob
import fitz  # PyMuPDF
import re
import os

pdf_pattern = "Boletim_da_PI_-_*.pdf"
pdf_files = glob.glob(pdf_pattern)
if not pdf_files:
    print("No PDF found matching pattern")
    exit(1)

pdf_files.sort()
latest_pdf = pdf_files[-1]
print(f"Processing file: {latest_pdf}")

doc = fitz.open(latest_pdf)
pattern_210 = re.compile(r"\(210\)\s*(\d+)")

output_dir = "extracted_images"
os.makedirs(output_dir, exist_ok=True)

image_count = 0

for page_num in range(len(doc)):
    page = doc.load_page(page_num)

    # Extract all text from page and find all (210) with their order in the text
    text = page.get_text("text")
    # Find all (210) occurrences in order
    matches_210 = [(m.start(), m.group(1)) for m in pattern_210.finditer(text)]

    # Extract images from the page in the order they appear
    images = page.get_images(full=True)

    if not matches_210:
        print(f"Page {page_num+1}: No (210) found")
        continue

    if not images:
        print(f"Page {page_num+1}: No images found")
        continue

    # Assign images to (210) codes in order
    # If fewer images than (210), some (210) have no image
    # If more images than (210), assign images in a round-robin or only up to number of (210)

    for idx, img in enumerate(images):
        xref = img[0]
        # assign the (210) code in sequence
        code_index = min(idx, len(matches_210) - 1)
        current_210 = matches_210[code_index][1]

        base_image = doc.extract_image(xref)
        image_bytes = base_image["image"]
        image_ext = base_image["ext"]
        image_name = f"{current_210}_page{page_num+1}_img{idx+1}.{image_ext}"
        image_path = os.path.join(output_dir, image_name)

        with open(image_path, "wb") as img_file:
            img_file.write(image_bytes)

        image_count += 1
        print(f"Saved image {image_name}")

print(f"Total images extracted: {image_count}")


Processing file: Boletim_da_PI_-_2025-07-11_split_columns.pdf
Page 1: No (210) found
Page 2: No (210) found
Page 3: No (210) found
Page 4: No (210) found
Page 5: No (210) found
Page 6: No (210) found
Page 7: No (210) found
Page 8: No (210) found
Page 9: No (210) found
Page 10: No (210) found
Page 11: No (210) found
Page 12: No (210) found
Page 13: No (210) found
Page 14: No (210) found
Page 15: No (210) found
Page 16: No (210) found
Page 17: No (210) found
Page 18: No (210) found
Page 19: No (210) found
Page 20: No (210) found
Page 21: No (210) found
Page 22: No (210) found
Page 23: No (210) found
Page 24: No (210) found
Page 25: No (210) found
Page 26: No (210) found
Page 27: No (210) found
Page 28: No (210) found
Page 29: No (210) found
Page 30: No (210) found
Page 31: No (210) found
Page 32: No (210) found
Page 33: No (210) found
Page 34: No (210) found
Page 35: No (210) found
Page 36: No (210) found
Page 37: No (210) found
Page 38: No (210) found
Page 39: No (210) found
Page 40: No

# new page

In [27]:
import fitz  # PyMuPDF
import os
import glob
import re

# === Step 1: Locate the latest Boletim_da_PI_-_YYYY-MM-DD.pdf in the current folder ===
pdf_pattern = "Boletim_da_PI_-_*.pdf"
pdf_files = glob.glob(pdf_pattern)

if not pdf_files:
    raise FileNotFoundError("No Boletim_da_PI_-_*.pdf files found in current directory.")

latest_pdf = max(pdf_files, key=os.path.getctime)

def extract_all_images_with_210(pdf_path):
    doc = fitz.open(pdf_path)
    image_count = 0

    for page_num in range(len(doc)):
        page = doc[page_num]
        blocks = page.get_text("blocks")

        # Sort text blocks by Y position (top to bottom)
        blocks.sort(key=lambda b: b[1])

        # Keep track of the most recent (210) value
        current_210 = None
        for b in blocks:
            text = b[4]
            match = re.search(r"\(210\)\s*(\d+)", text)
            if match:
                current_210 = match.group(1)

        # Extract images and name them using the most recent (210)
        images = page.get_images(full=True)
        for i, img in enumerate(images):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            filename = f"{current_210 or f'page{page_num+1}_img{i+1}'}.{image_ext}"
            with open(filename, "wb") as f:
                f.write(image_bytes)

            image_count += 1
            print(f"✅ Saved image: {filename}")

    print(f"\n🎉 Done! Total images saved: {image_count}")

# Run it
extract_all_images_with_210(latest_pdf)


✅ Saved image: page1_img1.jpeg
✅ Saved image: page2_img1.jpeg
✅ Saved image: page11_img1.jpeg
✅ Saved image: page12_img1.jpeg
✅ Saved image: page13_img1.jpeg
✅ Saved image: page14_img1.jpeg
✅ Saved image: page15_img1.jpeg
✅ Saved image: page16_img1.jpeg
✅ Saved image: page17_img1.jpeg
✅ Saved image: page18_img1.jpeg
✅ Saved image: page19_img1.jpeg
✅ Saved image: page20_img1.jpeg
✅ Saved image: page21_img1.jpeg
✅ Saved image: page22_img1.jpeg
✅ Saved image: page23_img1.jpeg
✅ Saved image: page24_img1.jpeg
✅ Saved image: page25_img1.jpeg
✅ Saved image: page26_img1.jpeg
✅ Saved image: 747353.jpeg
✅ Saved image: 747353.jpeg
✅ Saved image: 747353.jpeg
✅ Saved image: 747953.jpeg
✅ Saved image: 747953.jpeg
✅ Saved image: 747953.jpeg
✅ Saved image: 748216.jpeg
✅ Saved image: 748216.jpeg
✅ Saved image: page52_img1.jpeg
✅ Saved image: page52_img2.jpeg
✅ Saved image: page55_img1.jpeg
✅ Saved image: page55_img2.jpeg
✅ Saved image: 748217.jpeg
✅ Saved image: 748217.jpeg
✅ Saved image: 748236.jpeg
✅

# Getting order


In [30]:
import fitz  # PyMuPDF
import os
import glob
import re

# === Step 1: Locate the latest Boletim_da_PI_-_*.pdf in the current folder ===
pdf_pattern = "Boletim_da_PI_-_*.pdf"
pdf_files = glob.glob(pdf_pattern)

if not pdf_files:
    raise FileNotFoundError("No Boletim_da_PI_-_*.pdf files found in current directory.")

latest_pdf = max(pdf_files, key=os.path.getctime)

def extract_all_images_with_correct_210(pdf_path):
    doc = fitz.open(pdf_path)
    total_saved = 0

    for page_num in range(len(doc)):
        page = doc[page_num]

        # Step 1: Use blocks to locate all (210) codes with their vertical positions
        blocks = page.get_text("blocks")
        code_210_positions = []
        for b in blocks:
            text = b[4]
            match = re.search(r"\(210\)\s*(\d+)", text)
            if match:
                y = b[1]  # top y coordinate of the block
                code_210_positions.append((y, match.group(1)))

        # Sort code_210_positions by Y ascending (top to bottom of page)
        code_210_positions.sort()

        # Step 2: Use image blocks from page.get_text("dict")
        dict_blocks = page.get_text("dict")["blocks"]
        image_index = 1

        for b in dict_blocks:
            if b["type"] == 1:  # it's an image
                img_y = b["bbox"][1]

                # Find the last (210) that appears before this image (above it)
                matched_210 = None
                for y, code_210 in reversed(code_210_positions):
                    if y < img_y:
                        matched_210 = code_210
                        break

                # Fallback name if no (210) above
                name = matched_210 if matched_210 else f"page{page_num+1}_img{image_index}"

                # xref is in page.get_images()
                images = page.get_images(full=True)
                if image_index - 1 < len(images):
                    xref = images[image_index - 1][0]
                    img_data = doc.extract_image(xref)
                    image_bytes = img_data["image"]
                    ext = img_data["ext"]
                    filename = f"{name}.{ext}"
                    with open(filename, "wb") as f:
                        f.write(image_bytes)
                    print(f"✅ Saved {filename}")
                    total_saved += 1
                else:
                    print(f"⚠️ Could not match image block to xref on page {page_num+1}")

                image_index += 1

    print(f"\n🎉 Done! Extracted {total_saved} image(s).")

# Run the function
extract_all_images_with_correct_210(latest_pdf)


✅ Saved page25_img1.jpeg
✅ Saved 747351.jpeg
✅ Saved page50_img1.jpeg
✅ Saved 747471.jpeg
✅ Saved 748157.jpeg
✅ Saved 748215.jpeg
✅ Saved page56_img1.jpeg
✅ Saved 748217.jpeg
✅ Saved 748226.jpeg
✅ Saved 748236.jpeg
✅ Saved page59_img1.jpeg
✅ Saved page60_img1.jpeg
✅ Saved 748244.jpeg
✅ Saved page61_img1.jpeg
✅ Saved 748247.jpeg
✅ Saved 748251.jpeg
✅ Saved 748254.jpeg
✅ Saved page63_img1.jpeg
✅ Saved page64_img1.jpeg
✅ Saved 748288.jpeg
✅ Saved 748293.jpeg
✅ Saved 748302.jpeg
✅ Saved page66_img1.jpeg
✅ Saved 748307.jpeg
✅ Saved 748311.jpeg
✅ Saved page69_img1.jpeg
✅ Saved 748321.jpeg
✅ Saved page70_img1.jpeg
✅ Saved page71_img1.jpeg
✅ Saved 748332.jpeg
✅ Saved page72_img1.jpeg
✅ Saved 748342.jpeg
✅ Saved 748349.jpeg
✅ Saved 748352.jpeg
✅ Saved 748394.jpeg
✅ Saved 748402.jpeg
✅ Saved 748403.jpeg
✅ Saved 748475.jpeg
✅ Saved page83_img1.jpeg
✅ Saved 748480.jpeg
✅ Saved page85_img1.jpeg
✅ Saved 748482.jpeg
✅ Saved 748483.jpeg
✅ Saved 58240.jpeg

🎉 Done! Extracted 44 image(s).


# logis 

In [33]:
import fitz  # PyMuPDF
import os
import glob
import re

# === Step 1: Locate the latest Boletim_da_PI_-_*.pdf in the current folder ===
pdf_pattern = "Boletim_da_PI_-_*.pdf"
pdf_files = glob.glob(pdf_pattern)

if not pdf_files:
    raise FileNotFoundError("No Boletim_da_PI_-_*.pdf files found.")

latest_pdf = max(pdf_files, key=os.path.getctime)

def extract_images_split_by_columns(pdf_path):
    doc = fitz.open(pdf_path)
    total_saved = 0

    for page_num in range(len(doc)):
        page = doc[page_num]
        page_width = page.rect.width
        mid_x = page_width / 2

        # === Step 1: Extract all (210) blocks and classify as left or right ===
        blocks = page.get_text("blocks")
        left_210s = []
        right_210s = []

        for b in blocks:
            text = b[4]
            match = re.search(r"\(210\)\s*(\d+)", text)
            if match:
                x, y = b[0], b[1]  # top-left corner
                code = match.group(1)
                if x < mid_x:
                    left_210s.append((y, code))
                else:
                    right_210s.append((y, code))

        # Sort by Y (top to bottom)
        left_210s.sort()
        right_210s.sort()

        # === Step 2: Extract images and match with left or right column ===
        dict_blocks = page.get_text("dict")["blocks"]
        images = page.get_images(full=True)
        xrefs = [img[0] for img in images]
        image_index = 0

        for b in dict_blocks:
            if b["type"] == 1:  # it's an image
                bbox = b["bbox"]
                img_x, img_y = bbox[0], bbox[1]

                if image_index >= len(xrefs):
                    print(f"⚠️ No xref for image {image_index} on page {page_num+1}")
                    continue

                # Choose (210) list by side
                if img_x < mid_x:
                    side_210s = left_210s
                else:
                    side_210s = right_210s

                # Find the last (210) above the image
                matched_210 = None
                for y, code in reversed(side_210s):
                    if y < img_y:
                        matched_210 = code
                        break

                # Fallback name if no 210 found
                filename = f"{matched_210 if matched_210 else f'page{page_num+1}_img{image_index+1}'}"

                # Extract and save the image
                xref = xrefs[image_index]
                img_data = doc.extract_image(xref)
                ext = img_data["ext"]
                image_bytes = img_data["image"]
                with open(f"{filename}.{ext}", "wb") as f:
                    f.write(image_bytes)

                print(f"✅ Saved {filename}.{ext}")
                total_saved += 1
                image_index += 1

    print(f"\n🎉 Done! Extracted {total_saved} image(s).")

# Run the function
extract_images_split_by_columns(latest_pdf)


✅ Saved page25_img1.jpeg
✅ Saved 747351.jpeg
✅ Saved page50_img1.jpeg
✅ Saved 747471.jpeg
✅ Saved 748157.jpeg
✅ Saved 748215.jpeg
✅ Saved page56_img1.jpeg
✅ Saved 748217.jpeg
✅ Saved 748226.jpeg
✅ Saved 748236.jpeg
✅ Saved page59_img1.jpeg
✅ Saved page60_img1.jpeg
✅ Saved 748244.jpeg
✅ Saved page61_img1.jpeg
✅ Saved 748247.jpeg
✅ Saved 748251.jpeg
✅ Saved 748254.jpeg
✅ Saved page63_img1.jpeg
✅ Saved page64_img1.jpeg
✅ Saved 748288.jpeg
✅ Saved 748293.jpeg
✅ Saved 748302.jpeg
✅ Saved page66_img1.jpeg
✅ Saved 748307.jpeg
✅ Saved 748311.jpeg
✅ Saved page69_img1.jpeg
✅ Saved 748321.jpeg
✅ Saved page70_img1.jpeg
✅ Saved page71_img1.jpeg
✅ Saved 748332.jpeg
✅ Saved page72_img1.jpeg
✅ Saved 748342.jpeg
✅ Saved 748349.jpeg
✅ Saved 748352.jpeg
✅ Saved 748394.jpeg
✅ Saved 748402.jpeg
✅ Saved 748403.jpeg
✅ Saved 748475.jpeg
✅ Saved page83_img1.jpeg
✅ Saved 748480.jpeg
✅ Saved page85_img1.jpeg
✅ Saved 748482.jpeg
✅ Saved 748483.jpeg
✅ Saved 58240.jpeg

🎉 Done! Extracted 44 image(s).


# Final Edge case

In [34]:
import fitz  # PyMuPDF
import os
import glob
import re

# Step 1: Locate the latest Boletim_da_PI_-_*.pdf
pdf_pattern = "Boletim_da_PI_-_*.pdf"
pdf_files = glob.glob(pdf_pattern)
if not pdf_files:
    raise FileNotFoundError("No Boletim_da_PI_-_*.pdf files found.")
latest_pdf = max(pdf_files, key=os.path.getctime)

def extract_images_by_210_zone(pdf_path):
    doc = fitz.open(pdf_path)
    total_saved = 0

    for page_num in range(len(doc)):
        page = doc[page_num]
        blocks = page.get_text("blocks")

        # === Step 1: Extract all (210) Y-positions ===
        code_210_zones = []
        for b in blocks:
            text = b[4]
            match = re.search(r"\(210\)\s*(\d+)", text)
            if match:
                y = b[1]  # top Y
                code = match.group(1)
                code_210_zones.append((y, code))

        # === Step 2: Sort by Y, and define vertical "zones"
        code_210_zones.sort()
        zone_ranges = []
        for i, (start_y, code) in enumerate(code_210_zones):
            end_y = code_210_zones[i + 1][0] if i + 1 < len(code_210_zones) else float("inf")
            zone_ranges.append((start_y, end_y, code))

        # === Step 3: Extract image blocks + get xref list ===
        dict_blocks = page.get_text("dict")["blocks"]
        xrefs = [img[0] for img in page.get_images(full=True)]
        xref_idx = 0

        for b in dict_blocks:
            if b["type"] == 1:  # image block
                bbox = b["bbox"]
                img_y = bbox[1]  # top Y of image

                matched_code = None
                for start_y, end_y, code in zone_ranges:
                    if start_y <= img_y < end_y:
                        matched_code = code
                        break

                name = matched_code if matched_code else f"page{page_num+1}_img{xref_idx+1}"

                if xref_idx >= len(xrefs):
                    print(f"⚠️ Xref not found for image {xref_idx} on page {page_num+1}")
                    continue

                xref = xrefs[xref_idx]
                xref_idx += 1

                img_data = doc.extract_image(xref)
                ext = img_data["ext"]
                image_bytes = img_data["image"]
                filename = f"{name}.{ext}"

                with open(filename, "wb") as f:
                    f.write(image_bytes)

                print(f"✅ Saved {filename}")
                total_saved += 1

    print(f"\n🎉 Done! Extracted {total_saved} image(s).")

# Run it
extract_images_by_210_zone(latest_pdf)


✅ Saved page25_img1.jpeg
✅ Saved 747351.jpeg
✅ Saved page50_img1.jpeg
✅ Saved 747471.jpeg
✅ Saved 748157.jpeg
✅ Saved 748215.jpeg
✅ Saved page56_img1.jpeg
✅ Saved 748217.jpeg
✅ Saved 748226.jpeg
✅ Saved 748236.jpeg
✅ Saved page59_img1.jpeg
✅ Saved page60_img1.jpeg
✅ Saved 748244.jpeg
✅ Saved page61_img1.jpeg
✅ Saved 748247.jpeg
✅ Saved 748251.jpeg
✅ Saved 748254.jpeg
✅ Saved page63_img1.jpeg
✅ Saved page64_img1.jpeg
✅ Saved 748288.jpeg
✅ Saved 748293.jpeg
✅ Saved 748302.jpeg
✅ Saved page66_img1.jpeg
✅ Saved 748307.jpeg
✅ Saved 748311.jpeg
✅ Saved page69_img1.jpeg
✅ Saved 748321.jpeg
✅ Saved page70_img1.jpeg
✅ Saved page71_img1.jpeg
✅ Saved 748332.jpeg
✅ Saved page72_img1.jpeg
✅ Saved 748342.jpeg
✅ Saved 748349.jpeg
✅ Saved 748352.jpeg
✅ Saved 748394.jpeg
✅ Saved 748402.jpeg
✅ Saved 748403.jpeg
✅ Saved 748475.jpeg
✅ Saved page83_img1.jpeg
✅ Saved 748480.jpeg
✅ Saved page85_img1.jpeg
✅ Saved 748482.jpeg
✅ Saved 748483.jpeg
✅ Saved 58240.jpeg

🎉 Done! Extracted 44 image(s).


3 Floiwng maulitple overlaps

In [45]:
import fitz
import os
import glob
import re

# === Find the latest Boletim PDF ===
pdf_pattern = "Boletim_da_PI_-_*.pdf"
pdf_files = glob.glob(pdf_pattern)
if not pdf_files:
    raise FileNotFoundError("No Boletim_da_PI_-_*.pdf files found.")
latest_pdf = max(pdf_files, key=os.path.getctime)

def extract_images_columnwise_fixed(pdf_path):
    doc = fitz.open(pdf_path)
    total_saved = 0
    image_counts = {}

    for page_num in range(len(doc)):
        page = doc[page_num]
        width = page.rect.width
        mid_x = width / 2

        # === Step 1: Extract (210) blocks with X/Y ===
        blocks = page.get_text("blocks")
        code_210_positions = []  # (x, y, code)
        for b in blocks:
            text = b[4]
            match = re.search(r"\(210\)\s*(\d+)", text)
            if match:
                x, y = b[0], b[1]
                code_210_positions.append((x, y, match.group(1)))

        # Split into left/right for cleaner logic
        left_codes = [(x, y, code) for x, y, code in code_210_positions if x < mid_x]
        right_codes = [(x, y, code) for x, y, code in code_210_positions if x >= mid_x]

        dict_blocks = page.get_text("dict")["blocks"]
        xrefs = [img[0] for img in page.get_images(full=True)]
        xref_index = 0

        for b in dict_blocks:
            if b["type"] == 1:  # image
                img_x, img_y = b["bbox"][0], b["bbox"][1]
                matched_code = None

                if img_x < mid_x:
                    # Image is in left column: find nearest left (210) above
                    for x, y, code in reversed(left_codes):
                        if y < img_y:
                            matched_code = code
                            break
                else:
                    # Image is in right column: find left (210) on same or above line
                    for x, y, code in reversed(left_codes):
                        if abs(y - img_y) < 30 or y < img_y:
                            matched_code = code
                            break

                # Fallback
                if matched_code is None:
                    matched_code = f"page{page_num+1}_img{xref_index+1}"

                if xref_index >= len(xrefs):
                    continue
                xref = xrefs[xref_index]
                xref_index += 1

                img_data = doc.extract_image(xref)
                ext = img_data["ext"]
                img_bytes = img_data["image"]

                # Handle multiple images per 210
                if matched_code not in image_counts:
                    image_counts[matched_code] = 1
                    filename = f"{matched_code}.{ext}"
                else:
                    image_counts[matched_code] += 1
                    filename = f"{matched_code}_{image_counts[matched_code]}.{ext}"

                with open(filename, "wb") as f:
                    f.write(img_bytes)

                print(f"✅ Saved {filename}")
                total_saved += 1

    print(f"\n🎉 Done! Extracted {total_saved} image(s) with accurate column-aware mapping.")

# Run
extract_images_columnwise_fixed(latest_pdf)


✅ Saved page1_img1.jpeg
✅ Saved page6_img1.jpeg
✅ Saved page7_img1.jpeg
✅ Saved page8_img1.jpeg
✅ Saved page9_img1.jpeg
✅ Saved page10_img1.jpeg
✅ Saved page17_img1.jpeg
✅ Saved page20_img1.jpeg
✅ Saved page20_img2.jpeg
✅ Saved page20_img3.jpeg
✅ Saved page20_img4.jpeg
✅ Saved page20_img5.jpeg
✅ Saved page20_img6.jpeg
✅ Saved page20_img7.jpeg
✅ Saved page20_img8.jpeg
✅ Saved 747615.jpeg
✅ Saved page23_img1.jpeg
✅ Saved page23_img2.jpeg
✅ Saved 748279.jpeg
✅ Saved 748374.jpeg
✅ Saved 748377.jpeg
✅ Saved 748377_2.jpeg
✅ Saved page25_img1.jpeg
✅ Saved page25_img2.jpeg
✅ Saved page25_img3.jpeg
✅ Saved page28_img1.jpeg
✅ Saved page28_img2.jpeg
✅ Saved 748404.jpeg
✅ Saved 748408.jpeg
✅ Saved 748425.jpeg
✅ Saved 748419.jpeg
✅ Saved 748429.jpeg
✅ Saved 748429_2.jpeg
✅ Saved 748441.jpeg
✅ Saved page31_img1.jpeg
✅ Saved page31_img2.jpeg
✅ Saved page32_img1.jpeg
✅ Saved 748463.jpeg
✅ Saved 748463_2.jpeg
✅ Saved 748487.jpeg
✅ Saved page35_img1.jpeg
✅ Saved 748510.jpeg
✅ Saved 748513.jpeg
✅ Saved 7

# ignoring text

In [41]:
import fitz  # PyMuPDF
import re
import os
import glob

# === Locate the latest Boletim_da_PI_-_*.pdf ===
pdf_pattern = "Boletim_da_PI_-_*.pdf"
pdf_files = glob.glob(pdf_pattern)
if not pdf_files:
    raise FileNotFoundError("No Boletim_da_PI_-_*.pdf files found.")
latest_pdf = max(pdf_files, key=os.path.getctime)

def extract_clean_images(pdf_path):
    doc = fitz.open(pdf_path)
    saved = 0
    used_210s = set()
    skipped_210s = set()

    for page_num in range(len(doc)):
        page = doc[page_num]
        blocks = page.get_text("blocks")

        # Step 1: Find (210)s and their y positions
        valid_210s = []
        current_210 = None
        skip_this_210 = False

        for i, block in enumerate(blocks):
            text = block[4].strip()
            y_pos = block[1]

            # Detect new (210)
            match = re.match(r"\(210\)\s*(\d+)", text)
            if match:
                current_210 = match.group(1)
                skip_this_210 = False
                continue

            # Detect (540)
            if current_210 and "(540)" in text:
                # If the (540) line or the one after it contains visible text, mark it to skip
                if re.search(r"\(540\)\s*[A-Z0-9]", text, re.IGNORECASE):
                    skip_this_210 = True
                    skipped_210s.add(current_210)
                    current_210 = None
                    continue

                # Otherwise, check next block
                if i + 1 < len(blocks):
                    next_text = blocks[i + 1][4].strip()
                    if len(next_text) > 1:
                        skip_this_210 = True
                        skipped_210s.add(current_210)
                        current_210 = None
                        continue

                # No visible text — this (210) is okay
                valid_210s.append((y_pos, current_210))
                used_210s.add(current_210)
                current_210 = None

        # Step 2: Extract images and assign to nearest valid 210
        images = page.get_images(full=True)
        for img_index, img in enumerate(images):
            xref = img[0]
            img_data = doc.extract_image(xref)
            ext = img_data["ext"]
            image_bytes = img_data["image"]

            # Get image Y position from block layout
            layout_blocks = page.get_text("dict")["blocks"]
            img_y = None
            for b in layout_blocks:
                if b["type"] == 1 and "number" in b:
                    if b["number"] == xref:
                        img_y = b["bbox"][1]
                        break
            if img_y is None:
                img_y = 0  # fallback

            # Match to closest valid 210
            match_210 = None
            for y, code in reversed(valid_210s):
                if y <= img_y:
                    match_210 = code
                    break

            if match_210:
                filename = f"{match_210}.{ext}"
                with open(filename, "wb") as f:
                    f.write(image_bytes)
                print(f"✅ Saved {filename}")
                saved += 1

    print(f"\n🎯 Done. Saved {saved} image(s). Skipped {len(skipped_210s)} (210)s due to non-empty (540).")

# Run it
extract_clean_images(latest_pdf)



🎯 Done. Saved 0 image(s). Skipped 10 (210)s due to non-empty (540).


# TXT

In [44]:
import fitz
import os
import glob
import re

def extract_images_within_210_blocks():
    # === Step 1: Locate latest Boletim_da_PI_-_*.pdf ===
    pdf_pattern = "Boletim_da_PI_-_*.pdf"
    pdf_files = glob.glob(pdf_pattern)
    if not pdf_files:
        raise FileNotFoundError("No Boletim_da_PI_-_*.pdf files found.")
    latest_pdf = max(pdf_files, key=os.path.getctime)

    # === Step 2: Prepare output paths ===
    output_txt = "parsed_output.txt"
    output_img_dir = "images"
    os.makedirs(output_img_dir, exist_ok=True)

    doc = fitz.open(latest_pdf)
    text_output = []
    image_counts = {}
    total_saved = 0

    for page_num in range(len(doc)):
        page = doc[page_num]
        blocks = page.get_text("blocks")
        blocks.sort(key=lambda b: (b[1], b[0]))  # top to bottom, left to right

        # Step 3: Find all (210) codes with their Y position
        code_blocks = []
        for block in blocks:
            text = block[4].strip()
            match = re.match(r"\(210\)\s*(\d+)", text)
            if match:
                y_top = block[1]
                code_blocks.append((y_top, match.group(1)))

        # Add end-of-page sentinel
        page_height = page.rect.height
        code_blocks.append((page_height + 1, None))

        # Step 4: Record page text in proper order
        page_text = f"\n=== Page {page_num + 1} ===\n"
        for block in blocks:
            text = block[4].strip()
            if text:
                page_text += text + "\n"
        text_output.append(page_text)

        # Step 5: Process images
        xrefs = [img[0] for img in page.get_images(full=True)]
        dict_blocks = page.get_text("dict")["blocks"]
        image_block_index = 0

        for b in dict_blocks:
            if b["type"] == 1:  # image
                img_y = b["bbox"][1]
                xref = xrefs[image_block_index] if image_block_index < len(xrefs) else None
                image_block_index += 1
                if xref is None:
                    continue

                # Match image to a (210) block
                matched_code = None
                for i in range(len(code_blocks) - 1):
                    y1, code = code_blocks[i]
                    y2, _ = code_blocks[i + 1]
                    if y1 <= img_y < y2:
                        matched_code = code
                        break

                if matched_code:
                    # Count and name
                    image_counts.setdefault(matched_code, 0)
                    image_counts[matched_code] += 1
                    suffix = f"_{image_counts[matched_code]}" if image_counts[matched_code] > 1 else ""
                    filename = f"{matched_code}{suffix}.png"

                    img_data = doc.extract_image(xref)
                    with open(os.path.join(output_img_dir, filename), "wb") as f:
                        f.write(img_data["image"])
                    print(f"✅ Saved {filename}")
                    total_saved += 1

    # Save full text output
    with open(output_txt, "w", encoding="utf-8") as f:
        f.write("\n".join(text_output))

    print(f"\n📄 Parsed text saved to {output_txt}")
    print(f"🖼️  Total images saved: {total_saved} in {output_img_dir}")

# Run it
extract_images_within_210_blocks()


✅ Saved 748419.png
✅ Saved 748419_2.png
✅ Saved 748419_3.png
✅ Saved 748457.png
✅ Saved 748490.png
✅ Saved 748490_2.png
✅ Saved 748490_3.png
✅ Saved 748513.png
✅ Saved 748513_2.png
✅ Saved 748513_3.png
✅ Saved 748517.png
✅ Saved 748517_2.png
✅ Saved 748517_3.png
✅ Saved 748517_4.png
✅ Saved 748538.png
✅ Saved 748538_2.png
✅ Saved 748538_3.png
✅ Saved 748539.png
✅ Saved 748539_2.png
✅ Saved 748594.png
✅ Saved 748594_2.png
✅ Saved 748777.png

📄 Parsed text saved to parsed_output.txt
🖼️  Total images saved: 22 in images


# Changes

In [49]:
import fitz
import os
import glob
import re

# === Find the latest Boletim PDF ===
pdf_pattern = "Boletim_da_PI_-_*.pdf"
pdf_files = glob.glob(pdf_pattern)
if not pdf_files:
    raise FileNotFoundError("No Boletim_da_PI_-_*.pdf files found.")
latest_pdf = max(pdf_files, key=os.path.getctime)

def extract_images_columnwise_fixed(pdf_path):
    doc = fitz.open(pdf_path)
    total_saved = 0
    image_counts = {}

    # Preload (210) codes for all pages
    all_codes_per_page = []
    for page in doc:
        width = page.rect.width
        mid_x = width / 2
        blocks = page.get_text("blocks")
        code_210_positions = []
        for b in blocks:
            text = b[4]
            match = re.search(r"\(210\)\s*(\d+)", text)
            if match:
                x, y = b[0], b[1]
                code_210_positions.append((x, y, match.group(1)))
        left = sorted([c for c in code_210_positions if c[0] < mid_x], key=lambda k: -k[1])
        right = sorted([c for c in code_210_positions if c[0] >= mid_x], key=lambda k: -k[1])
        all_codes_per_page.append((left, right))

    # Go page by page
    for page_num in range(len(doc)):
        page = doc[page_num]
        width = page.rect.width
        mid_x = width / 2

        dict_blocks = page.get_text("dict")["blocks"]
        xrefs = [img[0] for img in page.get_images(full=True)]
        xref_index = 0

        for b in dict_blocks:
            if b["type"] == 1:  # image
                img_x, img_y = b["bbox"][0], b["bbox"][1]
                matched_code = None

                left_codes, right_codes = all_codes_per_page[page_num]

                if img_x < mid_x:  # Left column
                    # Find 210 above image in left column
                    for x, y, code in left_codes:
                        if y < img_y:
                            matched_code = code
                            break
                    # Fallback: bottom-most 210 from previous page's right column
                    if matched_code is None and page_num > 0:
                        prev_right_codes = all_codes_per_page[page_num - 1][1]
                        if prev_right_codes:
                            matched_code = prev_right_codes[0][2]

                else:  # Right column
                    # Find 210 above image in right column
                    for x, y, code in right_codes:
                        if y < img_y:
                            matched_code = code
                            break
                    # Fallback: bottom-most left column 210 from same page
                    if matched_code is None and left_codes:
                        matched_code = left_codes[0][2]

                # Final fallback
                if matched_code is None:
                    matched_code = f"page{page_num+1}_img{xref_index+1}"

                if xref_index >= len(xrefs):
                    continue
                xref = xrefs[xref_index]
                xref_index += 1

                img_data = doc.extract_image(xref)
                ext = img_data["ext"]
                img_bytes = img_data["image"]

                if matched_code not in image_counts:
                    image_counts[matched_code] = 1
                    filename = f"{matched_code}.{ext}"
                else:
                    image_counts[matched_code] += 1
                    filename = f"{matched_code}_{image_counts[matched_code]}.{ext}"

                with open(filename, "wb") as f:
                    f.write(img_bytes)

                print(f"✅ Saved {filename}")
                total_saved += 1

    print(f"\n🎉 Done! Extracted {total_saved} image(s) with enhanced image-to-(210) logic.")

# Run
extract_images_columnwise_fixed(latest_pdf)


✅ Saved page1_img1.jpeg
✅ Saved page6_img1.jpeg
✅ Saved page7_img1.jpeg
✅ Saved page8_img1.jpeg
✅ Saved page9_img1.jpeg
✅ Saved page10_img1.jpeg
✅ Saved page17_img1.jpeg
✅ Saved page20_img1.jpeg
✅ Saved page20_img2.jpeg
✅ Saved page20_img3.jpeg
✅ Saved page20_img4.jpeg
✅ Saved page20_img5.jpeg
✅ Saved page20_img6.jpeg
✅ Saved page20_img7.jpeg
✅ Saved page20_img8.jpeg
✅ Saved 747615.jpeg
✅ Saved page23_img1.jpeg
✅ Saved 748279.jpeg
✅ Saved 748373.jpeg
✅ Saved 748374.jpeg
✅ Saved 748377.jpeg
✅ Saved 748379.jpeg
✅ Saved 748382.jpeg
✅ Saved 748383.jpeg
✅ Saved 748387.jpeg
✅ Saved page28_img1.jpeg
✅ Saved 748408.jpeg
✅ Saved 748417.jpeg
✅ Saved 748418.jpeg
✅ Saved 748425.jpeg
✅ Saved 748429.jpeg
✅ Saved 748432.jpeg
✅ Saved 748435.jpeg
✅ Saved 748441.jpeg
✅ Saved 748445.jpeg
✅ Saved 748448.jpeg
✅ Saved 748457.jpeg
✅ Saved 748463.jpeg
✅ Saved 748490.jpeg
✅ Saved 748502.jpeg
✅ Saved 748505.jpeg
✅ Saved 748510.jpeg
✅ Saved 748514.jpeg
✅ Saved 748515.jpeg
✅ Saved 748516.jpeg
✅ Saved 748517.jpeg
