From 70439557e9444503f76b0794e1ff3630db62f49e Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Mon, 10 Nov 2025 15:20:00 -0400 Subject: [PATCH] Version 0.2.0 --- pdf4llm/setup.py | 2 +- pymupdf4llm/pymupdf4llm/__init__.py | 5 +- .../pymupdf4llm/helpers/document_layout.py | 4 +- pymupdf4llm/pymupdf4llm/helpers/utils.py | 129 +++++++++++++----- pymupdf4llm/pymupdf4llm/versions_file.py | 2 +- pymupdf4llm/setup.py | 2 +- 6 files changed, 103 insertions(+), 41 deletions(-) diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py index c4af2cc0..ba82a62c 100644 --- a/pdf4llm/setup.py +++ b/pdf4llm/setup.py @@ -6,7 +6,7 @@ with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f: readme = f.read() -version = "0.1.9" +version = "0.2.0" classifiers = [ "Development Status :: 5 - Production/Stable", diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py index 84e85eaf..8ca76f53 100644 --- a/pymupdf4llm/pymupdf4llm/__init__.py +++ b/pymupdf4llm/pymupdf4llm/__init__.py @@ -1,7 +1,4 @@ -try: - import pymupdf.layout -except ImportError: - import pymupdf +import pymupdf from .versions_file import MINIMUM_PYMUPDF_VERSION, VERSION diff --git a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py index c95377a7..22ca33a3 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py +++ b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py @@ -713,7 +713,9 @@ def parse_document( utils.clean_pictures(page, blocks) utils.add_image_orphans(page, blocks) utils.clean_tables(page, blocks) - page.layout_information = utils.find_reading_order(page.layout_information) + page.layout_information = utils.find_reading_order( + page.rect, blocks, page.layout_information + ) # identify vector graphics to help find tables all_lines, all_boxes = utils.complete_table_structure(page) diff --git a/pymupdf4llm/pymupdf4llm/helpers/utils.py b/pymupdf4llm/pymupdf4llm/helpers/utils.py index 955e6046..9df5a3e0 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/utils.py +++ b/pymupdf4llm/pymupdf4llm/helpers/utils.py @@ -210,12 +210,12 @@ def add_image_orphans(page, blocks): """ -def cluster_stripes(boxes, vertical_gap: float = 12): +def cluster_stripes(boxes, joined_boxes, vectors, vertical_gap=12): """ Divide page into horizontal stripes based on vertical gaps. Args: - boxes (list): List of bounding boxes, each defined as (x0, y0, x1, y1). + boxes (list): List of bounding boxes. vertical_gap (float): Minimum vertical gap to separate stripes. Returns: @@ -223,6 +223,10 @@ def cluster_stripes(boxes, vertical_gap: float = 12): """ def is_multi_column_layout(boxes): + """Check if the boxes have a clean multi-column layout. + + Used to early exit from stripe clustering. + """ sorted_boxes = sorted(boxes, key=lambda b: b[0]) columns = [] current_column = [sorted_boxes[0]] @@ -236,40 +240,68 @@ def is_multi_column_layout(boxes): columns.append(current_column) return len(columns) > 1 + def divider(y, box, vertical_gap): + """Create a rectangle of box width and vertical_gap height below y.""" + r = pymupdf.Rect(box[0], y, box[2], y + vertical_gap) + return r + # Sort top to bottom - sorted_boxes = sorted(boxes, key=lambda b: b[1]) + sorted_boxes = sorted(boxes, key=lambda b: b[3]) stripes = [] + + # exit if no boxes if not sorted_boxes: return stripes - # Early exit for clean multi-column layouts - if is_multi_column_layout(sorted_boxes): + # Exit if clean multi-column layout: treat full page as single stripe. + if is_multi_column_layout(boxes): return [boxes] - current_stripe = [sorted_boxes[0]] - - for box in sorted_boxes[1:]: - prev_bottom = max(b[3] for b in current_stripe) - if box[1] - prev_bottom > vertical_gap: + # y-borders of horizontal stripes + y_values = {joined_boxes.y1} + for box in sorted_boxes: + # find empty horizontal dividers of minimum height 'vertical_gap' + y = box[3] + if y >= joined_boxes.y1: + continue + div = divider(y, joined_boxes, vertical_gap) + if not any(div.intersects(pymupdf.Rect(b[:4])) for b in boxes): + # look for next bbox below the divider + y0 = min(b[1] for b in sorted_boxes if b[1] >= div.y1) + div.y1 = y0 # divider has this bottom now + inter_count = 0 # counts intersections with vectors + + # if divider is fully contained in more than one vector's stripe + # we don't consider it. + for vr in vectors: + if div.intersects(vr) and vr.y0 <= div.y0 and div.y1 <= vr.y1: + inter_count += 1 + if inter_count <= 1: + y_values.add(div.y1) + y_values = sorted(y_values) + current_stripe = [] + for y in y_values: + while sorted_boxes and sorted_boxes[0][3] <= y: + current_stripe.append(sorted_boxes.pop(0)) + if current_stripe: stripes.append(current_stripe) - current_stripe = [box] - else: - current_stripe.append(box) - - stripes.append(current_stripe) + current_stripe = [] return stripes -def cluster_columns_in_stripe(stripe: list): +def cluster_columns_in_stripe(stripe): """ Within a stripe, group boxes into columns based on horizontal proximity. + We use a small horizontal gap threshold to decide when a new column starts. + Args: - stripe (list): List of boxes within a stripe. + stripe (list): List of boxes we look at here. Returns: list: List of columns, each column is a list of boxes. """ + HORIZONTAL_GAP = 1 # allowable gap to start a new column # Sort left to right sorted_boxes = sorted(stripe, key=lambda b: b[0]) columns = [] @@ -277,17 +309,17 @@ def cluster_columns_in_stripe(stripe: list): for box in sorted_boxes[1:]: prev_right = max([b[2] for b in current_column]) - if box[0] - prev_right > 1: - columns.append(sorted(current_column, key=lambda b: b[3])) + if box[0] - prev_right > HORIZONTAL_GAP: + columns.append(sorted(current_column, key=lambda b: b[1])) current_column = [box] else: current_column.append(box) - columns.append(sorted(current_column, key=lambda b: b[3])) + columns.append(sorted(current_column, key=lambda b: b[1])) return columns -def compute_reading_order(boxes, vertical_gap: float = 12): +def compute_reading_order(boxes, joined_boxes, vectors, vertical_gap=12): """ Compute reading order of boxes delivered by PyMuPDF-Layout. @@ -298,12 +330,12 @@ def compute_reading_order(boxes, vertical_gap: float = 12): Returns: list: List of boxes in reading order. """ - # compute adequate vertical_gap based height of union of bboxes - temp = pymupdf.EMPTY_RECT() - for b in boxes: - temp |= pymupdf.Rect(b[:4]) - this_vertical_gap = vertical_gap * temp.height / 800 - stripes = cluster_stripes(boxes, vertical_gap=this_vertical_gap) + stripes = cluster_stripes( + boxes, + joined_boxes, + vectors, + vertical_gap=vertical_gap, + ) ordered = [] for stripe in stripes: columns = cluster_columns_in_stripe(stripe) @@ -312,7 +344,7 @@ def compute_reading_order(boxes, vertical_gap: float = 12): return ordered -def find_reading_order(boxes, vertical_gap: float = 36) -> list: +def find_reading_order(page_rect, blocks, boxes, vertical_gap: float = 12) -> list: """Given page layout information, return the boxes in reading order. Args: @@ -326,6 +358,9 @@ def find_reading_order(boxes, vertical_gap: float = 36) -> list: List of boxes in reading order. """ + # compute adequate vertical_gap based on the height the page rectangle + this_vertical_gap = vertical_gap * page_rect.height / 800 + def is_contained(inner, outer) -> bool: """Check if inner box is fully contained within outer box.""" return ( @@ -369,9 +404,28 @@ def filter_contained(boxes) -> list: else: body_boxes.append(box) - # bring body into reading order - ordered = compute_reading_order(body_boxes, vertical_gap=vertical_gap) + # compute joined boxes of body + joined_boxes = pymupdf.Rect( + min(b[0] for b in body_boxes), + min(b[1] for b in body_boxes), + max(b[2] for b in body_boxes), + max(b[3] for b in body_boxes), + ) + # extract vectors contained in the TextPage + min_bbox_height = min(b[3] - b[1] for b in body_boxes) + vectors = [ + pymupdf.Rect(b["bbox"]) + for b in blocks + if b["bbox"][3] - b["bbox"][1] >= min_bbox_height and b["bbox"] in joined_boxes + ] + # bring body into reading order + ordered = compute_reading_order( + body_boxes, + joined_boxes, + vectors, + vertical_gap=this_vertical_gap, + ) # Final full boxes list. We do simple sorts for non-body boxes. final = ( sorted(page_headers, key=lambda r: (r[1], r[0])) @@ -382,6 +436,8 @@ def filter_contained(boxes) -> list: def simplify_vectors(vectors): + """Join vectors that are horizontally adjacent and vertically aligned.""" + Y_TOLERANCE = 1 # allowable top / bottom difference new_vectors = [] if not vectors: return new_vectors @@ -390,8 +446,8 @@ def simplify_vectors(vectors): last_v = new_vectors[-1] if ( 1 - and abs(v["bbox"][1] - last_v["bbox"][1]) < 1 - and abs(v["bbox"][3] - last_v["bbox"][3]) < 1 + and abs(v["bbox"][1] - last_v["bbox"][1]) < Y_TOLERANCE + and abs(v["bbox"][3] - last_v["bbox"][3]) < Y_TOLERANCE and v["bbox"][0] <= last_v["bbox"][2] + 1 ): # merge horizontally @@ -408,7 +464,14 @@ def simplify_vectors(vectors): def find_virtual_lines(page, table_bbox, words, vectors, link_rects): - """Return virtual lines for a given table bbox.""" + """Return virtual lines for a given table bbox. + + This utility looks for: + * horizontal non-stroke vectors and uses their top and bottom edges + as virtual lines. Should work for tables with alternating row colors. + * horizontal thin lines and uses their left x-coordinate as column + borders. + """ def make_vertical(table_bbox, line_bbox, word_boxes): # default top and bottom point of vertical line diff --git a/pymupdf4llm/pymupdf4llm/versions_file.py b/pymupdf4llm/pymupdf4llm/versions_file.py index cad78b1f..afbe9821 100644 --- a/pymupdf4llm/pymupdf4llm/versions_file.py +++ b/pymupdf4llm/pymupdf4llm/versions_file.py @@ -1,3 +1,3 @@ # Generated file - do not edit. MINIMUM_PYMUPDF_VERSION = (1, 26, 6) -VERSION = '0.1.9' +VERSION = '0.2.0' diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py index 7b329f1c..5c7cd2c6 100644 --- a/pymupdf4llm/setup.py +++ b/pymupdf4llm/setup.py @@ -14,7 +14,7 @@ "Topic :: Utilities", ] -version = "0.1.9" +version = "0.2.0" requires = ["pymupdf>=1.26.6", "tabulate"] text = requires[0].split("=")[1]