Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pdf4llm/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
readme = f.read()

version = "0.1.9"
version = "0.2.0"

classifiers = [
"Development Status :: 5 - Production/Stable",
Expand Down
5 changes: 1 addition & 4 deletions pymupdf4llm/pymupdf4llm/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
try:
import pymupdf.layout
except ImportError:
import pymupdf
import pymupdf

from .versions_file import MINIMUM_PYMUPDF_VERSION, VERSION

Expand Down
4 changes: 3 additions & 1 deletion pymupdf4llm/pymupdf4llm/helpers/document_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -713,7 +713,9 @@ def parse_document(
utils.clean_pictures(page, blocks)
utils.add_image_orphans(page, blocks)
utils.clean_tables(page, blocks)
page.layout_information = utils.find_reading_order(page.layout_information)
page.layout_information = utils.find_reading_order(
page.rect, blocks, page.layout_information
)

# identify vector graphics to help find tables
all_lines, all_boxes = utils.complete_table_structure(page)
Expand Down
129 changes: 96 additions & 33 deletions pymupdf4llm/pymupdf4llm/helpers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,19 +210,23 @@ def add_image_orphans(page, blocks):
"""


def cluster_stripes(boxes, vertical_gap: float = 12):
def cluster_stripes(boxes, joined_boxes, vectors, vertical_gap=12):
Copy link

Copilot AI Nov 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function signature added new parameters joined_boxes and vectors, but the docstring's Args section does not document these parameters. Add documentation for joined_boxes (the bounding rectangle of all boxes) and vectors (list of vector rectangles to consider during stripe division).

Copilot uses AI. Check for mistakes.
"""
Divide page into horizontal stripes based on vertical gaps.

Args:
boxes (list): List of bounding boxes, each defined as (x0, y0, x1, y1).
boxes (list): List of bounding boxes.
Copy link

Copilot AI Nov 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The documentation for the boxes parameter was simplified but lost important information. The original 'each defined as (x0, y0, x1, y1)' described the expected structure. This should be restored or enhanced to 'List of bounding boxes, each defined as (x0, y0, x1, y1, class)' to match the actual usage in the code.

Suggested change
boxes (list): List of bounding boxes.
boxes (list): List of bounding boxes, each defined as (x0, y0, x1, y1, class).

Copilot uses AI. Check for mistakes.
vertical_gap (float): Minimum vertical gap to separate stripes.

Returns:
List of disjoint horizontal stripes. Each stripe is a list of boxes.
"""

def is_multi_column_layout(boxes):
"""Check if the boxes have a clean multi-column layout.

Used to early exit from stripe clustering.
"""
sorted_boxes = sorted(boxes, key=lambda b: b[0])
columns = []
current_column = [sorted_boxes[0]]
Expand All @@ -236,58 +240,86 @@ def is_multi_column_layout(boxes):
columns.append(current_column)
return len(columns) > 1

def divider(y, box, vertical_gap):
"""Create a rectangle of box width and vertical_gap height below y."""
r = pymupdf.Rect(box[0], y, box[2], y + vertical_gap)
return r

# Sort top to bottom
sorted_boxes = sorted(boxes, key=lambda b: b[1])
sorted_boxes = sorted(boxes, key=lambda b: b[3])
stripes = []

# exit if no boxes
if not sorted_boxes:
return stripes

# Early exit for clean multi-column layouts
if is_multi_column_layout(sorted_boxes):
# Exit if clean multi-column layout: treat full page as single stripe.
if is_multi_column_layout(boxes):
return [boxes]

current_stripe = [sorted_boxes[0]]

for box in sorted_boxes[1:]:
prev_bottom = max(b[3] for b in current_stripe)
if box[1] - prev_bottom > vertical_gap:
# y-borders of horizontal stripes
y_values = {joined_boxes.y1}
for box in sorted_boxes:
# find empty horizontal dividers of minimum height 'vertical_gap'
y = box[3]
if y >= joined_boxes.y1:
continue
div = divider(y, joined_boxes, vertical_gap)
if not any(div.intersects(pymupdf.Rect(b[:4])) for b in boxes):
# look for next bbox below the divider
y0 = min(b[1] for b in sorted_boxes if b[1] >= div.y1)
div.y1 = y0 # divider has this bottom now
inter_count = 0 # counts intersections with vectors

# if divider is fully contained in more than one vector's stripe
# we don't consider it.
for vr in vectors:
if div.intersects(vr) and vr.y0 <= div.y0 and div.y1 <= vr.y1:
inter_count += 1
if inter_count <= 1:
y_values.add(div.y1)
y_values = sorted(y_values)
current_stripe = []
for y in y_values:
while sorted_boxes and sorted_boxes[0][3] <= y:
current_stripe.append(sorted_boxes.pop(0))
if current_stripe:
stripes.append(current_stripe)
current_stripe = [box]
else:
current_stripe.append(box)

stripes.append(current_stripe)
current_stripe = []
return stripes


def cluster_columns_in_stripe(stripe: list):
def cluster_columns_in_stripe(stripe):
"""
Within a stripe, group boxes into columns based on horizontal proximity.

We use a small horizontal gap threshold to decide when a new column starts.

Args:
stripe (list): List of boxes within a stripe.
stripe (list): List of boxes we look at here.

Returns:
list: List of columns, each column is a list of boxes.
"""
HORIZONTAL_GAP = 1 # allowable gap to start a new column
# Sort left to right
sorted_boxes = sorted(stripe, key=lambda b: b[0])
columns = []
current_column = [sorted_boxes[0]]

for box in sorted_boxes[1:]:
prev_right = max([b[2] for b in current_column])
if box[0] - prev_right > 1:
columns.append(sorted(current_column, key=lambda b: b[3]))
if box[0] - prev_right > HORIZONTAL_GAP:
columns.append(sorted(current_column, key=lambda b: b[1]))
current_column = [box]
else:
current_column.append(box)

Comment on lines +312 to 317
Copy link

Copilot AI Nov 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The sorting key changed from b[3] (bottom y-coordinate) to b[1] (top y-coordinate). While this may be intentional for top-to-bottom ordering within columns, this is inconsistent with the comment on line 249 'Sort top to bottom' which uses b[3] (bottom coordinate). Consider adding a comment explaining why columns are sorted by top coordinate while stripes use bottom coordinate.

Suggested change
if box[0] - prev_right > HORIZONTAL_GAP:
columns.append(sorted(current_column, key=lambda b: b[1]))
current_column = [box]
else:
current_column.append(box)
if box[0] - prev_right > HORIZONTAL_GAP:
# Note: We sort boxes within each column by their top y-coordinate (b[1]) for top-to-bottom reading order.
# This differs from stripes, which are sorted by bottom y-coordinate (b[3]).
# The use of b[1] here is intentional to ensure columns are read from top to bottom.
columns.append(sorted(current_column, key=lambda b: b[1]))
current_column = [box]
else:
current_column.append(box)
# As above, sort the last column by top y-coordinate (b[1]).

Copilot uses AI. Check for mistakes.
columns.append(sorted(current_column, key=lambda b: b[3]))
columns.append(sorted(current_column, key=lambda b: b[1]))
return columns


def compute_reading_order(boxes, vertical_gap: float = 12):
def compute_reading_order(boxes, joined_boxes, vectors, vertical_gap=12):
Copy link

Copilot AI Nov 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function signature added new parameters joined_boxes and vectors, but the docstring's Args section does not document these parameters. Add documentation for these new parameters to explain their purpose.

Copilot uses AI. Check for mistakes.
"""
Compute reading order of boxes delivered by PyMuPDF-Layout.

Expand All @@ -298,12 +330,12 @@ def compute_reading_order(boxes, vertical_gap: float = 12):
Returns:
list: List of boxes in reading order.
"""
# compute adequate vertical_gap based on the height of the union of bboxes
temp = pymupdf.EMPTY_RECT()
for b in boxes:
temp |= pymupdf.Rect(b[:4])
this_vertical_gap = vertical_gap * temp.height / 800
stripes = cluster_stripes(boxes, vertical_gap=this_vertical_gap)
stripes = cluster_stripes(
boxes,
joined_boxes,
vectors,
vertical_gap=vertical_gap,
)
ordered = []
for stripe in stripes:
columns = cluster_columns_in_stripe(stripe)
Expand All @@ -312,7 +344,7 @@ def compute_reading_order(boxes, vertical_gap: float = 12):
return ordered


def find_reading_order(boxes, vertical_gap: float = 36) -> list:
def find_reading_order(page_rect, blocks, boxes, vertical_gap: float = 12) -> list:
Copy link

Copilot AI Nov 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function signature changed significantly with new parameters page_rect and blocks, and the default vertical_gap changed from 36 to 12, but the docstring's Args section does not document these new parameters or explain the change in default value. Update the documentation to include page_rect and blocks parameters.

Copilot uses AI. Check for mistakes.
"""Given page layout information, return the boxes in reading order.

Args:
Expand All @@ -326,6 +358,9 @@ def find_reading_order(boxes, vertical_gap: float = 36) -> list:
List of boxes in reading order.
"""

# compute adequate vertical_gap based on the height of the page rectangle
this_vertical_gap = vertical_gap * page_rect.height / 800

def is_contained(inner, outer) -> bool:
"""Check if inner box is fully contained within outer box."""
return (
Expand Down Expand Up @@ -369,9 +404,28 @@ def filter_contained(boxes) -> list:
else:
body_boxes.append(box)

# bring body into reading order
ordered = compute_reading_order(body_boxes, vertical_gap=vertical_gap)
# compute joined boxes of body
joined_boxes = pymupdf.Rect(
min(b[0] for b in body_boxes),
min(b[1] for b in body_boxes),
max(b[2] for b in body_boxes),
max(b[3] for b in body_boxes),
)

# extract vectors contained in the TextPage
min_bbox_height = min(b[3] - b[1] for b in body_boxes)
vectors = [
pymupdf.Rect(b["bbox"])
for b in blocks
if b["bbox"][3] - b["bbox"][1] >= min_bbox_height and b["bbox"] in joined_boxes
Copy link

Copilot AI Nov 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The condition b["bbox"] in joined_boxes is checking if a tuple/list is 'in' a Rect object, which will always return False. This should use joined_boxes.contains(b["bbox"]) or pymupdf.Rect(b["bbox"]) in joined_boxes to properly check if the bbox is contained within the joined_boxes rectangle.

Suggested change
if b["bbox"][3] - b["bbox"][1] >= min_bbox_height and b["bbox"] in joined_boxes
if b["bbox"][3] - b["bbox"][1] >= min_bbox_height and pymupdf.Rect(b["bbox"]) in joined_boxes

Copilot uses AI. Check for mistakes.
]
# bring body into reading order
ordered = compute_reading_order(
body_boxes,
joined_boxes,
vectors,
vertical_gap=this_vertical_gap,
)
# Final full boxes list. We do simple sorts for non-body boxes.
final = (
sorted(page_headers, key=lambda r: (r[1], r[0]))
Expand All @@ -382,6 +436,8 @@ def filter_contained(boxes) -> list:


def simplify_vectors(vectors):
"""Join vectors that are horizontally adjacent and vertically aligned."""
Y_TOLERANCE = 1 # allowable top / bottom difference
new_vectors = []
if not vectors:
return new_vectors
Expand All @@ -390,8 +446,8 @@ def simplify_vectors(vectors):
last_v = new_vectors[-1]
if (
1
and abs(v["bbox"][1] - last_v["bbox"][1]) < 1
and abs(v["bbox"][3] - last_v["bbox"][3]) < 1
and abs(v["bbox"][1] - last_v["bbox"][1]) < Y_TOLERANCE
and abs(v["bbox"][3] - last_v["bbox"][3]) < Y_TOLERANCE
and v["bbox"][0] <= last_v["bbox"][2] + 1
):
# merge horizontally
Expand All @@ -408,7 +464,14 @@ def simplify_vectors(vectors):


def find_virtual_lines(page, table_bbox, words, vectors, link_rects):
"""Return virtual lines for a given table bbox."""
"""Return virtual lines for a given table bbox.

This utility looks for:
* horizontal non-stroke vectors and uses their top and bottom edges
as virtual lines. Should work for tables with alternating row colors.
* horizontal thin lines and uses their left x-coordinate as column
borders.
"""

def make_vertical(table_bbox, line_bbox, word_boxes):
# default top and bottom point of vertical line
Expand Down
2 changes: 1 addition & 1 deletion pymupdf4llm/pymupdf4llm/versions_file.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Generated file - do not edit.
MINIMUM_PYMUPDF_VERSION = (1, 26, 6)
VERSION = '0.1.9'
VERSION = '0.2.0'
2 changes: 1 addition & 1 deletion pymupdf4llm/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"Topic :: Utilities",
]

version = "0.1.9"
version = "0.2.0"
requires = ["pymupdf>=1.26.6", "tabulate"]

text = requires[0].split("=")[1]
Expand Down
Loading