Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pdf4llm/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
readme = f.read()

version = "0.1.8"
version = "0.1.9"

classifiers = [
"Development Status :: 5 - Production/Stable",
Expand Down
3 changes: 2 additions & 1 deletion pymupdf4llm/pymupdf4llm/helpers/document_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@
import pymupdf
import tabulate
from pymupdf4llm.helpers.get_text_lines import get_raw_lines
from pymupdf4llm.helpers import utils, check_ocr
from pymupdf4llm.helpers import utils

try:
import cv2
from pymupdf4llm.helpers import check_ocr
except ImportError:
cv2 = None

Expand Down
8 changes: 6 additions & 2 deletions pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,12 @@ def sanitize_spans(line):
if s0["bbox"].x1 + delta < s1["bbox"].x0 or (
s0["flags"],
s0["char_flags"] & ~2,
s0["size"],
) != (s1["flags"], s1["char_flags"] & ~2, s1["size"]):
# s0["size"],
) != (
s1["flags"],
s1["char_flags"] & ~2,
# s1["size"],
):
continue # no joining
# We need to join bbox and text of two consecutive spans
# On occasion, spans may also be duplicated.
Expand Down
27 changes: 24 additions & 3 deletions pymupdf4llm/pymupdf4llm/helpers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,11 +221,31 @@ def cluster_stripes(boxes, vertical_gap: float = 12):
Returns:
List of disjoint horizontal stripes. Each stripe is a list of boxes.
"""

def is_multi_column_layout(boxes, min_gap: float = 3):
    """Tell whether the boxes split into more than one side-by-side column.

    Boxes are visited from left to right (ascending left edge, index 0).
    While walking, the right-most edge (index 2) seen so far is tracked.
    As soon as a box starts more than ``min_gap`` units to the right of
    that edge, a clean vertical gutter exists and the layout is
    considered multi-column.

    Args:
        boxes: Iterable of box sequences indexed as (x0, y0, x1, ...).
            Only indices 0 (left) and 2 (right) are read.
        min_gap: Horizontal gap that must be exceeded to count as a
            column separator. Defaults to 3, the previously hard-coded
            value.

    Returns:
        True if at least one horizontal gap wider than ``min_gap``
        separates the boxes, otherwise False. An empty input yields
        False instead of raising IndexError.
    """
    ordered = sorted(boxes, key=lambda b: b[0])
    if not ordered:
        return False

    # Running maximum of the right edges replaces re-scanning the whole
    # current column for every box (which made the loop quadratic).
    right_edge = ordered[0][2]
    for box in ordered[1:]:
        if box[0] - right_edge > min_gap:
            # First gutter found: there are at least two columns, and
            # nothing further can change the answer.
            return True
        if box[2] > right_edge:
            right_edge = box[2]
    return False

# Sort top to bottom
sorted_boxes = sorted(boxes, key=lambda b: b[1])
stripes = []
if not sorted_boxes:
return stripes

# Early exit for clean multi-column layouts
if is_multi_column_layout(sorted_boxes):
return [boxes]

current_stripe = [sorted_boxes[0]]

for box in sorted_boxes[1:]:
Expand Down Expand Up @@ -257,7 +277,7 @@ def cluster_columns_in_stripe(stripe: list):

for box in sorted_boxes[1:]:
prev_right = max([b[2] for b in current_column])
if box[0] - prev_right >= -1:
if box[0] - prev_right > 1:
columns.append(sorted(current_column, key=lambda b: b[3]))
current_column = [box]
else:
Expand Down Expand Up @@ -292,14 +312,15 @@ def compute_reading_order(boxes, vertical_gap: float = 12):
return ordered


def find_reading_order(boxes, vertical_gap: float = 12) -> list:
def find_reading_order(boxes, vertical_gap: float = 36) -> list:
"""Given page layout information, return the boxes in reading order.

Args:
boxes: List of classified bounding boxes with class info as defined
by pymupdf_layout: (x0, y0, x1, y1, "class").
vertical_gap: Minimum vertical gap to separate stripes. The default
value of 12 works well for most documents.
value of 36 works well for most documents. It roughly
corresponds to 2-3 text line heights.

Returns:
List of boxes in reading order.
Expand Down
2 changes: 1 addition & 1 deletion pymupdf4llm/pymupdf4llm/versions_file.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Generated file - do not edit.
MINIMUM_PYMUPDF_VERSION = (1, 26, 6)
VERSION = '0.1.8'
VERSION = '0.1.9'
2 changes: 1 addition & 1 deletion pymupdf4llm/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"Topic :: Utilities",
]

version = "0.1.8"
version = "0.1.9"
requires = ["pymupdf>=1.26.6", "tabulate"]

text = requires[0].split("=")[1]
Expand Down