diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py index 79f2ccbf..c4af2cc0 100644 --- a/pdf4llm/setup.py +++ b/pdf4llm/setup.py @@ -6,7 +6,7 @@ with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f: readme = f.read() -version = "0.1.8" +version = "0.1.9" classifiers = [ "Development Status :: 5 - Production/Stable", diff --git a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py index 34e1f392..c95377a7 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py +++ b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py @@ -8,10 +8,11 @@ import pymupdf import tabulate from pymupdf4llm.helpers.get_text_lines import get_raw_lines -from pymupdf4llm.helpers import utils, check_ocr +from pymupdf4llm.helpers import utils try: import cv2 + from pymupdf4llm.helpers import check_ocr except ImportError: cv2 = None diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py index e1a02da9..f3ef2c94 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py +++ b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py @@ -93,8 +93,12 @@ def sanitize_spans(line): if s0["bbox"].x1 + delta < s1["bbox"].x0 or ( s0["flags"], s0["char_flags"] & ~2, - s0["size"], - ) != (s1["flags"], s1["char_flags"] & ~2, s1["size"]): + # s0["size"], + ) != ( + s1["flags"], + s1["char_flags"] & ~2, + # s1["size"], + ): continue # no joining # We need to join bbox and text of two consecutive spans # On occasion, spans may also be duplicated. diff --git a/pymupdf4llm/pymupdf4llm/helpers/utils.py b/pymupdf4llm/pymupdf4llm/helpers/utils.py index 28fea125..955e6046 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/utils.py +++ b/pymupdf4llm/pymupdf4llm/helpers/utils.py @@ -221,11 +221,31 @@ def cluster_stripes(boxes, vertical_gap: float = 12): Returns: List of disjoint horizontal stripes. Each stripe is a list of boxes. 
""" + + def is_multi_column_layout(boxes): + sorted_boxes = sorted(boxes, key=lambda b: b[0]) + columns = [] + current_column = [sorted_boxes[0]] + for box in sorted_boxes[1:]: + prev_right = max([b[2] for b in current_column]) + if box[0] - prev_right > 3: + columns.append(current_column) + current_column = [box] + else: + current_column.append(box) + columns.append(current_column) + return len(columns) > 1 + # Sort top to bottom sorted_boxes = sorted(boxes, key=lambda b: b[1]) stripes = [] if not sorted_boxes: return stripes + + # Early exit for clean multi-column layouts + if is_multi_column_layout(sorted_boxes): + return [boxes] + current_stripe = [sorted_boxes[0]] for box in sorted_boxes[1:]: @@ -257,7 +277,7 @@ def cluster_columns_in_stripe(stripe: list): for box in sorted_boxes[1:]: prev_right = max([b[2] for b in current_column]) - if box[0] - prev_right >= -1: + if box[0] - prev_right > 1: columns.append(sorted(current_column, key=lambda b: b[3])) current_column = [box] else: @@ -292,14 +312,15 @@ def compute_reading_order(boxes, vertical_gap: float = 12): return ordered -def find_reading_order(boxes, vertical_gap: float = 12) -> list: +def find_reading_order(boxes, vertical_gap: float = 36) -> list: """Given page layout information, return the boxes in reading order. Args: boxes: List of classified bounding boxes with class info as defined by pymupdf_layout: (x0, y0, x1, y1, "class"). vertical_gap: Minimum vertical gap to separate stripes. The default - value of 12 works well for most documents. + value of 36 works well for most documents. It roughly + corresponds to 2-3 text line heights. Returns: List of boxes in reading order. diff --git a/pymupdf4llm/pymupdf4llm/versions_file.py b/pymupdf4llm/pymupdf4llm/versions_file.py index 7068efa9..cad78b1f 100644 --- a/pymupdf4llm/pymupdf4llm/versions_file.py +++ b/pymupdf4llm/pymupdf4llm/versions_file.py @@ -1,3 +1,3 @@ # Generated file - do not edit. 
MINIMUM_PYMUPDF_VERSION = (1, 26, 6) -VERSION = '0.1.8' +VERSION = '0.1.9' diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py index a788de42..7b329f1c 100644 --- a/pymupdf4llm/setup.py +++ b/pymupdf4llm/setup.py @@ -14,7 +14,7 @@ "Topic :: Utilities", ] -version = "0.1.8" +version = "0.1.9" requires = ["pymupdf>=1.26.6", "tabulate"] text = requires[0].split("=")[1]