ENH: Add "layout" mode for text extraction (#2388)

`PageObject.extract_text` gained a new `extraction_mode` parameter. The previous type of extraction is now called "plain"; it aims at extracting text in a way that is useful for NLP or a Text-to-Speech (TTS) system. The new `extraction_mode="layout"` aims at visually representing the PDF, which is useful for detecting/extracting tables.
py-pdf · Jan 11, 2024 · fc893d5 · fc893d5
1 parent cfd8712
commit fc893d5
Show file tree

Hide file tree

Showing 22 changed files with 1,496 additions and 12 deletions.
diff --git a/docs/user/extract-text.md b/docs/user/extract-text.md
@@ -10,7 +10,7 @@ page = reader.pages[0]
 print(page.extract_text())
 ```
 
-you can also choose to limit the text orientation you want to extract, e.g:
+You can also choose to limit the text orientation you want to extract, e.g:
 
 ```python
 # extract only text oriented up
@@ -20,6 +20,24 @@ print(page.extract_text(0))
 print(page.extract_text((0, 90)))
 ```
 
+You can also extract text in "layout" mode:
+
+```python
+# extract text in a fixed width format that closely adheres to the rendered
+# layout in the source pdf
+print(page.extract_text(extraction_mode="layout"))
+
+# extract text preserving horizontal positioning without excess vertical
+# whitespace (removes blank and "whitespace only" lines)
+print(page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False))
+
+# adjust horizontal spacing
+print(page.extract_text(extraction_mode="layout", layout_mode_scale_weight=1.0))
+
+# exclude (default) or include (as shown below) text rotated w.r.t. the page
+print(page.extract_text(extraction_mode="layout", layout_mode_strip_rotated=False))
+```
+
 Refer to [extract\_text](../modules/PageObject.html#pypdf._page.PageObject.extract_text) for more details.
 
 ## Using a visitor

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
@@ -127,7 +127,7 @@ def build_char_map_from_dict(
 
 # manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz
 _default_fonts_space_width: Dict[str, int] = {
-    "/Courrier": 600,
+    "/Courier": 600,
     "/Courier-Bold": 600,
     "/Courier-BoldOblique": 600,
     "/Courier-Oblique": 600,

diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -29,7 +29,9 @@
 
 import math
 import re
+import sys
 from decimal import Decimal
+from pathlib import Path
 from typing import (
     Any,
     Callable,
@@ -50,6 +52,7 @@
 from ._protocols import PdfReaderProtocol, PdfWriterProtocol
 from ._text_extraction import (
     OrientationNotFoundError,
+    _layout_mode,
     crlf_space_check,
     handle_tj,
     mult,
@@ -83,6 +86,12 @@
     StreamObject,
 )
 
+if sys.version_info >= (3, 8):
+    from typing import Literal
+else:
+    from typing_extensions import Literal
+
+
 MERGE_CROP_BOX = "cropbox"  # pypdf<=3.4.0 used 'trimbox'
 
 
@@ -1868,6 +1877,96 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
             visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
         return output
 
+    def _layout_mode_fonts(self) -> Dict[str, _layout_mode.Font]:
+        """
+        Get the fonts of the page resources formatted for "layout" mode text extraction.
+
+        Returns:
+            Dict[str, Font]: dictionary of _layout_mode.Font instances keyed by font name
+        """
+        # Font retrieval logic adapted from pypdf.PageObject._extract_text()
+        objr: Any = self
+        while NameObject(PG.RESOURCES) not in objr:  # climb the /Parent chain until a node with resources is found
+            objr = objr["/Parent"].get_object()
+        resources_dict: Any = objr[PG.RESOURCES]
+        fonts: Dict[str, _layout_mode.Font] = {}
+        if "/Font" in resources_dict and self.pdf is not None:  # otherwise the returned dict stays empty
+            for font_name in resources_dict["/Font"]:
+                *cmap, font_dict_obj = build_char_map(font_name, 200.0, self)  # last item: font dictionary object
+                font_dict = {  # resolve IndirectObject values and IndirectObject items of ArrayObject values
+                    k: self.pdf.get_object(v)
+                    if isinstance(v, IndirectObject)
+                    else [
+                        self.pdf.get_object(_v)
+                        if isinstance(_v, IndirectObject)
+                        else _v
+                        for _v in v
+                    ]
+                    if isinstance(v, ArrayObject)
+                    else v
+                    for k, v in font_dict_obj.items()
+                }
+                # mypy cannot follow the star-unpacked cmap values, hence the ignore below
+                fonts[font_name] = _layout_mode.Font(*cmap, font_dict)  # type: ignore[call-arg,arg-type]
+        return fonts
+
+    def _layout_mode_text(
+        self,
+        space_vertically: bool = True,
+        scale_weight: float = 1.25,
+        strip_rotated: bool = True,
+        debug_path: Optional[Path] = None,
+    ) -> str:
+        """
+        Get text preserving fidelity to source PDF text layout.
+
+        Args:
+            space_vertically: include blank lines inferred from y distance + font
+                height. Defaults to True.
+            scale_weight: multiplier for string length when calculating weighted
+                average character width. Defaults to 1.25.
+            strip_rotated: Removes text that is rotated w.r.t. the page from
+                layout mode output. Defaults to True.
+            debug_path (Path | None): if supplied, must target a directory.
+                Creates the following files with debug information for layout mode
+                functions:
+                  - fonts.json: output of self._layout_mode_fonts
+                  - tjs.json: individual text render ops with corresponding transform matrices
+                  - bts.json: text render ops left justified and grouped by BT/ET operators
+                  - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
+                Defaults to None.
+
+        Returns:
+            str: multiline string containing page text in a fixed width format that
+                closely adheres to the rendered layout in the source PDF.
+        """
+        fonts = self._layout_mode_fonts()
+        if debug_path:  # pragma: no cover
+            import json
+
+            debug_path.joinpath("fonts.json").write_text(
+                json.dumps(  # values json cannot encode go through to_dict if the object has it, else str
+                    fonts, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)  # NOTE(review): a bound to_dict would receive x as an extra argument - confirm
+                ),
+                "utf-8",
+            )
+
+        ops = iter(  # iterator over the operations of the page's /Contents stream
+            ContentStream(self["/Contents"].get_object(), self.pdf, "bytes").operations
+        )
+        bt_groups = _layout_mode.text_show_operations(
+            ops, fonts, strip_rotated, debug_path
+        )
+
+        if not bt_groups:  # text_show_operations returned nothing: the page text is empty
+            return ""
+
+        ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)
+
+        char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)
+
+        return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically)
+
     def extract_text(
         self,
         *args: Any,
@@ -1876,6 +1975,8 @@ def extract_text(
         visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
         visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
         visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
+        extraction_mode: Literal["plain", "layout"] = "plain",
+        **kwargs: Any,
     ) -> str:
         """
         Locate all text drawing commands, in the order they are provided in the
@@ -1913,10 +2014,42 @@ def extract_text(
                 text matrix, font-dictionary and font-size.
                 The font-dictionary may be None in case of unknown fonts.
                 If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
+            extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
+                "layout" for experimental layout mode functionality.
+                NOTE: orientations, space_width, and visitor_* parameters are NOT respected
+                in "layout" mode.
+
+        KwArgs:
+            layout_mode_space_vertically (bool): include blank lines inferred from
+                y distance + font height. Defaults to True.
+            layout_mode_scale_weight (float): multiplier for string length when calculating
+                weighted average character width. Defaults to 1.25.
+            layout_mode_strip_rotated (bool): layout mode does not support rotated text.
+                When True, text that is rotated w.r.t. the page is removed from the
+                layout mode output. Set to False to include rotated text anyway; if
+                rotated text is discovered, layout will be degraded and a warning
+                will result. Defaults to True.
+            layout_mode_debug_path (Path | None): if supplied, must target a directory.
+                Defaults to None. When supplied, the following files with debug
+                information for layout mode functions are created:
+
+                  - fonts.json: output of self._layout_mode_fonts
+                  - tjs.json: individual text render ops with corresponding transform matrices
+                  - bts.json: text render ops left justified and grouped by BT/ET operators
+                  - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
 
         Returns:
             The extracted text
         """
+        if extraction_mode not in ["plain", "layout"]:
+            raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")
+        if extraction_mode == "layout":
+            return self._layout_mode_text(
+                space_vertically=kwargs.get("layout_mode_space_vertically", True),
+                scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
+                strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
+                debug_path=kwargs.get("layout_mode_debug_path", None),
+            )
         if len(args) >= 1:
             if isinstance(args[0], str):
                 if len(args) >= 3:

diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
@@ -12,6 +12,7 @@
 CUSTOM_RTL_MIN: int = -1
 CUSTOM_RTL_MAX: int = -1
 CUSTOM_RTL_SPECIAL_CHARS: List[int] = []
+LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5
 
 
 class OrientationNotFoundError(Exception):

diff --git a/pypdf/_text_extraction/_layout_mode/__init__.py b/pypdf/_text_extraction/_layout_mode/__init__.py
@@ -0,0 +1,16 @@
+"""Layout mode text extraction extension for pypdf"""
+from ._fixed_width_page import (
+    fixed_char_width,
+    fixed_width_page,
+    text_show_operations,
+    y_coordinate_groups,
+)
+from ._font import Font
+
+__all__ = [
+    "fixed_char_width",
+    "fixed_width_page",
+    "text_show_operations",
+    "y_coordinate_groups",
+    "Font",
+]