Skip to content

Commit

Permalink
ENH: Add "layout" mode for text extraction (#2388)
Browse files Browse the repository at this point in the history
`PageObject.extract_text` gained a new `extraction_mode` parameter. The previous extraction behavior is now called "plain"; it aims at extracting text in a way that is useful for NLP or a Text-to-Speech (TTS) system.

The new `extraction_mode="layout"` aims at visually representing the PDF. This is useful for detecting/extracting tables.
  • Loading branch information
shartzog committed Jan 11, 2024
1 parent cfd8712 commit fc893d5
Show file tree
Hide file tree
Showing 22 changed files with 1,496 additions and 12 deletions.
20 changes: 19 additions & 1 deletion docs/user/extract-text.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ page = reader.pages[0]
print(page.extract_text())
```

you can also choose to limit the text orientation you want to extract, e.g:
You can also choose to limit the text orientation you want to extract, e.g:

```python
# extract only text oriented up
Expand All @@ -20,6 +20,24 @@ print(page.extract_text(0))
print(page.extract_text((0, 90)))
```

You can also extract text in "layout" mode:

```python
# extract text in a fixed width format that closely adheres to the rendered
# layout in the source pdf
print(page.extract_text(extraction_mode="layout"))

# extract text preserving horizontal positioning without excess vertical
# whitespace (removes blank and "whitespace only" lines)
print(page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False))

# adjust horizontal spacing
print(page.extract_text(extraction_mode="layout", layout_mode_scale_weight=1.0))

# exclude (default) or include (as shown below) text rotated w.r.t. the page
print(page.extract_text(extraction_mode="layout", layout_mode_strip_rotated=False))
```

Refer to [extract\_text](../modules/PageObject.html#pypdf._page.PageObject.extract_text) for more details.

## Using a visitor
Expand Down
2 changes: 1 addition & 1 deletion pypdf/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def build_char_map_from_dict(

# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz
_default_fonts_space_width: Dict[str, int] = {
"/Courrier": 600,
"/Courier": 600,
"/Courier-Bold": 600,
"/Courier-BoldOblique": 600,
"/Courier-Oblique": 600,
Expand Down
133 changes: 133 additions & 0 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@

import math
import re
import sys
from decimal import Decimal
from pathlib import Path
from typing import (
Any,
Callable,
Expand All @@ -50,6 +52,7 @@
from ._protocols import PdfReaderProtocol, PdfWriterProtocol
from ._text_extraction import (
OrientationNotFoundError,
_layout_mode,
crlf_space_check,
handle_tj,
mult,
Expand Down Expand Up @@ -83,6 +86,12 @@
StreamObject,
)

if sys.version_info >= (3, 8):
from typing import Literal
else:
from typing_extensions import Literal


MERGE_CROP_BOX = "cropbox" # pypdf<=3.4.0 used 'trimbox'


Expand Down Expand Up @@ -1868,6 +1877,96 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
return output

def _layout_mode_fonts(self) -> Dict[str, _layout_mode.Font]:
    """
    Get fonts formatted for "layout" mode text extraction.

    The /Resources dictionary is looked up on the page itself and, failing
    that, on its ancestors (following "/Parent"). Each entry of its "/Font"
    dictionary is passed to ``build_char_map`` and wrapped in a
    ``_layout_mode.Font``. Indirect references found in the font dictionary
    (as direct values or as array members) are resolved through ``self.pdf``.

    Returns:
        Dict[str, Font]: dictionary of _layout_mode.Font instances keyed by
        font name. The dictionary is empty when no /Resources dictionary can
        be found on the page or its ancestors, when /Resources has no "/Font"
        entry, or when the page is not attached to a pdf.
    """
    fonts: Dict[str, _layout_mode.Font] = {}

    # Font retrieval logic adapted from pypdf.PageObject._extract_text()
    # /Resources can be inherited, so walk up the page tree until a node
    # declares it.
    node: Any = self
    while NameObject(PG.RESOURCES) not in node:
        if "/Parent" not in node:
            # Top of the tree reached without finding /Resources: there are
            # no fonts to report (previously this surfaced as a KeyError).
            return fonts
        node = node["/Parent"].get_object()
    resources_dict: Any = node[PG.RESOURCES]

    pdf = self.pdf
    if pdf is None or "/Font" not in resources_dict:
        return fonts

    for font_name in resources_dict["/Font"]:
        *cmap, font_dict_obj = build_char_map(font_name, 200.0, self)

        # Copy the font dictionary, dereferencing indirect objects one level
        # deep: direct values first, then the members of array values.
        font_dict: Dict[Any, Any] = {}
        for key, value in font_dict_obj.items():
            if isinstance(value, IndirectObject):
                font_dict[key] = pdf.get_object(value)
            elif isinstance(value, ArrayObject):
                font_dict[key] = [
                    pdf.get_object(item) if isinstance(item, IndirectObject) else item
                    for item in value
                ]
            else:
                font_dict[key] = value

        # mypy really sucks at unpacking
        fonts[font_name] = _layout_mode.Font(*cmap, font_dict)  # type: ignore[call-arg,arg-type]
    return fonts

def _layout_mode_text(
self,
space_vertically: bool = True,
scale_weight: float = 1.25,
strip_rotated: bool = True,
debug_path: Optional[Path] = None,
) -> str:
"""
Get text preserving fidelity to source PDF text layout.
Args:
space_vertically: include blank lines inferred from y distance + font
height. Defaults to True.
scale_weight: multiplier for string length when calculating weighted
average character width. Defaults to 1.25.
strip_rotated: Removes text that is rotated w.r.t. to the page from
layout mode output. Defaults to True.
debug_path (Path | None): if supplied, must target a directory.
creates the following files with debug information for layout mode
functions if supplied:
- fonts.json: output of self._layout_mode_fonts
- tjs.json: individual text render ops with corresponding transform matrices
- bts.json: text render ops left justified and grouped by BT/ET operators
- bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
Defaults to None.
Returns:
str: multiline string containing page text in a fixed width format that
closely adheres to the rendered layout in the source pdf.
"""
fonts = self._layout_mode_fonts()
if debug_path: # pragma: no cover
import json

debug_path.joinpath("fonts.json").write_text(
json.dumps(
fonts, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)
),
"utf-8",
)

ops = iter(
ContentStream(self["/Contents"].get_object(), self.pdf, "bytes").operations
)
bt_groups = _layout_mode.text_show_operations(
ops, fonts, strip_rotated, debug_path
)

if not bt_groups:
return ""

ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)

char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)

return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically)

def extract_text(
self,
*args: Any,
Expand All @@ -1876,6 +1975,8 @@ def extract_text(
visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
extraction_mode: Literal["plain", "layout"] = "plain",
**kwargs: Any,
) -> str:
"""
Locate all text drawing commands, in the order they are provided in the
Expand Down Expand Up @@ -1913,10 +2014,42 @@ def extract_text(
text matrix, font-dictionary and font-size.
The font-dictionary may be None in case of unknown fonts.
If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
"layout" for experimental layout mode functionality.
NOTE: orientations, space_width, and visitor_* parameters are NOT respected
in "layout" mode.
KwArgs:
layout_mode_space_vertically (bool): include blank lines inferred from
y distance + font height. Defaults to True.
layout_mode_scale_weight (float): multiplier for string length when calculating
weighted average character width. Defaults to 1.25.
layout_mode_strip_rotated (bool): layout mode does not support rotated text.
By default, text that is rotated w.r.t. the page is removed from layout
mode output. Set to False to include rotated text anyway. If rotated text
is discovered, layout will be degraded and a warning will result.
Defaults to True.
layout_mode_debug_path (Path | None): if supplied, must target a directory.
creates the following files with debug information for layout mode
functions if supplied:
- fonts.json: output of self._layout_mode_fonts
- tjs.json: individual text render ops with corresponding transform matrices
- bts.json: text render ops left justified and grouped by BT/ET operators
- bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
Returns:
The extracted text
"""
if extraction_mode not in ["plain", "layout"]:
raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")
if extraction_mode == "layout":
return self._layout_mode_text(
space_vertically=kwargs.get("layout_mode_space_vertically", True),
scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
debug_path=kwargs.get("layout_mode_debug_path", None),
)
if len(args) >= 1:
if isinstance(args[0], str):
if len(args) >= 3:
Expand Down
1 change: 1 addition & 0 deletions pypdf/_text_extraction/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
CUSTOM_RTL_MIN: int = -1
CUSTOM_RTL_MAX: int = -1
CUSTOM_RTL_SPECIAL_CHARS: List[int] = []
LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5


class OrientationNotFoundError(Exception):
Expand Down
16 changes: 16 additions & 0 deletions pypdf/_text_extraction/_layout_mode/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
"""Layout mode text extraction extension for pypdf"""
# Re-export the layout mode helpers from the private submodules so they can be
# reached directly through this package.
from ._fixed_width_page import (
fixed_char_width,
fixed_width_page,
text_show_operations,
y_coordinate_groups,
)
from ._font import Font

# Names exported by this package.
__all__ = [
"fixed_char_width",
"fixed_width_page",
"text_show_operations",
"y_coordinate_groups",
"Font",
]
Loading

0 comments on commit fc893d5

Please sign in to comment.