Skip to content

Commit

Permalink
FIX: Extract text in layout mode without finding resources (#2555)
Browse files Browse the repository at this point in the history
Closes #2533
  • Loading branch information
pubpub-zz committed Mar 29, 2024
1 parent 253cde4 commit e35df5a
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 21 deletions.
44 changes: 23 additions & 21 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1891,28 +1891,30 @@ def _layout_mode_fonts(self) -> Dict[str, _layout_mode.Font]:
"""
# Font retrieval logic adapted from pypdf.PageObject._extract_text()
objr: Any = self
while NameObject(PG.RESOURCES) not in objr:
objr = objr["/Parent"].get_object()
resources_dict: Any = objr[PG.RESOURCES]
fonts: Dict[str, _layout_mode.Font] = {}
if "/Font" in resources_dict and self.pdf is not None:
for font_name in resources_dict["/Font"]:
*cmap, font_dict_obj = build_char_map(font_name, 200.0, self)
font_dict = {
k: self.pdf.get_object(v)
if isinstance(v, IndirectObject)
else [
self.pdf.get_object(_v)
if isinstance(_v, IndirectObject)
else _v
for _v in v
]
if isinstance(v, ArrayObject)
else v
for k, v in font_dict_obj.items()
}
# mypy really sucks at unpacking
fonts[font_name] = _layout_mode.Font(*cmap, font_dict) # type: ignore[call-arg,arg-type]
while objr is not None:
try:
resources_dict: Any = objr[PG.RESOURCES]
except KeyError:
resources_dict = {}
if "/Font" in resources_dict and self.pdf is not None:
for font_name in resources_dict["/Font"]:
*cmap, font_dict_obj = build_char_map(font_name, 200.0, self)
font_dict = {
k: v.get_object()
if isinstance(v, IndirectObject)
else [_v.get_object() for _v in v]
if isinstance(v, ArrayObject)
else v
for k, v in font_dict_obj.items()
}
# mypy really sucks at unpacking
fonts[font_name] = _layout_mode.Font(*cmap, font_dict) # type: ignore[call-arg,arg-type]
try:
objr = objr["/Parent"].get_object()
except KeyError:
objr = None

return fonts

def _layout_mode_text(
Expand Down
9 changes: 9 additions & 0 deletions tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -1272,3 +1272,12 @@ def test_get_page_showing_field():
writer._root_object["/AcroForm"]["/Fields"][-1]
)
] == []


@pytest.mark.enable_socket()
def test_extract_empty_page():
    """
    Regression test for #2533.

    Layout-mode text extraction on the second page of the sample
    document must return an empty string.
    """
    # Sample file attached to the issue; "iss2533.pdf" is the name handed to
    # the download helper alongside the URL.
    pdf_bytes = get_data_from_url(
        "https://github.com/py-pdf/pypdf/files/14718318/test.pdf",
        "iss2533.pdf",
    )
    second_page = PdfReader(BytesIO(pdf_bytes)).pages[1]
    extracted = second_page.extract_text(extraction_mode="layout")
    assert extracted == ""

0 comments on commit e35df5a

Please sign in to comment.