Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ROB: cope with loops in Fields tree #2656

Merged
merged 7 commits into from
May 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 30 additions & 32 deletions pypdf/_doc_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,17 +492,19 @@ def get_fields(
tree: Optional[TreeObject] = None,
retval: Optional[Dict[Any, Any]] = None,
fileobj: Optional[Any] = None,
stack: Optional[List[PdfObject]] = None,
) -> Optional[Dict[str, Any]]:
"""
Extract field data if this PDF contains interactive form fields.

The *tree* and *retval* parameters are for recursive use.
The *tree*, *retval*, and *stack* parameters are for recursive use.

Args:
tree:
retval:
tree: Current object to parse.
retval: In-progress dictionary of fields.
fileobj: A file object (usually a text file) to write
a report to on all interactive form fields found.
stack: List of already parsed objects.

Returns:
A dictionary where each key is a field name, and each
Expand All @@ -515,26 +517,23 @@ def get_fields(
if retval is None:
retval = {}
catalog = self.root_object
stack = []
# get the AcroForm tree
if CD.ACRO_FORM in catalog:
tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM])
else:
return None
if tree is None:
return retval
self._check_kids(tree, retval, fileobj)
for attr in field_attributes:
if attr in tree:
# Tree is a field
self._build_field(tree, retval, fileobj, field_attributes)
break

assert stack is not None
if "/Fields" in tree:
fields = cast(ArrayObject, tree["/Fields"])
for f in fields:
field = f.get_object()
self._build_field(field, retval, fileobj, field_attributes)

self._build_field(field, retval, fileobj, field_attributes, stack)
elif any(attr in tree for attr in field_attributes):
# Tree is a field
self._build_field(tree, retval, fileobj, field_attributes, stack)
return retval

def _get_qualified_field_name(self, parent: DictionaryObject) -> str:
Expand All @@ -557,25 +556,11 @@ def _build_field(
retval: Dict[Any, Any],
fileobj: Any,
field_attributes: Any,
stack: List[PdfObject],
) -> None:
self._check_kids(field, retval, fileobj)
try:
key = cast(str, field["/TM"])
except KeyError:
try:
if "/Parent" in field:
key = (
self._get_qualified_field_name(
cast(DictionaryObject, field["/Parent"])
)
+ "."
)
else:
key = ""
key += cast(str, field["/T"])
except KeyError:
# Ignore no-name field for now
return
if all(attr not in field for attr in ("/T", "/TM")):
return
key = self._get_qualified_field_name(field)
if fileobj:
self._write_field(fileobj, field, field_attributes)
fileobj.write("\n")
Expand Down Expand Up @@ -604,14 +589,27 @@ def _build_field(
and "/Off" in retval[key]["/_States_"]
):
del retval[key]["/_States_"][retval[key]["/_States_"].index("/Off")]
# check the kids last, to keep the fields in order
self._check_kids(field, retval, fileobj, stack)

def _check_kids(
self, tree: Union[TreeObject, DictionaryObject], retval: Any, fileobj: Any
self,
tree: Union[TreeObject, DictionaryObject],
retval: Any,
fileobj: Any,
stack: List[PdfObject],
) -> None:
if tree in stack:
logger_warning(
f"{self._get_qualified_field_name(tree)} already parsed", __name__
)
return
stack.append(tree)
if PA.KIDS in tree:
# recurse down the tree
for kid in tree[PA.KIDS]: # type: ignore
self.get_fields(kid.get_object(), retval, fileobj)
kid = kid.get_object()
self.get_fields(kid, retval, fileobj, stack)

def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None:
field_attributes_tuple = FA.attributes()
Expand Down
27 changes: 27 additions & 0 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1530,3 +1530,30 @@ def test_damaged_pdf():
assert (
exc.value.args[0] == "Expected object ID (21 0) does not match actual (-1 -1)."
)


@pytest.mark.enable_socket()
@pytest.mark.timeout(10)
def test_looping_form(caplog):
    """
    Check that form fields can be read from the sample file of issue 2643.

    The test first reads the fields through a reader and checks that a set
    of expected field names is present. It then duplicates one kid entry in
    a writer built from that reader and checks that the duplicate is reported
    as already parsed, while the resulting field names stay the same.
    """
    url = "https://github.com/py-pdf/pypdf/files/15306053/inheritance.pdf"
    name = "iss2643.pdf"
    pdf_bytes = get_data_from_url(url, name=name)
    reader = PdfReader(BytesIO(pdf_bytes), strict=False)
    flds = reader.get_fields()

    expected_names = (
        "Text10",
        "Text10.0.0.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1",
        "amt1.0",
        "amt1.1",
        "DSS#3pg3#0hgu7",
    )
    missing_names = [field_name for field_name in expected_names if field_name not in flds]
    assert not missing_names

    writer = PdfWriter(reader)
    # Append the first kid of the sixth top-level field to that field's
    # "/Kids" array a second time, so the same kid is listed twice.
    writer.root_object["/AcroForm"]["/Fields"][5]["/Kids"].append(
        writer.root_object["/AcroForm"]["/Fields"][5]["/Kids"][0]
    )
    flds2 = writer.get_fields()

    # The duplicated kid must be reported, and must not add or drop names.
    assert "Text68.0 already parsed" in caplog.text
    reader_names = list(flds.keys())
    writer_names = list(flds2.keys())
    assert reader_names == writer_names
Loading