diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py
index 84e99208a..0baab4fc7 100644
--- a/pypdf/_doc_common.py
+++ b/pypdf/_doc_common.py
@@ -492,17 +492,19 @@ def get_fields(
         tree: Optional[TreeObject] = None,
         retval: Optional[Dict[Any, Any]] = None,
         fileobj: Optional[Any] = None,
+        stack: Optional[List[PdfObject]] = None,
     ) -> Optional[Dict[str, Any]]:
         """
         Extract field data if this PDF contains interactive form fields.
 
-        The *tree* and *retval* parameters are for recursive use.
+        The *tree*, *retval* and *stack* parameters are for recursive use.
 
         Args:
-            tree:
-            retval:
+            tree: Current object to parse.
+            retval: In-progress dictionary of fields.
             fileobj: A file object (usually a text file) to write
                 a report to on all interactive form fields found.
+            stack: List of already parsed objects.
 
         Returns:
             A dictionary where each key is a field name, and each
@@ -515,6 +517,7 @@ def get_fields(
         if retval is None:
             retval = {}
             catalog = self.root_object
+            stack = []
             # get the AcroForm tree
             if CD.ACRO_FORM in catalog:
                 tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM])
@@ -522,19 +525,15 @@ def get_fields(
                 return None
         if tree is None:
             return retval
-        self._check_kids(tree, retval, fileobj)
-        for attr in field_attributes:
-            if attr in tree:
-                # Tree is a field
-                self._build_field(tree, retval, fileobj, field_attributes)
-                break
-
+        stack = [] if stack is None else stack
         if "/Fields" in tree:
             fields = cast(ArrayObject, tree["/Fields"])
             for f in fields:
                 field = f.get_object()
-                self._build_field(field, retval, fileobj, field_attributes)
-
+                self._build_field(field, retval, fileobj, field_attributes, stack)
+        elif any(attr in tree for attr in field_attributes):
+            # Tree is a field
+            self._build_field(tree, retval, fileobj, field_attributes, stack)
         return retval
 
     def _get_qualified_field_name(self, parent: DictionaryObject) -> str:
@@ -557,25 +556,11 @@ def _build_field(
         retval: Dict[Any, Any],
         fileobj: Any,
         field_attributes: Any,
+        stack: List[PdfObject],
     ) -> None:
-        self._check_kids(field, retval, fileobj)
-        try:
-            key = cast(str, field["/TM"])
-        except KeyError:
-            try:
-                if "/Parent" in field:
-                    key = (
-                        self._get_qualified_field_name(
-                            cast(DictionaryObject, field["/Parent"])
-                        )
-                        + "."
-                    )
-                else:
-                    key = ""
-                key += cast(str, field["/T"])
-            except KeyError:
-                # Ignore no-name field for now
-                return
+        if all(attr not in field for attr in ("/T", "/TM")):
+            return
+        key = self._get_qualified_field_name(field)
         if fileobj:
             self._write_field(fileobj, field, field_attributes)
             fileobj.write("\n")
@@ -604,14 +589,27 @@ def _build_field(
                 and "/Off" in retval[key]["/_States_"]
             ):
                 del retval[key]["/_States_"][retval[key]["/_States_"].index("/Off")]
+        # process kids last so the parent is recorded before its kids
+        self._check_kids(field, retval, fileobj, stack)
 
     def _check_kids(
-        self, tree: Union[TreeObject, DictionaryObject], retval: Any, fileobj: Any
+        self,
+        tree: Union[TreeObject, DictionaryObject],
+        retval: Any,
+        fileobj: Any,
+        stack: List[PdfObject],
     ) -> None:
+        if tree in stack:
+            logger_warning(
+                f"{self._get_qualified_field_name(tree)} already parsed", __name__
+            )
+            return
+        stack.append(tree)
         if PA.KIDS in tree:
             # recurse down the tree
             for kid in tree[PA.KIDS]:  # type: ignore
-                self.get_fields(kid.get_object(), retval, fileobj)
+                kid = kid.get_object()
+                self.get_fields(kid, retval, fileobj, stack)
 
     def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None:
         field_attributes_tuple = FA.attributes()
diff --git a/tests/test_reader.py b/tests/test_reader.py
index 83b61bc59..4557270bb 100644
--- a/tests/test_reader.py
+++ b/tests/test_reader.py
@@ -1530,3 +1530,30 @@ def test_damaged_pdf():
     assert (
         exc.value.args[0] == "Expected object ID (21 0) does not match actual (-1 -1)."
     )
+
+
+@pytest.mark.enable_socket()
+@pytest.mark.timeout(10)
+def test_looping_form(caplog):
+    """Cf iss 2643"""
+    url = "https://github.com/py-pdf/pypdf/files/15306053/inheritance.pdf"
+    name = "iss2643.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=False)
+    flds = reader.get_fields()
+    assert all(
+        x in flds
+        for x in (
+            "Text10",
+            "Text10.0.0.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1",
+            "amt1.0",
+            "amt1.1",
+            "DSS#3pg3#0hgu7",
+        )
+    )
+    writer = PdfWriter(reader)
+    writer.root_object["/AcroForm"]["/Fields"][5]["/Kids"].append(
+        writer.root_object["/AcroForm"]["/Fields"][5]["/Kids"][0]
+    )
+    flds2 = writer.get_fields()
+    assert "Text68.0 already parsed" in caplog.text
+    assert list(flds.keys()) == list(flds2.keys())