From 125660148271e63e8283cbc9029afc53aa23e159 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 19 May 2024 15:14:48 +0200 Subject: [PATCH 1/7] ROB: cope with loops in Fields tree closes #2643 --- pypdf/_doc_common.py | 72 ++++++++++++++++++++++++-------------------- tests/test_reader.py | 19 ++++++++++++ 2 files changed, 59 insertions(+), 32 deletions(-) diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py index 84e99208a..200fea88b 100644 --- a/pypdf/_doc_common.py +++ b/pypdf/_doc_common.py @@ -492,17 +492,19 @@ def get_fields( tree: Optional[TreeObject] = None, retval: Optional[Dict[Any, Any]] = None, fileobj: Optional[Any] = None, + stack: Optional[List[PdfObject]] = None, ) -> Optional[Dict[str, Any]]: """ Extract field data if this PDF contains interactive form fields. - The *tree* and *retval* parameters are for recursive use. + The *tree*, *retval*, *stack* parameters are for recursive use. Args: - tree: - retval: + tree: current object parsed + retval: in progress list of fields fileobj: A file object (usually a text file) to write a report to on all interactive form fields found. + stack: list of object already parsed Returns: A dictionary where each key is a field name, and each @@ -515,6 +517,7 @@ def get_fields( if retval is None: retval = {} catalog = self.root_object + stack = [] # get the AcroForm tree if CD.ACRO_FORM in catalog: tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM]) @@ -522,19 +525,17 @@ def get_fields( return None if tree is None: return retval - self._check_kids(tree, retval, fileobj) - for attr in field_attributes: - if attr in tree: - # Tree is a field - self._build_field(tree, retval, fileobj, field_attributes) - break - + assert stack is not None if "/Fields" in tree: fields = cast(ArrayObject, tree["/Fields"]) for f in fields: field = f.get_object() - self._build_field(field, retval, fileobj, field_attributes) - + self._build_field(field, retval, fileobj, field_attributes, stack) + else: + self._check_kids(tree, retval, fileobj, stack) + if any(attr in tree for attr in field_attributes): + # Tree is a field + self._build_field(tree, retval, fileobj, field_attributes, stack) return retval def _get_qualified_field_name(self, parent: DictionaryObject) -> str: @@ -557,25 +558,12 @@ def _build_field( retval: Dict[Any, Any], fileobj: Any, field_attributes: Any, + stack: List[PdfObject], ) -> None: - self._check_kids(field, retval, fileobj) - try: - key = cast(str, field["/TM"]) - except KeyError: - try: - if "/Parent" in field: - key = ( - self._get_qualified_field_name( - cast(DictionaryObject, field["/Parent"]) - ) - + "." - ) - else: - key = "" - key += cast(str, field["/T"]) - except KeyError: - # Ignore no-name field for now - return + self._check_kids(field, retval, fileobj, stack) + if all(attr not in field for attr in ("/T", "/TM")): + return + key = self._get_qualified_field_name(field) if fileobj: self._write_field(fileobj, field, field_attributes) fileobj.write("\n") @@ -606,12 +594,32 @@ def _build_field( del retval[key]["/_States_"][retval[key]["/_States_"].index("/Off")] def _check_kids( - self, tree: Union[TreeObject, DictionaryObject], retval: Any, fileobj: Any + self, + tree: Union[TreeObject, DictionaryObject], + retval: Any, + fileobj: Any, + stack: List[PdfObject], ) -> None: + if tree in stack: + logger_warning( + f"{self._get_qualified_field_name(tree)} already parsed", __name__ + ) + return + stack.append(tree) if PA.KIDS in tree: # recurse down the tree for kid in tree[PA.KIDS]: # type: ignore - self.get_fields(kid.get_object(), retval, fileobj) + kid = kid.get_object() + if tree.indirect_reference != kid.get("/Parent", None): + logger_warning( + ( + f'"/Parent" of {self._get_qualified_field_name(kid)} ' + f"different from {self._get_qualified_field_name(tree)}" + ), + __name__, + ) + continue + self.get_fields(kid, retval, fileobj, stack) def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None: field_attributes_tuple = FA.attributes() diff --git a/tests/test_reader.py b/tests/test_reader.py index 83b61bc59..df4d962e8 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1530,3 +1530,22 @@ def test_damaged_pdf(): assert ( exc.value.args[0] == "Expected object ID (21 0) does not match actual (-1 -1)." ) + + +@pytest.mark.enable_socket() +def test_looping_form(): + """Cf iss 2643""" + url = "https://github.com/py-pdf/pypdf/files/15306053/inheritance.pdf" + name = "iss2643.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=False) + flds = reader.get_fields() + assert all( + x in flds + for x in ( + "Text10", + "Text10.0.0.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1", + "amt1.0", + "amt1.1", + "DSS#3pg3#0hgu7", + ) + ) From 5530c1db2f359f3ab078502c88db7710acb566ce Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 20 May 2024 09:59:44 +0200 Subject: [PATCH 2/7] refactor --- pypdf/_doc_common.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py index 200fea88b..1cd4b22a5 100644 --- a/pypdf/_doc_common.py +++ b/pypdf/_doc_common.py @@ -531,11 +531,9 @@ def get_fields( for f in fields: field = f.get_object() self._build_field(field, retval, fileobj, field_attributes, stack) - else: - self._check_kids(tree, retval, fileobj, stack) - if any(attr in tree for attr in field_attributes): - # Tree is a field - self._build_field(tree, retval, fileobj, field_attributes, stack) + elif any(attr in tree for attr in field_attributes): + # Tree is a field + self._build_field(tree, retval, fileobj, field_attributes, stack) return retval def _get_qualified_field_name(self, parent: DictionaryObject) -> str: @@ -560,7 +558,6 @@ def _build_field( field_attributes: Any, stack: List[PdfObject], ) -> None: - self._check_kids(field, retval, fileobj, stack) if all(attr not in field for attr in ("/T", "/TM")): return key = self._get_qualified_field_name(field) @@ -592,6 +589,8 @@ def _build_field( and "/Off" in retval[key]["/_States_"] ): del retval[key]["/_States_"][retval[key]["/_States_"].index("/Off")] + # at last for order + self._check_kids(field, retval, fileobj, stack) def _check_kids( self, @@ -610,15 +609,6 @@ def _check_kids( # recurse down the tree for kid in tree[PA.KIDS]: # type: ignore kid = kid.get_object() - if tree.indirect_reference != kid.get("/Parent", None): - logger_warning( - ( - f'"/Parent" of {self._get_qualified_field_name(kid)} ' - f"different from {self._get_qualified_field_name(tree)}" - ), - __name__, - ) - continue self.get_fields(kid, retval, fileobj, stack) def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None: From 00cb0b1f12f990fcb3ef4993b76d1454db693dae Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 20 May 2024 10:10:20 +0200 Subject: [PATCH 3/7] Update pypdf/_doc_common.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_doc_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py index 1cd4b22a5..91c508939 100644 --- a/pypdf/_doc_common.py +++ b/pypdf/_doc_common.py @@ -501,7 +501,7 @@ def get_fields( Args: tree: current object parsed - retval: in progress list of fields + retval: In-progress list of fields. fileobj: A file object (usually a text file) to write a report to on all interactive form fields found. stack: list of object already parsed From a417fcb2b75c60e1353fa97c19c61c7d8fa6bd00 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 20 May 2024 10:10:26 +0200 Subject: [PATCH 4/7] Update pypdf/_doc_common.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_doc_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py index 91c508939..871eeb40c 100644 --- a/pypdf/_doc_common.py +++ b/pypdf/_doc_common.py @@ -500,7 +500,7 @@ def get_fields( The *tree*, *retval*, *stack* parameters are for recursive use. Args: - tree: current object parsed + tree: Current object to parse. retval: In-progress list of fields. fileobj: A file object (usually a text file) to write a report to on all interactive form fields found. From 7e8cc75f5d93af5742442d389318a78d1ab9f054 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 20 May 2024 10:10:40 +0200 Subject: [PATCH 5/7] Update pypdf/_doc_common.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_doc_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py index 871eeb40c..0baab4fc7 100644 --- a/pypdf/_doc_common.py +++ b/pypdf/_doc_common.py @@ -504,7 +504,7 @@ def get_fields( retval: In-progress list of fields. fileobj: A file object (usually a text file) to write a report to on all interactive form fields found. - stack: list of object already parsed + stack: List of already parsed objects. Returns: A dictionary where each key is a field name, and each From 8d92bb040172105c2ed31fb146016946ac173d94 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 20 May 2024 10:21:10 +0200 Subject: [PATCH 6/7] coverage --- tests/test_reader.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/test_reader.py b/tests/test_reader.py index df4d962e8..0df3f2fcb 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1533,7 +1533,8 @@ def test_damaged_pdf(): @pytest.mark.enable_socket() -def test_looping_form(): +@pytest.mark.timeout(4) +def test_looping_form(caplog): """Cf iss 2643""" url = "https://github.com/py-pdf/pypdf/files/15306053/inheritance.pdf" name = "iss2643.pdf" @@ -1549,3 +1550,10 @@ def test_looping_form(): "DSS#3pg3#0hgu7", ) ) + writer = PdfWriter(reader) + writer.root_object["/AcroForm"]["/Fields"][5]["/Kids"].append( + writer.root_object["/AcroForm"]["/Fields"][5]["/Kids"][0] + ) + flds2 = writer.get_fields() + assert "Text68.0 already parsed" in caplog.text + assert list(flds.keys()) == list(flds2.keys()) From 72270a5a76de46971e62aee119a6f505e25e5230 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 20 May 2024 10:43:55 +0200 Subject: [PATCH 7/7] increase T.O. for loading --- tests/test_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_reader.py b/tests/test_reader.py index 0df3f2fcb..4557270bb 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1533,7 +1533,7 @@ def test_damaged_pdf(): @pytest.mark.enable_socket() -@pytest.mark.timeout(4) +@pytest.mark.timeout(10) def test_looping_form(caplog): """Cf iss 2643""" url = "https://github.com/py-pdf/pypdf/files/15306053/inheritance.pdf"