From 97515fd7e282021006af647973a93158d469250a Mon Sep 17 00:00:00 2001 From: Huon Wilson Date: Mon, 9 Jan 2023 13:35:24 +1100 Subject: [PATCH 1/4] Rename resources deterministically in merge_page --- pypdf/_page.py | 35 +++++++++-- tests/test_page.py | 150 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+), 6 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 2a9656f22..c9de6bbe0 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -28,7 +28,6 @@ # POSSIBILITY OF SUCH DAMAGE. import math -import uuid import warnings from decimal import Decimal from typing import ( @@ -576,17 +575,41 @@ def _merge_resources( ) -> Tuple[Dict[str, Any], Dict[str, Any]]: new_res = DictionaryObject() new_res.update(res1.get(resource, DictionaryObject()).get_object()) + + def compute_unique_key(base_key: str) -> tuple[str, bool]: + """Find a key that either doesn't already exist or has the same + value (indicated by the bool)""" + value = page2res.raw_get(base_key) + # try the current key first (e.g. "foo"), but otherwise iterate + # through "foo-0", "foo-1", etc. new_res can contain only finitely + # many keys, thus this'll eventually end, even if it's been crafted + # to be maximally annoying. + computed_key = base_key + idx = 0 + while computed_key in new_res: + if new_res.raw_get(computed_key) == value: + # there's already a resource of this name, with the exact + # same value + return computed_key, True + computed_key = "%s-%d" % (key, idx) + idx += 1 + return computed_key, False + page2res = cast( DictionaryObject, res2.get(resource, DictionaryObject()).get_object() ) rename_res = {} - for key in list(page2res.keys()): - if key in new_res and new_res.raw_get(key) != page2res.raw_get(key): - newname = NameObject(key + str(uuid.uuid4())) + for key in sorted(page2res.keys()): + unique_key, same_value = compute_unique_key(key) + newname = NameObject(unique_key) + if key != unique_key: + # we have to use a different name for this rename_res[key] = newname + + if not same_value: + # the value wasn't already recorded new_res[newname] = page2res[key] - elif key not in new_res: - new_res[key] = page2res.raw_get(key) + return new_res, rename_res @staticmethod diff --git a/tests/test_page.py b/tests/test_page.py index b58d42ceb..7ddb8edc5 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -13,6 +13,7 @@ from pypdf.errors import DeprecationError, PdfReadWarning from pypdf.generic import ( ArrayObject, + ContentStream, DictionaryObject, FloatObject, IndirectObject, @@ -871,3 +872,152 @@ def test_no_resources(): page_one = reader.pages[0] page_two = reader.pages[0] page_one.merge_page(page_two) + + +@pytest.mark.parametrize( + ("page1", "page2", "expected_result", "expected_renames"), + [ + # simple cases: + pytest.param({}, {}, {}, {}, id="no resources"), + pytest.param( + {"/1": "/v1"}, + {"/2": "/v2"}, + {"/1": "/v1", "/2": "/v2"}, + {}, + id="no overlap", + ), + pytest.param( + {"/x": "/v"}, {"/x": "/v"}, {"/x": "/v"}, {}, id="overlap, matching values" + ), + pytest.param( + {"/x": "/v1"}, + {"/x": "/v2"}, + {"/x": "/v1", "/x-0": "/v2"}, + {"/x": "/x-0"}, + id="overlap, different values", + ), + # carefully crafted names that match the renaming pattern: + pytest.param( + {"/x": "/v1", "/x-0": "/v1", "/x-1": "/v1"}, + {"/x": "/v2"}, + { + "/x": "/v1", + "/x-0": "/v1", + "/x-1": "/v1", + "/x-2": "/v2", + }, + {"/x": "/x-2"}, + id="crafted, different values", + ), + pytest.param( + {"/x": "/v1", "/x-0": "/v1", "/x-1": "/v"}, + {"/x": "/v"}, + {"/x": "/v1", "/x-0": "/v1", "/x-1": "/v"}, + {"/x": "/x-1"}, + id="crafted, matching value in chain", + ), + pytest.param( + {"/x": "/v1"}, + {"/x": "/v2.1", "/x-0": "/v2.2"}, + {"/x": "/v1", "/x-0": "/v2.1", "/x-0-0": "/v2.2"}, + {"/x": "/x-0", "/x-0": "/x-0-0"}, + id="crafted, overlaps with previous rename, different value", + ), + pytest.param( + {"/x": "/v1"}, + {"/x": "/v2", "/x-0": "/v2"}, + {"/x": "/v1", "/x-0": "/v2"}, + {"/x": "/x-0"}, + id="crafted, overlaps with previous rename, matching value", + ), + ], +) +def test_merge_resources(page1, page2, expected_result, expected_renames): + # Arrange + page1 = DictionaryObject( + { + PG.RESOURCES: DictionaryObject( + {NameObject(k): NameObject(v) for k, v in page1.items()} + ) + } + ) + page2 = DictionaryObject( + { + PG.RESOURCES: DictionaryObject( + {NameObject(k): NameObject(v) for k, v in page2.items()} + ) + } + ) + + # Act + result, renames = PageObject._merge_resources(page1, page2, PG.RESOURCES) + + # Assert + assert result == expected_result + assert renames == expected_renames + + +def test_merge_page_resources_smoke_test(): + # Arrange + page1 = PageObject.create_blank_page(width=100, height=100) + page2 = PageObject.create_blank_page(width=100, height=100) + + NO = NameObject + + # set up some dummy resources that overlap (or not) between the two pages (note, all the edge + # cases are tested in test_merge_resources) + props1 = page1[NO("/Resources")][NO("/Properties")] = DictionaryObject( + { + NO("/just1"): NO("/just1-value"), + NO("/overlap-matching"): NO("/overlap-matching-value"), + NO("/overlap-different"): NO("/overlap-different-value1"), + } + ) + props2 = page2[NO("/Resources")][NO("/Properties")] = DictionaryObject( + { + NO("/just2"): NO("/just2-value"), + NO("/overlap-matching"): NO("/overlap-matching-value"), + NO("/overlap-different"): NO("/overlap-different-value2"), + } + ) + # use these keys for some "operations", to validate renaming (the operand name doesn't matter) + contents1 = page1[NO("/Contents")] = ContentStream(None, None) + contents1.operations = [(ArrayObject(props1.keys()), "page1-contents")] + contents2 = page2[NO("/Contents")] = ContentStream(None, None) + contents2.operations = [(ArrayObject(props2.keys()), "page2-contents")] + + expected_properties = { + "/just1": "/just1-value", + "/just2": "/just2-value", + "/overlap-matching": "/overlap-matching-value", + "/overlap-different": "/overlap-different-value1", + "/overlap-different-0": "/overlap-different-value2", + } + expected_operations = [ + # no renaming + (ArrayObject(props1.keys()), b"page1-contents"), + # some renaming + ( + ArrayObject( + [ + NO("/just2"), + NO("/overlap-matching"), + NO("/overlap-different-0"), + ] + ), + b"page2-contents", + ), + ] + + # Act + page1.merge_page(page2) + + # Assert + assert page1[NO("/Resources")][NO("/Properties")] == expected_properties + + relevant_operations = [ + (op, name) + for op, name in page1.get_contents().operations + if name in (b"page1-contents", b"page2-contents") + ] + assert relevant_operations == expected_operations From c7b2be6435492e1559214eb305fec3534a566661 Mon Sep 17 00:00:00 2001 From: Huon Wilson Date: Mon, 9 Jan 2023 14:24:07 +1100 Subject: [PATCH 2/4] Type indexing --- pypdf/_page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index c9de6bbe0..1fae11025 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -576,7 +576,7 @@ def _merge_resources( new_res = DictionaryObject() new_res.update(res1.get(resource, DictionaryObject()).get_object()) - def compute_unique_key(base_key: str) -> tuple[str, bool]: + def compute_unique_key(base_key: str) -> Tuple[str, bool]: """Find a key that either doesn't already exist or has the same value (indicated by the bool)""" value = page2res.raw_get(base_key) From 3a73155ba4fa458e547423c04e1791d9584a76bc Mon Sep 17 00:00:00 2001 From: Huon Wilson Date: Mon, 9 Jan 2023 16:11:19 +1100 Subject: [PATCH 3/4] f-strings Co-authored-by: Matthew Peveler --- pypdf/_page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 1fae11025..eca650e3e 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -591,7 +591,7 @@ def compute_unique_key(base_key: str) -> Tuple[str, bool]: # there's already a resource of this name, with the exact # same value return computed_key, True - computed_key = "%s-%d" % (key, idx) + computed_key = f"{key}-{idx}" idx += 1 return computed_key, False From 9a792c807139ac446fc0aaac885a4d3107851249 Mon Sep 17 00:00:00 2001 From: Huon Wilson Date: Mon, 9 Jan 2023 16:12:34 +1100 Subject: [PATCH 4/4] Refer to the correct variable, oops --- pypdf/_page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index eca650e3e..8d8f286c1 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -591,7 +591,7 @@ def compute_unique_key(base_key: str) -> Tuple[str, bool]: # there's already a resource of this name, with the exact # same value return computed_key, True - computed_key = f"{key}-{idx}" + computed_key = f"{base_key}-{idx}" idx += 1 return computed_key, False