Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Rename resources deterministically in merge_page #1543

Merged
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 29 additions & 6 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
# POSSIBILITY OF SUCH DAMAGE.

import math
import uuid
import warnings
from decimal import Decimal
from typing import (
Expand Down Expand Up @@ -576,17 +575,41 @@ def _merge_resources(
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
new_res = DictionaryObject()
new_res.update(res1.get(resource, DictionaryObject()).get_object())

def compute_unique_key(base_key: str) -> Tuple[str, bool]:
"""Find a key that either doesn't already exist or has the same
value (indicated by the bool)"""
value = page2res.raw_get(base_key)
# try the current key first (e.g. "foo"), but otherwise iterate
# through "foo-0", "foo-1", etc. new_res can contain only finitely
# many keys, thus this'll eventually end, even if it's been crafted
# to be maximally annoying.
computed_key = base_key
idx = 0
while computed_key in new_res:
if new_res.raw_get(computed_key) == value:
# there's already a resource of this name, with the exact
# same value
return computed_key, True
computed_key = "%s-%d" % (key, idx)
huonw marked this conversation as resolved.
Show resolved Hide resolved
idx += 1
return computed_key, False

page2res = cast(
DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
)
rename_res = {}
for key in list(page2res.keys()):
if key in new_res and new_res.raw_get(key) != page2res.raw_get(key):
newname = NameObject(key + str(uuid.uuid4()))
for key in sorted(page2res.keys()):
unique_key, same_value = compute_unique_key(key)
newname = NameObject(unique_key)
if key != unique_key:
# we have to use a different name for this
rename_res[key] = newname

if not same_value:
# the value wasn't already recorded
new_res[newname] = page2res[key]
elif key not in new_res:
new_res[key] = page2res.raw_get(key)

return new_res, rename_res

@staticmethod
Expand Down
150 changes: 150 additions & 0 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from pypdf.errors import DeprecationError, PdfReadWarning
from pypdf.generic import (
ArrayObject,
ContentStream,
DictionaryObject,
FloatObject,
IndirectObject,
Expand Down Expand Up @@ -871,3 +872,152 @@ def test_no_resources():
page_one = reader.pages[0]
page_two = reader.pages[0]
page_one.merge_page(page_two)


@pytest.mark.parametrize(
("page1", "page2", "expected_result", "expected_renames"),
[
# simple cases:
pytest.param({}, {}, {}, {}, id="no resources"),
pytest.param(
{"/1": "/v1"},
{"/2": "/v2"},
{"/1": "/v1", "/2": "/v2"},
{},
id="no overlap",
),
pytest.param(
{"/x": "/v"}, {"/x": "/v"}, {"/x": "/v"}, {}, id="overlap, matching values"
),
pytest.param(
{"/x": "/v1"},
{"/x": "/v2"},
{"/x": "/v1", "/x-0": "/v2"},
{"/x": "/x-0"},
id="overlap, different values",
),
# carefully crafted names that match the renaming pattern:
pytest.param(
{"/x": "/v1", "/x-0": "/v1", "/x-1": "/v1"},
{"/x": "/v2"},
{
"/x": "/v1",
"/x-0": "/v1",
"/x-1": "/v1",
"/x-2": "/v2",
},
{"/x": "/x-2"},
id="crafted, different values",
),
pytest.param(
{"/x": "/v1", "/x-0": "/v1", "/x-1": "/v"},
{"/x": "/v"},
{"/x": "/v1", "/x-0": "/v1", "/x-1": "/v"},
{"/x": "/x-1"},
id="crafted, matching value in chain",
),
pytest.param(
{"/x": "/v1"},
{"/x": "/v2.1", "/x-0": "/v2.2"},
{"/x": "/v1", "/x-0": "/v2.1", "/x-0-0": "/v2.2"},
{"/x": "/x-0", "/x-0": "/x-0-0"},
id="crafted, overlaps with previous rename, different value",
),
pytest.param(
{"/x": "/v1"},
{"/x": "/v2", "/x-0": "/v2"},
{"/x": "/v1", "/x-0": "/v2"},
{"/x": "/x-0"},
id="crafted, overlaps with previous rename, matching value",
),
],
)
def test_merge_resources(page1, page2, expected_result, expected_renames):
# Arrange
page1 = DictionaryObject(
{
PG.RESOURCES: DictionaryObject(
{NameObject(k): NameObject(v) for k, v in page1.items()}
)
}
)
page2 = DictionaryObject(
{
PG.RESOURCES: DictionaryObject(
{NameObject(k): NameObject(v) for k, v in page2.items()}
)
}
)

# Act
result, renames = PageObject._merge_resources(page1, page2, PG.RESOURCES)

# Assert
assert result == expected_result
assert renames == expected_renames


def test_merge_page_resources_smoke_test():
# Arrange
page1 = PageObject.create_blank_page(width=100, height=100)
page2 = PageObject.create_blank_page(width=100, height=100)

NO = NameObject

# set up some dummy resources that overlap (or not) between the two pages (note, all the edge
# cases are tested in test_merge_resources)
props1 = page1[NO("/Resources")][NO("/Properties")] = DictionaryObject(
{
NO("/just1"): NO("/just1-value"),
NO("/overlap-matching"): NO("/overlap-matching-value"),
NO("/overlap-different"): NO("/overlap-different-value1"),
}
)
props2 = page2[NO("/Resources")][NO("/Properties")] = DictionaryObject(
{
NO("/just2"): NO("/just2-value"),
NO("/overlap-matching"): NO("/overlap-matching-value"),
NO("/overlap-different"): NO("/overlap-different-value2"),
}
)
# use these keys for some "operations", to validate renaming (the operand name doesn't matter)
contents1 = page1[NO("/Contents")] = ContentStream(None, None)
contents1.operations = [(ArrayObject(props1.keys()), "page1-contents")]
contents2 = page2[NO("/Contents")] = ContentStream(None, None)
contents2.operations = [(ArrayObject(props2.keys()), "page2-contents")]

expected_properties = {
"/just1": "/just1-value",
"/just2": "/just2-value",
"/overlap-matching": "/overlap-matching-value",
"/overlap-different": "/overlap-different-value1",
"/overlap-different-0": "/overlap-different-value2",
}
expected_operations = [
# no renaming
(ArrayObject(props1.keys()), b"page1-contents"),
# some renaming
(
ArrayObject(
[
NO("/just2"),
NO("/overlap-matching"),
NO("/overlap-different-0"),
]
),
b"page2-contents",
),
]

# Act
page1.merge_page(page2)

# Assert
assert page1[NO("/Resources")][NO("/Properties")] == expected_properties

relevant_operations = [
(op, name)
for op, name in page1.get_contents().operations
if name in (b"page1-contents", b"page2-contents")
]
assert relevant_operations == expected_operations