Skip to content

Commit

Permalink
BUG: Wrong page inserted when PdfMerger.merge is done
Browse files Browse the repository at this point in the history
Caused-by: #207

Why it wasn't detected by the tests: We don't have any tests that check
for the correct result of a merge. We just check for exceptions

How we prevent it in future: Unit test was added

Risk of the fix:
- We will have bigger file sizes again as #207 was effectively reverted
- We will need to adjust this test if we change the way we write PDFs

Closes: #1062
  • Loading branch information
MartinThoma committed Jul 5, 2022
1 parent 1e9c4dd commit 5beb76e
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 8 deletions.
8 changes: 0 additions & 8 deletions PyPDF2/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -913,16 +913,8 @@ def _sweep_indirect_references(
if newobj is None:
try:
newobj = data.pdf.get_object(data)
hash_value = None
if newobj is not None:
hash_value = newobj.hash_value()
# Check if object is already added to pdf.
if hash_value in self._idnum_hash:
return IndirectObject(self._idnum_hash[hash_value], 0, self)
self._objects.append(None) # placeholder
idnum = len(self._objects)
if hash_value is not None:
self._idnum_hash[hash_value] = idnum
newobj_ido = IndirectObject(idnum, 0, self)
if data.pdf not in extern_map:
extern_map[data.pdf] = {}
Expand Down
Binary file not shown.
34 changes: 34 additions & 0 deletions tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,9 @@ def test_merge_with_warning(url, name):
):
merger.write("tmp.merged.pdf")

# Cleanup
os.remove("tmp.merged.pdf")


@pytest.mark.parametrize(
("url", "name"),
Expand All @@ -261,6 +264,9 @@ def test_merge(url, name):
merger.append(reader)
merger.write("tmp.merged.pdf")

# Cleanup
os.remove("tmp.merged.pdf")


@pytest.mark.parametrize(
("url", "name"),
Expand Down Expand Up @@ -335,3 +341,31 @@ def test_scale_rectangle_indirect_object():

for page in reader.pages:
page.scale(sx=2, sy=3)


def test_merge_output():
# Arrange
base = os.path.join(RESOURCE_ROOT, "Seige_of_Vicksburg_Sample_OCR.pdf")
crazy = os.path.join(RESOURCE_ROOT, "crazyones.pdf")
expected = os.path.join(
RESOURCE_ROOT, "Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf"
)

# Act
merger = PdfMerger(strict=True)
with pytest.warns(PdfReadWarning):
merger.append(base)
merger.merge(1, crazy)
stream = BytesIO()
merger.write(stream)

# Assert
stream.seek(0)
actual = stream.read()
with open(expected, "rb") as fp:
expected_data = fp.read()
if actual != expected_data:
# See https://github.com/pytest-dev/pytest/issues/9124
assert (
False
), f"len(actual) = {len(actual):,} vs len(expected) = {len(expected_data):,}"

0 comments on commit 5beb76e

Please sign in to comment.