Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

PI: Check duplicate objects in writer._sweep_indirect_references #207

Merged
merged 2 commits into from
Jun 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 9 additions & 0 deletions PyPDF2/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ class (typically :class:`PdfReader<PyPDF2.PdfReader>`).
def __init__(self) -> None:
self._header = b"%PDF-1.3"
self._objects: List[Optional[PdfObject]] = [] # array of indirect objects
self._idnum_hash: Dict[bytes, int] = {}

# The root of our page tree node.
pages = DictionaryObject()
Expand Down Expand Up @@ -868,8 +869,16 @@ def _sweep_indirect_references(
if newobj is None:
try:
newobj = data.pdf.get_object(data)
hash_value = None
if newobj is not None:
hash_value = newobj.hash_value()
# Check if object is already added to pdf.
if hash_value in self._idnum_hash:
return IndirectObject(self._idnum_hash[hash_value], 0, self)
self._objects.append(None) # placeholder
idnum = len(self._objects)
if hash_value is not None:
self._idnum_hash[hash_value] = idnum
newobj_ido = IndirectObject(idnum, 0, self)
if data.pdf not in extern_map:
extern_map[data.pdf] = {}
Expand Down
21 changes: 20 additions & 1 deletion PyPDF2/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,12 @@

import codecs
import decimal
import hashlib
import logging
import re
import warnings
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast

from ._codecs import ( # noqa: rev_encoding
_pdfdoc_encoding,
Expand Down Expand Up @@ -77,6 +78,21 @@


class PdfObject:
# function for calculating a hash value
hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1

def hash_value_data(self) -> bytes:
    """
    Return the raw bytes that are fed into the hash function for this object.

    The default payload is the object's string form, encoded with the
    default codec of ``str.encode()``. Subclasses can override this method
    to hash a different payload.

    :return: the encoded string form of ``self``.
    """
    # Use str(self) rather than "%s" % self: the % operator treats a tuple
    # right-hand operand as the argument list, so a tuple-like object would
    # either raise TypeError or be formatted as its single element.
    return str(self).encode()

def hash_value(self) -> bytes:
    """
    Build the identifier used to recognise equal objects.

    The result has the form ``b"<ClassName>:<hexdigest>"``, where the digest
    is computed by ``self.hash_func`` over ``self.hash_value_data()``.
    Prefixing the class name keeps objects of different classes apart even
    when their hashed payload is identical.

    :return: the class name and hex digest joined by a colon, as bytes.
    """
    type_label = self.__class__.__name__
    payload = self.hash_value_data()
    digest = self.hash_func(payload).hexdigest()
    return f"{type_label}:{digest}".encode()

def get_object(self) -> Optional["PdfObject"]:
    """
    Resolve indirect references.

    This base implementation performs no lookup: it hands back the very
    object it was called on.

    :return: ``self``.
    """
    return self
Expand Down Expand Up @@ -1001,6 +1017,9 @@ def __init__(self) -> None:
self.__data: Optional[str] = None
self.decoded_self: Optional[DecodedStreamObject] = None

def hash_value_data(self) -> bytes:
    """
    Return the bytes hashed for this stream object.

    The payload is the inherited ``hash_value_data()`` result followed by
    the stream bytes from ``self._data``. Hashing ``self._data`` alone gives
    two streams with identical bytes the same hash even when their other
    attributes differ, so they would be treated as duplicates.

    :return: inherited hash payload concatenated with the stream bytes.
    """
    # NOTE(review): the class header is outside this view. This assumes the
    # enclosing class is a stream dictionary whose inherited string form
    # reflects its entries (filters, dimensions, ...) -- confirm.
    data = super().hash_value_data()
    data += b_(self._data)
    return data

@property
def decodedSelf(self) -> Optional["DecodedStreamObject"]: # pragma: no cover
deprecate_with_replacement("decodedSelf", "decoded_self")
Expand Down