Skip to content

Commit

Permalink
Check for duplicate objects when sweeping indirect references.
Browse files Browse the repository at this point in the history
  • Loading branch information
Harry Karvonen committed Jun 27, 2022
1 parent 53efd73 commit 9fab63a
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 1 deletion.
6 changes: 6 additions & 0 deletions PyPDF2/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ class (typically :class:`PdfReader<PyPDF2.PdfReader>`).
def __init__(self) -> None:
self._header = b"%PDF-1.3"
self._objects: List[Optional[PdfObject]] = [] # array of indirect objects
self._idnum_hash: Dict[str, int] = {}

# The root of our page tree node.
pages = DictionaryObject()
Expand Down Expand Up @@ -868,8 +869,13 @@ def _sweep_indirect_references(
if newobj is None:
try:
newobj = data.pdf.get_object(data)
hash_value = newobj.hash_value()
# Check if object is already added to pdf.
if hash_value in self._idnum_hash:
return IndirectObject(self._idnum_hash[hash_value], 0, self)
self._objects.append(None) # placeholder
idnum = len(self._objects)
self._idnum_hash[hash_value] = idnum
newobj_ido = IndirectObject(idnum, 0, self)
if data.pdf not in extern_map:
extern_map[data.pdf] = {}
Expand Down
21 changes: 20 additions & 1 deletion PyPDF2/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,12 @@

import codecs
import decimal
import hashlib
import logging
import re
import warnings
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast

from ._codecs import ( # noqa: rev_encoding
_pdfdoc_encoding,
Expand Down Expand Up @@ -77,6 +78,21 @@


class PdfObject:
# Hash constructor used by hash_value(): it is called with the bytes returned
# by hash_value_data() and its result's hexdigest() is embedded in the hash
# value. Defaults to hashlib.sha1.
hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1

def hash_value_data(self) -> bytes:
    """Return the bytes that are fed to the hash function for this object.

    The default implementation renders the object with ``%s`` string
    formatting and encodes the resulting text with ``str.encode()``.
    """
    rendered = "%s" % self
    return rendered.encode()

def hash_value(self) -> bytes:
    """Return an identifying hash for this object as bytes.

    The value has the form ``<class name>:<hex digest>``, where the digest
    is produced by ``hash_func`` over the bytes from ``hash_value_data()``.
    Prefixing the class name keeps objects of different classes with the
    same payload from sharing a hash value.
    """
    class_name = self.__class__.__name__
    digest = self.hash_func(self.hash_value_data()).hexdigest()
    return ":".join((class_name, digest)).encode()

def get_object(self) -> Optional["PdfObject"]:
    """Resolve indirect references.

    This implementation returns the object itself unchanged.
    """
    return self
Expand Down Expand Up @@ -1001,6 +1017,9 @@ def __init__(self) -> None:
self.__data: Optional[str] = None
self.decoded_self: Optional[DecodedStreamObject] = None

def hash_value_data(self) -> bytes:
    """Return the bytes to hash for this object.

    Uses the object's ``_data`` passed through the ``b_`` helper, rather
    than the object's string rendering.
    """
    payload = self._data
    return b_(payload)

@property
def decodedSelf(self) -> Optional["DecodedStreamObject"]: # pragma: no cover
deprecate_with_replacement("decodedSelf", "decoded_self")
Expand Down

0 comments on commit 9fab63a

Please sign in to comment.