Skip to content

Commit

Permalink
Check for duplicate objects in _sweepIndirectReferences.
Browse files Browse the repository at this point in the history
Added PdfObject.hashValue() and specialized hashValue() overrides for
composite objects (arrays, indirect references, dictionaries and streams).
The hashValue() method is used to detect duplicate objects.
  • Loading branch information
Harry Karvonen committed Apr 19, 2022
1 parent 13e7cb3 commit bf16aac
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 0 deletions.
42 changes: 42 additions & 0 deletions PyPDF2/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@

import codecs
import decimal
import hashlib
import re
import warnings

Expand Down Expand Up @@ -62,6 +63,11 @@
IndirectPattern = re.compile(b_(r"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]"))


def _hashFuncForDeduplication():
    """Create the hash object used to fingerprint objects for deduplication.

    A new, empty ``hashlib`` SHA-1 hash object is returned on every call, so
    callers can feed it with ``update()`` without sharing state.
    """
    make_hasher = hashlib.sha1
    return make_hasher()


def readObject(stream, pdf):
tok = stream.read(1)
stream.seek(-1, 1) # reset to start
Expand Down Expand Up @@ -115,6 +121,12 @@ def getObject(self):
"""Resolves indirect references."""
return self

def hashValue(self):
    """Return the deduplication fingerprint of this object.

    The object is converted with ``b_()``, fed into a fresh hash object from
    ``_hashFuncForDeduplication()``, and the result is
    ``b_("<ClassName>:<hexdigest>")``.  The class name is part of the
    fingerprint, so instances of different classes never share one.
    """
    digest = _hashFuncForDeduplication()
    # NOTE(review): b_() is otherwise called with str literals in this file;
    # confirm it accepts every PdfObject subclass that inherits this method
    # (numbers, booleans, null) and not only str/bytes-like objects.
    digest.update(b_(self))
    label = ":".join((self.__class__.__name__, digest.hexdigest()))
    return b_(label)


class NullObject(PdfObject):
def writeToStream(self, stream, encryption_key):
Expand Down Expand Up @@ -151,6 +163,17 @@ def readFromStream(stream):


class ArrayObject(list, PdfObject):
def hashValue(self):
    """Return a deduplication fingerprint covering every element, in order.

    Each element contributes one record of the form
    ``<index>:<length>=<text>`` to the hash, where ``<text>`` is the nested
    ``hashValue()`` for ``PdfObject`` elements and the ``%s`` rendering of
    the element otherwise.  The result is ``b_("<ClassName>:<hexdigest>")``.

    Successive ``update()`` calls on a hashlib object are equivalent to
    hashing the concatenation of their arguments, so feeding raw elements
    without an index or a boundary made different arrays indistinguishable
    (e.g. ``["ab"]`` and ``["a", "b"]``).  The index tag and length prefix
    make every record self-delimiting, so no two different element sequences
    produce the same byte stream.
    """
    hashFunc = _hashFuncForDeduplication()

    for i, value in enumerate(self):
        if isinstance(value, PdfObject):
            # Nested PDF objects are represented by their own fingerprint.
            value = value.hashValue()
        text = "%s" % (value,)
        # The length prefix fixes where this record ends, whatever the text
        # itself contains.
        hashFunc.update(b_("%i:%i=%s" % (i, len(text), text)))

    return b_(self.__class__.__name__ + ":" + hashFunc.hexdigest())

def writeToStream(self, stream, encryption_key):
stream.write(b_("["))
for data in self:
Expand Down Expand Up @@ -188,6 +211,9 @@ def __init__(self, idnum, generation, pdf):
def getObject(self):
    """Resolve this reference through its owning ``pdf``.

    Asks ``self.pdf`` for the object this reference points to, then calls
    ``getObject()`` on the result and returns that value.
    """
    referenced = self.pdf.getObject(self)
    return referenced.getObject()

def hashValue(self):
    """Return the deduplication fingerprint of this indirect reference.

    The fingerprint is built from the identity of the owning ``pdf`` object
    (``hex(id(self.pdf))``) together with ``idnum`` and ``generation``; the
    referenced object itself is not read.
    """
    owner_tag = hex(id(self.pdf))
    label = "IndirectObject<%s>:(%r,%r)" % (owner_tag, self.idnum, self.generation)
    return b_(label)

def __repr__(self):
    """Return ``IndirectObject(<idnum>, <generation>)`` using ``repr`` of both fields."""
    fields = (self.idnum, self.generation)
    return "IndirectObject(%r, %r)" % fields

Expand Down Expand Up @@ -565,6 +591,17 @@ def writeToStream(self, stream, encryption_key):
stream.write(b_("\n"))
stream.write(b_(">>"))

def hashValue(self):
    """Return a deduplication fingerprint covering every key/value entry.

    Each entry contributes one record of the form
    ``<keylen>:<key>=<valuelen>:<value>`` to the hash, where ``<value>`` is
    the nested ``hashValue()`` for ``PdfObject`` values and the ``%s``
    rendering of the value otherwise.  The result is
    ``b_("<ClassName>:<hexdigest>")``.

    Two defects of plain concatenation are avoided here:

    * Values that are not ``PdfObject`` instances used to be hashed without
      their key and without any boundary, so dictionaries differing only in
      a key name (or in how text is split across values) collided.
    * Entries used to be hashed in iteration order, so dictionaries holding
      the same entries in a different order were not recognised as
      duplicates.  Entries are now sorted by their rendered key first.
    """
    hashFunc = _hashFuncForDeduplication()

    records = []
    for key, value in self.items():
        if isinstance(value, PdfObject):
            # Nested PDF objects are represented by their own fingerprint.
            value = value.hashValue()
        records.append(("%s" % (key,), "%s" % (value,)))

    # Canonical order: the fingerprint must not depend on insertion order.
    records.sort()

    for key_text, value_text in records:
        # Length prefixes make each record self-delimiting.
        hashFunc.update(
            b_(
                "%i:%s=%i:%s"
                % (len(key_text), key_text, len(value_text), value_text)
            )
        )

    return b_(self.__class__.__name__ + ":" + hashFunc.hexdigest())

@staticmethod
def readFromStream(stream, pdf):
debug = False
Expand Down Expand Up @@ -803,6 +840,11 @@ def writeToStream(self, stream, encryption_key):
stream.write(data)
stream.write(b_("\nendstream"))

def hashValue(self):
    """Return a deduplication fingerprint of the stream's entries and data.

    Hashing ``self._data`` alone treats two streams with identical bytes but
    different dictionary entries (for example different filters or image
    dimensions) as duplicates, so one of them would be replaced by the
    other.  The fingerprint therefore combines the dictionary fingerprint
    with the stream data.  The result is ``b_("<ClassName>:<hexdigest>")``.

    NOTE(review): assumes this class derives from ``DictionaryObject`` so
    that ``DictionaryObject.hashValue(self)`` covers the stream dictionary;
    confirm against the class definition.
    """
    hashFunc = _hashFuncForDeduplication()
    # The dictionary fingerprint goes first: "<ClassName>:<hexdigest>" has a
    # fixed length for a given class, so the split between it and the data
    # is unambiguous.
    hashFunc.update(DictionaryObject.hashValue(self))
    hashFunc.update(self._data)
    return b_(self.__class__.__name__ + ":" + hashFunc.hexdigest())

@staticmethod
def initializeFromDictionary(data):
if SA.FILTER in data:
Expand Down
6 changes: 6 additions & 0 deletions PyPDF2/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ class (typically :class:`PdfFileReader<PdfFileReader>`).
def __init__(self):
self._header = b_("%PDF-1.3")
self._objects = [] # array of indirect objects
self._idnum_hash = {}

# The root of our page tree node.
pages = DictionaryObject()
Expand Down Expand Up @@ -612,8 +613,13 @@ def _sweepIndirectReferences(self, externMap, data):
if newobj is None:
try:
newobj = data.pdf.getObject(data)
hashValue = newobj.hashValue()
# Check if object is already added to pdf.
if hashValue in self._idnum_hash:
return IndirectObject(self._idnum_hash[hashValue], 0, self)
self._objects.append(None) # placeholder
idnum = len(self._objects)
self._idnum_hash[hashValue] = idnum
newobj_ido = IndirectObject(idnum, 0, self)
if data.pdf not in externMap:
externMap[data.pdf] = {}
Expand Down

0 comments on commit bf16aac

Please sign in to comment.