Check for duplicate objects in _sweepIndirectReferences.

Added PdfObject.hashValue() and specialized hashValue() overrides for the composite object types (arrays, dictionaries, streams, and indirect references). The value returned by hashValue() is used to detect duplicate objects.
py-pdf · Apr 19, 2022 · bf16aac · bf16aac
1 parent 13e7cb3
commit bf16aac
Show file tree

Hide file tree

Showing 2 changed files with 48 additions and 0 deletions.
diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py
@@ -34,6 +34,7 @@
 
 import codecs
 import decimal
+import hashlib
 import re
 import warnings
 
@@ -62,6 +63,11 @@
 IndirectPattern = re.compile(b_(r"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]"))
 
 
+def _hashFuncForDeduplication():
+    """
+    Create the hash object used to fingerprint objects for deduplication.
+
+    :return: a new, empty SHA-1 hash object from ``hashlib``.
+    """
+    return hashlib.new("sha1")
+
+
 def readObject(stream, pdf):
     tok = stream.read(1)
     stream.seek(-1, 1) # reset to start
@@ -115,6 +121,12 @@ def getObject(self):
         """Resolves indirect references."""
         return self
 
+    def hashValue(self):
+        """
+        Fingerprint this object so that duplicates can be detected.
+
+        :return: bytes of the form ``<ClassName>:<hex digest>``, where the
+            digest is computed over ``b_(self)``.
+        """
+        digest = _hashFuncForDeduplication()
+        digest.update(b_(self))
+        return b_("%s:%s" % (self.__class__.__name__, digest.hexdigest()))
+
 
 class NullObject(PdfObject):
     def writeToStream(self, stream, encryption_key):
@@ -151,6 +163,17 @@ def readFromStream(stream):
 
 
 class ArrayObject(list, PdfObject):
+    def hashValue(self):
+        """
+        Return a fingerprint of this array for duplicate detection.
+
+        Elements are fed into the hash in order, each tagged with its
+        index.  PdfObject elements contribute their own hashValue(); any
+        other element contributes ``b_(element)`` framed by its index and
+        byte length.  Without that framing, adjacent plain elements run
+        together: ``["ab", "c"]`` and ``["a", "bc"]`` would feed identical
+        bytes to the hash and be mistaken for duplicates.
+
+        :return: bytes of the form ``<ClassName>:<hex digest>``.
+        """
+        hashFunc = _hashFuncForDeduplication()
+
+        for i, value in enumerate(self):
+            if isinstance(value, PdfObject):
+                hashFunc.update(b_("%i=%s" % (i, value.hashValue())))
+            else:
+                raw = b_(value)
+                # Successive update() calls hash the concatenation of their
+                # arguments, so every raw element needs explicit boundaries.
+                hashFunc.update(b_("%i=%i:" % (i, len(raw))))
+                hashFunc.update(raw)
+
+        return b_(self.__class__.__name__ + ":" + hashFunc.hexdigest())
+
     def writeToStream(self, stream, encryption_key):
         stream.write(b_("["))
         for data in self:
@@ -188,6 +211,9 @@ def __init__(self, idnum, generation, pdf):
     def getObject(self):
         return self.pdf.getObject(self).getObject()
 
+    def hashValue(self):
+        """
+        Return a fingerprint of this reference for duplicate detection.
+
+        The value is built from the identity (``id()``) of the ``pdf`` this
+        reference belongs to plus its ``idnum`` and ``generation``; the
+        referenced object itself is not resolved or hashed.
+
+        :return: bytes of the form ``IndirectObject<0x...>:(idnum,generation)``.
+        """
+        owner = hex(id(self.pdf))
+        parts = (owner, self.idnum, self.generation)
+        return b_("IndirectObject<%s>:(%r,%r)" % parts)
+
     def __repr__(self):
         return "IndirectObject(%r, %r)" % (self.idnum, self.generation)
 
@@ -565,6 +591,17 @@ def writeToStream(self, stream, encryption_key):
             stream.write(b_("\n"))
         stream.write(b_(">>"))
 
+    def hashValue(self):
+        """
+        Return a fingerprint of this dictionary for duplicate detection.
+
+        Entries are hashed in iteration order, so two dictionaries holding
+        the same entries in a different order produce different values.
+        Every entry contributes its key.  PdfObject values contribute their
+        own hashValue(); any other value contributes ``b_(value)`` framed by
+        the key and the value's byte length.  Hashing the bare value would
+        let ``{A: "x"}`` and ``{B: "x"}`` collide and be mistaken for
+        duplicates.
+
+        :return: bytes of the form ``<ClassName>:<hex digest>``.
+        """
+        hashFunc = _hashFuncForDeduplication()
+
+        for key, value in self.items():
+            if isinstance(value, PdfObject):
+                hashFunc.update(b_("%s=%s" % (key, value.hashValue())))
+            else:
+                raw = b_(value)
+                # Successive update() calls hash the concatenation of their
+                # arguments, so tag the raw value with its key and length.
+                hashFunc.update(b_("%s=%i:" % (key, len(raw))))
+                hashFunc.update(raw)
+
+        return b_(self.__class__.__name__ + ":" + hashFunc.hexdigest())
+
     @staticmethod
     def readFromStream(stream, pdf):
         debug = False
@@ -803,6 +840,11 @@ def writeToStream(self, stream, encryption_key):
         stream.write(data)
         stream.write(b_("\nendstream"))
 
+    def hashValue(self):
+        """
+        Return a fingerprint of this stream for duplicate detection.
+
+        Both the stream's dictionary entries and its data are hashed.
+        Hashing the data alone would treat streams with equal bytes but
+        different dictionaries (for example, every empty stream) as
+        duplicates of one another.
+
+        :return: bytes of the form ``<ClassName>:<hex digest>``.
+        """
+        hashFunc = _hashFuncForDeduplication()
+        # NOTE(review): assumes this class derives from DictionaryObject and
+        # inherits its entry-based hashValue() -- confirm against the class
+        # header, which is outside this hunk.
+        hashFunc.update(DictionaryObject.hashValue(self))
+        # Separator between the dictionary fingerprint and the stream data.
+        hashFunc.update(b_(":"))
+        hashFunc.update(self._data)
+        return b_(self.__class__.__name__ + ":" + hashFunc.hexdigest())
+
     @staticmethod
     def initializeFromDictionary(data):
         if SA.FILTER in data:

diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py
@@ -87,6 +87,7 @@ class (typically :class:`PdfFileReader<PdfFileReader>`).
     def __init__(self):
         self._header = b_("%PDF-1.3")
         self._objects = []  # array of indirect objects
+        self._idnum_hash = {}
 
         # The root of our page tree node.
         pages = DictionaryObject()
@@ -612,8 +613,13 @@ def _sweepIndirectReferences(self, externMap, data):
                 if newobj is None:
                     try:
                         newobj = data.pdf.getObject(data)
+                        hashValue = newobj.hashValue()
+                        # Check if object is already added to pdf.
+                        if hashValue in self._idnum_hash:
+                            return IndirectObject(self._idnum_hash[hashValue], 0, self)
                         self._objects.append(None) # placeholder
                         idnum = len(self._objects)
+                        self._idnum_hash[hashValue] = idnum
                         newobj_ido = IndirectObject(idnum, 0, self)
                         if data.pdf not in externMap:
                             externMap[data.pdf] = {}