ROB: Use null ID when encrypted but no ID given (#812)

If no '/ID' key is present in self.trailer, an array of two empty byte strings is used in place of the '/ID'. This is how Apache PDFBox handles this case, and it makes PyPDF2 more robust to malformed PDFs.

Closes #608
Closes #610

Full credit for this one goes to Richard Millson; Martin Thoma only fixed a merge conflict.

Co-authored-by: Richard Millson <8217613+richardmillson@users.noreply.github.com>
py-pdf · Apr 24, 2022 · 663ca98 · 663ca98
1 parent f48b4ac
commit 663ca98
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 1 deletion.
diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py
@@ -2205,7 +2205,13 @@ def _authenticateUserPassword(self, password):
         rev = encrypt['/R'].getObject()
         owner_entry = encrypt['/O'].getObject()
         p_entry = encrypt['/P'].getObject()
-        id_entry = self.trailer[TK.ID].getObject()
+        if TK.ID in self.trailer:
+            id_entry = self.trailer[TK.ID].getObject()
+        else:
+            # Some documents may not have a /ID, use two empty
+            # byte strings instead. Solves
+            # https://github.com/mstamy2/PyPDF2/issues/608
+            id_entry = ArrayObject([ByteStringObject(b''), ByteStringObject(b'')])
         id1_entry = id_entry[0].getObject()
         real_U = encrypt['/U'].getObject().original_bytes
         if rev == 2:

diff --git a/Resources/encrypted_doc_no_id.pdf b/Resources/encrypted_doc_no_id.pdf
diff --git a/Tests/test_reader.py b/Tests/test_reader.py
@@ -486,3 +486,18 @@ def test_do_not_get_stuck_on_large_files_without_start_xref():
     # parsing is expected take less than a second on a modern cpu, but include a large
     # tolerance to account for busy or slow systems
     assert parse_duration < 60
+
+
+def test_PdfReaderDecryptWhenNoID():
+    """
+    Decrypt an encrypted file that's missing the 'ID' value in its
+    trailer.
+    https://github.com/mstamy2/PyPDF2/issues/608
+    """
+
+    with open(
+        os.path.join(RESOURCE_ROOT, "encrypted_doc_no_id.pdf"), "rb"
+    ) as inputfile:
+        ipdf = PdfFileReader(inputfile)
+        ipdf.decrypt("")
+        assert ipdf.getDocumentInfo() == {"/Producer": "European Patent Office"}