Skip to content

Commit

Permalink
ENH: Add decrypt support for V5 and AES-128, AES-256 (R5 only) (#749)
Browse files Browse the repository at this point in the history
This is a rewrite of the encryption code to support V4 and AES-128 encryption (decryption only for now).
PyCryptodome was added as an optional dependency for AES.

It does NOT add support for encryption revision 6 (R=6), as introduced by PDF 2.0.

Closes #528
  • Loading branch information
exiledkingcc committed Jun 19, 2022
1 parent 6d426a0 commit 868f977
Show file tree
Hide file tree
Showing 22 changed files with 869 additions and 152 deletions.
19 changes: 9 additions & 10 deletions PyPDF2/_merger.py
Expand Up @@ -33,6 +33,7 @@
from ._utils import StrByteType, deprecate_with_replacement, str_
from ._writer import PdfWriter
from .constants import PagesAttributes as PA
from .encryption import Encryption
from .generic import (
ArrayObject,
Bookmark,
Expand Down Expand Up @@ -129,16 +130,14 @@ def merge(
bookmarks from being imported by specifying this as ``False``.
"""

stream, my_file, decryption_key = self._create_stream(fileobj)
stream, my_file, encryption_obj = self._create_stream(fileobj)

# Create a new PdfReader instance using the stream
# (either file or BytesIO or StringIO) created above
reader = PdfReader(stream, strict=self.strict) # type: ignore[arg-type]

# Keep track of our input files so we can close them later
self.inputs.append((stream, reader, my_file))
if decryption_key is not None:
reader._decryption_key = decryption_key
if encryption_obj is not None:
reader._encryption = encryption_obj

# Find the range of pages to merge.
if pages is None:
Expand Down Expand Up @@ -188,7 +187,7 @@ def merge(

def _create_stream(
self, fileobj: Union[StrByteType, PdfReader]
) -> Tuple[IOBase, bool, Optional[bytes]]:
) -> Tuple[IOBase, bool, Optional[Encryption]]:
# This parameter is passed to self.inputs.append and means
# that the stream used was created in this method.
my_file = False
Expand All @@ -199,14 +198,14 @@ def _create_stream(
# it is a PdfReader, copy that reader's stream into a
# BytesIO stream.
# If fileobj is none of the above types, it is not modified
decryption_key = None
encryption_obj = None
stream: IOBase
if isinstance(fileobj, str):
stream = FileIO(fileobj, "rb")
my_file = True
elif isinstance(fileobj, PdfReader):
if hasattr(fileobj, "_decryption_key"):
decryption_key = fileobj._decryption_key
if hasattr(fileobj, "_encryption"):
encryption_obj = fileobj._encryption
orig_tell = fileobj.stream.tell()
fileobj.stream.seek(0)
stream = BytesIO(fileobj.stream.read())
Expand All @@ -222,7 +221,7 @@ def _create_stream(
my_file = True
else:
stream = fileobj
return stream, my_file, decryption_key
return stream, my_file, encryption_obj

def append(
self,
Expand Down
159 changes: 24 additions & 135 deletions PyPDF2/_reader.py
Expand Up @@ -31,7 +31,6 @@
import re
import struct
import warnings
from hashlib import md5
from io import BytesIO
from pathlib import Path
from typing import (
Expand All @@ -47,14 +46,12 @@
)

from ._page import PageObject, _VirtualList
from ._security import RC4_encrypt, _alg33_1, _alg34, _alg35
from ._utils import (
StrByteType,
StreamType,
b_,
deprecate_no_replacement,
deprecate_with_replacement,
ord_,
read_non_whitespace,
read_previous_line,
read_until_whitespace,
Expand All @@ -65,32 +62,25 @@
from .constants import CatalogDictionary as CD
from .constants import Core as CO
from .constants import DocumentInformationAttributes as DI
from .constants import EncryptionDictAttributes as ED
from .constants import PageAttributes as PG
from .constants import PagesAttributes as PA
from .constants import StreamAttributes as SA
from .constants import TrailerKeys as TK
from .errors import PdfReadError, PdfReadWarning, PdfStreamError
from .errors import DependencyError, PdfReadError, PdfReadWarning, PdfStreamError
from .generic import (
ArrayObject,
BooleanObject,
ByteStringObject,
ContentStream,
DecodedStreamObject,
Destination,
DictionaryObject,
EncodedStreamObject,
Field,
FloatObject,
IndirectObject,
NameObject,
NullObject,
NumberObject,
PdfObject,
StreamObject,
TextStringObject,
TreeObject,
createStringObject,
read_object,
)
from .types import OutlinesType, PagemodeType
Expand Down Expand Up @@ -353,6 +343,9 @@ def _get_num_pages(self) -> int:
self._override_encryption = True
self.decrypt("")
return self.trailer[TK.ROOT]["/Pages"]["/Count"] # type: ignore
except DependencyError as e:
# make dependency error clear to users
raise e
except Exception:
raise PdfReadError("File has not been decrypted")
finally:
Expand Down Expand Up @@ -1051,16 +1044,11 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]:
# override encryption is used for the /Encrypt dictionary
if not self._override_encryption and self.is_encrypted:
# if we don't have the encryption key:
if not hasattr(self, "_decryption_key"):
if not hasattr(self, "_encryption"):
raise PdfReadError("file has not been decrypted")
# otherwise, decrypt here...
pack1 = struct.pack("<i", indirect_reference.idnum)[:3]
pack2 = struct.pack("<i", indirect_reference.generation)[:2]
key = self._decryption_key + pack1 + pack2
assert len(key) == (len(self._decryption_key) + 5)
md5_hash = md5(key).digest()
key = md5_hash[: min(16, len(self._decryption_key) + 5)]
retval = self._decrypt_object(retval, key) # type: ignore
retval = cast(PdfObject, retval)
retval = self._encryption.decrypt_object(retval, indirect_reference.idnum, indirect_reference.generation)
else:
warnings.warn(
f"Object {indirect_reference.idnum} {indirect_reference.generation} "
Expand All @@ -1085,35 +1073,6 @@ def getObject(
deprecate_with_replacement("getObject", "get_object")
return self.get_object(indirectReference)

def _decrypt_object(
self,
obj: Union[
ArrayObject,
BooleanObject,
ByteStringObject,
DictionaryObject,
FloatObject,
IndirectObject,
NameObject,
NullObject,
NumberObject,
StreamObject,
TextStringObject,
],
key: Union[str, bytes],
) -> PdfObject:
if isinstance(obj, (ByteStringObject, TextStringObject)):
obj = createStringObject(RC4_encrypt(key, obj.original_bytes))
elif isinstance(obj, StreamObject):
obj._data = RC4_encrypt(key, obj._data)
elif isinstance(obj, DictionaryObject):
for dictkey, value in list(obj.items()):
obj[dictkey] = self._decrypt_object(value, key)
elif isinstance(obj, ArrayObject):
for i in range(len(obj)):
obj[i] = self._decrypt_object(obj[i], key)
return obj

def read_object_header(self, stream: StreamType) -> Tuple[int, int]:
# Should never be necessary to read out whitespace, since the
# cross-reference table should put us in the right spot to read the
Expand Down Expand Up @@ -1627,93 +1586,23 @@ def decode_permissions(self, permissions_code: int) -> Dict[str, bool]:
return permissions

def _decrypt(self, password: Union[str, bytes]) -> int:
# Decrypts data as per Section 3.5 (page 117) of PDF spec v1.7
# "The security handler defines the use of encryption and decryption in
# the document, using the rules specified by the CF, StmF, and StrF entries"
encrypt = cast(DictionaryObject, self.trailer[TK.ENCRYPT].get_object())
# /Encrypt Keys:
# Filter (name) : "name of the preferred security handler "
# V (number) : Algorithm Code
# Length (integer): Length of encryption key, in bits
# CF (dictionary) : Crypt filter
# StmF (name) : Name of the crypt filter that is used by default when decrypting streams
# StrF (name) : The name of the crypt filter that is used when decrypting all strings in the document
# R (number) : Standard security handler revision number
# U (string) : A 32-byte string, based on the user password
# P (integer) : Permissions allowed with user access
if encrypt["/Filter"] != "/Standard":
raise NotImplementedError(
"only Standard PDF encryption handler is available"
)
encrypt_v = cast(int, encrypt["/V"])
if encrypt_v not in (1, 2):
raise NotImplementedError(
f"only algorithm code 1 and 2 are supported. This PDF uses code {encrypt_v}"
)
user_password, key = self._authenticate_user_password(password)
if user_password:
self._decryption_key = key
return 1
else:
rev = cast(int, encrypt["/R"].get_object())
if rev == 2:
keylen = 5
else:
keylen = cast(int, encrypt[SA.LENGTH].get_object()) // 8
key = _alg33_1(password, rev, keylen)
real_O = cast(bytes, encrypt["/O"].get_object())
if rev == 2:
userpass = RC4_encrypt(key, real_O)
else:
val = real_O
for i in range(19, -1, -1):
new_key = b""
for key_char in key:
new_key += b_(chr(ord_(key_char) ^ i))
val = RC4_encrypt(new_key, val)
userpass = val
owner_password, key = self._authenticate_user_password(userpass)
if owner_password:
self._decryption_key = key
return 2
return 0

def _authenticate_user_password(
self, password: Union[str, bytes]
) -> Tuple[bool, bytes]:
encrypt = cast(
Optional[DictionaryObject], self.trailer[TK.ENCRYPT].get_object()
)
if encrypt is None:
raise Exception(
"_authenticateUserPassword was called on unencrypted document"
)
rev = cast(int, encrypt[ED.R].get_object())
owner_entry = cast(ByteStringObject, encrypt[ED.O].get_object())
p_entry = cast(int, encrypt[ED.P].get_object())
if TK.ID in self.trailer:
id_entry = cast(ArrayObject, self.trailer[TK.ID].get_object())
else:
# Some documents may not have a /ID, use two empty
# byte strings instead. Solves
# https://github.com/mstamy2/PyPDF2/issues/608
id_entry = ArrayObject([ByteStringObject(b""), ByteStringObject(b"")])
id1_entry = id_entry[0].get_object()
real_U = encrypt[ED.U].get_object().original_bytes # type: ignore
if rev == 2:
U, key = _alg34(password, owner_entry, p_entry, id1_entry)
elif rev >= 3:
U, key = _alg35(
password,
rev,
encrypt[SA.LENGTH].get_object() // 8, # type: ignore
owner_entry,
p_entry,
id1_entry,
encrypt.get(ED.ENCRYPT_METADATA, BooleanObject(False)).get_object(), # type: ignore
)
U, real_U = U[:16], real_U[:16]
return U == real_U, key
# already got the KEY
if hasattr(self, "_encryption"):
return 3
from PyPDF2.encryption import Encryption
# Some documents may not have a /ID, use two empty
# byte strings instead. Solves
# https://github.com/mstamy2/PyPDF2/issues/608
id_entry = self.trailer.get(TK.ID)
id1_entry = id_entry[0].get_object().original_bytes if id_entry else b""
encryptEntry = cast(DictionaryObject, self.trailer[TK.ENCRYPT].get_object())
encryption = Encryption.read(encryptEntry, id1_entry)
# maybe password is owner password
# TODO: add/modify api to set owner password
rr = encryption.verify(password, password)
if rr > 0:
self._encryption = encryption
return rr

@property
def is_encrypted(self) -> bool:
Expand Down

0 comments on commit 868f977

Please sign in to comment.