Commit 80ff23b
Merge branch 'main' into outline-nomenclature
mtd91429 committed Jul 25, 2022
2 parents 9537c4b + 844f238
Showing 20 changed files with 350 additions and 144 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -30,7 +30,7 @@ repos:
hooks:
- id: isort
- repo: https://github.com/psf/black
rev: 22.3.0
rev: 22.6.0
hooks:
- id: black
args: [--target-version, py36]
@@ -40,7 +40,7 @@
- id: blacken-docs
additional_dependencies: [black==22.1.0]
- repo: https://github.com/asottile/pyupgrade
rev: v2.34.0
rev: v2.37.2
hooks:
- id: pyupgrade
args: [--py36-plus]
29 changes: 29 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,34 @@
# CHANGELOG

## Version 2.8.0, 2022-07-24

### New Features (ENH)
- Add writer.add_annotation, page.annotations, and generic.AnnotationBuilder (#1120)
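  As a quick orientation for this feature, a minimal usage sketch (file names are hypothetical; exact keyword signatures are assumptions based on the feature's documentation):

  ```python
  from PyPDF2 import PdfReader, PdfWriter
  from PyPDF2.generic import AnnotationBuilder

  reader = PdfReader("example.pdf")  # hypothetical input file
  writer = PdfWriter()
  writer.add_page(reader.pages[0])

  # build a free-text annotation; rect is (x0, y0, x1, y1) in user space
  annotation = AnnotationBuilder.free_text(
      "Hello World",
      rect=(50, 550, 200, 650),
  )
  writer.add_annotation(page_number=0, annotation=annotation)

  with open("annotated.pdf", "wb") as fp:
      writer.write(fp)

  # page.annotations gives read access to a page's annotations (may be None)
  for annot in PdfReader("annotated.pdf").pages[0].annotations or []:
      print(annot.get_object())
  ```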

### Bug Fixes (BUG)
- Set /AS for /Btn form fields in writer (#1161)
- Ignore if /Perms verify failed (#1157)

### Robustness (ROB)
- Cope with utf16 character for space calculation (#1155)
- Cope with null params for FitH / FitV destination (#1152)
- Handle outlines without valid destination (#1076)

### Developer Experience (DEV)
- Introduce _utils.logger_warning (#1148)

### Maintenance (MAINT)
- Break up parse_to_unicode (#1162)
- Add diagnostic output to exception in read_from_stream (#1159)
- Reduce PdfReader.read complexity (#1151)

### Testing (TST)
- Add workflow tests found by arc testing (#1154)
- Decrypt file which is not encrypted (#1149)
- Test CryptRC4 encryption class; test image extraction filters (#1147)

Full Changelog: https://github.com/py-pdf/PyPDF2/compare/2.7.0...2.8.0

## Version 2.7.0, 2022-07-21

### New Features (ENH)
3 changes: 3 additions & 0 deletions Makefile
@@ -31,3 +31,6 @@ mutation-results:

benchmark:
pytest tests/bench.py

mypy:
mypy PyPDF2 --ignore-missing-imports --check-untyped-defs --strict
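(The target can be run as `make mypy`; `--check-untyped-defs` also type-checks the bodies of unannotated functions, though `--strict` already implies it.)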
189 changes: 109 additions & 80 deletions PyPDF2/_cmap.py
@@ -42,7 +42,11 @@ def build_char_map(
pass
# we consider that the space_code fits in one byte
if isinstance(space_code, str):
sp = space_code.encode("charmap")[0]
try: # one byte
sp = space_code.encode("charmap")[0]
except Exception:
sp = space_code.encode("utf-16-be")
sp = sp[0] * 256 + sp[1]  # big-endian: high byte first
else:
sp = space_code
sp_width = compute_space_width(ft, sp, space_width)
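Restated outside the diff, the fallback computes the space character's integer code roughly like this standalone sketch (hypothetical helper name, mirrors the hunk above):

```python
def _space_code_to_int(space_code: str) -> int:
    """Sketch: integer char code of the space character."""
    try:  # single-byte encodings
        return space_code.encode("charmap")[0]
    except Exception:  # two-byte code points
        two = space_code.encode("utf-16-be")
        return two[0] * 256 + two[1]  # big-endian: high byte first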
@@ -52,12 +56,12 @@
float(sp_width / 2),
encoding,
# https://github.com/python/mypy/issues/4374
map_dict, # type: ignore
) # type: ignore
map_dict,
)


# used when data is missing, e.g. the font definition is absent
unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict] = (
unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]] = (
"Unknown",
9999,
dict(zip(range(256), ["�"] * 256)),
@@ -104,15 +108,15 @@ def parse_encoding(
encoding: Union[str, List[str], Dict[int, str]] = []
if "/Encoding" not in ft:
try:
if "/BaseFont" in ft and ft["/BaseFont"] in charset_encoding:
if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
encoding = dict(
zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
)
else:
encoding = "charmap"
return encoding, _default_fonts_space_width[cast(str, ft["/BaseFont"])]
except Exception:
if ft["/Subtype"] == "/Type1":
if cast(str, ft["/Subtype"]) == "/Type1":
return "charmap", space_code
else:
return "", space_code
@@ -163,19 +167,31 @@ def parse_encoding(

def parse_to_unicode(
ft: DictionaryObject, space_code: int
) -> Tuple[Dict, int, List[int]]:
map_dict: Dict[
Any, Any
] = (
{}
) # will store all translation code and map_dict[-1] we will have the number of bytes to convert
int_entry: List[
int
] = [] # will provide the list of cmap keys as int to correct encoding
) -> Tuple[Dict[Any, Any], int, List[int]]:
# stores all translation codes;
# map_dict[-1] holds the number of bytes per character code
map_dict: Dict[Any, Any] = {}

# list of cmap keys, as ints, used to correct the encoding
int_entry: List[int] = []

if "/ToUnicode" not in ft:
return {}, space_code, []
process_rg: bool = False
process_char: bool = False
cm = prepare_cm(ft)
for l in cm.split(b"\n"):
process_rg, process_char = process_cm_line(
l.strip(b" "), process_rg, process_char, map_dict, int_entry
)

for a, value in map_dict.items():
if value == " ":
space_code = a
return map_dict, space_code, int_entry


def prepare_cm(ft: DictionaryObject) -> bytes:
cm: bytes = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
# prepare cm first: PDFs printed-to-PDF from Word may lack the return line
cm = (
@@ -204,71 +220,84 @@ def parse_to_unicode(
.replace(b"]", b" ]\n ")
.replace(b"\r", b"\n")
)
return cm

for l in cm.split(b"\n"):
if l in (b"", b" ") or l[0] == 37: # 37 = %
continue
if b"beginbfrange" in l:
process_rg = True
elif b"endbfrange" in l:
process_rg = False
elif b"beginbfchar" in l:
process_char = True
elif b"endbfchar" in l:
process_char = False
elif process_rg:
lst = [x for x in l.split(b" ") if x]
a = int(lst[0], 16)
b = int(lst[1], 16)
nbi = len(lst[0])
map_dict[-1] = nbi // 2
fmt = b"%%0%dX" % nbi
if lst[2] == b"[":
for sq in lst[3:]:
if sq == b"]":
break
map_dict[
unhexlify(fmt % a).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be",
"surrogatepass",
)
] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
int_entry.append(a)
a += 1
else:
c = int(lst[2], 16)
fmt2 = b"%%0%dX" % max(4, len(lst[2]))
while a <= b:
map_dict[
unhexlify(fmt % a).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be",
"surrogatepass",
)
] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
int_entry.append(a)
a += 1
c += 1
elif process_char:
lst = [x for x in l.split(b" ") if x]
map_dict[-1] = len(lst[0]) // 2
while len(lst) > 1:
map_to = ""
# placeholder (see above) means empty string
if lst[1] != b".":
map_to = unhexlify(lst[1]).decode(
"utf-16-be", "surrogatepass"
) # join is here as some cases where the code was split
map_dict[
unhexlify(lst[0]).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
)
] = map_to
int_entry.append(int(lst[0], 16))
lst = lst[2:]
for a, value in map_dict.items():
if value == " ":
space_code = a
return map_dict, space_code, int_entry

def process_cm_line(
l: bytes,
process_rg: bool,
process_char: bool,
map_dict: Dict[Any, Any],
int_entry: List[int],
) -> Tuple[bool, bool]:
if l in (b"", b" ") or l[0] == 37: # 37 = %
return process_rg, process_char
if b"beginbfrange" in l:
process_rg = True
elif b"endbfrange" in l:
process_rg = False
elif b"beginbfchar" in l:
process_char = True
elif b"endbfchar" in l:
process_char = False
elif process_rg:
parse_bfrange(l, map_dict, int_entry)
elif process_char:
parse_bfchar(l, map_dict, int_entry)
return process_rg, process_char


def parse_bfrange(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
lst = [x for x in l.split(b" ") if x]
a = int(lst[0], 16)
b = int(lst[1], 16)
nbi = len(lst[0])
map_dict[-1] = nbi // 2
fmt = b"%%0%dX" % nbi
if lst[2] == b"[":
for sq in lst[3:]:
if sq == b"]":
break
map_dict[
unhexlify(fmt % a).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be",
"surrogatepass",
)
] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
int_entry.append(a)
a += 1
else:
c = int(lst[2], 16)
fmt2 = b"%%0%dX" % max(4, len(lst[2]))
while a <= b:
map_dict[
unhexlify(fmt % a).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be",
"surrogatepass",
)
] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
int_entry.append(a)
a += 1
c += 1


def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
lst = [x for x in l.split(b" ") if x]
map_dict[-1] = len(lst[0]) // 2
while len(lst) > 1:
map_to = ""
# placeholder (see above) means empty string
if lst[1] != b".":
map_to = unhexlify(lst[1]).decode(
"utf-16-be", "surrogatepass"
)  # join is here because in some cases the code was split
map_dict[
unhexlify(lst[0]).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
)
] = map_to
int_entry.append(int(lst[0], 16))
lst = lst[2:]
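
A toy illustration of the two helpers' semantics (input values invented for the example; both are private helpers introduced by this diff):

```python
from PyPDF2._cmap import parse_bfchar, parse_bfrange  # private helpers

map_dict: dict = {}
int_entry: list = []

# bfrange line: codes 0x0042..0x0043 map to consecutive values from 0x0062
parse_bfrange(b"0042 0043 0062", map_dict, int_entry)
assert map_dict == {-1: 2, "B": "b", "C": "c"}  # -1 holds the byte count

# bfchar line: code 0x0041 maps directly to U+0061
parse_bfchar(b"0041 0061", map_dict, int_entry)
assert map_dict["A"] == "a"
assert int_entry == [0x42, 0x43, 0x41]
```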


def compute_space_width(
@@ -285,7 +314,7 @@ def compute_space_width(
except Exception:
w1[-1] = 1000.0
if "/W" in ft1:
w = list(ft1["/W"]) # type: ignore
w = list(ft1["/W"])
else:
w = []
while len(w) > 0:
7 changes: 4 additions & 3 deletions PyPDF2/_encryption.py
@@ -29,8 +29,9 @@
import random
import struct
from enum import IntEnum
from typing import Optional, Tuple, Union, cast
from typing import Any, Dict, Optional, Tuple, Union, cast

from PyPDF2._utils import logger_warning
from PyPDF2.errors import DependencyError
from PyPDF2.generic import (
ArrayObject,
@@ -565,7 +566,7 @@ def verify_perms(
@staticmethod
def generate_values(
user_pwd: bytes, owner_pwd: bytes, key: bytes, p: int, metadata_encrypted: bool
) -> dict:
) -> Dict[Any, Any]:
u_value, ue_value = AlgV5.compute_U_value(user_pwd, key)
o_value, oe_value = AlgV5.compute_O_value(owner_pwd, key, u_value)
perms = AlgV5.compute_Perms_value(key, p, metadata_encrypted)
@@ -826,7 +827,7 @@ def verify_v5(self, password: bytes) -> Tuple[bytes, PasswordType]:
P = (P + 0x100000000) % 0x100000000 # maybe < 0
metadata_encrypted = self.entry.get("/EncryptMetadata", True)
if not AlgV5.verify_perms(key, perms, P, metadata_encrypted):
return b"", PasswordType.NOT_DECRYPTED
logger_warning("ignore '/Perms' verify failed", __name__)
return key, rc

@staticmethod
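The behavioral effect of this hunk, sketched (file name and password are hypothetical):

```python
from PyPDF2 import PdfReader

reader = PdfReader("encrypted.pdf")
if reader.is_encrypted:
    # with this change, a failed /Perms check logs a warning via
    # _utils.logger_warning instead of rejecting an otherwise valid key
    reader.decrypt("correct-password")
print(len(reader.pages))
```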
6 changes: 3 additions & 3 deletions PyPDF2/_merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,12 +567,12 @@ def find_outline_item(

for i, oi_enum in enumerate(root):
if isinstance(oi_enum, list):
# b is still an inner node
# oi_enum is still an inner node
# (OutlineType, if recursive types were supported by mypy)
res = self.find_outline_item(outline_item, oi_enum) # type: ignore
if res:
return [i] + res
elif oi_enum == outline_item or oi_enum["/Title"] == outline_item:
elif oi_enum == outline_item or cast(Dict[Any, Any], oi_enum["/Title"]) == outline_item:
# we found a leaf node
return [i]

@@ -689,7 +689,7 @@ def add_named_destination(self, title: str, pagenum: int) -> None:

class PdfFileMerger(PdfMerger): # pragma: no cover
def __init__(self, *args: Any, **kwargs: Any) -> None:
deprecate_with_replacement("PdfFileMerger", "PdfMerge")
deprecate_with_replacement("PdfFileMerger", "PdfMerger")

if "strict" not in kwargs and len(args) < 1:
kwargs["strict"] = True # maintain the default
8 changes: 4 additions & 4 deletions PyPDF2/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,9 +506,9 @@ def _merge_page(
# Combine /ProcSet sets.
new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
frozenset(
original_resources.get(RES.PROC_SET, ArrayObject()).get_object() # type: ignore
original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
).union(
frozenset(page2resources.get(RES.PROC_SET, ArrayObject()).get_object()) # type: ignore
frozenset(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())
)
)

@@ -1248,7 +1248,7 @@ def process_operation(operator: bytes, operands: List) -> None:
cmaps[operands[0]][2],
cmaps[operands[0]][3],
operands[0],
) # type:ignore
)
except KeyError: # font not found
_space_width = unknown_char_map[1]
cmap = (
@@ -1395,7 +1395,7 @@ def process_operation(operator: bytes, operands: List) -> None:
except IndexError:
pass
try:
xobj = resources_dict["/XObject"] # type: ignore
xobj = resources_dict["/XObject"]
if xobj[operands[0]]["/Subtype"] != "/Image": # type: ignore
# output += text
text = self.extract_xform_text(xobj[operands[0]], space_width) # type: ignore
