Merge branch 'main' into decimal-precision

py-pdf · Sep 5, 2022 · 2cfe102 · 2cfe102
2 parents a71d15b + 4073b2a
commit 2cfe102
Show file tree

Hide file tree

Showing 26 changed files with 642 additions and 119 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,37 @@
 # CHANGELOG
 
+## Version 2.10.5, 2022-09-04
+
+### New Features (ENH)
+-  Process XRefStm (#1297)
+-  Auto-detect RTL for text extraction (#1309)
+
+### Bug Fixes (BUG)
+-  Avoid scaling cropbox twice (#1314)
+
+### Robustness (ROB)
+-  Fix offset correction in revised PDF (#1318)
+-  Crop data of /U and /O in encryption dictionary to 48 bytes (#1317)
+-  MultiLine bfrange in cmap (#1299)
+-  Cope with 2 digit codes in bfchar (#1310)
+-  Accept '/annn' charset as ASCII code (#1316)
+-  Log errors during Float / NumberObject initialization (#1315)
+-  Cope with corrupted entries in xref table (#1300)
+
+### Documentation (DOC)
+-  Migration guide (PyPDF2 1.x ➔ 2.x) (#1324)
+-  Creating a coverage report (#1319)
+-  Fix AnnotationBuilder.free_text example (#1311)
+-  Fix usage of page.scale by replacing it with page.scale_by (#1313)
+
+### Maintenance (MAINT)
+-  PdfReaderProtocol (#1303)
+-  Throw PdfReadError if Trailer can't be read (#1298)
+-  Remove catching OverflowException (#1302)
+
+Full Changelog: https://github.com/py-pdf/PyPDF2/compare/2.10.4...2.10.5
+
+
 ## Version 2.10.4, 2022-08-28
 
 ### Robustness (ROB)
@@ -419,7 +451,7 @@ The highlight of this release is improved support for file encryption
 -  Apply improvements to _utils suggested by perflint (#993)
 
 ### Robustness (ROB)
--  utf-16-be\' codec can\'t decode (...) (#995)
+-  utf-16-be codec can't decode (...) (#995)
 
 ### Documentation (DOC)
 -  Remove reference to Scripts (#987)
@@ -465,7 +497,7 @@ e.g. Russian / Chinese / Japanese / Korean / Arabic.
 -  Optimize read_next_end_line (#646)
 
 ### Bug Fixes (BUG)
--  Adobe Acrobat \'Would you like to save this file?\' (#970)
+-  Adobe Acrobat 'Would you like to save this file?' (#970)
 
 ### Documentation (DOC)
 -  Notes on annotations (#982)
@@ -905,7 +937,7 @@ large PDF files (#808) 🎉
 
 ### Maintenance (MAINT)
 -  Validate PDF magic byte in strict mode (#814)
--  Make PdfFileMerger.addBookmark() behave life PdfFileWriters\' (#339)
+-  Make PdfFileMerger.addBookmark() behave like PdfFileWriters' (#339)
 -  Quadratic runtime while parsing reduced to linear  (#808)
 
 ### Testing (TST)

diff --git a/Makefile b/Makefile
@@ -17,7 +17,7 @@ clean:
 	rm -rf tests/__pycache__ PyPDF2/__pycache__ Image9.png htmlcov docs/_build dist dont_commit_merged.pdf dont_commit_writer.pdf PyPDF2.egg-info PyPDF2_pdfLocation.txt .pytest_cache .mypy_cache .benchmarks
 
 test:
-	pytest tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30
+	pytest tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=60 PyPDF2
 
 testtype:
 	pytest tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30 --typeguard-packages=PyPDF2

diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py
@@ -180,10 +180,13 @@ def parse_to_unicode(
         return {}, space_code, []
     process_rg: bool = False
     process_char: bool = False
+    multiline_rg: Union[
+        None, Tuple[int, int]
+    ] = None  # tuple = (current_char, remaining size) ; cf #1285 for example of file
     cm = prepare_cm(ft)
     for l in cm.split(b"\n"):
-        process_rg, process_char = process_cm_line(
-            l.strip(b" "), process_rg, process_char, map_dict, int_entry
+        process_rg, process_char, multiline_rg = process_cm_line(
+            l.strip(b" "), process_rg, process_char, multiline_rg, map_dict, int_entry
         )
 
     for a, value in map_dict.items():
@@ -228,11 +231,12 @@ def process_cm_line(
     l: bytes,
     process_rg: bool,
     process_char: bool,
+    multiline_rg: Union[None, Tuple[int, int]],
     map_dict: Dict[Any, Any],
     int_entry: List[int],
-) -> Tuple[bool, bool]:
+) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
     if l in (b"", b" ") or l[0] == 37:  # 37 = %
-        return process_rg, process_char
+        return process_rg, process_char, multiline_rg
     if b"beginbfrange" in l:
         process_rg = True
     elif b"endbfrange" in l:
@@ -242,22 +246,29 @@ def process_cm_line(
     elif b"endbfchar" in l:
         process_char = False
     elif process_rg:
-        parse_bfrange(l, map_dict, int_entry)
+        multiline_rg = parse_bfrange(l, map_dict, int_entry, multiline_rg)
     elif process_char:
         parse_bfchar(l, map_dict, int_entry)
-    return process_rg, process_char
+    return process_rg, process_char, multiline_rg
 
 
-def parse_bfrange(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
+def parse_bfrange(
+    l: bytes,
+    map_dict: Dict[Any, Any],
+    int_entry: List[int],
+    multiline_rg: Union[None, Tuple[int, int]],
+) -> Union[None, Tuple[int, int]]:
     lst = [x for x in l.split(b" ") if x]
-    a = int(lst[0], 16)
-    b = int(lst[1], 16)
+    closure_found = False
     nbi = len(lst[0])
     map_dict[-1] = nbi // 2
     fmt = b"%%0%dX" % nbi
-    if lst[2] == b"[":
-        for sq in lst[3:]:
+    if multiline_rg is not None:
+        a = multiline_rg[0]  # a, b not in the current line
+        b = multiline_rg[1]
+        for sq in lst[1:]:
             if sq == b"]":
+                closure_found = True
                 break
             map_dict[
                 unhexlify(fmt % a).decode(
@@ -268,18 +279,36 @@ def parse_bfrange(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> N
             int_entry.append(a)
             a += 1
     else:
-        c = int(lst[2], 16)
-        fmt2 = b"%%0%dX" % max(4, len(lst[2]))
-        while a <= b:
-            map_dict[
-                unhexlify(fmt % a).decode(
-                    "charmap" if map_dict[-1] == 1 else "utf-16-be",
-                    "surrogatepass",
-                )
-            ] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
-            int_entry.append(a)
-            a += 1
-            c += 1
+        a = int(lst[0], 16)
+        b = int(lst[1], 16)
+        if lst[2] == b"[":
+            for sq in lst[3:]:
+                if sq == b"]":
+                    closure_found = True
+                    break
+                map_dict[
+                    unhexlify(fmt % a).decode(
+                        "charmap" if map_dict[-1] == 1 else "utf-16-be",
+                        "surrogatepass",
+                    )
+                ] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
+                int_entry.append(a)
+                a += 1
+        else:  # case without list
+            c = int(lst[2], 16)
+            fmt2 = b"%%0%dX" % max(4, len(lst[2]))
+            closure_found = True
+            while a <= b:
+                map_dict[
+                    unhexlify(fmt % a).decode(
+                        "charmap" if map_dict[-1] == 1 else "utf-16-be",
+                        "surrogatepass",
+                    )
+                ] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
+                int_entry.append(a)
+                a += 1
+                c += 1
+    return None if closure_found else (a, b)
 
 
 def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
@@ -290,7 +319,7 @@ def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> No
         # placeholder (see above) means empty string
         if lst[1] != b".":
             map_to = unhexlify(lst[1]).decode(
-                "utf-16-be", "surrogatepass"
+                "charmap" if len(lst[1]) < 4 else "utf-16-be", "surrogatepass"
             )  # join is here as some cases where the code was split
         map_dict[
             unhexlify(lst[0]).decode(

diff --git a/PyPDF2/_codecs/adobe_glyphs.py b/PyPDF2/_codecs/adobe_glyphs.py
@@ -13425,3 +13425,13 @@
     "/zukatakana": "\u30BA",
     "/zwarakay": "\u0659",
 }
+
+
+def _complete() -> None:
+    global adobe_glyphs
+    for i in range(256):
+        adobe_glyphs[f"/a{i}"] = chr(i)
+    adobe_glyphs["/.notdef"] = "□"
+
+
+_complete()
diff --git a/PyPDF2/_encryption.py b/PyPDF2/_encryption.py
@@ -119,7 +119,7 @@ def __init__(self, key: bytes) -> None:
                 self.S[i], self.S[j] = self.S[j], self.S[i]
 
         def encrypt(self, data: bytes) -> bytes:
-            S = [x for x in self.S]
+            S = list(self.S)
             out = list(0 for _ in range(len(data)))
             i, j = 0, 0
             for k in range(len(data)):
@@ -516,10 +516,13 @@ def verify_owner_password(
            should match the value in the P key.
         """
         password = password[:127]
-        if AlgV5.calculate_hash(R, password, o_value[32:40], u_value) != o_value[:32]:
+        if (
+            AlgV5.calculate_hash(R, password, o_value[32:40], u_value[:48])
+            != o_value[:32]
+        ):
             return b""
         iv = bytes(0 for _ in range(16))
-        tmp_key = AlgV5.calculate_hash(R, password, o_value[40:], u_value)
+        tmp_key = AlgV5.calculate_hash(R, password, o_value[40:48], u_value[:48])
         key = AES_CBC_decrypt(tmp_key, iv, oe_value)
         return key
 
@@ -532,7 +535,7 @@ def verify_user_password(
         if AlgV5.calculate_hash(R, password, u_value[32:40], b"") != u_value[:32]:
             return b""
         iv = bytes(0 for _ in range(16))
-        tmp_key = AlgV5.calculate_hash(R, password, u_value[40:], b"")
+        tmp_key = AlgV5.calculate_hash(R, password, u_value[40:48], b"")
         return AES_CBC_decrypt(tmp_key, iv, ue_value)
 
     @staticmethod

diff --git a/PyPDF2/_merger.py b/PyPDF2/_merger.py
@@ -541,21 +541,22 @@ def _write_outline_item_on_page(
         )
 
     def _associate_dests_to_pages(self, pages: List[_MergedPage]) -> None:
-        for nd in self.named_dests:
+        for named_dest in self.named_dests:
             pageno = None
-            np = nd["/Page"]
+            np = named_dest["/Page"]
 
             if isinstance(np, NumberObject):
                 continue
 
-            for p in pages:
-                if np.get_object() == p.pagedata.get_object():
-                    pageno = p.id
+            for page in pages:
+                if np.get_object() == page.pagedata.get_object():
+                    pageno = page.id
 
-            if pageno is not None:
-                nd[NameObject("/Page")] = NumberObject(pageno)
-            else:
-                raise ValueError(f"Unresolved named destination '{nd['/Title']}'")
+            if pageno is None:
+                raise ValueError(
+                    f"Unresolved named destination '{named_dest['/Title']}'"
+                )
+            named_dest[NameObject("/Page")] = NumberObject(pageno)
 
     @deprecate_bookmark(bookmarks="outline")
     def _associate_outline_items_to_pages(
@@ -612,12 +613,11 @@ def find_bookmark(
         self,
         outline_item: Dict[str, Any],
         root: Optional[OutlineType] = None,
-    ) -> Optional[List[int]]:
+    ) -> Optional[List[int]]:  # pragma: no cover
         """
         .. deprecated:: 2.9.0
             Use :meth:`find_outline_item` instead.
         """
-
         return self.find_outline_item(outline_item, root)
 
     def add_outline_item(