Commit 80ff23b
Merge branch 'main' into outline-nomenclature
mtd91429 committed Jul 25, 2022
2 parents 9537c4b + 844f238
Showing 20 changed files with 350 additions and 144 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -30,7 +30,7 @@ repos:
hooks:
- id: isort
- repo: https://github.com/psf/black
rev: 22.3.0
rev: 22.6.0
hooks:
- id: black
args: [--target-version, py36]
@@ -40,7 +40,7 @@
- id: blacken-docs
additional_dependencies: [black==22.1.0]
- repo: https://github.com/asottile/pyupgrade
rev: v2.34.0
rev: v2.37.2
hooks:
- id: pyupgrade
args: [--py36-plus]
29 changes: 29 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,34 @@
# CHANGELOG

## Version 2.8.0, 2022-07-24

### New Features (ENH)
- Add writer.add_annotation, page.annotations, and generic.AnnotationBuilder (#1120)
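  As a quick orientation for this feature, a minimal usage sketch (file names are hypothetical; exact keyword signatures are assumptions based on the feature's documentation):

  ```python
  from PyPDF2 import PdfReader, PdfWriter
  from PyPDF2.generic import AnnotationBuilder

  reader = PdfReader("example.pdf")  # hypothetical input file
  writer = PdfWriter()
  writer.add_page(reader.pages[0])

  # build a free-text annotation; rect is (x0, y0, x1, y1) in user space
  annotation = AnnotationBuilder.free_text(
      "Hello World",
      rect=(50, 550, 200, 650),
  )
  writer.add_annotation(page_number=0, annotation=annotation)

  with open("annotated.pdf", "wb") as fp:
      writer.write(fp)

  # page.annotations gives read access to a page's annotations (may be None)
  for annot in PdfReader("annotated.pdf").pages[0].annotations or []:
      print(annot.get_object())
  ```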

### Bug Fixes (BUG)
- Set /AS for /Btn form fields in writer (#1161)
- Ignore if /Perms verify failed (#1157)

### Robustness (ROB)
- Cope with utf16 character for space calculation (#1155)
- Cope with null params for FitH / FitV destination (#1152)
- Handle outlines without valid destination (#1076)

### Developer Experience (DEV)
- Introduce _utils.logger_warning (#1148)

### Maintenance (MAINT)
- Break up parse_to_unicode (#1162)
- Add diagnostic output to exception in read_from_stream (#1159)
- Reduce PdfReader.read complexity (#1151)

### Testing (TST)
- Add workflow tests found by arc testing (#1154)
- Decrypt file which is not encrypted (#1149)
- Test CryptRC4 encryption class; test image extraction filters (#1147)

Full Changelog: https://github.com/py-pdf/PyPDF2/compare/2.7.0...2.8.0

## Version 2.7.0, 2022-07-21

### New Features (ENH)
3 changes: 3 additions & 0 deletions Makefile
@@ -31,3 +31,6 @@ mutation-results:

benchmark:
pytest tests/bench.py

mypy:
mypy PyPDF2 --ignore-missing-imports --check-untyped-defs --strict
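(The target can be run as `make mypy`; `--check-untyped-defs` also type-checks the bodies of unannotated functions, though `--strict` already implies it.)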
189 changes: 109 additions & 80 deletions PyPDF2/_cmap.py
@@ -42,7 +42,11 @@ def build_char_map(
pass
# we consider that the space_code fits in one byte
if isinstance(space_code, str):
sp = space_code.encode("charmap")[0]
try: # one byte
sp = space_code.encode("charmap")[0]
except Exception:
sp = space_code.encode("utf-16-be")
sp = sp[0] * 256 + sp[1]  # big-endian: high byte first
else:
sp = space_code
sp_width = compute_space_width(ft, sp, space_width)
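Restated outside the diff, the fallback computes the space character's integer code roughly like this standalone sketch (hypothetical helper name, mirrors the hunk above):

```python
def _space_code_to_int(space_code: str) -> int:
    """Sketch: integer char code of the space character."""
    try:  # single-byte encodings
        return space_code.encode("charmap")[0]
    except Exception:  # two-byte code points
        two = space_code.encode("utf-16-be")
        return two[0] * 256 + two[1]  # big-endian: high byte first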
@@ -52,12 +56,12 @@
float(sp_width / 2),
encoding,
# https://github.com/python/mypy/issues/4374
map_dict, # type: ignore
) # type: ignore
map_dict,
)


# used when data is missing, e.g. the font definition is absent
unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict] = (
unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]] = (
"Unknown",
9999,
dict(zip(range(256), ["�"] * 256)),
@@ -104,15 +108,15 @@ def parse_encoding(
encoding: Union[str, List[str], Dict[int, str]] = []
if "/Encoding" not in ft:
try:
if "/BaseFont" in ft and ft["/BaseFont"] in charset_encoding:
if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
encoding = dict(
zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
)
else:
encoding = "charmap"
return encoding, _default_fonts_space_width[cast(str, ft["/BaseFont"])]
except Exception:
if ft["/Subtype"] == "/Type1":
if cast(str, ft["/Subtype"]) == "/Type1":
return "charmap", space_code
else:
return "", space_code
@@ -163,19 +167,31 @@ def parse_encoding(

def parse_to_unicode(
ft: DictionaryObject, space_code: int
) -> Tuple[Dict, int, List[int]]:
map_dict: Dict[
Any, Any
] = (
{}
) # will store all translation code and map_dict[-1] we will have the number of bytes to convert
int_entry: List[
int
] = [] # will provide the list of cmap keys as int to correct encoding
) -> Tuple[Dict[Any, Any], int, List[int]]:
# stores all translation codes;
# map_dict[-1] holds the number of bytes per character code
map_dict: Dict[Any, Any] = {}

# list of cmap keys, as ints, used to correct the encoding
int_entry: List[int] = []

if "/ToUnicode" not in ft:
return {}, space_code, []
process_rg: bool = False
process_char: bool = False
cm = prepare_cm(ft)
for l in cm.split(b"\n"):
process_rg, process_char = process_cm_line(
l.strip(b" "), process_rg, process_char, map_dict, int_entry
)

for a, value in map_dict.items():
if value == " ":
space_code = a
return map_dict, space_code, int_entry


def prepare_cm(ft: DictionaryObject) -> bytes:
cm: bytes = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
# prepare cm first: PDFs printed-to-PDF from Word may lack the return line
cm = (
@@ -204,71 +220,84 @@ def parse_to_unicode(
.replace(b"]", b" ]\n ")
.replace(b"\r", b"\n")
)
return cm

for l in cm.split(b"\n"):
if l in (b"", b" ") or l[0] == 37: # 37 = %
continue
if b"beginbfrange" in l:
process_rg = True
elif b"endbfrange" in l:
process_rg = False
elif b"beginbfchar" in l:
process_char = True
elif b"endbfchar" in l:
process_char = False
elif process_rg:
lst = [x for x in l.split(b" ") if x]
a = int(lst[0], 16)
b = int(lst[1], 16)
nbi = len(lst[0])
map_dict[-1] = nbi // 2
fmt = b"%%0%dX" % nbi
if lst[2] == b"[":
for sq in lst[3:]:
if sq == b"]":
break
map_dict[
unhexlify(fmt % a).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be",
"surrogatepass",
)
] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
int_entry.append(a)
a += 1
else:
c = int(lst[2], 16)
fmt2 = b"%%0%dX" % max(4, len(lst[2]))
while a <= b:
map_dict[
unhexlify(fmt % a).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be",
"surrogatepass",
)
] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
int_entry.append(a)
a += 1
c += 1
elif process_char:
lst = [x for x in l.split(b" ") if x]
map_dict[-1] = len(lst[0]) // 2
while len(lst) > 1:
map_to = ""
# placeholder (see above) means empty string
if lst[1] != b".":
map_to = unhexlify(lst[1]).decode(
"utf-16-be", "surrogatepass"
) # join is here as some cases where the code was split
map_dict[
unhexlify(lst[0]).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
)
] = map_to
int_entry.append(int(lst[0], 16))
lst = lst[2:]
for a, value in map_dict.items():
if value == " ":
space_code = a
return map_dict, space_code, int_entry

def process_cm_line(
l: bytes,
process_rg: bool,
process_char: bool,
map_dict: Dict[Any, Any],
int_entry: List[int],
) -> Tuple[bool, bool]:
if l in (b"", b" ") or l[0] == 37: # 37 = %
return process_rg, process_char
if b"beginbfrange" in l:
process_rg = True
elif b"endbfrange" in l:
process_rg = False
elif b"beginbfchar" in l:
process_char = True
elif b"endbfchar" in l:
process_char = False
elif process_rg:
parse_bfrange(l, map_dict, int_entry)
elif process_char:
parse_bfchar(l, map_dict, int_entry)
return process_rg, process_char


def parse_bfrange(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
lst = [x for x in l.split(b" ") if x]
a = int(lst[0], 16)
b = int(lst[1], 16)
nbi = len(lst[0])
map_dict[-1] = nbi // 2
fmt = b"%%0%dX" % nbi
if lst[2] == b"[":
for sq in lst[3:]:
if sq == b"]":
break
map_dict[
unhexlify(fmt % a).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be",
"surrogatepass",
)
] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
int_entry.append(a)
a += 1
else:
c = int(lst[2], 16)
fmt2 = b"%%0%dX" % max(4, len(lst[2]))
while a <= b:
map_dict[
unhexlify(fmt % a).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be",
"surrogatepass",
)
] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
int_entry.append(a)
a += 1
c += 1


def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
lst = [x for x in l.split(b" ") if x]
map_dict[-1] = len(lst[0]) // 2
while len(lst) > 1:
map_to = ""
# placeholder (see above) means empty string
if lst[1] != b".":
map_to = unhexlify(lst[1]).decode(
"utf-16-be", "surrogatepass"
)  # join is here because in some cases the code was split
map_dict[
unhexlify(lst[0]).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
)
] = map_to
int_entry.append(int(lst[0], 16))
lst = lst[2:]
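
A toy illustration of the two helpers' semantics (input values invented for the example; both are private helpers introduced by this diff):

```python
from PyPDF2._cmap import parse_bfchar, parse_bfrange  # private helpers

map_dict: dict = {}
int_entry: list = []

# bfrange line: codes 0x0042..0x0043 map to consecutive values from 0x0062
parse_bfrange(b"0042 0043 0062", map_dict, int_entry)
assert map_dict == {-1: 2, "B": "b", "C": "c"}  # -1 holds the byte count

# bfchar line: code 0x0041 maps directly to U+0061
parse_bfchar(b"0041 0061", map_dict, int_entry)
assert map_dict["A"] == "a"
assert int_entry == [0x42, 0x43, 0x41]
```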


def compute_space_width(
@@ -285,7 +314,7 @@ def compute_space_width(
except Exception:
w1[-1] = 1000.0
if "/W" in ft1:
w = list(ft1["/W"]) # type: ignore
w = list(ft1["/W"])
else:
w = []
while len(w) > 0:
7 changes: 4 additions & 3 deletions PyPDF2/_encryption.py
@@ -29,8 +29,9 @@
import random
import struct
from enum import IntEnum
from typing import Optional, Tuple, Union, cast
from typing import Any, Dict, Optional, Tuple, Union, cast

from PyPDF2._utils import logger_warning
from PyPDF2.errors import DependencyError
from PyPDF2.generic import (
ArrayObject,
@@ -565,7 +566,7 @@ def verify_perms(
@staticmethod
def generate_values(
user_pwd: bytes, owner_pwd: bytes, key: bytes, p: int, metadata_encrypted: bool
) -> dict:
) -> Dict[Any, Any]:
u_value, ue_value = AlgV5.compute_U_value(user_pwd, key)
o_value, oe_value = AlgV5.compute_O_value(owner_pwd, key, u_value)
perms = AlgV5.compute_Perms_value(key, p, metadata_encrypted)
@@ -826,7 +827,7 @@ def verify_v5(self, password: bytes) -> Tuple[bytes, PasswordType]:
P = (P + 0x100000000) % 0x100000000 # maybe < 0
metadata_encrypted = self.entry.get("/EncryptMetadata", True)
if not AlgV5.verify_perms(key, perms, P, metadata_encrypted):
return b"", PasswordType.NOT_DECRYPTED
logger_warning("ignore '/Perms' verify failed", __name__)
return key, rc

@staticmethod
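The behavioral effect of this hunk, sketched (file name and password are hypothetical):

```python
from PyPDF2 import PdfReader

reader = PdfReader("encrypted.pdf")
if reader.is_encrypted:
    # with this change, a failed /Perms check logs a warning via
    # _utils.logger_warning instead of rejecting an otherwise valid key
    reader.decrypt("correct-password")
print(len(reader.pages))
```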
6 changes: 3 additions & 3 deletions PyPDF2/_merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,12 +567,12 @@ def find_outline_item(

for i, oi_enum in enumerate(root):
if isinstance(oi_enum, list):
# b is still an inner node
# oi_enum is still an inner node
# (OutlineType, if recursive types were supported by mypy)
res = self.find_outline_item(outline_item, oi_enum) # type: ignore
if res:
return [i] + res
elif oi_enum == outline_item or oi_enum["/Title"] == outline_item:
elif oi_enum == outline_item or cast(Dict[Any, Any], oi_enum["/Title"]) == outline_item:
# we found a leaf node
return [i]

@@ -689,7 +689,7 @@ def add_named_destination(self, title: str, pagenum: int) -> None:

class PdfFileMerger(PdfMerger): # pragma: no cover
def __init__(self, *args: Any, **kwargs: Any) -> None:
deprecate_with_replacement("PdfFileMerger", "PdfMerge")
deprecate_with_replacement("PdfFileMerger", "PdfMerger")

if "strict" not in kwargs and len(args) < 1:
kwargs["strict"] = True # maintain the default
8 changes: 4 additions & 4 deletions PyPDF2/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,9 +506,9 @@ def _merge_page(
# Combine /ProcSet sets.
new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
frozenset(
original_resources.get(RES.PROC_SET, ArrayObject()).get_object() # type: ignore
original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
).union(
frozenset(page2resources.get(RES.PROC_SET, ArrayObject()).get_object()) # type: ignore
frozenset(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())
)
)

@@ -1248,7 +1248,7 @@ def process_operation(operator: bytes, operands: List) -> None:
cmaps[operands[0]][2],
cmaps[operands[0]][3],
operands[0],
) # type:ignore
)
except KeyError: # font not found
_space_width = unknown_char_map[1]
cmap = (
@@ -1395,7 +1395,7 @@ def process_operation(operator: bytes, operands: List) -> None:
except IndexError:
pass
try:
xobj = resources_dict["/XObject"] # type: ignore
xobj = resources_dict["/XObject"]
if xobj[operands[0]]["/Subtype"] != "/Image": # type: ignore
# output += text
text = self.extract_xform_text(xobj[operands[0]], space_width) # type: ignore
