Skip to content

Commit

Permalink
ROB : improve nameobject reading/writing
Browse files Browse the repository at this point in the history
  • Loading branch information
pubpub-zz committed Sep 13, 2022
1 parent e23b985 commit 6253a01
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 21 deletions.
4 changes: 2 additions & 2 deletions PyPDF2/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from ._codecs import adobe_glyphs, charset_encoding
from ._utils import logger_warning
from .errors import PdfReadWarning
from .generic import DecodedStreamObject, DictionaryObject, NameObject
from .generic import DecodedStreamObject, DictionaryObject


# code freely inspired by @twiggy; see #711
Expand Down Expand Up @@ -124,7 +124,7 @@ def parse_encoding(
enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore
if isinstance(enc, str):
try:
enc = NameObject.unnumber(enc) # for #xx decoding
# already done: enc = NameObject.unnumber(enc.encode()).decode() # for #xx decoding
if enc in charset_encoding:
encoding = charset_encoding[enc].copy()
elif enc in _predefined_cmap:
Expand Down
2 changes: 1 addition & 1 deletion PyPDF2/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1121,7 +1121,7 @@ def _extract_text_old(
def _debug_for_extract(self) -> str: # pragma: no cover
out = ""
for ope, op in ContentStream(
self["/Contents"].getObject(), self.pdf, "bytes"
self["/Contents"].get_object(), self.pdf, "bytes"
).operations:
if op == b"TJ":
s = [x for x in ope[0] if isinstance(x, str)]
Expand Down
62 changes: 48 additions & 14 deletions PyPDF2/generic/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import decimal
import hashlib
import re
from binascii import unhexlify
from typing import Any, Callable, Optional, Union

from .._codecs import _pdfdoc_encoding_rev
Expand Down Expand Up @@ -408,24 +409,51 @@ def writeToStream(
class NameObject(str, PdfObject):
delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
surfix = b"/"
renumber_table = {
"#": b"#23",
"(": b"#28",
")": b"#29",
"/": b"#2F",
**{chr(i): f"#{i:02X}".encode() for i in range(33)},
}

def write_to_stream(
self, stream: StreamType, encryption_key: Union[None, str, bytes]
) -> None:
stream.write(b_(self))
stream.write(self.renumber()) # b_(renumber(self)))

def writeToStream(
    self,
    stream: StreamType,
    encryption_key: Union[None, str, bytes],
) -> None:  # pragma: no cover
    """
    Deprecated camelCase spelling of ``write_to_stream``.

    Reports the deprecation through ``deprecate_with_replacement`` and then
    forwards both arguments, unchanged, to ``write_to_stream``.
    """
    deprecate_with_replacement("writeToStream", "write_to_stream")
    self.write_to_stream(stream, encryption_key)

def renumber(self) -> bytes:
    """
    Return the name as bytes, with special characters written as ``#XX``.

    The first character is emitted as-is; a warning is logged when it is
    not ``/``. Each following character is written as:

    * ``#XX`` for every byte of its UTF-8 encoding when it sorts after ``~``;
    * its ``renumber_table`` entry when it has one;
    * its plain UTF-8 encoding otherwise.

    An empty name logs the same warning and yields ``b""`` rather than
    raising ``IndexError``.

    :return: the escaped byte representation of this name.
    """
    # Slice instead of indexing so an empty name does not raise IndexError.
    first = self[:1].encode("utf-8")
    if first != b"/":
        logger_warning(f"Incorrect first char in NameObject:({self})", __name__)

    # Collect the pieces and join once instead of growing a bytes object
    # with repeated ``+=``.
    pieces = [first]
    for c in self[1:]:
        if c > "~":
            pieces.extend(f"#{x:02X}".encode() for x in c.encode("utf-8"))
            continue
        escaped = self.renumber_table.get(c)
        pieces.append(c.encode("utf-8") if escaped is None else escaped)
    return b"".join(pieces)

@staticmethod
def unnumber(sin: str) -> str:
i = sin.find("#", 0)
def unnumber(sin: bytes) -> bytes:
i = sin.find(b"#", 0)
while i >= 0:
sin = sin[:i] + chr(int(sin[i + 1 : i + 3], 16)) + sin[i + 3 :]
i = sin.find("#", i + 1)
try:
sin = sin[:i] + unhexlify(sin[i + 1 : i + 3]) + sin[i + 3 :]
i = sin.find(b"#", i + 1)
except ValueError:
# if the 2 characters after '#' are not valid hexadecimal digits,
# leave them unchanged and carry on
i = i + 1
return sin

@staticmethod
Expand All @@ -435,20 +463,26 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader
raise PdfReadError("name read error")
name += read_until_regex(stream, NameObject.delimiter_pattern, ignore_eof=True)
try:
try:
ret = name.decode("utf-8")
except (UnicodeEncodeError, UnicodeDecodeError):
ret = name.decode("gbk")
# Name objects should represent irregular characters
# with a '#' followed by the symbol's hex number
ret = NameObject.unnumber(ret)
return NameObject(ret)
name = NameObject.unnumber(name)
for enc in ("utf-8", "latin-1", "gbk"):
try:
ret = name.decode("utf-8")
return NameObject(ret)
except Exception:
pass
raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
except (UnicodeEncodeError, UnicodeDecodeError) as e:
if not pdf.strict:
logger_warning("Illegal character in Name Object", __name__)
return NameObject(name)
logger_warning(
f"Illegal character in Name Object ({repr(name)})", __name__
)
return NameObject(name.decode("charmap"))
else:
raise PdfReadError("Illegal character in Name Object") from e
raise PdfReadError(
f"Illegal character in Name Object ({repr(name)})"
) from e

@staticmethod
def readFromStream(
Expand Down
43 changes: 39 additions & 4 deletions tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,11 @@ def test_readStringFromStream_excape_digit2():
assert read_string_from_stream(stream) == "hello \x01\x02\x03\x04"


def test_NameObject():
class fake:
    """Minimal stand-in object that exposes only a false ``strict`` flag."""

    strict = False


def test_NameObject(caplog):
stream = BytesIO(b"x")
with pytest.raises(PdfReadError) as exc:
NameObject.read_from_stream(stream, None)
Expand All @@ -186,17 +190,48 @@ def test_NameObject():
== "/paired()parentheses"
)
assert NameObject.read_from_stream(BytesIO(b"/A#42"), None) == "/AB"

assert (
NameObject.read_from_stream(
BytesIO(b"/#f1j#d4#aa#0c#ce#87#b4#b3#b0#23J#86#fe#2a#b2jYJ#94"), None
BytesIO(b"/#f1j#d4#aa#0c#ce#87#b4#b3#b0#23J#86#fe#2a#b2jYJ#94"), fake()
)
== "/ñjÔª\x0cÎ\x87´³°#J\x86þ*²jYJ\x94"
)

assert (NameObject.read_from_stream(BytesIO(b"/#JA#231f"), None)) == "/#JA#1f"

assert (
NameObject.read_from_stream(
BytesIO(b"/#e4#bd#a0#e5#a5#bd#e4#b8#96#e7#95#8c"), None
)
) == "/你好世界"

# test write
b = BytesIO()
NameObject("/hello").write_to_stream(b, None)
assert bytes(b.getbuffer()) == b"/hello"

caplog.clear()
b = BytesIO()
NameObject("hello").write_to_stream(b, None)
assert bytes(b.getbuffer()) == b"hello"
assert "Incorrect first char" in caplog.text

caplog.clear()
b = BytesIO()
NameObject("/DIJMAC+Arial Black#1").write_to_stream(b, None)
assert bytes(b.getbuffer()) == b"/DIJMAC+Arial#20Black#231"
assert caplog.text == ""

b = BytesIO()
NameObject("/你好世界").write_to_stream(b, None)
assert bytes(b.getbuffer()) == b"/#E4#BD#A0#E5#A5#BD#E4#B8#96#E7#95#8C"
assert caplog.text == ""


def test_destination_fit_r():
d = Destination(
NameObject("title"),
TextStringObject("title"),
NullObject(),
NameObject(TF.FIT_R),
FloatObject(0),
Expand Down Expand Up @@ -812,7 +847,7 @@ def test_name_object_invalid_decode():
# strict:
with pytest.raises(PdfReadError) as exc:
NameObject.read_from_stream(stream, ReaderDummy(strict=True))
assert exc.value.args[0] == "Illegal character in Name Object"
assert "Illegal character in Name Object" in exc.value.args[0]

# non-strict:
stream.seek(0)
Expand Down
12 changes: 12 additions & 0 deletions tests/test_merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,3 +380,15 @@ def test_deprecate_bookmark_decorator_output():
merger.merge(0, reader, import_bookmarks=True)
first_oi_title = 'Valid Destination: Action /GoTo Named Destination "section.1"'
assert merger.outline[0].title == first_oi_title


def test_iss1344(caplog):
    """
    Merge the sample PDF for iss1344 and re-read the written result.

    Checks that the first page of the merged output still exposes the font
    name containing a space and the accented French text.
    """
    source_pdf = BytesIO(
        get_pdf_from_url(
            "https://github.com/py-pdf/PyPDF2/files/9549001/input.pdf",
            name="iss1344.pdf",
        )
    )
    merger = PdfMerger()
    merger.append(PdfReader(source_pdf))

    merged_output = BytesIO()
    merger.write(merged_output)

    first_page = PdfReader(merged_output).pages[0]
    assert "/DIJMAC+Arial Black" in first_page._debug_for_extract()
    assert "adresse où le malade peut être visité" in first_page.extract_text()

0 comments on commit 6253a01

Please sign in to comment.