diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py
index 3fde174b0..bf6c75a15 100644
--- a/pypdf/generic/_base.py
+++ b/pypdf/generic/_base.py
@@ -615,7 +615,10 @@ def write_to_stream(
     def renumber(self) -> bytes:
         out = self[0].encode("utf-8")
         if out != b"/":
-            deprecate_no_replacement(f"Incorrect first char in NameObject, should start with '/': ({self})", "6.0.0")
+            deprecate_no_replacement(
+                f"Incorrect first char in NameObject, should start with '/': ({self})",
+                "6.0.0",
+            )
         for c in self[1:]:
             if c > "~":
                 for x in c.encode("utf-8"):
@@ -640,6 +643,8 @@ def unnumber(sin: bytes) -> bytes:
                 i = i + 1
         return sin
 
+    CHARSETS = ("utf-8", "gbk", "latin1")
+
     @staticmethod
     def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
         name = stream.read(1)
@@ -650,7 +655,7 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
             # Name objects should represent irregular characters
             # with a '#' followed by the symbol's hex number
             name = NameObject.unnumber(name)
-            for enc in ("utf-8", "gbk"):
+            for enc in NameObject.CHARSETS:
                 try:
                     ret = name.decode(enc)
                     return NameObject(ret)
@@ -659,11 +664,16 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
             raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
         except (UnicodeEncodeError, UnicodeDecodeError) as e:
             if not pdf.strict:
-                logger_warning(f"Illegal character in Name Object ({name!r})", __name__)
+                logger_warning(
+                    f"Illegal character in NameObject ({name!r}), "
+                    "you may need to adjust NameObject.CHARSETS",
+                    __name__,
+                )
                 return NameObject(name.decode("charmap"))
             else:
                 raise PdfReadError(
-                    f"Illegal character in Name Object ({name!r})"
+                    f"Illegal character in NameObject ({name!r}). "
+                    "You may need to adjust NameObject.CHARSETS.",
                 ) from e
 
 
diff --git a/tests/test_generic.py b/tests/test_generic.py
index 39cf479b7..e772618e6 100644
--- a/tests/test_generic.py
+++ b/tests/test_generic.py
@@ -1,5 +1,6 @@
 """Test the pypdf.generic module."""
 
+from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
 from unittest.mock import patch
@@ -212,6 +213,11 @@ def test_name_object(caplog):
         )
     ) == "/你好世界"
 
+    # to test latin-1 aka stdencoding
+    assert (
+        NameObject.read_from_stream(BytesIO(b"/DocuSign\xae"), None)
+    ) == "/DocuSign®"
+
     # test write
     b = BytesIO()
     NameObject("/hello").write_to_stream(b)
@@ -1036,16 +1042,20 @@ def test_checkboxradiobuttonattributes_opt():
 
 
 def test_name_object_invalid_decode():
-    stream = BytesIO(b"/\x80\x02\x03")
-
-    # strict:
-    with pytest.raises(PdfReadError) as exc:
-        NameObject.read_from_stream(stream, ReaderDummy(strict=True))
-    assert "Illegal character in Name Object" in exc.value.args[0]
-
-    # non-strict:
-    stream.seek(0)
-    NameObject.read_from_stream(stream, ReaderDummy(strict=False))
+    charsets = deepcopy(NameObject.CHARSETS)
+    try:
+        NameObject.CHARSETS = ("utf-8",)
+        stream = BytesIO(b"/\x80\x02\x03")
+        # strict:
+        with pytest.raises(PdfReadError) as exc:
+            NameObject.read_from_stream(stream, ReaderDummy(strict=True))
+        assert "Illegal character in NameObject " in exc.value.args[0]
+
+        # non-strict:
+        stream.seek(0)
+        NameObject.read_from_stream(stream, ReaderDummy(strict=False))
+    finally:
+        NameObject.CHARSETS = charsets
 
 
 def test_indirect_object_invalid_read():