diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 14ac2fabf..683a68369 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -27,7 +27,6 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. -import codecs import collections import decimal import enum @@ -180,13 +179,7 @@ def __init__( # info object info = DictionaryObject() - info.update( - { - NameObject("/Producer"): create_string_object( - codecs.BOM_UTF16_BE + "pypdf".encode("utf-16be") - ) - } - ) + info.update({NameObject("/Producer"): create_string_object("pypdf")}) self._info_obj: PdfObject = self._add_object(info) # root object diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index bf6c75a15..cf50d820f 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -508,6 +508,28 @@ class TextStringObject(str, PdfObject): # noqa: SLOT000 to occur. """ + autodetect_pdfdocencoding: bool + autodetect_utf16: bool + utf16_bom: bytes + + def __new__(cls, value: Any) -> "TextStringObject": + if isinstance(value, bytes): + value = value.decode("charmap") + o = str.__new__(cls, value) + o.autodetect_utf16 = False + o.autodetect_pdfdocencoding = False + o.utf16_bom = b"" + if value.startswith(("\xfe\xff", "\xff\xfe")): + o.autodetect_utf16 = True + o.utf16_bom = value[:2].encode("charmap") + else: + try: + encode_pdfdocencoding(o) + o.autodetect_pdfdocencoding = True + except UnicodeEncodeError: + o.autodetect_utf16 = True + return o + def clone( self, pdf_dest: Any, @@ -518,13 +540,11 @@ def clone( obj = TextStringObject(self) obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding obj.autodetect_utf16 = self.autodetect_utf16 + obj.utf16_bom = self.utf16_bom return cast( "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate) ) - autodetect_pdfdocencoding = False - autodetect_utf16 = False - @property def original_bytes(self) -> bytes: """ @@ -542,20 +562,32 @@ def get_original_bytes(self) -> bytes: # would have 
been used to create this object, based upon the autodetect # method. if self.autodetect_utf16: - return codecs.BOM_UTF16_BE + self.encode("utf-16be") + if self.utf16_bom == codecs.BOM_UTF16_LE: + return codecs.BOM_UTF16_LE + self.encode("utf-16le") + elif self.utf16_bom == codecs.BOM_UTF16_BE: + return codecs.BOM_UTF16_BE + self.encode("utf-16be") + else: + return self.encode("utf-16be") elif self.autodetect_pdfdocencoding: return encode_pdfdocencoding(self) else: - raise Exception("no information about original bytes") + raise Exception("no information about original bytes") # pragma: no cover def get_encoded_bytes(self) -> bytes: # Try to write the string out as a PDFDocEncoding encoded string. It's # nicer to look at in the PDF file. Sadly, we take a performance hit # here for trying... try: + if self.autodetect_utf16: + raise UnicodeEncodeError("", "forced", -1, -1, "") bytearr = encode_pdfdocencoding(self) except UnicodeEncodeError: - bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be") + if self.utf16_bom == codecs.BOM_UTF16_LE: + bytearr = codecs.BOM_UTF16_LE + self.encode("utf-16le") + elif self.utf16_bom == codecs.BOM_UTF16_BE: + bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be") + else: + bytearr = self.encode("utf-16be") return bytearr def write_to_stream( diff --git a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py index e6da5cf09..049166f71 100644 --- a/pypdf/generic/_utils.py +++ b/pypdf/generic/_utils.py @@ -147,6 +147,7 @@ def create_string_object( if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)): retval = TextStringObject(string.decode("utf-16")) retval.autodetect_utf16 = True + retval.utf16_bom = string[:2] return retval else: # This is probably a big performance hit here, but we need diff --git a/tests/test_generic.py b/tests/test_generic.py index f59c559e0..88206f723 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -1,5 +1,6 @@ """Test the pypdf.generic module.""" +import codecs from base64 import 
a85encode from copy import deepcopy from io import BytesIO @@ -485,14 +486,13 @@ def test_rectangleobject(): def test_textstringobject_exc(): tso = TextStringObject("foo") - with pytest.raises(Exception) as exc: - tso.get_original_bytes() - assert exc.value.args[0] == "no information about original bytes" + assert tso.get_original_bytes() == b"foo" def test_textstringobject_autodetect_utf16(): tso = TextStringObject("foo") tso.autodetect_utf16 = True + tso.utf16_bom = codecs.BOM_UTF16_BE assert tso.get_original_bytes() == b"\xfe\xff\x00f\x00o\x00o" @@ -1107,20 +1107,37 @@ def test_indirect_object_invalid_read(): assert exc.value.args[0] == "Error reading indirect object reference at byte 0x5" -def test_create_string_object_utf16be_bom(): +def test_create_string_object_utf16_bom(): + # utf16-be result = create_string_object( b"\xfe\xff\x00P\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00" ) assert result == "PaperPort 14\x00" assert result.autodetect_utf16 is True + assert result.utf16_bom == b"\xfe\xff" + assert ( + result.get_encoded_bytes() + == b"\xfe\xff\x00P\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00" + ) - -def test_create_string_object_utf16le_bom(): + # utf16-le result = create_string_object( b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00" ) assert result == "PaperPort 14\x00" assert result.autodetect_utf16 is True + assert result.utf16_bom == b"\xff\xfe" + assert ( + result.get_encoded_bytes() + == b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00" + ) + + # utf16-be without bom + result = TextStringObject("ÿ") + result.autodetect_utf16 = True + result.utf16_bom = b"" + assert result.get_encoded_bytes() == b"\x00\xFF" + assert result.original_bytes == b"\x00\xFF" def test_create_string_object_force(): diff --git a/tests/test_images.py b/tests/test_images.py index 0dbc3956e..5982ecf20 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -358,6 +358,22
@@ def test_large_compressed_image(): list(reader.pages[0].images) +@pytest.mark.enable_socket() +def test_ff_fe_starting_lut(): + """Cf issue #2660""" + url = "https://github.com/py-pdf/pypdf/files/15385628/original_before_merge.pdf" + name = "iss2660.pdf" + writer = PdfWriter(BytesIO(get_data_from_url(url, name=name))) + b = BytesIO() + writer.write(b) + reader = PdfReader(b) + url = "https://github.com/py-pdf/pypdf/assets/4083478/6150700d-87fd-43a2-8695-c2c05a44838c" + name = "iss2660.png" + img = Image.open(BytesIO(get_data_from_url(url, name=name))) + assert image_similarity(writer.pages[1].images[0].image, img) == 1.0 + assert image_similarity(reader.pages[1].images[0].image, img) == 1.0 + + @pytest.mark.enable_socket() def test_inline_image_extraction(): """Cf #2598"""