Skip to content

Commit

Permalink
BUG: Fix images issue 4 bits encoding and LUT starting with UTF16_BOM (
Browse files Browse the repository at this point in the history
…#2675)

Closes #2660.
  • Loading branch information
pubpub-zz committed May 27, 2024
1 parent 6346e4c commit 7481f36
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 20 deletions.
9 changes: 1 addition & 8 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import codecs
import collections
import decimal
import enum
Expand Down Expand Up @@ -180,13 +179,7 @@ def __init__(

# info object
info = DictionaryObject()
info.update(
{
NameObject("/Producer"): create_string_object(
codecs.BOM_UTF16_BE + "pypdf".encode("utf-16be")
)
}
)
info.update({NameObject("/Producer"): create_string_object("pypdf")})
self._info_obj: PdfObject = self._add_object(info)

# root object
Expand Down
44 changes: 38 additions & 6 deletions pypdf/generic/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,28 @@ class TextStringObject(str, PdfObject): # noqa: SLOT000
to occur.
"""

autodetect_pdfdocencoding: bool
autodetect_utf16: bool
utf16_bom: bytes

def __new__(cls, value: Any) -> "TextStringObject":
    """
    Build the text string and record how it appears to have been encoded.

    Bytes input is first mapped one-to-one onto characters with the
    "charmap" codec. Three attributes are then set on the new object:

    * ``utf16_bom``: the two leading BOM bytes when the value starts with
      a UTF-16 byte-order mark (either order), otherwise ``b""``.
    * ``autodetect_utf16``: True when a BOM was found, or when the text
      cannot be encoded by ``encode_pdfdocencoding``.
    * ``autodetect_pdfdocencoding``: True when no BOM was found and
      ``encode_pdfdocencoding`` accepted the text.
    """
    text = value.decode("charmap") if isinstance(value, bytes) else value
    obj = str.__new__(cls, text)

    # Start from neutral defaults; exactly one autodetect flag is raised below.
    obj.autodetect_utf16 = False
    obj.autodetect_pdfdocencoding = False
    obj.utf16_bom = b""

    if not text.startswith(("\xfe\xff", "\xff\xfe")):
        # No BOM: probe whether the text is encodable with encode_pdfdocencoding.
        try:
            encode_pdfdocencoding(obj)
        except UnicodeEncodeError:
            obj.autodetect_utf16 = True
        else:
            obj.autodetect_pdfdocencoding = True
        return obj

    # Leading BOM characters: remember the exact two bytes that were seen.
    obj.autodetect_utf16 = True
    obj.utf16_bom = text[:2].encode("charmap")
    return obj

def clone(
self,
pdf_dest: Any,
Expand All @@ -518,13 +540,11 @@ def clone(
obj = TextStringObject(self)
obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
obj.autodetect_utf16 = self.autodetect_utf16
obj.utf16_bom = self.utf16_bom
return cast(
"TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
)

autodetect_pdfdocencoding = False
autodetect_utf16 = False

@property
def original_bytes(self) -> bytes:
"""
Expand All @@ -542,20 +562,32 @@ def get_original_bytes(self) -> bytes:
# would have been used to create this object, based upon the autodetect
# method.
if self.autodetect_utf16:
return codecs.BOM_UTF16_BE + self.encode("utf-16be")
if self.utf16_bom == codecs.BOM_UTF16_LE:
return codecs.BOM_UTF16_LE + self.encode("utf-16le")
elif self.utf16_bom == codecs.BOM_UTF16_BE:
return codecs.BOM_UTF16_BE + self.encode("utf-16be")
else:
return self.encode("utf-16be")
elif self.autodetect_pdfdocencoding:
return encode_pdfdocencoding(self)
else:
raise Exception("no information about original bytes")
raise Exception("no information about original bytes") # pragma: no cover

def get_encoded_bytes(self) -> bytes:
    """
    Return the bytes used to serialise this string.

    Unless ``autodetect_utf16`` is set, the string is first tried with
    ``encode_pdfdocencoding`` (nicer to look at in the file, at the cost
    of the attempt). If that is skipped or fails with
    ``UnicodeEncodeError``, UTF-16 is used: with the recorded BOM and
    matching byte order when ``utf16_bom`` is a known BOM, otherwise
    big-endian without any BOM.
    """
    if not self.autodetect_utf16:
        try:
            return encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            # Not representable that way; fall through to UTF-16.
            pass

    if self.utf16_bom == codecs.BOM_UTF16_LE:
        return codecs.BOM_UTF16_LE + self.encode("utf-16le")
    if self.utf16_bom == codecs.BOM_UTF16_BE:
        return codecs.BOM_UTF16_BE + self.encode("utf-16be")
    return self.encode("utf-16be")

def write_to_stream(
Expand Down
1 change: 1 addition & 0 deletions pypdf/generic/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ def create_string_object(
if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
retval = TextStringObject(string.decode("utf-16"))
retval.autodetect_utf16 = True
retval.utf16_bom = string[:2]
return retval
else:
# This is probably a big performance hit here, but we need
Expand Down
29 changes: 23 additions & 6 deletions tests/test_generic.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Test the pypdf.generic module."""

import codecs
from base64 import a85encode
from copy import deepcopy
from io import BytesIO
Expand Down Expand Up @@ -485,14 +486,13 @@ def test_rectangleobject():

def test_textstringobject_exc():
tso = TextStringObject("foo")
with pytest.raises(Exception) as exc:
tso.get_original_bytes()
assert exc.value.args[0] == "no information about original bytes"
assert tso.get_original_bytes() == b"foo"


def test_textstringobject_autodetect_utf16():
    """With UTF-16 forced and a big-endian BOM set, the original bytes carry that BOM."""
    text_obj = TextStringObject("foo")
    text_obj.utf16_bom = codecs.BOM_UTF16_BE
    text_obj.autodetect_utf16 = True
    expected = b"\xfe\xff\x00f\x00o\x00o"
    assert text_obj.get_original_bytes() == expected


Expand Down Expand Up @@ -1107,20 +1107,37 @@ def test_indirect_object_invalid_read():
assert exc.value.args[0] == "Error reading indirect object reference at byte 0x5"


def test_create_string_object_utf16be_bom():
def test_create_string_object_utf16_bom():
# utf16-be
result = create_string_object(
b"\xfe\xff\x00P\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00"
)
assert result == "PaperPort 14\x00"
assert result.autodetect_utf16 is True
assert result.utf16_bom == b"\xfe\xff"
assert (
result.get_encoded_bytes()
== b"\xfe\xff\x00P\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00"
)


def test_create_string_object_utf16le_bom():
# utf16-le
result = create_string_object(
b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00"
)
assert result == "PaperPort 14\x00"
assert result.autodetect_utf16 is True
assert result.utf16_bom == b"\xff\xfe"
assert (
result.get_encoded_bytes()
== b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00"
)

# utf16-be without bom
result = TextStringObject("ÿ")
result.autodetect_utf16 = True
result.utf16_bom = b""
assert result.get_encoded_bytes() == b"\x00\xFF"
assert result.original_bytes == b"\x00\xFF"


def test_create_string_object_force():
Expand Down
16 changes: 16 additions & 0 deletions tests/test_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,22 @@ def test_large_compressed_image():
list(reader.pages[0].images)


@pytest.mark.enable_socket()
def test_ff_fe_starting_lut():
    """
    Cf issue #2660

    The second page's first image must match the reference PNG both when
    read from the writer directly and after a write/re-read round trip.
    """
    pdf_url = "https://github.com/py-pdf/pypdf/files/15385628/original_before_merge.pdf"
    pdf_name = "iss2660.pdf"
    writer = PdfWriter(BytesIO(get_data_from_url(pdf_url, name=pdf_name)))

    # Round-trip through an in-memory buffer to exercise serialisation.
    buffer = BytesIO()
    writer.write(buffer)
    reader = PdfReader(buffer)

    png_url = "https://github.com/py-pdf/pypdf/assets/4083478/6150700d-87fd-43a2-8695-c2c05a44838c"
    png_name = "iss2660.png"
    expected = Image.open(BytesIO(get_data_from_url(png_url, name=png_name)))

    for source in (writer, reader):
        assert image_similarity(source.pages[1].images[0].image, expected) == 1.0


@pytest.mark.enable_socket()
def test_inline_image_extraction():
"""Cf #2598"""
Expand Down

0 comments on commit 7481f36

Please sign in to comment.