Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ROB: improve inline image extraction #2622

Merged
merged 48 commits into from
May 27, 2024
Merged
Show file tree
Hide file tree
Changes from 40 commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
b449664
ROB: improve inline image extraction
pubpub-zz May 3, 2024
44b41a7
fix
pubpub-zz May 4, 2024
0952fee
complete testing
pubpub-zz May 4, 2024
0ba5ae4
complete test
pubpub-zz May 5, 2024
fdbc092
tests
pubpub-zz May 5, 2024
fd57ef7
fix
pubpub-zz May 6, 2024
70f9c02
fix DCT
pubpub-zz May 7, 2024
8996a73
Fix A85
pubpub-zz May 7, 2024
fd6334e
Merge remote-tracking branch 'origin/iss2598' into iss2598
pubpub-zz May 7, 2024
5b38f34
blank
pubpub-zz May 7, 2024
67d51ea
with new link
pubpub-zz May 7, 2024
9fb0974
Merge branch 'pb_stanford' into iss2598
pubpub-zz May 7, 2024
092e2a5
fix test
pubpub-zz May 7, 2024
c5d62a3
BUG: Incorrect number of inline images
pubpub-zz May 8, 2024
ae93628
Merge branch 'iss2629' into iss2598
pubpub-zz May 8, 2024
51bea2c
add test for RL + fix
pubpub-zz May 11, 2024
bd84496
remove encode as not used for the moment
pubpub-zz May 11, 2024
770aaba
Fix + Test
pubpub-zz May 11, 2024
a37b73f
test+fix
pubpub-zz May 11, 2024
184e141
test
pubpub-zz May 11, 2024
85e08bb
test + fix
pubpub-zz May 11, 2024
a7ce07c
test + fix +refactor
pubpub-zz May 11, 2024
d17d192
fix regeneration of inline images
pubpub-zz May 12, 2024
6807f3c
coverage
pubpub-zz May 12, 2024
5d713fc
coverage
pubpub-zz May 12, 2024
623b715
check for space after EI
pubpub-zz May 12, 2024
0da933e
coverage
pubpub-zz May 12, 2024
422eb18
coverage
pubpub-zz May 12, 2024
b79164e
test / fix /refactoring
pubpub-zz May 12, 2024
e247b72
Merge remote-tracking branch 'py-pdf/main' into iss2598
pubpub-zz May 14, 2024
66f858c
fix
pubpub-zz May 14, 2024
ee637c0
fix2
pubpub-zz May 14, 2024
2874e56
Update pypdf/_page.py
pubpub-zz May 20, 2024
81e1f30
Update pypdf/_page.py
pubpub-zz May 20, 2024
90fe459
Update pypdf/_page.py
pubpub-zz May 20, 2024
54e4c1d
Update pypdf/_page.py
pubpub-zz May 20, 2024
d9841dd
Update pypdf/generic/_data_structures.py
pubpub-zz May 20, 2024
ecdba02
Update pypdf/generic/_data_structures.py
pubpub-zz May 20, 2024
ae9fdfc
update from comments
pubpub-zz May 20, 2024
5347820
Merge branch 'main' into iss2598
pubpub-zz May 20, 2024
bcabdc8
Update _data_structures.py
pubpub-zz May 20, 2024
dc045b6
Update _image_inline.py
pubpub-zz May 20, 2024
9c03aa7
Update test_generic.py
pubpub-zz May 20, 2024
a569598
Update test_workflows.py
pubpub-zz May 26, 2024
a52541e
Update _image_inline.py
pubpub-zz May 26, 2024
cfe61a9
Update _image_inline.py
pubpub-zz May 26, 2024
54399d7
Merge branch 'main' into iss2598
pubpub-zz May 26, 2024
7be1fd6
remove coverage ignore on PIL import
pubpub-zz May 27, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
143 changes: 81 additions & 62 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
# POSSIBILITY OF SUCH DAMAGE.

import math
import re
import sys
from decimal import Decimal
from pathlib import Path
Expand Down Expand Up @@ -58,7 +57,6 @@
mult,
)
from ._utils import (
WHITESPACES_AS_REGEXP,
CompressedTransformationMatrix,
File,
ImageFile,
Expand All @@ -82,6 +80,7 @@
NameObject,
NullObject,
NumberObject,
PdfObject,
RectangleObject,
StreamObject,
)
Expand Down Expand Up @@ -335,7 +334,6 @@ def __init__(
self.pdf = pdf
self.inline_images: Optional[Dict[str, ImageFile]] = None
# below Union for mypy but actually Optional[List[str]]
self.inline_images_keys: Optional[List[Union[str, List[str]]]] = None
self.indirect_reference = indirect_reference

def hash_value_data(self) -> bytes:
Expand Down Expand Up @@ -439,19 +437,8 @@ def _get_ids_image(
return []
else:
call_stack.append(_i)
if self.inline_images_keys is None:
content = self._get_contents_as_bytes() or b""
nb_inlines = 0
for matching in re.finditer(
WHITESPACES_AS_REGEXP + b"BI" + WHITESPACES_AS_REGEXP,
content,
):
start_of_string = content[: matching.start()]
if len(re.findall(b"[^\\\\]\\(", start_of_string)) == len(
re.findall(b"[^\\\\]\\)", start_of_string)
):
nb_inlines += 1
self.inline_images_keys = [f"~{x}~" for x in range(nb_inlines)]
if self.inline_images is None:
self.inline_images = self._get_inline_images()
if obj is None:
obj = self
if ancest is None:
Expand All @@ -460,7 +447,7 @@ def _get_ids_image(
if PG.RESOURCES not in obj or RES.XOBJECT not in cast(
DictionaryObject, obj[PG.RESOURCES]
):
return self.inline_images_keys
return [] if self.inline_images is None else list(self.inline_images.keys())

x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore
for o in x_object:
Expand All @@ -470,7 +457,9 @@ def _get_ids_image(
lst.append(o if len(ancest) == 0 else ancest + [o])
else: # is a form with possible images inside
lst.extend(self._get_ids_image(x_object[o], ancest + [o], call_stack))
return lst + self.inline_images_keys
assert self.inline_images is not None
lst.extend(list(self.inline_images.keys()))
return lst

def _get_image(
self,
Expand Down Expand Up @@ -551,6 +540,46 @@ def images(self) -> List[ImageFile]:
"""
return _VirtualListImages(self._get_ids_image, self._get_image) # type: ignore

def _translate_value_inlineimage(self, k: str, v: PdfObject) -> PdfObject:
    """
    Translate a value found in an inline image dictionary.

    Abbreviated names (for example ``/G`` or ``/Fl``) are expanded to their
    full form (``/DeviceGray``, ``/FlateDecode``); full names listed in the
    table are returned as an equivalent ``NameObject``.  Any other
    ``NameObject`` is treated as a named colour space and resolved through
    the ``/ColorSpace`` entry of this object's ``/Resources``, except for
    values of the ``/Intent`` key, which are plain names and are returned
    unchanged.  Values that are not names are returned unchanged.

    Args:
        k: Key the value belongs to.  It selects whether a resource lookup
            applies and is reported in the error message.
        v: Value to translate.

    Returns:
        The translated value.

    Raises:
        PdfReadError: If a custom name cannot be found in
            ``/Resources`` / ``/ColorSpace``.
    """
    try:
        v = NameObject(
            {
                "/G": "/DeviceGray",
                "/RGB": "/DeviceRGB",
                "/CMYK": "/DeviceCMYK",
                "/I": "/Indexed",
                "/AHx": "/ASCIIHexDecode",
                "/A85": "/ASCII85Decode",
                "/LZW": "/LZWDecode",
                "/Fl": "/FlateDecode",
                "/RL": "/RunLengthDecode",
                "/CCF": "/CCITTFaxDecode",
                "/DCT": "/DCTDecode",
                "/DeviceGray": "/DeviceGray",
                "/DeviceRGB": "/DeviceRGB",
                "/DeviceCMYK": "/DeviceCMYK",
                "/Indexed": "/Indexed",
                "/ASCIIHexDecode": "/ASCIIHexDecode",
                "/ASCII85Decode": "/ASCII85Decode",
                "/LZWDecode": "/LZWDecode",
                "/FlateDecode": "/FlateDecode",
                "/RunLengthDecode": "/RunLengthDecode",
                "/CCITTFaxDecode": "/CCITTFaxDecode",
                "/DCTDecode": "/DCTDecode",
            }[cast(str, v)]
        )
    except (TypeError, KeyError):
        # KeyError: the value is not in the table above.
        # TypeError: the value is unhashable and cannot be used as a key.
        # A rendering intent (e.g. /Perceptual) is a plain name, not a
        # reference to a resource, so it must not be looked up.
        if isinstance(v, NameObject) and k != "/Intent":
            # It is a custom name, thus we have to look in resources.
            # The only applicable case is for ColorSpace.
            try:
                res = cast(DictionaryObject, self["/Resources"])["/ColorSpace"]
                v = cast(DictionaryObject, res)[v]
            except KeyError as exc:  # for res and v
                # Chain the original KeyError so the failing lookup stays
                # visible in the traceback.
                raise PdfReadError(
                    f"Cannot find resource entry {v} for {k}"
                ) from exc
    return v

def _get_inline_images(self) -> Dict[str, ImageFile]:
"""
get inline_images
Expand Down Expand Up @@ -593,51 +622,39 @@ def _get_inline_images(self) -> Dict[str, ImageFile]:
"/Length": len(ii["__streamdata__"]),
}
for k, v in ii["settings"].items():
try:
v = NameObject(
{
"/G": "/DeviceGray",
"/RGB": "/DeviceRGB",
"/CMYK": "/DeviceCMYK",
"/I": "/Indexed",
"/AHx": "/ASCIIHexDecode",
"/A85": "/ASCII85Decode",
"/LZW": "/LZWDecode",
"/Fl": "/FlateDecode",
"/RL": "/RunLengthDecode",
"/CCF": "/CCITTFaxDecode",
"/DCT": "/DCTDecode",
}[v]
)
except (TypeError, KeyError):
if isinstance(v, NameObject):
# it is a custom name : we have to look in resources :
# the only applicable case is for ColorSpace
try:
res = cast(DictionaryObject, self["/Resources"])[
"/ColorSpace"
]
v = cast(DictionaryObject, res)[v]
except KeyError: # for res and v
raise PdfReadError(
f"Can not find resource entry {v} for {k}"
)
init[
NameObject(
{
"/BPC": "/BitsPerComponent",
"/CS": "/ColorSpace",
"/D": "/Decode",
"/DP": "/DecodeParms",
"/F": "/Filter",
"/H": "/Height",
"/W": "/Width",
"/I": "/Interpolate",
"/Intent": "/Intent",
"/IM": "/ImageMask",
}[k]
if k in {"/Length", "/L"}: # no length is expected
continue
if isinstance(v, list):
v = ArrayObject(
[self._translate_value_inlineimage(k, x) for x in v]
)
] = v
else:
v = self._translate_value_inlineimage(k, v)
k = NameObject(
{
"/BPC": "/BitsPerComponent",
"/CS": "/ColorSpace",
"/D": "/Decode",
"/DP": "/DecodeParms",
"/F": "/Filter",
"/H": "/Height",
"/W": "/Width",
"/I": "/Interpolate",
"/Intent": "/Intent",
"/IM": "/ImageMask",
"/BitsPerComponent": "/BitsPerComponent",
"/ColorSpace": "/ColorSpace",
"/Decode": "/Decode",
"/DecodeParms": "/DecodeParms",
"/Filter": "/Filter",
"/Height": "/Height",
"/Width": "/Width",
"/Interpolate": "/Interpolate",
"/ImageMask": "/ImageMask",
}[k]
)
if k not in init:
init[k] = v
ii["object"] = EncodedStreamObject.initialize_from_dictionary(init)
extension, byte_stream, img = _xobj_to_image(ii["object"])
files[f"~{num}~"] = ImageFile(
Expand Down Expand Up @@ -934,6 +951,8 @@ def replace_contents(
# as a backup solution, we put content as an object although not in accordance with pdf ref
# this will be fixed with the _add_object
self[NameObject(PG.CONTENTS)] = content
# forces recalculation of inline_images
self.inline_images = None

def merge_page(
self, page2: "PageObject", expand: bool = False, over: bool = True
Expand Down
3 changes: 2 additions & 1 deletion pypdf/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,8 @@ def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]:


WHITESPACES = (b" ", b"\n", b"\r", b"\t", b"\x00")
WHITESPACES_AS_REGEXP = b"[ \n\r\t\x00]"
WHITESPACES_AS_BYTES = b"".join(WHITESPACES)
WHITESPACES_AS_REGEXP = b"[" + WHITESPACES_AS_BYTES + b"]"


def paeth_predictor(left: int, up: int, up_left: int) -> int:
Expand Down
42 changes: 21 additions & 21 deletions pypdf/_xobj_image_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@


try:
from PIL import Image
except ImportError:
raise ImportError(
from PIL import Image, UnidentifiedImageError # noqa: F401
except ImportError: # pragma: no cover
pubpub-zz marked this conversation as resolved.
Show resolved Hide resolved
raise ImportError( # pragma: no cover
"pillow is required to do image extraction. "
"It can be installed via 'pip install pypdf[image]'"
)
Expand Down Expand Up @@ -123,6 +123,24 @@ def _get_imagemode(
return mode, mode == "CMYK"


def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
    """
    Unpack bit-packed samples into one byte per sample.

    Samples are read most-significant-bits first.  Every row starts on a
    byte boundary: bits left over in the last byte of a row are padding and
    are skipped before the next row is read.

    Args:
        data: Packed sample data.
        size: ``(width, height)`` of the image, in samples.
        bits: Number of bits per sample.  NOTE(review): the unpacking only
            handles samples that do not straddle a byte, i.e. it presumably
            expects 1, 2, 4 or 8 - confirm against callers.

    Returns:
        ``width * height`` bytes, one per sample, in row-major order.

    Raises:
        IndexError: If ``data`` is too short for the requested size.
    """
    width, height = size[0], size[1]
    mask = (1 << bits) - 1
    first_shift = 8 - bits  # shift selecting the first sample of a byte
    unpacked = bytearray(width * height)
    data_index = 0
    shift = first_shift
    for y in range(height):
        # Realign on a byte boundary whenever the previous row stopped in the
        # middle of a byte.  This includes the case where exactly one sample
        # (shift == 0) is left: it is padding, not the first sample of this
        # row.
        if shift != first_shift:
            data_index += 1
            shift = first_shift
        row_start = y * width
        for x in range(width):
            unpacked[row_start + x] = (data[data_index] >> shift) & mask
            shift -= bits
            if shift < 0:
                data_index += 1
                shift = first_shift
    return bytes(unpacked)


def _extended_image_frombytes(
mode: str, size: Tuple[int, int], data: bytes
) -> Image.Image:
Expand Down Expand Up @@ -150,24 +168,6 @@ def _handle_flate(
Process image encoded in flateEncode
Returns img, image_format, extension, color inversion
"""

def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
mask = (2 << bits) - 1
nbuff = bytearray(size[0] * size[1])
by = 0
bit = 8 - bits
for y in range(size[1]):
if (bit != 0) and (bit != 8 - bits):
by += 1
bit = 8 - bits
for x in range(size[0]):
nbuff[y * size[0] + x] = (data[by] >> bit) & mask
bit -= bits
if bit < 0:
by += 1
bit = 8 - bits
return bytes(nbuff)

extension = ".png" # mime_type = "image/png"
image_format = "PNG"
lookup: Any
Expand Down
45 changes: 21 additions & 24 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,12 @@
import math
import struct
import zlib
from base64 import a85decode
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union, cast

from ._utils import (
WHITESPACES_AS_BYTES,
b_,
deprecate_with_replacement,
deprecation_no_replacement,
Expand Down Expand Up @@ -467,7 +469,7 @@ def decode(
Decode an LZW encoded data stream.

Args:
data: bytes`` or ``str`` text to decode.
data: ``bytes`` or ``str`` text to decode.
decode_parms: a dictionary of parameter values.

Returns:
Expand All @@ -487,29 +489,20 @@ def decode(
decode_parms: Optional[DictionaryObject] = None,
**kwargs: Any,
) -> bytes:
# decode_parms is unused here
"""
Decode an Ascii85 encoded data stream.

Args:
data: ``bytes`` or ``str`` text to decode.
decode_parms: a dictionary of parameter values.

Returns:
decoded data.
"""
if isinstance(data, str):
data = data.encode("ascii")
pubpub-zz marked this conversation as resolved.
Show resolved Hide resolved
group_index = b = 0
out = bytearray()
for char in data:
if ord("!") <= char <= ord("u"):
group_index += 1
b = b * 85 + (char - 33)
if group_index == 5:
out += struct.pack(b">L", b)
group_index = b = 0
elif char == ord("z"):
assert group_index == 0
out += b"\0\0\0\0"
elif char == ord("~"):
if group_index:
for _ in range(5 - group_index):
b = b * 85 + 84
out += struct.pack(b">L", b)[: group_index - 1]
break
return bytes(out)
data = data.encode()
data = data.strip(WHITESPACES_AS_BYTES)
return a85decode(data, adobe=True, ignorechars=WHITESPACES_AS_BYTES)


class DCTDecode:
Expand Down Expand Up @@ -742,6 +735,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
"""
from ._xobj_image_helpers import (
Image,
UnidentifiedImageError,
_extended_image_frombytes,
_get_imagemode,
_handle_flate,
Expand Down Expand Up @@ -808,13 +802,16 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
# I'm not sure if the following logic is correct.
# There might not be any relationship between the filters and the
# extension
if x_object_obj[SA.FILTER] in [[FT.LZW_DECODE], [FT.CCITT_FAX_DECODE]]:
if lfilters in (FT.LZW_DECODE, FT.CCITT_FAX_DECODE):
extension = ".tiff" # mime_type = "image/tiff"
image_format = "TIFF"
else:
extension = ".png" # mime_type = "image/png"
image_format = "PNG"
img = Image.open(BytesIO(data), formats=("TIFF", "PNG"))
try:
img = Image.open(BytesIO(data), formats=("TIFF", "PNG"))
except UnidentifiedImageError:
img = _extended_image_frombytes(mode, size, data)
elif lfilters == FT.DCT_DECODE:
img, image_format, extension = Image.open(BytesIO(data)), "JPEG", ".jpg"
# invert_color kept unchanged
Expand Down