Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add PageObject.images attribute #1330

Merged
merged 20 commits into from Sep 24, 2022
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
20 changes: 20 additions & 0 deletions PyPDF2/_page.py
Expand Up @@ -48,15 +48,18 @@
from ._cmap import build_char_map, unknown_char_map
from ._utils import (
CompressedTransformationMatrix,
File,
TransformationMatrixType,
deprecate_no_replacement,
deprecate_with_replacement,
logger_warning,
matrix_multiply,
)
from .constants import ImageAttributes as IA
from .constants import PageAttributes as PG
from .constants import Ressources as RES
from .errors import PageSizeNotDefinedError
from .filters import _xobj_to_image
from .generic import (
ArrayObject,
ContentStream,
Expand Down Expand Up @@ -365,6 +368,23 @@ def createBlankPage(
deprecate_with_replacement("createBlankPage", "create_blank_page")
return PageObject.create_blank_page(pdf, width, height)

@property
def images(self) -> List[File]:
# Collect the image XObjects referenced by this page's resources and
# return them as File objects (name, raw bytes, mime type).
images_extracted: List[File] = []
# No XObject entry in the page resources: there is nothing to extract.
if RES.XOBJECT not in self[PG.RESOURCES]: # type: ignore
return images_extracted

x_object = self[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore
for obj in x_object:
# Only XObjects whose subtype is "/Image" are considered.
if x_object[obj][IA.SUBTYPE] == "/Image":
# _xobj_to_image yields (mime type, bytes); entries whose mime type
# is None are skipped.
mime_type, byte_stream = _xobj_to_image(x_object[obj])
if mime_type is not None:
# obj[1:] drops the first character of the resource key
# (presumably the leading "/" of the PDF name -- confirm); the
# extension is derived from the mime type.
filename = f"{obj[1:]}.{File._mime2extension(mime_type)}"
images_extracted.append(
File(name=filename, data=byte_stream, mime_type=mime_type)
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This strikes me as an odd abstraction: we pass the mime_type to the File constructor, yet we also have to construct the full filename ourselves (using a private static function, to boot), and the file_extension property reflects the mime_type rather than the extension of the name that was passed in.

If we go the route of passing in the mime_type for the File, I'd advocate for just passing in name sans extension altogether and we can have a special property function that does the concatenation of name + extension to give a "filename" on demand as needed by users.

The only caveat would be attachments: there it may make sense to pass in the full filename, but I'm not well versed enough in that part of the spec to know how that API might look.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I'll go with 'File only has name + data (no mime_type)' for the moment, because it seems to have only advantages:

  • Less clutter / less code to maintain
  • No potential to discover the wrong mime type
  • We could make _xobj_to_image just pass the file extension as before
  • As _xobj_to_image is a private function, we can easily change the behavior if we see a clear advantage

return images_extracted

@property
def rotation(self) -> int:
"""
Expand Down
23 changes: 23 additions & 0 deletions PyPDF2/_utils.py
Expand Up @@ -33,6 +33,7 @@
import logging
import warnings
from codecs import getencoder
from dataclasses import dataclass
from io import (
DEFAULT_BUFFER_SIZE,
BufferedReader,
Expand Down Expand Up @@ -413,3 +414,25 @@ def rename_kwargs( # type: ignore
f"{old_term} is deprecated as an argument. Use {new_term} instead"
)
)


@dataclass
class File:
    """
    A named chunk of binary data together with its MIME type.

    Instances are created with the keyword arguments ``name``, ``data`` and
    ``mime_type``.
    """

    # File name, including the extension (e.g. "Im0.png").
    name: str
    # Raw content of the file.
    data: bytes
    # MIME type describing ``data`` (e.g. "image/png").
    mime_type: str

    @property
    def file_extension(self) -> str:
        """
        File extension (without a leading dot) that matches ``mime_type``.

        The value is derived from ``mime_type`` only; ``name`` is not
        inspected, so the two can disagree.

        :return: The extension, or ``"unknown"`` for unmapped MIME types.
        """
        return File._mime2extension(self.mime_type)

    @staticmethod
    def _mime2extension(mime_type: str) -> str:
        """
        Map a MIME type to a file extension without a leading dot.

        Media types are case-insensitive and may carry ``;``-separated
        parameters, so the lookup is done on the lower-cased type/subtype
        part only.

        :param mime_type: MIME type such as ``"image/png"``.
        :return: The extension, or ``"unknown"`` if the type is not mapped.
        """
        mapping = {
            "image/png": "png",
            "image/jpeg": "jpg",
            # "image/jp2" is the registered type; "image/x-jp2" is kept
            # because existing callers pass it.
            "image/jp2": "jp2",
            "image/x-jp2": "jp2",
            "image/gif": "gif",
            "image/tiff": "tiff",
        }
        # Anything that is not a string (e.g. None) keeps the previous
        # behaviour of falling back to "unknown" instead of raising.
        if not isinstance(mime_type, str):
            return "unknown"
        essence = mime_type.split(";", 1)[0].strip().lower()
        return mapping.get(essence, "unknown")
22 changes: 13 additions & 9 deletions PyPDF2/filters.py
Expand Up @@ -562,7 +562,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
It's unclear if PyPDF2 will keep this function here, hence it's private.
It might get removed at any point.

:return: Tuple[file extension, bytes]
:return: Tuple[mime type, bytes]
"""
from PIL import Image

Expand All @@ -576,10 +576,10 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
mode: Literal["RGB", "P"] = "RGB"
else:
mode = "P"
extension = None
mime_type = None
if SA.FILTER in x_object_obj:
if x_object_obj[SA.FILTER] == FT.FLATE_DECODE:
extension = ".png"
mime_type = "image/png"
color_space = None
if "/ColorSpace" in x_object_obj:
color_space = x_object_obj["/ColorSpace"].get_object()
Expand All @@ -606,19 +606,23 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
[FT.ASCII_85_DECODE],
[FT.CCITT_FAX_DECODE],
):
extension = ".png"
# I'm not sure if the mime types have any relationship to the filters
if x_object_obj[SA.FILTER] in [[FT.LZW_DECODE], [FT.CCITT_FAX_DECODE]]:
mime_type = "image/tiff"
else:
mime_type = "image/png"
data = b_(data)
elif x_object_obj[SA.FILTER] == FT.DCT_DECODE:
extension = ".jpg"
mime_type = "image/jpeg"
elif x_object_obj[SA.FILTER] == "/JPXDecode":
extension = ".jp2"
mime_type = "image/x-jp2"
elif x_object_obj[SA.FILTER] == FT.CCITT_FAX_DECODE:
extension = ".tiff"
mime_type = "image/tiff"
else:
extension = ".png"
mime_type = "image/png"
img = Image.frombytes(mode, size, data)
img_byte_arr = BytesIO()
img.save(img_byte_arr, format="PNG")
data = img_byte_arr.getvalue()

return extension, data
return mime_type, data
1 change: 1 addition & 0 deletions docs/index.rst
Expand Up @@ -24,6 +24,7 @@ You can contribute to `PyPDF2 on Github <https://github.com/py-pdf/PyPDF2>`_.
user/suppress-warnings
user/metadata
user/extract-text
user/extract-images
user/encryption-decryption
user/merging-pdfs
user/cropping-and-transforming
Expand Down
18 changes: 18 additions & 0 deletions docs/user/extract-images.md
@@ -0,0 +1,18 @@
# Extract Images

Every page of a PDF document can contain an arbitrary number of images.
The image names are not guaranteed to be unique.

```python
from PyPDF2 import PdfReader

reader = PdfReader("example.pdf")
first_page = reader.pages[0]

# Prefix every file name with a running index so that images sharing the
# same name do not overwrite each other.
for index, image in enumerate(first_page.images):
    with open(f"{index}{image.name}", "wb") as out_file:
        out_file.write(image.data)
```
1 change: 1 addition & 0 deletions requirements/ci.in
Expand Up @@ -10,3 +10,4 @@ pytest-benchmark
pycryptodome
typeguard
types-Pillow
types-dataclasses
2 changes: 2 additions & 0 deletions requirements/ci.txt
Expand Up @@ -73,6 +73,8 @@ typed-ast==1.5.4
# via mypy
typeguard==2.13.3
# via -r requirements/ci.in
types-dataclasses==0.6.6
# via -r requirements/ci.in
types-pillow==9.2.1
# via -r requirements/ci.in
typing-extensions==4.1.1
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Expand Up @@ -39,6 +39,7 @@ packages =
python_requires = >=3.6
install_requires =
typing_extensions >= 3.10.0.0; python_version < '3.10'
dataclasses; python_version < '3.7'

[options.extras_require]
crypto = PyCryptodome
Expand Down
60 changes: 29 additions & 31 deletions tests/test_reader.py
Expand Up @@ -5,20 +5,19 @@
from pathlib import Path

import pytest
from PIL import Image

from PyPDF2 import PdfReader
from PyPDF2._reader import convert_to_int, convertToInt
from PyPDF2.constants import ImageAttributes as IA
from PyPDF2.constants import PageAttributes as PG
from PyPDF2.constants import Ressources as RES
from PyPDF2.errors import (
EmptyFileError,
FileNotDecryptedError,
PdfReadError,
PdfReadWarning,
WrongPasswordError,
)
from PyPDF2.filters import _xobj_to_image
from PyPDF2.generic import Destination

from . import get_pdf_from_url, normalize_warnings
Expand Down Expand Up @@ -166,45 +165,44 @@ def test_get_outline(src, outline_elements):


@pytest.mark.parametrize(
("src", "nb_images"),
("src", "expected_images"),
[
("pdflatex-outline.pdf", 0),
("crazyones.pdf", 0),
("git.pdf", 1),
("imagemagick-lzw.pdf", 1),
("imagemagick-ASCII85Decode.pdf", 1),
("imagemagick-CCITTFaxDecode.pdf", 1),
("pdflatex-outline.pdf", []),
("crazyones.pdf", []),
("git.pdf", [("Image9.png", "image/png")]),
pytest.param(
"imagemagick-lzw.pdf",
[("Im0.png", "unknown")],
marks=pytest.mark.xfail(reason="broken image extraction"),
),
pytest.param(
"imagemagick-ASCII85Decode.pdf",
[("Im0.png", "unknown")],
marks=pytest.mark.xfail(reason="broken image extraction"),
),
("imagemagick-CCITTFaxDecode.pdf", [("Im0.tiff", "image/tiff")]),
],
)
def test_get_images(src, nb_images):
src = RESOURCE_ROOT / src
reader = PdfReader(src)
def test_get_images(src, expected_images):
src_abs = RESOURCE_ROOT / src
reader = PdfReader(src_abs)

with pytest.raises(TypeError):
page = reader.pages["0"]

page = reader.pages[-1]
page = reader.pages[0]

images_extracted = []

if RES.XOBJECT in page[PG.RESOURCES]:
x_object = page[PG.RESOURCES][RES.XOBJECT].get_object()

for obj in x_object:
if x_object[obj][IA.SUBTYPE] == "/Image":
extension, byte_stream = _xobj_to_image(x_object[obj])
if extension is not None:
filename = obj[1:] + ".png"
with open(filename, "wb") as img:
img.write(byte_stream)
images_extracted.append(filename)

assert len(images_extracted) == nb_images

# Cleanup
for filepath in images_extracted:
os.remove(filepath)
images_extracted = page.images
assert len(images_extracted) == len(expected_images)
for image, (expected_image, expected_mime) in zip(
images_extracted, expected_images
):
assert image.name == expected_image
with open(f"test-out-{src}-{image.name}", "wb") as fp:
fp.write(image.data)
assert image.file_extension.upper() == Image.open(io.BytesIO(image.data)).format
assert image.mime_type == expected_mime


@pytest.mark.parametrize(
Expand Down
51 changes: 15 additions & 36 deletions tests/test_workflows.py
Expand Up @@ -14,11 +14,8 @@
import pytest

from PyPDF2 import PdfMerger, PdfReader, PdfWriter
from PyPDF2.constants import ImageAttributes as IA
from PyPDF2.constants import PageAttributes as PG
from PyPDF2.constants import Ressources as RES
from PyPDF2.errors import PdfReadWarning
from PyPDF2.filters import _xobj_to_image

from . import get_pdf_from_url, normalize_warnings

Expand Down Expand Up @@ -651,17 +648,11 @@ def test_image_extraction(url, name):
os.mkdir(root)

for page in reader.pages:
if RES.XOBJECT in page[PG.RESOURCES]:
x_object = page[PG.RESOURCES][RES.XOBJECT].get_object()

for obj in x_object:
if x_object[obj][IA.SUBTYPE] == "/Image":
extension, byte_stream = _xobj_to_image(x_object[obj])
if extension is not None:
filename = root / (obj[1:] + extension)
with open(filename, "wb") as img:
img.write(byte_stream)
images_extracted.append(filename)
for image in page.images:
filename = root / image.name
with open(filename, "wb") as img:
img.write(image.data)
images_extracted.append(filename)

# Cleanup
do_cleanup = True # set this to False for manual inspection
Expand All @@ -684,17 +675,11 @@ def test_image_extraction_strict():
os.mkdir(root)

for page in reader.pages:
if RES.XOBJECT in page[PG.RESOURCES]:
x_object = page[PG.RESOURCES][RES.XOBJECT].get_object()

for obj in x_object:
if x_object[obj][IA.SUBTYPE] == "/Image":
extension, byte_stream = _xobj_to_image(x_object[obj])
if extension is not None:
filename = root / (obj[1:] + extension)
with open(filename, "wb") as img:
img.write(byte_stream)
images_extracted.append(filename)
for image in page.images:
filename = root / image.name
with open(filename, "wb") as fp:
fp.write(image.data)
images_extracted.append(filename)

# Cleanup
do_cleanup = True # set this to False for manual inspection
Expand Down Expand Up @@ -723,17 +708,11 @@ def test_image_extraction2(url, name):
os.mkdir(root)

for page in reader.pages:
if RES.XOBJECT in page[PG.RESOURCES]:
x_object = page[PG.RESOURCES][RES.XOBJECT].get_object()

for obj in x_object:
if x_object[obj][IA.SUBTYPE] == "/Image":
extension, byte_stream = _xobj_to_image(x_object[obj])
if extension is not None:
filename = root / (obj[1:] + extension)
with open(filename, "wb") as img:
img.write(byte_stream)
images_extracted.append(filename)
for image in page.images:
filename = root / image.name
with open(filename, "wb") as img:
img.write(image.data)
images_extracted.append(filename)

# Cleanup
do_cleanup = True # set this to False for manual inspection
Expand Down