From 85b3e8785830bf35fc1c21f5c2edf29636281290 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sat, 24 Sep 2022 07:39:46 +0200 Subject: [PATCH] ENH: Add PageObject.images attribute (#1330) --- PyPDF2/_page.py | 24 +++++++++++++++ PyPDF2/_utils.py | 7 +++++ PyPDF2/filters.py | 18 +++++++---- docs/index.rst | 1 + docs/user/extract-images.md | 18 +++++++++++ requirements/ci.in | 1 + requirements/ci.txt | 2 ++ setup.cfg | 1 + tests/test_reader.py | 60 ++++++++++++++++++------------------- tests/test_workflows.py | 51 ++++++++++--------------------- 10 files changed, 110 insertions(+), 73 deletions(-) create mode 100644 docs/user/extract-images.md diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 33d15cf09..5553e3da6 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -48,15 +48,18 @@ from ._cmap import build_char_map, unknown_char_map from ._utils import ( CompressedTransformationMatrix, + File, TransformationMatrixType, deprecate_no_replacement, deprecate_with_replacement, logger_warning, matrix_multiply, ) +from .constants import ImageAttributes as IA from .constants import PageAttributes as PG from .constants import Ressources as RES from .errors import PageSizeNotDefinedError +from .filters import _xobj_to_image from .generic import ( ArrayObject, ContentStream, @@ -365,6 +368,27 @@ def createBlankPage( deprecate_with_replacement("createBlankPage", "create_blank_page") return PageObject.create_blank_page(pdf, width, height) + @property + def images(self) -> List[File]: + """ + Get a list of all images of the page. + + For the moment, this does NOT include inline images. They will be added + in future. + """ + images_extracted: List[File] = [] + if RES.XOBJECT not in self[PG.RESOURCES]: # type: ignore + return images_extracted + + x_object = self[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore + for obj in x_object: + if x_object[obj][IA.SUBTYPE] == "/Image": + extension, byte_stream = _xobj_to_image(x_object[obj]) + if extension is not None: + filename = f"{obj[1:]}{extension}" + images_extracted.append(File(name=filename, data=byte_stream)) + return images_extracted + @property def rotation(self) -> int: """ diff --git a/PyPDF2/_utils.py b/PyPDF2/_utils.py index eeceda1b4..3aa8986bb 100644 --- a/PyPDF2/_utils.py +++ b/PyPDF2/_utils.py @@ -33,6 +33,7 @@ import logging import warnings from codecs import getencoder +from dataclasses import dataclass from io import ( DEFAULT_BUFFER_SIZE, BufferedReader, @@ -413,3 +414,9 @@ def rename_kwargs( # type: ignore f"{old_term} is deprecated as an argument. Use {new_term} instead" ) ) + + +@dataclass +class File: + name: str + data: bytes diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index 4ac651b39..de7ea8433 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -579,7 +579,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]: extension = None if SA.FILTER in x_object_obj: if x_object_obj[SA.FILTER] == FT.FLATE_DECODE: - extension = ".png" + extension = ".png" # mime_type = "image/png" color_space = None if "/ColorSpace" in x_object_obj: color_space = x_object_obj["/ColorSpace"].get_object() @@ -606,16 +606,22 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]: [FT.ASCII_85_DECODE], [FT.CCITT_FAX_DECODE], ): - extension = ".png" + # I'm not sure if the following logic is correct. + # There might not be any relationship between the filters and the + # extension + if x_object_obj[SA.FILTER] in [[FT.LZW_DECODE], [FT.CCITT_FAX_DECODE]]: + extension = ".tiff" # mime_type = "image/tiff" + else: + extension = ".png" # mime_type = "image/png" data = b_(data) elif x_object_obj[SA.FILTER] == FT.DCT_DECODE: - extension = ".jpg" + extension = ".jpg" # mime_type = "image/jpeg" elif x_object_obj[SA.FILTER] == "/JPXDecode": - extension = ".jp2" + extension = ".jp2" # mime_type = "image/x-jp2" elif x_object_obj[SA.FILTER] == FT.CCITT_FAX_DECODE: - extension = ".tiff" + extension = ".tiff" # mime_type = "image/tiff" else: - extension = ".png" + extension = ".png" # mime_type = "image/png" img = Image.frombytes(mode, size, data) img_byte_arr = BytesIO() img.save(img_byte_arr, format="PNG") diff --git a/docs/index.rst b/docs/index.rst index 016e10fe9..a2f7b044b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -24,6 +24,7 @@ You can contribute to `PyPDF2 on Github `_. user/suppress-warnings user/metadata user/extract-text + user/extract-images user/encryption-decryption user/merging-pdfs user/cropping-and-transforming diff --git a/docs/user/extract-images.md b/docs/user/extract-images.md new file mode 100644 index 000000000..5c07bfc44 --- /dev/null +++ b/docs/user/extract-images.md @@ -0,0 +1,18 @@ +# Extract Images + +Every page of a PDF document can contain an arbitrary amount of images. +The names of the files may not be unique. + +```python +from PyPDF2 import PdfReader + +reader = PdfReader("example.pdf") + +page = reader.pages[0] +count = 0 + +for image_file_object in page.images: + with open(str(count) + image_file_object.name, "wb") as fp: + fp.write(image_file_object.data) + count += 1 +``` diff --git a/requirements/ci.in b/requirements/ci.in index 0527a1f05..aa27ae1c1 100644 --- a/requirements/ci.in +++ b/requirements/ci.in @@ -10,3 +10,4 @@ pytest-benchmark pycryptodome typeguard types-Pillow +types-dataclasses diff --git a/requirements/ci.txt b/requirements/ci.txt index bf8372cb2..ab2537fb4 100644 --- a/requirements/ci.txt +++ b/requirements/ci.txt @@ -73,6 +73,8 @@ typed-ast==1.5.4 # via mypy typeguard==2.13.3 # via -r requirements/ci.in +types-dataclasses==0.6.6 + # via -r requirements/ci.in types-pillow==9.2.1 # via -r requirements/ci.in typing-extensions==4.1.1 diff --git a/setup.cfg b/setup.cfg index 2c0eebe8f..e3fa3556d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -39,6 +39,7 @@ packages = python_requires = >=3.6 install_requires = typing_extensions >= 3.10.0.0; python_version < '3.10' + dataclasses; python_version < '3.7' [options.extras_require] crypto = PyCryptodome diff --git a/tests/test_reader.py b/tests/test_reader.py index 12adb4b79..693060a50 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -5,12 +5,12 @@ from pathlib import Path import pytest +from PIL import Image from PyPDF2 import PdfReader from PyPDF2._reader import convert_to_int, convertToInt from PyPDF2.constants import ImageAttributes as IA from PyPDF2.constants import PageAttributes as PG -from PyPDF2.constants import Ressources as RES from PyPDF2.errors import ( EmptyFileError, FileNotDecryptedError, @@ -18,7 +18,6 @@ PdfReadWarning, WrongPasswordError, ) -from PyPDF2.filters import _xobj_to_image from PyPDF2.generic import Destination from . import get_pdf_from_url, normalize_warnings @@ -166,19 +165,27 @@ def test_get_outline(src, outline_elements): @pytest.mark.parametrize( - ("src", "nb_images"), + ("src", "expected_images"), [ - ("pdflatex-outline.pdf", 0), - ("crazyones.pdf", 0), - ("git.pdf", 1), - ("imagemagick-lzw.pdf", 1), - ("imagemagick-ASCII85Decode.pdf", 1), - ("imagemagick-CCITTFaxDecode.pdf", 1), + ("pdflatex-outline.pdf", []), + ("crazyones.pdf", []), + ("git.pdf", ["Image9.png"]), + pytest.param( + "imagemagick-lzw.pdf", + ["Im0.png"], + marks=pytest.mark.xfail(reason="broken image extraction"), + ), + pytest.param( + "imagemagick-ASCII85Decode.pdf", + ["Im0.png"], + marks=pytest.mark.xfail(reason="broken image extraction"), + ), + ("imagemagick-CCITTFaxDecode.pdf", ["Im0.tiff"]), ], ) -def test_get_images(src, nb_images): - src = RESOURCE_ROOT / src - reader = PdfReader(src) +def test_get_images(src, expected_images): + src_abs = RESOURCE_ROOT / src + reader = PdfReader(src_abs) with pytest.raises(TypeError): page = reader.pages["0"] @@ -186,25 +193,16 @@ def test_get_images(src, nb_images): page = reader.pages[-1] page = reader.pages[0] - images_extracted = [] - - if RES.XOBJECT in page[PG.RESOURCES]: - x_object = page[PG.RESOURCES][RES.XOBJECT].get_object() - - for obj in x_object: - if x_object[obj][IA.SUBTYPE] == "/Image": - extension, byte_stream = _xobj_to_image(x_object[obj]) - if extension is not None: - filename = obj[1:] + ".png" - with open(filename, "wb") as img: - img.write(byte_stream) - images_extracted.append(filename) - - assert len(images_extracted) == nb_images - - # Cleanup - for filepath in images_extracted: - os.remove(filepath) + images_extracted = page.images + assert len(images_extracted) == len(expected_images) + for image, expected_image in zip(images_extracted, expected_images): + assert image.name == expected_image + with open(f"test-out-{src}-{image.name}", "wb") as fp: + fp.write(image.data) + assert ( + image.name.split(".")[-1].upper() + == Image.open(io.BytesIO(image.data)).format + ) @pytest.mark.parametrize( diff --git a/tests/test_workflows.py b/tests/test_workflows.py index b5826604c..f738226f4 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -14,11 +14,8 @@ import pytest from PyPDF2 import PdfMerger, PdfReader, PdfWriter -from PyPDF2.constants import ImageAttributes as IA from PyPDF2.constants import PageAttributes as PG -from PyPDF2.constants import Ressources as RES from PyPDF2.errors import PdfReadWarning -from PyPDF2.filters import _xobj_to_image from . import get_pdf_from_url, normalize_warnings @@ -651,17 +648,11 @@ def test_image_extraction(url, name): os.mkdir(root) for page in reader.pages: - if RES.XOBJECT in page[PG.RESOURCES]: - x_object = page[PG.RESOURCES][RES.XOBJECT].get_object() - - for obj in x_object: - if x_object[obj][IA.SUBTYPE] == "/Image": - extension, byte_stream = _xobj_to_image(x_object[obj]) - if extension is not None: - filename = root / (obj[1:] + extension) - with open(filename, "wb") as img: - img.write(byte_stream) - images_extracted.append(filename) + for image in page.images: + filename = root / image.name + with open(filename, "wb") as img: + img.write(image.data) + images_extracted.append(filename) # Cleanup do_cleanup = True # set this to False for manual inspection @@ -684,17 +675,11 @@ def test_image_extraction_strict(): os.mkdir(root) for page in reader.pages: - if RES.XOBJECT in page[PG.RESOURCES]: - x_object = page[PG.RESOURCES][RES.XOBJECT].get_object() - - for obj in x_object: - if x_object[obj][IA.SUBTYPE] == "/Image": - extension, byte_stream = _xobj_to_image(x_object[obj]) - if extension is not None: - filename = root / (obj[1:] + extension) - with open(filename, "wb") as img: - img.write(byte_stream) - images_extracted.append(filename) + for image in page.images: + filename = root / image.name + with open(filename, "wb") as fp: + fp.write(image.data) + images_extracted.append(filename) # Cleanup do_cleanup = True # set this to False for manual inspection @@ -723,17 +708,11 @@ def test_image_extraction2(url, name): os.mkdir(root) for page in reader.pages: - if RES.XOBJECT in page[PG.RESOURCES]: - x_object = page[PG.RESOURCES][RES.XOBJECT].get_object() - - for obj in x_object: - if x_object[obj][IA.SUBTYPE] == "/Image": - extension, byte_stream = _xobj_to_image(x_object[obj]) - if extension is not None: - filename = root / (obj[1:] + extension) - with open(filename, "wb") as img: - img.write(byte_stream) - images_extracted.append(filename) + for image in page.images: + filename = root / image.name + with open(filename, "wb") as img: + img.write(image.data) + images_extracted.append(filename) # Cleanup do_cleanup = True # set this to False for manual inspection