Skip to content

Commit

Permalink
ENH: Add PageObject.images attribute (#1330)
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinThoma committed Sep 24, 2022
1 parent dcab241 commit 85b3e87
Show file tree
Hide file tree
Showing 10 changed files with 110 additions and 73 deletions.
24 changes: 24 additions & 0 deletions PyPDF2/_page.py
Expand Up @@ -48,15 +48,18 @@
from ._cmap import build_char_map, unknown_char_map
from ._utils import (
CompressedTransformationMatrix,
File,
TransformationMatrixType,
deprecate_no_replacement,
deprecate_with_replacement,
logger_warning,
matrix_multiply,
)
from .constants import ImageAttributes as IA
from .constants import PageAttributes as PG
from .constants import Ressources as RES
from .errors import PageSizeNotDefinedError
from .filters import _xobj_to_image
from .generic import (
ArrayObject,
ContentStream,
Expand Down Expand Up @@ -365,6 +368,27 @@ def createBlankPage(
deprecate_with_replacement("createBlankPage", "create_blank_page")
return PageObject.create_blank_page(pdf, width, height)

@property
def images(self) -> List[File]:
    """
    Get a list of all images of the page.

    Every entry of the page's XObject resources whose subtype equals
    ``"/Image"`` is passed to ``_xobj_to_image``. When that call returns an
    extension, the image is added to the result as a ``File`` whose name is
    the XObject key without its first character, followed by the extension.

    For the moment, this does NOT include inline images. They will be added
    in future.

    :return: The extracted images. The list is empty when the page has no
        resources entry or when its resources contain no XObjects.
    """
    images_extracted: List[File] = []

    # A page dictionary without a resources entry cannot reference any
    # images; indexing it unconditionally would raise KeyError.
    if PG.RESOURCES not in self:
        return images_extracted

    resources = self[PG.RESOURCES]  # type: ignore
    if RES.XOBJECT not in resources:
        return images_extracted

    x_object = resources[RES.XOBJECT].get_object()
    for obj in x_object:
        # Look the entry up once instead of on every use below.
        candidate = x_object[obj]
        if candidate[IA.SUBTYPE] == "/Image":
            extension, byte_stream = _xobj_to_image(candidate)
            # No extension means no image is reported for this entry.
            if extension is not None:
                # obj[1:] drops the first character of the XObject key.
                filename = f"{obj[1:]}{extension}"
                images_extracted.append(File(name=filename, data=byte_stream))
    return images_extracted

@property
def rotation(self) -> int:
"""
Expand Down
7 changes: 7 additions & 0 deletions PyPDF2/_utils.py
Expand Up @@ -33,6 +33,7 @@
import logging
import warnings
from codecs import getencoder
from dataclasses import dataclass
from io import (
DEFAULT_BUFFER_SIZE,
BufferedReader,
Expand Down Expand Up @@ -413,3 +414,9 @@ def rename_kwargs( # type: ignore
f"{old_term} is deprecated as an argument. Use {new_term} instead"
)
)


@dataclass
class File:
    """
    A named chunk of binary data.

    ``PageObject.images`` returns each extracted image as an instance of
    this class.
    """

    # File name. For extracted images this is the XObject key without its
    # first character, followed by the file extension.
    name: str
    # Raw content of the file.
    data: bytes
18 changes: 12 additions & 6 deletions PyPDF2/filters.py
Expand Up @@ -579,7 +579,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
extension = None
if SA.FILTER in x_object_obj:
if x_object_obj[SA.FILTER] == FT.FLATE_DECODE:
extension = ".png"
extension = ".png" # mime_type = "image/png"
color_space = None
if "/ColorSpace" in x_object_obj:
color_space = x_object_obj["/ColorSpace"].get_object()
Expand All @@ -606,16 +606,22 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]:
[FT.ASCII_85_DECODE],
[FT.CCITT_FAX_DECODE],
):
extension = ".png"
# I'm not sure if the following logic is correct.
# There might not be any relationship between the filters and the
# extension
if x_object_obj[SA.FILTER] in [[FT.LZW_DECODE], [FT.CCITT_FAX_DECODE]]:
extension = ".tiff" # mime_type = "image/tiff"
else:
extension = ".png" # mime_type = "image/png"
data = b_(data)
elif x_object_obj[SA.FILTER] == FT.DCT_DECODE:
extension = ".jpg"
extension = ".jpg" # mime_type = "image/jpeg"
elif x_object_obj[SA.FILTER] == "/JPXDecode":
extension = ".jp2"
extension = ".jp2" # mime_type = "image/x-jp2"
elif x_object_obj[SA.FILTER] == FT.CCITT_FAX_DECODE:
extension = ".tiff"
extension = ".tiff" # mime_type = "image/tiff"
else:
extension = ".png"
extension = ".png" # mime_type = "image/png"
img = Image.frombytes(mode, size, data)
img_byte_arr = BytesIO()
img.save(img_byte_arr, format="PNG")
Expand Down
1 change: 1 addition & 0 deletions docs/index.rst
Expand Up @@ -24,6 +24,7 @@ You can contribute to `PyPDF2 on Github <https://github.com/py-pdf/PyPDF2>`_.
user/suppress-warnings
user/metadata
user/extract-text
user/extract-images
user/encryption-decryption
user/merging-pdfs
user/cropping-and-transforming
Expand Down
18 changes: 18 additions & 0 deletions docs/user/extract-images.md
@@ -0,0 +1,18 @@
# Extract Images

Every page of a PDF document can contain an arbitrary number of images.
The names of the extracted files are not guaranteed to be unique, so the
example below prefixes each file name with a counter.

```python
from PyPDF2 import PdfReader

reader = PdfReader("example.pdf")

page = reader.pages[0]

# Image names may repeat, so each output file name is prefixed with the
# image's position in the list to keep the written files distinct.
for count, image_file_object in enumerate(page.images):
    with open(str(count) + image_file_object.name, "wb") as fp:
        fp.write(image_file_object.data)
```
1 change: 1 addition & 0 deletions requirements/ci.in
Expand Up @@ -10,3 +10,4 @@ pytest-benchmark
pycryptodome
typeguard
types-Pillow
types-dataclasses
2 changes: 2 additions & 0 deletions requirements/ci.txt
Expand Up @@ -73,6 +73,8 @@ typed-ast==1.5.4
# via mypy
typeguard==2.13.3
# via -r requirements/ci.in
types-dataclasses==0.6.6
# via -r requirements/ci.in
types-pillow==9.2.1
# via -r requirements/ci.in
typing-extensions==4.1.1
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Expand Up @@ -39,6 +39,7 @@ packages =
python_requires = >=3.6
install_requires =
typing_extensions >= 3.10.0.0; python_version < '3.10'
dataclasses; python_version < '3.7'

[options.extras_require]
crypto = PyCryptodome
Expand Down
60 changes: 29 additions & 31 deletions tests/test_reader.py
Expand Up @@ -5,20 +5,19 @@
from pathlib import Path

import pytest
from PIL import Image

from PyPDF2 import PdfReader
from PyPDF2._reader import convert_to_int, convertToInt
from PyPDF2.constants import ImageAttributes as IA
from PyPDF2.constants import PageAttributes as PG
from PyPDF2.constants import Ressources as RES
from PyPDF2.errors import (
EmptyFileError,
FileNotDecryptedError,
PdfReadError,
PdfReadWarning,
WrongPasswordError,
)
from PyPDF2.filters import _xobj_to_image
from PyPDF2.generic import Destination

from . import get_pdf_from_url, normalize_warnings
Expand Down Expand Up @@ -166,45 +165,44 @@ def test_get_outline(src, outline_elements):


@pytest.mark.parametrize(
("src", "nb_images"),
("src", "expected_images"),
[
("pdflatex-outline.pdf", 0),
("crazyones.pdf", 0),
("git.pdf", 1),
("imagemagick-lzw.pdf", 1),
("imagemagick-ASCII85Decode.pdf", 1),
("imagemagick-CCITTFaxDecode.pdf", 1),
("pdflatex-outline.pdf", []),
("crazyones.pdf", []),
("git.pdf", ["Image9.png"]),
pytest.param(
"imagemagick-lzw.pdf",
["Im0.png"],
marks=pytest.mark.xfail(reason="broken image extraction"),
),
pytest.param(
"imagemagick-ASCII85Decode.pdf",
["Im0.png"],
marks=pytest.mark.xfail(reason="broken image extraction"),
),
("imagemagick-CCITTFaxDecode.pdf", ["Im0.tiff"]),
],
)
def test_get_images(src, nb_images):
src = RESOURCE_ROOT / src
reader = PdfReader(src)
def test_get_images(src, expected_images):
src_abs = RESOURCE_ROOT / src
reader = PdfReader(src_abs)

with pytest.raises(TypeError):
page = reader.pages["0"]

page = reader.pages[-1]
page = reader.pages[0]

images_extracted = []

if RES.XOBJECT in page[PG.RESOURCES]:
x_object = page[PG.RESOURCES][RES.XOBJECT].get_object()

for obj in x_object:
if x_object[obj][IA.SUBTYPE] == "/Image":
extension, byte_stream = _xobj_to_image(x_object[obj])
if extension is not None:
filename = obj[1:] + ".png"
with open(filename, "wb") as img:
img.write(byte_stream)
images_extracted.append(filename)

assert len(images_extracted) == nb_images

# Cleanup
for filepath in images_extracted:
os.remove(filepath)
images_extracted = page.images
assert len(images_extracted) == len(expected_images)
for image, expected_image in zip(images_extracted, expected_images):
assert image.name == expected_image
with open(f"test-out-{src}-{image.name}", "wb") as fp:
fp.write(image.data)
assert (
image.name.split(".")[-1].upper()
== Image.open(io.BytesIO(image.data)).format
)


@pytest.mark.parametrize(
Expand Down
51 changes: 15 additions & 36 deletions tests/test_workflows.py
Expand Up @@ -14,11 +14,8 @@
import pytest

from PyPDF2 import PdfMerger, PdfReader, PdfWriter
from PyPDF2.constants import ImageAttributes as IA
from PyPDF2.constants import PageAttributes as PG
from PyPDF2.constants import Ressources as RES
from PyPDF2.errors import PdfReadWarning
from PyPDF2.filters import _xobj_to_image

from . import get_pdf_from_url, normalize_warnings

Expand Down Expand Up @@ -651,17 +648,11 @@ def test_image_extraction(url, name):
os.mkdir(root)

for page in reader.pages:
if RES.XOBJECT in page[PG.RESOURCES]:
x_object = page[PG.RESOURCES][RES.XOBJECT].get_object()

for obj in x_object:
if x_object[obj][IA.SUBTYPE] == "/Image":
extension, byte_stream = _xobj_to_image(x_object[obj])
if extension is not None:
filename = root / (obj[1:] + extension)
with open(filename, "wb") as img:
img.write(byte_stream)
images_extracted.append(filename)
for image in page.images:
filename = root / image.name
with open(filename, "wb") as img:
img.write(image.data)
images_extracted.append(filename)

# Cleanup
do_cleanup = True # set this to False for manual inspection
Expand All @@ -684,17 +675,11 @@ def test_image_extraction_strict():
os.mkdir(root)

for page in reader.pages:
if RES.XOBJECT in page[PG.RESOURCES]:
x_object = page[PG.RESOURCES][RES.XOBJECT].get_object()

for obj in x_object:
if x_object[obj][IA.SUBTYPE] == "/Image":
extension, byte_stream = _xobj_to_image(x_object[obj])
if extension is not None:
filename = root / (obj[1:] + extension)
with open(filename, "wb") as img:
img.write(byte_stream)
images_extracted.append(filename)
for image in page.images:
filename = root / image.name
with open(filename, "wb") as fp:
fp.write(image.data)
images_extracted.append(filename)

# Cleanup
do_cleanup = True # set this to False for manual inspection
Expand Down Expand Up @@ -723,17 +708,11 @@ def test_image_extraction2(url, name):
os.mkdir(root)

for page in reader.pages:
if RES.XOBJECT in page[PG.RESOURCES]:
x_object = page[PG.RESOURCES][RES.XOBJECT].get_object()

for obj in x_object:
if x_object[obj][IA.SUBTYPE] == "/Image":
extension, byte_stream = _xobj_to_image(x_object[obj])
if extension is not None:
filename = root / (obj[1:] + extension)
with open(filename, "wb") as img:
img.write(byte_stream)
images_extracted.append(filename)
for image in page.images:
filename = root / image.name
with open(filename, "wb") as img:
img.write(image.data)
images_extracted.append(filename)

# Cleanup
do_cleanup = True # set this to False for manual inspection
Expand Down

0 comments on commit 85b3e87

Please sign in to comment.