Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TST: Allow loading truncated images if required #2586

Merged
merged 7 commits into from
Apr 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
22 changes: 22 additions & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,25 @@ def test_csv_consistency():

# Ensure the urls are unique
assert len(pdfs) == len({pdf["url"] for pdf in pdfs})


class PILContext:
    """Allow changing the PIL/Pillow configuration for some limited scope.

    While the context is active, ``PIL.ImageFile.LOAD_TRUNCATED_IMAGES`` is
    set to ``True``. On exit, the value that was in effect when the context
    was entered is written back, whether or not the body raised.

    The context never suppresses exceptions raised inside the ``with`` body.
    """

    def __init__(self) -> None:
        # Placeholder; overwritten with the real setting in ``__enter__``.
        self._saved_load_truncated_images = False

    def __enter__(self) -> "PILContext":
        """Remember the current setting, then allow loading incomplete images.

        Returns:
            This context object.
        """
        # Imported here rather than at module level, so PIL is only needed
        # once the context is actually used.
        from PIL import ImageFile

        self._saved_load_truncated_images = ImageFile.LOAD_TRUNCATED_IMAGES
        ImageFile.LOAD_TRUNCATED_IMAGES = True
        return self

    def __exit__(self, type_, value, traceback) -> bool:
        """Restore the setting recorded by ``__enter__``.

        Args:
            type_: Exception class raised in the ``with`` body, or ``None``.
            value: Exception instance raised in the body, or ``None``.
            traceback: Traceback of that exception, or ``None``.

        Returns:
            Always ``False``, so an exception from the body keeps propagating.
        """
        from PIL import ImageFile

        ImageFile.LOAD_TRUNCATED_IMAGES = self._saved_load_truncated_images
        # A truthy return value would tell the interpreter to swallow the
        # exception; this helper only restores state, so it never does that.
        return False
17 changes: 9 additions & 8 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
)
from pypdf.generic import ArrayObject, DictionaryObject, NameObject, NumberObject

from . import get_data_from_url
from . import PILContext, get_data_from_url
from .test_encryption import HAS_AES
from .test_images import image_similarity

Expand Down Expand Up @@ -371,13 +371,14 @@ def test_tiff_predictor():
@pytest.mark.enable_socket()
def test_rgba():
"""Decode rgb with transparency"""
reader = PdfReader(BytesIO(get_data_from_url(name="tika-972174.pdf")))
data = reader.pages[0].images[0]
assert ".jp2" in data.name
similarity = image_similarity(
data.image, BytesIO(get_data_from_url(name="tika-972174_p0-im0.png"))
)
assert similarity > 0.99
with PILContext():
reader = PdfReader(BytesIO(get_data_from_url(name="tika-972174.pdf")))
data = reader.pages[0].images[0]
assert ".jp2" in data.name
similarity = image_similarity(
data.image, BytesIO(get_data_from_url(name="tika-972174_p0-im0.png"))
)
assert similarity > 0.99


@pytest.mark.enable_socket()
Expand Down
15 changes: 8 additions & 7 deletions tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
read_object,
)

from . import get_data_from_url, normalize_warnings
from . import PILContext, get_data_from_url, normalize_warnings

TESTS_ROOT = Path(__file__).parent.resolve()
PROJECT_ROOT = TESTS_ROOT.parent
Expand Down Expand Up @@ -672,12 +672,13 @@ def test_image_extraction(url, name):
if not root.exists():
root.mkdir()

for page in reader.pages:
for image in page.images:
filename = root / image.name
with open(filename, "wb") as img:
img.write(image.data)
images_extracted.append(filename)
with PILContext():
for page in reader.pages:
for image in page.images:
filename = root / image.name
with open(filename, "wb") as img:
img.write(image.data)
images_extracted.append(filename)

# Cleanup
do_cleanup = True # set this to False for manual inspection
Expand Down