Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add parameter to select images to be removed #2214

Merged
merged 8 commits into from
Oct 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pypdf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from ._reader import DocumentInformation, PdfFileReader, PdfReader
from ._version import __version__
from ._writer import ObjectDeletionFlag, PdfFileWriter, PdfWriter
from .constants import ImageType
from .pagerange import PageRange, parse_filename_page_ranges
from .papersizes import PaperSize

Expand All @@ -31,6 +32,7 @@
__all__ = [
"__version__",
"_debug_versions",
"ImageType",
"mult",
"PageRange",
"PaperSize",
Expand Down
111 changes: 73 additions & 38 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
FieldFlag,
FileSpecificationDictionaryEntries,
GoToActionArguments,
ImageType,
InteractiveFormDictEntries,
PageLabelStyle,
TypFitArguments,
Expand Down Expand Up @@ -132,12 +133,16 @@


class ObjectDeletionFlag(enum.IntFlag):
    """
    Bit flags selecting which kinds of objects are removed from a page.

    Flags can be combined with ``|``. ``IMAGES`` is a composite of the three
    image-specific flags, so testing ``flag & ObjectDeletionFlag.IMAGES`` is
    truthy when any image category is selected.
    """

    NONE = 0  # nothing selected
    TEXT = enum.auto()
    LINKS = enum.auto()
    ATTACHMENTS = enum.auto()
    OBJECTS_3D = enum.auto()
    ALL_ANNOTATIONS = enum.auto()
    XOBJECT_IMAGES = enum.auto()  # images referenced through the ``Do`` operator
    INLINE_IMAGES = enum.auto()  # images embedded in the content stream
    DRAWING_IMAGES = enum.auto()  # path construction / painting operators
    # Defined once, as the union of the image categories; defining IMAGES a
    # second time (e.g. as its own auto() bit) would make the enum fail with
    # "Attempted to reuse key" at class creation.
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES


def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:
Expand Down Expand Up @@ -2193,33 +2198,42 @@ def remove_objects_from_page(
if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
return self._remove_annots_from_page(page, None)

if to_delete & ObjectDeletionFlag.IMAGES:
jump_operators = []
if to_delete & ObjectDeletionFlag.DRAWING_IMAGES:
jump_operators = (
[b"w", b"J", b"j", b"M", b"d", b"i"]
+ [b"W", b"W*"]
+ [b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n"]
+ [b"m", b"l", b"c", b"v", b"y", b"h", b"re"]
+ [b"sh"]
)
else: # del text
if to_delete & ObjectDeletionFlag.TEXT:
jump_operators = [b"Tj", b"TJ", b"'", b'"']

def clean(content: ContentStream, images: List[str], forms: List[str]) -> None:
nonlocal to_delete
nonlocal jump_operators, to_delete
i = 0
while i < len(content.operations):
operands, operator = content.operations[i]
if operator in jump_operators:
if (
(
operator == b"INLINE IMAGE"
and (
cast(ObjectDeletionFlag, to_delete)
& ObjectDeletionFlag.INLINE_IMAGES
)
)
or (operator in jump_operators)
or (
operator == b"Do"
and (
cast(ObjectDeletionFlag, to_delete)
& ObjectDeletionFlag.XOBJECT_IMAGES
)
and (operands[0] in images)
)
):
del content.operations[i]
elif operator == b"Do":
if (
to_delete & ObjectDeletionFlag.IMAGES
and operands[0] in images
or to_delete & ObjectDeletionFlag.TEXT
and operands[0] in forms
):
del content.operations[i]
i += 1
else:
i += 1
content.get_data() # this ensures ._data is rebuilt from the .operations
Expand All @@ -2242,23 +2256,25 @@ def clean_forms(
try:
content: Any = None
if (
to_delete & ObjectDeletionFlag.IMAGES
to_delete
& ObjectDeletionFlag.XOBJECT_IMAGES
and o["/Subtype"] == "/Image"
):
content = NullObject()
content = NullObject() # to delete the image keeping the entry
images.append(k)
if o["/Subtype"] == "/Form":
forms.append(k)
if isinstance(o, ContentStream):
content = o
else:
content = ContentStream(o, self)
content.update(o.items())
for k1 in ["/Length", "/Filter", "/DecodeParms"]:
try:
del content[k1]
except KeyError:
pass
content.update(
{
k1: v1
for k1, v1 in o.items()
if k1 not in ["/Length", "/Filter", "/DecodeParms"]
}
)
clean_forms(content, stack + [elt]) # clean sub forms
if content is not None:
if isinstance(v, IndirectObject):
Expand All @@ -2269,6 +2285,8 @@ def clean_forms(
d[k] = self._add_object(content) # pragma: no cover
except (TypeError, KeyError):
pass
for im in images:
del d[im] # for clean-up
if isinstance(elt, StreamObject): # for /Form
if not isinstance(elt, ContentStream): # pragma: no cover
e = ContentStream(elt, self)
Expand All @@ -2277,40 +2295,57 @@ def clean_forms(
clean(elt, images, forms) # clean the content
return images, forms

if not isinstance(page, PageObject):
page = PageObject(self, page.indirect_reference) # pragma: no cover
if "/Contents" in page:
content = page["/Contents"].get_object()
content = cast(ContentStream, page.get_contents())

if not isinstance(content, ContentStream):
content = ContentStream(content, page)
images, forms = clean_forms(page, [])

clean(content, images, forms)
if isinstance(page["/Contents"], ArrayObject):
for o in page["/Contents"]:
self._objects[o.idnum - 1] = NullObject()
try:
self._objects[
cast(IndirectObject, page["/Contents"].indirect_reference).idnum - 1
] = NullObject()
except AttributeError:
pass
page[NameObject("/Contents")] = self._add_object(content)
page.replace_contents(content)

def remove_images(
    self,
    to_delete: ImageType = ImageType.ALL,
    ignore_byte_string_object: Optional[bool] = None,
) -> None:
    """
    Remove images from this output.

    Args:
        to_delete: The type of images to be deleted
            (default = all images types)
        ignore_byte_string_object: deprecated
    """
    # Backward compatibility: the first positional argument used to be
    # ``ignore_byte_string_object``; a bool passed there is treated as such.
    if isinstance(to_delete, bool):
        ignore_byte_string_object = to_delete
        to_delete = ImageType.ALL
    if ignore_byte_string_object is not None:
        warnings.warn(
            "The 'ignore_byte_string_object' argument of remove_images is "
            "deprecated and will be removed in pypdf 4.0.0.",
            category=DeprecationWarning,
            stacklevel=2,  # attribute the warning to the caller
        )

    # Translate the public ImageType selection into the matching
    # ObjectDeletionFlag bits understood by remove_objects_from_page.
    flags = ObjectDeletionFlag.NONE
    for image_type, deletion_flag in (
        (ImageType.XOBJECT_IMAGES, ObjectDeletionFlag.XOBJECT_IMAGES),
        (ImageType.INLINE_IMAGES, ObjectDeletionFlag.INLINE_IMAGES),
        (ImageType.DRAWING_IMAGES, ObjectDeletionFlag.DRAWING_IMAGES),
    ):
        if to_delete & image_type:
            flags |= deletion_flag

    # Only the selected categories are passed on; an unconditional
    # ObjectDeletionFlag.IMAGES pass would ignore ``to_delete``.
    for page in self.pages:
        self.remove_objects_from_page(page, flags)

def removeImages(self, ignoreByteStringObject: bool = False) -> None: # deprecated
"""
Expand All @@ -2319,7 +2354,7 @@ def removeImages(self, ignoreByteStringObject: bool = False) -> None: # depreca
.. deprecated:: 1.28.0
"""
deprecation_with_replacement("removeImages", "remove_images", "3.0.0")
return self.remove_images(ignoreByteStringObject)
return self.remove_images()

def remove_text(self, ignore_byte_string_object: Optional[bool] = None) -> None:
"""
Expand Down
11 changes: 10 additions & 1 deletion pypdf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
PDF Reference, sixth edition, Version 1.7, 2006.
"""

from enum import IntFlag
from enum import IntFlag, auto
from typing import Dict, Tuple


Expand Down Expand Up @@ -585,3 +585,12 @@ class AnnotationFlag(IntFlag):
TypArguments,
TypFitArguments,
)


class ImageType(IntFlag):
    """Bit flags naming image categories; members combine with ``|``."""

    NONE = 0
    XOBJECT_IMAGES = 1 << 0
    INLINE_IMAGES = 1 << 1
    DRAWING_IMAGES = 1 << 2
    # Union of every single-category flag above.
    ALL = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES
    IMAGES = ALL  # for consistency with ObjectDeletionFlag
31 changes: 31 additions & 0 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import pytest

from pypdf import (
ImageType,
ObjectDeletionFlag,
PageObject,
PdfMerger,
Expand Down Expand Up @@ -1862,6 +1863,36 @@ def test_object_contains_indirect_reference_to_self():
writer.append(reader)


def test_remove_image_per_type():
    """Exercise remove_images with each individual ImageType selector."""
    # Inline images: none of the inline-image markers may remain on page 0.
    writer = PdfWriter(clone_from=RESOURCE_ROOT / "reportlab-inline-image.pdf")
    writer.remove_images(ImageType.INLINE_IMAGES)
    inline_page_data = writer.pages[0].get_contents().get_data()
    assert not any(marker in inline_page_data for marker in (b"BI", b"ID", b"EI"))

    # A bool in first position is the legacy calling convention.
    with pytest.raises(DeprecationWarning):
        writer.remove_images(True)

    # Drawings: path operators disappear while text operators are kept.
    writer = PdfWriter(clone_from=RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf")
    writer.remove_images(ImageType.DRAWING_IMAGES)
    drawing_markers = (b" re\n", b"W*", b"f*")
    text_markers = (b" TJ\n", b"rg", b"Tm")

    page_data = writer.pages[1].get_contents().get_data()
    assert not any(marker in page_data for marker in drawing_markers)
    assert all(marker in page_data for marker in text_markers)

    # The same holds for a form XObject referenced from another page.
    form_data = writer.pages[9]["/Resources"]["/XObject"]["/Meta84"].get_data()
    assert not any(marker in form_data for marker in drawing_markers)

    # XObject images: no ``Do`` call left and the resource entries are gone.
    writer.remove_images(ImageType.XOBJECT_IMAGES)
    first_page = writer.pages[0]
    assert b"Do\n" not in first_page.get_contents().get_data()
    assert len(first_page["/Resources"]["/XObject"]) == 0


@pytest.mark.enable_socket()
def test_add_outlines_on_empty_dict():
"""Cf #2233"""
Expand Down
Loading