diff --git a/pypdf/__init__.py b/pypdf/__init__.py index df07b5306..e5e48975a 100644 --- a/pypdf/__init__.py +++ b/pypdf/__init__.py @@ -14,6 +14,7 @@ from ._reader import DocumentInformation, PdfFileReader, PdfReader from ._version import __version__ from ._writer import ObjectDeletionFlag, PdfFileWriter, PdfWriter +from .constants import ImageType from .pagerange import PageRange, parse_filename_page_ranges from .papersizes import PaperSize @@ -31,6 +32,7 @@ __all__ = [ "__version__", "_debug_versions", + "ImageType", "mult", "PageRange", "PaperSize", diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 8f4767415..8f365cf60 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -77,6 +77,7 @@ FieldFlag, FileSpecificationDictionaryEntries, GoToActionArguments, + ImageType, InteractiveFormDictEntries, PageLabelStyle, TypFitArguments, @@ -132,12 +133,16 @@ class ObjectDeletionFlag(enum.IntFlag): + NONE = 0 TEXT = enum.auto() - IMAGES = enum.auto() LINKS = enum.auto() ATTACHMENTS = enum.auto() OBJECTS_3D = enum.auto() ALL_ANNOTATIONS = enum.auto() + XOBJECT_IMAGES = enum.auto() + INLINE_IMAGES = enum.auto() + DRAWING_IMAGES = enum.auto() + IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str: @@ -2193,7 +2198,8 @@ def remove_objects_from_page( if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS: return self._remove_annots_from_page(page, None) - if to_delete & ObjectDeletionFlag.IMAGES: + jump_operators = [] + if to_delete & ObjectDeletionFlag.DRAWING_IMAGES: jump_operators = ( [b"w", b"J", b"j", b"M", b"d", b"i"] + [b"W", b"W*"] @@ -2201,25 +2207,33 @@ def remove_objects_from_page( + [b"m", b"l", b"c", b"v", b"y", b"h", b"re"] + [b"sh"] ) - else: # del text + if to_delete & ObjectDeletionFlag.TEXT: jump_operators = [b"Tj", b"TJ", b"'", b'"'] def clean(content: ContentStream, images: List[str], forms: List[str]) -> None: - nonlocal to_delete + nonlocal jump_operators, to_delete i = 0 while i < len(content.operations): operands, operator = content.operations[i] - if operator in jump_operators: + if ( + ( + operator == b"INLINE IMAGE" + and ( + cast(ObjectDeletionFlag, to_delete) + & ObjectDeletionFlag.INLINE_IMAGES + ) + ) + or (operator in jump_operators) + or ( + operator == b"Do" + and ( + cast(ObjectDeletionFlag, to_delete) + & ObjectDeletionFlag.XOBJECT_IMAGES + ) + and (operands[0] in images) + ) + ): del content.operations[i] - elif operator == b"Do": - if ( - to_delete & ObjectDeletionFlag.IMAGES - and operands[0] in images - or to_delete & ObjectDeletionFlag.TEXT - and operands[0] in forms - ): - del content.operations[i] - i += 1 else: i += 1 content.get_data() # this ensures ._data is rebuilt from the .operations @@ -2242,10 +2256,11 @@ def clean_forms( try: content: Any = None if ( - to_delete & ObjectDeletionFlag.IMAGES + to_delete + & ObjectDeletionFlag.XOBJECT_IMAGES and o["/Subtype"] == "/Image" ): - content = NullObject() + content = NullObject() # to delete the image keeping the entry images.append(k) if o["/Subtype"] == "/Form": forms.append(k) @@ -2253,12 +2268,13 @@ def clean_forms( content = o else: content = ContentStream(o, self) - content.update(o.items()) - for k1 in ["/Length", "/Filter", "/DecodeParms"]: - try: - del content[k1] - except KeyError: - pass + content.update( + { + k1: v1 + for k1, v1 in o.items() + if k1 not in ["/Length", "/Filter", "/DecodeParms"] + } + ) clean_forms(content, stack + [elt]) # clean sub forms if content is not None: if isinstance(v, IndirectObject): @@ -2269,6 +2285,8 @@ def clean_forms( d[k] = self._add_object(content) # pragma: no cover except (TypeError, KeyError): pass + for im in images: + del d[im] # for clean-up if isinstance(elt, StreamObject): # for /Form if not isinstance(elt, ContentStream): # pragma: no cover e = ContentStream(elt, self) @@ -2277,40 +2295,57 @@ def clean_forms( clean(elt, images, forms) # clean the content return images, forms + if not isinstance(page, PageObject): + page = PageObject(self, page.indirect_reference) # pragma: no cover if "/Contents" in page: - content = page["/Contents"].get_object() + content = cast(ContentStream, page.get_contents()) - if not isinstance(content, ContentStream): - content = ContentStream(content, page) images, forms = clean_forms(page, []) clean(content, images, forms) - if isinstance(page["/Contents"], ArrayObject): - for o in page["/Contents"]: - self._objects[o.idnum - 1] = NullObject() - try: - self._objects[ - cast(IndirectObject, page["/Contents"].indirect_reference).idnum - 1 - ] = NullObject() - except AttributeError: - pass - page[NameObject("/Contents")] = self._add_object(content) + page.replace_contents(content) - def remove_images(self, ignore_byte_string_object: Optional[bool] = None) -> None: + def remove_images( + self, + to_delete: ImageType = ImageType.ALL, + ignore_byte_string_object: Optional[bool] = None, + ) -> None: """ Remove images from this output. Args: + to_delete : The type of images to be deleted + (default = all images types) ignore_byte_string_object: deprecated """ + if isinstance(to_delete, bool): + ignore_byte_string_object = to_delete + to_delete = ImageType.ALL if ignore_byte_string_object is not None: warnings.warn( "The 'ignore_byte_string_object' argument of remove_images is " "deprecated and will be removed in pypdf 4.0.0.", category=DeprecationWarning, ) + i = ( + ( + ObjectDeletionFlag.XOBJECT_IMAGES + if to_delete & ImageType.XOBJECT_IMAGES + else ObjectDeletionFlag.NONE + ) + | ( + ObjectDeletionFlag.INLINE_IMAGES + if to_delete & ImageType.INLINE_IMAGES + else ObjectDeletionFlag.NONE + ) + | ( + ObjectDeletionFlag.DRAWING_IMAGES + if to_delete & ImageType.DRAWING_IMAGES + else ObjectDeletionFlag.NONE + ) + ) for page in self.pages: - self.remove_objects_from_page(page, ObjectDeletionFlag.IMAGES) + self.remove_objects_from_page(page, i) def removeImages(self, ignoreByteStringObject: bool = False) -> None: # deprecated """ @@ -2319,7 +2354,7 @@ def removeImages(self, ignoreByteStringObject: bool = False) -> None: # depreca .. deprecated:: 1.28.0 """ deprecation_with_replacement("removeImages", "remove_images", "3.0.0") - return self.remove_images(ignoreByteStringObject) + return self.remove_images() def remove_text(self, ignore_byte_string_object: Optional[bool] = None) -> None: """ diff --git a/pypdf/constants.py b/pypdf/constants.py index bde9ff22d..56a24b183 100644 --- a/pypdf/constants.py +++ b/pypdf/constants.py @@ -8,7 +8,7 @@ PDF Reference, sixth edition, Version 1.7, 2006. """ -from enum import IntFlag +from enum import IntFlag, auto from typing import Dict, Tuple @@ -585,3 +585,12 @@ class AnnotationFlag(IntFlag): TypArguments, TypFitArguments, ) + + +class ImageType(IntFlag): + NONE = 0 + XOBJECT_IMAGES = auto() + INLINE_IMAGES = auto() + DRAWING_IMAGES = auto() + ALL = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES + IMAGES = ALL # for consistency with ObjectDeletionFlag diff --git a/tests/test_writer.py b/tests/test_writer.py index ca5492b43..281232c4b 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -9,6 +9,7 @@ import pytest from pypdf import ( + ImageType, ObjectDeletionFlag, PageObject, PdfMerger, @@ -1862,6 +1863,36 @@ def test_object_contains_indirect_reference_to_self(): writer.append(reader) +def test_remove_image_per_type(): + writer = PdfWriter(clone_from=RESOURCE_ROOT / "reportlab-inline-image.pdf") + writer.remove_images(ImageType.INLINE_IMAGES) + + assert all( + x not in writer.pages[0].get_contents().get_data() + for x in (b"BI", b"ID", b"EI") + ) + + with pytest.raises(DeprecationWarning): + writer.remove_images(True) + + writer = PdfWriter(clone_from=RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf") + writer.remove_images(ImageType.DRAWING_IMAGES) + assert all( + x not in writer.pages[1].get_contents().get_data() + for x in (b" re\n", b"W*", b"f*") + ) + assert all( + x in writer.pages[1].get_contents().get_data() for x in (b" TJ\n", b"rg", b"Tm") + ) + assert all( + x not in writer.pages[9]["/Resources"]["/XObject"]["/Meta84"].get_data() + for x in (b" re\n", b"W*", b"f*") + ) + writer.remove_images(ImageType.XOBJECT_IMAGES) + assert b"Do\n" not in writer.pages[0].get_contents().get_data() + assert len(writer.pages[0]["/Resources"]["/XObject"]) == 0 + + @pytest.mark.enable_socket() def test_add_outlines_on_empty_dict(): """Cf #2233"""