diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml index 90a6725b2..e2d061a66 100644 --- a/.github/workflows/github-ci.yaml +++ b/.github/workflows/github-ci.yaml @@ -54,7 +54,7 @@ jobs: if: matrix.python-version == '3.10.1' - name: Test with mypy run : | - mypy PyPDF2 --show-error-codes + mypy PyPDF2 --show-error-codes --disallow-untyped-defs --disallow-incomplete-defs - name: Upload coverage data uses: actions/upload-artifact@v3 with: diff --git a/CHANGELOG b/CHANGELOG index 506a40f55..66c317969 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -5,8 +5,10 @@ Deprecations (DEP): - PyPDF2 2.0 requires Python 3.6+. Python 2.7 and 3.5 support were dropped. - PdfFileReader and PdfFileMerger no longer have the `overwriteWarnings` parameter. The new behavior is `overwriteWarnings=False`. +- PdfFileReader: The "warndest" parameter was removed - merger: OutlinesObject was removed. - utils: + * `ConvertFunctionsToVirtualList` was removed * `formatWarning` was removed * `isInt(obj)`: Use `instance(obj, int)` instead * `u_(s)`: Use `s` directly diff --git a/Makefile b/Makefile index ef450eaca..29e6f5f3e 100644 --- a/Makefile +++ b/Makefile @@ -19,6 +19,9 @@ clean: test: pytest tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30 +testtype: + pytest tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30 --typeguard-packages=PyPDF2 + mutation-test: mutmut run diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 54d1f96cf..39cf4ef3a 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -29,6 +29,18 @@ import math import uuid +from decimal import Decimal +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Optional, + Tuple, + Union, + cast, +) from PyPDF2 import utils from PyPDF2.constants import PageAttributes as PG @@ -49,8 +61,8 @@ from PyPDF2.utils import b_ -def getRectangle(self, name, defaults): - retval = self.get(name) +def getRectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject: + retval: Union[None, RectangleObject, IndirectObject] = self.get(name) if isinstance(retval, RectangleObject): return retval if retval is None: @@ -60,22 +72,22 @@ def getRectangle(self, name, defaults): break if isinstance(retval, IndirectObject): retval = self.pdf.getObject(retval) - retval = RectangleObject(retval) + retval = RectangleObject(retval) # type: ignore setRectangle(self, name, retval) return retval -def setRectangle(self, name, value): +def setRectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None: if not isinstance(name, NameObject): name = NameObject(name) self[name] = value -def deleteRectangle(self, name): +def deleteRectangle(self: Any, name: str) -> None: del self[name] -def createRectangleAccessor(name, fallback): +def createRectangleAccessor(name: str, fallback: Iterable[str]) -> property: return property( lambda self: getRectangle(self, name, fallback), lambda self, value: setRectangle(self, name, value), @@ -98,13 +110,23 @@ class PageObject(DictionaryObject): this object in its source PDF """ - def __init__(self, pdf=None, indirectRef=None): + def __init__( + self, + pdf: Optional[Any] = None, # PdfFileReader + indirectRef: Optional[IndirectObject] = None, + ) -> None: + from PyPDF2._reader import PdfFileReader + DictionaryObject.__init__(self) - self.pdf = pdf + self.pdf: Optional[PdfFileReader] = pdf self.indirectRef = indirectRef @staticmethod - def createBlankPage(pdf=None, width=None, height=None): + def createBlankPage( + pdf: Optional[Any] = None, # PdfFileReader + width: Union[float, Decimal, None] = None, + height: Union[float, Decimal, None] = None, + ) -> "PageObject": """ Return a new blank page. @@ -135,12 +157,12 @@ def createBlankPage(pdf=None, width=None, height=None): else: raise PageSizeNotDefinedError() page.__setitem__( - NameObject(PG.MEDIABOX), RectangleObject([0, 0, width, height]) + NameObject(PG.MEDIABOX), RectangleObject((0, 0, width, height)) ) return page - def rotateClockwise(self, angle): + def rotateClockwise(self, angle: float) -> "PageObject": """ Rotate a page clockwise by increments of 90 degrees. @@ -152,7 +174,7 @@ def rotateClockwise(self, angle): self._rotate(angle) return self - def rotateCounterClockwise(self, angle): + def rotateCounterClockwise(self, angle: float) -> "PageObject": """ Rotate a page counter-clockwise by increments of 90 degrees. @@ -164,7 +186,7 @@ def rotateCounterClockwise(self, angle): self._rotate(-angle) return self - def _rotate(self, angle): + def _rotate(self, angle: float) -> None: rotate_obj = self.get(PG.ROTATE, 0) current_angle = ( rotate_obj if isinstance(rotate_obj, int) else rotate_obj.getObject() @@ -172,10 +194,14 @@ def _rotate(self, angle): self[NameObject(PG.ROTATE)] = NumberObject(current_angle + angle) @staticmethod - def _mergeResources(res1, res2, resource): + def _mergeResources( + res1: DictionaryObject, res2: DictionaryObject, resource: Any + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: new_res = DictionaryObject() new_res.update(res1.get(resource, DictionaryObject()).getObject()) - page2res = res2.get(resource, DictionaryObject()).getObject() + page2res = cast( + DictionaryObject, res2.get(resource, DictionaryObject()).getObject() + ) rename_res = {} for key in list(page2res.keys()): if key in new_res and new_res.raw_get(key) != page2res.raw_get(key): @@ -187,7 +213,9 @@ def _mergeResources(res1, res2, resource): return new_res, rename_res @staticmethod - def _contentStreamRename(stream, rename, pdf): + def _contentStreamRename( + stream: ContentStream, rename: Dict[Any, Any], pdf: Any # PdfFileReader + ) -> ContentStream: if not rename: return stream stream = ContentStream(stream, pdf) @@ -207,17 +235,19 @@ def _contentStreamRename(stream, rename, pdf): return stream @staticmethod - def _pushPopGS(contents, pdf): + def _pushPopGS(contents: Any, pdf: Any) -> ContentStream: # PdfFileReader # adds a graphics state "push" and "pop" to the beginning and end # of a content stream. This isolates it from changes such as # transformation matricies. stream = ContentStream(contents, pdf) - stream.operations.insert(0, [[], "q"]) - stream.operations.append([[], "Q"]) + stream.operations.insert(0, ([], "q")) + stream.operations.append(([], "Q")) return stream @staticmethod - def _addTransformationMatrix(contents, pdf, ctm): + def _addTransformationMatrix( + contents: Any, pdf: Any, ctm: Iterable[float] + ) -> ContentStream: # PdfFileReader # adds transformation matrix at the beginning of the given # contents stream. a, b, c, d, e, f = ctm @@ -238,7 +268,7 @@ def _addTransformationMatrix(contents, pdf, ctm): ) return contents - def getContents(self): + def getContents(self) -> Optional[ContentStream]: """ Access the page contents. @@ -246,11 +276,11 @@ def getContents(self): ``/Contents`` is optional, as described in PDF Reference 7.7.3.3 """ if PG.CONTENTS in self: - return self[PG.CONTENTS].getObject() + return self[PG.CONTENTS].getObject() # type: ignore else: return None - def mergePage(self, page2): + def mergePage(self, page2: "PageObject") -> None: """ Merge the content streams of two pages into one. @@ -265,15 +295,21 @@ def mergePage(self, page2): """ self._mergePage(page2) - def _mergePage(self, page2, page2transformation=None, ctm=None, expand=False): + def _mergePage( + self, + page2: "PageObject", + page2transformation: Optional[Callable[[Any], ContentStream]] = None, + ctm: Optional[Iterable[float]] = None, + expand: bool = False, + ) -> None: # First we work on merging the resource dictionaries. This allows us # to find out what symbols in the content streams we might need to # rename. new_resources = DictionaryObject() rename = {} - original_resources = self[PG.RESOURCES].getObject() - page2resources = page2[PG.RESOURCES].getObject() + original_resources = cast(DictionaryObject, self[PG.RESOURCES].getObject()) + page2resources = cast(DictionaryObject, page2[PG.RESOURCES].getObject()) new_annots = ArrayObject() for page in (self, page2): @@ -302,9 +338,9 @@ def _mergePage(self, page2, page2transformation=None, ctm=None, expand=False): # Combine /ProcSet sets. new_resources[NameObject(RES.PROC_SET)] = ArrayObject( frozenset( - original_resources.get(RES.PROC_SET, ArrayObject()).getObject() + original_resources.get(RES.PROC_SET, ArrayObject()).getObject() # type: ignore ).union( - frozenset(page2resources.get(RES.PROC_SET, ArrayObject()).getObject()) + frozenset(page2resources.get(RES.PROC_SET, ArrayObject()).getObject()) # type: ignore ) ) @@ -319,7 +355,7 @@ def _mergePage(self, page2, page2transformation=None, ctm=None, expand=False): page2content = ContentStream(page2content, self.pdf) page2content.operations.insert( 0, - [ + ( map( FloatObject, [ @@ -330,10 +366,10 @@ def _mergePage(self, page2, page2transformation=None, ctm=None, expand=False): ], ), "re", - ], + ), ) - page2content.operations.insert(1, [[], "W"]) - page2content.operations.insert(2, [[], "n"]) + page2content.operations.insert(1, ([], "W")) + page2content.operations.insert(2, ([], "n")) if page2transformation is not None: page2content = page2transformation(page2content) page2content = PageObject._contentStreamRename( @@ -388,7 +424,9 @@ def _mergePage(self, page2, page2transformation=None, ctm=None, expand=False): self[NameObject(PG.RESOURCES)] = new_resources self[NameObject(PG.ANNOTS)] = new_annots - def mergeTransformedPage(self, page2, ctm, expand=False): + def mergeTransformedPage( + self, page2: "PageObject", ctm: Iterable[float], expand: bool = False + ) -> None: """ mergeTransformedPage is similar to mergePage, but a transformation matrix is applied to the merged stream. @@ -409,7 +447,9 @@ def mergeTransformedPage(self, page2, ctm, expand=False): expand, ) - def mergeScaledPage(self, page2, scale, expand=False): + def mergeScaledPage( + self, page2: "PageObject", scale: float, expand: bool = False + ) -> None: """ mergeScaledPage is similar to mergePage, but the stream to be merged is scaled by appling a transformation matrix. @@ -421,9 +461,11 @@ def mergeScaledPage(self, page2, scale, expand=False): dimensions of the page to be merged. """ # CTM to scale : [ sx 0 0 sy 0 0 ] - return self.mergeTransformedPage(page2, [scale, 0, 0, scale, 0, 0], expand) + self.mergeTransformedPage(page2, [scale, 0, 0, scale, 0, 0], expand) - def mergeRotatedPage(self, page2, rotation, expand=False): + def mergeRotatedPage( + self, page2: "PageObject", rotation: float, expand: bool = False + ) -> None: """ mergeRotatedPage is similar to mergePage, but the stream to be merged is rotated by appling a transformation matrix. @@ -435,7 +477,7 @@ def mergeRotatedPage(self, page2, rotation, expand=False): dimensions of the page to be merged. """ rotation = math.radians(rotation) - return self.mergeTransformedPage( + self.mergeTransformedPage( page2, [ math.cos(rotation), @@ -448,7 +490,9 @@ def mergeRotatedPage(self, page2, rotation, expand=False): expand, ) - def mergeTranslatedPage(self, page2, tx, ty, expand=False): + def mergeTranslatedPage( + self, page2: "PageObject", tx: float, ty: float, expand: bool = False + ) -> None: """ mergeTranslatedPage is similar to mergePage, but the stream to be merged is translated by appling a transformation matrix. @@ -460,9 +504,16 @@ def mergeTranslatedPage(self, page2, tx, ty, expand=False): :param bool expand: Whether the page should be expanded to fit the dimensions of the page to be merged. """ - return self.mergeTransformedPage(page2, [1, 0, 0, 1, tx, ty], expand) + self.mergeTransformedPage(page2, [1, 0, 0, 1, tx, ty], expand) - def mergeRotatedTranslatedPage(self, page2, rotation, tx, ty, expand=False): + def mergeRotatedTranslatedPage( + self, + page2: "PageObject", + rotation: float, + tx: float, + ty: float, + expand: bool = False, + ) -> None: """ mergeRotatedTranslatedPage is similar to mergePage, but the stream to be merged is rotated and translated by appling a transformation matrix. @@ -476,14 +527,14 @@ def mergeRotatedTranslatedPage(self, page2, rotation, tx, ty, expand=False): dimensions of the page to be merged. """ - translation = [[1, 0, 0], [0, 1, 0], [-tx, -ty, 1]] + translation: List[List[float]] = [[1, 0, 0], [0, 1, 0], [-tx, -ty, 1]] rotation = math.radians(rotation) - rotating = [ + rotating: List[List[float]] = [ [math.cos(rotation), math.sin(rotation), 0], [-math.sin(rotation), math.cos(rotation), 0], [0, 0, 1], ] - rtranslation = [[1, 0, 0], [0, 1, 0], [tx, ty, 1]] + rtranslation: List[List[float]] = [[1, 0, 0], [0, 1, 0], [tx, ty, 1]] ctm = utils.matrixMultiply(translation, rotating) ctm = utils.matrixMultiply(ctm, rtranslation) @@ -493,7 +544,9 @@ def mergeRotatedTranslatedPage(self, page2, rotation, tx, ty, expand=False): expand, ) - def mergeRotatedScaledPage(self, page2, rotation, scale, expand=False): + def mergeRotatedScaledPage( + self, page2: "PageObject", rotation: float, scale: float, expand: bool = False + ) -> None: """ mergeRotatedScaledPage is similar to mergePage, but the stream to be merged is rotated and scaled by appling a transformation matrix. @@ -506,21 +559,28 @@ def mergeRotatedScaledPage(self, page2, rotation, scale, expand=False): dimensions of the page to be merged. """ rotation = math.radians(rotation) - rotating = [ + rotating: List[List[float]] = [ [math.cos(rotation), math.sin(rotation), 0], [-math.sin(rotation), math.cos(rotation), 0], [0, 0, 1], ] - scaling = [[scale, 0, 0], [0, scale, 0], [0, 0, 1]] + scaling: List[List[float]] = [[scale, 0, 0], [0, scale, 0], [0, 0, 1]] ctm = utils.matrixMultiply(rotating, scaling) - return self.mergeTransformedPage( + self.mergeTransformedPage( page2, [ctm[0][0], ctm[0][1], ctm[1][0], ctm[1][1], ctm[2][0], ctm[2][1]], expand, ) - def mergeScaledTranslatedPage(self, page2, scale, tx, ty, expand=False): + def mergeScaledTranslatedPage( + self, + page2: "PageObject", + scale: float, + tx: float, + ty: float, + expand: bool = False, + ) -> None: """ mergeScaledTranslatedPage is similar to mergePage, but the stream to be merged is translated and scaled by appling a transformation matrix. @@ -534,8 +594,8 @@ def mergeScaledTranslatedPage(self, page2, scale, tx, ty, expand=False): dimensions of the page to be merged. """ - translation = [[1, 0, 0], [0, 1, 0], [tx, ty, 1]] - scaling = [[scale, 0, 0], [0, scale, 0], [0, 0, 1]] + translation: List[List[float]] = [[1, 0, 0], [0, 1, 0], [tx, ty, 1]] + scaling: List[List[float]] = [[scale, 0, 0], [0, scale, 0], [0, 0, 1]] ctm = utils.matrixMultiply(scaling, translation) return self.mergeTransformedPage( @@ -545,8 +605,14 @@ def mergeScaledTranslatedPage(self, page2, scale, tx, ty, expand=False): ) def mergeRotatedScaledTranslatedPage( - self, page2, rotation, scale, tx, ty, expand=False - ): + self, + page2: "PageObject", + rotation: float, + scale: float, + tx: float, + ty: float, + expand: bool = False, + ) -> None: """ mergeRotatedScaledTranslatedPage is similar to mergePage, but the stream to be merged is translated, rotated and scaled by appling a @@ -561,24 +627,24 @@ def mergeRotatedScaledTranslatedPage( :param bool expand: Whether the page should be expanded to fit the dimensions of the page to be merged. """ - translation = [[1, 0, 0], [0, 1, 0], [tx, ty, 1]] + translation: List[List[float]] = [[1, 0, 0], [0, 1, 0], [tx, ty, 1]] rotation = math.radians(rotation) - rotating = [ + rotating: List[List[float]] = [ [math.cos(rotation), math.sin(rotation), 0], [-math.sin(rotation), math.cos(rotation), 0], [0, 0, 1], ] - scaling = [[scale, 0, 0], [0, scale, 0], [0, 0, 1]] + scaling: List[List[float]] = [[scale, 0, 0], [0, scale, 0], [0, 0, 1]] ctm = utils.matrixMultiply(rotating, scaling) ctm = utils.matrixMultiply(ctm, translation) - return self.mergeTransformedPage( + self.mergeTransformedPage( page2, [ctm[0][0], ctm[0][1], ctm[1][0], ctm[1][1], ctm[2][0], ctm[2][1]], expand, ) - def addTransformation(self, ctm): + def addTransformation(self, ctm: List[float]) -> None: """ Apply a transformation matrix to the page. @@ -593,7 +659,7 @@ def addTransformation(self, ctm): new_content = PageObject._pushPopGS(new_content, self.pdf) self[NameObject(PG.CONTENTS)] = new_content - def scale(self, sx, sy): + def scale(self, sx: float, sy: float) -> None: """ Scale a page by the given factors by appling a transformation matrix to its content and updating the page size. @@ -603,35 +669,35 @@ def scale(self, sx, sy): """ self.addTransformation([sx, 0, 0, sy, 0, 0]) self.mediaBox = RectangleObject( - [ + ( float(self.mediaBox.getLowerLeft_x()) * sx, float(self.mediaBox.getLowerLeft_y()) * sy, float(self.mediaBox.getUpperRight_x()) * sx, float(self.mediaBox.getUpperRight_y()) * sy, - ] + ) ) if PG.VP in self: viewport = self[PG.VP] if isinstance(viewport, ArrayObject): bbox = viewport[0]["/BBox"] else: - bbox = viewport["/BBox"] + bbox = viewport["/BBox"] # type: ignore scaled_bbox = RectangleObject( - [ + ( float(bbox[0]) * sx, float(bbox[1]) * sy, float(bbox[2]) * sx, float(bbox[3]) * sy, - ] + ) ) if isinstance(viewport, ArrayObject): - self[NameObject(PG.VP)][NumberObject(0)][ + self[NameObject(PG.VP)][NumberObject(0)][ # type: ignore NameObject("/BBox") ] = scaled_bbox else: - self[NameObject(PG.VP)][NameObject("/BBox")] = scaled_bbox + self[NameObject(PG.VP)][NameObject("/BBox")] = scaled_bbox # type: ignore - def scaleBy(self, factor): + def scaleBy(self, factor: float) -> None: """ Scale a page by the given factor by appling a transformation matrix to its content and updating the page size. @@ -640,7 +706,7 @@ def scaleBy(self, factor): """ self.scale(factor, factor) - def scaleTo(self, width, height): + def scaleTo(self, width: float, height: float) -> None: """ Scale a page to the specified dimentions by appling a transformation matrix to its content and updating the page size. @@ -656,7 +722,7 @@ def scaleTo(self, width, height): ) self.scale(sx, sy) - def compressContentStreams(self): + def compressContentStreams(self) -> None: """ Compress the size of this page by joining all content streams and applying a FlateDecode filter. @@ -670,7 +736,7 @@ def compressContentStreams(self): content = ContentStream(content, self.pdf) self[NameObject(PG.CONTENTS)] = content.flateEncode() - def extractText(self, Tj_sep="", TJ_sep=""): + def extractText(self, Tj_sep: str = "", TJ_sep: str = "") -> str: """ Locate all text drawing commands, in the order they are provided in the content stream, and extract the text. This works well for some PDF @@ -679,7 +745,7 @@ def extractText(self, Tj_sep="", TJ_sep=""): this function, as it will change if this function is made more sophisticated. - :return: a unicode string object. + :return: a string object. """ text = "" content = self[PG.CONTENTS].getObject() diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index f7d7b73f7..5de83f009 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -28,15 +28,27 @@ # POSSIBILITY OF SUCH DAMAGE. import struct -import sys import warnings from hashlib import md5 from io import BytesIO +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + Optional, + Tuple, + Union, + cast, +) from PyPDF2 import utils from PyPDF2._page import PageObject from PyPDF2._security import _alg33_1, _alg34, _alg35 from PyPDF2.constants import CatalogAttributes as CA +from PyPDF2.constants import CatalogDictionary as CD from PyPDF2.constants import Core as CO from PyPDF2.constants import DocumentInformationAttributes as DI from PyPDF2.constants import EncryptionDictAttributes as ED @@ -49,23 +61,36 @@ ArrayObject, BooleanObject, ByteStringObject, + ContentStream, + DecodedStreamObject, Destination, DictionaryObject, + EncodedStreamObject, Field, + FloatObject, IndirectObject, NameObject, NullObject, NumberObject, + PdfObject, StreamObject, TextStringObject, + TreeObject, createStringObject, - readNonWhitespace, readObject, ) -from PyPDF2.utils import ConvertFunctionsToVirtualList, b_, readUntilWhitespace +from PyPDF2.types import OutlinesType +from PyPDF2.utils import ( + StrByteType, + StreamType, + b_, + readNonWhitespace, + readUntilWhitespace, +) +from PyPDF2.xmp import XmpInformation -def convertToInt(d, size): +def convertToInt(d: bytes, size: int) -> Union[int, Tuple[Any, ...]]: if size > 8: raise PdfReadError("invalid size in convertToInt") d = b_("\x00\x00\x00\x00\x00\x00\x00\x00") + b_(d) @@ -73,6 +98,39 @@ def convertToInt(d, size): return struct.unpack(">q", d)[0] +class _VirtualList: + def __init__( + self, + length_function: Callable[[], int], + get_function: Callable[[int], PageObject], + ) -> None: + self.length_function = length_function + self.get_function = get_function + self.current = -1 + + def __len__(self) -> int: + return self.length_function() + + def __getitem__(self, index: int) -> PageObject: + if isinstance(index, slice): + indices = range(*index.indices(len(self))) + cls = type(self) + return cls(indices.__len__, lambda idx: self[indices[idx]]) + if not isinstance(index, int): + raise TypeError("sequence indices must be integers") + len_self = len(self) + if index < 0: + # support negative indexes + index = len_self + index + if index < 0 or index >= len_self: + raise IndexError("sequence index out of range") + return self.get_function(index) + + def __iter__(self) -> Iterator[PageObject]: + for i in range(len(self)): + yield self[i] + + class DocumentInformation(DictionaryObject): """ A class representing the basic document metadata provided in a PDF File. @@ -88,57 +146,57 @@ class DocumentInformation(DictionaryObject): therefore is not as commonly accessed. """ - def __init__(self): + def __init__(self) -> None: DictionaryObject.__init__(self) - def getText(self, key): + def getText(self, key: str) -> Optional[str]: retval = self.get(key, None) if isinstance(retval, TextStringObject): return retval return None @property - def title(self): + def title(self) -> Optional[str]: """Read-only property accessing the document's **title**. Returns a unicode string (``TextStringObject``) or ``None`` if the title is not specified.""" return ( - self.getText(DI.TITLE) or self.get(DI.TITLE).getObject() + self.getText(DI.TITLE) or self.get(DI.TITLE).getObject() # type: ignore if self.get(DI.TITLE) else None ) @property - def title_raw(self): + def title_raw(self) -> Optional[str]: """The "raw" version of title; can return a ``ByteStringObject``.""" return self.get(DI.TITLE) @property - def author(self): + def author(self) -> Optional[str]: """Read-only property accessing the document's **author**. Returns a unicode string (``TextStringObject``) or ``None`` if the author is not specified.""" return self.getText(DI.AUTHOR) @property - def author_raw(self): + def author_raw(self) -> Optional[str]: """The "raw" version of author; can return a ``ByteStringObject``.""" return self.get(DI.AUTHOR) @property - def subject(self): + def subject(self) -> Optional[str]: """Read-only property accessing the document's **subject**. Returns a unicode string (``TextStringObject``) or ``None`` if the subject is not specified.""" return self.getText(DI.SUBJECT) @property - def subject_raw(self): + def subject_raw(self) -> Optional[str]: """The "raw" version of subject; can return a ``ByteStringObject``.""" return self.get(DI.SUBJECT) @property - def creator(self): + def creator(self) -> Optional[str]: """Read-only property accessing the document's **creator**. If the document was converted to PDF from another format, this is the name of the application (e.g. OpenOffice) that created the original document from @@ -147,12 +205,12 @@ def creator(self): return self.getText(DI.CREATOR) @property - def creator_raw(self): + def creator_raw(self) -> Optional[str]: """The "raw" version of creator; can return a ``ByteStringObject``.""" return self.get(DI.CREATOR) @property - def producer(self): + def producer(self) -> Optional[str]: """Read-only property accessing the document's **producer**. If the document was converted to PDF from another format, this is the name of the application (for example, OSX Quartz) that converted @@ -161,7 +219,7 @@ def producer(self): return self.getText(DI.PRODUCER) @property - def producer_raw(self): + def producer_raw(self) -> Optional[str]: """The "raw" version of producer; can return a ``ByteStringObject``.""" return self.get(DI.PRODUCER) @@ -179,31 +237,31 @@ class PdfFileReader: :param bool strict: Determines whether user should be warned of all problems and also causes some correctable problems to be fatal. Defaults to ``False``. - :param warndest: Destination for logging warnings (defaults to - ``sys.stderr``). """ - def __init__(self, stream, strict=False, warndest=None): + def __init__(self, stream: StrByteType, strict: bool = False) -> None: self.strict = strict - self.flattenedPages = None - self.resolvedObjects = {} + self.flattenedPages: Optional[List[PageObject]] = None + self.resolvedObjects: Dict[Tuple[Any, Any], Optional[PdfObject]] = {} self.xrefIndex = 0 - self._pageId2Num = None # map page IndirectRef number to Page Number - if hasattr(stream, "mode") and "b" not in stream.mode: + self._pageId2Num: Optional[ + Dict[Any, Any] + ] = None # map page IndirectRef number to Page Number + if hasattr(stream, "mode") and "b" not in stream.mode: # type: ignore warnings.warn( "PdfFileReader stream/file object is not in binary mode. " "It may not be read correctly.", PdfReadWarning, ) if isinstance(stream, str): - with open(stream, "rb") as fileobj: - stream = BytesIO(b_(fileobj.read())) + with open(stream, "rb") as fh: + stream = BytesIO(b_(fh.read())) self.read(stream) self.stream = stream self._override_encryption = False - def getDocumentInfo(self): + def getDocumentInfo(self) -> Optional[DocumentInformation]: """ Retrieve the PDF file's document information dictionary, if it exists. Note that some PDF files use metadata streams instead of docinfo @@ -218,18 +276,18 @@ def getDocumentInfo(self): return None obj = self.trailer[TK.INFO] retval = DocumentInformation() - retval.update(obj) + retval.update(obj) # type: ignore return retval @property - def documentInfo(self): + def documentInfo(self) -> Optional[DocumentInformation]: """ Read-only property that accesses the :meth:`getDocumentInfo()` function. """ return self.getDocumentInfo() - def getXmpMetadata(self): + def getXmpMetadata(self) -> Optional[XmpInformation]: """ Retrieve XMP (Extensible Metadata Platform) data from the PDF document root. @@ -241,19 +299,19 @@ def getXmpMetadata(self): """ try: self._override_encryption = True - return self.trailer[TK.ROOT].getXmpMetadata() + return self.trailer[TK.ROOT].getXmpMetadata() # type: ignore finally: self._override_encryption = False @property - def xmpMetadata(self): + def xmpMetadata(self) -> Optional[XmpInformation]: """ Read-only property that accesses the :meth:`getXmpMetadata()` function. """ return self.getXmpMetadata() - def getNumPages(self): + def getNumPages(self) -> int: """ Calculates the number of pages in this PDF file. @@ -270,7 +328,7 @@ def getNumPages(self): try: self._override_encryption = True self.decrypt("") - return self.trailer[TK.ROOT]["/Pages"]["/Count"] + return self.trailer[TK.ROOT]["/Pages"]["/Count"] # type: ignore except Exception: raise PdfReadError("File has not been decrypted") finally: @@ -278,17 +336,17 @@ def getNumPages(self): else: if self.flattenedPages is None: self._flatten() - return len(self.flattenedPages) + return len(self.flattenedPages) # type: ignore @property - def numPages(self): + def numPages(self) -> int: """ Read-only property that accesses the :meth:`getNumPages()` function. """ return self.getNumPages() - def getPage(self, pageNumber): + def getPage(self, pageNumber: int) -> PageObject: """ Retrieves a page by number from this PDF file. @@ -301,10 +359,11 @@ def getPage(self, pageNumber): # assert not self.trailer.has_key(TK.ENCRYPT) if self.flattenedPages is None: self._flatten() + assert self.flattenedPages is not None, "hint for mypy" return self.flattenedPages[pageNumber] @property - def namedDestinations(self): + def namedDestinations(self) -> Dict[str, Any]: """ Read-only property that accesses the :meth:`getNamedDestinations()` function. @@ -314,7 +373,12 @@ def namedDestinations(self): # A select group of relevant field attributes. For the complete list, # see section 8.6.2 of the PDF 1.7 reference. - def getFields(self, tree=None, retval=None, fileobj=None): + def getFields( + self, + tree: Optional[TreeObject] = None, + retval: Optional[Dict[Any, Any]] = None, + fileobj: Optional[Any] = None, + ) -> Optional[Dict[str, Any]]: """ Extracts field data if this PDF contains interactive form fields. The *tree* and *retval* parameters are for recursive use. @@ -338,10 +402,10 @@ def getFields(self, tree=None, retval=None, fileobj=None): } if retval is None: retval = {} - catalog = self.trailer[TK.ROOT] + catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) # get the AcroForm tree if "/AcroForm" in catalog: - tree = catalog["/AcroForm"] + tree = cast(Optional[TreeObject], catalog["/AcroForm"]) else: return None if tree is None: @@ -355,14 +419,20 @@ def getFields(self, tree=None, retval=None, fileobj=None): break if "/Fields" in tree: - fields = tree["/Fields"] + fields = cast(ArrayObject, tree["/Fields"]) for f in fields: field = f.getObject() self._buildField(field, retval, fileobj, field_attributes) return retval - def _buildField(self, field, retval, fileobj, fieldAttributes): + def _buildField( + self, + field: Union[TreeObject, DictionaryObject], + retval: Dict[Any, Any], + fileobj: Any, + fieldAttributes: Any, + ) -> None: self._checkKids(field, retval, fileobj) try: key = field["/TM"] @@ -377,13 +447,15 @@ def _buildField(self, field, retval, fileobj, fieldAttributes): fileobj.write("\n") retval[key] = Field(field) - def _checkKids(self, tree, retval, fileobj): + def _checkKids( + self, tree: Union[TreeObject, DictionaryObject], retval: Any, fileobj: Any + ) -> None: if PA.KIDS in tree: # recurse down the tree - for kid in tree[PA.KIDS]: + for kid in tree[PA.KIDS]: # type: ignore self.getFields(kid.getObject(), retval, fileobj) - def _writeField(self, fileobj, field, fieldAttributes): + def _writeField(self, fileobj: Any, field: Any, fieldAttributes: Any) -> None: order = ["/TM", "/T", "/FT", PA.PARENT, "/TU", "/Ff", "/V", "/DV"] for attr in order: attr_name = fieldAttributes[attr] @@ -411,7 +483,7 @@ def _writeField(self, fileobj, field, fieldAttributes): # Field attribute is N/A or unknown, so don't write anything pass - def getFormTextFields(self): + def getFormTextFields(self) -> Dict[str, Any]: """Retrieves form fields from the document with textual data (inputs, dropdowns)""" # Retrieve document form fields formfields = self.getFields() @@ -423,7 +495,11 @@ def getFormTextFields(self): if formfields[field].get("/FT") == "/Tx" } - def getNamedDestinations(self, tree=None, retval=None): + def getNamedDestinations( + self, + tree: Union[TreeObject, None] = None, + retval: Optional[Any] = None, + ) -> Dict[str, Any]: """ Retrieves the named destinations present in the document. @@ -433,46 +509,49 @@ def getNamedDestinations(self, tree=None, retval=None): """ if retval is None: retval = {} - catalog = self.trailer[TK.ROOT] + catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) # get the name tree if CA.DESTS in catalog: - tree = catalog[CA.DESTS] + tree = cast(TreeObject, catalog[CA.DESTS]) elif CA.NAMES in catalog: - names = catalog[CA.NAMES] + names = cast(DictionaryObject, catalog[CA.NAMES]) if CA.DESTS in names: - tree = names[CA.DESTS] + tree = cast(TreeObject, names[CA.DESTS]) if tree is None: return retval if PA.KIDS in tree: # recurse down the tree - for kid in tree[PA.KIDS]: + for kid in cast(ArrayObject, tree[PA.KIDS]): self.getNamedDestinations(kid.getObject(), retval) + # TABLE 3.33 Entries in a name tree node dictionary (PDF 1.7 specs) if CA.NAMES in tree: - names = tree[CA.NAMES] + names = cast(DictionaryObject, tree[CA.NAMES]) for i in range(0, len(names), 2): - key = names[i].getObject() - val = names[i + 1].getObject() - if isinstance(val, DictionaryObject) and "/D" in val: - val = val["/D"] - dest = self._buildDestination(key, val) + key = cast(str, names[i].getObject()) + value = names[i + 1].getObject() + if isinstance(value, DictionaryObject) and "/D" in value: + value = value["/D"] + dest = self._buildDestination(key, value) # type: ignore if dest is not None: retval[key] = dest return retval @property - def outlines(self): + def outlines(self) -> OutlinesType: """ Read-only property that accesses the :meth:`getOutlines()` function. """ return self.getOutlines() - def getOutlines(self, node=None, outlines=None): + def getOutlines( + self, node: Optional[DictionaryObject] = None, outlines: Optional[Any] = None + ) -> OutlinesType: """ Retrieve the document outline present in the document. @@ -480,20 +559,21 @@ def getOutlines(self, node=None, outlines=None): """ if outlines is None: outlines = [] - catalog = self.trailer[TK.ROOT] + catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) # get the outline dictionary and named destinations if CO.OUTLINES in catalog: try: - lines = catalog[CO.OUTLINES] + lines = cast(DictionaryObject, catalog[CO.OUTLINES]) except PdfReadError: # this occurs if the /Outlines object reference is incorrect # for an example of such a file, see https://unglueit-files.s3.amazonaws.com/ebf/7552c42e9280b4476e59e77acc0bc812.pdf # so continue to load the file without the Bookmarks return outlines + # TABLE 8.3 Entries in the outline dictionary if "/First" in lines: - node = lines["/First"] + node = cast(DictionaryObject, lines["/First"]) self._namedDests = self.getNamedDestinations() if node is None: @@ -507,36 +587,38 @@ def getOutlines(self, node=None, outlines=None): # check for sub-outlines if "/First" in node: - sub_outlines = [] - self.getOutlines(node["/First"], sub_outlines) + sub_outlines: List[Any] = [] + self.getOutlines(cast(DictionaryObject, node["/First"]), sub_outlines) if sub_outlines: outlines.append(sub_outlines) if "/Next" not in node: break - node = node["/Next"] + node = cast(DictionaryObject, node["/Next"]) return outlines - def _getPageNumberByIndirect(self, indirectRef): + def _getPageNumberByIndirect( + self, indirectRef: Union[None, int, NullObject, IndirectObject] + ) -> int: """Generate _pageId2Num""" if self._pageId2Num is None: id2num = {} for i, x in enumerate(self.pages): - id2num[x.indirectRef.idnum] = i + id2num[x.indirectRef.idnum] = i # type: ignore self._pageId2Num = id2num - if isinstance(indirectRef, NullObject): + if indirectRef is None or isinstance(indirectRef, NullObject): return -1 if isinstance(indirectRef, int): idnum = indirectRef else: idnum = indirectRef.idnum - + assert self._pageId2Num is not None, "hint for mypy" ret = self._pageId2Num.get(idnum, -1) return ret - def getPageNumber(self, page): + def getPageNumber(self, page: PageObject) -> int: """ Retrieve page number of a given PageObject @@ -545,11 +627,9 @@ def getPageNumber(self, page): :return: the page number or -1 if page not found :rtype: int """ - indirect_ref = page.indirectRef - ret = self._getPageNumberByIndirect(indirect_ref) - return ret + return self._getPageNumberByIndirect(page.indirectRef) - def getDestinationPageNumber(self, destination): + def getDestinationPageNumber(self, destination: Destination) -> int: """ Retrieve page number of a given Destination object @@ -559,33 +639,38 @@ def getDestinationPageNumber(self, destination): :return: the page number or -1 if page not found :rtype: int """ - indirect_ref = destination.page - ret = self._getPageNumberByIndirect(indirect_ref) - return ret + return self._getPageNumberByIndirect(destination.page) - def _buildDestination(self, title, array): + def _buildDestination( + self, + title: str, + array: List[Union[NumberObject, IndirectObject, NullObject, DictionaryObject]], + ) -> Destination: page, typ = array[0:2] array = array[2:] try: - return Destination(title, page, typ, *array) + return Destination(title, page, typ, *array) # type: ignore except PdfReadError: warnings.warn("Unknown destination : " + title + " " + str(array)) if self.strict: raise else: # create a link to first Page + tmp = self.getPage(0).indirectRef + indirect_ref = NullObject() if tmp is None else tmp return Destination( - title, self.getPage(0).indirectRef, TextStringObject("/Fit") + title, indirect_ref, TextStringObject("/Fit") # type: ignore ) - def _buildOutline(self, node): + def _buildOutline(self, node: DictionaryObject) -> Optional[Destination]: dest, title, outline = None, None, None if "/A" in node and "/Title" in node: # Action, section 8.5 (only type GoTo supported) title = node["/Title"] - action = node["/A"] - if action["/S"] == "/GoTo": + action = cast(DictionaryObject, node["/A"]) + action_type = cast(NameObject, action["/S"]) + if action_type == "/GoTo": dest = action["/D"] elif "/Dest" in node and "/Title" in node: # Destination, section 8.2.1 @@ -595,24 +680,24 @@ def _buildOutline(self, node): # if destination found, then create outline if dest: if isinstance(dest, ArrayObject): - outline = self._buildDestination(title, dest) + outline = self._buildDestination(title, dest) # type: ignore elif isinstance(dest, str) and dest in self._namedDests: outline = self._namedDests[dest] - outline[NameObject("/Title")] = title + outline[NameObject("/Title")] = title # type: ignore else: raise PdfReadError("Unexpected destination %r" % dest) return outline @property - def pages(self): + def pages(self) -> _VirtualList: """ Read-only property that emulates a list based upon the :meth:`getNumPages()` and :meth:`getPage()` methods. """ - return ConvertFunctionsToVirtualList(self.getNumPages, self.getPage) + return _VirtualList(self.getNumPages, self.getPage) - def getPageLayout(self): + def getPageLayout(self) -> Optional[str]: """ Get the page layout. @@ -622,18 +707,18 @@ def getPageLayout(self): :return: Page layout currently being used. :rtype: ``str``, ``None`` if not specified """ - try: - return self.trailer[TK.ROOT]["/PageLayout"] - except KeyError: - return None + trailer = cast(DictionaryObject, self.trailer[TK.ROOT]) + if CD.PAGE_LAYOUT in trailer: + return cast(NameObject, trailer[CD.PAGE_LAYOUT]) + return None @property - def pageLayout(self): + def pageLayout(self) -> Optional[str]: """Read-only property accessing the :meth:`getPageLayout()` method.""" return self.getPageLayout() - def getPageMode(self): + def getPageMode(self) -> Optional[str]: """ Get the page mode. See :meth:`setPageMode()` @@ -643,17 +728,22 @@ def getPageMode(self): :rtype: ``str``, ``None`` if not specified """ try: - return self.trailer[TK.ROOT]["/PageMode"] + return self.trailer[TK.ROOT]["/PageMode"] # type: ignore except KeyError: return None @property - def pageMode(self): + def pageMode(self) -> Optional[str]: """Read-only property accessing the :meth:`getPageMode()` method.""" return self.getPageMode() - def _flatten(self, pages=None, inherit=None, indirectRef=None): + def _flatten( + self, + pages: Union[None, DictionaryObject, PageObject] = None, + inherit: Optional[Dict[str, Any]] = None, + indirectRef: Optional[IndirectObject] = None, + ) -> None: inheritablePageAttributes = ( NameObject(PG.RESOURCES), NameObject(PG.MEDIABOX), @@ -666,43 +756,47 @@ def _flatten(self, pages=None, inherit=None, indirectRef=None): # Fix issue 327: set flattenedPages attribute only for # decrypted file catalog = self.trailer[TK.ROOT].getObject() - pages = catalog["/Pages"].getObject() + pages = catalog["/Pages"].getObject() # type: ignore self.flattenedPages = [] t = "/Pages" if PA.TYPE in pages: - t = pages[PA.TYPE] + t = pages[PA.TYPE] # type: ignore if t == "/Pages": for attr in inheritablePageAttributes: if attr in pages: inherit[attr] = pages[attr] - for page in pages[PA.KIDS]: + for page in pages[PA.KIDS]: # type: ignore addt = {} if isinstance(page, IndirectObject): addt["indirectRef"] = page self._flatten(page.getObject(), inherit, **addt) elif t == "/Page": - for attr, value in list(inherit.items()): + for attr_in, value in list(inherit.items()): # if the page has it's own value, it does not inherit the # parent's value: - if attr not in pages: - pages[attr] = value + if attr_in not in pages: + pages[attr_in] = value page_obj = PageObject(self, indirectRef) page_obj.update(pages) - self.flattenedPages.append(page_obj) - def _getObjectFromStream(self, indirectReference): + # TODO: Could flattenedPages be None at this point? + self.flattenedPages.append(page_obj) # type: ignore + + def _getObjectFromStream( + self, indirectReference: IndirectObject + ) -> Union[int, PdfObject, str]: # indirect reference to object in object stream # read the entire object stream into memory stmnum, idx = self.xref_objStm[indirectReference.idnum] - obj_stm = IndirectObject(stmnum, 0, self).getObject() + obj_stm: EncodedStreamObject = IndirectObject(stmnum, 0, self).getObject() # type: ignore # This is an xref to a stream, so its type better be a stream assert obj_stm["/Type"] == "/ObjStm" # /N is the number of indirect objects in the stream assert idx < obj_stm["/N"] - stream_data = BytesIO(b_(obj_stm.getData())) - for i in range(obj_stm["/N"]): + stream_data = BytesIO(b_(obj_stm.getData())) # type: ignore + for i in range(obj_stm["/N"]): # type: ignore readNonWhitespace(stream_data) stream_data.seek(-1, 1) objnum = NumberObject.readFromStream(stream_data) @@ -716,21 +810,20 @@ def _getObjectFromStream(self, indirectReference): continue if self.strict and idx != i: raise PdfReadError("Object is in wrong index.") - stream_data.seek(obj_stm["/First"] + offset, 0) + stream_data.seek(int(obj_stm["/First"] + offset), 0) # type: ignore try: obj = readObject(stream_data, self) - except PdfStreamError as e: + except PdfStreamError as exc: # Stream object cannot be read. Normally, a critical error, but # Adobe Reader doesn't complain, so continue (in strict mode?) - e = sys.exc_info()[1] warnings.warn( "Invalid stream (index %d) within object %d %d: %s" - % (i, indirectReference.idnum, indirectReference.generation, e), + % (i, indirectReference.idnum, indirectReference.generation, exc), PdfReadWarning, ) if self.strict: - raise PdfReadError("Can't read object stream: %s" % e) + raise PdfReadError("Can't read object stream: %s" % exc) # Replace with null. Hopefully it's nothing important. obj = NullObject() return obj @@ -739,7 +832,7 @@ def _getObjectFromStream(self, indirectReference): raise PdfReadError("This is a fatal error in strict mode.") return NullObject() - def getObject(self, indirectReference): + def getObject(self, indirectReference: IndirectObject) -> Optional[PdfObject]: retval = self.cacheGetIndirectObject( indirectReference.generation, indirectReference.idnum ) @@ -749,7 +842,7 @@ def getObject(self, indirectReference): indirectReference.generation == 0 and indirectReference.idnum in self.xref_objStm ): - retval = self._getObjectFromStream(indirectReference) + retval = self._getObjectFromStream(indirectReference) # type: ignore elif ( indirectReference.generation in self.xref and indirectReference.idnum in self.xref[indirectReference.generation] @@ -784,7 +877,7 @@ def getObject(self, indirectReference): ) if self.strict: assert generation == indirectReference.generation - retval = readObject(self.stream, self) + retval = readObject(self.stream, self) # type: ignore # override encryption is used for the /Encrypt dictionary if not self._override_encryption and self.isEncrypted: @@ -798,7 +891,7 @@ def getObject(self, indirectReference): assert len(key) == (len(self._decryption_key) + 5) md5_hash = md5(key).digest() key = md5_hash[: min(16, len(self._decryption_key) + 5)] - retval = self._decryptObject(retval, key) + retval = self._decryptObject(retval, key) # type: ignore else: warnings.warn( "Object %d %d not defined." @@ -812,7 +905,23 @@ def getObject(self, indirectReference): ) return retval - def _decryptObject(self, obj, key): + def _decryptObject( + self, + obj: Union[ + ArrayObject, + BooleanObject, + ByteStringObject, + DictionaryObject, + FloatObject, + IndirectObject, + NameObject, + NullObject, + NumberObject, + StreamObject, + TextStringObject, + ], + key: Union[str, bytes], + ) -> PdfObject: if isinstance(obj, (ByteStringObject, TextStringObject)): obj = createStringObject(utils.RC4_encrypt(key, obj.original_bytes)) elif isinstance(obj, StreamObject): @@ -825,7 +934,7 @@ def _decryptObject(self, obj, key): obj[i] = self._decryptObject(obj[i], key) return obj - def readObjectHeader(self, stream): + def readObjectHeader(self, stream: StreamType) -> Tuple[int, int]: # Should never be necessary to read out whitespace, since the # cross-reference table should put us in the right spot to read the # object header. In reality... some files have stupid cross reference @@ -849,16 +958,20 @@ def readObjectHeader(self, stream): if extra and self.strict: warnings.warn( "Superfluous whitespace found in object header %s %s" - % (idnum, generation), + % (idnum, generation), # type: ignore PdfReadWarning, ) return int(idnum), int(generation) - def cacheGetIndirectObject(self, generation, idnum): + def cacheGetIndirectObject( + self, generation: int, idnum: int + ) -> Optional[PdfObject]: out = self.resolvedObjects.get((generation, idnum)) return out - def cacheIndirectObject(self, generation, idnum, obj): + def cacheIndirectObject( + self, generation: int, idnum: int, obj: Optional[PdfObject] + ) -> Optional[PdfObject]: if (generation, idnum) in self.resolvedObjects: msg = f"Overwriting cache for {generation} {idnum}" if self.strict: @@ -868,7 +981,7 @@ def cacheIndirectObject(self, generation, idnum, obj): self.resolvedObjects[(generation, idnum)] = obj return obj - def read(self, stream): + def read(self, stream: StreamType) -> None: # start at the end: stream.seek(-1, 2) if not stream.tell(): @@ -903,8 +1016,8 @@ def read(self, stream): ) # read all cross reference tables and their trailers - self.xref = {} - self.xref_objStm = {} + self.xref: Dict[Any, Any] = {} + self.xref_objStm: Dict[Any, Any] = {} self.trailer = DictionaryObject() while True: # load the xref table @@ -914,8 +1027,8 @@ def read(self, stream): self._read_standard_xref_table(stream) readNonWhitespace(stream) stream.seek(-1, 1) - new_trailer = readObject(stream, self) - for key, value in list(new_trailer.items()): + new_trailer = cast(Dict[str, Any], readObject(stream, self)) + for key, value in new_trailer.items(): if key not in self.trailer: self.trailer[key] = value if "/Prev" in new_trailer: @@ -936,7 +1049,7 @@ def read(self, stream): if key in xrefstream and key not in self.trailer: self.trailer[NameObject(key)] = xrefstream.raw_get(key) if "/Prev" in xrefstream: - startxref = xrefstream["/Prev"] + startxref = cast(int, xrefstream["/Prev"]) else: break else: @@ -993,7 +1106,7 @@ def read(self, stream): # non-zero-index is actually correct stream.seek(loc, 0) # return to where it was - def _find_startxref_pos(self, stream): + def _find_startxref_pos(self, stream: StreamType) -> int: """Find startxref entry - the location of the xref table""" line = self.readNextEndLine(stream) try: @@ -1010,7 +1123,7 @@ def _find_startxref_pos(self, stream): raise PdfReadError("startxref not found") return startxref - def _read_standard_xref_table(self, stream): + def _read_standard_xref_table(self, stream: StreamType) -> None: # standard cross-reference table ref = stream.read(4) if ref[:3] != b_("ref"): @@ -1019,7 +1132,7 @@ def _read_standard_xref_table(self, stream): stream.seek(-1, 1) firsttime = True # check if the first time looking at the xref table while True: - num = readObject(stream, self) + num = cast(int, readObject(stream, self)) if firsttime and num != 0: self.xrefIndex = num if self.strict: @@ -1032,7 +1145,7 @@ def _read_standard_xref_table(self, stream): firsttime = False readNonWhitespace(stream) stream.seek(-1, 1) - size = readObject(stream, self) + size = cast(int, readObject(stream, self)) readNonWhitespace(stream) stream.seek(-1, 1) cnt = 0 @@ -1058,8 +1171,8 @@ def _read_standard_xref_table(self, stream): if line[-1] in b_("0123456789t"): stream.seek(-1, 1) - offset, generation = line[:16].split(b_(" ")) - offset, generation = int(offset), int(generation) + offset_b, generation_b = line[:16].split(b_(" ")) + offset, generation = int(offset_b), int(generation_b) if generation not in self.xref: self.xref[generation] = {} if num in self.xref[generation]: @@ -1081,23 +1194,25 @@ def _read_standard_xref_table(self, stream): else: break - def _read_pdf15_xref_stream(self, stream): + def _read_pdf15_xref_stream( + self, stream: StreamType + ) -> Union[ContentStream, EncodedStreamObject, DecodedStreamObject]: # PDF 1.5+ Cross-Reference Stream stream.seek(-1, 1) idnum, generation = self.readObjectHeader(stream) - xrefstream = readObject(stream, self) + xrefstream = cast(ContentStream, readObject(stream, self)) assert xrefstream["/Type"] == "/XRef" self.cacheIndirectObject(generation, idnum, xrefstream) stream_data = BytesIO(b_(xrefstream.getData())) # Index pairs specify the subsections in the dictionary. If # none create one subsection that spans everything. idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) - entry_sizes = xrefstream.get("/W") + entry_sizes = cast(Dict[Any, Any], xrefstream.get("/W")) assert len(entry_sizes) >= 3 if self.strict and len(entry_sizes) > 3: raise PdfReadError("Too many entry sizes: %s" % entry_sizes) - def get_entry(i): + def get_entry(i: int) -> Union[int, Tuple[int, ...]]: # Reads the correct number of bytes for each entry. See the # discussion of the W parameter in PDF spec table 17. if entry_sizes[i] > 0: @@ -1111,7 +1226,7 @@ def get_entry(i): else: return 0 - def used_before(num, generation): + def used_before(num: int, generation: Union[int, Tuple[int, ...]]) -> bool: # We move backwards through the xrefs, don't replace any. return num in self.xref.get(generation, []) or num in self.xref_objStm @@ -1120,7 +1235,7 @@ def used_before(num, generation): return xrefstream @staticmethod - def _get_xref_issues(stream, startxref): + def _get_xref_issues(stream: StreamType, startxref: int) -> int: """Return an int which indicates an issue. 0 means there is no issue.""" stream.seek(startxref - 1, 0) # -1 to check character before line = stream.read(1) @@ -1144,7 +1259,7 @@ def _get_xref_issues(stream, startxref): # return 4 return 0 - def _rebuild_xref_table(self, stream): + def _rebuild_xref_table(self, stream: StreamType) -> None: self.xref = {} stream.seek(0, 0) f_ = stream.read(-1) @@ -1163,13 +1278,18 @@ def _rebuild_xref_table(self, stream): stream.seek(-1, 1) # there might be something that is not a dict (see #856) - new_trailer = readObject(stream, self) + new_trailer = cast(Dict[Any, Any], readObject(stream, self)) for key, value in list(new_trailer.items()): if key not in self.trailer: self.trailer[key] = value - def _read_xref_subsections(self, idx_pairs, getEntry, used_before): + def _read_xref_subsections( + self, + idx_pairs: List[int], + getEntry: Callable[[int], Union[int, Tuple[int, ...]]], + used_before: Callable[[int, Union[int, Tuple[int, ...]]], bool], + ) -> None: last_end = 0 for start, size in self._pairs(idx_pairs): # The subsections must increase @@ -1201,12 +1321,12 @@ def _read_xref_subsections(self, idx_pairs, getEntry, used_before): elif self.strict: raise PdfReadError("Unknown xref type: %s" % xref_type) - def _zeroXref(self, generation): + def _zeroXref(self, generation: int) -> None: self.xref[generation] = { k - self.xrefIndex: v for (k, v) in list(self.xref[generation].items()) } - def _pairs(self, array): + def _pairs(self, array: List[int]) -> Iterable[Tuple[int, int]]: i = 0 while True: yield array[i], array[i + 1] @@ -1214,7 +1334,7 @@ def _pairs(self, array): if (i + 1) >= len(array): break - def readNextEndLine(self, stream, limit_offset=0): + def readNextEndLine(self, stream: StreamType, limit_offset: int = 0) -> bytes: line_parts = [] while True: # Prevent infinite loops in malformed PDFs @@ -1243,7 +1363,7 @@ def readNextEndLine(self, stream, limit_offset=0): line_parts.reverse() return b"".join(line_parts) - def decrypt(self, password): + def decrypt(self, password: Union[str, bytes]) -> int: """ When using an encrypted / secured PDF file with the PDF Standard encryption handler, this function will allow the file to be decrypted. @@ -1269,7 +1389,7 @@ def decrypt(self, password): finally: self._override_encryption = False - def decode_permissions(self, permissions_code): + def decode_permissions(self, permissions_code: int) -> Dict[str, bool]: # Takes the permissions as an integer, returns the allowed access permissions = {} permissions["print"] = permissions_code & (1 << 3 - 1) != 0 # bit 3 @@ -1284,11 +1404,11 @@ def decode_permissions(self, permissions_code): ) # bit 12 return permissions - def _decrypt(self, password): + def _decrypt(self, password: Union[str, bytes]) -> int: # Decrypts data as per Section 3.5 (page 117) of PDF spec v1.7 # "The security handler defines the use of encryption and decryption in # the document, using the rules specified by the CF, StmF, and StrF entries" - encrypt = self.trailer[TK.ENCRYPT].getObject() + encrypt = cast(DictionaryObject, self.trailer[TK.ENCRYPT].getObject()) # /Encrypt Keys: # Filter (name) : "name of the preferred security handler " # V (number) : Algorithm Code @@ -1303,23 +1423,24 @@ def _decrypt(self, password): raise NotImplementedError( "only Standard PDF encryption handler is available" ) - if not (encrypt["/V"] in (1, 2)): + encrypt_v = cast(int, encrypt["/V"]) + if encrypt_v not in (1, 2): raise NotImplementedError( "only algorithm code 1 and 2 are supported. This PDF uses code %s" - % encrypt["/V"] + % encrypt_v ) user_password, key = self._authenticateUserPassword(password) if user_password: self._decryption_key = key return 1 else: - rev = encrypt["/R"].getObject() + rev = cast(int, encrypt["/R"].getObject()) if rev == 2: keylen = 5 else: - keylen = encrypt[SA.LENGTH].getObject() // 8 + keylen = cast(int, encrypt[SA.LENGTH].getObject()) // 8 key = _alg33_1(password, rev, keylen) - real_O = encrypt["/O"].getObject() + real_O = cast(bytes, encrypt["/O"].getObject()) if rev == 2: userpass = utils.RC4_encrypt(key, real_O) else: @@ -1336,40 +1457,46 @@ def _decrypt(self, password): return 2 return 0 - def _authenticateUserPassword(self, password): - encrypt = self.trailer[TK.ENCRYPT].getObject() - rev = encrypt[ED.R].getObject() - owner_entry = encrypt[ED.O].getObject() - p_entry = encrypt[ED.P].getObject() + def _authenticateUserPassword( + self, password: Union[str, bytes] + ) -> Tuple[bool, bytes]: + encrypt = cast(Optional[DictionaryObject], self.trailer[TK.ENCRYPT].getObject()) + if encrypt is None: + raise Exception( + "_authenticateUserPassword was called on unencrypted document" + ) + rev = cast(int, encrypt[ED.R].getObject()) + owner_entry = cast(ByteStringObject, encrypt[ED.O].getObject()) + p_entry = cast(int, encrypt[ED.P].getObject()) if TK.ID in self.trailer: - id_entry = self.trailer[TK.ID].getObject() + id_entry = cast(ArrayObject, self.trailer[TK.ID].getObject()) else: # Some documents may not have a /ID, use two empty # byte strings instead. Solves # https://github.com/mstamy2/PyPDF2/issues/608 id_entry = ArrayObject([ByteStringObject(b""), ByteStringObject(b"")]) id1_entry = id_entry[0].getObject() - real_U = encrypt[ED.U].getObject().original_bytes + real_U = encrypt[ED.U].getObject().original_bytes # type: ignore if rev == 2: U, key = _alg34(password, owner_entry, p_entry, id1_entry) elif rev >= 3: U, key = _alg35( password, rev, - encrypt[SA.LENGTH].getObject() // 8, + encrypt[SA.LENGTH].getObject() // 8, # type: ignore owner_entry, p_entry, id1_entry, - encrypt.get(ED.ENCRYPT_METADATA, BooleanObject(False)).getObject(), + encrypt.get(ED.ENCRYPT_METADATA, BooleanObject(False)).getObject(), # type: ignore ) U, real_U = U[:16], real_U[:16] return U == real_U, key - def getIsEncrypted(self): + def getIsEncrypted(self) -> bool: return TK.ENCRYPT in self.trailer @property - def isEncrypted(self): + def isEncrypted(self) -> bool: """ Read-only boolean property showing whether this PDF file is encrypted. Note that this property, if true, will remain true even after the diff --git a/PyPDF2/_security.py b/PyPDF2/_security.py index 347dcb99e..d2c8b7c2f 100644 --- a/PyPDF2/_security.py +++ b/PyPDF2/_security.py @@ -31,8 +31,10 @@ import struct from hashlib import md5 +from typing import Any, Tuple, Union from PyPDF2 import utils +from PyPDF2.generic import BooleanObject, ByteStringObject from PyPDF2.utils import b_, ord_, str_ # ref: pdf1.8 spec section 3.5.2 algorithm 3.2 @@ -46,24 +48,30 @@ # Implementation of algorithm 3.2 of the PDF standard security handler, # section 3.5.2 of the PDF 1.6 reference. def _alg32( - password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True -): + password: Union[str, bytes], + rev: Any, + keylen: Any, + owner_entry: ByteStringObject, + p_entry: int, + id1_entry: ByteStringObject, + metadata_encrypt: Union[BooleanObject, bool] = True, +) -> bytes: # 1. Pad or truncate the password string to exactly 32 bytes. If the # password string is more than 32 bytes long, use only its first 32 bytes; # if it is less than 32 bytes long, pad it by appending the required number # of additional bytes from the beginning of the padding string # (_encryption_padding). - password = b_((str_(password) + str_(_encryption_padding))[:32]) + password_bytes = b_((str_(password) + str_(_encryption_padding))[:32]) # 2. Initialize the MD5 hash function and pass the result of step 1 as # input to this function. - m = md5(password) + m = md5(password_bytes) # 3. Pass the value of the encryption dictionary's /O entry to the MD5 hash # function. m.update(owner_entry.original_bytes) # 4. Treat the value of the /P entry as an unsigned 4-byte integer and pass # these bytes to the MD5 hash function, low-order byte first. - p_entry = struct.pack(" bytes: # steps 1 - 4 key = _alg33_1(owner_pwd, rev, keylen) # 5. Pad or truncate the user password string as described in step 1 of # algorithm 3.2. - user_pwd = b_((user_pwd + str_(_encryption_padding))[:32]) + user_pwd_bytes = b_((user_pwd + str_(_encryption_padding))[:32]) # 6. Encrypt the result of step 5, using an RC4 encryption function with # the encryption key obtained in step 4. - val = utils.RC4_encrypt(key, user_pwd) + val = utils.RC4_encrypt(key, user_pwd_bytes) # 7. (Revision 3 or greater) Do the following 19 times: Take the output # from the previous invocation of the RC4 function and pass it as input to # a new invocation of the function; use an encryption key generated by @@ -117,14 +125,16 @@ def _alg33(owner_pwd, user_pwd, rev, keylen): # Steps 1-4 of algorithm 3.3 -def _alg33_1(password, rev, keylen): +def _alg33_1(password: Union[bytes, str], rev: int, keylen: int) -> bytes: # 1. Pad or truncate the owner password string as described in step 1 of # algorithm 3.2. If there is no owner password, use the user password # instead. - password = b_((password + str_(_encryption_padding))[:32]) + if isinstance(password, bytes): + password = password.decode() + password_bytes = b_((password + str_(_encryption_padding))[:32]) # 2. Initialize the MD5 hash function and pass the result of step 1 as # input to this function. - m = md5(password) + m = md5(password_bytes) # 3. (Revision 3 or greater) Do the following 50 times: Take the output # from the previous MD5 hash and pass it as input into a new MD5 hash. md5_hash = m.digest() @@ -141,7 +151,12 @@ def _alg33_1(password, rev, keylen): # Implementation of algorithm 3.4 of the PDF standard security handler, # section 3.5.2 of the PDF 1.6 reference. -def _alg34(password, owner_entry, p_entry, id1_entry): +def _alg34( + password: Union[str, bytes], + owner_entry: ByteStringObject, + p_entry: int, + id1_entry: ByteStringObject, +) -> Tuple[bytes, bytes]: # 1. Create an encryption key based on the user password string, as # described in algorithm 3.2. key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry) @@ -156,7 +171,15 @@ def _alg34(password, owner_entry, p_entry, id1_entry): # Implementation of algorithm 3.4 of the PDF standard security handler, # section 3.5.2 of the PDF 1.6 reference. -def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt): +def _alg35( + password: Union[str, bytes], + rev: int, + keylen: int, + owner_entry: ByteStringObject, + p_entry: int, + id1_entry: ByteStringObject, + metadata_encrypt: Union[BooleanObject, bool], +) -> Tuple[bytes, bytes]: # 1. Create an encryption key based on the user password string, as # described in Algorithm 3.2. key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry) diff --git a/PyPDF2/_writer.py b/PyPDF2/_writer.py index b2ba8c107..122df7e42 100644 --- a/PyPDF2/_writer.py +++ b/PyPDF2/_writer.py @@ -33,8 +33,10 @@ import uuid import warnings from hashlib import md5 +from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast from PyPDF2._page import PageObject +from PyPDF2._reader import PdfFileReader from PyPDF2._security import _alg33, _alg34, _alg35 from PyPDF2.constants import CatalogAttributes as CA from PyPDF2.constants import Core as CO @@ -56,13 +58,23 @@ NameObject, NullObject, NumberObject, + PdfObject, RectangleObject, StreamObject, TextStringObject, TreeObject, createStringObject, ) -from PyPDF2.utils import b_ +from PyPDF2.types import ( + BookmarkTypes, + BorderArrayType, + FitType, + LayoutType, + PagemodeType, + ZoomArgsType, + ZoomArgType, +) +from PyPDF2.utils import StreamType, b_ logger = logging.getLogger(__name__) @@ -73,9 +85,9 @@ class PdfFileWriter: class (typically :class:`PdfFileReader`). """ - def __init__(self): + def __init__(self) -> None: self._header = b_("%PDF-1.3") - self._objects = [] # array of indirect objects + self._objects: List[Optional[PdfObject]] = [] # array of indirect objects # The root of our page tree node. pages = DictionaryObject() @@ -107,28 +119,31 @@ def __init__(self): NameObject(CO.PAGES): self._pages, } ) - self._root = None + self._root: Optional[IndirectObject] = None self._root_object = root self.set_need_appearances_writer() - def _addObject(self, obj): + def _addObject(self, obj: Optional[PdfObject]) -> IndirectObject: self._objects.append(obj) return IndirectObject(len(self._objects), 0, self) - def getObject(self, ido): + def getObject(self, ido: IndirectObject) -> PdfObject: if ido.pdf != self: raise ValueError("pdf must be self") - return self._objects[ido.idnum - 1] + return self._objects[ido.idnum - 1] # type: ignore - def _addPage(self, page, action): + def _addPage( + self, page: PageObject, action: Callable[[Any, IndirectObject], None] + ) -> None: assert page[PA.TYPE] == CO.PAGE page[NameObject(PA.PARENT)] = self._pages - page = self._addObject(page) - pages = self.getObject(self._pages) - action(pages[PA.KIDS], page) - pages[NameObject(PA.COUNT)] = NumberObject(pages[PA.COUNT] + 1) + page_ind = self._addObject(page) + pages = cast(DictionaryObject, self.getObject(self._pages)) + action(pages[PA.KIDS], page_ind) + page_count = cast(int, pages[PA.COUNT]) + pages[NameObject(PA.COUNT)] = NumberObject(page_count + 1) - def set_need_appearances_writer(self): + def set_need_appearances_writer(self) -> None: # See 12.7.2 and 7.7.2 for more information: # http://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf try: @@ -144,12 +159,12 @@ def set_need_appearances_writer(self): ) need_appearances = NameObject("/NeedAppearances") - self._root_object["/AcroForm"][need_appearances] = BooleanObject(True) + self._root_object["/AcroForm"][need_appearances] = BooleanObject(True) # type: ignore - except Exception as e: - logger.error("set_need_appearances_writer() catch : ", repr(e)) + except Exception as exc: + logger.error("set_need_appearances_writer() catch : ", repr(exc)) - def addPage(self, page): + def addPage(self, page: PageObject) -> None: """ Add a page to this PDF file. The page is usually acquired from a :class:`PdfFileReader` instance. @@ -159,7 +174,7 @@ def addPage(self, page): """ self._addPage(page, list.append) - def insertPage(self, page, index=0): + def insertPage(self, page: PageObject, index: int = 0) -> None: """ Insert a page in this PDF file. The page is usually acquired from a :class:`PdfFileReader` instance. @@ -170,7 +185,7 @@ def insertPage(self, page, index=0): """ self._addPage(page, lambda l, p: l.insert(index, p)) - def getPage(self, pageNumber): + def getPage(self, pageNumber: int) -> PageObject: """ Retrieve a page by number from this PDF file. @@ -179,19 +194,21 @@ def getPage(self, pageNumber): :return: the page at the index given by *pageNumber* :rtype: :class:`PageObject` """ - pages = self.getObject(self._pages) + pages = cast(Dict[str, Any], self.getObject(self._pages)) # XXX: crude hack return pages[PA.KIDS][pageNumber].getObject() - def getNumPages(self): + def getNumPages(self) -> int: """ :return: the number of pages. :rtype: int """ - pages = self.getObject(self._pages) + pages = cast(Dict[str, Any], self.getObject(self._pages)) return int(pages[NameObject("/Count")]) - def addBlankPage(self, width=None, height=None): + def addBlankPage( + self, width: Optional[float] = None, height: Optional[float] = None + ) -> PageObject: """ Append a blank page to this PDF file and returns it. If no page size is specified, use the size of the last page. @@ -209,7 +226,12 @@ def addBlankPage(self, width=None, height=None): self.addPage(page) return page - def insertBlankPage(self, width=None, height=None, index=0): + def insertBlankPage( + self, + width: Optional[float] = None, + height: Optional[float] = None, + index: int = 0, + ) -> PageObject: """ Insert a blank page to this PDF file and returns it. If no page size is specified, use the size of the last page. @@ -232,7 +254,7 @@ def insertBlankPage(self, width=None, height=None, index=0): self.insertPage(page, index) return page - def addJS(self, javascript): + def addJS(self, javascript: str) -> None: """ Add Javascript which will launch upon opening this PDF. @@ -275,7 +297,7 @@ def addJS(self, javascript): } ) - def addAttachment(self, fname, fdata): + def addAttachment(self, fname: str, fdata: Union[str, bytes]) -> None: """ Embed a file inside the PDF. @@ -354,7 +376,11 @@ def addAttachment(self, fname, fdata): # Update the root self._root_object.update({NameObject(CA.NAMES): embeddedFilesDictionary}) - def appendPagesFromReader(self, reader, after_page_append=None): + def appendPagesFromReader( + self, + reader: PdfFileReader, + after_page_append: Optional[Callable[[PageObject], None]] = None, + ) -> None: """ Copy pages from reader to writer. Includes an optional callback parameter which is invoked after pages are appended to the writer. @@ -372,7 +398,7 @@ def appendPagesFromReader(self, reader, after_page_append=None): writer_num_pages = self.getNumPages() # Copy pages from reader to writer - for rpagenum in range(0, reader_num_pages): + for rpagenum in range(reader_num_pages): reader_page = reader.getPage(rpagenum) self.addPage(reader_page) writer_page = self.getPage(writer_num_pages + rpagenum) @@ -380,7 +406,9 @@ def appendPagesFromReader(self, reader, after_page_append=None): if callable(after_page_append): after_page_append(writer_page) - def updatePageFormFieldValues(self, page, fields, flags=0): + def updatePageFormFieldValues( + self, page: PageObject, fields: Dict[str, Any], flags: int = 0 + ) -> None: """ Update the form field values for a given page from a fields dictionary. Copy field texts and values from fields to page. @@ -395,8 +423,8 @@ def updatePageFormFieldValues(self, page, fields, flags=0): PDF Reference Table 8.70 for details. """ # Iterate through pages, update field values - for j in range(len(page[PG.ANNOTS])): - writer_annot = page[PG.ANNOTS][j].getObject() + for j in range(len(page[PG.ANNOTS])): # type: ignore + writer_annot = page[PG.ANNOTS][j].getObject() # type: ignore # retrieve parent field values, if present writer_parent_annot = {} # fallback if it's not there if PG.PARENT in writer_annot: @@ -413,16 +441,20 @@ def updatePageFormFieldValues(self, page, fields, flags=0): {NameObject("/V"): TextStringObject(fields[field])} ) - def cloneReaderDocumentRoot(self, reader): + def cloneReaderDocumentRoot(self, reader: PdfFileReader) -> None: """ Copy the reader document root to the writer. :param reader: PdfFileReader from the document root should be copied. :callback after_page_append: """ - self._root_object = reader.trailer[TK.ROOT] + self._root_object = cast(DictionaryObject, reader.trailer[TK.ROOT]) - def cloneDocumentFromReader(self, reader, after_page_append=None): + def cloneDocumentFromReader( + self, + reader: PdfFileReader, + after_page_append: Optional[Callable[[PageObject], None]] = None, + ) -> None: """ Create a copy (clone) of a document from a PDF file reader @@ -438,7 +470,13 @@ def cloneDocumentFromReader(self, reader, after_page_append=None): self.cloneReaderDocumentRoot(reader) self.appendPagesFromReader(reader, after_page_append) - def encrypt(self, user_pwd, owner_pwd=None, use_128bit=True, permissions_flag=-1): + def encrypt( + self, + user_pwd: str, + owner_pwd: Optional[str] = None, + use_128bit: bool = True, + permissions_flag: int = -1, + ) -> None: """ Encrypt this PDF file with the PDF Standard encryption handler. @@ -493,7 +531,7 @@ def encrypt(self, user_pwd, owner_pwd=None, use_128bit=True, permissions_flag=-1 self._encrypt = self._addObject(encrypt) self._encrypt_key = key - def write(self, stream): + def write(self, stream: StreamType) -> None: """ Write the collection of pages added to this object out as a PDF file. @@ -502,14 +540,16 @@ def write(self, stream): """ if hasattr(stream, "mode") and "b" not in stream.mode: warnings.warn( - "File <%s> to write to is not in binary mode. It may not be written to correctly." - % stream.name + ( + "File <{}> to write to is not in binary mode. " # type: ignore + "It may not be written to correctly." + ).format(stream.name) ) if not self._root: self._root = self._addObject(self._root_object) - external_reference_map = {} + external_reference_map: Dict[Any, Any] = {} # PDF objects sometimes have circular references to their /Page objects # inside their object tree (for example, annotations). Those will be @@ -531,7 +571,7 @@ def write(self, stream): data.idnum ] = IndirectObject(obj_index + 1, 0, self) - self.stack = [] + self.stack: List[int] = [] self._sweepIndirectReferences(external_reference_map, self._root) del self.stack @@ -540,7 +580,7 @@ def write(self, stream): self._write_trailer(stream) stream.write(b_("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))) # eof - def _write_header(self, stream): + def _write_header(self, stream: StreamType) -> List[int]: object_positions = [] stream.write(self._header + b_("\n")) stream.write(b_("%\xE2\xE3\xCF\xD3\n")) @@ -563,7 +603,7 @@ def _write_header(self, stream): stream.write(b_("\nendobj\n")) return object_positions - def _write_xref_table(self, stream, object_positions): + def _write_xref_table(self, stream: StreamType, object_positions: List[int]) -> int: xref_location = stream.tell() stream.write(b_("xref\n")) stream.write(b_("0 %s\n" % (len(self._objects) + 1))) @@ -572,7 +612,7 @@ def _write_xref_table(self, stream, object_positions): stream.write(b_("%010d %05d n \n" % (offset, 0))) return xref_location - def _write_trailer(self, stream): + def _write_trailer(self, stream: StreamType) -> None: stream.write(b_("trailer\n")) trailer = DictionaryObject() trailer.update( @@ -588,7 +628,7 @@ def _write_trailer(self, stream): trailer[NameObject(TK.ENCRYPT)] = self._encrypt trailer.writeToStream(stream, None) - def addMetadata(self, infos): + def addMetadata(self, infos: Dict[str, Any]) -> None: """ Add custom metadata to the output. @@ -598,9 +638,24 @@ def addMetadata(self, infos): args = {} for key, value in list(infos.items()): args[NameObject(key)] = createStringObject(value) - self.getObject(self._info).update(args) + self.getObject(self._info).update(args) # type: ignore - def _sweepIndirectReferences(self, externMap, data): + def _sweepIndirectReferences( + self, + externMap: Dict[Any, Any], + data: Union[ + ArrayObject, + BooleanObject, + DictionaryObject, + FloatObject, + IndirectObject, + NameObject, + PdfObject, + NumberObject, + TextStringObject, + NullObject, + ], + ) -> Union[Any, StreamObject]: if isinstance(data, DictionaryObject): for key, value in list(data.items()): value = self._sweepIndirectReferences(externMap, value) @@ -665,15 +720,16 @@ def _sweepIndirectReferences(self, externMap, data): else: return data - def getReference(self, obj): + def getReference(self, obj: PdfObject) -> IndirectObject: idnum = self._objects.index(obj) + 1 ref = IndirectObject(idnum, 0, self) assert ref.getObject() == obj return ref - def getOutlineRoot(self): + def getOutlineRoot(self) -> TreeObject: if CO.OUTLINES in self._root_object: - outline = self._root_object[CO.OUTLINES] + # TABLE 3.25 Entries in the catalog dictionary + outline = cast(TreeObject, self._root_object[CO.OUTLINES]) idnum = self._objects.index(outline) + 1 outline_ref = IndirectObject(idnum, 0, self) assert outline_ref.getObject() == outline @@ -685,21 +741,23 @@ def getOutlineRoot(self): return outline - def getNamedDestRoot(self): + def getNamedDestRoot(self) -> ArrayObject: if CA.NAMES in self._root_object and isinstance( self._root_object[CA.NAMES], DictionaryObject ): - names = self._root_object[CA.NAMES] + names = cast(DictionaryObject, self._root_object[CA.NAMES]) idnum = self._objects.index(names) + 1 names_ref = IndirectObject(idnum, 0, self) assert names_ref.getObject() == names if CA.DESTS in names and isinstance(names[CA.DESTS], DictionaryObject): - dests = names[CA.DESTS] + # 3.6.3 Name Dictionary (PDF spec 1.7) + dests = cast(DictionaryObject, names[CA.DESTS]) idnum = self._objects.index(dests) + 1 dests_ref = IndirectObject(idnum, 0, self) assert dests_ref.getObject() == dests if CA.NAMES in dests: - nd = dests[CA.NAMES] + # TABLE 3.33 Entries in a name tree node dictionary + nd = cast(ArrayObject, dests[CA.NAMES]) else: nd = ArrayObject() dests[NameObject(CA.NAMES)] = nd @@ -722,7 +780,9 @@ def getNamedDestRoot(self): return nd - def addBookmarkDestination(self, dest, parent=None): + def addBookmarkDestination( + self, dest: PageObject, parent: Optional[TreeObject] = None + ) -> IndirectObject: dest_ref = self._addObject(dest) outline_ref = self.getOutlineRoot() @@ -730,12 +790,14 @@ def addBookmarkDestination(self, dest, parent=None): if parent is None: parent = outline_ref - parent = parent.getObject() + parent = cast(TreeObject, parent.getObject()) parent.addChild(dest_ref, self) return dest_ref - def addBookmarkDict(self, bookmark, parent=None): + def addBookmarkDict( + self, bookmark: BookmarkTypes, parent: Optional[TreeObject] = None + ) -> IndirectObject: bookmark_obj = TreeObject() for k, v in list(bookmark.items()): bookmark_obj[NameObject(str(k))] = v @@ -743,7 +805,8 @@ def addBookmarkDict(self, bookmark, parent=None): if "/A" in bookmark: action = DictionaryObject() - for k, v in list(bookmark["/A"].items()): + a_dict = cast(DictionaryObject, bookmark["/A"]) + for k, v in list(a_dict.items()): action[NameObject(str(k))] = v action_ref = self._addObject(action) bookmark_obj[NameObject("/A")] = action_ref @@ -755,22 +818,23 @@ def addBookmarkDict(self, bookmark, parent=None): if parent is None: parent = outline_ref - parent = parent.getObject() + parent = parent.getObject() # type: ignore + assert parent is not None, "hint for mypy" parent.addChild(bookmark_ref, self) return bookmark_ref def addBookmark( self, - title, - pagenum, - parent=None, - color=None, - bold=False, - italic=False, - fit="/Fit", - *args, - ): + title: str, + pagenum: int, + parent: Union[None, TreeObject, IndirectObject] = None, + color: Optional[Tuple[float, float, float]] = None, + bold: bool = False, + italic: bool = False, + fit: FitType = "/Fit", + *args: ZoomArgsType, + ) -> IndirectObject: """ Add a bookmark to this PDF file. @@ -785,9 +849,10 @@ def addBookmark( :param str fit: The fit of the destination page. See :meth:`addLink()` for details. """ - page_ref = self.getObject(self._pages)[PA.KIDS][pagenum] + pages_obj = cast(Dict[str, Any], self.getObject(self._pages)) + page_ref = pages_obj[PA.KIDS][pagenum] action = DictionaryObject() - zoom_args = [] + zoom_args: ZoomArgsType = [] for a in args: if a is not None: zoom_args.append(NumberObject(a)) @@ -831,21 +896,22 @@ def addBookmark( bookmark_ref = self._addObject(bookmark) - parent = parent.getObject() - parent.addChild(bookmark_ref, self) + assert parent is not None, "hint for mypy" + parent_obj = cast(TreeObject, parent.getObject()) + parent_obj.addChild(bookmark_ref, self) return bookmark_ref - def addNamedDestinationObject(self, dest): + def addNamedDestinationObject(self, dest: PdfObject) -> IndirectObject: dest_ref = self._addObject(dest) nd = self.getNamedDestRoot() - nd.extend([dest["/Title"], dest_ref]) + nd.extend([dest["/Title"], dest_ref]) # type: ignore return dest_ref - def addNamedDestination(self, title, pagenum): - page_ref = self.getObject(self._pages)[PA.KIDS][pagenum] + def addNamedDestination(self, title: str, pagenum: int) -> IndirectObject: + page_ref = self.getObject(self._pages)[PA.KIDS][pagenum] # type: ignore dest = DictionaryObject() dest.update( { @@ -863,22 +929,24 @@ def addNamedDestination(self, title, pagenum): return dest_ref - def removeLinks(self): + def removeLinks(self) -> None: """Remove links and annotations from this output.""" - pages = self.getObject(self._pages)[PA.KIDS] + pg_dict = cast(DictionaryObject, self.getObject(self._pages)) + pages = cast(ArrayObject, pg_dict[PA.KIDS]) for page in pages: - page_ref = self.getObject(page) + page_ref = cast(DictionaryObject, self.getObject(page)) if PG.ANNOTS in page_ref: del page_ref[PG.ANNOTS] - def removeImages(self, ignoreByteStringObject=False): + def removeImages(self, ignoreByteStringObject: bool = False) -> None: """ Remove images from this output. :param bool ignoreByteStringObject: optional parameter to ignore ByteString Objects. """ - pages = self.getObject(self._pages)[PA.KIDS] + pg_dict = cast(DictionaryObject, self.getObject(self._pages)) + pages = cast(ArrayObject, pg_dict[PA.KIDS]) jump_operators = [ b_("cm"), b_("w"), @@ -908,7 +976,7 @@ def removeImages(self, ignoreByteStringObject=False): ] for j in range(len(pages)): page = pages[j] - page_ref = self.getObject(page) + page_ref = cast(DictionaryObject, self.getObject(page)) content = page_ref["/Contents"].getObject() if not isinstance(content, ContentStream): content = ContentStream(content, page_ref) @@ -947,17 +1015,18 @@ def removeImages(self, ignoreByteStringObject=False): content.operations = _operations page_ref.__setitem__(NameObject("/Contents"), content) - def removeText(self, ignoreByteStringObject=False): + def removeText(self, ignoreByteStringObject: bool = False) -> None: """ Remove text from this output. :param bool ignoreByteStringObject: optional parameter to ignore ByteString Objects. """ - pages = self.getObject(self._pages)[PA.KIDS] + pg_dict = cast(DictionaryObject, self.getObject(self._pages)) + pages = cast(List[IndirectObject], pg_dict[PA.KIDS]) for j in range(len(pages)): page = pages[j] - page_ref = self.getObject(page) + page_ref = cast(Dict[str, Any], self.getObject(page)) content = page_ref["/Contents"].getObject() if not isinstance(content, ContentStream): content = ContentStream(content, page_ref) @@ -991,7 +1060,13 @@ def removeText(self, ignoreByteStringObject=False): page_ref.__setitem__(NameObject("/Contents"), content) - def addURI(self, pagenum, uri, rect, border=None): + def addURI( + self, + pagenum: int, + uri: int, + rect: RectangleObject, + border: Optional[ArrayObject] = None, + ) -> None: """ Add an URI from a rectangular area to the specified page. This uses the basic structure of AddLink @@ -1009,9 +1084,10 @@ def addURI(self, pagenum, uri, rect, border=None): -John Mulligan """ - page_link = self.getObject(self._pages)[PA.KIDS][pagenum] - page_ref = self.getObject(page_link) + page_link = self.getObject(self._pages)[PA.KIDS][pagenum] # type: ignore + page_ref = cast(Dict[str, Any], self.getObject(page_link)) + border_arr: BorderArrayType if border is not None: border_arr = [NameObject(n) for n in border[:3]] if len(border) == 4: @@ -1053,7 +1129,15 @@ def addURI(self, pagenum, uri, rect, border=None): else: page_ref[NameObject(PG.ANNOTS)] = ArrayObject([lnk_ref]) - def addLink(self, pagenum, pagedest, rect, border=None, fit="/Fit", *args): + def addLink( + self, + pagenum: int, + pagedest: int, + rect: RectangleObject, + border: Optional[ArrayObject] = None, + fit: FitType = "/Fit", + *args: ZoomArgType, + ) -> None: """ Add an internal link from a rectangular area to the specified page. @@ -1088,13 +1172,12 @@ def addLink(self, pagenum, pagedest, rect, border=None, fit="/Fit", *args): * - /FitBV - [left] """ + pages_obj = cast(Dict[str, Any], self.getObject(self._pages)) + page_link = pages_obj[PA.KIDS][pagenum] + page_dest = pages_obj[PA.KIDS][pagedest] # TODO: switch for external link + page_ref = cast(Dict[str, Any], self.getObject(page_link)) - page_link = self.getObject(self._pages)[PA.KIDS][pagenum] - page_dest = self.getObject(self._pages)[PA.KIDS][ - pagedest - ] # TODO: switch for external link - page_ref = self.getObject(page_link) - + border_arr: BorderArrayType if border is not None: border_arr = [NameObject(n) for n in border[:3]] if len(border) == 4: @@ -1110,7 +1193,7 @@ def addLink(self, pagenum, pagedest, rect, border=None, fit="/Fit", *args): else: rect = RectangleObject(rect) - zoom_args = [] + zoom_args: ZoomArgsType = [] for a in args: if a is not None: zoom_args.append(NumberObject(a)) @@ -1149,7 +1232,7 @@ def addLink(self, pagenum, pagedest, rect, border=None, fit="/Fit", *args): "/TwoPageRight", ] - def getPageLayout(self): + def getPageLayout(self) -> Optional[LayoutType]: """ Get the page layout. @@ -1159,11 +1242,11 @@ def getPageLayout(self): :rtype: str, None if not specified """ try: - return self._root_object["/PageLayout"] + return cast(LayoutType, self._root_object["/PageLayout"]) except KeyError: return None - def setPageLayout(self, layout): + def setPageLayout(self, layout: Union[NameObject, LayoutType]) -> None: """ Set the page layout. @@ -1208,7 +1291,7 @@ def setPageLayout(self, layout): "/UseAttachments", ] - def getPageMode(self): + def getPageMode(self) -> Optional[PagemodeType]: """ Get the page mode. See :meth:`setPageMode()` for a description @@ -1218,11 +1301,11 @@ def getPageMode(self): :rtype: str, None if not specified. """ try: - return self._root_object["/PageMode"] + return cast(PagemodeType, self._root_object["/PageMode"]) except KeyError: return None - def setPageMode(self, mode): + def setPageMode(self, mode: Union[NameObject, PagemodeType]) -> None: """ Set the page mode. diff --git a/PyPDF2/constants.py b/PyPDF2/constants.py index de40148b2..a62aea1a4 100644 --- a/PyPDF2/constants.py +++ b/PyPDF2/constants.py @@ -204,6 +204,21 @@ class TypFitArguments: FIT_R = "/FitR" +class FieldDistionaryAttributes: + """TABLE 8.69 Entries common to all field dictionaries (PDF 1.7 reference)""" + + FT = "/FT" # name, required for terminal fields + Parent = "/Parent" # dictionary, required for children + Kids = "/Kids" # array, sometimes required + T = "/T" # text string, optional + TU = "/TU" # text string, optional + TM = "/TM" # text string, optional + Ff = "/Ff" # integer, optional + V = "/V" # text string, optional + DV = "/DV" # text string, optional + AA = "/AA" # dictionary, optional + + class DocumentInformationAttributes: """TABLE 10.2 Entries in the document information dictionary""" diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index 92353ddf1..dccfebe03 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -33,6 +33,9 @@ import math import struct from io import StringIO +from typing import Any, Dict, Optional, Tuple, Union + +from PyPDF2.generic import ArrayObject, DictionaryObject try: from typing import Literal # type: ignore[attr-defined] @@ -52,7 +55,7 @@ try: import zlib - def decompress(data): + def decompress(data: bytes) -> bytes: try: return zlib.decompress(data) except zlib.error: @@ -65,28 +68,28 @@ def decompress(data): pass return result_str - def compress(data): + def compress(data: bytes) -> bytes: return zlib.compress(data) except ImportError: # pragma: no cover # Unable to import zlib. Attempt to use the System.IO.Compression # library from the .NET framework. (IronPython only) import System # type: ignore[import] - from System import IO, Array # type: ignore[import] + from System import IO, Array - def _string_to_bytearr(buf): + def _string_to_bytearr(buf): # type: ignore[no-untyped-def] retval = Array.CreateInstance(System.Byte, len(buf)) for i in range(len(buf)): retval[i] = ord(buf[i]) return retval - def _bytearr_to_string(bytes): + def _bytearr_to_string(bytes) -> str: # type: ignore[no-untyped-def] retval = "" for i in range(bytes.Length): retval += chr(bytes[i]) return retval - def _read_bytes(stream): + def _read_bytes(stream): # type: ignore[no-untyped-def] ms = IO.MemoryStream() buf = Array.CreateInstance(System.Byte, 2048) while True: @@ -99,7 +102,7 @@ def _read_bytes(stream): ms.Close() return retval - def decompress(data): + def decompress(data): # type: ignore bytes = _string_to_bytearr(data) ms = IO.MemoryStream() ms.Write(bytes, 0, bytes.Length) @@ -110,7 +113,7 @@ def decompress(data): gz.Close() return retval - def compress(data): + def compress(data): # type: ignore bytes = _string_to_bytearr(data) ms = IO.MemoryStream() gz = IO.Compression.DeflateStream( @@ -127,20 +130,20 @@ def compress(data): class FlateDecode: @staticmethod - def decode(data, decodeParms): + def decode( + data: bytes, decodeParms: Union[None, ArrayObject, DictionaryObject] + ) -> bytes: """ :param data: flate-encoded data. :param decodeParms: a dictionary of values, understanding the "/Predictor": key only :return: the flate-decoded data. """ - data = decompress(data) + str_data = decompress(data) predictor = 1 if decodeParms: try: - from PyPDF2.generic import ArrayObject - if isinstance(decodeParms, ArrayObject): for decodeParm in decodeParms: if "/Predictor" in decodeParm: @@ -153,18 +156,24 @@ def decode(data, decodeParms): if predictor != 1: # The /Columns param. has 1 as the default value; see ISO 32000, # §7.4.4.3 LZWDecode and FlateDecode Parameters, Table 8 - columns = decodeParms.get(LZW.COLUMNS, 1) + if isinstance(decodeParms, ArrayObject): + columns = 1 + for decodeParm in decodeParms: + if "/Columns" in decodeParm: + columns = decodeParm["/Columns"] + else: + columns = 1 if decodeParms is None else decodeParms.get(LZW.COLUMNS, 1) # PNG prediction: if 10 <= predictor <= 15: - data = FlateDecode._decode_png_prediction(data, columns) + str_data = FlateDecode._decode_png_prediction(str_data, columns) # type: ignore else: # unsupported predictor raise PdfReadError("Unsupported flatedecode predictor %r" % predictor) - return data + return str_data @staticmethod - def _decode_png_prediction(data, columns): + def _decode_png_prediction(data: str, columns: int) -> str: output = StringIO() # PNG prediction can vary from row to row rowlength = columns + 1 @@ -198,12 +207,12 @@ def _decode_png_prediction(data, columns): else: # unsupported PNG filter raise PdfReadError("Unsupported PNG filter %r" % filter_byte) - prev_rowdata = rowdata + prev_rowdata = tuple(rowdata) output.write("".join([chr(x) for x in rowdata[1:]])) return output.getvalue() @staticmethod - def encode(data): + def encode(data: bytes) -> bytes: return compress(data) @@ -214,7 +223,9 @@ class ASCIIHexDecode: """ @staticmethod - def decode(data, decodeParms=None): + def decode( + data: str, decodeParms: Union[None, ArrayObject, DictionaryObject] = None + ) -> str: """ :param data: a str sequence of hexadecimal-encoded values to be converted into a base-7 ASCII string @@ -249,7 +260,7 @@ class LZWDecode: """ class decoder: - def __init__(self, data): + def __init__(self, data: bytes) -> None: self.STOP = 257 self.CLEARDICT = 256 self.data = data @@ -260,11 +271,11 @@ def __init__(self, data): self.dict[i] = chr(i) self.resetDict() - def resetDict(self): + def resetDict(self) -> None: self.dictlen = 258 self.bitspercode = 9 - def nextCode(self): + def nextCode(self) -> int: fillbits = self.bitspercode value = 0 while fillbits > 0: @@ -285,7 +296,7 @@ def nextCode(self): self.bytepos = self.bytepos + 1 return value - def decode(self): + def decode(self) -> str: """ TIFF 6.0 specification explains in sufficient details the steps to implement the LZW encode() and decode() algorithms. @@ -328,7 +339,9 @@ def decode(self): return baos @staticmethod - def decode(data, decodeParms=None): + def decode( + data: bytes, decodeParms: Union[None, ArrayObject, DictionaryObject] = None + ) -> str: """ :param data: ``bytes`` or ``str`` text to decode. :param decodeParms: a dictionary of parameter values. @@ -342,7 +355,10 @@ class ASCII85Decode: """Decodes string ASCII85-encoded data into a byte format.""" @staticmethod - def decode(data, decodeParms=None): + def decode( + data: Union[str, bytes], + decodeParms: Union[None, ArrayObject, DictionaryObject] = None, + ) -> bytes: if isinstance(data, str): data = data.encode("ascii") group_index = b = 0 @@ -368,20 +384,24 @@ def decode(data, decodeParms=None): class DCTDecode: @staticmethod - def decode(data, decodeParms=None): + def decode( + data: bytes, decodeParms: Union[None, ArrayObject, DictionaryObject] = None + ) -> bytes: return data class JPXDecode: @staticmethod - def decode(data, decodeParms=None): + def decode( + data: bytes, decodeParms: Union[None, ArrayObject, DictionaryObject] = None + ) -> bytes: return data class CCITParameters: """TABLE 3.9 Optional parameters for the CCITTFaxDecode filter""" - def __init__(self, K=0, columns=0, rows=0): + def __init__(self, K: int = 0, columns: int = 0, rows: int = 0) -> None: self.K = K self.EndOfBlock = None self.EndOfLine = None @@ -391,7 +411,7 @@ def __init__(self, K=0, columns=0, rows=0): self.DamagedRowsBeforeError = None @property - def group(self): + def group(self) -> int: if self.K < 0: CCITTgroup = 4 else: @@ -412,7 +432,9 @@ class CCITTFaxDecode: """ @staticmethod - def _get_parameters(parameters, rows): + def _get_parameters( + parameters: Union[None, ArrayObject, DictionaryObject], rows: int + ) -> CCITParameters: k = 0 columns = 0 if parameters: @@ -425,13 +447,17 @@ def _get_parameters(parameters, rows): if CCITT.K in decodeParm: k = decodeParm[CCITT.K] else: - columns = parameters[CCITT.COLUMNS] - k = parameters[CCITT.K] + columns = parameters[CCITT.COLUMNS] # type: ignore + k = parameters[CCITT.K] # type: ignore return CCITParameters(k, columns, rows) @staticmethod - def decode(data, decodeParms=None, height=0): + def decode( + data: bytes, + decodeParms: Union[None, ArrayObject, DictionaryObject] = None, + height: int = 0, + ) -> bytes: parms = CCITTFaxDecode._get_parameters(decodeParms, height) img_size = len(data) @@ -482,7 +508,7 @@ def decode(data, decodeParms=None, height=0): return tiff_header + data -def decodeStreamData(stream): +def decodeStreamData(stream: Any) -> Union[str, bytes]: # utils.StreamObject from .generic import NameObject filters = stream.get(SA.FILTER, ()) @@ -490,16 +516,16 @@ def decodeStreamData(stream): if len(filters) and not isinstance(filters[0], NameObject): # we have a single filter instance filters = (filters,) - data = stream._data + data: bytes = stream._data # If there is not data to decode we should not try to decode the data. if data: for filterType in filters: if filterType == FT.FLATE_DECODE or filterType == FTA.FL: data = FlateDecode.decode(data, stream.get(SA.DECODE_PARMS)) elif filterType == FT.ASCII_HEX_DECODE or filterType == FTA.AHx: - data = ASCIIHexDecode.decode(data) + data = ASCIIHexDecode.decode(data) # type: ignore elif filterType == FT.LZW_DECODE or filterType == FTA.LZW: - data = LZWDecode.decode(data, stream.get(SA.DECODE_PARMS)) + data = LZWDecode.decode(data, stream.get(SA.DECODE_PARMS)) # type: ignore elif filterType == FT.ASCII_85_DECODE or filterType == FTA.A85: data = ASCII85Decode.decode(data) elif filterType == FT.DCT_DECODE: @@ -523,7 +549,7 @@ def decodeStreamData(stream): return data -def _xobj_to_image(x_object_obj): +def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]: """ Users need to have the pillow package installed. @@ -539,7 +565,7 @@ def _xobj_to_image(x_object_obj): from PyPDF2.constants import GraphicsStateParameters as G size = (x_object_obj[IA.WIDTH], x_object_obj[IA.HEIGHT]) - data = x_object_obj.getData() + data = x_object_obj.getData() # type: ignore if x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB: mode: Literal["RGB", "P"] = "RGB" else: diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py index 40010722d..368c03641 100644 --- a/PyPDF2/generic.py +++ b/PyPDF2/generic.py @@ -38,7 +38,7 @@ import re import warnings from io import BytesIO -from typing import Dict, Optional +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union from PyPDF2.constants import FilterTypes as FT from PyPDF2.constants import StreamAttributes as SA @@ -49,8 +49,15 @@ PdfStreamError, ) -from . import filters, utils -from .utils import RC4_encrypt, b_, ord_, readNonWhitespace, skipOverComment +from . import utils +from .utils import ( + RC4_encrypt, + StreamType, + b_, + ord_, + readNonWhitespace, + skipOverComment, +) logger = logging.getLogger(__name__) ObjectPrefix = b_("/<[tf(n%") @@ -59,61 +66,24 @@ class PdfObject: - def getObject(self): + def getObject(self) -> Optional["PdfObject"]: """Resolve indirect references.""" return self - -def readObject(stream, pdf) -> PdfObject: - tok = stream.read(1) - stream.seek(-1, 1) # reset to start - idx = ObjectPrefix.find(tok) - if idx == 0: - return NameObject.readFromStream(stream, pdf) - elif idx == 1: - # hexadecimal string OR dictionary - peek = stream.read(2) - stream.seek(-2, 1) # reset to start - - if peek == b_("<<"): - return DictionaryObject.readFromStream(stream, pdf) - else: - return readHexStringFromStream(stream) - elif idx == 2: - return ArrayObject.readFromStream(stream, pdf) - elif idx == 3 or idx == 4: - return BooleanObject.readFromStream(stream) - elif idx == 5: - return readStringFromStream(stream) - elif idx == 6: - return NullObject.readFromStream(stream) - elif idx == 7: - # comment - while tok not in (b_("\r"), b_("\n")): - tok = stream.read(1) - # Prevents an infinite loop by raising an error if the stream is at - # the EOF - if len(tok) <= 0: - raise PdfStreamError("File ended unexpectedly.") - tok = readNonWhitespace(stream) - stream.seek(-1, 1) - return readObject(stream, pdf) - else: - # number object OR indirect reference - peek = stream.read(20) - stream.seek(-len(peek), 1) # reset to start - if IndirectPattern.match(peek) is not None: - return IndirectObject.readFromStream(stream, pdf) - else: - return NumberObject.readFromStream(stream) + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: + raise NotImplementedError() class NullObject(PdfObject): - def writeToStream(self, stream, encryption_key): + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: stream.write(b_("null")) @staticmethod - def readFromStream(stream): + def readFromStream(stream: StreamType) -> "NullObject": nulltxt = stream.read(4) if nulltxt != b_("null"): raise PdfReadError("Could not read Null object") @@ -121,17 +91,19 @@ def readFromStream(stream): class BooleanObject(PdfObject): - def __init__(self, value): + def __init__(self, value: Any) -> None: self.value = value - def writeToStream(self, stream, encryption_key): + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: if self.value: stream.write(b_("true")) else: stream.write(b_("false")) @staticmethod - def readFromStream(stream): + def readFromStream(stream: StreamType) -> "BooleanObject": word = stream.read(4) if word == b_("true"): return BooleanObject(True) @@ -143,7 +115,9 @@ def readFromStream(stream): class ArrayObject(list, PdfObject): - def writeToStream(self, stream, encryption_key): + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: stream.write(b_("[")) for data in self: stream.write(b_(" ")) @@ -151,7 +125,7 @@ def writeToStream(self, stream, encryption_key): stream.write(b_(" ]")) @staticmethod - def readFromStream(stream, pdf): + def readFromStream(stream: StreamType, pdf: Any) -> "ArrayObject": # PdfFileReader arr = ArrayObject() tmp = stream.read(1) if tmp != b_("["): @@ -173,18 +147,18 @@ def readFromStream(stream, pdf): class IndirectObject(PdfObject): - def __init__(self, idnum, generation, pdf): + def __init__(self, idnum: int, generation: int, pdf: Any) -> None: # PdfFileReader self.idnum = idnum self.generation = generation self.pdf = pdf - def getObject(self): + def getObject(self) -> Optional[PdfObject]: return self.pdf.getObject(self).getObject() - def __repr__(self): + def __repr__(self) -> str: return f"IndirectObject({self.idnum!r}, {self.generation!r})" - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: return ( other is not None and isinstance(other, IndirectObject) @@ -193,14 +167,18 @@ def __eq__(self, other): and self.pdf is other.pdf ) - def __ne__(self, other): + def __ne__(self, other: Any) -> bool: return not self.__eq__(other) - def writeToStream(self, stream, encryption_key): + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: stream.write(b_(f"{self.idnum} {self.generation} R")) @staticmethod - def readFromStream(stream, pdf): + def readFromStream( + stream: StreamType, pdf: Any # PdfFileReader + ) -> "IndirectObject": idnum = b_("") while True: tok = stream.read(1) @@ -229,7 +207,9 @@ def readFromStream(stream, pdf): class FloatObject(decimal.Decimal, PdfObject): - def __new__(cls, value="0", context=None): + def __new__( + cls, value: Union[str, Any] = "0", context: Optional[Any] = None + ) -> "FloatObject": try: return decimal.Decimal.__new__(cls, utils.str_(value), context) except Exception: @@ -241,7 +221,7 @@ def __new__(cls, value="0", context=None): logger.warning(f"Invalid FloatObject {value}") return decimal.Decimal.__new__(cls, "0") - def __repr__(self): + def __repr__(self) -> str: if self == self.to_integral(): return str(self.quantize(decimal.Decimal(1))) else: @@ -252,10 +232,12 @@ def __repr__(self): o = o[:-1] return o - def as_numeric(self): + def as_numeric(self) -> float: return float(b_(repr(self))) - def writeToStream(self, stream, encryption_key): + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: stream.write(b_(repr(self))) @@ -263,21 +245,23 @@ class NumberObject(int, PdfObject): NumberPattern = re.compile(b_("[^+-.0-9]")) ByteDot = b_(".") - def __new__(cls, value): + def __new__(cls, value: Any) -> "NumberObject": val = int(value) try: return int.__new__(cls, val) except OverflowError: return int.__new__(cls, 0) - def as_numeric(self): + def as_numeric(self) -> int: return int(b_(repr(self))) - def writeToStream(self, stream, encryption_key): + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: stream.write(b_(repr(self))) @staticmethod - def readFromStream(stream): + def readFromStream(stream: StreamType) -> Union["NumberObject", FloatObject]: num = utils.readUntilRegex(stream, NumberObject.NumberPattern) if num.find(NumberObject.ByteDot) != -1: return FloatObject(num) @@ -285,34 +269,9 @@ def readFromStream(stream): return NumberObject(num) -def createStringObject(string): - """ - Given a string (either a "str" or "unicode"), create a ByteStringObject or a - TextStringObject to represent the string. - """ - if isinstance(string, str): - return TextStringObject(string) - elif isinstance(string, utils.bytes_type): - try: - if string.startswith(codecs.BOM_UTF16_BE): - retval = TextStringObject(string.decode("utf-16")) - retval.autodetect_utf16 = True - return retval - else: - # This is probably a big performance hit here, but we need to - # convert string objects into the text/unicode-aware version if - # possible... and the only way to check if that's possible is - # to try. Some strings are strings, some are just byte arrays. - retval = TextStringObject(decode_pdfdocencoding(string)) - retval.autodetect_pdfdocencoding = True - return retval - except UnicodeDecodeError: - return ByteStringObject(string) - else: - raise TypeError("createStringObject should have str or unicode arg") - - -def readHexStringFromStream(stream): +def readHexStringFromStream( + stream: StreamType, +) -> Union["TextStringObject", "ByteStringObject"]: stream.read(1) txt = "" x = b_("") @@ -333,7 +292,9 @@ def readHexStringFromStream(stream): return createStringObject(b_(txt)) -def readStringFromStream(stream): +def readStringFromStream( + stream: StreamType, +) -> Union["TextStringObject", "ByteStringObject"]: tok = stream.read(1) parens = 1 txt = b_("") @@ -415,20 +376,22 @@ class ByteStringObject(utils.bytes_type, PdfObject): # type: ignore """ @property - def original_bytes(self): + def original_bytes(self) -> bytes: """For compatibility with TextStringObject.original_bytes.""" return self - def writeToStream(self, stream, encryption_key): + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: bytearr = self if encryption_key: - bytearr = RC4_encrypt(encryption_key, bytearr) + bytearr = RC4_encrypt(encryption_key, bytearr) # type: ignore stream.write(b_("<")) stream.write(utils.hexencode(bytearr)) stream.write(b_(">")) -class TextStringObject(str, PdfObject): # type: ignore +class TextStringObject(str, PdfObject): """ Represents a string object that has been decoded into a real unicode string. If read from a PDF document, this string appeared to match the @@ -440,7 +403,7 @@ class TextStringObject(str, PdfObject): # type: ignore autodetect_utf16 = False @property - def original_bytes(self): + def original_bytes(self) -> bytes: """ It is occasionally possible that a text string object gets created where a byte string object was expected due to the autodetection mechanism -- @@ -449,7 +412,7 @@ def original_bytes(self): """ return self.get_original_bytes() - def get_original_bytes(self): + def get_original_bytes(self) -> bytes: # We're a text string object, but the library is trying to get our raw # bytes. This can happen if we auto-detected this string as text, but # we were wrong. It's pretty common. Return the original bytes that @@ -462,7 +425,9 @@ def get_original_bytes(self): else: raise Exception("no information about original bytes") - def writeToStream(self, stream, encryption_key): + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: # Try to write the string out as a PDFDocEncoding encoded string. It's # nicer to look at in the PDF file. Sadly, we take a performance hit # here for trying... @@ -488,11 +453,13 @@ class NameObject(str, PdfObject): delimiterPattern = re.compile(b_(r"\s+|[\(\)<>\[\]{}/%]")) surfix = b_("/") - def writeToStream(self, stream, encryption_key): + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: stream.write(b_(self)) @staticmethod - def readFromStream(stream, pdf): + def readFromStream(stream: StreamType, pdf: Any) -> "NameObject": # PdfFileReader name = stream.read(1) if name != NameObject.surfix: raise PdfReadError("name read error") @@ -516,27 +483,27 @@ def readFromStream(stream, pdf): class DictionaryObject(dict, PdfObject): - def raw_get(self, key): + def raw_get(self, key: Any) -> Any: return dict.__getitem__(self, key) - def __setitem__(self, key, value): + def __setitem__(self, key: Any, value: Any) -> Any: if not isinstance(key, PdfObject): raise ValueError("key must be PdfObject") if not isinstance(value, PdfObject): raise ValueError("value must be PdfObject") return dict.__setitem__(self, key, value) - def setdefault(self, key, value=None): + def setdefault(self, key: Any, value: Optional[Any] = None) -> Any: if not isinstance(key, PdfObject): raise ValueError("key must be PdfObject") if not isinstance(value, PdfObject): raise ValueError("value must be PdfObject") - return dict.setdefault(self, key, value) + return dict.setdefault(self, key, value) # type: ignore - def __getitem__(self, key): + def __getitem__(self, key: Any) -> PdfObject: return dict.__getitem__(self, key).getObject() - def getXmpMetadata(self): + def getXmpMetadata(self) -> Optional[PdfObject]: # XmpInformation """ Retrieve XMP (Extensible Metadata Platform) data relevant to the this object, if available. @@ -546,19 +513,20 @@ def getXmpMetadata(self): that can be used to access XMP metadata from the document. Can also return None if no metadata was found on the document root. """ + from PyPDF2.xmp import XmpInformation + metadata = self.get("/Metadata", None) if metadata is None: return None metadata = metadata.getObject() - from . import xmp - if not isinstance(metadata, xmp.XmpInformation): - metadata = xmp.XmpInformation(metadata) + if not isinstance(metadata, XmpInformation): + metadata = XmpInformation(metadata) self[NameObject("/Metadata")] = metadata return metadata @property - def xmpMetadata(self): + def xmpMetadata(self) -> Optional[PdfObject]: # XmpInformation """ Read-only property that accesses the {@link #DictionaryObject.getXmpData getXmpData} function. @@ -567,7 +535,9 @@ def xmpMetadata(self): """ return self.getXmpMetadata() - def writeToStream(self, stream, encryption_key): + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: stream.write(b_("<<\n")) for key, value in list(self.items()): key.writeToStream(stream, encryption_key) @@ -577,8 +547,12 @@ def writeToStream(self, stream, encryption_key): stream.write(b_(">>")) @staticmethod - def readFromStream(stream, pdf): - def getNextObjPos(p, p1, remGens, pdf): + def readFromStream( + stream: StreamType, pdf: Any # PdfFileReader + ) -> "DictionaryObject": + def getNextObjPos( + p: int, p1: int, remGens: List[int], pdf: Any + ) -> int: # PdfFileReader l = pdf.xref[remGens[0]] for o in l: if p1 > l[o] and p < l[o]: @@ -588,7 +562,9 @@ def getNextObjPos(p, p1, remGens, pdf): else: return getNextObjPos(p, p1, remGens[1:], pdf) - def readUnsizedFromSteam(stream, pdf): + def readUnsizedFromSteam( + stream: StreamType, pdf: Any # PdfFileReader + ) -> bytes: # we are just pointing at beginning of the stream eon = getNextObjPos(stream.tell(), 2**32, [g for g in pdf.xref], pdf) - 1 curr = stream.tell() @@ -607,7 +583,7 @@ def readUnsizedFromSteam(stream, pdf): "Dictionary read error at byte %s: stream must begin with '<<'" % utils.hexStr(stream.tell()) ) - data = {} + data: Dict[Any, Any] = {} while True: tok = readNonWhitespace(stream) if tok == b_("\x00"): @@ -702,16 +678,16 @@ def readUnsizedFromSteam(stream, pdf): class TreeObject(DictionaryObject): - def __init__(self): + def __init__(self) -> None: DictionaryObject.__init__(self) - def hasChildren(self): + def hasChildren(self) -> bool: return "/First" in self - def __iter__(self): + def __iter__(self) -> Any: return self.children() - def children(self): + def children(self) -> Optional[Any]: if not self.hasChildren(): return @@ -720,9 +696,9 @@ def children(self): yield child if child == self["/Last"]: return - child = child["/Next"] + child = child["/Next"] # type: ignore - def addChild(self, child, pdf): + def addChild(self, child: Any, pdf: Any) -> None: # PdfFileReader child_obj = child.getObject() child = pdf.getReference(child_obj) assert isinstance(child, IndirectObject) @@ -735,19 +711,19 @@ def addChild(self, child, pdf): prev = self["/Last"] self[NameObject("/Last")] = child - self[NameObject("/Count")] = NumberObject(self[NameObject("/Count")] + 1) + self[NameObject("/Count")] = NumberObject(self[NameObject("/Count")] + 1) # type: ignore if prev: prev_ref = pdf.getReference(prev) assert isinstance(prev_ref, IndirectObject) child_obj[NameObject("/Prev")] = prev_ref - prev[NameObject("/Next")] = child + prev[NameObject("/Next")] = child # type: ignore parent_ref = pdf.getReference(self) assert isinstance(parent_ref, IndirectObject) child_obj[NameObject("/Parent")] = parent_ref - def removeChild(self, child): + def removeChild(self, child: Any) -> None: child_obj = child.getObject() if NameObject("/Parent") not in child_obj: @@ -758,8 +734,8 @@ def removeChild(self, child): found = False prev_ref = None prev = None - cur_ref = self[NameObject("/First")] - cur = cur_ref.getObject() + cur_ref: Optional[Any] = self[NameObject("/First")] + cur: Optional[Dict[str, Any]] = cur_ref.getObject() # type: ignore last_ref = self[NameObject("/Last")] last = last_ref.getObject() while cur is not None: @@ -771,7 +747,7 @@ def removeChild(self, child): next = next_ref.getObject() del next[NameObject("/Prev")] self[NameObject("/First")] = next_ref - self[NameObject("/Count")] = self[NameObject("/Count")] - 1 + self[NameObject("/Count")] -= 1 # type: ignore else: # Removing only tree node @@ -787,13 +763,13 @@ def removeChild(self, child): next = next_ref.getObject() next[NameObject("/Prev")] = prev_ref prev[NameObject("/Next")] = next_ref - self[NameObject("/Count")] = self[NameObject("/Count")] - 1 + self[NameObject("/Count")] -= 1 else: # Removing last tree node assert cur == last del prev[NameObject("/Next")] self[NameObject("/Last")] = prev_ref - self[NameObject("/Count")] = self[NameObject("/Count")] - 1 + self[NameObject("/Count")] -= 1 found = True break @@ -815,7 +791,7 @@ def removeChild(self, child): if NameObject("/Prev") in child_obj: del child_obj[NameObject("/Prev")] - def emptyTree(self): + def emptyTree(self) -> None: for child in self: child_obj = child.getObject() del child_obj[NameObject("/Parent")] @@ -833,19 +809,21 @@ def emptyTree(self): class StreamObject(DictionaryObject): - def __init__(self): + def __init__(self) -> None: self.__data: Optional[str] = None - self.decodedSelf = None + self.decodedSelf: Optional[DecodedStreamObject] = None @property - def _data(self): + def _data(self) -> Any: return self.__data @_data.setter - def _data(self, value): + def _data(self, value: Any) -> None: self.__data = value - def writeToStream(self, stream, encryption_key) -> None: + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: self[NameObject(SA.LENGTH)] = NumberObject(len(self._data)) DictionaryObject.writeToStream(self, stream, encryption_key) del self[SA.LENGTH] @@ -857,7 +835,10 @@ def writeToStream(self, stream, encryption_key) -> None: stream.write(b_("\nendstream")) @staticmethod - def initializeFromDictionary(data): + def initializeFromDictionary( + data: Dict[str, Any] + ) -> Union["EncodedStreamObject", "DecodedStreamObject"]: + retval: Union["EncodedStreamObject", "DecodedStreamObject"] if SA.FILTER in data: retval = EncodedStreamObject() else: @@ -868,7 +849,9 @@ def initializeFromDictionary(data): retval.update(data) return retval - def flateEncode(self): + def flateEncode(self) -> "EncodedStreamObject": + from PyPDF2.filters import FlateDecode + if SA.FILTER in self: f = self[SA.FILTER] if isinstance(f, ArrayObject): @@ -882,23 +865,25 @@ def flateEncode(self): f = NameObject("/FlateDecode") retval = EncodedStreamObject() retval[NameObject(SA.FILTER)] = f - retval._data = filters.FlateDecode.encode(self._data) + retval._data = FlateDecode.encode(self._data) return retval class DecodedStreamObject(StreamObject): - def getData(self): + def getData(self) -> Any: return self._data - def setData(self, data): + def setData(self, data: Any) -> None: self._data = data class EncodedStreamObject(StreamObject): - def __init__(self): - self.decodedSelf = None + def __init__(self) -> None: + self.decodedSelf: Optional[DecodedStreamObject] = None + + def getData(self) -> Union[None, str, bytes]: + from PyPDF2.filters import decodeStreamData - def getData(self): if self.decodedSelf: # cached version of decoded object return self.decodedSelf.getData() @@ -906,21 +891,26 @@ def getData(self): # create decoded object decoded = DecodedStreamObject() - decoded._data = filters.decodeStreamData(self) + decoded._data = decodeStreamData(self) for key, value in list(self.items()): if key not in (SA.LENGTH, SA.FILTER, SA.DECODE_PARMS): decoded[key] = value self.decodedSelf = decoded return decoded._data - def setData(self, data): + def setData(self, data: Any) -> None: raise PdfReadError("Creating EncodedStreamObject is not currently supported") class ContentStream(DecodedStreamObject): - def __init__(self, stream, pdf): + def __init__(self, stream: Any, pdf: Any) -> None: self.pdf = pdf - self.operations = [] + + # The inner list has two elements: + # [0] : List + # [1] : str + self.operations: List[Tuple[Any, Any]] = [] + # stream may be a StreamObject or an ArrayObject containing # multiple StreamObjects to be cat'd together. stream = stream.getObject() @@ -933,10 +923,10 @@ def __init__(self, stream, pdf): stream = BytesIO(b_(stream.getData())) self.__parseContentStream(stream) - def __parseContentStream(self, stream): + def __parseContentStream(self, stream: StreamType) -> None: # file("f:\\tmp.txt", "w").write(stream.read()) stream.seek(0, 0) - operands = [] + operands: List[Union[int, str, PdfObject]] = [] while True: peek = readNonWhitespace(stream) if peek == b_("") or ord_(peek) == 0: @@ -966,7 +956,7 @@ def __parseContentStream(self, stream): else: operands.append(readObject(stream, None)) - def _readInlineImage(self, stream): + def _readInlineImage(self, stream: StreamType) -> Dict[str, Any]: # begin reading just after the "BI" - begin image # first read the dictionary of settings. settings = DictionaryObject() @@ -1027,7 +1017,7 @@ def _readInlineImage(self, stream): return {"settings": settings, "data": data.getvalue()} @property - def _data(self): + def _data(self) -> bytes: newdata = BytesIO() for operands, operator in self.operations: if operator == b_("INLINE IMAGE"): @@ -1047,10 +1037,56 @@ def _data(self): return newdata.getvalue() @_data.setter - def _data(self, value): + def _data(self, value: Union[str, bytes]) -> None: self.__parseContentStream(BytesIO(b_(value))) +def readObject( + stream: StreamType, pdf: Any # PdfFileReader +) -> Union[PdfObject, int, str, ContentStream]: + tok = stream.read(1) + stream.seek(-1, 1) # reset to start + idx = ObjectPrefix.find(tok) + if idx == 0: + return NameObject.readFromStream(stream, pdf) + elif idx == 1: + # hexadecimal string OR dictionary + peek = stream.read(2) + stream.seek(-2, 1) # reset to start + + if peek == b_("<<"): + return DictionaryObject.readFromStream(stream, pdf) + else: + return readHexStringFromStream(stream) + elif idx == 2: + return ArrayObject.readFromStream(stream, pdf) + elif idx == 3 or idx == 4: + return BooleanObject.readFromStream(stream) + elif idx == 5: + return readStringFromStream(stream) + elif idx == 6: + return NullObject.readFromStream(stream) + elif idx == 7: + # comment + while tok not in (b_("\r"), b_("\n")): + tok = stream.read(1) + # Prevents an infinite loop by raising an error if the stream is at + # the EOF + if len(tok) <= 0: + raise PdfStreamError("File ended unexpectedly.") + tok = readNonWhitespace(stream) + stream.seek(-1, 1) + return readObject(stream, pdf) + else: + # number object OR indirect reference + peek = stream.read(20) + stream.seek(-len(peek), 1) # reset to start + if IndirectPattern.match(peek) is not None: + return IndirectObject.readFromStream(stream, pdf) + else: + return NumberObject.readFromStream(stream) + + class RectangleObject(ArrayObject): """ This class is used to represent *page boxes* in PyPDF2. These boxes include: @@ -1062,73 +1098,84 @@ class RectangleObject(ArrayObject): * :attr:`trimBox ` """ - def __init__(self, arr): + def __init__( + self, + arr: Union[ + ArrayObject, + Tuple[ + Union[float, decimal.Decimal, None], + Union[float, decimal.Decimal, None], + Union[float, decimal.Decimal, None], + Union[float, decimal.Decimal, None], + ], + ], + ) -> None: # must have four points assert len(arr) == 4 # automatically convert arr[x] into NumberObject(arr[x]) if necessary - ArrayObject.__init__(self, [self.ensureIsNumber(x) for x in arr]) + ArrayObject.__init__(self, [self.ensureIsNumber(x) for x in arr]) # type: ignore - def ensureIsNumber(self, value): + def ensureIsNumber(self, value: Any) -> Union[NumberObject, FloatObject]: if not isinstance(value, (NumberObject, FloatObject)): - value = FloatObject(value) + return FloatObject(value) return value - def __repr__(self): + def __repr__(self) -> str: return "RectangleObject(%s)" % repr(list(self)) - def getLowerLeft_x(self): + def getLowerLeft_x(self) -> FloatObject: return self[0] - def getLowerLeft_y(self): + def getLowerLeft_y(self) -> FloatObject: return self[1] - def getUpperRight_x(self): + def getUpperRight_x(self) -> FloatObject: return self[2] - def getUpperRight_y(self): + def getUpperRight_y(self) -> FloatObject: return self[3] - def getUpperLeft_x(self): + def getUpperLeft_x(self) -> FloatObject: return self.getLowerLeft_x() - def getUpperLeft_y(self): + def getUpperLeft_y(self) -> FloatObject: return self.getUpperRight_y() - def getLowerRight_x(self): + def getLowerRight_x(self) -> FloatObject: return self.getUpperRight_x() - def getLowerRight_y(self): + def getLowerRight_y(self) -> FloatObject: return self.getLowerLeft_y() - def getLowerLeft(self): + def getLowerLeft(self) -> Tuple[FloatObject, FloatObject]: return self.getLowerLeft_x(), self.getLowerLeft_y() - def getLowerRight(self): + def getLowerRight(self) -> Tuple[FloatObject, FloatObject]: return self.getLowerRight_x(), self.getLowerRight_y() - def getUpperLeft(self): + def getUpperLeft(self) -> Tuple[FloatObject, FloatObject]: return self.getUpperLeft_x(), self.getUpperLeft_y() - def getUpperRight(self): + def getUpperRight(self) -> Tuple[FloatObject, FloatObject]: return self.getUpperRight_x(), self.getUpperRight_y() - def setLowerLeft(self, value): + def setLowerLeft(self, value: Iterable[FloatObject]) -> None: self[0], self[1] = (self.ensureIsNumber(x) for x in value) - def setLowerRight(self, value): + def setLowerRight(self, value: Iterable[FloatObject]) -> None: self[2], self[1] = (self.ensureIsNumber(x) for x in value) - def setUpperLeft(self, value): + def setUpperLeft(self, value: Iterable[FloatObject]) -> None: self[0], self[3] = (self.ensureIsNumber(x) for x in value) - def setUpperRight(self, value): + def setUpperRight(self, value: Iterable[FloatObject]) -> None: self[2], self[3] = (self.ensureIsNumber(x) for x in value) - def getWidth(self): - return self.getUpperRight_x() - self.getLowerLeft_x() + def getWidth(self) -> float: + return self.getUpperRight_x() - self.getLowerLeft_x() # type: ignore - def getHeight(self): - return self.getUpperRight_y() - self.getLowerLeft_y() + def getHeight(self) -> float: + return self.getUpperRight_y() - self.getLowerLeft_y() # type: ignore lowerLeft = property(getLowerLeft, setLowerLeft, None, None) """ @@ -1158,7 +1205,7 @@ class Field(TreeObject): :meth:`getFields()` """ - def __init__(self, data): + def __init__(self, data: Dict[str, Any]) -> None: DictionaryObject.__init__(self) attributes = ( "/FT", @@ -1178,33 +1225,35 @@ def __init__(self, data): except KeyError: pass + # TABLE 8.69 Entries common to all field dictionaries + @property - def fieldType(self): + def fieldType(self) -> Optional[NameObject]: """Read-only property accessing the type of this field.""" return self.get("/FT") @property - def parent(self): + def parent(self) -> Optional[DictionaryObject]: """Read-only property accessing the parent of this field.""" return self.get("/Parent") @property - def kids(self): + def kids(self) -> Optional[ArrayObject]: """Read-only property accessing the kids of this field.""" return self.get("/Kids") @property - def name(self): + def name(self) -> Optional[str]: """Read-only property accessing the name of this field.""" return self.get("/T") @property - def altName(self): + def altName(self) -> Optional[str]: """Read-only property accessing the alternate name of this field.""" return self.get("/TU") @property - def mappingName(self): + def mappingName(self) -> Optional[str]: """ Read-only property accessing the mapping name of this field. This name is used by PyPDF2 as a key in the dictionary returned by @@ -1213,7 +1262,7 @@ def mappingName(self): return self.get("/TM") @property - def flags(self): + def flags(self) -> Optional[int]: """ Read-only property accessing the field flags, specifying various characteristics of the field (see Table 8.70 of the PDF 1.7 reference). @@ -1221,7 +1270,7 @@ def flags(self): return self.get("/Ff") @property - def value(self): + def value(self) -> Optional[Any]: """ Read-only property accessing the value of this field. Format varies based on field type. @@ -1229,18 +1278,18 @@ def value(self): return self.get("/V") @property - def defaultValue(self): + def defaultValue(self) -> Optional[Any]: """Read-only property accessing the default value of this field.""" return self.get("/DV") @property - def additionalActions(self): + def additionalActions(self) -> Optional[DictionaryObject]: """ Read-only property accessing the additional actions dictionary. This dictionary defines the field's behavior in response to trigger events. See Section 8.5.2 of the PDF 1.7 reference. """ - self.get("/AA") + return self.get("/AA") class Destination(TreeObject): @@ -1276,7 +1325,13 @@ class Destination(TreeObject): - [left] """ - def __init__(self, title, page, typ, *args): + def __init__( + self, + title: str, + page: Union[NumberObject, IndirectObject, NullObject, DictionaryObject], + typ: Union[str, NumberObject], + *args: Any, # ZoomArgType + ) -> None: DictionaryObject.__init__(self) self[NameObject("/Title")] = title self[NameObject("/Page")] = page @@ -1308,7 +1363,7 @@ def __init__(self, title, page, typ, *args): else: raise PdfReadError("Unknown Destination Type: %r" % typ) - def getDestArray(self): + def getDestArray(self) -> ArrayObject: return ArrayObject( [self.raw_get("/Page"), self["/Type"]] + [ @@ -1318,7 +1373,9 @@ def getDestArray(self): ] ) - def writeToStream(self, stream, encryption_key): + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: stream.write(b_("<<\n")) key = NameObject("/D") key.writeToStream(stream, encryption_key) @@ -1329,14 +1386,14 @@ def writeToStream(self, stream, encryption_key): key = NameObject("/S") key.writeToStream(stream, encryption_key) stream.write(b_(" ")) - value = NameObject("/GoTo") - value.writeToStream(stream, encryption_key) + value_s = NameObject("/GoTo") + value_s.writeToStream(stream, encryption_key) stream.write(b_("\n")) stream.write(b_(">>")) @property - def title(self): + def title(self) -> Optional[str]: """ Read-only property accessing the destination title. @@ -1345,7 +1402,7 @@ def title(self): return self.get("/Title") @property - def page(self): + def page(self) -> Optional[int]: """ Read-only property accessing the destination page number. @@ -1354,7 +1411,7 @@ def page(self): return self.get("/Page") @property - def typ(self): + def typ(self) -> Optional[str]: """ Read-only property accessing the destination type. @@ -1363,7 +1420,7 @@ def typ(self): return self.get("/Type") @property - def zoom(self): + def zoom(self) -> Optional[int]: """ Read-only property accessing the zoom factor. @@ -1372,44 +1429,46 @@ def zoom(self): return self.get("/Zoom", None) @property - def left(self): + def left(self) -> Optional[FloatObject]: """ Read-only property accessing the left horizontal coordinate. - :rtype: int, or ``None`` if not available. + :rtype: float, or ``None`` if not available. """ return self.get("/Left", None) @property - def right(self): + def right(self) -> Optional[FloatObject]: """ Read-only property accessing the right horizontal coordinate. - :rtype: int, or ``None`` if not available. + :rtype: float, or ``None`` if not available. """ return self.get("/Right", None) @property - def top(self): + def top(self) -> Optional[FloatObject]: """ Read-only property accessing the top vertical coordinate. - :rtype: int, or ``None`` if not available. + :rtype: float, or ``None`` if not available. """ return self.get("/Top", None) @property - def bottom(self): + def bottom(self) -> Optional[FloatObject]: """ Read-only property accessing the bottom vertical coordinate. - :rtype: int, or ``None`` if not available. + :rtype: float, or ``None`` if not available. """ return self.get("/Bottom", None) class Bookmark(Destination): - def writeToStream(self, stream, encryption_key): + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: stream.write(b_("<<\n")) for key in [ NameObject(x) @@ -1430,7 +1489,40 @@ def writeToStream(self, stream, encryption_key): stream.write(b_(">>")) -def encode_pdfdocencoding(unicode_string): +def createStringObject( + string: Union[str, bytes] +) -> Union[TextStringObject, ByteStringObject]: + """ + Given a string, create a ByteStringObject or a TextStringObject to + represent the string. + + :param string: A string + + :raises TypeError: If string is not of type str or bytes. + """ + if isinstance(string, str): + return TextStringObject(string) + elif isinstance(string, utils.bytes_type): + try: + if string.startswith(codecs.BOM_UTF16_BE): + retval = TextStringObject(string.decode("utf-16")) + retval.autodetect_utf16 = True + return retval + else: + # This is probably a big performance hit here, but we need to + # convert string objects into the text/unicode-aware version if + # possible... and the only way to check if that's possible is + # to try. Some strings are strings, some are just byte arrays. + retval = TextStringObject(decode_pdfdocencoding(string)) + retval.autodetect_pdfdocencoding = True + return retval + except UnicodeDecodeError: + return ByteStringObject(string) + else: + raise TypeError("createStringObject should have str or unicode arg") + + +def encode_pdfdocencoding(unicode_string: str) -> bytes: retval = b_("") for c in unicode_string: try: @@ -1442,7 +1534,7 @@ def encode_pdfdocencoding(unicode_string): return retval -def decode_pdfdocencoding(byte_array): +def decode_pdfdocencoding(byte_array: bytes) -> str: retval = "" for b in byte_array: c = _pdfDocEncoding[ord_(b)] diff --git a/PyPDF2/merger.py b/PyPDF2/merger.py index e1a470a18..c998b32d8 100644 --- a/PyPDF2/merger.py +++ b/PyPDF2/merger.py @@ -25,18 +25,24 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. -from io import BytesIO -from io import FileIO as file -from typing import List, Optional, Union +from io import BytesIO, FileIO, IOBase +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, cast +from PyPDF2._page import PageObject from PyPDF2._reader import PdfFileReader from PyPDF2._writer import PdfFileWriter from PyPDF2.constants import PagesAttributes as PA from PyPDF2.generic import * -from PyPDF2.pagerange import PageRange -from PyPDF2.utils import str_ - -StreamIO = BytesIO +from PyPDF2.pagerange import PageRange, PageRangeSpec +from PyPDF2.types import ( + BookmarkTypes, + LayoutType, + OutlinesType, + PagemodeType, + ZoomArgsType, + ZoomArgType, +) +from PyPDF2.utils import StrByteType, str_ ERR_CLOSED_WRITER = "close() was called and thus the writer cannot be used anymore" @@ -47,7 +53,7 @@ class _MergedPage: information on each page that is being merged. """ - def __init__(self, pagedata, src, id): + def __init__(self, pagedata: PageObject, src: PdfFileReader, id: int) -> None: self.src = src self.pagedata = pagedata self.out_pagedata = None @@ -68,18 +74,23 @@ class PdfFileMerger: Defaults to ``False``. """ - def __init__(self, strict=False): - self.inputs = [] - self.pages = [] + def __init__(self, strict: bool = False) -> None: + self.inputs: List[Tuple[Any, PdfFileReader, bool]] = [] + self.pages: List[Any] = [] self.output: Optional[PdfFileWriter] = PdfFileWriter() - self.bookmarks = [] - self.named_dests = [] + self.bookmarks: OutlinesType = [] + self.named_dests: List[Any] = [] self.id_count = 0 self.strict = strict def merge( - self, position, fileobj, bookmark=None, pages=None, import_bookmarks=True - ): + self, + position: int, + fileobj: Union[StrByteType, PdfFileReader], + bookmark: Optional[str] = None, + pages: Optional[PageRangeSpec] = None, + import_bookmarks: bool = True, + ) -> None: """ Merges the pages from the given file into the output file at the specified page number. @@ -104,82 +115,51 @@ def merge( bookmarks from being imported by specifying this as ``False``. """ - # This parameter is passed to self.inputs.append and means - # that the stream used was created in this method. - my_file = False - - # If the fileobj parameter is a string, assume it is a path - # and create a file object at that location. If it is a file, - # copy the file's contents into a BytesIO (or StreamIO) stream object; if - # it is a PdfFileReader, copy that reader's stream into a - # BytesIO (or StreamIO) stream. - # If fileobj is none of the above types, it is not modified - decryption_key = None - if isinstance(fileobj, str): - fileobj = file(fileobj, "rb") - my_file = True - elif hasattr(fileobj, "seek") and hasattr(fileobj, "read"): - fileobj.seek(0) - filecontent = fileobj.read() - fileobj = StreamIO(filecontent) - my_file = True - elif isinstance(fileobj, PdfFileReader): - if hasattr(fileobj, "_decryption_key"): - decryption_key = fileobj._decryption_key - orig_tell = fileobj.stream.tell() - fileobj.stream.seek(0) - filecontent = StreamIO(fileobj.stream.read()) - - # reset the stream to its original location - fileobj.stream.seek(orig_tell) - - fileobj = filecontent - my_file = True + stream, my_file, decryption_key = self._create_stream(fileobj) # Create a new PdfFileReader instance using the stream # (either file or BytesIO or StringIO) created above - pdfr = PdfFileReader(fileobj, strict=self.strict) + reader = PdfFileReader(stream, strict=self.strict) # type: ignore[arg-type] if decryption_key is not None: - pdfr._decryption_key = decryption_key + reader._decryption_key = decryption_key # Find the range of pages to merge. if pages is None: - pages = (0, pdfr.getNumPages()) + pages = (0, reader.getNumPages()) elif isinstance(pages, PageRange): - pages = pages.indices(pdfr.getNumPages()) + pages = pages.indices(reader.getNumPages()) elif not isinstance(pages, tuple): raise TypeError('"pages" must be a tuple of (start, stop[, step])') srcpages = [] - if bookmark: - bookmark = Bookmark( - TextStringObject(bookmark), - NumberObject(self.id_count), - NameObject("/Fit"), - ) outline = [] if import_bookmarks: - outline = pdfr.getOutlines() - outline = self._trim_outline(pdfr, outline, pages) + outline = reader.getOutlines() + outline = self._trim_outline(reader, outline, pages) if bookmark: - self.bookmarks += [bookmark, outline] + bookmark_typ = Bookmark( + TextStringObject(bookmark), + NumberObject(self.id_count), + NameObject("/Fit"), + ) + self.bookmarks += [bookmark_typ, outline] # type: ignore else: self.bookmarks += outline - dests = pdfr.namedDestinations - dests = self._trim_dests(pdfr, dests, pages) - self.named_dests += dests + dests = reader.namedDestinations + trimmed_dests = self._trim_dests(reader, dests, pages) + self.named_dests += trimmed_dests # Gather all the pages that are going to be merged for i in range(*pages): - pg = pdfr.getPage(i) + pg = reader.getPage(i) id = self.id_count self.id_count += 1 - mp = _MergedPage(pg, pdfr, id) + mp = _MergedPage(pg, reader, id) srcpages.append(mp) @@ -190,9 +170,53 @@ def merge( self.pages[position:position] = srcpages # Keep track of our input files so we can close them later - self.inputs.append((fileobj, pdfr, my_file)) + self.inputs.append((stream, reader, my_file)) - def append(self, fileobj, bookmark=None, pages=None, import_bookmarks=True): + def _create_stream( + self, fileobj: Union[StrByteType, PdfFileReader] + ) -> Tuple[IOBase, bool, Optional[bytes]]: + # This parameter is passed to self.inputs.append and means + # that the stream used was created in this method. + my_file = False + + # If the fileobj parameter is a string, assume it is a path + # and create a file object at that location. If it is a file, + # copy the file's contents into a BytesIO stream object; if + # it is a PdfFileReader, copy that reader's stream into a + # BytesIO stream. + # If fileobj is none of the above types, it is not modified + decryption_key = None + stream: IOBase + if isinstance(fileobj, str): + stream = FileIO(fileobj, "rb") + my_file = True + elif isinstance(fileobj, PdfFileReader): + if hasattr(fileobj, "_decryption_key"): + decryption_key = fileobj._decryption_key + orig_tell = fileobj.stream.tell() + fileobj.stream.seek(0) + stream = BytesIO(fileobj.stream.read()) + + # reset the stream to its original location + fileobj.stream.seek(orig_tell) + + my_file = True + elif hasattr(fileobj, "seek") and hasattr(fileobj, "read"): + fileobj.seek(0) + filecontent = fileobj.read() + stream = BytesIO(filecontent) + my_file = True + else: + stream = fileobj + return stream, my_file, decryption_key + + def append( + self, + fileobj: Union[StrByteType, PdfFileReader], + bookmark: Optional[str] = None, + pages: Union[None, PageRange, Tuple[int, int], Tuple[int, int, int]] = None, + import_bookmarks: bool = True, + ) -> None: """ Identical to the :meth:`merge()` method, but assumes you want to concatenate all pages onto the end of the file instead of specifying a @@ -216,7 +240,7 @@ def append(self, fileobj, bookmark=None, pages=None, import_bookmarks=True): """ self.merge(len(self.pages), fileobj, bookmark, pages, import_bookmarks) - def write(self, fileobj): + def write(self, fileobj: StrByteType) -> None: """ Writes all data that has been merged to the given output file. @@ -227,7 +251,7 @@ def write(self, fileobj): raise RuntimeError(ERR_CLOSED_WRITER) my_file = False if isinstance(fileobj, str): - fileobj = file(fileobj, "wb") + fileobj = FileIO(fileobj, "wb") my_file = True # Add pages to the PdfFileWriter @@ -235,8 +259,9 @@ def write(self, fileobj): # to allow PdfFileMerger to work with PyPdf 1.13 for page in self.pages: self.output.addPage(page.pagedata) + pages_obj = cast(Dict[str, Any], self.output._pages.getObject()) page.out_pagedata = self.output.getReference( - self.output._pages.getObject()[PA.KIDS][-1].getObject() + pages_obj[PA.KIDS][-1].getObject() ) # idnum = self.output._objects.index(self.output._pages.getObject()[PA.KIDS][-1].getObject()) + 1 # page.out_pagedata = IndirectObject(idnum, 0, self.output) @@ -251,7 +276,7 @@ def write(self, fileobj): if my_file: fileobj.close() - def close(self): + def close(self) -> None: """ Shuts all file descriptors (input and output) and clears all memory usage. @@ -264,7 +289,7 @@ def close(self): self.inputs = [] self.output = None - def addMetadata(self, infos): + def addMetadata(self, infos: Dict[str, Any]) -> None: """ Add custom metadata to the output. @@ -276,7 +301,7 @@ def addMetadata(self, infos): raise RuntimeError(ERR_CLOSED_WRITER) self.output.addMetadata(infos) - def setPageLayout(self, layout): + def setPageLayout(self, layout: LayoutType) -> None: """ Set the page layout @@ -304,7 +329,7 @@ def setPageLayout(self, layout): raise RuntimeError(ERR_CLOSED_WRITER) self.output.setPageLayout(layout) - def setPageMode(self, mode): + def setPageMode(self, mode: PagemodeType) -> None: """ Set the page mode. @@ -330,22 +355,32 @@ def setPageMode(self, mode): raise RuntimeError(ERR_CLOSED_WRITER) self.output.setPageMode(mode) - def _trim_dests(self, pdf, dests, pages): + def _trim_dests( + self, + pdf: PdfFileReader, + dests: Dict[str, Dict[str, Any]], + pages: Union[Tuple[int, int], Tuple[int, int, int]], + ) -> List[Dict[str, Any]]: """ Removes any named destinations that are not a part of the specified page set. """ new_dests = [] - for k, o in list(dests.items()): + for key, obj in dests.items(): for j in range(*pages): - if pdf.getPage(j).getObject() == o["/Page"].getObject(): - o[NameObject("/Page")] = o["/Page"].getObject() - assert str_(k) == str_(o["/Title"]) - new_dests.append(o) + if pdf.getPage(j).getObject() == obj["/Page"].getObject(): + obj[NameObject("/Page")] = obj["/Page"].getObject() + assert str_(key) == str_(obj["/Title"]) + new_dests.append(obj) break return new_dests - def _trim_outline(self, pdf, outline, pages): + def _trim_outline( + self, + pdf: PdfFileReader, + outline: OutlinesType, + pages: Union[Tuple[int, int], Tuple[int, int, int]], + ) -> OutlinesType: """ Removes any outline/bookmark entries that are not a part of the specified page set. @@ -354,11 +389,11 @@ def _trim_outline(self, pdf, outline, pages): prev_header_added = True for i, o in enumerate(outline): if isinstance(o, list): - sub = self._trim_outline(pdf, o, pages) + sub = self._trim_outline(pdf, o, pages) # type: ignore if sub: if not prev_header_added: new_outline.append(outline[i - 1]) - new_outline.append(sub) + new_outline.append(sub) # type: ignore else: prev_header_added = False for j in range(*pages): @@ -369,7 +404,7 @@ def _trim_outline(self, pdf, outline, pages): break return new_outline - def _write_dests(self): + def _write_dests(self) -> None: if self.output is None: raise RuntimeError(ERR_CLOSED_WRITER) for named_dest in self.named_dests: @@ -383,11 +418,16 @@ def _write_dests(self): if pageno is not None: self.output.addNamedDestinationObject(named_dest) - def _write_bookmarks(self, bookmarks=None, parent=None): + def _write_bookmarks( + self, + bookmarks: Optional[Iterable[Bookmark]] = None, + parent: Optional[TreeObject] = None, + ) -> None: if self.output is None: raise RuntimeError(ERR_CLOSED_WRITER) if bookmarks is None: - bookmarks = self.bookmarks + bookmarks = self.bookmarks # type: ignore + assert bookmarks is not None, "hint for mypy" # TODO: is that true? last_added = None for bookmark in bookmarks: @@ -405,24 +445,27 @@ def _write_bookmarks(self, bookmarks=None, parent=None): del bookmark["/Page"], bookmark["/Type"] last_added = self.output.addBookmarkDict(bookmark, parent) - def _write_bookmark_on_page(self, bookmark, page): + def _write_bookmark_on_page( + self, bookmark: Union[Bookmark, Destination], page: _MergedPage + ) -> None: # b[NameObject('/Page')] = p.out_pagedata - args = [NumberObject(page.id), NameObject(bookmark["/Type"])] + bm_type = cast(BookmarkTypes, bookmark["/Type"]) + args = [NumberObject(page.id), NameObject(bm_type)] # nothing more to add # if b['/Type'] == '/Fit' or b['/Type'] == '/FitB' - if bookmark["/Type"] == "/FitH" or bookmark["/Type"] == "/FitBH": + if bm_type == "/FitH" or bm_type == "/FitBH": if "/Top" in bookmark and not isinstance(bookmark["/Top"], NullObject): args.append(FloatObject(bookmark["/Top"])) else: args.append(FloatObject(0)) del bookmark["/Top"] - elif bookmark["/Type"] == "/FitV" or bookmark["/Type"] == "/FitBV": + elif bm_type == "/FitV" or bm_type == "/FitBV": if "/Left" in bookmark and not isinstance(bookmark["/Left"], NullObject): args.append(FloatObject(bookmark["/Left"])) else: args.append(FloatObject(0)) del bookmark["/Left"] - elif bookmark["/Type"] == "/XYZ": + elif bm_type == "/XYZ": if "/Left" in bookmark and not isinstance(bookmark["/Left"], NullObject): args.append(FloatObject(bookmark["/Left"])) else: @@ -436,7 +479,7 @@ def _write_bookmark_on_page(self, bookmark, page): else: args.append(FloatObject(0)) del bookmark["/Top"], bookmark["/Zoom"], bookmark["/Left"] - elif bookmark["/Type"] == "/FitR": + elif bm_type == "/FitR": if "/Left" in bookmark and not isinstance(bookmark["/Left"], NullObject): args.append(FloatObject(bookmark["/Left"])) else: @@ -466,7 +509,7 @@ def _write_bookmark_on_page(self, bookmark, page): {NameObject("/S"): NameObject("/GoTo"), NameObject("/D"): ArrayObject(args)} ) - def _associate_dests_to_pages(self, pages): + def _associate_dests_to_pages(self, pages: List[_MergedPage]) -> None: for nd in self.named_dests: pageno = None np = nd["/Page"] @@ -485,10 +528,12 @@ def _associate_dests_to_pages(self, pages): "Unresolved named destination '{}'".format(nd["/Title"]) ) - def _associate_bookmarks_to_pages(self, pages, bookmarks=None): + def _associate_bookmarks_to_pages( + self, pages: List[_MergedPage], bookmarks: Optional[Iterable[Bookmark]] = None + ) -> None: if bookmarks is None: - bookmarks = self.bookmarks - + bookmarks = self.bookmarks # type: ignore # TODO: self.bookmarks can be None! + assert bookmarks is not None, "hint for mypy" for b in bookmarks: if isinstance(b, list): self._associate_bookmarks_to_pages(pages, b) @@ -509,31 +554,38 @@ def _associate_bookmarks_to_pages(self, pages, bookmarks=None): else: raise ValueError("Unresolved bookmark '{}'".format(b["/Title"])) - def findBookmark(self, bookmark, root=None): + def findBookmark( + self, + bookmark: Dict[str, Any], + root: Optional[OutlinesType] = None, + ) -> Optional[List[int]]: if root is None: root = self.bookmarks for i, b in enumerate(root): if isinstance(b, list): - res = self.findBookmark(bookmark, b) + # b is still an inner node + # (OutlinesType, if recursive types were supported by mypy) + res = self.findBookmark(bookmark, b) # type: ignore if res: return [i] + res elif b == bookmark or b["/Title"] == bookmark: + # we found a leaf node return [i] return None def addBookmark( self, - title, - pagenum, - parent=None, - color=None, - bold=False, - italic=False, - fit="/Fit", - *args - ): + title: str, + pagenum: int, + parent: Union[None, TreeObject, IndirectObject] = None, + color: Optional[Tuple[float, float, float]] = None, + bold: bool = False, + italic: bool = False, + fit: str = "/Fit", + *args: ZoomArgType + ) -> IndirectObject: """ Add a bookmark to this PDF file. @@ -550,13 +602,14 @@ def addBookmark( """ if self.output is None: raise RuntimeError(ERR_CLOSED_WRITER) - if len(self.output.getObject(self.output._pages)["/Kids"]) > 0: - page_ref = self.output.getObject(self.output._pages)["/Kids"][pagenum] + out_pages = cast(Dict[str, Any], self.output.getObject(self.output._pages)) + if len(out_pages["/Kids"]) > 0: + page_ref = out_pages["/Kids"][pagenum] else: - page_ref = self.output.getObject(self.output._pages) + page_ref = out_pages action = DictionaryObject() - zoom_args: List[Union[NumberObject, NullObject]] = [] + zoom_args: ZoomArgsType = [] for a in args: if a is not None: zoom_args.append(NumberObject(a)) @@ -599,12 +652,13 @@ def addBookmark( bookmark.update({NameObject("/F"): NumberObject(format)}) bookmark_ref = self.output._addObject(bookmark) - parent = parent.getObject() + parent = cast(Bookmark, parent.getObject()) + assert parent is not None, "hint for mypy" parent.addChild(bookmark_ref, self.output) return bookmark_ref - def addNamedDestination(self, title, pagenum): + def addNamedDestination(self, title: str, pagenum: int) -> None: """ Add a destination to the output. diff --git a/PyPDF2/pagerange.py b/PyPDF2/pagerange.py index 9e819f0ff..09543ec43 100644 --- a/PyPDF2/pagerange.py +++ b/PyPDF2/pagerange.py @@ -8,7 +8,7 @@ """ import re -from typing import List, Tuple, Union +from typing import Any, List, Tuple, Union from PyPDF2.errors import ParseError @@ -48,7 +48,7 @@ class PageRange: """ - def __init__(self, arg): + def __init__(self, arg: Union[slice, "PageRange", str]) -> None: """ Initialize with either a slice -- giving the equivalent page range, or a PageRange object -- making a copy, @@ -84,17 +84,17 @@ def __init__(self, arg): __init__.__doc__ = __init__.__doc__.format(page_range_help=PAGE_RANGE_HELP) @staticmethod - def valid(input): + def valid(input: Any) -> bool: """True if input is a valid initializer for a PageRange.""" return isinstance(input, (slice, PageRange)) or ( isinstance(input, str) and bool(re.match(PAGE_RANGE_RE, input)) ) - def to_slice(self): + def to_slice(self) -> slice: """Return the slice equivalent of this page range.""" return self._slice - def __str__(self): + def __str__(self) -> str: """A string like "1:2:3".""" s = self._slice indices: Union[Tuple[int, int], Tuple[int, int, int]] @@ -107,18 +107,18 @@ def __str__(self): indices = s.start, s.stop, s.step return ":".join("" if i is None else str(i) for i in indices) - def __repr__(self): + def __repr__(self) -> str: """A string like "PageRange('1:2:3')".""" return "PageRange(" + repr(str(self)) + ")" - def indices(self, n): + def indices(self, n: int) -> Tuple[int, int, int]: """ n is the length of the list of pages to choose from. Returns arguments for range(). See help(slice.indices). """ return self._slice.indices(n) - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: if not isinstance(other, PageRange): return False return self._slice == other._slice @@ -127,7 +127,9 @@ def __eq__(self, other): PAGE_RANGE_ALL = PageRange(":") # The range of all pages. -def parse_filename_page_ranges(args): +def parse_filename_page_ranges( + args: List[Union[str, PageRange, None]] +) -> List[Tuple[str, PageRange]]: """ Given a list of filenames and page ranges, return a list of (filename, page_range) pairs. @@ -155,3 +157,6 @@ def parse_filename_page_ranges(args): pdf_filename = arg did_page_range = False return pairs + + +PageRangeSpec = Union[str, PageRange, Tuple[int, int], Tuple[int, int, int]] diff --git a/PyPDF2/types.py b/PyPDF2/types.py new file mode 100644 index 000000000..67f14f1c0 --- /dev/null +++ b/PyPDF2/types.py @@ -0,0 +1,58 @@ +"""Helpers for working with PDF types.""" + +from typing import List, Union + +try: + # Python 3.8+: https://peps.python.org/pep-0586 + from typing import Literal # type: ignore[attr-defined] +except ImportError: + from typing_extensions import Literal # type: ignore[misc] + +try: + # Python 3.10+: https://www.python.org/dev/peps/pep-0484/ + from typing import TypeAlias # type: ignore[attr-defined] +except ImportError: + from typing_extensions import TypeAlias # type: ignore[misc] + +from PyPDF2.generic import ( + ArrayObject, + Bookmark, + Destination, + NameObject, + NullObject, + NumberObject, +) + +BorderArrayType: TypeAlias = List[Union[NameObject, NumberObject, ArrayObject]] +BookmarkTypes: TypeAlias = Union[Bookmark, Destination] +FitType: TypeAlias = Literal[ + "/Fit", "/XYZ", "/FitH", "/FitV", "/FitR", "/FitB", "/FitBH", "/FitBV" +] +# Those go with the FitType: They specify values for the fit +ZoomArgType: TypeAlias = Union[NumberObject, NullObject] +ZoomArgsType: TypeAlias = List[ZoomArgType] + +# Recursive types are not yet supported by mypy: +# OutlinesType = List[Union[Destination, "OutlinesType"]] +# See https://github.com/python/mypy/issues/731 +# Hence use this for the moment: +OutlinesType = List[Union[Destination, List[Union[Destination, List[Destination]]]]] + + +LayoutType: TypeAlias = Literal[ + "/NoLayout", + "/SinglePage", + "/OneColumn", + "/TwoColumnLeft", + "/TwoColumnRight", + "/TwoPageLeft", + "/TwoPageRight", +] +PagemodeType: TypeAlias = Literal[ + "/UseNone", + "/UseOutlines", + "/UseThumbs", + "/FullScreen", + "/UseOC", + "/UseAttachments", +] diff --git a/PyPDF2/utils.py b/PyPDF2/utils.py index 7df3ad237..381c62055 100644 --- a/PyPDF2/utils.py +++ b/PyPDF2/utils.py @@ -31,15 +31,17 @@ __author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" - -from typing import Dict +from io import BufferedReader, BufferedWriter, BytesIO, FileIO +from typing import Any, Dict, List, Optional, Union, overload from PyPDF2.errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError bytes_type = type(bytes()) # Works the same in Python 2.X and 3.X +StreamType = Union[BytesIO, BufferedReader, BufferedWriter, FileIO] +StrByteType = Union[str, StreamType] -def readUntilWhitespace(stream, maxchars=None): +def readUntilWhitespace(stream: StreamType, maxchars: Optional[int] = None) -> bytes: """ Reads non-whitespace characters and returns them. Stops upon encountering whitespace or when maxchars is reached. @@ -55,7 +57,7 @@ def readUntilWhitespace(stream, maxchars=None): return txt -def readNonWhitespace(stream): +def readNonWhitespace(stream: StreamType) -> bytes: """ Finds and reads the next non-whitespace character (ignores whitespace). """ @@ -65,7 +67,7 @@ def readNonWhitespace(stream): return tok -def skipOverWhitespace(stream): +def skipOverWhitespace(stream: StreamType) -> bool: """ Similar to readNonWhitespace, but returns a Boolean if more than one whitespace character was read. @@ -78,7 +80,7 @@ def skipOverWhitespace(stream): return cnt > 1 -def skipOverComment(stream): +def skipOverComment(stream: StreamType) -> None: tok = stream.read(1) stream.seek(-1, 1) if tok == b_("%"): @@ -86,11 +88,12 @@ def skipOverComment(stream): tok = stream.read(1) -def readUntilRegex(stream, regex, ignore_eof=False): +def readUntilRegex(stream: StreamType, regex: Any, ignore_eof: bool = False) -> bytes: """ Reads until the regular expression pattern matched (ignore the match) :raises PdfStreamError: on premature end-of-file :param bool ignore_eof: If true, ignore end-of-line and return immediately + :param regex: re.Pattern """ name = b_("") while True: @@ -110,31 +113,7 @@ def readUntilRegex(stream, regex, ignore_eof=False): return name -class ConvertFunctionsToVirtualList: - def __init__(self, lengthFunction, getFunction): - self.lengthFunction = lengthFunction - self.getFunction = getFunction - - def __len__(self): - return self.lengthFunction() - - def __getitem__(self, index): - if isinstance(index, slice): - indices = range(*index.indices(len(self))) - cls = type(self) - return cls(indices.__len__, lambda idx: self[indices[idx]]) - if not isinstance(index, int): - raise TypeError("sequence indices must be integers") - len_self = len(self) - if index < 0: - # support negative indexes - index = len_self + index - if index < 0 or index >= len_self: - raise IndexError("sequence index out of range") - return self.getFunction(index) - - -def RC4_encrypt(key, plaintext): +def RC4_encrypt(key: Union[str, bytes], plaintext: bytes) -> bytes: S = list(range(256)) j = 0 for i in range(256): @@ -151,14 +130,14 @@ def RC4_encrypt(key, plaintext): return b_("").join(retval) -def matrixMultiply(a, b): +def matrixMultiply(a: List[List[float]], b: List[List[float]]) -> List[List[float]]: return [ [sum(float(i) * float(j) for i, j in zip(row, col)) for col in zip(*b)] for row in a ] -def markLocation(stream): +def markLocation(stream: StreamType) -> None: """Creates text file showing current location in context.""" # Mainly for debugging radius = 5000 @@ -170,14 +149,14 @@ def markLocation(stream): stream.seek(-radius, 1) -B_CACHE = {} # type: Dict[str, bytes] +B_CACHE: Dict[Union[str, bytes], bytes] = {} -def b_(s): +def b_(s: Union[str, bytes]) -> bytes: bc = B_CACHE if s in bc: return bc[s] - if type(s) == bytes: + if isinstance(s, bytes): return s else: try: @@ -192,35 +171,61 @@ def b_(s): return r -def str_(b): - if type(b) == bytes: +@overload +def str_(b: str) -> str: + ... + + +@overload +def str_(b: bytes) -> str: + ... + + +def str_(b: Union[str, bytes]) -> str: + if isinstance(b, bytes): return b.decode("latin-1") else: return b -def ord_(b): - if type(b) == str: +@overload +def ord_(b: str) -> int: + ... + + +@overload +def ord_(b: bytes) -> bytes: + ... + + +@overload +def ord_(b: int) -> int: + ... + + +def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]: + if isinstance(b, str): return ord(b) else: return b -def hexencode(b): +def hexencode(b: bytes) -> bytes: import codecs coder = codecs.getencoder("hex_codec") - return coder(b)[0] + coded = coder(b) # type: ignore + return coded[0] -def hexStr(num): +def hexStr(num: int) -> str: return hex(num).replace("L", "") WHITESPACES = [b_(x) for x in [" ", "\n", "\r", "\t", "\x00"]] -def paethPredictor(left, up, up_left): +def paethPredictor(left: int, up: int, up_left: int) -> int: p = left + up - up_left dist_left = abs(p - left) dist_up = abs(p - up) diff --git a/PyPDF2/xmp.py b/PyPDF2/xmp.py index fddaf8fc4..0add70fd7 100644 --- a/PyPDF2/xmp.py +++ b/PyPDF2/xmp.py @@ -1,9 +1,13 @@ import datetime import decimal import re +from typing import Any, Callable, Dict, Optional, TypeVar, Union +from xml.dom.minidom import Document +from xml.dom.minidom import Element as XmlElement from xml.dom.minidom import parseString -from .generic import PdfObject +from PyPDF2.generic import PdfObject +from PyPDF2.utils import StreamType RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" DC_NAMESPACE = "http://purl.org/dc/elements/1.1/" @@ -53,8 +57,42 @@ ) -def _getter_bag(namespace, name, converter): - def get(self): +K = TypeVar("K") + + +def _identity(value: K) -> K: + return value + + +def _converter_date(value: str) -> datetime.datetime: + matches = iso8601.match(value) + if matches is None: + raise ValueError("Invalid date format: %s" % value) + year = int(matches.group("year")) + month = int(matches.group("month") or "1") + day = int(matches.group("day") or "1") + hour = int(matches.group("hour") or "0") + minute = int(matches.group("minute") or "0") + second = decimal.Decimal(matches.group("second") or "0") + seconds_dec = second.to_integral(decimal.ROUND_FLOOR) + milliseconds_dec = (second - seconds_dec) * 1000000 + + seconds = int(seconds_dec) + milliseconds = int(milliseconds_dec) + + tzd = matches.group("tzd") or "Z" + dt = datetime.datetime(year, month, day, hour, minute, seconds, milliseconds) + if tzd != "Z": + tzd_hours, tzd_minutes = (int(x) for x in tzd.split(":")) + tzd_hours *= -1 + if tzd_hours < 0: + tzd_minutes *= -1 + dt = dt + datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes) + return dt + + +def _getter_bag(namespace: str, name: str) -> Optional[Any]: + def get(self: Any) -> Optional[Any]: cached = self.cache.get(namespace, {}).get(name) if cached: return cached @@ -65,7 +103,6 @@ def get(self): for bag in bags: for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"): value = self._getText(item) - value = converter(value) retval.append(value) ns_cache = self.cache.setdefault(namespace, {}) ns_cache[name] = retval @@ -74,8 +111,10 @@ def get(self): return get -def _getter_seq(namespace, name, converter): - def get(self): +def _getter_seq( + namespace: str, name: str, converter: Callable[[Any], Any] = _identity +) -> Optional[Any]: + def get(self: Any) -> Optional[Any]: cached = self.cache.get(namespace, {}).get(name) if cached: return cached @@ -98,8 +137,8 @@ def get(self): return get -def _getter_langalt(namespace, name, converter): - def get(self): +def _getter_langalt(namespace: str, name: str) -> Optional[Any]: + def get(self: Any) -> Optional[Any]: cached = self.cache.get(namespace, {}).get(name) if cached: return cached @@ -110,10 +149,9 @@ def get(self): for alt in alts: for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"): value = self._getText(item) - value = converter(value) retval[item.getAttribute("xml:lang")] = value else: - retval["x-default"] = converter(self._getText(element)) + retval["x-default"] = self._getText(element) ns_cache = self.cache.setdefault(namespace, {}) ns_cache[name] = retval return retval @@ -121,8 +159,10 @@ def get(self): return get -def _getter_single(namespace, name, converter): - def get(self): +def _getter_single( + namespace: str, name: str, converter: Callable[[str], Any] = _identity +) -> Optional[Any]: + def get(self: Any) -> Optional[Any]: cached = self.cache.get(namespace, {}).get(name) if cached: return cached @@ -148,16 +188,22 @@ class XmpInformation(PdfObject): Usually accessed by :meth:`getXmpMetadata()` """ - def __init__(self, stream): - self.stream = stream - doc_root = parseString(self.stream.getData()) - self.rdfRoot = doc_root.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")[0] - self.cache = {} + from PyPDF2.generic import ContentStream - def writeToStream(self, stream, encryption_key): + def __init__(self, stream: ContentStream) -> None: + self.stream = stream + doc_root: Document = parseString(self.stream.getData()) + self.rdfRoot: XmlElement = doc_root.getElementsByTagNameNS( + RDF_NAMESPACE, "RDF" + )[0] + self.cache: Dict[Any, Any] = {} + + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: self.stream.writeToStream(stream, encryption_key) - def getElement(self, aboutUri, namespace, name): + def getElement(self, aboutUri: str, namespace: str, name: str) -> Any: for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"): if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri: attr = desc.getAttributeNodeNS(namespace, name) @@ -165,7 +211,7 @@ def getElement(self, aboutUri, namespace, name): yield attr yield from desc.getElementsByTagNameNS(namespace, name) - def getNodesInNamespace(self, aboutUri, namespace): + def getNodesInNamespace(self, aboutUri: str, namespace: str) -> Any: for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"): if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri: for i in range(desc.attributes.length): @@ -176,57 +222,25 @@ def getNodesInNamespace(self, aboutUri, namespace): if child.namespaceURI == namespace: yield child - def _getText(self, element): + def _getText(self, element: XmlElement) -> str: text = "" for child in element.childNodes: if child.nodeType == child.TEXT_NODE: text += child.data return text - def _converter_string(value): - return value - - @staticmethod - def _converter_date(value): - matches = iso8601.match(value) - if matches is None: - raise ValueError("Invalid date format: %s" % value) - year = int(matches.group("year")) - month = int(matches.group("month") or "1") - day = int(matches.group("day") or "1") - hour = int(matches.group("hour") or "0") - minute = int(matches.group("minute") or "0") - second = decimal.Decimal(matches.group("second") or "0") - seconds_dec = second.to_integral(decimal.ROUND_FLOOR) - milliseconds_dec = (second - seconds_dec) * 1000000 - - seconds = int(seconds_dec) - milliseconds = int(milliseconds_dec) - - tzd = matches.group("tzd") or "Z" - dt = datetime.datetime(year, month, day, hour, minute, seconds, milliseconds) - if tzd != "Z": - tzd_hours, tzd_minutes = (int(x) for x in tzd.split(":")) - tzd_hours *= -1 - if tzd_hours < 0: - tzd_minutes *= -1 - dt = dt + datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes) - return dt - - dc_contributor = property( - _getter_bag(DC_NAMESPACE, "contributor", _converter_string) - ) + dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor")) """ Contributors to the resource (other than the authors). An unsorted array of names. """ - dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage", _converter_string)) + dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage")) """ Text describing the extent or scope of the resource. """ - dc_creator = property(_getter_seq(DC_NAMESPACE, "creator", _converter_string)) + dc_creator = property(_getter_seq(DC_NAMESPACE, "creator")) """ A sorted array of names of the authors of the resource, listed in order of precedence. @@ -238,86 +252,76 @@ def _converter_date(value): the resource. The dates and times are in UTC. """ - dc_description = property( - _getter_langalt(DC_NAMESPACE, "description", _converter_string) - ) + dc_description = property(_getter_langalt(DC_NAMESPACE, "description")) """ A language-keyed dictionary of textual descriptions of the content of the resource. """ - dc_format = property(_getter_single(DC_NAMESPACE, "format", _converter_string)) + dc_format = property(_getter_single(DC_NAMESPACE, "format")) """ The mime-type of the resource. """ - dc_identifier = property( - _getter_single(DC_NAMESPACE, "identifier", _converter_string) - ) + dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier")) """ Unique identifier of the resource. """ - dc_language = property(_getter_bag(DC_NAMESPACE, "language", _converter_string)) + dc_language = property(_getter_bag(DC_NAMESPACE, "language")) """ An unordered array specifying the languages used in the resource. """ - dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher", _converter_string)) + dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher")) """ An unordered array of publisher names. """ - dc_relation = property(_getter_bag(DC_NAMESPACE, "relation", _converter_string)) + dc_relation = property(_getter_bag(DC_NAMESPACE, "relation")) """ An unordered array of text descriptions of relationships to other documents. """ - dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights", _converter_string)) + dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights")) """ A language-keyed dictionary of textual descriptions of the rights the user has to this resource. """ - dc_source = property(_getter_single(DC_NAMESPACE, "source", _converter_string)) + dc_source = property(_getter_single(DC_NAMESPACE, "source")) """ Unique identifier of the work from which this resource was derived. """ - dc_subject = property(_getter_bag(DC_NAMESPACE, "subject", _converter_string)) + dc_subject = property(_getter_bag(DC_NAMESPACE, "subject")) """ An unordered array of descriptive phrases or keywrods that specify the topic of the content of the resource. """ - dc_title = property(_getter_langalt(DC_NAMESPACE, "title", _converter_string)) + dc_title = property(_getter_langalt(DC_NAMESPACE, "title")) """ A language-keyed dictionary of the title of the resource. """ - dc_type = property(_getter_bag(DC_NAMESPACE, "type", _converter_string)) + dc_type = property(_getter_bag(DC_NAMESPACE, "type")) """ An unordered array of textual descriptions of the document type. """ - pdf_keywords = property( - _getter_single(PDF_NAMESPACE, "Keywords", _converter_string) - ) + pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords")) """ An unformatted text string representing document keywords. """ - pdf_pdfversion = property( - _getter_single(PDF_NAMESPACE, "PDFVersion", _converter_string) - ) + pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion")) """ The PDF file version, for example 1.0, 1.3. """ - pdf_producer = property( - _getter_single(PDF_NAMESPACE, "Producer", _converter_string) - ) + pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer")) """ The name of the tool that created the PDF document. """ @@ -347,30 +351,24 @@ def _converter_date(value): object. """ - xmp_creatorTool = property( - _getter_single(XMP_NAMESPACE, "CreatorTool", _converter_string) - ) + xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool")) """ The name of the first known tool used to create the resource. """ - xmpmm_documentId = property( - _getter_single(XMPMM_NAMESPACE, "DocumentID", _converter_string) - ) + xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID")) """ The common identifier for all versions and renditions of this resource. """ - xmpmm_instanceId = property( - _getter_single(XMPMM_NAMESPACE, "InstanceID", _converter_string) - ) + xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID")) """ An identifier for a specific incarnation of a document, updated each time a file is saved. """ @property - def custom_properties(self): + def custom_properties(self) -> Dict[Any, Any]: """ Retrieves custom metadata properties defined in the undocumented pdfx metadata schema. diff --git a/requirements/ci.in b/requirements/ci.in index 5b4188a30..421b1287d 100644 --- a/requirements/ci.in +++ b/requirements/ci.in @@ -2,8 +2,9 @@ coverage flake8 flake8_implicit_str_concat flake8-bugbear +mypy pillow -types-Pillow pytest pytest-benchmark -mypy +typeguard +types-Pillow diff --git a/requirements/ci.txt b/requirements/ci.txt index bf9c2f73c..5a63727fe 100644 --- a/requirements/ci.txt +++ b/requirements/ci.txt @@ -62,6 +62,8 @@ tomli==1.2.3 # pytest typed-ast==1.5.3 # via mypy +typeguard==2.13.3 + # via -r requirements/ci.in types-pillow==9.0.14 # via -r requirements/ci.in typing-extensions==4.1.1 diff --git a/PDF_Samples/AutoCad_Simple.pdf b/resources/AutoCad_Simple.pdf similarity index 100% rename from PDF_Samples/AutoCad_Simple.pdf rename to resources/AutoCad_Simple.pdf diff --git a/setup.cfg b/setup.cfg index da7184e43..b6cd4366c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,4 +35,4 @@ classifiers = packages = PyPDF2 python_requires = >=3.6 install_requires = - typing_extensions; python_version < '3.8' + typing_extensions; python_version < '3.10' diff --git a/setup.py b/setup.py index 65f4cf3f7..10dbb1448 100644 --- a/setup.py +++ b/setup.py @@ -5,8 +5,8 @@ from setuptools import setup VERSIONFILE = "PyPDF2/_version.py" -with open(VERSIONFILE) as fp: - verstrline = fp.read() +with open(VERSIONFILE) as fh: + verstrline = fh.read() VSRE = r"^__version__ = ['\"]([^'\"]*)['\"]" mo = re.search(VSRE, verstrline, re.M) if mo: diff --git a/tests/test_basic_features.py b/tests/test_basic_features.py index 5895331e8..76c7151ea 100644 --- a/tests/test_basic_features.py +++ b/tests/test_basic_features.py @@ -63,5 +63,5 @@ def test_basic_features(): def test_convertToInt(): with pytest.raises(PdfReadError) as exc: - convertToInt(256, 16) + convertToInt(b"256", 16) assert exc.value.args[0] == "invalid size in convertToInt" diff --git a/tests/test_filters.py b/tests/test_filters.py index ce2d3476d..db9f8078a 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -35,7 +35,7 @@ def test_FlateDecode(predictor, s): codec = FlateDecode() s = s.encode() encoded = codec.encode(s) - assert codec.decode(encoded, {"/Predictor": predictor}) == s + assert codec.decode(encoded, DictionaryObject({"/Predictor": predictor})) == s def test_FlateDecode_unsupported_predictor(): @@ -50,7 +50,7 @@ def test_FlateDecode_unsupported_predictor(): for predictor, s in cartesian_product(predictors, filter_inputs): s = s.encode() with pytest.raises(PdfReadError): - codec.decode(codec.encode(s), {"/Predictor": predictor}) + codec.decode(codec.encode(s), DictionaryObject({"/Predictor": predictor})) @pytest.mark.parametrize( diff --git a/tests/test_generic.py b/tests/test_generic.py index 4ced67a00..0124084af 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -16,7 +16,6 @@ NameObject, NullObject, NumberObject, - PdfObject, RectangleObject, TextStringObject, createStringObject, @@ -39,7 +38,12 @@ def test_number_object_exception(): def test_createStringObject_exception(): with pytest.raises(TypeError) as exc: createStringObject(123) - assert exc.value.args[0] == "createStringObject should have str or unicode arg" + assert ( # typeguard is not running + exc.value.args[0] == "createStringObject should have str or unicode arg" + ) or ( # typeguard is enabled + 'type of argument "string" must be one of (str, bytes); got int instead' + in exc.value.args[0] + ) @pytest.mark.parametrize( @@ -142,7 +146,7 @@ def test_NameObject(): def test_destination_fit_r(): d = Destination( NameObject("title"), - PdfObject(), + NullObject(), NameObject(TF.FIT_R), FloatObject(0), FloatObject(0), @@ -161,22 +165,24 @@ def test_destination_fit_r(): def test_destination_fit_v(): - Destination(NameObject("title"), PdfObject(), NameObject(TF.FIT_V), FloatObject(0)) + Destination(NameObject("title"), NullObject(), NameObject(TF.FIT_V), FloatObject(0)) def test_destination_exception(): with pytest.raises(PdfReadError): - Destination(NameObject("title"), PdfObject(), NameObject("foo"), FloatObject(0)) + Destination( + NameObject("title"), NullObject(), NameObject("foo"), FloatObject(0) + ) def test_bookmark_write_to_stream(): stream = BytesIO() bm = Bookmark( - NameObject("title"), NameObject(), NameObject(TF.FIT_V), FloatObject(0) + NameObject("title"), NullObject(), NameObject(TF.FIT_V), FloatObject(0) ) bm.writeToStream(stream, None) stream.seek(0, 0) - assert stream.read() == b"<<\n/Title title\n/Dest [ /FitV 0 ]\n>>" + assert stream.read() == b"<<\n/Title title\n/Dest [ null /FitV 0 ]\n>>" def test_encode_pdfdocencoding_keyerror(): @@ -325,7 +331,6 @@ class tst: # to replace pdf assert shouldFail ^ (exc.value.args[0] == "__ALLGOOD__") - def test_RectangleObject(): ro = RectangleObject((1, 2, 3, 4)) assert ro.lowerLeft == (1, 2) diff --git a/tests/test_merger.py b/tests/test_merger.py index 450ccd829..19d1ea6d8 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -31,7 +31,7 @@ def test_merge(): file_merger.append(pdfr) # PdfFileReader object: - file_merger.append(PyPDF2.PdfFileReader(pdf_path, "rb"), bookmark=True) + file_merger.append(PyPDF2.PdfFileReader(pdf_path), bookmark="foo") # File handle with open(pdf_path, "rb") as fh: @@ -61,7 +61,7 @@ def test_merge(): "Foo", "Bar", "Baz", - "True", + "foo", ] # Clean up diff --git a/tests/test_page.py b/tests/test_page.py index d6543885a..780a45e7e 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -4,6 +4,7 @@ import pytest from PyPDF2 import PdfFileReader +from PyPDF2._page import PageObject from PyPDF2.generic import RectangleObject TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) @@ -63,10 +64,8 @@ def test_page_operations(pdf_path, password): if password: reader.decrypt(password) - page = reader.pages[0] - page.mergeRotatedScaledPage(page, 90, 1, 1) - page.mergeScaledTranslatedPage(page, 1, 1, 1) - page.mergeRotatedScaledTranslatedPage(page, 90, 1, 1, 1, 1) + page: PageObject = reader.pages[0] + page.mergeRotatedScaledTranslatedPage(page, 90, scale=1, tx=1, ty=1, expand=True) page.addTransformation([1, 0, 0, 0, 0, 0]) page.scale(2, 2) page.scaleBy(0.5) @@ -75,6 +74,21 @@ def test_page_operations(pdf_path, password): page.extractText() +def test_page_transformations(): + pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf") + reader = PdfFileReader(pdf_path) + + page: PageObject = reader.pages[0] + page.mergeRotatedPage(page, 90, expand=True) + page.mergeRotatedScaledPage(page, 90, 1, expand=True) + page.mergeRotatedScaledTranslatedPage(page, 90, scale=1, tx=1, ty=1, expand=True) + page.mergeRotatedTranslatedPage(page, 90, 100, 100, expand=False) + page.mergeScaledPage(page, 2, expand=False) + page.mergeScaledTranslatedPage(page, 1, 1, 1) + page.mergeTranslatedPage(page, 100, 100, expand=False) + page.addTransformation([1, 0, 0, 0, 0, 0]) + + @pytest.mark.parametrize( ("pdf_path", "password"), [ diff --git a/tests/test_reader.py b/tests/test_reader.py index d2cdcee55..a48491663 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -12,8 +12,6 @@ from PyPDF2.errors import PdfReadError from PyPDF2.filters import _xobj_to_image -StreamIO = BytesIO - TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) PROJECT_ROOT = os.path.dirname(TESTS_ROOT) RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "resources") @@ -332,9 +330,7 @@ def test_get_page_number(src, page_nb): @pytest.mark.parametrize( ("src", "expected"), - [ - ("form.pdf", None), - ], + [("form.pdf", None), ("AutoCad_Simple.pdf", "/SinglePage")], ) def test_get_page_layout(src, expected): src = os.path.join(RESOURCE_ROOT, src) @@ -495,7 +491,7 @@ def test_read_encrypted_without_decryption(): assert exc.value.args[0] == "File has not been decrypted" -def test_get_destination_age_number(): +def test_get_destination_page_number(): src = os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf") reader = PdfFileReader(src) outlines = reader.getOutlines() @@ -508,7 +504,7 @@ def test_do_not_get_stuck_on_large_files_without_start_xref(): """Tests for the absence of a DoS bug, where a large file without an startxref mark would cause the library to hang for minutes to hours""" start_time = time.time() - broken_stream = StreamIO(b"\0" * 5 * 1000 * 1000) + broken_stream = BytesIO(b"\0" * 5 * 1000 * 1000) with pytest.raises(PdfReadError): PdfFileReader(broken_stream) parse_duration = time.time() - start_time @@ -599,3 +595,11 @@ def test_decode_permissions(): modify = base.copy() modify["modify"] = True assert reader.decode_permissions(8) == modify + + +def test_VirtualList(): + pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf") + reader = PdfFileReader(pdf_path) + + # Test if getting as slice throws an error + assert len(reader.pages[:]) == 1 diff --git a/tests/test_utils.py b/tests/test_utils.py index b85eb6e88..8c4feb7e8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -4,7 +4,6 @@ import pytest import PyPDF2.utils -from PyPDF2 import PdfFileReader from PyPDF2.errors import PdfStreamError TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) @@ -79,14 +78,6 @@ def test_markLocation(): os.remove("PyPDF2_pdfLocation.txt") # cleanup -def test_ConvertFunctionsToVirtualList(): - pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf") - reader = PdfFileReader(pdf_path) - - # Test if getting as slice throws an error - assert len(reader.pages[:]) == 1 - - def test_hexStr(): assert PyPDF2.utils.hexStr(10) == "0xa" diff --git a/tests/test_writer.py b/tests/test_writer.py index e0a864f2e..32e9a0b09 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -48,11 +48,11 @@ def test_writer_operations(): writer.addURI(2, "https://example.com", RectangleObject([0, 0, 100, 100])) writer.addLink(2, 1, RectangleObject([0, 0, 100, 100])) assert writer.getPageLayout() is None - writer.setPageLayout("SinglePage") - assert writer.getPageLayout() == "SinglePage" + writer.setPageLayout("/SinglePage") + assert writer.getPageLayout() == "/SinglePage" assert writer.getPageMode() is None - writer.setPageMode("UseNone") - assert writer.getPageMode() == "UseNone" + writer.setPageMode("/UseNone") + assert writer.getPageMode() == "/UseNone" writer.insertBlankPage(width=100, height=100) writer.insertBlankPage() # without parameters @@ -359,6 +359,7 @@ def test_regression_issue670(): with open("dont_commit_issue670.pdf", "wb") as f_pdf: pdf_writer.write(f_pdf) + def test_issue301(): """ Test with invalid stream length object diff --git a/tests/test_xmp.py b/tests/test_xmp.py index f12a07787..7017a45b2 100644 --- a/tests/test_xmp.py +++ b/tests/test_xmp.py @@ -45,8 +45,7 @@ def get_all_tiff(xmp): def test_regression_issue774(): - cls = PyPDF2.xmp.XmpInformation - date = cls._converter_date("2021-04-28T12:23:34.123Z") + date = PyPDF2.xmp._converter_date("2021-04-28T12:23:34.123Z") assert date.year == 2021 assert date.month == 4 assert date.day == 28