From 17897d986a6c678ff55ecdef5ed26e59f9d9bc39 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 1 Dec 2022 23:17:54 +0100 Subject: [PATCH 1/3] ENH: Make PdfReader.get_object accept integer arguments (#1459) Also fix various type annotations --- PyPDF2/_reader.py | 45 ++++++++++++++++++++++++++++---------------- tests/test_reader.py | 41 +++++++++++++++++++++++++++++++++++++++- tests/test_writer.py | 1 + 3 files changed, 70 insertions(+), 17 deletions(-) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index 655278b8d..65586d7d5 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -863,14 +863,18 @@ def getDestinationPageNumber( def _build_destination( self, title: str, - array: List[Union[NumberObject, IndirectObject, NullObject, DictionaryObject]], + array: Optional[ + List[ + Union[NumberObject, IndirectObject, None, NullObject, DictionaryObject] + ] + ], ) -> Destination: page, typ = None, None # handle outline items with missing or invalid destination if ( - isinstance(array, (type(None), NullObject)) + isinstance(array, (NullObject, str)) or (isinstance(array, ArrayObject) and len(array) == 0) - or (isinstance(array, str)) + or array is None ): page = NullObject() @@ -898,7 +902,7 @@ def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]: # title required for valid outline # PDF Reference 1.7: TABLE 8.4 Entries in an outline item dictionary try: - title = node["/Title"] + title = cast("str", node["/Title"]) except KeyError: if self.strict: raise PdfReadError(f"Outline Entry Missing /Title attribute: {node!r}") @@ -918,9 +922,10 @@ def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]: dest = dest["/D"] if isinstance(dest, ArrayObject): - outline_item = self._build_destination(title, dest) # type: ignore + outline_item = self._build_destination(title, dest) elif isinstance(dest, str): # named destination, addresses NameObject Issue #193 + # TODO : keep named 
destination instead of replacing it ? try: outline_item = self._build_destination( title, self._namedDests[dest].dest_array @@ -928,13 +933,18 @@ def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]: except KeyError: # named destination not found in Name Dict outline_item = self._build_destination(title, None) - elif isinstance(dest, type(None)): + elif dest is None: # outline item not required to have destination or action # PDFv1.7 Table 153 - outline_item = self._build_destination(title, dest) # type: ignore + outline_item = self._build_destination(title, dest) else: if self.strict: raise PdfReadError(f"Unexpected destination {dest!r}") + else: + logger_warning( + f"Removed unexpected destination {dest!r} from destination", + __name__, + ) outline_item = self._build_destination(title, None) # type: ignore # if outline item created, add color, format, and child count if present @@ -950,7 +960,6 @@ def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]: # absolute value = num. 
visible children # positive = open/unfolded, negative = closed/folded outline_item[NameObject("/Count")] = node["/Count"] - return outline_item @property @@ -1154,7 +1163,18 @@ def _get_object_from_stream( raise PdfReadError("This is a fatal error in strict mode.") return NullObject() - def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]: + def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]: + """ + used to ease development + equivalent to generic.IndirectObject(num,gen,self).get_object() + """ + return IndirectObject(num, gen, self).get_object() + + def get_object( + self, indirect_reference: Union[int, IndirectObject] + ) -> Optional[PdfObject]: + if isinstance(indirect_reference, int): + indirect_reference = IndirectObject(indirect_reference, 0, self) retval = self.cache_get_indirect_object( indirect_reference.generation, indirect_reference.idnum ) @@ -1928,13 +1948,6 @@ def xfa(self) -> Optional[Dict[str, Any]]: retval[tag] = es return retval - def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]: - """ - used to ease development - equivalent to generic.IndirectObject(num,gen,self).get_object() - """ - return IndirectObject(num, gen, self).get_object() - class PdfFileReader(PdfReader): # pragma: no cover def __init__(self, *args: Any, **kwargs: Any) -> None: diff --git a/tests/test_reader.py b/tests/test_reader.py index 192825f16..0338d6eb2 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -17,7 +17,13 @@ PdfReadWarning, WrongPasswordError, ) -from PyPDF2.generic import Destination +from PyPDF2.generic import ( + Destination, + DictionaryObject, + NameObject, + NumberObject, + TextStringObject, +) from . 
import get_pdf_from_url, normalize_warnings @@ -755,6 +761,12 @@ def test_iss925(): annot.get_object() +def test_get_object(): + reader = PdfReader(RESOURCE_ROOT / "hello-world.pdf") + assert reader.get_object(22)["/Type"] == "/Catalog" + assert reader._get_indirect_object(22, 0)["/Type"] == "/Catalog" + + @pytest.mark.xfail(reason="#591") def test_extract_text_hello_world(): reader = PdfReader(RESOURCE_ROOT / "hello-world.pdf") @@ -1179,3 +1191,30 @@ def test_zeroing_xref(): name = "UTA_OSHA.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) len(reader.pages) + + +def test_build_outline_item(caplog): + url = "https://github.com/py-pdf/PyPDF2/files/9464742/shiv_resume.pdf" + name = "shiv_resume.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + outline = reader._build_outline_item( + DictionaryObject( + { + NameObject("/Title"): TextStringObject("Toto"), + NameObject("/Dest"): NumberObject(2), + } + ) + ) + assert "Removed unexpected destination 2 from destination" in caplog.text + assert outline["/Title"] == "Toto" + reader.strict = True + with pytest.raises(PdfReadError) as exc: + reader._build_outline_item( + DictionaryObject( + { + NameObject("/Title"): TextStringObject("Toto"), + NameObject("/Dest"): NumberObject(2), + } + ) + ) + assert "Unexpected destination 2" in exc.value.args[0] diff --git a/tests/test_writer.py b/tests/test_writer.py index 9ab514672..70adf3ba2 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -343,6 +343,7 @@ def test_write_metadata(): reader = PdfReader(pdf_path) writer = PdfWriter() + writer.add_page(reader.pages[0]) for page in reader.pages: writer.add_page(page) From 1a9f7d9434f9c68254d00f3c91a21d5fc6d1e22b Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 1 Dec 2022 23:21:32 +0100 Subject: [PATCH 2/3] MAINT: Add List of pages to PageRangeSpec (#1456) Applies to merge/append --- PyPDF2/_merger.py | 20 ++++++++++++++------ 
PyPDF2/pagerange.py | 2 +- tests/test_merger.py | 6 +++++- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/PyPDF2/_merger.py b/PyPDF2/_merger.py index 5a512ac68..acd1ead1b 100644 --- a/PyPDF2/_merger.py +++ b/PyPDF2/_merger.py @@ -155,6 +155,7 @@ def merge( or a ``(start, stop[, step])`` tuple to merge only the specified range of pages from the source document into the output document. + Can also be a list of pages to merge. :param bool import_outline: You may prevent the source document's outline (collection of outline items, previously referred to as @@ -174,6 +175,8 @@ def merge( pages = (0, len(reader.pages)) elif isinstance(pages, PageRange): pages = pages.indices(len(reader.pages)) + elif isinstance(pages, list): + pass elif not isinstance(pages, tuple): raise TypeError('"pages" must be a tuple of (start, stop[, step])') @@ -255,7 +258,9 @@ def append( self, fileobj: Union[StrByteType, PdfReader, Path], outline_item: Optional[str] = None, - pages: Union[None, PageRange, Tuple[int, int], Tuple[int, int, int]] = None, + pages: Union[ + None, PageRange, Tuple[int, int], Tuple[int, int, int], List[int] + ] = None, import_outline: bool = True, ) -> None: """ @@ -275,6 +280,7 @@ def append( or a ``(start, stop[, step])`` tuple to merge only the specified range of pages from the source document into the output document. + Can also be a list of pages to append. 
:param bool import_outline: You may prevent the source document's outline (collection of outline items, previously referred to as @@ -420,12 +426,13 @@ def _trim_dests( self, pdf: PdfReader, dests: Dict[str, Dict[str, Any]], - pages: Union[Tuple[int, int], Tuple[int, int, int]], + pages: Union[Tuple[int, int], Tuple[int, int, int], List[int]], ) -> List[Dict[str, Any]]: """Remove named destinations that are not a part of the specified page set.""" new_dests = [] + lst = pages if isinstance(pages, list) else list(range(*pages)) for key, obj in dests.items(): - for j in range(*pages): + for j in lst: if pdf.pages[j].get_object() == obj["/Page"].get_object(): obj[NameObject("/Page")] = obj["/Page"].get_object() assert str_(key) == str_(obj["/Title"]) @@ -437,21 +444,22 @@ def _trim_outline( self, pdf: PdfReader, outline: OutlineType, - pages: Union[Tuple[int, int], Tuple[int, int, int]], + pages: Union[Tuple[int, int], Tuple[int, int, int], List[int]], ) -> OutlineType: """Remove outline item entries that are not a part of the specified page set.""" new_outline = [] prev_header_added = True + lst = pages if isinstance(pages, list) else list(range(*pages)) for i, outline_item in enumerate(outline): if isinstance(outline_item, list): - sub = self._trim_outline(pdf, outline_item, pages) # type: ignore + sub = self._trim_outline(pdf, outline_item, lst) # type: ignore if sub: if not prev_header_added: new_outline.append(outline[i - 1]) new_outline.append(sub) # type: ignore else: prev_header_added = False - for j in range(*pages): + for j in lst: if outline_item["/Page"] is None: continue if pdf.pages[j].get_object() == outline_item["/Page"].get_object(): diff --git a/PyPDF2/pagerange.py b/PyPDF2/pagerange.py index b22541159..f009adc19 100644 --- a/PyPDF2/pagerange.py +++ b/PyPDF2/pagerange.py @@ -170,4 +170,4 @@ def parse_filename_page_ranges( return pairs -PageRangeSpec = Union[str, PageRange, Tuple[int, int], Tuple[int, int, int]] +PageRangeSpec = Union[str, PageRange, 
Tuple[int, int], Tuple[int, int, int], List[int]] diff --git a/tests/test_merger.py b/tests/test_merger.py index 6170bbba4..9cd1fb127 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -45,7 +45,11 @@ def merger_operate(merger): merger.append(reader) # PdfReader object: - merger.append(PyPDF2.PdfReader(pdf_path), outline_item="foo") + r = PyPDF2.PdfReader(pdf_path) + merger.append(r, outline_item="foo", pages=list(range(len(r.pages)))) + + # PdfReader object with List: + # merger.append(PyPDF2.PdfReader(pdf_path), outline_item="foo") # File handle with open(pdf_path, "rb") as fh: From 3e250c5b025bb82407ad10f3db9906f784b3f42e Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 2 Dec 2022 09:24:17 +0100 Subject: [PATCH 3/3] ENH: Add 'threads' property to PdfWriter (#1458) This currently returns only an empty list. Pending PR #1371 --- PyPDF2/_reader.py | 14 ++++++++++++++ PyPDF2/_writer.py | 23 +++++++++++++++++++++++ PyPDF2/constants.py | 1 + tests/test_reader.py | 13 +++++++++++++ tests/test_writer.py | 10 ++++++++++ 5 files changed, 61 insertions(+) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index 65586d7d5..ab34c7a9e 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -800,6 +800,20 @@ def getOutlines( deprecate_with_replacement("getOutlines", "outline") return self._get_outline(node, outline) + @property + def threads(self) -> Optional[ArrayObject]: + """ + Read-only property for the list of threads see §8.3.2 from PDF 1.7 spec + + :return: an Array of Dictionaries with "/F" and "/I" properties + or None if no articles. 
+ """ + catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) + if CO.THREADS in catalog: + return cast("ArrayObject", catalog[CO.THREADS]) + else: + return None + def _get_page_number_by_indirect( self, indirect_ref: Union[None, int, NullObject, IndirectObject] ) -> int: diff --git a/PyPDF2/_writer.py b/PyPDF2/_writer.py index 12539900d..3ad538da9 100644 --- a/PyPDF2/_writer.py +++ b/PyPDF2/_writer.py @@ -1098,6 +1098,29 @@ def get_outline_root(self) -> TreeObject: return outline + def get_threads_root(self) -> ArrayObject: + """ + the list of threads see §8.3.2 from PDF 1.7 spec + + :return: an Array (possibly empty) of Dictionaries with "/F" and "/I" properties + """ + if CO.THREADS in self._root_object: + # TABLE 3.25 Entries in the catalog dictionary + threads = cast(ArrayObject, self._root_object[CO.THREADS]) + else: + threads = ArrayObject() + self._root_object[NameObject(CO.THREADS)] = threads + return threads + + @property + def threads(self) -> ArrayObject: + """ + Read-only property for the list of threads see §8.3.2 from PDF 1.7 spec + + :return: an Array (possibly empty) of Dictionaries with "/F" and "/I" properties + """ + return self.get_threads_root() + def getOutlineRoot(self) -> TreeObject: # pragma: no cover """ .. 
deprecated:: 1.28.0 diff --git a/PyPDF2/constants.py b/PyPDF2/constants.py index f8d3faf8f..a2f8c49ed 100644 --- a/PyPDF2/constants.py +++ b/PyPDF2/constants.py @@ -16,6 +16,7 @@ class Core: """Keywords that don't quite belong anywhere else.""" OUTLINES = "/Outlines" + THREADS = "/Threads" PAGE = "/Page" PAGES = "/Pages" CATALOG = "/Catalog" diff --git a/tests/test_reader.py b/tests/test_reader.py index 0338d6eb2..8e109cfbd 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -18,6 +18,7 @@ WrongPasswordError, ) from PyPDF2.generic import ( + ArrayObject, Destination, DictionaryObject, NameObject, @@ -1193,6 +1194,18 @@ def test_zeroing_xref(): len(reader.pages) +def test_thread(): + url = "https://github.com/py-pdf/PyPDF2/files/9066120/UTA_OSHA_3115_Fall_Protection_Training_09162021_.pdf" + name = "UTA_OSHA.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + assert reader.threads is None + url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" + name = "tika-924666.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + assert isinstance(reader.threads, ArrayObject) + assert len(reader.threads) >= 1 + + def test_build_outline_item(caplog): url = "https://github.com/py-pdf/PyPDF2/files/9464742/shiv_resume.pdf" name = "shiv_resume.pdf" diff --git a/tests/test_writer.py b/tests/test_writer.py index 70adf3ba2..fa8638b3d 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -7,6 +7,7 @@ from PyPDF2 import PageObject, PdfMerger, PdfReader, PdfWriter from PyPDF2.errors import PageSizeNotDefinedError from PyPDF2.generic import ( + ArrayObject, IndirectObject, NameObject, NumberObject, @@ -856,3 +857,12 @@ def test_startup_dest(): pdf_file_writer.open_destination = None assert "/OpenAction" not in pdf_file_writer._root_object pdf_file_writer.open_destination = None + + +def test_threads_empty(): + writer = PdfWriter() + thr = writer.threads + assert isinstance(thr, ArrayObject) + assert len(thr) == 0 
+ thr2 = writer.threads + assert thr == thr2