From f62e05150eab36cf70770d1e0711e29612e9e479 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 19 Jun 2022 09:36:59 +0200 Subject: [PATCH] ROB: Fix corrupted (wrongly linearized) PDF (#1008) Fix: Rescan the whole PDF and update/rebuild the trailer Closes #989 --- PyPDF2/_reader.py | 17 ++++++----------- tests/test_workflows.py | 23 ++++++++++++++++------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index 0a697e88e..669f2dbbd 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -1448,17 +1448,12 @@ def _rebuild_xref_table(self, stream: StreamType) -> None: if generation not in self.xref: self.xref[generation] = {} self.xref[generation][idnum] = m.start(1) - trailer_pos = f_.rfind(b"trailer") - len(f_) + 7 - stream.seek(trailer_pos, 2) - # code below duplicated - read_non_whitespace(stream) - stream.seek(-1, 1) - - # there might be something that is not a dict (see #856) - new_trailer = cast(Dict[Any, Any], read_object(stream, self)) - - for key, value in list(new_trailer.items()): - if key not in self.trailer: + stream.seek(0, 0) + for m in re.finditer(rb"[\r\n \t][ \t]*trailer[\r\n \t]*(<<)", f_): + stream.seek(m.start(1), 0) + new_trailer = cast(Dict[Any, Any], read_object(stream, self)) + # Here, we are parsing the file from start to end, the new data have to erase the existing. + for key, value in list(new_trailer.items()): self.trailer[key] = value def _read_xref_subsections( diff --git a/tests/test_workflows.py b/tests/test_workflows.py index c5f8ebf20..fcb8f7f52 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -7,6 +7,7 @@ from PyPDF2 import PdfReader from PyPDF2.constants import PageAttributes as PG +from PyPDF2.errors import PdfReadWarning from . 
import get_pdf_from_url @@ -167,15 +168,23 @@ def test_rotate_45(): "https://corpora.tika.apache.org/base/docs/govdocs1/971/971703.pdf", [0, 1, 5, 8, 14], ), + ( # faulty PDF, wrongly linearized and with 2 trailer, second with /Root + True, + "https://corpora.tika.apache.org/base/docs/govdocs1/989/989691.pdf", + [0], + ), ], ) def test_extract_textbench(enable, url, pages, print_result=False): if not enable: return - reader = PdfReader(BytesIO(get_pdf_from_url(url, url.split("/")[-1]))) - for page_number in pages: - if print_result: - print(f"**************** {url} / page {page_number} ****************") - rst = reader.pages[page_number].extract_text() - if print_result: - print(f"{rst}\n*****************************\n") + try: + reader = PdfReader(BytesIO(get_pdf_from_url(url, url.split("/")[-1]))) + for page_number in pages: + if print_result: + print(f"**************** {url} / page {page_number} ****************") + rst = reader.pages[page_number].extract_text() + if print_result: + print(f"{rst}\n*****************************\n") + except PdfReadWarning: + pass