Skip to content

Commit

Permalink
ROB: Fix corrupted (wrongly) linear PDF (#1008)
Browse files Browse the repository at this point in the history
Fix: Rescan the whole PDF and update/rebuild the trailer

Closes #989
  • Loading branch information
pubpub-zz committed Jun 19, 2022
1 parent 953a11d commit f62e051
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 18 deletions.
17 changes: 6 additions & 11 deletions PyPDF2/_reader.py
Expand Up @@ -1448,17 +1448,12 @@ def _rebuild_xref_table(self, stream: StreamType) -> None:
if generation not in self.xref:
self.xref[generation] = {}
self.xref[generation][idnum] = m.start(1)
trailer_pos = f_.rfind(b"trailer") - len(f_) + 7
stream.seek(trailer_pos, 2)
# code below duplicated
read_non_whitespace(stream)
stream.seek(-1, 1)

# there might be something that is not a dict (see #856)
new_trailer = cast(Dict[Any, Any], read_object(stream, self))

for key, value in list(new_trailer.items()):
if key not in self.trailer:
stream.seek(0, 0)
for m in re.finditer(rb"[\r\n \t][ \t]*trailer[\r\n \t]*(<<)", f_):
stream.seek(m.start(1), 0)
new_trailer = cast(Dict[Any, Any], read_object(stream, self))
# The file is parsed from start to end, so entries from a later trailer must overwrite the existing ones.
for key, value in list(new_trailer.items()):
self.trailer[key] = value

def _read_xref_subsections(
Expand Down
23 changes: 16 additions & 7 deletions tests/test_workflows.py
Expand Up @@ -7,6 +7,7 @@

from PyPDF2 import PdfReader
from PyPDF2.constants import PageAttributes as PG
from PyPDF2.errors import PdfReadWarning

from . import get_pdf_from_url

Expand Down Expand Up @@ -167,15 +168,23 @@ def test_rotate_45():
"https://corpora.tika.apache.org/base/docs/govdocs1/971/971703.pdf",
[0, 1, 5, 8, 14],
),
( # faulty PDF: wrongly linearized and with 2 trailers, the second one containing /Root
True,
"https://corpora.tika.apache.org/base/docs/govdocs1/989/989691.pdf",
[0],
),
],
)
def test_extract_textbench(enable, url, pages, print_result=False):
if not enable:
return
reader = PdfReader(BytesIO(get_pdf_from_url(url, url.split("/")[-1])))
for page_number in pages:
if print_result:
print(f"**************** {url} / page {page_number} ****************")
rst = reader.pages[page_number].extract_text()
if print_result:
print(f"{rst}\n*****************************\n")
try:
reader = PdfReader(BytesIO(get_pdf_from_url(url, url.split("/")[-1])))
for page_number in pages:
if print_result:
print(f"**************** {url} / page {page_number} ****************")
rst = reader.pages[page_number].extract_text()
if print_result:
print(f"{rst}\n*****************************\n")
except PdfReadWarning:
pass

0 comments on commit f62e051

Please sign in to comment.