Skip to content

Commit

Permalink
ROB: Cope with corrupted entries in xref table (#1300)
Browse files Browse the repository at this point in the history
This robustness improvement handles PDF files in which an Xref table entry is corrupted but the corresponding object can still be located by searching the file contents for its object header.

Closes #1292
  • Loading branch information
pubpub-zz committed Aug 29, 2022
1 parent 3b74312 commit b5dce26
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 3 deletions.
33 changes: 30 additions & 3 deletions PyPDF2/_reader.py
Expand Up @@ -1385,10 +1385,37 @@ def _read_standard_xref_table(self, stream: StreamType) -> None:
if line[-1] in b"0123456789t":
stream.seek(-1, 1)

offset_b, generation_b = line[:16].split(b" ")
entry_type_b = line[17:18]
try:
offset_b, generation_b = line[:16].split(b" ")
entry_type_b = line[17:18]

offset, generation = int(offset_b), int(generation_b)
except Exception:
# if something went wrong while parsing the entry
if hasattr(stream, "getbuffer"):
buf = bytes(stream.getbuffer()) # type: ignore
else:
p = stream.tell()
stream.seek(0, 0)
buf = stream.read(-1)
stream.seek(p)

f = re.search(f"{num}\\s+(\\d+)\\s+obj".encode(), buf)
if f is None:
logger_warning(
f"entry {num} in Xref table invalid; object not found",
__name__,
)
generation = 65535
offset = -1
else:
logger_warning(
f"entry {num} in Xref table invalid but object found",
__name__,
)
generation = int(f.group(1))
offset = f.start()

offset, generation = int(offset_b), int(generation_b)
if generation not in self.xref:
self.xref[generation] = {}
self.xref_free_entry[generation] = {}
Expand Down
18 changes: 18 additions & 0 deletions tests/test_reader.py
Expand Up @@ -1094,3 +1094,21 @@ def test_wrong_password_error():
def test_get_page_number_by_indirect():
    """Smoke test: call ``_get_page_number_by_indirect(1)`` on crazyones.pdf.

    The return value is not inspected; the call only has to complete
    without raising.
    """
    sample_path = RESOURCE_ROOT / "crazyones.pdf"
    PdfReader(sample_path)._get_page_number_by_indirect(1)


def test_corrupted_xref_table():
    """Exercise text extraction on two downloaded PDFs from issue #1292.

    The first file must allow text extraction from page 0 without raising.
    For the second file, text extraction from page 0 is expected to raise;
    if it does not, the test fails.
    """
    # issue #1292
    first_url = "https://github.com/py-pdf/PyPDF2/files/9444747/BreezeManual.orig.pdf"
    first_name = "BreezeMan1.pdf"
    first_reader = PdfReader(BytesIO(get_pdf_from_url(first_url, name=first_name)))
    first_reader.pages[0].extract_text()

    second_url = "https://github.com/py-pdf/PyPDF2/files/9444748/BreezeManual.failed.pdf"
    second_name = "BreezeMan2.pdf"
    second_reader = PdfReader(BytesIO(get_pdf_from_url(second_url, name=second_name)))

    extraction_raised = False
    try:
        second_reader.pages[0].extract_text()
    except Exception:
        # An exception is the expected outcome for this file.
        extraction_raised = True

    if not extraction_raised:
        # NOTE(review): this fires when extraction unexpectedly succeeds, so
        # the message wording looks inverted — confirm the intended text.
        raise Exception("page 0 should not be corrupted")

0 comments on commit b5dce26

Please sign in to comment.