Skip to content

Commit

Permalink
ROB: Capture UnicodeDecodeError at PdfReader.pdf_header (#1768)
Browse files: browse the repository at this point in the history
Fixes #1758
  • Loading branch information
pubpub-zz committed Apr 6, 2023
1 parent b385ce9 commit 8146729
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 12 deletions.
28 changes: 17 additions & 11 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
import struct
import zlib
from datetime import datetime
from io import BytesIO
from io import BytesIO, UnsupportedOperation
from pathlib import Path
from typing import (
Any,
Expand Down Expand Up @@ -360,7 +360,7 @@ def pdf_header(self) -> str:
# but that needs a deprecation
loc = self.stream.tell()
self.stream.seek(0, 0)
pdf_file_version = self.stream.read(8).decode("utf-8")
pdf_file_version = self.stream.read(8).decode("utf-8", "backslashreplace")
self.stream.seek(loc, 0) # return to where it was
return pdf_file_version

Expand Down Expand Up @@ -1541,19 +1541,22 @@ def read(self, stream: StreamType) -> None:

def _basic_validation(self, stream: StreamType) -> None:
"""Ensure file is not empty. Read at most 5 bytes."""
# start at the end:
stream.seek(0, os.SEEK_END)
if not stream.tell():
raise EmptyFileError("Cannot read an empty file")
if self.strict:
stream.seek(0, os.SEEK_SET)
stream.seek(0, os.SEEK_SET)
try:
header_byte = stream.read(5)
if header_byte != b"%PDF-":
except UnicodeDecodeError:
raise UnsupportedOperation("cannot read header")
if header_byte == b"":
raise EmptyFileError("Cannot read an empty file")
elif header_byte != b"%PDF-":
if self.strict:
raise PdfReadError(
f"PDF starts with '{header_byte.decode('utf8')}', "
"but '%PDF-' expected"
)
stream.seek(0, os.SEEK_END)
else:
logger_warning(f"invalid pdf header: {header_byte}", __name__)
stream.seek(0, os.SEEK_END)

def _find_eof_marker(self, stream: StreamType) -> None:
"""
Expand All @@ -1567,7 +1570,10 @@ def _find_eof_marker(self, stream: StreamType) -> None:
line = b""
while line[:5] != b"%%EOF":
if stream.tell() < HEADER_SIZE:
raise PdfReadError("EOF marker not found")
if self.strict:
raise PdfReadError("EOF marker not found")
else:
logger_warning("EOF marker not found", __name__)
line = read_previous_line(stream)

def _find_startxref_pos(self, stream: StreamType) -> int:
Expand Down
42 changes: 41 additions & 1 deletion tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,10 +460,16 @@ def test_read_empty():
assert exc.value.args[0] == "Cannot read an empty file"


def test_read_malformed_header():
def test_read_malformed_header(caplog):
with pytest.raises(PdfReadError) as exc:
PdfReader(io.BytesIO(b"foo"), strict=True)
assert exc.value.args[0] == "PDF starts with 'foo', but '%PDF-' expected"
caplog.clear()
try:
PdfReader(io.BytesIO(b"foo"), strict=False)
except Exception:
pass
assert caplog.messages[0].startswith("invalid pdf header")


def test_read_malformed_body():
Expand Down Expand Up @@ -1352,3 +1358,37 @@ def test_iss1710():
name = "irbookonlinereading.pdf"
in_pdf = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
in_pdf.outline


def test_broken_file_header():
    """Smoke-test PdfReader on a file whose header contains a non-UTF-8 byte.

    The document is built from a bytes template: a header line holding the
    byte 0xA0, five small objects, an xref table, a trailer and an EOF
    marker. The test makes no assertion; it only requires that constructing
    the reader does not raise.
    """
    template = (
        b"%%PDF-\xa0sd\n"
        b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n"
        b"2 0 obj << >> endobj\n"
        b"3 0 obj << >> endobj\n"
        b"4 0 obj << /Contents 3 0 R /CropBox [0.0 0.0 2550.0 3508.0]"
        b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R"
        b" /Resources << /Font << >> >>"
        b" /Rotate 0 /Type /Page >> endobj\n"
        b"5 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n"
        b"xref 1 5\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"trailer << %s/Root 5 0 R /Size 6 >>\n"
        b"startxref %d\n"
        b"%%%%EOF"
    )
    include_prev_0 = True

    # NOTE(review): every offset is measured on the template *before* the
    # %-substitution below, so it may not equal the position in the final
    # bytes -- presumably acceptable for a broken-file test; confirm.
    object_markers = (
        b"1 0 obj",
        b"2 0 obj",
        b"3 0 obj",
        b"4 0 obj",
        b"5 0 obj",
    )
    object_offsets = [template.find(marker) for marker in object_markers]
    prev_entry = b"/Prev 0 " if include_prev_0 else b""
    startxref_value = template.find(b"xref") - 1

    document = template % (*object_offsets, prev_entry, startxref_value)
    PdfReader(io.BytesIO(document))

0 comments on commit 8146729

Please sign in to comment.