Skip to content

Commit

Permalink
Merge branch 'main' into fix355
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinThoma committed Apr 17, 2022
2 parents 2ad7ad4 + 9cd16d0 commit 0f2a3bf
Show file tree
Hide file tree
Showing 9 changed files with 544 additions and 30 deletions.
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ coverage.xml
docs/_build/

# Files generated by some of the scripts
dont_commit_merged.pdf
dont_commit_writer.pdf
dont_commit_*.pdf
PyPDF2-output.pdf
Image9.png
PyPDF2_pdfLocation.txt
3 changes: 3 additions & 0 deletions PyPDF2/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ class Core:
"""Keywords that don't quite belong anywhere else"""

OUTLINES = "/Outlines"
PAGE = "/Page"
PAGES = "/Pages"
CATALOG = "/Catalog"


class TrailerKeys:
Expand Down
30 changes: 20 additions & 10 deletions PyPDF2/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,8 @@ def __init__(self):
# root object
root = DictionaryObject()
root.update({
NameObject(PA.TYPE): NameObject("/Catalog"),
NameObject("/Pages"): self._pages,
NameObject(PA.TYPE): NameObject(CO.CATALOG),
NameObject(CO.PAGES): self._pages,
})
self._root = None
self._root_object = root
Expand All @@ -130,8 +130,8 @@ def getObject(self, ido):
return self._objects[ido.idnum - 1]

def _addPage(self, page, action):
assert page[PA.TYPE] == "/Page"
page[NameObject("/Parent")] = self._pages
assert page[PA.TYPE] == CO.PAGE
page[NameObject(PA.PARENT)] = self._pages
page = self._addObject(page)
pages = self.getObject(self._pages)
action(pages[PA.KIDS], page)
Expand Down Expand Up @@ -730,7 +730,17 @@ def addBookmarkDict(self, bookmark, parent=None):

return bookmarkRef

def addBookmark(self, title, pagenum, parent=None, color=None, bold=False, italic=False, fit='/Fit', *args):
def addBookmark(
self,
title,
pagenum,
parent=None,
color=None,
bold=False,
italic=False,
fit='/Fit',
*args
):
"""
Add a bookmark to this PDF file.
Expand Down Expand Up @@ -1315,7 +1325,7 @@ def getFields(self, tree = None, retval = None, fileobj = None):
default, the mapping name is used for keys.
:rtype: dict, or ``None`` if form data could not be located.
"""
fieldAttributes = {"/FT" : "Field Type", "/Parent" : "Parent",
fieldAttributes = {"/FT" : "Field Type", PA.PARENT : "Parent",
"/T" : "Field Name", "/TU" : "Alternate Field Name",
"/TM" : "Mapping Name", "/Ff" : "Field Flags",
"/V" : "Value", "/DV" : "Default Value"}
Expand Down Expand Up @@ -1367,7 +1377,7 @@ def _checkKids(self, tree, retval, fileobj):
self.getFields(kid.getObject(), retval, fileobj)

def _writeField(self, fileobj, field, fieldAttributes):
order = ["/TM", "/T", "/FT", "/Parent", "/TU", "/Ff", "/V", "/DV"]
order = ["/TM", "/T", "/FT", PA.PARENT, "/TU", "/Ff", "/V", "/DV"]
for attr in order:
attrName = fieldAttributes[attr]
try:
Expand All @@ -1377,12 +1387,12 @@ def _writeField(self, fileobj, field, fieldAttributes):
"/Sig":"Signature"}
if field[attr] in types:
fileobj.write(attrName + ": " + types[field[attr]] + "\n")
elif attr == "/Parent":
elif attr == PA.PARENT:
# Let's just write the name of the parent
try:
name = field["/Parent"]["/TM"]
name = field[PA.PARENT]["/TM"]
except KeyError:
name = field["/Parent"]["/T"]
name = field[PA.PARENT]["/T"]
fileobj.write(attrName + ": " + name + "\n")
else:
fileobj.write(attrName + ": " + str(field[attr]) + "\n")
Expand Down
8 changes: 8 additions & 0 deletions Tests/test_pagerange.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@ def test_str(page_range, expected):
assert str(PageRange(page_range)) == expected


@pytest.mark.parametrize(
    "page_range,expected",
    [
        (slice(0, 5), "PageRange('0:5')"),
        (slice(0, 5, 2), "PageRange('0:5:2')"),
    ],
)
def test_repr(page_range, expected):
    """repr() of a PageRange built from a slice matches the expected text."""
    rendered = repr(PageRange(page_range))
    assert rendered == expected


def test_equality_other_objectc():
pr1 = PageRange(slice(0, 5))
pr2 = "PageRange(slice(0, 5))"
Expand Down
226 changes: 218 additions & 8 deletions Tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,16 @@ def test_read_metadata(pdf_path, expected):
docinfo = reader.getDocumentInfo()
metadict = dict(docinfo)
assert metadict == expected
docinfo.title
docinfo.title_raw
docinfo.author
docinfo.author_raw
docinfo.creator
docinfo.creator_raw
docinfo.producer
docinfo.producer_raw
docinfo.subject
docinfo.subject_raw
if "/Title" in metadict:
assert metadict["/Title"] == docinfo.title

Expand Down Expand Up @@ -196,7 +206,10 @@ def test_get_images_raw(strict, with_prev_0, should_fail):
if should_fail:
with pytest.raises(PdfReadError) as exc:
PdfFileReader(pdf_stream, strict=strict)
assert exc.value.args[0] == "/Prev=0 in the trailer (try opening with strict=False)"
assert (
exc.value.args[0]
== "/Prev=0 in the trailer (try opening with strict=False)"
)
else:
PdfFileReader(pdf_stream, strict=strict)

Expand Down Expand Up @@ -230,16 +243,213 @@ def test_get_page_of_encrypted_file():


@pytest.mark.parametrize(
    "src,expected,expected_method",
    [
        (
            "form.pdf",
            {"foo": ""},
            {"foo": {"/DV": "", "/FT": "/Tx", "/T": "foo", "/V": ""}},
        ),
        (
            "form_acrobatReader.pdf",
            {"foo": "Bar"},
            {"foo": {"/DV": "", "/FT": "/Tx", "/T": "foo", "/V": "Bar"}},
        ),
        (
            "form_evince.pdf",
            {"foo": "bar"},
            {"foo": {"/DV": "", "/FT": "/Tx", "/T": "foo", "/V": "bar"}},
        ),
    ],
)
def test_get_form(src, expected, expected_method):
    """Check if we can read out form data.

    :param src: file name of the PDF, relative to ``RESOURCE_ROOT``.
    :param expected: mapping expected from ``getFormTextFields()``.
    :param expected_method: mapping expected from ``getFields()``.
    """
    src = os.path.join(RESOURCE_ROOT, src)
    reader = PdfFileReader(src)

    # Text-field view of the form.
    fields = reader.getFormTextFields()
    assert fields == expected

    # Full per-field attribute view of the same form.
    fields = reader.getFields()
    assert fields == expected_method


@pytest.mark.parametrize(
    "src,page_nb",
    [
        ("form.pdf", 0),
        ("pdflatex-outline.pdf", 2),
    ],
)
def test_get_page_number(src, page_nb):
    """getPageNumber() gives back the index the page was fetched with."""
    reader = PdfFileReader(os.path.join(RESOURCE_ROOT, src))
    selected_page = reader.pages[page_nb]
    found_index = reader.getPageNumber(selected_page)
    assert found_index == page_nb


@pytest.mark.parametrize(
    "src,expected",
    [
        ("form.pdf", None),
    ],
)
def test_get_page_layout(src, expected):
    """getPageLayout() returns the expected value for each sample file."""
    pdf_path = os.path.join(RESOURCE_ROOT, src)
    layout = PdfFileReader(pdf_path).getPageLayout()
    assert layout == expected


@pytest.mark.parametrize(
    "src,expected",
    [
        ("form.pdf", "/UseNone"),
        ("crazyones.pdf", None),
    ],
)
def test_get_page_mode(src, expected):
    """getPageMode() returns the expected value for each sample file."""
    pdf_path = os.path.join(RESOURCE_ROOT, src)
    mode = PdfFileReader(pdf_path).getPageMode()
    assert mode == expected


def test_read_empty():
    """Constructing a reader on an empty stream raises PdfReadError."""
    empty_stream = io.BytesIO()
    with pytest.raises(PdfReadError) as exc:
        PdfFileReader(empty_stream)
    message = exc.value.args[0]
    assert message == "Cannot read an empty file"


def test_read_malformed():
    """Constructing a reader on non-PDF bytes raises PdfReadError."""
    garbage_stream = io.BytesIO(b"foo")
    with pytest.raises(PdfReadError) as exc:
        PdfFileReader(garbage_stream)
    message = exc.value.args[0]
    assert message == "Could not read malformed PDF file"


def test_read_prev_0_trailer():
    """Strict reading of a PDF whose trailer has ``/Prev 0`` raises PdfReadError."""
    # Hand-written one-page PDF used as a %-format template: five xref
    # offsets, a trailer prefix and the startxref offset get filled in below.
    template = (
        b"%%PDF-1.7\n"
        b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n"
        b"2 0 obj << >> endobj\n"
        b"3 0 obj << >> endobj\n"
        b"4 0 obj << /Contents 3 0 R /CropBox [0.0 0.0 2550.0 3508.0]"
        b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R"
        b" /Resources << /Font << >> >>"
        b" /Rotate 0 /Type /Page >> endobj\n"
        b"5 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n"
        b"xref 1 5\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"trailer << %s/Root 5 0 R /Size 6 >>\n"
        b"startxref %d\n"
        b"%%%%EOF"
    )
    object_markers = (
        b"1 0 obj",
        b"2 0 obj",
        b"3 0 obj",
        b"4 0 obj",
        b"5 0 obj",
    )
    # Offsets are looked up in the template itself, before substitution.
    fill_values = [template.find(marker) for marker in object_markers]
    fill_values.append(b"/Prev 0 ")  # the trailer entry under test
    fill_values.append(template.find(b"xref"))
    pdf_stream = io.BytesIO(template % tuple(fill_values))

    with pytest.raises(PdfReadError) as exc:
        PdfFileReader(pdf_stream, strict=True)
    assert (
        exc.value.args[0]
        == "/Prev=0 in the trailer (try opening with strict=False)"
    )


def test_read_missing_startxref():
    """Strict reading of a PDF without a ``startxref`` line raises PdfReadError."""
    # Hand-written one-page PDF used as a %-format template. The
    # "startxref %d" line is deliberately left out between the trailer and
    # the EOF marker, so only the five xref offsets are substituted.
    template = (
        b"%%PDF-1.7\n"
        b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n"
        b"2 0 obj << >> endobj\n"
        b"3 0 obj << >> endobj\n"
        b"4 0 obj << /Contents 3 0 R /CropBox [0.0 0.0 2550.0 3508.0]"
        b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R"
        b" /Resources << /Font << >> >>"
        b" /Rotate 0 /Type /Page >> endobj\n"
        b"5 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n"
        b"xref 1 5\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"trailer << /Root 5 0 R /Size 6 >>\n"
        b"%%%%EOF"
    )
    object_markers = (
        b"1 0 obj",
        b"2 0 obj",
        b"3 0 obj",
        b"4 0 obj",
        b"5 0 obj",
    )
    # Offsets are looked up in the template itself, before substitution.
    offsets = tuple(template.find(marker) for marker in object_markers)
    pdf_stream = io.BytesIO(template % offsets)

    with pytest.raises(PdfReadError) as exc:
        PdfFileReader(pdf_stream, strict=True)
    message = exc.value.args[0]
    assert message == "startxref not found"


def test_read_unknown_zero_pages():
    """A catalog whose /Pages points at ``0 0 R`` fails in both reader modes.

    With ``strict=True`` a PdfReadError is expected; with ``strict=False``
    counting pages is expected to raise AttributeError.
    """
    # Hand-written PDF used as a %-format template; the offsets are
    # substituted below.
    template = (
        b"%%PDF-1.7\n"
        b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n"
        b"2 0 obj << >> endobj\n"
        b"3 0 obj << >> endobj\n"
        b"4 0 obj << /Contents 3 0 R /CropBox [0.0 0.0 2550.0 3508.0]"
        b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R"
        b" /Resources << /Font << >> >>"
        b" /Rotate 0 /Type /Page >> endobj\n"
        # The "/Pages 0 0 R" reference is what this test is about:
        b"5 0 obj << /Pages 0 0 R /Type /Catalog >> endobj\n"
        b"xref 1 5\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"trailer << /Root 5 1 R /Size 6 >>\n"
        b"startxref %d\n"
        b"%%%%EOF"
    )
    search_markers = (
        b"1 0 obj",
        b"2 0 obj",
        b"3 0 obj",
        b"4 0 obj",
        b"5 0 obj",
        b"xref",
    )
    # Offsets are looked up in the template itself, before substitution.
    offsets = tuple(template.find(marker) for marker in search_markers)
    pdf_stream = io.BytesIO(template % offsets)

    # Strict mode: constructing the reader and counting pages both sit
    # inside the raises-block.
    with pytest.raises(PdfReadError) as exc:
        reader = PdfFileReader(pdf_stream, strict=True)
        reader.getNumPages()
    strict_message = exc.value.args[0]
    assert strict_message == "Could not find object."

    # Non-strict mode on the same stream: the reader is built outside the
    # raises-block, only the page count is expected to fail.
    reader = PdfFileReader(pdf_stream, strict=False)
    with pytest.raises(AttributeError) as exc:
        reader.getNumPages()
    lenient_message = exc.value.args[0]
    assert lenient_message == "'NoneType' object has no attribute 'getObject'"


def test_read_encrypted_without_decryption():
    """Counting pages of the password-protected sample without decrypting raises."""
    encrypted_path = os.path.join(RESOURCE_ROOT, "libreoffice-writer-password.pdf")
    reader = PdfFileReader(encrypted_path)
    with pytest.raises(PdfReadError) as exc:
        reader.getNumPages()
    message = exc.value.args[0]
    assert message == "File has not been decrypted"


def test_get_destination_age_number():
    """Smoke test: getDestinationPageNumber() runs for each top-level outline.

    No return value is checked; the test only requires that no call raises.
    """
    # NOTE(review): "age" in the test name is presumably a typo for "page".
    reader = PdfFileReader(os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf"))
    for entry in reader.getOutlines():
        if isinstance(entry, list):
            # Nested lists of outline entries are skipped.
            continue
        reader.getDestinationPageNumber(entry)

0 comments on commit 0f2a3bf

Please sign in to comment.