TST: Error cases (#773)

py-pdf · Apr 17, 2022 · 9cd16d0 · 9cd16d0
1 parent 9111336
commit 9cd16d0
Show file tree

Hide file tree

Showing 4 changed files with 212 additions and 3 deletions.
diff --git a/Tests/test_reader.py b/Tests/test_reader.py
@@ -54,6 +54,16 @@ def test_read_metadata(pdf_path, expected):
         docinfo = reader.getDocumentInfo()
         metadict = dict(docinfo)
         assert metadict == expected
+        docinfo.title
+        docinfo.title_raw
+        docinfo.author
+        docinfo.author_raw
+        docinfo.creator
+        docinfo.creator_raw
+        docinfo.producer
+        docinfo.producer_raw
+        docinfo.subject
+        docinfo.subject_raw
         if "/Title" in metadict:
             assert metadict["/Title"] == docinfo.title
 
@@ -293,9 +303,153 @@ def test_get_page_layout(src, expected):
     "src,expected",
     [
         ("form.pdf", "/UseNone"),
+        ("crazyones.pdf", None),
     ],
 )
 def test_get_page_mode(src, expected):
     src = os.path.join(RESOURCE_ROOT, src)
     reader = PdfFileReader(src)
     assert reader.getPageMode() == expected
+
+
+def test_read_empty():
+    """Constructing a reader from an empty byte stream raises PdfReadError."""
+    with pytest.raises(PdfReadError) as exc:
+        PdfFileReader(io.BytesIO())
+    assert exc.value.args[0] == "Cannot read an empty file"
+
+
+def test_read_malformed():
+    """Constructing a reader from the non-PDF bytes b"foo" raises PdfReadError."""
+    with pytest.raises(PdfReadError) as exc:
+        PdfFileReader(io.BytesIO(b"foo"))
+    assert exc.value.args[0] == "Could not read malformed PDF file"
+
+
+def test_read_prev_0_trailer():
+    """Strict reading of a hand-built PDF whose trailer contains /Prev 0 fails.
+
+    Expects PdfReadError carrying the "/Prev=0 in the trailer" message.
+    """
+    # %-format template: "%%" becomes a literal "%" once formatted, the
+    # "%010d" slots receive the xref offsets, "%s" the optional /Prev entry
+    # and "%d" the startxref value.
+    pdf_data = (
+        b"%%PDF-1.7\n"
+        b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n"
+        b"2 0 obj << >> endobj\n"
+        b"3 0 obj << >> endobj\n"
+        b"4 0 obj << /Contents 3 0 R /CropBox [0.0 0.0 2550.0 3508.0]"
+        b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R"
+        b" /Resources << /Font << >> >>"
+        b" /Rotate 0 /Type /Page >> endobj\n"
+        b"5 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n"
+        b"xref 1 5\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"trailer << %s/Root 5 0 R /Size 6 >>\n"
+        b"startxref %d\n"
+        b"%%%%EOF"
+    )
+    with_prev_0 = True
+    # The find() calls run on the unformatted template, before pdf_data is
+    # rebound to the formatted bytes.
+    # NOTE(review): "%%PDF" shrinks by one byte when formatted, so these
+    # offsets may be off by one in the final data -- confirm this is intended.
+    pdf_data = pdf_data % (
+        pdf_data.find(b"1 0 obj"),
+        pdf_data.find(b"2 0 obj"),
+        pdf_data.find(b"3 0 obj"),
+        pdf_data.find(b"4 0 obj"),
+        pdf_data.find(b"5 0 obj"),
+        b"/Prev 0 " if with_prev_0 else b"",
+        pdf_data.find(b"xref"),
+    )
+    pdf_stream = io.BytesIO(pdf_data)
+    with pytest.raises(PdfReadError) as exc:
+        PdfFileReader(pdf_stream, strict=True)
+    assert exc.value.args[0] == "/Prev=0 in the trailer (try opening with strict=False)"
+
+
+def test_read_missing_startxref():
+    """Strict reading of a hand-built PDF that lacks a startxref line fails.
+
+    Expects PdfReadError with the message "startxref not found".
+    """
+    # %-format template: "%%" becomes a literal "%" once formatted and the
+    # "%010d" slots receive the xref offsets.
+    pdf_data = (
+        b"%%PDF-1.7\n"
+        b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n"
+        b"2 0 obj << >> endobj\n"
+        b"3 0 obj << >> endobj\n"
+        b"4 0 obj << /Contents 3 0 R /CropBox [0.0 0.0 2550.0 3508.0]"
+        b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R"
+        b" /Resources << /Font << >> >>"
+        b" /Rotate 0 /Type /Page >> endobj\n"
+        b"5 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n"
+        b"xref 1 5\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"trailer << /Root 5 0 R /Size 6 >>\n"
+        # The startxref line is deliberately left out; that is the error case.
+        # b"startxref %d\n"
+        b"%%%%EOF"
+    )
+    # Only five offsets are substituted because the template has no
+    # startxref slot.
+    pdf_data = pdf_data % (
+        pdf_data.find(b"1 0 obj"),
+        pdf_data.find(b"2 0 obj"),
+        pdf_data.find(b"3 0 obj"),
+        pdf_data.find(b"4 0 obj"),
+        pdf_data.find(b"5 0 obj"),
+        # pdf_data.find(b"xref"),
+    )
+    pdf_stream = io.BytesIO(pdf_data)
+    with pytest.raises(PdfReadError) as exc:
+        PdfFileReader(pdf_stream, strict=True)
+    assert exc.value.args[0] == "startxref not found"
+
+
+def test_read_unknown_zero_pages():
+    """A catalog whose /Pages entry is "0 0 R" fails in both reader modes.
+
+    With strict=True a PdfReadError ("Could not find object.") is expected;
+    with strict=False getNumPages() is expected to raise AttributeError.
+    """
+    # %-format template: "%%" becomes a literal "%" once formatted, the
+    # "%010d" slots receive the xref offsets and "%d" the startxref value.
+    pdf_data = (
+        b"%%PDF-1.7\n"
+        b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n"
+        b"2 0 obj << >> endobj\n"
+        b"3 0 obj << >> endobj\n"
+        b"4 0 obj << /Contents 3 0 R /CropBox [0.0 0.0 2550.0 3508.0]"
+        b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R"
+        b" /Resources << /Font << >> >>"
+        b" /Rotate 0 /Type /Page >> endobj\n"
+        # Pages 0 0 is the key point:
+        b"5 0 obj << /Pages 0 0 R /Type /Catalog >> endobj\n"
+        b"xref 1 5\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"trailer << /Root 5 1 R /Size 6 >>\n"
+        b"startxref %d\n"
+        b"%%%%EOF"
+    )
+    # The find() calls run on the unformatted template, before pdf_data is
+    # rebound to the formatted bytes.
+    pdf_data = pdf_data % (
+        pdf_data.find(b"1 0 obj"),
+        pdf_data.find(b"2 0 obj"),
+        pdf_data.find(b"3 0 obj"),
+        pdf_data.find(b"4 0 obj"),
+        pdf_data.find(b"5 0 obj"),
+        pdf_data.find(b"xref"),
+    )
+    pdf_stream = io.BytesIO(pdf_data)
+    # Either the constructor or getNumPages() may raise here; both are
+    # inside the raises block.
+    with pytest.raises(PdfReadError) as exc:
+        reader = PdfFileReader(pdf_stream, strict=True)
+        reader.getNumPages()
+
+    assert exc.value.args[0] == "Could not find object."
+    # The same stream object is reused for the non-strict reader.
+    reader = PdfFileReader(pdf_stream, strict=False)
+    with pytest.raises(AttributeError) as exc:
+        reader.getNumPages()
+    assert exc.value.args[0] == "'NoneType' object has no attribute 'getObject'"
+
+
+def test_read_encrypted_without_decryption():
+    """getNumPages() on the password sample without decrypting raises PdfReadError.
+
+    The resource name suggests a password-protected LibreOffice Writer export;
+    no decrypt call is made before asking for the page count.
+    """
+    src = os.path.join(RESOURCE_ROOT, "libreoffice-writer-password.pdf")
+    reader = PdfFileReader(src)
+    with pytest.raises(PdfReadError) as exc:
+        reader.getNumPages()
+    assert exc.value.args[0] == "File has not been decrypted"
+
+
+def test_get_destination_age_number():
+    """Call getDestinationPageNumber() on every non-list outline entry.
+
+    Smoke test only: there are no assertions, so it passes as long as no
+    call raises.
+
+    NOTE(review): "age" in the test name looks like a typo for "page".
+    """
+    src = os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf")
+    reader = PdfFileReader(src)
+    outlines = reader.getOutlines()
+    for outline in outlines:
+        # List entries are skipped -- presumably nested outline levels;
+        # verify against getOutlines().
+        if not isinstance(outline, list):
+            reader.getDestinationPageNumber(outline)
diff --git a/Tests/test_writer.py b/Tests/test_writer.py
@@ -11,6 +11,16 @@
 RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "Resources")
 
 
+def test_writer_clone():
+    """Cloning pdflatex-outline.pdf into a fresh writer yields four pages."""
+    src = os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf")
+
+    reader = PdfFileReader(src)
+    writer = PdfFileWriter()
+
+    writer.cloneDocumentFromReader(reader)
+    assert writer.getNumPages() == 4
+
+
 def test_writer_operations():
     """
     This test just checks if the operation throws an exception.
@@ -33,7 +43,6 @@ def test_writer_operations():
     writer.insertPage(reader_outline.pages[0], 0)
     writer.addBookmarkDestination(page)
     writer.removeLinks()
-    # assert output.getNamedDestRoot() == ['A named destination', IndirectObject(9, 0, output)]
     writer.addBlankPage()
     writer.addURI(2, "https://example.com", RectangleObject([0, 0, 100, 100]))
     writer.addLink(2, 1, RectangleObject([0, 0, 100, 100]))
@@ -213,7 +222,11 @@ def test_add_named_destination():
 
     from PyPDF2.pdf import NameObject
 
-    writer.addNamedDestination(NameObject("A bookmark"), 2)
+    writer.addNamedDestination(NameObject("A named dest"), 2)
+
+    from PyPDF2.pdf import IndirectObject
+
+    assert writer.getNamedDestRoot() == ["A named dest", IndirectObject(7, 0, writer)]
 
     # write "output" to PyPDF2-output.pdf
     tmp_filename = "dont_commit_named_destination.pdf"
@@ -307,4 +320,23 @@ def test_add_link():
         writer.write(output_stream)
 
     # Cleanup
-    # os.remove(tmp_filename)
+    os.remove(tmp_filename)
+
+
+def test_io_streams():
+    """This is the example from the docs ("Streaming data").
+
+    Reads a PDF from an in-memory BytesIO and writes a writer to another one.
+    """
+    # Arrange
+    from io import BytesIO
+
+    filepath = os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf")
+    with open(filepath, "rb") as fh:
+        bytes_stream = BytesIO(fh.read())
+
+    # Read from bytes stream
+    reader = PdfFileReader(bytes_stream)
+    assert reader.getNumPages() == 4
+
+    # Write to bytes stream
+    # No pages are added to the writer first; this only exercises write()
+    # against a BytesIO target and asserts nothing about the output.
+    writer = PdfFileWriter()
+    with BytesIO() as output_stream:
+        writer.write(output_stream)
diff --git a/docs/index.rst b/docs/index.rst
@@ -28,6 +28,7 @@ You can contribute to `PyPDF2 on Github <https://github.com/py-pdf/PyPDF2>`_.
    user/reading-pdf-annotations
    user/adding-pdf-annotations
    user/forms
+   user/streaming-data
 
 
 .. toctree::

diff --git a/docs/user/streaming-data.md b/docs/user/streaming-data.md
@@ -0,0 +1,22 @@
+# Streaming Data with PyPDF2
+
+In some cases you might want to avoid saving things explicitly as a file
+to disk, e.g. when you want to store the PDF in a database or AWS S3.
+
+PyPDF2 can read from and write to file-like objects such as `io.BytesIO`;
+the example below shows how.
+
+```python
+from io import BytesIO
+
+# Prepare example
+with open("example.pdf", 'rb') as fh:
+    bytes_stream = BytesIO(fh.read())
+
+# Read from bytes_stream
+reader = PdfFileReader(bytes_stream)
+
+# Write to bytes_stream
+writer = PdfFileWriter()
+with BytesIO() as bytes_stream:
+    writer.write(bytes_stream)
+```