ENH: Accept inline images with space before EI (#1552)

Closes #1541
py-pdf · Jan 16, 2023 · df90053 · df90053
1 parent f0c0a1d
commit df90053
Show file tree

Hide file tree

Showing 2 changed files with 69 additions and 17 deletions.
diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
@@ -1016,7 +1016,9 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]:
             # We have reached the end of the stream, but haven't found the EI operator.
             if not buf:
                 raise PdfReadError("Unexpected end of stream")
-            loc = buf.find(b"E")
+            loc = buf.find(
+                b"E"
+            )  # we cannot search for "EI" directly because it may not be fully loaded in the buffer
 
             if loc == -1:
                 data.write(buf)
@@ -1026,28 +1028,44 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]:
 
                 # Seek back in the stream to read the E next.
                 stream.seek(loc - len(buf), 1)
-                tok = stream.read(1)
+                tok = stream.read(1)  # E of "EI"
                 # Check for End Image
-                tok2 = stream.read(1)
-                if tok2 == b"I" and buf[loc - 1 : loc] in WHITESPACES:
-                    # Data can contain [\s]EI,  so check for the separator \s; 4 chars suffisent Q operator not required.
-                    tok3 = stream.read(1)
-                    info = tok + tok2
-                    # We need to find at least one whitespace after.
-                    has_q_whitespace = False
+                tok2 = stream.read(1)  # I of "EI"
+                if tok2 != b"I":
+                    stream.seek(-1, 1)
+                    data.write(tok)
+                    continue
+                # for further debug : print("!!!!",buf[loc-1:loc+10])
+                info = tok + tok2
+                tok3 = stream.read(
+                    1
+                )  # the possible space after "EI" may not have been loaded in buf
+                if tok3 not in WHITESPACES:
+                    stream.seek(-2, 1)  # to step back on I
+                    data.write(tok)
+                elif buf[loc - 1 : loc] in WHITESPACES:  # and tok3 in WHITESPACES:
+                    # Data can contain [\s]EI[\s]: 4 chars sufficient, checking Q operator not required.
+                    while tok3 in WHITESPACES:
+                        # NOTE: unclear whether "info += tok3" is needed here
+                        tok3 = stream.read(1)
+                    stream.seek(-1, 1)
+                    # we do not insert EI
+                    break
+                else:  # buf[loc - 1 : loc] not in WHITESPACES and tok3 in WHITESPACES:
+                    # Data can contain [!\s]EI[\s], so a following Q or EMC operator is required to get a match of at least 4 chars.
                     while tok3 in WHITESPACES:
-                        has_q_whitespace = True
                         info += tok3
                         tok3 = stream.read(1)
-                    if has_q_whitespace:
-                        stream.seek(-1, 1)
+                    stream.seek(-1, 1)
+                    if tok3 == b"Q":
                         break
+                    elif tok3 == b"E":
+                        ope = stream.read(3)
+                        stream.seek(-3, 1)
+                        if ope == b"EMC":
+                            break
                     else:
-                        stream.seek(-1, 1)
                         data.write(info)
-                else:
-                    stream.seek(-1, 1)
-                    data.write(tok)
         return {"settings": settings, "data": data.getvalue()}
 
     @property

diff --git a/tests/test_workflows.py b/tests/test_workflows.py
@@ -15,7 +15,8 @@
 
 from pypdf import PdfMerger, PdfReader, PdfWriter
 from pypdf.constants import PageAttributes as PG
-from pypdf.errors import PdfReadWarning
+from pypdf.errors import PdfReadError, PdfReadWarning
+from pypdf.generic import ContentStream, read_object
 
 from . import get_pdf_from_url, normalize_warnings
 
@@ -880,3 +881,36 @@ def test_tounicode_is_identity():
     data = BytesIO(get_pdf_from_url(url, name=name))
     reader = PdfReader(data, strict=False)
     reader.pages[0].extract_text()
+
+
+@pytest.mark.external
+def test_extra_test_iss1541():
+    url = "https://github.com/py-pdf/pypdf/files/10418158/tst_iss1541.pdf"
+    name = "tst_iss1541.pdf"
+    data = BytesIO(get_pdf_from_url(url, name=name))
+    reader = PdfReader(data, strict=False)
+    reader.pages[0].extract_text()
+
+    cs = ContentStream(reader.pages[0]["/Contents"], None, None)
+    cs.operations.insert(-1, ([], b"EMC"))
+    bu = BytesIO()
+    cs.write_to_stream(bu, None)
+    bu.seek(0)
+    ContentStream(read_object(bu, None, None), None, None).operations
+
+    cs = ContentStream(reader.pages[0]["/Contents"], None, None)
+    cs.operations.insert(-1, ([], b"E!C"))
+    bu = BytesIO()
+    cs.write_to_stream(bu, None)
+    bu.seek(0)
+    with pytest.raises(PdfReadError) as exc:
+        ContentStream(read_object(bu, None, None), None, None).operations
+    assert exc.value.args[0] == "Unexpected end of stream"
+
+    buf2 = BytesIO(data.getbuffer())
+    reader = PdfReader(
+        BytesIO(bytes(buf2.getbuffer()).replace(b"EI \n", b"E! \n")), strict=False
+    )
+    with pytest.raises(PdfReadError) as exc:
+        reader.pages[0].extract_text()
+    assert exc.value.args[0] == "Unexpected end of stream"