ROB: Fix image extraction issue with superfluous whitespaces (#1327)

Fix reading of some inline images when operations are inserted between EI and Q. The end of an inline image is now detected as [whitespace]EI[whitespace] (4 characters should be sufficient); a following Q operator is no longer required. Fixes #1090
py-pdf · Sep 6, 2022 · 5049c1e · 5049c1e
1 parent 4073b2a
commit 5049c1e
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 13 deletions.
diff --git a/PyPDF2/generic/_data_structures.py b/PyPDF2/generic/_data_structures.py
@@ -759,17 +759,17 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]:
                 tok = stream.read(1)
                 # Check for End Image
                 tok2 = stream.read(1)
-                if tok2 == b"I":
-                    # Data can contain EI, so check for the Q operator.
+                if tok2 == b"I" and buf[loc - 1 : loc] in WHITESPACES:
+                    # Data can contain [\s]EI, so check for the separator \s; 4 chars suffice, the Q operator is not required.
                     tok3 = stream.read(1)
                     info = tok + tok2
-                    # We need to find whitespace between EI and Q.
+                    # We need to find at least one whitespace after.
                     has_q_whitespace = False
                     while tok3 in WHITESPACES:
                         has_q_whitespace = True
                         info += tok3
                         tok3 = stream.read(1)
-                    if tok3 == b"Q" and has_q_whitespace:
+                    if has_q_whitespace:
                         stream.seek(-1, 1)
                         break
                     else:

diff --git a/tests/test_workflows.py b/tests/test_workflows.py
@@ -17,7 +17,7 @@
 from PyPDF2.constants import ImageAttributes as IA
 from PyPDF2.constants import PageAttributes as PG
 from PyPDF2.constants import Ressources as RES
-from PyPDF2.errors import PdfReadError, PdfReadWarning
+from PyPDF2.errors import PdfReadWarning
 from PyPDF2.filters import _xobj_to_image
 
 from . import get_pdf_from_url, normalize_warnings
@@ -425,7 +425,7 @@ def test_get_metadata(url, name):
             "https://corpora.tika.apache.org/base/docs/govdocs1/938/938702.pdf",
             "tika-938702.pdf",
             False,
-            (PdfReadError, "Unexpected end of stream"),
+            None,  # iss #1090 is now fixed
         ),
         (
             "https://corpora.tika.apache.org/base/docs/govdocs1/942/942358.pdf",
@@ -512,19 +512,16 @@ def test_extract_text(url, name, strict, exception):
         ),
         (
             "https://corpora.tika.apache.org/base/docs/govdocs1/957/957304.pdf",
-            "tika-938702.pdf",
+            "tika-957304.pdf",
         ),
     ],
 )
 def test_compress_raised(url, name):
     data = BytesIO(get_pdf_from_url(url, name=name))
     reader = PdfReader(data)
-    # TODO: which page exactly?
-    # TODO: Is it reasonable to have an exception here?
-    with pytest.raises(PdfReadError) as exc:
-        for page in reader.pages:
-            page.compress_content_streams()
-    assert exc.value.args[0] == "Unexpected end of stream"
+    # no more error since iss #1090 fix
+    for page in reader.pages:
+        page.compress_content_streams()
 
 
 @pytest.mark.parametrize(