Skip to content

Commit

Permalink
ROB: Fix image extraction issue with superfluous whitespaces (#1327)
Browse files Browse the repository at this point in the history
Fix reading of some inline images when other operations are inserted between EI and Q.
The end of an inline image is now detected as [whitespace]EI[whitespace] (4 characters should be sufficient).

Fixes #1090
  • Loading branch information
pubpub-zz committed Sep 6, 2022
1 parent 4073b2a commit 5049c1e
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 13 deletions.
8 changes: 4 additions & 4 deletions PyPDF2/generic/_data_structures.py
Expand Up @@ -759,17 +759,17 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]:
tok = stream.read(1)
# Check for End Image
tok2 = stream.read(1)
if tok2 == b"I":
# Data can contain EI, so check for the Q operator.
if tok2 == b"I" and buf[loc - 1 : loc] in WHITESPACES:
# Data can contain [\s]EI, so check for the separator \s; 4 chars are sufficient, the Q operator is not required.
tok3 = stream.read(1)
info = tok + tok2
# We need to find whitespace between EI and Q.
# We need to find at least one whitespace after.
has_q_whitespace = False
while tok3 in WHITESPACES:
has_q_whitespace = True
info += tok3
tok3 = stream.read(1)
if tok3 == b"Q" and has_q_whitespace:
if has_q_whitespace:
stream.seek(-1, 1)
break
else:
Expand Down
15 changes: 6 additions & 9 deletions tests/test_workflows.py
Expand Up @@ -17,7 +17,7 @@
from PyPDF2.constants import ImageAttributes as IA
from PyPDF2.constants import PageAttributes as PG
from PyPDF2.constants import Ressources as RES
from PyPDF2.errors import PdfReadError, PdfReadWarning
from PyPDF2.errors import PdfReadWarning
from PyPDF2.filters import _xobj_to_image

from . import get_pdf_from_url, normalize_warnings
Expand Down Expand Up @@ -425,7 +425,7 @@ def test_get_metadata(url, name):
"https://corpora.tika.apache.org/base/docs/govdocs1/938/938702.pdf",
"tika-938702.pdf",
False,
(PdfReadError, "Unexpected end of stream"),
None, # iss #1090 is now fixed
),
(
"https://corpora.tika.apache.org/base/docs/govdocs1/942/942358.pdf",
Expand Down Expand Up @@ -512,19 +512,16 @@ def test_extract_text(url, name, strict, exception):
),
(
"https://corpora.tika.apache.org/base/docs/govdocs1/957/957304.pdf",
"tika-938702.pdf",
"tika-957304.pdf",
),
],
)
def test_compress_raised(url, name):
data = BytesIO(get_pdf_from_url(url, name=name))
reader = PdfReader(data)
# TODO: which page exactly?
# TODO: Is it reasonable to have an exception here?
with pytest.raises(PdfReadError) as exc:
for page in reader.pages:
page.compress_content_streams()
assert exc.value.args[0] == "Unexpected end of stream"
# no more error since iss #1090 fix
for page in reader.pages:
page.compress_content_streams()


@pytest.mark.parametrize(
Expand Down

0 comments on commit 5049c1e

Please sign in to comment.