Skip to content

Commit

Permalink
STY: consider images inside page patterns.
Browse files Browse the repository at this point in the history
Added code to detect patterns in "_get_ids_image".
To avoid conflicts with images located directly in a page, or with images that use the same ID in different patterns, the IDs of images found under patterns are returned in this form:
"/Pattern/patternNameHere/imageNameHere"

Added code to deal with Pattern images in "_get_image".
  • Loading branch information
Nathanaël Renaud committed May 11, 2024
1 parent 32f826b commit eb6b294
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 26 deletions.
96 changes: 70 additions & 26 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,19 +457,35 @@ def _get_ids_image(
if ancest is None:
ancest = []
lst: List[Union[str, List[str]]] = []
if PG.RESOURCES not in obj or RES.XOBJECT not in cast(
DictionaryObject, obj[PG.RESOURCES]
):
return self.inline_images_keys

x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore
for o in x_object:
if not isinstance(x_object[o], StreamObject):
continue
if x_object[o][IA.SUBTYPE] == "/Image":
lst.append(o if len(ancest) == 0 else ancest + [o])
else: # is a form with possible images inside
lst.extend(self._get_ids_image(x_object[o], ancest + [o], call_stack))

if PG.RESOURCES in obj:
if RES.PATTERN in cast(DictionaryObject, obj[PG.RESOURCES]):
for patternName, pattern in obj[PG.RESOURCES][RES.PATTERN].items():
if PG.RESOURCES in pattern.get_object():
if RES.XOBJECT in cast(DictionaryObject, pattern[PG.RESOURCES]):
x_object = pattern[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore
for o in x_object:
if not isinstance(x_object[o], StreamObject):
continue
if x_object[o][IA.SUBTYPE] == "/Image":
lst.append(
f"{RES.PATTERN}{patternName}{o}"
if len(ancest) == 0
else ancest + [o]
)

if RES.XOBJECT in cast(DictionaryObject, obj[PG.RESOURCES]):
x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore
for o in x_object:
if not isinstance(x_object[o], StreamObject):
continue
if x_object[o][IA.SUBTYPE] == "/Image":
lst.append(o if len(ancest) == 0 else ancest + [o])
else: # is a form with possible images inside
lst.extend(
self._get_ids_image(x_object[o], ancest + [o], call_stack)
)

return lst + self.inline_images_keys

def _get_image(
Expand All @@ -484,9 +500,27 @@ def _get_image(
if isinstance(id, List) and len(id) == 1:
id = id[0]
try:
xobjs = cast(
DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT]
)
if isinstance(id, str) and id.find(RES.PATTERN) == 0:
pattern_name = id[len(RES.PATTERN) : id.find("/", len(RES.PATTERN) + 1)]
image_name = id[id.rfind("/") :]

patterns = cast(
DictionaryObject,
cast(DictionaryObject, obj[PG.RESOURCES])[RES.PATTERN],
)

xobjs = cast(
DictionaryObject,
cast(DictionaryObject, patterns[pattern_name][PG.RESOURCES])[
RES.XOBJECT
],
)

else:
xobjs = cast(
DictionaryObject,
cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT],
)
except KeyError:
if not (id[0] == "~" and id[-1] == "~"):
raise
Expand All @@ -497,16 +531,26 @@ def _get_image(
if self.inline_images is None: # pragma: no cover
raise KeyError("no inline image can be found")
return self.inline_images[id]

imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
extension, byte_stream = imgd[:2]
f = ImageFile(
name=f"{id[1:]}{extension}",
data=byte_stream,
image=imgd[2],
indirect_reference=xobjs[id].indirect_reference,
)
return f
elif id.find("/Pattern") == 0:
imgd = _xobj_to_image(cast(DictionaryObject, xobjs[image_name]))
extension, byte_stream = imgd[:2]
f = ImageFile(
name=f"{pattern_name[1:]}_{image_name[1:]}{extension}",
data=byte_stream,
image=imgd[2],
indirect_reference=xobjs[image_name].indirect_reference,
)
return f
else:
imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
extension, byte_stream = imgd[:2]
f = ImageFile(
name=f"{id[1:]}{extension}",
data=byte_stream,
image=imgd[2],
indirect_reference=xobjs[id].indirect_reference,
)
return f
else: # in a sub object
ids = id[1:]
return self._get_image(ids, cast(DictionaryObject, xobjs[id[0]]))
Expand Down
43 changes: 43 additions & 0 deletions tests/test_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,49 @@ def test_image_extraction(src, page_index, image_key, expected):
assert image_similarity(BytesIO(actual_image.data), expected) >= 0.99


@pytest.mark.parametrize(
    ("src", "page_index", "image_key", "expected"),
    [
        (
            SAMPLE_ROOT / "027-onlyoffice-image/Patterns.pdf",
            0,
            "/Pattern/P1/X1",
            SAMPLE_ROOT / "027-onlyoffice-image/P1_X1.jpg",
        ),
        (
            SAMPLE_ROOT / "027-onlyoffice-image/Patterns.pdf",
            0,
            "/Pattern/P2/X1",
            SAMPLE_ROOT / "027-onlyoffice-image/P2_X1.jpg",
        ),
        (
            SAMPLE_ROOT / "027-onlyoffice-image/Patterns.pdf",
            0,
            "/Pattern/P3/X1",
            SAMPLE_ROOT / "027-onlyoffice-image/P3_X1.jpg",
        ),
    ],
    ids=[
        "027-onlyoffice-image/P1_X1.jpg",
        "027-onlyoffice-image/P2_X1.jpg",
        "027-onlyoffice-image/P3_X1.jpg",
    ],
)
@pytest.mark.samples()
def test_patterns_image_extraction(src, page_index, image_key, expected):
    """Check that images stored under page patterns are listed and extractable.

    Each parametrized case opens the sample PDF, verifies the string form of
    the page's image collection, then fetches one image by its key and
    compares it against the reference JPEG.
    """
    reader = PdfReader(src)
    page = reader.pages[page_index]

    # Pattern images are keyed as "/Pattern/<pattern name>/<image name>".
    listed_images = str(page.images)
    assert (
        listed_images
        == "[Image_0=/Pattern/P1/X1, Image_1=/Pattern/P2/X1, Image_2=/Pattern/P3/X1]"
    )

    extracted = page.images[image_key]
    similarity = image_similarity(BytesIO(extracted.data), expected)
    assert similarity >= 0.99


@pytest.mark.enable_socket()
@pytest.mark.timeout(30)
def test_loop_in_image_keys():
Expand Down

0 comments on commit eb6b294

Please sign in to comment.