Skip to content

Commit

Permalink
MAINT: Change the positions of the calls of the visitor-function
Browse files Browse the repository at this point in the history
Previously, the text visitor function was called at each change of the output.
However, this can lead to wrong coordinates because the output may be sent after the text matrix has already been changed for the next text.
As an example, have a look at resources/Sample_Td-matrix.pdf: the text_matrix is computed correctly at the Td operations, but the text was sent after applying the next transformation.

In this pull request the texts are sent inside the TJ and Tj operations.
This may lead to sending letters instead of words:

```
    x=264.53, y=403.13, text='M'
    x=264.53, y=403.13, text='etad'
    x=264.53, y=403.13, text='ata'
    x=307.85, y=403.13, text=' '
```

Therefore there is a second commit which introduces a temporary visitor inside the processing of TJ.
The temporary visitor is used to collect the letters of a TJ operation, which are sent after the TJ has been processed.
When setting the temporary visitor, the original parameter is reassigned. I don't know if this is bad style in Python.
If it is bad style, a local variable current_text_visitor may be introduced instead.

See also issue #1377. I haven't checked if #1377 had the Td-matrix-problem or the one to be solved by this PR.

--

This PR is a copy of #1389.
PR #1389 was made a long time ago (before we renamed to pypdf),
but it still seems valuable.

This PR migrated the changes to the new codebase. Full credit
to rogmann for all of the changes.

Co-authored-by: rogmann <github@rogmann.org>
  • Loading branch information
MartinThoma and srogmann committed Dec 24, 2023
1 parent 3ab1581 commit 387ea44
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 38 deletions.
66 changes: 46 additions & 20 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1873,6 +1873,7 @@ def _extract_text(
visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
group_TJ: bool = True,
) -> str:
"""
See extract_text for most arguments.
Expand Down Expand Up @@ -1957,16 +1958,12 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
if operator == b"BT":
tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
output += text
if visitor_text is not None:
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
return None
elif operator == b"ET":
output += text
if visitor_text is not None:
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
Expand Down Expand Up @@ -1999,8 +1996,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
elif operator == b"cm":
output += text
if visitor_text is not None:
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
cm_matrix = mult(
[
Expand All @@ -2025,8 +2020,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
elif operator == b"Tf":
if text != "":
output += text # .translate(cmap)
if visitor_text is not None:
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
Expand Down Expand Up @@ -2132,6 +2125,34 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
process_operation(b"TL", [-operands[1]])
process_operation(b"Td", operands)
elif operator == b"TJ":
if visitor_text is not None and group_TJ:
# To prevent sending letters instead of words we
# override the visitor temporarily.
visitor_text_before = visitor_text
tm_matrix_before = [
tm_matrix[0],
tm_matrix[1],
tm_matrix[2],
tm_matrix[3],
tm_matrix[4],
tm_matrix[5],
]
text_TJ: List[str] = []

def visitor_text(
text: str,
cm_matrix: Any,
tm_matrix: Any,
font_dict: Any,
font_size: Any,
) -> None:
# TODO cases where the current inserting order is kept
if rtl_dir:
# right-to-left
text_TJ.insert(0, text) # noqa
else:
text_TJ.append(text) # noqa

for op in operands[0]:
if isinstance(op, (str, bytes)):
process_operation(b"Tj", [op])
Expand All @@ -2141,10 +2162,17 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
and (text[-1] != " ")
):
process_operation(b"Tj", [" "])
if visitor_text is not None and group_TJ:
visitor_text = visitor_text_before
visitor_text(
"".join(text_TJ),
cm_matrix,
tm_matrix_before,
cmap[3],
font_size,
)
elif operator == b"Do":
output += text
if visitor_text is not None:
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
try:
if output[-1] != "\n":
output += "\n"
Expand All @@ -2168,16 +2196,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
visitor_operand_before,
visitor_operand_after,
visitor_text,
group_TJ,
)
output += text
if visitor_text is not None:
visitor_text(
text,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
except Exception:
logger_warning(
f" impossible to decode XFormObject {operands[0]}",
Expand All @@ -2193,8 +2214,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
if visitor_operand_after is not None:
visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
output += text # just in case of
if text != "" and visitor_text is not None:
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
return output

def extract_text(
Expand All @@ -2207,6 +2226,7 @@ def extract_text(
visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
group_TJ: bool = True,
) -> str:
"""
Locate all text drawing commands, in the order they are provided in the
Expand Down Expand Up @@ -2246,6 +2266,8 @@ def extract_text(
text matrix, font-dictionary and font-size.
The font-dictionary may be None in case of unknown fonts.
If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
group_TJ: True for one call of visitor_text at each TJ,
False for calls of visitor_text at each text-fragment of TJ.
Returns:
The extracted text
Expand Down Expand Up @@ -2295,6 +2317,7 @@ def extract_text(
visitor_operand_before,
visitor_operand_after,
visitor_text,
group_TJ,
)

def extract_xform_text(
Expand All @@ -2305,6 +2328,7 @@ def extract_xform_text(
visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
group_TJ: bool = True,
) -> str:
"""
Extract text from an XObject.
Expand All @@ -2316,6 +2340,8 @@ def extract_xform_text(
visitor_operand_before:
visitor_operand_after:
visitor_text:
group_TJ: True for one call of visitor_text at each TJ,
False for calls of visitor_text at each text-fragment of TJ.
Returns:
The extracted text
Expand Down
43 changes: 36 additions & 7 deletions pypdf/_text_extraction/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def crlf_space_check(
output += text + "\n"
if visitor_text is not None:
visitor_text(
text + "\n",
"\n",
memo_cm,
memo_tm,
cmap[3],
Expand All @@ -136,13 +136,21 @@ def crlf_space_check(
and (output + text)[-1] != " "
):
text += " "
if visitor_text is not None:
visitor_text(
" ",
cm_matrix,
tm_matrix,
cmap[3],
font_size,
)
elif orientation == 180:
if delta_y > 0.8 * f:
if (output + text)[-1] != "\n":
output += text + "\n"
if visitor_text is not None:
visitor_text(
text + "\n",
"\n",
memo_cm,
memo_tm,
cmap[3],
Expand All @@ -155,13 +163,21 @@ def crlf_space_check(
and (output + text)[-1] != " "
):
text += " "
if visitor_text is not None:
visitor_text(
" ",
cm_matrix,
tm_matrix,
cmap[3],
font_size,
)
elif orientation == 90:
if delta_x > 0.8 * f:
if (output + text)[-1] != "\n":
output += text + "\n"
if visitor_text is not None:
visitor_text(
text + "\n",
"\n",
memo_cm,
memo_tm,
cmap[3],
Expand All @@ -180,7 +196,7 @@ def crlf_space_check(
output += text + "\n"
if visitor_text is not None:
visitor_text(
text + "\n",
"\n",
memo_cm,
memo_tm,
cmap[3],
Expand All @@ -193,6 +209,14 @@ def crlf_space_check(
and (output + text)[-1] != " "
):
text += " "
if visitor_text is not None:
visitor_text(
" ",
cm_matrix,
tm_matrix,
cmap[3],
font_size,
)
except Exception:
pass
tm_prev = tm_matrix.copy()
Expand All @@ -214,12 +238,13 @@ def handle_tj(
rtl_dir: bool,
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
) -> Tuple[str, bool]:

m = mult(tm_matrix, cm_matrix)
orientation = orient(m)
if orientation in orientations and len(operands) > 0:
if isinstance(operands[0], str):
text += operands[0]
if visitor_text is not None:
visitor_text(operands[0], cm_matrix, tm_matrix, cmap[3], font_size)
else:
t: str = ""
tt: bytes = (
Expand All @@ -243,6 +268,7 @@ def handle_tj(
[cmap[0][x] if x in cmap[0] else bytes((x,)).decode() for x in tt]
)
# "\u0590 - \u08FF \uFB50 - \uFDFF"
tj_text = ""
for x in [cmap[1][x] if x in cmap[1] else x for x in t]:
# x can be a sequence of bytes ; ex: habibi.pdf
if len(x) == 1:
Expand All @@ -258,7 +284,7 @@ def handle_tj(
or 0x20A0 <= xx <= 0x21FF # but (numbers) indices/exponents
or xx in CUSTOM_RTL_SPECIAL_CHARS # customized....
):
text = x + text if rtl_dir else text + x
tj_text = x + tj_text if rtl_dir else tj_text + x
elif ( # right-to-left characters set
0x0590 <= xx <= 0x08FF
or 0xFB1D <= xx <= 0xFDFF
Expand All @@ -280,6 +306,9 @@ def handle_tj(
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
text = ""
text = text + x
tj_text = tj_text + x
# fmt: on
text = tj_text + text if rtl_dir else text + tj_text
if visitor_text is not None:
visitor_text(tj_text, cm_matrix, tm_matrix, cmap[3], font_size)
return text, rtl_dir
38 changes: 27 additions & 11 deletions tests/test_page.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Test the pypdf._page module."""
import json
import math
import re
from copy import deepcopy
from io import BytesIO
from pathlib import Path
Expand Down Expand Up @@ -545,7 +546,7 @@ def print_op_b(op, args, cm_matrix, tm_matrix) -> None:
rectangles.append(r)

def print_visi(text, cm_matrix, tm_matrix, font_dict, font_size) -> None:
if text.strip() != "":
if text != "":
if logger.isEnabledFor(logging.DEBUG):
logger.debug(f"at {cm_matrix}, {tm_matrix}, font size={font_size}")
texts.append(
Expand All @@ -571,7 +572,7 @@ def extract_table(
It is expected that each cell is marked by a rectangle-object.
It is expected that the page contains one table only.
It is expected that the table contains at least 3 columns and 2 rows.
It is expected that the table contains at least 2 columns and 2 rows.
A list of rows is returned.
Each row contains a list of cells.
Expand Down Expand Up @@ -623,8 +624,8 @@ def extract_table(
curr_y = None
curr_row = None
for r in rectangles_filtered:
if col2count[r.x] < 3 or row2count[r.y] < 2:
# We expect at least 3 columns and 2 rows.
if col2count[r.x] < 2 or row2count[r.y] < 2:
# We expect at least 2 columns and 2 rows.
continue
if curr_y is None or r.y != curr_y:
# next row
Expand All @@ -646,7 +647,8 @@ def extract_table(

def extract_cell_text(cell_texts: List[PositionedText]) -> str:
"""Joins the text-objects of a cell."""
return ("".join(t.text for t in cell_texts)).strip()
text_raw = "".join(t.text for t in cell_texts)
return re.sub(r" +\n", "\n", text_raw.strip())

# Test 1: We test the analysis of page 7 "2.1 LRS model".
reader = PdfReader(RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf")
Expand All @@ -667,12 +669,16 @@ def ignore_large_rectangles(r) -> bool:
for t in texts:
for r in rectangles:
if r.contains(t.x, t.y):
texts = rectangle2texts.setdefault(r, [])
texts.append(t.text.strip())
rtexts = rectangle2texts.setdefault(r, [])
if t.text != "":
rtexts.append(t.text)
break
# Five boxes and the figure-description below.
assert len(rectangle2texts) == 6
box_texts = [" ".join(texts) for texts in rectangle2texts.values()]
assert len(rectangle2texts) == 11
box_texts = [
re.sub(" *\n", " ", "".join(texts).strip())
for texts in rectangle2texts.values()
]
assert "Hydro Network" in box_texts
assert "Hydro Events" in box_texts
assert "Metadata" in box_texts
Expand All @@ -697,10 +703,10 @@ def filter_first_table(r) -> bool:
assert extract_cell_text(rows[0][2]) == "Description"
assert extract_cell_text(rows[1][0]) == "September 2002"
# The line break between "English review;"
# and "Remove" is not detected.
# and "Remove" is detected.
assert (
extract_cell_text(rows[6][2])
== "English review;Remove the UML model for the Segmented view."
== "English review;\nRemove the UML model for the Segmented view."
)
assert extract_cell_text(rows[7][2]) == "Update from the March Workshop comments."

Expand Down Expand Up @@ -738,6 +744,16 @@ def visitor_td(op, args, cm, tm) -> None:
assert list_td[2] == (210.0, 210.0)
assert list_td[3] == (410.0, 210.0)

# Test 3b: check extract_visitor in Sample_Td-matrix.pdf
#
(texts, rectangles) = extract_text_and_rectangles(page_td_model)
rows = extract_table(texts, rectangles)
assert len(rows) == 2
assert extract_cell_text(rows[0][0]) == "Hello PDF!"
assert extract_cell_text(rows[0][1]) == "Hello PDF 200 0 Td!"
assert extract_cell_text(rows[1][0]) == "Hello PDF 2 1!"
assert extract_cell_text(rows[1][1]) == "Hello PDF 10 7!"


@pytest.mark.parametrize(
("pdf_path", "password", "embedded", "unembedded"),
Expand Down

0 comments on commit 387ea44

Please sign in to comment.