BUG: invalid cm/tm in visitor functions (#2206)

Reworks and is still valid to close #2059 Closes #2200 Closes #2075
py-pdf · Oct 8, 2023 · bcd85c4 · bcd85c4
1 parent 126f6be
commit bcd85c4
Show file tree

Hide file tree

Showing 6 changed files with 171 additions and 45 deletions.
diff --git a/docs/user/extract-text.md b/docs/user/extract-text.md
@@ -27,14 +27,27 @@ Refer to [extract\_text](../modules/PageObject.html#pypdf._page.PageObject.extra
 You can use visitor-functions to control which part of a page you want to process and extract. The visitor-functions you provide will get called for each operator or for each text fragment.
 
 The function provided in argument visitor_text of function extract_text has five arguments:
-text, current transformation matrix, text matrix, font-dictionary and font-size.
-In most cases the x and y coordinates of the current position
-are in index 4 and 5 of the current transformation matrix.
+* text: the current text (as long as possible, can be up to a full line)
+* user_matrix: current matrix to move from user coordinate space (also known as CTM)
+* tm_matrix: current matrix from text coordinate space
+* font-dictionary: full font dictionary
+* font-size: the size (in text coordinate space)
+
+The matrix stores 6 parameters. The first 4 provide the rotation/scaling matrix and the last two provide the translation (horizontal/vertical).
+It is recommended to use the user_matrix as it takes into account all transformations.
+
+Notes :
+
+ - as indicated in the PDF 1.7 reference, page 204, the user matrix applies to text space/image space/form space/pattern space.
+ - if you want to get the full transformation from text to user space, you can use the `mult` function (available directly from the `pypdf` package import) as follows:
+`txt2user = mult(tm, cm)`
+The font-size is the raw text size, which is affected by the `user_matrix`.
+
 
 The font-dictionary may be None in case of unknown fonts.
 If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
 
-**Caveat**: In complicated documents the calculated positions might be wrong.
+**Caveat**: In complicated documents the calculated positions may be difficult to determine (for example, if you move from multiple forms to page user space).
 
 The function provided in argument visitor_operand_before has four arguments:
 operator, operand-arguments, current transformation matrix and text matrix.
@@ -53,7 +66,7 @@ parts = []
 
 
 def visitor_body(text, cm, tm, font_dict, font_size):
-    y = tm[5]
+    y = cm[5]
     if y > 50 and y < 720:
         parts.append(text)
 
@@ -88,7 +101,7 @@ def visitor_svg_rect(op, args, cm, tm):
 
 
 def visitor_svg_text(text, cm, tm, fontDict, fontSize):
-    (x, y) = (tm[4], tm[5])
+    (x, y) = (cm[4], cm[5])
     dwg.add(dwg.text(text, insert=(x, y), fill="blue"))
 
 

diff --git a/pypdf/__init__.py b/pypdf/__init__.py
@@ -10,7 +10,7 @@
 from ._crypt_providers import crypt_provider
 from ._encryption import PasswordType
 from ._merger import PdfFileMerger, PdfMerger
-from ._page import PageObject, Transformation
+from ._page import PageObject, Transformation, mult
 from ._reader import DocumentInformation, PdfFileReader, PdfReader
 from ._version import __version__
 from ._writer import ObjectDeletionFlag, PdfFileWriter, PdfWriter
@@ -31,6 +31,7 @@
 __all__ = [
     "__version__",
     "_debug_versions",
+    "mult",
     "PageRange",
     "PaperSize",
     "DocumentInformation",

diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -1921,18 +1921,17 @@ def _extract_text(
         # are strings where the byte->string encoding was unknown, so adding
         # them to the text here would be gibberish.
 
-        cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
         cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
         cm_stack = []
         tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
-        tm_prev: List[float] = [
-            1.0,
-            0.0,
-            0.0,
-            1.0,
-            0.0,
-            0.0,
-        ]  # will store previous tm_matrix
+
+        # cm_prev/tm_prev store the last modified matrices; these can be an intermediate position
+        cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+        tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+
+        # memo_cm/tm will be used to store the position at the beginning of building the text
+        memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+        memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
         char_scale = 1.0
         space_scale = 1.0
         _space_width: float = 500.0  # will be set correctly at first Tf
@@ -1943,9 +1942,9 @@ def current_spacewidth() -> float:
             return _space_width / 1000.0
 
         def process_operation(operator: bytes, operands: List) -> None:
-            nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, output, text
+            nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm
             nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
-            nonlocal orientations, rtl_dir, visitor_text
+            nonlocal orientations, rtl_dir, visitor_text, output, text
             global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
 
             check_crlf_space: bool = False
@@ -1954,14 +1953,18 @@ def process_operation(operator: bytes, operands: List) -> None:
                 tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
                 output += text
                 if visitor_text is not None:
-                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
+                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 text = ""
+                memo_cm = cm_matrix.copy()
+                memo_tm = tm_matrix.copy()
                 return None
             elif operator == b"ET":
                 output += text
                 if visitor_text is not None:
-                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
+                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 text = ""
+                memo_cm = cm_matrix.copy()
+                memo_tm = tm_matrix.copy()
             # table 4.7 "Graphics state operators", page 219
             # cm_matrix calculation is a reserved for the moment
             elif operator == b"q":
@@ -1992,7 +1995,7 @@ def process_operation(operator: bytes, operands: List) -> None:
             elif operator == b"cm":
                 output += text
                 if visitor_text is not None:
-                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
+                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 text = ""
                 cm_matrix = mult(
                     [
@@ -2005,6 +2008,8 @@ def process_operation(operator: bytes, operands: List) -> None:
                     ],
                     cm_matrix,
                 )
+                memo_cm = cm_matrix.copy()
+                memo_tm = tm_matrix.copy()
             # Table 5.2 page 398
             elif operator == b"Tz":
                 char_scale = float(operands[0]) / 100.0
@@ -2016,8 +2021,10 @@ def process_operation(operator: bytes, operands: List) -> None:
                 if text != "":
                     output += text  # .translate(cmap)
                     if visitor_text is not None:
-                        visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
+                        visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 text = ""
+                memo_cm = cm_matrix.copy()
+                memo_tm = tm_matrix.copy()
                 try:
                     # charMapTuple: font_type, float(sp_width / 2), encoding,
                     #               map_dict, font-dictionary
@@ -2088,17 +2095,19 @@ def process_operation(operator: bytes, operands: List) -> None:
                 try:
                     text, output, cm_prev, tm_prev = crlf_space_check(
                         text,
-                        cm_prev,
-                        tm_prev,
-                        cm_matrix,
-                        tm_matrix,
+                        (cm_prev, tm_prev),
+                        (cm_matrix, tm_matrix),
+                        (memo_cm, memo_tm),
                         cmap,
                         orientations,
                         output,
                         font_size,
                         visitor_text,
                         current_spacewidth(),
                     )
+                    if text == "":
+                        memo_cm = cm_matrix.copy()
+                        memo_tm = tm_matrix.copy()
                 except OrientationNotFoundError:
                     return None
 
@@ -2130,12 +2139,18 @@ def process_operation(operator: bytes, operands: List) -> None:
             elif operator == b"Do":
                 output += text
                 if visitor_text is not None:
-                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
+                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 try:
                     if output[-1] != "\n":
                         output += "\n"
                         if visitor_text is not None:
-                            visitor_text("\n", cm_matrix, tm_matrix, cmap[3], font_size)
+                            visitor_text(
+                                "\n",
+                                memo_cm,
+                                memo_tm,
+                                cmap[3],
+                                font_size,
+                            )
                 except IndexError:
                     pass
                 try:
@@ -2151,21 +2166,30 @@ def process_operation(operator: bytes, operands: List) -> None:
                         )
                         output += text
                         if visitor_text is not None:
-                            visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
+                            visitor_text(
+                                text,
+                                memo_cm,
+                                memo_tm,
+                                cmap[3],
+                                font_size,
+                            )
                 except Exception:
                     logger_warning(
                         f" impossible to decode XFormObject {operands[0]}",
                         __name__,
                     )
                 finally:
                     text = ""
+                    memo_cm = cm_matrix.copy()
+                    memo_tm = tm_matrix.copy()
+
             else:
                 process_operation(operator, operands)
             if visitor_operand_after is not None:
                 visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
         output += text  # just in case of
         if text != "" and visitor_text is not None:
-            visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
+            visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
         return output
 
     def extract_text(

diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
@@ -87,10 +87,9 @@ def orient(m: List[float]) -> int:
 
 def crlf_space_check(
     text: str,
-    cm_prev: List[float],
-    tm_prev: List[float],
-    cm_matrix: List[float],
-    tm_matrix: List[float],
+    cmtm_prev: Tuple[List[float], List[float]],
+    cmtm_matrix: Tuple[List[float], List[float]],
+    memo_cmtm: Tuple[List[float], List[float]],
     cmap: Tuple[
         Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
     ],
@@ -100,13 +99,21 @@ def crlf_space_check(
     visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
     spacewidth: float,
 ) -> Tuple[str, str, List[float], List[float]]:
+    cm_prev = cmtm_prev[0]
+    tm_prev = cmtm_prev[1]
+    cm_matrix = cmtm_matrix[0]
+    tm_matrix = cmtm_matrix[1]
+    memo_cm = memo_cmtm[0]
+    memo_tm = memo_cmtm[1]
+
     m_prev = mult(tm_prev, cm_prev)
     m = mult(tm_matrix, cm_matrix)
     orientation = orient(m)
     delta_x = m[4] - m_prev[4]
     delta_y = m[5] - m_prev[5]
     k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
     f = font_size * k
+    cm_prev = m
     if orientation not in orientations:
         raise OrientationNotFoundError
     try:
@@ -117,8 +124,8 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            cm_prev,
-                            tm_prev,
+                            memo_cm,
+                            memo_tm,
                             cmap[3],
                             font_size,
                         )
@@ -136,8 +143,8 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            cm_prev,
-                            tm_prev,
+                            memo_cm,
+                            memo_tm,
                             cmap[3],
                             font_size,
                         )
@@ -155,8 +162,8 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            cm_prev,
-                            tm_prev,
+                            memo_cm,
+                            memo_tm,
                             cmap[3],
                             font_size,
                         )
@@ -174,8 +181,8 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            cm_prev,
-                            tm_prev,
+                            memo_cm,
+                            memo_tm,
                             cmap[3],
                             font_size,
                         )

diff --git a/tests/test_page.py b/tests/test_page.py
@@ -1288,3 +1288,81 @@ def test_get_contents_from_nullobject():
     p = writer.add_blank_page(100, 100)
     p[NameObject("/Contents")] = writer._add_object(NullObject())
     p.get_contents()
+
+
+@pytest.mark.enable_socket()
+def test_pos_text_in_textvisitor():
+    """See #2200"""
+    url = "https://github.com/py-pdf/pypdf/files/12675974/page_178.pdf"
+    name = "test_text_pos.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    p = ()
+
+    def visitor_body2(text, cm, tm, fontdict, fontsize) -> None:
+        nonlocal p
+        if text.startswith("5425."):
+            p = (tm[4], tm[5])
+
+    reader.pages[0].extract_text(visitor_text=visitor_body2)
+    assert abs(p[0] - 323.5) < 0.1
+    assert abs(p[1] - 457.4) < 0.1
+
+
+@pytest.mark.enable_socket()
+def test_pos_text_in_textvisitor2():
+    """See #2075"""
+    url = "https://github.com/py-pdf/pypdf/files/12318042/LegIndex-page6.pdf"
+    name = "LegIndex-page6.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    x_lvl = 26
+    lst = []
+
+    def visitor_lvl(text, cm, tm, fontdict, fontsize) -> None:
+        nonlocal x_lvl, lst
+        if abs(tm[4] - x_lvl) < 2 and tm[5] < 740 and tm[5] > 210:
+            lst.append(text.strip(" \n"))
+
+    reader.pages[0].extract_text(visitor_text=visitor_lvl)
+    assert lst == [
+        "ACUPUNCTURE BOARD",
+        "ACUPUNCTURISTS AND ACUPUNCTURE",
+        "ADMINISTRATIVE LAW AND PROCEDURE",
+        "ADMINISTRATIVE LAW, OFFICE OF",
+        "ADOPTION",
+        "ADULT EDUCATION",
+        "ADVERTISING. See also MARKETING; and particular subject matter (e.g.,",
+    ]
+    x_lvl = 35
+    lst = []
+    reader.pages[0].extract_text(visitor_text=visitor_lvl)
+    assert lst == [
+        "members,  AB 1264",
+        "assistants, acupuncture,  AB 1264",
+        "complaints, investigations, etc.,  AB 1264",
+        "day, california acupuncture,  HR 48",
+        "massage services, asian,  AB 1264",
+        "supervising acupuncturists,  AB 1264",
+        "supportive acupuncture services, basic,  AB 1264",
+        "rules and regulations—",
+        "professional assistants and employees: employment and compensation,  AB 916",
+        "adults, adoption of,  AB 1756",
+        "agencies, organizations, etc.: requirements, prohibitions, etc.,  SB 807",
+        "assistance programs, adoption: nonminor dependents,  SB 9",
+        "birth certificates,  AB 1302",
+        "contact agreements, postadoption—",
+        "facilitators, adoption,  AB 120",
+        "failed adoptions: reproductive loss leave,  SB 848",
+        "hearings, adoption finalization: remote proceedings, technology, etc.,  SB 21",
+        "native american tribes,  AB 120",
+        "parental rights, reinstatement of,  AB 20",
+        "parents, prospective adoptive: criminal background checks,  SB 824",
+        "services, adult educational,  SB 877",
+        "week, adult education,  ACR 31",
+        "alcoholic beverages: tied-house restrictions,  AB 546",
+        "campaign re social equity, civil rights, etc.,  SB 447",
+        "cannabis,  AB 794",
+        "elections. See ELECTIONS.",
+        "false, misleading, etc., advertising—",
+        "hotels, short-term rentals, etc., advertised rates: mandatory fee disclosures,  SB 683",
+        "housing rental properties advertised rates: disclosures,  SB 611",
+    ]