From bcd85c4e4b46116a049fbba54b5b0aa7260ff041 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 8 Oct 2023 11:58:42 +0200 Subject: [PATCH] BUG: invalid cm/tm in visitor functions (#2206) Reworks and is still valid to close #2059 Closes #2200 Closes #2075 --- docs/user/extract-text.md | 25 +++++++--- pypdf/__init__.py | 3 +- pypdf/_page.py | 70 ++++++++++++++++++--------- pypdf/_text_extraction/__init__.py | 31 +++++++----- tests/test_page.py | 78 ++++++++++++++++++++++++++++++ tests/test_text_extraction.py | 9 ++-- 6 files changed, 171 insertions(+), 45 deletions(-) diff --git a/docs/user/extract-text.md b/docs/user/extract-text.md index 6e1d1c775..649f723f6 100644 --- a/docs/user/extract-text.md +++ b/docs/user/extract-text.md @@ -27,14 +27,27 @@ Refer to [extract\_text](../modules/PageObject.html#pypdf._page.PageObject.extra You can use visitor-functions to control which part of a page you want to process and extract. The visitor-functions you provide will get called for each operator or for each text fragment. The function provided in argument visitor_text of function extract_text has five arguments: -text, current transformation matrix, text matrix, font-dictionary and font-size. -In most cases the x and y coordinates of the current position -are in index 4 and 5 of the current transformation matrix. +* text: the current text (as long as possible, can be up to a full line) +* user_matrix: current matrix to move from user coordinate space (also known as CTM) +* tm_matrix: current matrix from text coordinate space +* font-dictionary: full font dictionary +* font-size: the size (in text coordinate space) + +The matrix stores 6 parameters. The first 4 provide the rotation/scaling matrix and the last two provide the translation (horizontal/vertical). +It is recommended to use the user_matrix as it takes into account all transformations.
+ +Notes: + + - as indicated in the PDF 1.7 reference, page 204, the user matrix applies to text space/image space/form space/pattern space. + - if you want to get the full transformation from text to user space, you can use the `mult` function (available in global import) as follows: +`txt2user = mult(tm, cm)` +The font-size is the raw text size, which is affected by the `user_matrix`. + The font-dictionary may be None in case of unknown fonts. If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold". -**Caveat**: In complicated documents the calculated positions might be wrong. +**Caveat**: In complicated documents the calculated positions may be difficult to determine (if you move from multiple forms to page user space, for example). The function provided in argument visitor_operand_before has four arguments: operator, operand-arguments, current transformation matrix and text matrix. @@ -53,7 +66,7 @@ parts = [] def visitor_body(text, cm, tm, font_dict, font_size): - y = tm[5] + y = cm[5] if y > 50 and y < 720: parts.append(text) @@ -88,7 +101,7 @@ def visitor_svg_rect(op, args, cm, tm): def visitor_svg_text(text, cm, tm, fontDict, fontSize): - (x, y) = (tm[4], tm[5]) + (x, y) = (cm[4], cm[5]) dwg.add(dwg.text(text, insert=(x, y), fill="blue")) diff --git a/pypdf/__init__.py b/pypdf/__init__.py index 250c05564..df07b5306 100644 --- a/pypdf/__init__.py +++ b/pypdf/__init__.py @@ -10,7 +10,7 @@ from ._crypt_providers import crypt_provider from ._encryption import PasswordType from ._merger import PdfFileMerger, PdfMerger -from ._page import PageObject, Transformation +from ._page import PageObject, Transformation, mult from ._reader import DocumentInformation, PdfFileReader, PdfReader from ._version import __version__ from ._writer import ObjectDeletionFlag, PdfFileWriter, PdfWriter @@ -31,6 +31,7 @@ __all__ = [ "__version__", "_debug_versions", + "mult", "PageRange", "PaperSize", "DocumentInformation", diff --git a/pypdf/_page.py b/pypdf/_page.py index 
2e48927a9..4f9c25000 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1921,18 +1921,17 @@ def _extract_text( # are strings where the byte->string encoding was unknown, so adding # them to the text here would be gibberish. - cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] cm_stack = [] tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - tm_prev: List[float] = [ - 1.0, - 0.0, - 0.0, - 1.0, - 0.0, - 0.0, - ] # will store previous tm_matrix + + # cm/tm_prev stores the last modified matrices can be an intermediate position + cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + + # memo_cm/tm will be used to store the position at the beginning of building the text + memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] char_scale = 1.0 space_scale = 1.0 _space_width: float = 500.0 # will be set correctly at first Tf @@ -1943,9 +1942,9 @@ def current_spacewidth() -> float: return _space_width / 1000.0 def process_operation(operator: bytes, operands: List) -> None: - nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, output, text + nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap - nonlocal orientations, rtl_dir, visitor_text + nonlocal orientations, rtl_dir, visitor_text, output, text global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS check_crlf_space: bool = False @@ -1954,14 +1953,18 @@ def process_operation(operator: bytes, operands: List) -> None: tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] output += text if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) + visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) text = "" + memo_cm = cm_matrix.copy() + memo_tm = tm_matrix.copy() return None elif operator == b"ET": output += text 
if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) + visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) text = "" + memo_cm = cm_matrix.copy() + memo_tm = tm_matrix.copy() # table 4.7 "Graphics state operators", page 219 # cm_matrix calculation is a reserved for the moment elif operator == b"q": @@ -1992,7 +1995,7 @@ def process_operation(operator: bytes, operands: List) -> None: elif operator == b"cm": output += text if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) + visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) text = "" cm_matrix = mult( [ @@ -2005,6 +2008,8 @@ def process_operation(operator: bytes, operands: List) -> None: ], cm_matrix, ) + memo_cm = cm_matrix.copy() + memo_tm = tm_matrix.copy() # Table 5.2 page 398 elif operator == b"Tz": char_scale = float(operands[0]) / 100.0 @@ -2016,8 +2021,10 @@ def process_operation(operator: bytes, operands: List) -> None: if text != "": output += text # .translate(cmap) if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) + visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) text = "" + memo_cm = cm_matrix.copy() + memo_tm = tm_matrix.copy() try: # charMapTuple: font_type, float(sp_width / 2), encoding, # map_dict, font-dictionary @@ -2088,10 +2095,9 @@ def process_operation(operator: bytes, operands: List) -> None: try: text, output, cm_prev, tm_prev = crlf_space_check( text, - cm_prev, - tm_prev, - cm_matrix, - tm_matrix, + (cm_prev, tm_prev), + (cm_matrix, tm_matrix), + (memo_cm, memo_tm), cmap, orientations, output, @@ -2099,6 +2105,9 @@ def process_operation(operator: bytes, operands: List) -> None: visitor_text, current_spacewidth(), ) + if text == "": + memo_cm = cm_matrix.copy() + memo_tm = tm_matrix.copy() except OrientationNotFoundError: return None @@ -2130,12 +2139,18 @@ def process_operation(operator: bytes, operands: List) -> None: elif operator == b"Do": output 
+= text if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) + visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) try: if output[-1] != "\n": output += "\n" if visitor_text is not None: - visitor_text("\n", cm_matrix, tm_matrix, cmap[3], font_size) + visitor_text( + "\n", + memo_cm, + memo_tm, + cmap[3], + font_size, + ) except IndexError: pass try: @@ -2151,7 +2166,13 @@ def process_operation(operator: bytes, operands: List) -> None: ) output += text if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) + visitor_text( + text, + memo_cm, + memo_tm, + cmap[3], + font_size, + ) except Exception: logger_warning( f" impossible to decode XFormObject {operands[0]}", @@ -2159,13 +2180,16 @@ def process_operation(operator: bytes, operands: List) -> None: ) finally: text = "" + memo_cm = cm_matrix.copy() + memo_tm = tm_matrix.copy() + else: process_operation(operator, operands) if visitor_operand_after is not None: visitor_operand_after(operator, operands, cm_matrix, tm_matrix) output += text # just in case of if text != "" and visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) + visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) return output def extract_text( diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index aa262dd5a..ea8adf56c 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -87,10 +87,9 @@ def orient(m: List[float]) -> int: def crlf_space_check( text: str, - cm_prev: List[float], - tm_prev: List[float], - cm_matrix: List[float], - tm_matrix: List[float], + cmtm_prev: Tuple[List[float], List[float]], + cmtm_matrix: Tuple[List[float], List[float]], + memo_cmtm: Tuple[List[float], List[float]], cmap: Tuple[ Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] ], @@ -100,6 +99,13 @@ def crlf_space_check( visitor_text: Optional[Callable[[Any, 
Any, Any, Any, Any], None]], spacewidth: float, ) -> Tuple[str, str, List[float], List[float]]: + cm_prev = cmtm_prev[0] + tm_prev = cmtm_prev[1] + cm_matrix = cmtm_matrix[0] + tm_matrix = cmtm_matrix[1] + memo_cm = memo_cmtm[0] + memo_tm = memo_cmtm[1] + m_prev = mult(tm_prev, cm_prev) m = mult(tm_matrix, cm_matrix) orientation = orient(m) @@ -107,6 +113,7 @@ def crlf_space_check( delta_y = m[5] - m_prev[5] k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2])) f = font_size * k + cm_prev = m if orientation not in orientations: raise OrientationNotFoundError try: @@ -117,8 +124,8 @@ def crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - cm_prev, - tm_prev, + memo_cm, + memo_tm, cmap[3], font_size, ) @@ -136,8 +143,8 @@ def crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - cm_prev, - tm_prev, + memo_cm, + memo_tm, cmap[3], font_size, ) @@ -155,8 +162,8 @@ def crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - cm_prev, - tm_prev, + memo_cm, + memo_tm, cmap[3], font_size, ) @@ -174,8 +181,8 @@ def crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - cm_prev, - tm_prev, + memo_cm, + memo_tm, cmap[3], font_size, ) diff --git a/tests/test_page.py b/tests/test_page.py index 1d6c49443..7368291a2 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1288,3 +1288,81 @@ def test_get_contents_from_nullobject(): p = writer.add_blank_page(100, 100) p[NameObject("/Contents")] = writer._add_object(NullObject()) p.get_contents() + + +@pytest.mark.enable_socket() +def test_pos_text_in_textvisitor(): + """See #2200""" + url = "https://github.com/py-pdf/pypdf/files/12675974/page_178.pdf" + name = "test_text_pos.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + p = () + + def visitor_body2(text, cm, tm, fontdict, fontsize) -> None: + nonlocal p + if text.startswith("5425."): + p = (tm[4], tm[5]) + + reader.pages[0].extract_text(visitor_text=visitor_body2) + 
assert abs(p[0] - 323.5) < 0.1 + assert abs(p[1] - 457.4) < 0.1 + + +@pytest.mark.enable_socket() +def test_pos_text_in_textvisitor2(): + """See #2075""" + url = "https://github.com/py-pdf/pypdf/files/12318042/LegIndex-page6.pdf" + name = "LegIndex-page6.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + x_lvl = 26 + lst = [] + + def visitor_lvl(text, cm, tm, fontdict, fontsize) -> None: + nonlocal x_lvl, lst + if abs(tm[4] - x_lvl) < 2 and tm[5] < 740 and tm[5] > 210: + lst.append(text.strip(" \n")) + + reader.pages[0].extract_text(visitor_text=visitor_lvl) + assert lst == [ + "ACUPUNCTURE BOARD", + "ACUPUNCTURISTS AND ACUPUNCTURE", + "ADMINISTRATIVE LAW AND PROCEDURE", + "ADMINISTRATIVE LAW, OFFICE OF", + "ADOPTION", + "ADULT EDUCATION", + "ADVERTISING. See also MARKETING; and particular subject matter (e.g.,", + ] + x_lvl = 35 + lst = [] + reader.pages[0].extract_text(visitor_text=visitor_lvl) + assert lst == [ + "members, AB 1264", + "assistants, acupuncture, AB 1264", + "complaints, investigations, etc., AB 1264", + "day, california acupuncture, HR 48", + "massage services, asian, AB 1264", + "supervising acupuncturists, AB 1264", + "supportive acupuncture services, basic, AB 1264", + "rules and regulations—", + "professional assistants and employees: employment and compensation, AB 916", + "adults, adoption of, AB 1756", + "agencies, organizations, etc.: requirements, prohibitions, etc., SB 807", + "assistance programs, adoption: nonminor dependents, SB 9", + "birth certificates, AB 1302", + "contact agreements, postadoption—", + "facilitators, adoption, AB 120", + "failed adoptions: reproductive loss leave, SB 848", + "hearings, adoption finalization: remote proceedings, technology, etc., SB 21", + "native american tribes, AB 120", + "parental rights, reinstatement of, AB 20", + "parents, prospective adoptive: criminal background checks, SB 824", + "services, adult educational, SB 877", + "week, adult education, ACR 31", + "alcoholic 
beverages: tied-house restrictions, AB 546", + "campaign re social equity, civil rights, etc., SB 447", + "cannabis, AB 794", + "elections. See ELECTIONS.", + "false, misleading, etc., advertising—", + "hotels, short-term rentals, etc., advertised rates: mandatory fee disclosures, SB 683", + "housing rental properties advertised rates: disclosures, SB 611", + ] diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index de39c1ace..790ce6cf6 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -7,7 +7,7 @@ import pytest -from pypdf import PdfReader +from pypdf import PdfReader, mult from pypdf._text_extraction import set_custom_rtl TESTS_ROOT = Path(__file__).parent.resolve() @@ -82,8 +82,11 @@ def test_visitor_text_matrices(file_name, constraints): lines = [] def visitor_text(text, cm, tm, font_dict, font_size) -> None: - x = tm[4] * cm[0] + tm[5] * cm[2] + cm[4] # mult(tm, cm)[4] - y = tm[4] * cm[1] + tm[5] * cm[3] + cm[5] # mult(tm, cm)[5] + ctm = mult(tm, cm) + x = ctm[4] # used to tm[4] * cm[0] + tm[5] * cm[2] + cm[4] # mult(tm, cm)[4] + y = ctm[ + 5 + ] # used to be tm[4] * cm[1] + tm[5] * cm[3] + cm[5] # mult(tm, cm)[5] lines.append({"text": text, "x": x, "y": y}) reader.pages[0].extract_text(visitor_text=visitor_text)