diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py index 4fce54cf6..6f0d82667 100644 --- a/PyPDF2/_cmap.py +++ b/PyPDF2/_cmap.py @@ -12,8 +12,13 @@ def build_char_map( font_name: str, space_width: float, obj: DictionaryObject ) -> Tuple[ - str, float, Union[str, Dict[int, str]], Dict + str, float, Union[str, Dict[int, str]], Dict, DictionaryObject ]: # font_type,space_width /2, encoding, cmap + """Determine information about a font. + + This function returns a tuple consisting of: + font sub-type, space_width/2, encoding, character-map, font-dictionary. + The font-dictionary itself is suitable for the curious.""" ft: DictionaryObject = obj["/Resources"]["/Font"][font_name] # type: ignore font_type: str = cast(str, ft["/Subtype"]) @@ -58,6 +63,7 @@ def build_char_map( encoding, # https://github.com/python/mypy/issues/4374 map_dict, + ft, ) diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 5553e3da6..459dae05c 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -1261,6 +1261,9 @@ def _extract_text( orientations: Tuple[int, ...] = (0, 90, 180, 270), space_width: float = 200.0, content_key: Optional[str] = PG.CONTENTS, + visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, + visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, + visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, ) -> str: """ Locate all text drawing commands, in the order they are provided in the @@ -1273,6 +1276,9 @@ def _extract_text( Arabic, Hebrew,... are extracted in the good order. If required an custom RTL range of characters can be defined; see function set_custom_rtl + Additionally you can provide visitor-methods to get informed on all operations and all text-objects. + For example in some PDF files this can be useful to parse tables. + :param Tuple[int, ...] 
orientations: list of orientations text_extraction will look for default = (0, 90, 180, 270) note: currently only 0(Up),90(turned Left), 180(upside Down), 270 (turned Right) @@ -1281,13 +1287,27 @@ def _extract_text( :param Optional[str] content_key: indicate the default key where to extract data None = the object; this allow to reuse the function on XObject default = "/Content" + :param Optional[Function] visitor_operand_before: function to be called before processing an operation. + It has four arguments: operator, operand-arguments, + current transformation matrix and text matrix. + :param Optional[Function] visitor_operand_after: function to be called after processing an operation. + It has four arguments: operator, operand-arguments, + current transformation matrix and text matrix. + :param Optional[Function] visitor_text: function to be called when extracting some text at some position. + It has five arguments: text, + current transformation matrix, text matrix, font-dictionary and font-size. + The font-dictionary may be None in case of unknown fonts. + If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold". :return: a string object. 
""" text: str = "" output: str = "" rtl_dir: bool = False # right-to-left cmaps: Dict[ - str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str]] + str, + Tuple[ + str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject + ], ] = {} try: objr = obj @@ -1301,11 +1321,14 @@ def _extract_text( if "/Font" in resources_dict: for f in cast(DictionaryObject, resources_dict["/Font"]): cmaps[f] = build_char_map(f, space_width, obj) - cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str] = ( + cmap: Tuple[ + Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] + ] = ( "charmap", {}, "NotInitialized", - ) # (encoding,CMAP,font_name) + None, + ) # (encoding,CMAP,font resource name,dictionary-object of font) try: content = ( obj[content_key].get_object() if isinstance(content_key, str) else obj @@ -1360,7 +1383,7 @@ def current_spacewidth() -> float: return _space_width / 1000.0 def process_operation(operator: bytes, operands: List) -> None: - nonlocal cm_matrix, cm_stack, tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap, orientations, rtl_dir + nonlocal cm_matrix, cm_stack, tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap, orientations, rtl_dir, visitor_text global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS check_crlf_space: bool = False @@ -1369,6 +1392,8 @@ def process_operation(operator: bytes, operands: List) -> None: tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] # tm_prev = tm_matrix output += text + if visitor_text is not None: + visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) # based # if output != "" and output[-1]!="\n": # output += "\n" @@ -1376,8 +1401,10 @@ def process_operation(operator: bytes, operands: List) -> None: return None elif operator == b"ET": output += text + if visitor_text is not None: + visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) text = "" - # table 4.7, page 219 + # 
table 4.7 "Graphics state operators", page 219 # cm_matrix calculation is a reserved for the moment elif operator == b"q": cm_stack.append( @@ -1407,6 +1434,8 @@ def process_operation(operator: bytes, operands: List) -> None: # rtl_dir = False elif operator == b"cm": output += text + if visitor_text is not None: + visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) text = "" cm_matrix = mult( [ @@ -1430,14 +1459,21 @@ def process_operation(operator: bytes, operands: List) -> None: elif operator == b"Tf": if text != "": output += text # .translate(cmap) + if visitor_text is not None: + visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) text = "" # rtl_dir = False try: - _space_width = cmaps[operands[0]][1] + # charMapTuple: font_type, float(sp_width / 2), encoding, map_dict, font-dictionary + charMapTuple = cmaps[operands[0]] + _space_width = charMapTuple[1] + # current cmap: encoding, map_dict, font resource name (internal name, not the real font-name), + # font-dictionary. The font-dictionary describes the font. cmap = ( - cmaps[operands[0]][2], - cmaps[operands[0]][3], + charMapTuple[2], + charMapTuple[3], operands[0], + charMapTuple[4], ) except KeyError: # font not found _space_width = unknown_char_map[1] @@ -1445,6 +1481,7 @@ def process_operation(operator: bytes, operands: List) -> None: unknown_char_map[2], unknown_char_map[3], "???" 
+ operands[0], + None, ) try: font_size = float(operands[1]) @@ -1525,6 +1562,8 @@ def process_operation(operator: bytes, operands: List) -> None: rtl_dir = True # print("RTL",text,"*") output += text + if visitor_text is not None: + visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) text = "" text = x + text else: # left-to-right @@ -1533,6 +1572,8 @@ def process_operation(operator: bytes, operands: List) -> None: rtl_dir = False # print("LTR",text,"*") output += text + if visitor_text is not None: + visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) text = "" text = text + x # fmt: on @@ -1553,6 +1594,14 @@ def process_operation(operator: bytes, operands: List) -> None: if deltaY < -0.8 * f: if (output + text)[-1] != "\n": output += text + "\n" + if visitor_text is not None: + visitor_text( + text + "\n", + cm_matrix, + tm_matrix, + cmap[3], + font_size, + ) text = "" elif ( abs(deltaY) < f * 0.3 @@ -1564,6 +1613,14 @@ def process_operation(operator: bytes, operands: List) -> None: if deltaY > 0.8 * f: if (output + text)[-1] != "\n": output += text + "\n" + if visitor_text is not None: + visitor_text( + text + "\n", + cm_matrix, + tm_matrix, + cmap[3], + font_size, + ) text = "" elif ( abs(deltaY) < f * 0.3 @@ -1575,6 +1632,14 @@ def process_operation(operator: bytes, operands: List) -> None: if deltaX > 0.8 * f: if (output + text)[-1] != "\n": output += text + "\n" + if visitor_text is not None: + visitor_text( + text + "\n", + cm_matrix, + tm_matrix, + cmap[3], + font_size, + ) text = "" elif ( abs(deltaX) < f * 0.3 @@ -1586,6 +1651,14 @@ def process_operation(operator: bytes, operands: List) -> None: if deltaX < -0.8 * f: if (output + text)[-1] != "\n": output += text + "\n" + if visitor_text is not None: + visitor_text( + text + "\n", + cm_matrix, + tm_matrix, + cmap[3], + font_size, + ) text = "" elif ( abs(deltaX) < f * 0.3 @@ -1597,6 +1670,8 @@ def process_operation(operator: bytes, operands: List) -> None: pass for operands, operator 
in content.operations: + if visitor_operand_before is not None: + visitor_operand_before(operator, operands, cm_matrix, tm_matrix) # multiple operators are defined in here #### if operator == b"'": process_operation(b"T*", []) @@ -1622,17 +1697,30 @@ def process_operation(operator: bytes, operands: List) -> None: process_operation(b"Tj", [" "]) elif operator == b"Do": output += text + if visitor_text is not None: + visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) try: if output[-1] != "\n": output += "\n" + if visitor_text is not None: + visitor_text("\n", cm_matrix, tm_matrix, cmap[3], font_size) except IndexError: pass try: xobj = resources_dict["/XObject"] if xobj[operands[0]]["/Subtype"] != "/Image": # type: ignore # output += text - text = self.extract_xform_text(xobj[operands[0]], orientations, space_width) # type: ignore + text = self.extract_xform_text( + xobj[operands[0]], # type: ignore + orientations, + space_width, + visitor_operand_before, + visitor_operand_after, + visitor_text, + ) output += text + if visitor_text is not None: + visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) except Exception: logger_warning( f" impossible to decode XFormObject {operands[0]}", @@ -1642,7 +1730,11 @@ def process_operation(operator: bytes, operands: List) -> None: text = "" else: process_operation(operator, operands) + if visitor_operand_after is not None: + visitor_operand_after(operator, operands, cm_matrix, tm_matrix) output += text # just in case of + if text != "" and visitor_text is not None: + visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) return output def extract_text( @@ -1652,6 +1744,9 @@ def extract_text( TJ_sep: str = None, orientations: Union[int, Tuple[int, ...]] = (0, 90, 180, 270), space_width: float = 200.0, + visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, + visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, + visitor_text: Optional[Callable[[Any, Any, 
Any, Any, Any], None]] = None, ) -> str: """ Locate all text drawing commands, in the order they are provided in the @@ -1663,12 +1758,25 @@ def extract_text( Do not rely on the order of text coming out of this function, as it will change if this function is made more sophisticated. + Additionally you can provide visitor-methods to get informed on + all operations and all text-objects. + For example in some PDF files this can be useful to parse tables. + :param Tj_sep: Deprecated. Kept for compatibility until PyPDF2==4.0.0 :param TJ_sep: Deprecated. Kept for compatibility until PyPDF2==4.0.0 :param orientations: (list of) orientations (of the characters) (default: (0,90,270,360)) single int is equivalent to a singleton ( 0 == (0,) ) note: currently only 0(Up),90(turned Left), 180(upside Down),270 (turned Right) :param float space_width: force default space width (if not extracted from font (default: 200) + :param Optional[Function] visitor_operand_before: function to be called before processing an operation. + It has four arguments: operator, operand-arguments, + current transformation matrix and text matrix. + :param Optional[Function] visitor_operand_after: function to be called after processing an operation. + It has four arguments: operator, operand-arguments, + current transformation matrix and text matrix. + :param Optional[Function] visitor_text: function to be called when extracting some text at some position. + It has five arguments: text, + current transformation matrix, text matrix, font-dictionary and font-size. :return: The extracted text """ if len(args) >= 1: @@ -1708,7 +1816,14 @@ def extract_text( orientations = (orientations,) return self._extract_text( - self, self.pdf, orientations, space_width, PG.CONTENTS + self, + self.pdf, + orientations, + space_width, + PG.CONTENTS, + visitor_operand_before, + visitor_operand_after, + visitor_text, ) def extract_xform_text( @@ -1716,6 +1831,9 @@ def extract_xform_text( xform: EncodedStreamObject, orientations: Tuple[int, ...] 
= (0, 90, 270, 360), space_width: float = 200.0, + visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, + visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, + visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, ) -> str: """ Extract text from an XObject. @@ -1724,7 +1842,16 @@ def extract_xform_text( :return: The extracted text """ - return self._extract_text(xform, self.pdf, orientations, space_width, None) + return self._extract_text( + xform, + self.pdf, + orientations, + space_width, + None, + visitor_operand_before, + visitor_operand_after, + visitor_text, + ) def extractText( self, Tj_sep: str = "", TJ_sep: str = "" diff --git a/tests/test_page.py b/tests/test_page.py index 9712d9c09..741e67374 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -348,6 +348,260 @@ def test_extract_text_operator_t_star(): # L1266, L1267 page.extract_text() +def test_extract_text_visitor_callbacks(): + """ + Extract text in rectangle-objects or simple tables. + + This test uses GeoBase_NHNC1_Data_Model_UML_EN.pdf. + It extracts the labels of package-boxes in Figure 2. + It extracts the texts in table "REVISION HISTORY". + + """ + import logging + + class PositionedText: + """Specify a text with coordinates, font-dictionary and font-size. + + The font-dictionary may be None in case of an unknown font. + """ + + def __init__(self, text, x, y, font_dict, font_size) -> None: + # TODO \0-replace: Encoding issue in some files? + self.text = text.replace("\0", "") + self.x = x + self.y = y + self.font_dict = font_dict + self.font_size = font_size + + def get_base_font(self) -> str: + """Gets the base font of the text. 
+ + Return UNKNOWN in case of an unknown font.""" + if (self.font_dict is None) or "/BaseFont" not in self.font_dict: + return "UNKNOWN" + return self.font_dict["/BaseFont"] + + class Rectangle: + """Specify a rectangle.""" + + def __init__(self, x, y, w, h) -> None: + self.x = x.as_numeric() + self.y = y.as_numeric() + self.w = w.as_numeric() + self.h = h.as_numeric() + + def contains(self, x, y) -> bool: + return ( + x >= self.x + and x <= (self.x + self.w) + and y >= self.y + and y <= (self.y + self.h) + ) + + def extractTextAndRectangles(page: PageObject, rectFilter=None) -> tuple: + """ + Extracts texts and rectangles of a page of type PyPDF2._page.PageObject. + + This function supports simple coordinate transformations only. + The optional rectFilter-lambda can be used to filter wanted rectangles. + rectFilter has Rectangle as argument and must return a boolean. + + It returns a tuple containing a list of extracted texts (type PositionedText) + and a list of extracted rectangles (type Rectangle). 
+ """ + + logger = logging.getLogger("extractTextAndRectangles") + + listRects = [] + listTexts = [] + + def print_op_b(op, args, cm_matrix, tm_matrix): + if logger.isEnabledFor(logging.DEBUG): + logger.debug(f"before: {op} at {cm_matrix}, {tm_matrix}") + if op == b"re": + if logger.isEnabledFor(logging.DEBUG): + logger.debug(f" add rectangle: {args}") + w = args[2] + h = args[3] + r = Rectangle(args[0], args[1], w, h) + if (rectFilter is None) or rectFilter(r): + listRects.append(r) + + def print_visi(text, cm_matrix, tm_matrix, font_dict, font_size): + if text.strip() != "": + if logger.isEnabledFor(logging.DEBUG): + logger.debug(f"at {cm_matrix}, {tm_matrix}, fontSize={font_size}") + listTexts.append( + PositionedText( + text, tm_matrix[4], tm_matrix[5], font_dict, font_size + ) + ) + + visitor_before = print_op_b + visitor_text = print_visi + + page.extract_text( + visitor_operand_before=visitor_before, visitor_text=visitor_text + ) + + return (listTexts, listRects) + + def extractTable(listTexts: list, listRects: list) -> list: + """ + Extracts a table containing text. + + It is expected that each cell is marked by a rectangle-object. + It is expected that the page contains one table only. + It is expected that the table contains at least 3 columns and 2 rows. + + A list of rows is returned. + Each row contains a list of cells. + Each cell contains a list of PositionedText-elements. + """ + logger = logging.getLogger("extractTable") + + # Step 1: Count number of x- and y-coordinates of rectangles. + # Remove duplicate rectangles. the new list is listRectsFiltered. + mapColCount = {} + mapRowCount = {} + mapKnownRects = {} + listRectsFiltered = [] + for r in listRects: + # Coordinates may be inaccurate, we have to round. 
+ # cell: x=72.264, y=386.57, w=93.96, h=46.584 + # cell: x=72.271, y=386.56, w=93.96, h=46.59 + key = f"{round(r.x, 0)} {round(r.y, 0)} {round(r.w, 0)} {round(r.h, 0)}" + if key in mapKnownRects: + # Ignore duplicate rectangles + continue + mapKnownRects[key] = r + if r.x not in mapColCount: + mapColCount[r.x] = 0 + if r.y not in mapRowCount: + mapRowCount[r.y] = 0 + mapColCount[r.x] += 1 + mapRowCount[r.y] += 1 + listRectsFiltered.append(r) + + # Step 2: Look for texts in rectangles. + mapRectText = {} + for t in listTexts: + for r in listRectsFiltered: + if r.contains(t.x, t.y): + if r not in mapRectText: + mapRectText[r] = [] + mapRectText[r].append(t) + break + + # PDF: y = 0 is expected at the bottom of the page. + # So the header-row is expected to have the highest y-value. + listRects.sort(key=lambda r: (-r.y, r.x)) + + # Step 3: Build the list of rows containing list of cell-texts. + listRows = [] + rowNr = 0 + colNr = 0 + currY = None + currRow = None + for r in listRectsFiltered: + if mapColCount[r.x] < 3 or mapRowCount[r.y] < 2: + # We expect at least 3 columns and 2 rows. + continue + if currY is None or r.y != currY: + # next row + currY = r.y + colNr = 0 + rowNr += 1 + currRow = [] + listRows.append(currRow) + colNr += 1 + if logger.isEnabledFor(logging.DEBUG): + logger.debug(f"cell: x={r.x}, y={r.y}, w={r.w}, h={r.h}") + if r not in mapRectText: + currRow.append("") + continue + cellTexts = [t for t in mapRectText[r]] + currRow.append(cellTexts) + + return listRows + + def extract_cell_text(cellTexts: list) -> str: + """Joins the text-objects of a cell.""" + return ("".join(t.text for t in cellTexts)).strip() + + # Test 1: We test the analysis of page 7 "2.1 LRS model". + reader = PdfReader(RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf") + pageLrsModel = reader.pages[6] + + # We ignore the invisible large rectangles. 
+ def ignoreLargeRectangles(r): + return r.w < 400 and r.h < 400 + + (listTexts, listRects) = extractTextAndRectangles( + pageLrsModel, rectFilter=ignoreLargeRectangles + ) + + # We see ten rectangles (5 tabs, 5 boxes) but there are 64 rectangles (including some invisible ones). + assert 60 == len(listRects) + mapRectTexts = {} + for t in listTexts: + for r in listRects: + if r.contains(t.x, t.y): + texts = mapRectTexts.setdefault(r, []) + texts.append(t.text.strip()) + break + # Five boxes and the figure-description below. + assert 6 == len(mapRectTexts) + boxTexts = [" ".join(texts) for texts in mapRectTexts.values()] + assert "Hydro Network" in boxTexts + assert "Hydro Events" in boxTexts + assert "Metadata" in boxTexts + assert "Hydrography" in boxTexts + assert "Toponymy (external model)" in boxTexts + + # Test 2: Parse table "REVISION HISTORY" on page 3. + pageRevisions = reader.pages[2] + # We ignore the second table, therefore: r.y > 350 + + def filterFirstTable(r): + return r.w > 1 and r.h > 1 and r.w < 400 and r.h < 400 and r.y > 350 + + (listTexts, listRects) = extractTextAndRectangles( + pageRevisions, rectFilter=filterFirstTable + ) + listRows = extractTable(listTexts, listRects) + + assert len(listRows) == 9 + assert extract_cell_text(listRows[0][0]) == "Date" + assert extract_cell_text(listRows[0][1]) == "Version" + assert extract_cell_text(listRows[0][2]) == "Description" + assert extract_cell_text(listRows[1][0]) == "September 2002" + # The line break between "English review;" + # and "Remove" is not detected. + assert ( + extract_cell_text(listRows[6][2]) + == "English review;Remove the UML model for the Segmented view." + ) + assert ( + extract_cell_text(listRows[7][2]) == "Update from the March Workshop comments." + ) + + # Check the fonts. We check: /F2 9.96 Tf [...] 
[(Dat)-2(e)] TJ + textDatOfDate = listRows[0][0][0] + assert textDatOfDate.font_dict is not None + assert textDatOfDate.font_dict["/Name"] == "/F2" + assert textDatOfDate.get_base_font() == "/Arial,Bold" + assert textDatOfDate.font_dict["/Encoding"] == "/WinAnsiEncoding" + assert textDatOfDate.font_size == 9.96 + # Check: /F1 9.96 Tf [...] [(S)4(ep)4(t)-10(em)-20(be)4(r)-3( 20)4(02)] TJ + textS = listRows[1][0][0] + assert textS.font_dict is not None + assert textS.font_dict["/Name"] == "/F1" + assert textS.get_base_font() == "/Arial" + assert textS.font_dict["/Encoding"] == "/WinAnsiEncoding" + assert textDatOfDate.font_size == 9.96 + + @pytest.mark.parametrize( ("pdf_path", "password", "embedded", "unembedded"), [