From 95f590db81dff2f7f10213f5ce63501a691deec5 Mon Sep 17 00:00:00 2001 From: monchin Date: Thu, 20 Nov 2025 09:55:10 +0800 Subject: [PATCH 1/2] Improve the performance of table-extraction by judging whether to do "make_chars" or "make_edges" by checking strategy --- src/table.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/table.py b/src/table.py index 9b73782d3..2daad996f 100644 --- a/src/table.py +++ b/src/table.py @@ -2656,15 +2656,19 @@ def find_tables( tset = TableSettings.resolve(settings=settings) page.table_settings = tset - make_chars(page, clip=clip) # create character list of page - make_edges( - page, - clip=clip, - tset=tset, - paths=paths, - add_lines=add_lines, - add_boxes=add_boxes, - ) # create lines and curves + if tset.vertical_strategy == "text" or tset.horizontal_strategy == "text": + make_chars(page, clip=clip) # create character list of page + if tset.vertical_strategy.startswith( + "lines" + ) or tset.horizontal_strategy.startswith("lines"): + make_edges( + page, + clip=clip, + tset=tset, + paths=paths, + add_lines=add_lines, + add_boxes=add_boxes, + ) # create lines and curves tbf = TableFinder(page, settings=tset) From 4aebd333838447e8f29d8ae599af55cb089a9308 Mon Sep 17 00:00:00 2001 From: monchin Date: Wed, 26 Nov 2025 16:34:03 +0800 Subject: [PATCH 2/2] Call make_chars before Table.extract if it is not called before --- src/table.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/table.py b/src/table.py index 2daad996f..5becbc6b2 100644 --- a/src/table.py +++ b/src/table.py @@ -89,6 +89,7 @@ EDGES = [] # vector graphics from PyMuPDF CHARS = [] # text characters from PyMuPDF +CHARS_MADE = False # whether make_chars is executed. 
If not, make_chars is called before Table.extract() TEXTPAGE = None TEXT_BOLD = mupdf.FZ_STEXT_BOLD TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT @@ -1529,6 +1530,9 @@ def col_count(self) -> int: # PyMuPDF extension return max([len(r.cells) for r in self.rows]) def extract(self, **kwargs) -> list: + if not CHARS_MADE: + make_chars(self.page) + chars = CHARS table_arr = [] @@ -2152,7 +2156,7 @@ def __getitem__(self, i): # ----------------------------------------------------------------------------- def make_chars(page, clip=None): """Extract text as "rawdict" to fill CHARS.""" - global TEXTPAGE + global TEXTPAGE, CHARS_MADE page_number = page.number + 1 page_height = page.rect.height ctm = page.transformation_matrix @@ -2204,6 +2208,7 @@ def make_chars(page, clip=None): "y1": bbox_ctm.y1, } CHARS.append(char_dict) + CHARS_MADE = True # ------------------------------------------------------------------------ @@ -2586,9 +2591,10 @@ def find_tables( paths=None, # accept vector graphics as parameter ): pymupdf._warn_layout_once() - global CHARS, EDGES + global CHARS, EDGES, CHARS_MADE CHARS = [] EDGES = [] + CHARS_MADE = False old_small = bool(pymupdf.TOOLS.set_small_glyph_heights()) # save old value pymupdf.TOOLS.set_small_glyph_heights(True) # we need minimum bboxes if page.rotation != 0: