Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 21 additions & 11 deletions src/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@

# Module-level working state shared by the table-finding helpers.
# find_tables() rebinds EDGES and CHARS to fresh lists and clears CHARS_MADE
# at the start of every call.
EDGES = [] # vector graphics from PyMuPDF
CHARS = [] # text characters from PyMuPDF (one dict per character, appended by make_chars)
# True once make_chars() has run and filled CHARS. Table.extract() checks this
# flag and calls make_chars() itself when it is still False.
# NOTE(review): make_chars is declared as make_chars(page, clip=None), so any
# lazy call must supply the page - confirm the call in Table.extract() does.
CHARS_MADE = False
# Declared `global` inside make_chars(); presumably holds the text page created
# there - TODO confirm against the full make_chars body.
TEXTPAGE = None
# Short aliases for the MuPDF structured-text flag constants of the same name.
TEXT_BOLD = mupdf.FZ_STEXT_BOLD
TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT
Expand Down Expand Up @@ -1529,6 +1530,9 @@ def col_count(self) -> int: # PyMuPDF extension
return max([len(r.cells) for r in self.rows])

def extract(self, **kwargs) -> list:
if not CHARS_MADE:
make_chars()

chars = CHARS
table_arr = []

Expand Down Expand Up @@ -2152,7 +2156,7 @@ def __getitem__(self, i):
# -----------------------------------------------------------------------------
def make_chars(page, clip=None):
"""Extract text as "rawdict" to fill CHARS."""
global TEXTPAGE
global TEXTPAGE, CHARS_MADE
page_number = page.number + 1
page_height = page.rect.height
ctm = page.transformation_matrix
Expand Down Expand Up @@ -2204,6 +2208,7 @@ def make_chars(page, clip=None):
"y1": bbox_ctm.y1,
}
CHARS.append(char_dict)
CHARS_MADE = True


# ------------------------------------------------------------------------
Expand Down Expand Up @@ -2586,9 +2591,10 @@ def find_tables(
paths=None, # accept vector graphics as parameter
):
pymupdf._warn_layout_once()
global CHARS, EDGES
global CHARS, EDGES, CHARS_MADE
CHARS = []
EDGES = []
CHARS_MADE = False
old_small = bool(pymupdf.TOOLS.set_small_glyph_heights()) # save old value
pymupdf.TOOLS.set_small_glyph_heights(True) # we need minimum bboxes
if page.rotation != 0:
Expand Down Expand Up @@ -2656,15 +2662,19 @@ def find_tables(
tset = TableSettings.resolve(settings=settings)
page.table_settings = tset

make_chars(page, clip=clip) # create character list of page
make_edges(
page,
clip=clip,
tset=tset,
paths=paths,
add_lines=add_lines,
add_boxes=add_boxes,
) # create lines and curves
if tset.vertical_strategy == "text" or tset.horizontal_strategy == "text":
make_chars(page, clip=clip) # create character list of page
if tset.vertical_strategy.startswith(
"lines"
) or tset.horizontal_strategy.startswith("lines"):
make_edges(
page,
clip=clip,
tset=tset,
paths=paths,
add_lines=add_lines,
add_boxes=add_boxes,
) # create lines and curves

tbf = TableFinder(page, settings=tset)

Expand Down