From 358f54972772fd8f83ca921016341085d8d2d64c Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Wed, 15 Oct 2025 10:15:44 -0400 Subject: [PATCH] Support the PyMuPDF-Layout Feature --- src/__init__.py | 70 +++++- src/extra.i | 591 +++++++++++++++++++++++++++++++++--------------- src/table.py | 276 ++++++++++++++++------ 3 files changed, 677 insertions(+), 260 deletions(-) diff --git a/src/__init__.py b/src/__init__.py index f1cb0a051..012f6506e 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -28,7 +28,7 @@ import zipfile from . import extra - +import importlib.util # Set up g_out_log and g_out_message from environment variables. # @@ -333,6 +333,37 @@ def __init__(self): _globals = _Globals() +_get_layout: typing.Optional[typing.Callable] = None + +# global switch ensuring that the recommendation message is shown at most once +_recommend_layout = True # must be referred to as "global" everywhere + + +def no_recommend_layout(): + """For users who never want to see the layout recommendation.""" + global _recommend_layout + _recommend_layout = False + + +def _warn_layout_once(): + """Check if we should recommend installing the layout package.""" + msg="""Consider using the pymupdf_layout package for a greatly improved page layout analysis.""" + + global _recommend_layout + if ( + 1 + and _recommend_layout # still True? + and _get_layout is None # no layout function stored here + + # client did not globally disable the recommendation + and os.getenv("PYMUPDF_SUGGEST_LAYOUT_ANALYZER") != "0" + + # layout is not available in this Python + and not importlib.util.find_spec("pymupdf.layout") + ): + print(msg) + _recommend_layout = False # never show the message again + # Optionally use MuPDF via cppyy bindings; experimental and not tested recently # as of 2023-01-20 11:51:40 @@ -9952,7 +9983,7 @@ def _get_resource_properties(self): return rc def _get_textpage(self, clip=None, flags=0, matrix=None): - if g_use_extra: + if 1 or g_use_extra: ll_tpage = extra.page_get_textpage(self.this, clip, flags, matrix) tpage = mupdf.FzStextPage(ll_tpage) return tpage @@ -10781,6 +10812,20 @@ def clip_to_rect(self, rect): pclip = JM_rect_from_py(clip) mupdf.pdf_clip_page(pdfpage, pclip) + def get_layout(self): + """Try to access layout information.""" + + if self.layout_information is not None: + # layout information already present + return + + if not _get_layout: + # no layout information available + return + + layout_info = _get_layout(self) + self.layout_information = layout_info + @property def artbox(self): """The ArtBox""" @@ -11432,7 +11477,7 @@ def get_cdrawings(self, extended=None, callback=None, method=None): assert isinstance(page, mupdf.FzPage), f'{self.this=}' clips = True if extended else False prect = mupdf.fz_bound_page(page) - if g_use_extra: + if 1 or g_use_extra: rc = extra.get_cdrawings(page, extended, callback, method) else: rc = list() @@ -12157,7 +12202,7 @@ def get_texttrace(self): self.set_rotation(0) page = self.this rc = [] - if g_use_extra: + if 1 or g_use_extra: dev = extra.JM_new_texttrace_device(rc) else: dev = JM_new_texttrace_device(rc) @@ -13206,6 +13251,9 @@ def xref(self): rect = property(bound, doc="page rectangle") + # any result of layout analysis is stored here + layout_information = None + class Pixmap: @@ -16391,7 +16439,7 @@ def _textpage_dict(self, raw=False): def extractBLOCKS(self): """Return a list with text block information.""" - if g_use_extra: + if 1 or g_use_extra: return extra.extractBLOCKS(self.this) block_n = -1 this_tpage = self.this @@ -16587,7 +16635,7 @@ def extractTextbox(self, rect): def extractWORDS(self, delimiters=None): """Return a list with text word information.""" - if g_use_extra: + if 1 or g_use_extra: return extra.extractWORDS(self.this, delimiters) buflen = 0 last_char_rtl = 0 @@ -18969,7 +19017,7 @@ def JM_color_FromSequence(color): def JM_color_count( pm, clip): - if g_use_extra: + if 1 or g_use_extra: return extra.ll_JM_color_count(pm.m_internal, clip) rc = dict() @@ -20469,7 +20517,7 @@ def JM_make_annot_DA(annot, ncol, col, fontname, fontsize): def JM_make_spanlist(line_dict, line, raw, buff, tp_rect): - if g_use_extra: + if 1 or g_use_extra: return extra.JM_make_spanlist(line_dict, line, raw, buff, tp_rect) char_list = None span_list = [] @@ -20682,7 +20730,7 @@ def JM_make_image_block(block, block_dict): def JM_make_text_block(block, block_dict, raw, buff, tp_rect): - if g_use_extra: + if 1 or g_use_extra: return extra.JM_make_text_block(block.m_internal, block_dict, raw, buff.m_internal, tp_rect.m_internal) line_list = [] block_rect = mupdf.FzRect(mupdf.FzRect.Fixed_EMPTY) @@ -20705,7 +20753,7 @@ def JM_make_text_block(block, block_dict, raw, buff, tp_rect): def JM_make_textpage_dict(tp, page_dict, raw): - if g_use_extra: + if 1 or g_use_extra: return extra.JM_make_textpage_dict(tp.m_internal, page_dict, raw) text_buffer = mupdf.fz_new_buffer(128) block_list = [] @@ -21356,7 +21404,7 @@ def JM_rotate_page_matrix(page): def JM_search_stext_page(page, needle): - if g_use_extra: + if 1 or g_use_extra: return extra.JM_search_stext_page(page.m_internal, needle) rect = mupdf.FzRect(page.m_internal.mediabox) diff --git a/src/extra.i b/src/extra.i index 9d448dac1..8e4e75efb 100644 --- a/src/extra.i +++ b/src/extra.i @@ -1739,29 +1739,6 @@ static const char* JM_font_name(fz_font* font) return s + 1; } -static int detect_super_script(fz_stext_line *line, fz_stext_char *ch) -{ - if (line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0) - { - return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f; - } - return 0; -} - -static int JM_char_font_flags(fz_font *font, fz_stext_line *line, fz_stext_char *ch) -{ - int flags = 0; - if (line && ch) - { - flags += detect_super_script(line, ch) * TEXT_FONT_SUPERSCRIPT; - } - flags += mupdf::ll_fz_font_is_italic(font) * TEXT_FONT_ITALIC; - flags += mupdf::ll_fz_font_is_serif(font) * TEXT_FONT_SERIFED; - flags += mupdf::ll_fz_font_is_monospaced(font) * TEXT_FONT_MONOSPACED; - flags += mupdf::ll_fz_font_is_bold(font) * TEXT_FONT_BOLD; - return flags; -} - static void jm_trace_text_span( jm_tracedraw_device* dev, fz_text_span* span, @@ -2297,37 +2274,64 @@ void JM_append_rune(fz_buffer *buff, int ch); // but lines within a block are concatenated by space instead a new-line // character (which else leads to 2 new-lines). //----------------------------------------------------------------------------- -void JM_print_stext_page_as_text(mupdf::FzBuffer& res, mupdf::FzStextPage& page) +void _as_text(fz_stext_block *block, mupdf::FzBuffer& res, mupdf::FzStextPage& page) { + /* + Recursive function for output by blocks as identified by the + MuPDF SEGMENT logic. + The recursion happens when we encounter a structure block. + */ fz_rect rect = page.m_internal->mediabox; - - for (auto block: page) + int last_char; + fz_stext_line *line; + fz_stext_char *ch; + while (block) { - if (block.m_internal->type == FZ_STEXT_BLOCK_TEXT) + switch (block->type) { - for (auto line: block) - { - int last_char = 0; - for (auto ch: line) + case FZ_STEXT_BLOCK_STRUCT: + if (block->u.s.down) + { + _as_text(block->u.s.down->first_block, res, page); + } + break; + + case FZ_STEXT_BLOCK_TEXT: + last_char = 0; + for (line = block->u.t.first_line; line; line = line->next) { - fz_rect chbbox = JM_char_bbox( line, ch); - if (mupdf::ll_fz_is_infinite_rect(rect) - || JM_rects_overlap(rect, chbbox) - ) + for (ch = line->first_char; ch; ch = ch->next) + { + fz_rect chbbox = JM_char_bbox( line, ch); + if (mupdf::ll_fz_is_infinite_rect(rect) || JM_rects_overlap(rect, chbbox)) + { + last_char = ch->c; + JM_append_rune(res.m_internal, last_char); + } + } + if (last_char != 10 && last_char > 0) { - last_char = ch.m_internal->c; - JM_append_rune(res.m_internal, last_char); + mupdf::ll_fz_append_string(res.m_internal, "\n"); + last_char = 10; } } if (last_char != 10 && last_char > 0) { mupdf::ll_fz_append_string(res.m_internal, "\n"); + last_char = 10; } - } + break; } + block = block->next; } } +void JM_print_stext_page_as_text(mupdf::FzBuffer& res, mupdf::FzStextPage& page) +{ + fz_stext_block *block = page.m_internal->first_block; + _as_text(block, res, page); +} + // path_type is one of: @@ -3006,6 +3010,25 @@ PyObject* get_cdrawings(mupdf::FzPage& page, PyObject *extended=NULL, PyObject * } +static int detect_super_script(fz_stext_line *line, fz_stext_char *ch) +{ + if (line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0) + { + return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f; + } + return 0; +} + +static int JM_char_font_flags(fz_font *font, fz_stext_line *line, fz_stext_char *ch) +{ + int flags = detect_super_script(line, ch); + flags += mupdf::ll_fz_font_is_italic(font) * TEXT_FONT_ITALIC; + flags += mupdf::ll_fz_font_is_serif(font) * TEXT_FONT_SERIFED; + flags += mupdf::ll_fz_font_is_monospaced(font) * TEXT_FONT_MONOSPACED; + flags += mupdf::ll_fz_font_is_bold(font) * TEXT_FONT_BOLD; + return flags; +} + //--------------------------------------------------------------------------- // APPEND non-ascii runes in unicode escape format to fz_buffer //--------------------------------------------------------------------------- @@ -3264,51 +3287,77 @@ int JM_append_word( return word_n + 1; // word counter } -PyObject* extractWORDS(mupdf::FzStextPage& this_tpage, PyObject *delimiters) +int _as_words(fz_stext_block *block, mupdf::FzBuffer& buff, fz_rect tp_rect, PyObject *lines, int block_n, PyObject *delimiters) { - int block_n = -1; - fz_rect wbbox = fz_empty_rect; // word bbox - fz_rect tp_rect = this_tpage.m_internal->mediabox; - - PyObject *lines = NULL; - mupdf::FzBuffer buff = mupdf::fz_new_buffer(64); - lines = PyList_New(0); - for (mupdf::FzStextBlock block: this_tpage) + /* 'buff' is intermediate storage for composing a word. Used as parameter only for + avoiding repeated allocation of an FzBuffer.*/ + int line_n; + fz_stext_line *line; + fz_stext_char *ch; + fz_rect wbbox, blockrect; + while (block) { - block_n++; - if (block.m_internal->type != FZ_STEXT_BLOCK_TEXT) + switch (block->type) { - continue; - } - int line_n = -1; - for (mupdf::FzStextLine line: block) - { - line_n++; - int word_n = 0; // word counter per line - mupdf::fz_clear_buffer(buff); // reset word buffer - size_t buflen = 0; // reset char counter - int last_char_rtl = 0; // was last character RTL? - for (mupdf::FzStextChar ch: line) - { - mupdf::FzRect cbbox = JM_char_bbox(line, ch); - if (!JM_rects_overlap(tp_rect, *cbbox.internal()) && !fz_is_infinite_rect(tp_rect)) + case FZ_STEXT_BLOCK_STRUCT: + if (block->u.s.down) { - continue; + block_n = _as_words(block->u.s.down->first_block, buff, tp_rect, lines, block_n, delimiters); } - // prevent Unicode ZWJ 0x200d to start a word - if (buflen == 0 && ch.m_internal->c == 0x200d) + break; + + case FZ_STEXT_BLOCK_TEXT: + block_n++; + blockrect = block->bbox; + wbbox = fz_empty_rect; + line_n = -1; + for (line = block->u.t.first_line; line; line = line->next) { - continue; - } - int word_delimiter = JM_is_word_delimiter(ch.m_internal->c, delimiters); - int this_char_rtl = JM_is_rtl_char(ch.m_internal->c); - if (word_delimiter || this_char_rtl != last_char_rtl) - { - if (buflen == 0 && word_delimiter) + line_n++; + int word_n = 0; // word counter per line + mupdf::fz_clear_buffer(buff); // reset word buffer + int last_char_rtl = 0; // was last character RTL? + for (ch = line->first_char; ch; ch = ch->next) { - continue; // skip delimiters at line start + mupdf::FzRect cbbox = JM_char_bbox(line, ch); + if (!JM_rects_overlap(tp_rect, *cbbox.internal()) && !fz_is_infinite_rect(tp_rect)) + { + continue; + } + // prevent Unicode ZWJ 0x200d to start a word + if (mupdf::fz_buffer_storage(buff, NULL) == 0 && ch->c == 0x200d) + { + continue; + } + int word_delimiter = JM_is_word_delimiter(ch->c, delimiters); + int this_char_rtl = JM_is_rtl_char(ch->c); + if (word_delimiter || this_char_rtl != last_char_rtl) + { + if (mupdf::fz_buffer_storage(buff, NULL) == 0 && word_delimiter) + { + continue; // skip delimiters at line start + } + if (!fz_is_empty_rect(wbbox)) + { + word_n = JM_append_word( + lines, + buff.m_internal, + &wbbox, + block_n, + line_n, + word_n + ); + } + mupdf::fz_clear_buffer(buff); + if (word_delimiter) continue; + } + // append one unicode character to the word + JM_append_rune(buff.m_internal, ch->c); + last_char_rtl = this_char_rtl; + // enlarge word bbox + wbbox = fz_union_rect(wbbox, JM_char_bbox(line, ch)); } - if (!fz_is_empty_rect(wbbox)) + if (mupdf::fz_buffer_storage(buff, NULL) && !fz_is_empty_rect(wbbox)) { word_n = JM_append_word( lines, @@ -3320,35 +3369,27 @@ PyObject* extractWORDS(mupdf::FzStextPage& this_tpage, PyObject *delimiters) ); } mupdf::fz_clear_buffer(buff); - buflen = 0; // reset char counter - if (word_delimiter) continue; } - // append one unicode character to the word - JM_append_rune(buff.m_internal, ch.m_internal->c); - last_char_rtl = this_char_rtl; - buflen++; - // enlarge word bbox - wbbox = fz_union_rect(wbbox, JM_char_bbox(line, ch)); - } - if (buflen && !fz_is_empty_rect(wbbox)) - { - word_n = JM_append_word( - lines, - buff.m_internal, - &wbbox, - block_n, - line_n, - word_n - ); - } - mupdf::fz_clear_buffer(buff); - buflen = 0; + break; } + block = block->next; } - return lines; + return block_n; } +PyObject* extractWORDS(mupdf::FzStextPage& this_tpage, PyObject *delimiters) +{ + int block_n = -1; + fz_rect tp_rect = this_tpage.m_internal->mediabox; + PyObject *lines = NULL; + mupdf::FzBuffer buff = mupdf::fz_new_buffer(64); + lines = PyList_New(0); + mupdf::FzStextBlock block = this_tpage.m_internal->first_block; + block_n = _as_words(block.m_internal, buff, tp_rect, lines, block_n, delimiters); + return lines; +} + struct ScopedPyObject /* PyObject* wrapper, destructor calls Py_CLEAR() unless `release()` has been @@ -3385,74 +3426,116 @@ called. */ PyObject* m_pyobject = nullptr; }; - -PyObject* extractBLOCKS(mupdf::FzStextPage& self) +int _as_blocks(fz_stext_block *block, fz_rect tp_rect, PyObject *lines, int block_n) { - fz_stext_page *this_tpage = self.m_internal; - fz_rect tp_rect = this_tpage->mediabox; - mupdf::FzBuffer res(1024); - ScopedPyObject lines( PyList_New(0)); - int block_n = -1; - for (fz_stext_block* block = this_tpage->first_block; block; block = block->next) + /* + Recursive function for output by blocks as identified by the + MuPDF SEGMENT logic. + Recursion happens on encountering a structure block. + In addition to the previous support of text and image, we now also support + vector blocks. + */ + PyObject *text = NULL; + fz_rect blockrect; + mupdf::FzBuffer res; + while (block) { - ScopedPyObject text; - block_n++; - fz_rect blockrect = fz_empty_rect; - if (block->type == FZ_STEXT_BLOCK_TEXT) + switch (block->type) { - mupdf::fz_clear_buffer(res); // set text buffer to empty - int line_n = -1; - int last_char = 0; - (void) line_n; /* Not actually used, but keeping in the code for now. */ - for (fz_stext_line* line = block->u.t.first_line; line; line = line->next) - { - line_n++; - fz_rect linerect = fz_empty_rect; - for (fz_stext_char* ch = line->first_char; ch; ch = ch->next) + case FZ_STEXT_BLOCK_STRUCT: + if (block->u.s.down) + { + block_n = _as_blocks(block->u.s.down->first_block, tp_rect, lines, block_n); + } + break; + + case FZ_STEXT_BLOCK_TEXT: + blockrect = fz_empty_rect; + res = mupdf::fz_new_buffer(1024); + int last_char; + for (fz_stext_line* line = block->u.t.first_line; line; line = line->next) { - fz_rect cbbox = JM_char_bbox(line, ch); - if (!JM_rects_overlap(tp_rect, cbbox) && !fz_is_infinite_rect(tp_rect)) + fz_rect linerect = fz_empty_rect; + for (fz_stext_char* ch = line->first_char; ch; ch = ch->next) { - continue; + fz_rect cbbox = JM_char_bbox(line, ch); + if (!JM_rects_overlap(tp_rect, cbbox) && !fz_is_infinite_rect(tp_rect)) + { + continue; + } + JM_append_rune(res.m_internal, ch->c); + last_char = ch->c; + linerect = fz_union_rect(linerect, cbbox); + } + if (last_char != 10 && !fz_is_empty_rect(linerect)) + { + JM_append_rune(res.m_internal, 10); } - JM_append_rune(res.m_internal, ch->c); - last_char = ch->c; - linerect = fz_union_rect(linerect, cbbox); + blockrect = fz_union_rect(blockrect, linerect); } - if (last_char != 10 && !fz_is_empty_rect(linerect)) + text = JM_EscapeStrFromBuffer(res); + break; + + case FZ_STEXT_BLOCK_IMAGE: + if (fz_contains_rect(tp_rect, block->bbox) || fz_is_infinite_rect(tp_rect)) { - mupdf::fz_append_byte(res, 10); + blockrect = block->bbox; + fz_image *img = block->u.i.image; + fz_colorspace *cs = img->colorspace; + text = PyUnicode_FromFormat( + "\n", + mupdf::ll_fz_colorspace_name(cs), + img->w, + img->h, + img->bpc + ); } - blockrect = fz_union_rect(blockrect, linerect); - } - text = JM_EscapeStrFromBuffer(res); + break; + + case FZ_STEXT_BLOCK_VECTOR: + if (JM_rects_overlap(tp_rect, block->bbox) || fz_is_infinite_rect(tp_rect)) + { + blockrect = block->bbox; + int alpha = (int) (block->u.v.argb >> 24); + int color = (int) (block->u.v.argb & 0xffffff); + text = PyUnicode_FromFormat( + "\n\n", + (block->u.v.flags & FZ_STEXT_VECTOR_IS_STROKED) ? "stroked" : "filled", + color, + alpha, + (block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE) ? "true":"false", + (block->u.v.flags & FZ_STEXT_VECTOR_CONTINUES) ? "true":"false"); + } + break; } - else if (JM_rects_overlap(tp_rect, block->bbox) || fz_is_infinite_rect(tp_rect)) + + if (text) { - fz_image *img = block->u.i.image; - fz_colorspace *cs = img->colorspace; - text = PyUnicode_FromFormat( - "", - mupdf::ll_fz_colorspace_name(cs), - img->w, - img->h, - img->bpc - ); - blockrect = fz_union_rect(blockrect, block->bbox); + block_n += 1; + PyObject *litem = PyTuple_New(7); + PyTuple_SET_ITEM(litem, 0, Py_BuildValue("f", blockrect.x0)); + PyTuple_SET_ITEM(litem, 1, Py_BuildValue("f", blockrect.y0)); + PyTuple_SET_ITEM(litem, 2, Py_BuildValue("f", blockrect.x1)); + PyTuple_SET_ITEM(litem, 3, Py_BuildValue("f", blockrect.y1)); + PyTuple_SET_ITEM(litem, 4, Py_BuildValue("O", text)); + PyTuple_SET_ITEM(litem, 5, Py_BuildValue("i", block_n)); + PyTuple_SET_ITEM(litem, 6, Py_BuildValue("i", block->type)); + LIST_APPEND(lines, litem); } - if (!fz_is_empty_rect(blockrect)) - { - ScopedPyObject litem = PyTuple_New(7); - PyTuple_SET_ITEM(litem.get(), 0, Py_BuildValue("f", blockrect.x0)); - PyTuple_SET_ITEM(litem.get(), 1, Py_BuildValue("f", blockrect.y0)); - PyTuple_SET_ITEM(litem.get(), 2, Py_BuildValue("f", blockrect.x1)); - PyTuple_SET_ITEM(litem.get(), 3, Py_BuildValue("f", blockrect.y1)); - PyTuple_SET_ITEM(litem.get(), 4, Py_BuildValue("O", text.get())); - PyTuple_SET_ITEM(litem.get(), 5, Py_BuildValue("i", block_n)); - PyTuple_SET_ITEM(litem.get(), 6, Py_BuildValue("i", block->type)); - LIST_APPEND(lines.get(), litem.get()); + text = NULL; + block = block->next; } + return block_n; } + +PyObject* extractBLOCKS(mupdf::FzStextPage& self) +{ + fz_stext_page *this_tpage = self.m_internal; + fz_rect tp_rect = this_tpage->mediabox; + ScopedPyObject lines(PyList_New(0)); + int block_n = -1; + fz_stext_block *block = this_tpage->first_block; + block_n = _as_blocks(block, tp_rect, lines.get(), block_n); return lines.release(); } @@ -3599,10 +3682,88 @@ void JM_make_image_block(fz_stext_block *block, PyObject *block_dict) fz_drop_buffer(ctx, mask_buf); fz_drop_buffer(ctx, freebuf); } - fz_catch(ctx) {;} + fz_catch(ctx) + { + fz_ignore_error(ctx); + } + return; +} + + +void JM_make_vector_block(fz_stext_block *block, PyObject *block_dict) +{ + DICT_SETITEM_DROP(block_dict, dictkey_bbox, JM_py_from_rect(block->bbox)); + DICT_SETITEMSTR_DROP(block_dict, "stroked", JM_BOOL(block->u.v.flags & FZ_STEXT_VECTOR_IS_STROKED)); + DICT_SETITEMSTR_DROP(block_dict, "isrect", JM_BOOL(block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE)); + DICT_SETITEMSTR_DROP(block_dict, "continues", JM_BOOL(block->u.v.flags & FZ_STEXT_VECTOR_CONTINUES)); + int color = (int) block->u.v.argb & 0xffffff; // extract color components + int alpha = block->u.v.argb >> 24; // extract alpha value + DICT_SETITEM_DROP(block_dict, dictkey_color, Py_BuildValue("i", color)); + DICT_SETITEMSTR_DROP(block_dict, "alpha", Py_BuildValue("i", alpha)); + return; +} + +void JM_make_grid_block(fz_stext_block *block, PyObject *block_dict) +{ + Py_ssize_t i; + PyObject *pos; + + DICT_SETITEM_DROP(block_dict, dictkey_bbox, JM_py_from_rect(block->bbox)); + + DICT_SETITEM_DROP(block_dict, dictkey_type, Py_BuildValue("i", block->type)); + + DICT_SETITEMSTR_DROP(block_dict, "max_uncertain", Py_BuildValue("ii", + block->u.b.xs->max_uncertainty, + block->u.b.ys->max_uncertainty)); + + // x coordinates with uncertainties + pos = PyList_New((size_t) block->u.b.xs->len); + for (i = 0; i < block->u.b.xs->len; i++) + { + PyList_SetItem(pos, i, Py_BuildValue("fi", + block->u.b.xs->list[i].pos, + block->u.b.xs->list[i].uncertainty)); + } + DICT_SETITEMSTR_DROP(block_dict, "xpos", pos); + + // y coordinates with uncertainties + pos = PyList_New((size_t) block->u.b.ys->len); + for (i = 0; i < block->u.b.ys->len; i++) + { + PyList_SetItem(pos, i, Py_BuildValue("fi", + block->u.b.ys->list[i].pos, + block->u.b.ys->list[i].uncertainty)); + } + DICT_SETITEMSTR_DROP(block_dict, "ypos", pos); + return; } + +void make_table_dict(fz_stext_page *tp, PyObject *table_dict, PyObject *bbox) +{ + fz_rect bounds = JM_rect_from_py(bbox); + fz_stext_block *block; + + try + { + block = mupdf::ll_fz_find_table_within_bounds(tp, bounds); + } + catch (std::exception&) + { + /* Ignore failure to find a table structure. */ + return; + } + + // Check if a table structure was found + if (block && block->type == FZ_STEXT_BLOCK_GRID) + { + JM_make_grid_block(block, table_dict); + } + +} + + static void JM_make_text_block(fz_stext_block *block, PyObject *block_dict, int raw, fz_buffer *buff, fz_rect tp_rect) { fz_stext_line *line; @@ -3638,38 +3799,111 @@ static void JM_make_text_block(fz_stext_block *block, PyObject *block_dict, int return; } + +void JM_make_struct_block(fz_stext_block *block, PyObject *block_dict) +{ + DICT_SETITEMSTR_DROP(block_dict, "index", Py_BuildValue("i",block->u.s.index)); + if (block->u.s.down) + { + DICT_SETITEMSTR_DROP(block_dict, "raw", Py_BuildValue("s",block->u.s.down->raw)); + DICT_SETITEMSTR_DROP(block_dict, "std", Py_BuildValue("s",fz_structure_to_string(block->u.s.down->standard))); + } + +} + + +int _as_dict(PyObject *block_list, fz_stext_block *block, fz_buffer *text_buffer, int raw, fz_rect tp_rect, int block_n) +{ + /* + Recursive function for output by blocks as identified by the + MuPDF SEGMENT logic. + */ + PyObject *block_dict; + while (block) + { + switch (block->type) + { + case FZ_STEXT_BLOCK_STRUCT: + if (block->u.s.down && block->u.s.down->first_block) + { + block_n++; + block_dict = PyDict_New(); + DICT_SETITEM_DROP(block_dict, dictkey_type, Py_BuildValue("i", block->type)); + DICT_SETITEM_DROP(block_dict, dictkey_number, Py_BuildValue("i", block_n)); + DICT_SETITEM_DROP(block_dict, dictkey_bbox, JM_py_from_rect(block->bbox)); + JM_make_struct_block(block, block_dict); + PyObject *subblocks = PyList_New(0); + block_n = _as_dict(subblocks, block->u.s.down->first_block, text_buffer, raw, tp_rect, block_n); + DICT_SETITEM_DROP(block_dict, dictkey_blocks, subblocks); + LIST_APPEND_DROP(block_list, block_dict); + } + break; + + case FZ_STEXT_BLOCK_TEXT: + if (JM_rects_overlap(tp_rect, block->bbox) || fz_is_infinite_rect(tp_rect)) + { + block_dict = PyDict_New(); + block_n++; + DICT_SETITEM_DROP(block_dict, dictkey_type, Py_BuildValue("i", block->type)); + DICT_SETITEM_DROP(block_dict, dictkey_number, Py_BuildValue("i", block_n)); + DICT_SETITEMSTR_DROP(block_dict, "flags", Py_BuildValue("i", block->u.t.flags)); + JM_make_text_block(block, block_dict, raw, text_buffer, tp_rect); + LIST_APPEND_DROP(block_list, block_dict); + } + break; + + case FZ_STEXT_BLOCK_IMAGE: + if (fz_contains_rect(tp_rect, block->bbox) || fz_is_infinite_rect(tp_rect)) + { + block_dict = PyDict_New(); + block_n++; + DICT_SETITEM_DROP(block_dict, dictkey_type, Py_BuildValue("i", block->type)); + DICT_SETITEM_DROP(block_dict, dictkey_number, Py_BuildValue("i", block_n)); + DICT_SETITEM_DROP(block_dict, dictkey_bbox, JM_py_from_rect(block->bbox)); + JM_make_image_block(block, block_dict); + LIST_APPEND_DROP(block_list, block_dict); + } + break; + + case FZ_STEXT_BLOCK_VECTOR: + if (JM_rects_overlap(tp_rect, block->bbox) || fz_is_infinite_rect(tp_rect)) + { + block_dict = PyDict_New(); + block_n++; + DICT_SETITEM_DROP(block_dict, dictkey_type, Py_BuildValue("i", block->type)); + DICT_SETITEM_DROP(block_dict, dictkey_number, Py_BuildValue("i", block_n)); + JM_make_vector_block(block, block_dict); + LIST_APPEND_DROP(block_list, block_dict); + } + break; + + case FZ_STEXT_BLOCK_GRID: + if (JM_rects_overlap(tp_rect, block->bbox) || fz_is_infinite_rect(tp_rect)) + { + block_dict = PyDict_New(); + block_n++; + DICT_SETITEM_DROP(block_dict, dictkey_type, Py_BuildValue("i", block->type)); + DICT_SETITEM_DROP(block_dict, dictkey_number, Py_BuildValue("i", block_n)); + JM_make_grid_block(block, block_dict); + LIST_APPEND_DROP(block_list, block_dict); + } + break; + } + block = block->next; + } + return block_n; +} + void JM_make_textpage_dict(fz_stext_page *tp, PyObject *page_dict, int raw) { fz_context* ctx = mupdf::internal_context_get(); fz_stext_block *block; fz_buffer *text_buffer = fz_new_buffer(ctx, 128); - PyObject *block_dict, *block_list = PyList_New(0); + PyObject *block_list = PyList_New(0); fz_rect tp_rect = tp->mediabox; + block = tp->first_block; int block_n = -1; - for (block = tp->first_block; block; block = block->next) { - block_n++; - if (!fz_contains_rect(tp_rect, block->bbox) && - !fz_is_infinite_rect(tp_rect) && - block->type == FZ_STEXT_BLOCK_IMAGE) { - continue; - } - if (!fz_is_infinite_rect(tp_rect) && - fz_is_empty_rect(fz_intersect_rect(tp_rect, block->bbox))) { - continue; - } - - block_dict = PyDict_New(); - DICT_SETITEM_DROP(block_dict, dictkey_number, Py_BuildValue("i", block_n)); - DICT_SETITEM_DROP(block_dict, dictkey_type, Py_BuildValue("i", block->type)); - if (block->type == FZ_STEXT_BLOCK_IMAGE) { - DICT_SETITEM_DROP(block_dict, dictkey_bbox, JM_py_from_rect(block->bbox)); - JM_make_image_block(block, block_dict); - } else { - JM_make_text_block(block, block_dict, raw, text_buffer, tp_rect); - } - - LIST_APPEND_DROP(block_list, block_dict); - } + block_n = _as_dict(block_list, block, text_buffer, raw, tp_rect, block_n); DICT_SETITEM_DROP(page_dict, dictkey_blocks, block_list); fz_drop_buffer(ctx, text_buffer); } @@ -4270,6 +4504,7 @@ fz_stext_page* page_get_textpage( PyObject* matrix ); +void make_table_dict(fz_stext_page *tp, PyObject *table_dict, PyObject *bbox); void JM_make_textpage_dict(fz_stext_page *tp, PyObject *page_dict, int raw); PyObject *pixmap_pixel(fz_pixmap* pm, int x, int y); int pixmap_n(mupdf::FzPixmap& pixmap); diff --git a/src/table.py b/src/table.py index 3de8b2c86..9b73782d3 100644 --- a/src/table.py +++ b/src/table.py @@ -80,39 +80,121 @@ from dataclasses import dataclass from operator import itemgetter import weakref +import pymupdf +from pymupdf import mupdf # ------------------------------------------------------------------- # Start of PyMuPDF interface code # ------------------------------------------------------------------- -from . import ( - Rect, - Matrix, - TEXTFLAGS_TEXT, - TEXT_FONT_BOLD, - TEXT_FONT_ITALIC, - TEXT_FONT_MONOSPACED, - TEXT_FONT_SUPERSCRIPT, - TEXT_COLLECT_STYLES, - TOOLS, - EMPTY_RECT, - sRGB_to_pdf, - Point, - message, - mupdf, -) EDGES = [] # vector graphics from PyMuPDF CHARS = [] # text characters from PyMuPDF TEXTPAGE = None TEXT_BOLD = mupdf.FZ_STEXT_BOLD TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT -FLAGS = TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES - +FLAGS = ( + 0 + | pymupdf.TEXTFLAGS_TEXT + | pymupdf.TEXT_COLLECT_STYLES + | pymupdf.TEXT_ACCURATE_BBOXES + | pymupdf.TEXT_MEDIABOX_CLIP +) +# needed by mupdf function fz_find_table_within_bounds(). +TABLE_DETECTOR_FLAGS = ( + 0 + | pymupdf.TEXT_ACCURATE_BBOXES + | pymupdf.TEXT_SEGMENT + | pymupdf.TEXT_COLLECT_VECTORS + | pymupdf.TEXT_MEDIABOX_CLIP +) white_spaces = set(string.whitespace) # for checking white space only cells +def _iou(r1, r2): + """Compute intersection over union of two rectangles.""" + ix = max(0, min(r1[2], r2[2]) - max(r1[0], r2[0])) + iy = max(0, min(r1[3], r2[3]) - max(r1[1], r2[1])) + intersection = ix * iy # intersection area + if not intersection: + return 0 + area1 = (r1[2] - r1[0]) * (r1[3] - r1[1]) + area2 = (r2[2] - r2[0]) * (r2[3] - r2[1]) + return intersection / (area1 + area2 - intersection) + + +def intersects_words_h(bbox, y, word_rects) -> bool: + """Check whether any of the words in bbox are cut through by + horizontal line y. + """ + return any(r.y0 < y < r.y1 for r in word_rects if r in bbox) + + +def get_table_dict_from_rect(textpage, rect): + """Extract MuPDF table structure information from a given rectangle.""" + table_dict = {} + pymupdf.extra.make_table_dict(textpage.this.m_internal, table_dict, rect) + return table_dict + + +def make_table_from_bbox(textpage, word_rects, rect): + """Detect table structure within a given rectangle.""" + cells = [] # table cells as (x0,y0,x1,y1) tuples + + # calls fz_find_table_within_bounds + block = get_table_dict_from_rect(textpage, rect) + # No table structure found if not a grid block + if block.get("type") != mupdf.FZ_STEXT_BLOCK_GRID: + return cells + bbox = pymupdf.Rect(block["bbox"]) # resulting table bbox + + # lists of (pos,uncertainty) tuples + xpos = sorted(block["xpos"], key=lambda x: x[0]) + ypos = sorted(block["ypos"], key=lambda y: y[0]) + + # maximum uncertainties in x and y directions + xmaxu, ymaxu = block["max_uncertain"] + + # Modify ypos to remove uncertain positions, and y positions + # that cut through words. + nypos = [] + for y, yunc in ypos: + if yunc > 0: # allow no uncertain y values + continue + if intersects_words_h(bbox, y, word_rects): + continue # allow no y that cuts through words + if nypos and (y - nypos[-1] < 3): + nypos[-1] = y # snap close positions + else: + nypos.append(y) + + # New max y uncertainty: 35% of remaining y positions. + # Omit x positions that intersect too many words, otherwise + # only remove x for the affected cells. + ymaxu = max(0, round((len(nypos) - 2) * 0.35)) + + # Exclude x positions with too high uncertainty + # (we allow more uncertainty in x direction) + nxpos = [x[0] for x in xpos if x[1] <= ymaxu] + if bbox.x1 > nxpos[-1] + 3: + nxpos.append(bbox.x1) # ensure right table border + + # Compose cells from the remaining x and y positions. + for i in range(len(nypos) - 1): + row_box = pymupdf.Rect(bbox.x0, nypos[i], bbox.x1, nypos[i + 1]) + # Sub-select words in this row and sort them by left coordinate + row_words = sorted([r for r in word_rects if r in row_box], key=lambda r: r.x0) + # Sub-select x values that do not cut through words + this_xpos = [x for x in nxpos if not any(r.x0 < x < r.x1 for r in row_words)] + for j in range(len(this_xpos) - 1): + cell = pymupdf.Rect(this_xpos[j], nypos[i], this_xpos[j + 1], nypos[i + 1]) + if not cell.is_empty: # valid cell + cells.append(tuple(cell)) + # Add new table to TableFinder tables + return cells + + def extract_cells(textpage, cell, markdown=False): - """Extract text from a rect-like 'cell' as plain or MD style text. + """Extract text from a rect-like 'cell' as plain or MD styled text. This function should ultimately be used to extract text from a table cell. Markdown output will only work correctly if extraction flag bit @@ -171,9 +253,12 @@ def extract_cells(textpage, cell, markdown=False): # only include chars with more than 50% bbox overlap span_text = "" for char in span["chars"]: - bbox = Rect(char["bbox"]) + this_char = char["c"] + bbox = pymupdf.Rect(char["bbox"]) if abs(bbox & cell) > 0.5 * abs(bbox): - span_text += char["c"] + span_text += this_char + elif this_char in white_spaces: + span_text += " " if not span_text: continue # skip empty span @@ -190,10 +275,10 @@ def extract_cells(textpage, cell, markdown=False): if span["char_flags"] & TEXT_BOLD: prefix += "**" suffix = "**" + suffix - if span["flags"] & TEXT_FONT_ITALIC: + if span["flags"] & pymupdf.TEXT_FONT_ITALIC: prefix += "_" suffix = "_" + suffix - if span["flags"] & TEXT_FONT_MONOSPACED: + if span["flags"] & pymupdf.TEXT_FONT_MONOSPACED: prefix += "`" suffix = "`" + suffix @@ -1358,7 +1443,7 @@ def bbox_to_corners(bbox) -> tuple: # PyMuPDF modification: # Remove tables without text or having only 1 column for i in range(len(tables) - 1, -1, -1): - r = EMPTY_RECT() + r = pymupdf.EMPTY_RECT() x1_vals = set() x0_vals = set() for c in tables[i]: @@ -1556,7 +1641,7 @@ def to_pandas(self, **kwargs): try: import pandas as pd except ModuleNotFoundError: - message("Package 'pandas' is not installed") + pymupdf.message("Package 'pandas' is not installed") raise pd_dict = {} @@ -1618,7 +1703,7 @@ def top_row_bg_color(self): above. If different, return True indicating that the original table top row is already the header. """ - bbox0 = Rect(self.rows[0].bbox) + bbox0 = pymupdf.Rect(self.rows[0].bbox) bboxt = bbox0 + (0, -bbox0.height, 0, -bbox0.height) # area above top_color0 = page.get_pixmap(clip=bbox0).color_topusage()[1] top_colort = page.get_pixmap(clip=bboxt).color_topusage()[1] @@ -1636,15 +1721,17 @@ def row_has_bold(bbox): Returns True if any spans are bold else False. """ - blocks = page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"] + blocks = page.get_text("dict", flags=pymupdf.TEXTFLAGS_TEXT, clip=bbox)[ + "blocks" + ] spans = [s for b in blocks for l in b["lines"] for s in l["spans"]] - return any(s["flags"] & TEXT_FONT_BOLD for s in spans) + return any(s["flags"] & pymupdf.TEXT_FONT_BOLD for s in spans) try: row = self.rows[0] cells = row.cells - bbox = Rect(row.bbox) + bbox = pymupdf.Rect(row.bbox) except IndexError: # this table has no rows return None @@ -1686,7 +1773,9 @@ def row_has_bold(bbox): clip.y0 = 0 # start at top of page clip.y1 = bbox.y0 # end at top of table - blocks = page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"] + blocks = page.get_text("dict", clip=clip, flags=pymupdf.TEXTFLAGS_TEXT)[ + "blocks" + ] # non-empty, non-superscript spans above table, sorted descending by y1 spans = sorted( [ @@ -1696,7 +1785,7 @@ def row_has_bold(bbox): for s in l["spans"] if not ( white_spaces.issuperset(s["text"]) - or s["flags"] & TEXT_FONT_SUPERSCRIPT + or s["flags"] & pymupdf.TEXT_FONT_SUPERSCRIPT ) ], key=lambda s: s["bbox"][3], @@ -1712,7 +1801,7 @@ def row_has_bold(bbox): s = spans[i] y1 = s["bbox"][3] # span bottom h = y1 - s["bbox"][1] # span bbox height - bold = s["flags"] & TEXT_FONT_BOLD + bold = s["flags"] & pymupdf.TEXT_FONT_BOLD # use first item to start the lists if i == 0: @@ -1759,7 +1848,7 @@ def row_has_bold(bbox): return header_top_row # re-compute clip above table - nclip = EMPTY_RECT() + nclip = pymupdf.EMPTY_RECT() for s in [s for s in spans if s["bbox"][3] >= select[-1]]: nclip |= s["bbox"] if not nclip.is_empty: @@ -1768,7 +1857,7 @@ def row_has_bold(bbox): clip.y1 = bbox.y0 # make sure we still include every word above # Confirm that no word in clip is intersecting a column separator - word_rects = [Rect(w[:4]) for w in page.get_text("words", clip=clip)] + word_rects = [pymupdf.Rect(w[:4]) for w in page.get_text("words", clip=clip)] word_tops = sorted(list(set([r[1] for r in word_rects])), reverse=True) select = [] @@ -2074,7 +2163,7 @@ def make_chars(page, clip=None): for line in block["lines"]: ldir = line["dir"] # = (cosine, sine) of angle ldir = (round(ldir[0], 4), round(ldir[1], 4)) - matrix = Matrix(ldir[0], -ldir[1], ldir[1], ldir[0], 0, 0) + matrix = pymupdf.Matrix(ldir[0], -ldir[1], ldir[1], ldir[0], 0, 0) if ldir[1] == 0: upright = True else: @@ -2082,11 +2171,11 @@ def make_chars(page, clip=None): for span in sorted(line["spans"], key=lambda s: s["bbox"][0]): fontname = span["font"] fontsize = span["size"] - color = sRGB_to_pdf(span["color"]) + color = pymupdf.sRGB_to_pdf(span["color"]) for char in sorted(span["chars"], key=lambda c: c["bbox"][0]): - bbox = Rect(char["bbox"]) + bbox = pymupdf.Rect(char["bbox"]) bbox_ctm = bbox * ctm - origin = Point(char["origin"]) * ctm + origin = pymupdf.Point(char["origin"]) * ctm matrix.e = origin.x matrix.f = origin.y text = char["c"] @@ -2136,9 +2225,9 @@ def make_edges(page, clip=None, tset=None, paths=None, add_lines=None, add_boxes prect = page.rect if page.rotation in (90, 270): w, h = prect.br - prect = Rect(0, 0, h, w) + prect = pymupdf.Rect(0, 0, h, w) if clip is not None: - clip = Rect(clip) + clip = pymupdf.Rect(clip) else: clip = prect @@ -2309,8 +2398,8 @@ def make_line(p, p1, p2, clip): rect.width <= min_length and rect.width < rect.height ): # simulates a vertical line x = abs(rect.x1 + rect.x0) / 2 # take middle value for x - p1 = Point(x, rect.y0) - p2 = Point(x, rect.y1) + p1 = pymupdf.Point(x, rect.y0) + p2 = pymupdf.Point(x, rect.y1) line_dict = make_line(p, p1, p2, clip) if line_dict: EDGES.append(line_to_edge(line_dict)) @@ -2320,8 +2409,8 @@ def make_line(p, p1, p2, clip): rect.height <= min_length and rect.height < rect.width ): # simulates a horizontal line y = abs(rect.y1 + rect.y0) / 2 # take middle value for y - p1 = Point(rect.x0, y) - p2 = Point(rect.x1, y) + p1 = pymupdf.Point(rect.x0, y) + p2 = pymupdf.Point(rect.x1, y) line_dict = make_line(p, p1, p2, clip) if line_dict: EDGES.append(line_to_edge(line_dict)) @@ -2386,8 +2475,8 @@ def make_line(p, p1, p2, clip): else: add_lines = [] for p1, p2 in add_lines: - p1 = Point(p1) - p2 = Point(p2) + p1 = pymupdf.Point(p1) + p2 = pymupdf.Point(p2) line_dict = make_line(path, p1, p2, clip) if line_dict: EDGES.append(line_to_edge(line_dict)) @@ -2397,7 +2486,7 @@ def make_line(p, p1, p2, clip): else: add_boxes = [] for box in add_boxes: - r = Rect(box) + r = pymupdf.Rect(box) line_dict = make_line(path, r.tl, r.bl, clip) if line_dict: EDGES.append(line_to_edge(line_dict)) @@ -2426,17 +2515,17 @@ def page_rotation_set0(page): if rot == 90: # before derotation, shift content horizontally - mat0 = Matrix(1, 0, 0, 1, mb.y1 - mb.x1 - mb.x0 - mb.y0, 0) + mat0 = pymupdf.Matrix(1, 0, 0, 1, mb.y1 - mb.x1 - mb.x0 - mb.y0, 0) elif rot == 270: # before derotation, shift content vertically - mat0 = Matrix(1, 0, 0, 1, 0, mb.x1 - mb.y1 - mb.y0 - mb.x0) + mat0 = pymupdf.Matrix(1, 0, 0, 1, 0, mb.x1 - mb.y1 - mb.y0 - mb.x0) else: - mat0 = Matrix(1, 0, 0, 1, -2 * mb.x0, -2 * mb.y0) + mat0 = pymupdf.Matrix(1, 0, 0, 1, -2 * mb.x0, -2 * mb.y0) # prefix with derotation matrix mat = mat0 * page.derotation_matrix cmd = b"%g %g %g %g %g %g cm " % tuple(mat) - xref = TOOLS._insert_contents(page, cmd, 0) + xref = pymupdf.TOOLS._insert_contents(page, cmd, 0) # swap x- and y-coordinates if rot in (90, 270): @@ -2496,11 +2585,12 @@ def find_tables( add_boxes=None, # user-specified rectangles paths=None, # accept vector graphics as parameter ): + pymupdf._warn_layout_once() global CHARS, EDGES CHARS = [] EDGES = [] - old_small = bool(TOOLS.set_small_glyph_heights()) # save old value - TOOLS.set_small_glyph_heights(True) # we need minimum bboxes + old_small = bool(pymupdf.TOOLS.set_small_glyph_heights()) # save old value + pymupdf.TOOLS.set_small_glyph_heights(True) # we need minimum bboxes if page.rotation != 0: page, old_xref, old_rot, old_mediabox = page_rotation_set0(page) else: @@ -2543,21 +2633,65 @@ def find_tables( "text_x_tolerance": text_x_tolerance, "text_y_tolerance": text_y_tolerance, } - tset = TableSettings.resolve(settings=settings) - page.table_settings = tset - - make_chars(page, clip=clip) # create character list of page - make_edges( - page, - clip=clip, - tset=tset, - paths=paths, - add_lines=add_lines, - add_boxes=add_boxes, - ) # create lines and curves - tables = TableFinder(page, settings=tset) - - TOOLS.set_small_glyph_heights(old_small) - if old_xref is not None: - page = page_rotation_reset(page, old_xref, old_rot, old_mediabox) - return tables + + old_quad_corrections = pymupdf.TOOLS.unset_quad_corrections() + try: + page.get_layout() + if page.layout_information: + pymupdf.TOOLS.unset_quad_corrections(True) + boxes = [ + pymupdf.Rect(b[:4]) for b in page.layout_information if b[-1] == "table" + ] + else: + boxes = [] + + if boxes: # layout did find some tables + pass + elif page.layout_information is not None: + # layout was executed but found no tables + # make sure we exit quickly with an empty TableFinder + tbf = TableFinder(page) + return tbf + + tset = TableSettings.resolve(settings=settings) + page.table_settings = tset + + make_chars(page, clip=clip) # create character list of page + make_edges( + page, + clip=clip, + tset=tset, + paths=paths, + add_lines=add_lines, + add_boxes=add_boxes, + ) # create lines and curves + + tbf = TableFinder(page, settings=tset) + + if boxes: + # only keep Finder tables that match a layout box + tbf.tables = [ + tab + for tab in tbf.tables + if any(_iou(tab.bbox, r) >= 0.6 for r in boxes) + ] + # build the complementary list of layout table boxes + my_boxes = [ + r for r in boxes if all(_iou(r, tab.bbox) < 0.6 for tab in tbf.tables) + ] + if my_boxes: + word_rects = [pymupdf.Rect(w[:4]) for w in TEXTPAGE.extractWORDS()] + tp2 = page.get_textpage(flags=TABLE_DETECTOR_FLAGS) + for rect in my_boxes: + cells = make_table_from_bbox(tp2, word_rects, rect) # pylint: disable=E0606 + tbf.tables.append(Table(page, cells)) + except Exception as e: + pymupdf.message("find_tables: exception occurred: %s" % str(e)) + return None + finally: + pymupdf.TOOLS.set_small_glyph_heights(old_small) + if old_xref is not None: + page = page_rotation_reset(page, old_xref, old_rot, old_mediabox) + pymupdf.TOOLS.unset_quad_corrections(old_quad_corrections) + + return tbf