From 5a0679d82e32e2223083292dfeedefb7db02dacf Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Tue, 8 Apr 2025 16:29:52 -0400 Subject: [PATCH] Version 0.0.21 Various smaller bug fixes. --- CHANGES.md | 9 ++ pymupdf4llm/pymupdf4llm/__init__.py | 2 +- .../pymupdf4llm/helpers/multi_column.py | 18 +-- .../pymupdf4llm/helpers/pymupdf_rag.py | 103 ++++++++++-------- pymupdf4llm/setup.py | 2 +- 5 files changed, 77 insertions(+), 57 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index f0c6bb06..d722d1db 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,14 @@ # Change Log +## Changes in version 0.0.21 + +### Fixes: + +* [116](https://github.com/pymupdf/RAG/issues/116) - Handling Graphical Images & Superscripts + +### Other Changes: + + ## Changes in version 0.0.20 ### Fixes: diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py index 9a78817c..51ec6c3a 100644 --- a/pymupdf4llm/pymupdf4llm/__init__.py +++ b/pymupdf4llm/pymupdf4llm/__init__.py @@ -1,6 +1,6 @@ from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown -__version__ = "0.0.20" +__version__ = "0.0.21" version = __version__ version_tuple = tuple(map(int, version.split("."))) diff --git a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py index a4ee573b..4cdd8097 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py +++ b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py @@ -64,7 +64,7 @@ import pymupdf -pymupdf.TOOLS.set_small_glyph_heights(True) +pymupdf.TOOLS.unset_quad_corrections(True) def column_boxes( @@ -237,7 +237,7 @@ def join_rects_phase2(bboxes): if ( abs(r.x0 - r0.x0) <= 3 and abs(r.x1 - r0.x1) <= 3 - and abs(r0.y1 - r.y0) <= 12 + and abs(r0.y1 - r.y0) <= 10 ): r0 |= r new_rects[-1] = r0 @@ -344,7 +344,7 @@ def join_rects_phase3(bboxes, path_rects, cache): ] if textpage is None: - textpage = page.get_textpage(clip=clip, flags=pymupdf.TEXTFLAGS_TEXT) + textpage = page.get_textpage(clip=clip, flags=pymupdf.TEXT_ACCURATE_BBOXES) bboxes = [] @@ -417,7 +417,6 @@ def join_rects_phase3(bboxes, path_rects, cache): # immediately return of no text found if bboxes == []: return [] - # -------------------------------------------------------------------- # Join bboxes to establish some column structure # -------------------------------------------------------------------- @@ -467,7 +466,8 @@ def join_rects_phase3(bboxes, path_rects, cache): return nblocks # several phases of rectangle joining - nblocks = join_rects_phase1(nblocks) + # TODO: disabled for now as too aggressive: + # nblocks = join_rects_phase1(nblocks) nblocks = join_rects_phase2(nblocks) nblocks = join_rects_phase3(nblocks, path_rects, cache) @@ -491,14 +491,14 @@ def join_rects_phase3(bboxes, path_rects, cache): # check if footer margin is given if len(sys.argv) > 2: footer_margin = int(sys.argv[2]) - else: # use default vaue - footer_margin = 50 + else: + footer_margin = 0 # check if header margin is given if len(sys.argv) > 3: header_margin = int(sys.argv[3]) - else: # use default vaue - header_margin = 50 + else: + header_margin = 0 # open document doc = pymupdf.open(filename) diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index c8cb38e2..1c35c1db 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -157,6 +157,8 @@ def get_header_id(self, span: dict, page=None) -> str: markdown header prefix string of 0 to n concatenated '#' characters. """ fontsize = round(span["size"]) # compute fontsize + if fontsize <= self.body_limit: + return "" hdr_id = self.header_id.get(fontsize, "") return hdr_id @@ -278,7 +280,7 @@ def to_markdown( ignore_code: (bool) suppress code-like formatting (mono-space fonts) extract_words: (bool) include "words"-like output in page chunks show_progress: (bool) print progress as each page is processed. - glyph_fallback: (bool) replace the Invalid Unicode by glyph number. + use_glyphs: (bool) replace the Invalid Unicode by glyph numbers. """ if write_images is False and embed_images is False and force_text is False: @@ -427,8 +429,8 @@ def write_text( if clip is None: clip = parms.clip out_string = "" - # This is a list of tuples (linerect, spanlist) + # This is a list of tuples (linerect, spanlist) nlines = get_raw_lines(parms.textpage, clip=clip, tolerance=3) nlines = [ l for l in nlines if not intersects_rects(l[0], parms.tab_rects.values()) @@ -450,21 +452,18 @@ def write_text( # Pick up tables ABOVE this text block # ------------------------------------------------------------ if tables: - tab_candidates = sorted( - [ - (i, tab_rect) - for i, tab_rect in parms.tab_rects.items() - if tab_rect.y1 <= lrect.y0 - and i not in parms.deleted_tables - and ( - 0 - or lrect.x0 <= tab_rect.x0 < lrect.x1 - or lrect.x0 < tab_rect.x1 <= lrect.x1 - or tab_rect.x0 <= lrect.x0 < lrect.x1 <= tab_rect.x1 - ) - ], - key=lambda j: (j[1].y1, j[1].x0), - ) + tab_candidates = [ + (i, tab_rect) + for i, tab_rect in parms.tab_rects.items() + if tab_rect.y1 <= lrect.y0 + and i not in parms.written_tables + and ( + 0 + or lrect.x0 <= tab_rect.x0 < lrect.x1 + or lrect.x0 < tab_rect.x1 <= lrect.x1 + or tab_rect.x0 <= lrect.x0 < lrect.x1 <= tab_rect.x1 + ) + ] for i, _ in tab_candidates: out_string += "\n" + parms.tabs[i].to_markdown(clean=False) + "\n" if EXTRACT_WORDS: @@ -481,14 +480,14 @@ def write_text( key=lambda c: (c.y1, c.x0), ) parms.line_rects.extend(cells) - parms.deleted_tables.append(i) + parms.written_tables.append(i) # ------------------------------------------------------------ # Pick up images / graphics ABOVE this text block # ------------------------------------------------------------ if images: for i in range(len(parms.img_rects)): - if i in parms.deleted_images: + if i in parms.written_images: continue r = parms.img_rects[i] if r.y1 <= lrect.y0 and ( @@ -502,7 +501,7 @@ def write_text( out_string += GRAPHICS_TEXT % pathname # recursive invocation - if force_text: + if force_text is True: img_txt = write_text( parms, r, @@ -513,7 +512,7 @@ def write_text( if not is_white(img_txt): out_string += img_txt - parms.deleted_images.append(i) + parms.written_images.append(i) parms.line_rects.append(lrect) @@ -668,7 +667,7 @@ def output_tables(parms, text_rect): [j for j in parms.tab_rects.items() if j[1].y1 <= text_rect.y0], key=lambda j: (j[1].y1, j[1].x0), ): - if i in parms.deleted_tables: + if i in parms.written_tables: continue this_md += parms.tabs[i].to_markdown(clean=False) if EXTRACT_WORDS: @@ -685,14 +684,11 @@ def output_tables(parms, text_rect): key=lambda c: (c.y1, c.x0), ) parms.line_rects.extend(cells) - del parms.tab_rects[i] # do not touch this table twice + parms.written_tables.append(i) # do not touch this table twice else: # output all remaining tables - for i, trect in sorted( - parms.tab_rects.items(), - key=lambda j: (j[1].y1, j[1].x0), - ): - if i in parms.deleted_tables: + for i, trect in parms.tab_rects.items(): + if i in parms.written_tables: continue this_md += parms.tabs[i].to_markdown(clean=False) if EXTRACT_WORDS: @@ -709,10 +705,10 @@ def output_tables(parms, text_rect): key=lambda c: (c.y1, c.x0), ) parms.line_rects.extend(cells) - del parms.tab_rects[i] # do not touch this table twice + parms.written_tables.append(i) # do not touch this table twice return this_md - def output_images(parms, text_rect): + def output_images(parms, text_rect, force_text): """Output images and graphics above text rectangle.""" if not parms.img_rects: return "" @@ -723,10 +719,10 @@ def output_images(parms, text_rect): continue if img_rect.x0 >= text_rect.x1 or img_rect.x1 <= text_rect.x0: continue - if i in parms.deleted_images: + if i in parms.written_images: continue pathname = save_image(parms, img_rect, i) - parms.deleted_images.append(i) # do not touch this image twice + parms.written_images.append(i) # do not touch this image twice if pathname: this_md += GRAPHICS_TEXT % pathname if force_text: @@ -741,10 +737,10 @@ def output_images(parms, text_rect): this_md += img_txt else: # output all remaining images for i, img_rect in enumerate(parms.img_rects): - if i in parms.deleted_images: + if i in parms.written_images: continue pathname = save_image(parms, img_rect, i) - parms.deleted_images.append(i) # do not touch this image twice + parms.written_images.append(i) # do not touch this image twice if pathname: this_md += GRAPHICS_TEXT % pathname if force_text: @@ -867,6 +863,9 @@ def get_page_output( # extract external links on page parms.links = [l for l in page.get_links() if l["kind"] == pymupdf.LINK_URI] + # extract annotation rectangles on page + parms.annot_rects = [a.rect for a in page.annots()] + # make a TextPage for all later extractions parms.textpage = page.get_textpage(flags=textflags, clip=parms.clip) @@ -904,10 +903,20 @@ def get_page_output( parms.img_rects = [i["bbox"] for i in parms.images] # Locate all tables on page + parms.written_tables = [] # stores already written tables if table_strategy is None: parms.tabs = [] else: parms.tabs = page.find_tables(clip=parms.clip, strategy=table_strategy) + del_this = [] + for i, t in enumerate(parms.tabs): + if t.row_count < 2 or t.col_count < 2: + # ignore tables with too few rows or columns + del_this.append(i) + for i in sorted(del_this, reverse=True): + del parms.tabs.tables[i] + parms.tabs.tables.sort(key=lambda t: (t.bbox[0], t.bbox[1])) + # Make a list of table boundary boxes. # Must include the header bbox (which may exist outside tab.bbox) tab_rects = {} @@ -930,11 +939,13 @@ def get_page_output( paths = [ p for p in page.get_drawings() - if not intersects_rects(p["rect"], parms.tab_rects0) - and p["rect"] in parms.clip - and 3 < p["rect"].width < parms.clip.width - and 3 < p["rect"].height < parms.clip.height - and not (p["type"] == "f" and p["fill"] == parms.bg_color) + if p["rect"] in parms.clip + and p["rect"].width < parms.clip.width + and p["rect"].height < parms.clip.height + and (p["rect"].width > 3 or p["rect"].height > 3) + and not (p["fill"] == parms.bg_color and p["fill"] != None) + and not intersects_rects(p["rect"], parms.tab_rects0) + and not intersects_rects(p["rect"], parms.annot_rects) ] else: paths = [] @@ -948,19 +959,19 @@ def get_page_output( vg_clusters0 = [] # worthwhile vector graphics go here # walk through all vector graphics outside any table - for bbox in refine_boxes(page.cluster_drawings(drawings=paths)): + clusters = page.cluster_drawings(drawings=paths) + for bbox in clusters: if is_significant(bbox, paths): vg_clusters0.append(bbox) # remove paths that are not in some relevant graphic parms.actual_paths = [p for p in paths if is_in_rects(p["rect"], vg_clusters0)] - # also add image rectangles to the list + # also add image rectangles to the list and vice versa vg_clusters0.extend(parms.img_rects) parms.img_rects.extend(vg_clusters0) parms.img_rects = sorted(set(parms.img_rects), key=lambda r: (r.y1, r.x0)) - parms.deleted_images = [] - parms.deleted_tables = [] + parms.written_images = [] # these may no longer be pairwise disjoint: # remove area overlaps by joining into larger rects parms.vg_clusters0 = refine_boxes(vg_clusters0) @@ -989,7 +1000,7 @@ def get_page_output( for text_rect in text_rects: # output tables above this rectangle parms.md_string += output_tables(parms, text_rect) - parms.md_string += output_images(parms, text_rect) + parms.md_string += output_images(parms, text_rect, force_text) # output text inside this rectangle parms.md_string += write_text( @@ -1004,7 +1015,7 @@ def get_page_output( # write any remaining tables and images parms.md_string += output_tables(parms, None) - parms.md_string += output_images(parms, None) + parms.md_string += output_images(parms, None, force_text) parms.md_string += "\n-----\n\n" while parms.md_string.startswith("\n"): @@ -1153,7 +1164,7 @@ def extract_images_on_page_simple_drop(page, parms, image_size_limit): import time try: - filename = "slide12.pdf" + filename = "sample_document.pdf" except IndexError: print(f"Usage:\npython {os.path.basename(__file__)} input.pdf") sys.exit() diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py index 34059308..2b5647fd 100644 --- a/pymupdf4llm/setup.py +++ b/pymupdf4llm/setup.py @@ -17,7 +17,7 @@ setuptools.setup( name="pymupdf4llm", - version="0.0.20", + version="0.0.21", author="Artifex", author_email="support@artifex.com", description="PyMuPDF Utilities for LLM/RAG",