From 13c84c670c2d072830c8582b26b5f77d0d4f10ad Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Fri, 9 May 2025 11:41:20 -0400 Subject: [PATCH] Changes for v0.0.23 --- pdf4llm/setup.py | 4 +- pymupdf4llm/pymupdf4llm/__init__.py | 2 +- .../pymupdf4llm/helpers/get_text_lines.py | 12 ++++- .../pymupdf4llm/helpers/pymupdf_rag.py | 50 +++++++++++++------ pymupdf4llm/setup.py | 2 +- 5 files changed, 50 insertions(+), 20 deletions(-) diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py index 11b23c7a..1b0cbd9f 100644 --- a/pdf4llm/setup.py +++ b/pdf4llm/setup.py @@ -13,11 +13,11 @@ "Programming Language :: Python :: 3", "Topic :: Utilities", ] -requires = ["pymupdf4llm==0.0.22"] +requires = ["pymupdf4llm==0.0.23"] setuptools.setup( name="pdf4llm", - version="0.0.22", + version="0.0.23", author="Artifex", author_email="support@artifex.com", description="PyMuPDF Utilities for LLM/RAG", diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py index cbee778f..08571e43 100644 --- a/pymupdf4llm/pymupdf4llm/__init__.py +++ b/pymupdf4llm/pymupdf4llm/__init__.py @@ -1,6 +1,6 @@ from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown -__version__ = "0.0.22" +__version__ = "0.0.23" version = __version__ version_tuple = tuple(map(int, version.split("."))) diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py index 47ca1b06..81326564 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py +++ b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py @@ -24,7 +24,12 @@ def is_white(text): return WHITE.issuperset(text) -def get_raw_lines(textpage, clip=None, tolerance=3): +def get_raw_lines( + textpage, + clip=None, + tolerance=3, + ignore_invisible=True, +): """Extract the text spans from a TextPage in natural reading sequence. All spans roughly on the same line are joined to generate an improved line. 
@@ -43,6 +48,8 @@ def get_raw_lines(textpage, clip=None, tolerance=3): turn may be based on a sub-rectangle of the full page). tolerance: (float) put spans on the same line if their top or bottom coordinate differ by no more than this value. + ignore_invisible: (bool) if True, invisible text is ignored. This may + have been set to False for pages with OCR text. Returns: A sorted list of items (rect, [spans]), each representing one line. The @@ -109,7 +116,8 @@ def sanitize_spans(line): sbbox = pymupdf.Rect(s["bbox"]) # span bbox as a Rect if is_white(s["text"]): # ignore white text continue - if s["alpha"] == 0: # ignore invisible text + # ignore invisible text + if s["alpha"] == 0 and ignore_invisible: continue if abs(sbbox & clip) < abs(sbbox) * 0.8: # if not in clip continue diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index c80a2fbc..af725f1d 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -86,7 +86,7 @@ def __init__( self, doc: str, pages: list = None, - body_limit: float = 11, # default if no text found + body_limit: float = 12, # force this to be body text max_levels: int = 6, # accept this many header levels ): """Read all text and make a dictionary of fontsizes. 
@@ -135,7 +135,7 @@ def __init__( ) if temp: # most frequent font size - self.body_limit = min(body_limit, temp[-1][0]) + self.body_limit = max(body_limit, temp[-1][0]) else: self.body_limit = body_limit @@ -203,7 +203,7 @@ def get_header_id(self, span: dict, page=None) -> str: return "" # check if the span matches a TOC entry text = span["text"].strip() - for t in toc: + for t in my_toc: title = t[1].strip() # title of TOC entry lvl = t[0] # level of TOC entry if text.startswith(title) or title.startswith(text): @@ -494,7 +494,12 @@ def write_text( out_string = "" # This is a list of tuples (linerect, spanlist) - nlines = get_raw_lines(parms.textpage, clip=clip, tolerance=3) + nlines = get_raw_lines( + parms.textpage, + clip=clip, + tolerance=3, + ignore_invisible=not parms.accept_invisible, + ) nlines = [ l for l in nlines if not intersects_rects(l[0], parms.tab_rects.values()) ] @@ -821,6 +826,16 @@ def output_images(parms, text_rect, force_text): return this_md + def page_is_ocr(page): + """Check if page exclusively contains OCR text. + + For this to be true, all text must be written as "ignore-text". + """ + text_types = set([b[0] for b in page.get_bboxlog() if "text" in b[0]]) + if text_types == {"ignore-text"}: + return True + return False + def get_bg_color(page): """Determine the background color of the page. 
@@ -919,6 +934,7 @@ def get_page_output( parms.graphics = [] parms.words = [] parms.line_rects = [] + parms.accept_invisible = page_is_ocr(page) # accept invisible text # determine background color parms.bg_color = get_bg_color(page) @@ -968,11 +984,17 @@ def get_page_output( parms.img_rects = [i["bbox"] for i in parms.images] + # catch too-many-graphics situation + graphics_count = len([b for b in page.get_bboxlog() if "path" in b[0]]) + if GRAPHICS_LIMIT and graphics_count > GRAPHICS_LIMIT: + IGNORE_GRAPHICS = True + table_strategy = None + # Locate all tables on page parms.written_tables = [] # stores already written tables omitted_table_rects = [] if table_strategy is None: - parms.tabs = [] + parms.tabs = None else: parms.tabs = page.find_tables(clip=parms.clip, strategy=table_strategy) # remove tables with too few rows or columns @@ -986,14 +1008,15 @@ def get_page_output( # Make a list of table boundary boxes. # Must include the header bbox (which may exist outside tab.bbox) tab_rects = {} - for i, t in enumerate(parms.tabs.tables): - tab_rects[i] = pymupdf.Rect(t.bbox) | pymupdf.Rect(t.header.bbox) - tab_dict = { - "bbox": tuple(tab_rects[i]), - "rows": t.row_count, - "columns": t.col_count, - } - parms.tables.append(tab_dict) + if parms.tabs is not None: + for i, t in enumerate(parms.tabs.tables): + tab_rects[i] = pymupdf.Rect(t.bbox) | pymupdf.Rect(t.header.bbox) + tab_dict = { + "bbox": tuple(tab_rects[i]), + "rows": t.row_count, + "columns": t.col_count, + } + parms.tables.append(tab_dict) parms.tab_rects = tab_rects # list of table rectangles parms.tab_rects0 = list(tab_rects.values()) @@ -1084,7 +1107,6 @@ def get_page_output( parms.md_string += output_tables(parms, None) parms.md_string += output_images(parms, None, force_text) - parms.md_string += "\n-----\n\n" while parms.md_string.startswith("\n"): parms.md_string = parms.md_string[1:] parms.md_string = parms.md_string.replace(chr(0), chr(0xFFFD)) diff --git a/pymupdf4llm/setup.py 
b/pymupdf4llm/setup.py index a1a3b174..2df69920 100644 --- a/pymupdf4llm/setup.py +++ b/pymupdf4llm/setup.py @@ -17,7 +17,7 @@ setuptools.setup( name="pymupdf4llm", - version="0.0.22", + version="0.0.23", author="Artifex", author_email="support@artifex.com", description="PyMuPDF Utilities for LLM/RAG",