From ba9406125f9a84748c7276bbc743efa9b611eafc Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Sat, 19 Jul 2025 07:41:28 -0400 Subject: [PATCH] Version 0.0.27 --- CHANGES.md | 15 ++++- pdf4llm/setup.py | 9 ++- .../pymupdf4llm/helpers/pymupdf_rag.py | 66 ++++++++++--------- pymupdf4llm/pymupdf4llm/versions_file.py | 3 +- pymupdf4llm/setup.py | 7 +- 5 files changed, 59 insertions(+), 41 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 736bb192..3982e731 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,18 @@ # Change Log +## Changes in version 0.0.27 + +### Fixes: + +* [296](https://github.com/pymupdf/RAG/issues/296) - [Bug] A specific diagram recognized as significant ... +* [294](https://github.com/pymupdf/RAG/issues/294) - Unable to extract images from Page +* [272](https://github.com/pymupdf/RAG/issues/272) - Disappeared page breaks + +### Other Changes: + +* Added new parameter to `to_markdown`: `page_separators=False`. If `True` and `page_chunks=False`, a line like `--- end of page=nnn ---` is appended to each page's markdown text. The page number is 0-based. Intended for debugging purposes. + + ## Changes in version 0.0.26 ### Fixes: @@ -14,7 +27,7 @@ * The class `TocHeaders` is now a top-level import and can now be directly used. -* Method `to_markdown` has a new parameter `detect_bg_color=True` which guesses the page's background color. If detection is successful, vectors having this fill color are ignored (default). Setting this to `False` will "fill" vectors to always be considered in vector graphics detection. +* Method `to_markdown` has a new parameter `detect_bg_color=True` (default) which guesses the page's background color. If a background is detected, fill-only vectors having this color are ignored. Setting it to `False` causes "fill" vectors to always be considered in vector graphics detection. * Text written with a `Type 3` font will now always be considered. Previously, this text was always treated as invisible and was hence suppressed. 
diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py index a24fdf25..5849e9ad 100644 --- a/pdf4llm/setup.py +++ b/pdf4llm/setup.py @@ -13,11 +13,11 @@ "Programming Language :: Python :: 3", "Topic :: Utilities", ] -requires = ["pymupdf4llm==0.0.26"] +requires = ["pymupdf4llm==0.0.27"] setuptools.setup( name="pdf4llm", - version="0.0.26", + version="0.0.27", author="Artifex", author_email="support@artifex.com", description="PyMuPDF Utilities for LLM/RAG", @@ -29,13 +29,12 @@ license="Dual Licensed - GNU AFFERO GPL 3.0 or Artifex Commercial License", url="https://github.com/pymupdf/RAG", classifiers=classifiers, - package_data={ - "pdf4llm": ["LICENSE"], - }, + package_data={}, project_urls={ "Documentation": "https://pymupdf.readthedocs.io/", "Source": "https://github.com/pymupdf/RAG/tree/main/pdf4llm/pdf4llm", "Tracker": "https://github.com/pymupdf/RAG/issues", "Changelog": "https://github.com/pymupdf/RAG/blob/main/CHANGES.md", + "License": "https://github.com/pymupdf/RAG/blob/main/LICENSE", }, ) diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index 85aa41dd..ad6ca0c1 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -316,6 +316,7 @@ def to_markdown( filename=None, force_text=True, page_chunks=False, + page_separators=False, margins=0, dpi=150, page_width=612, @@ -341,6 +342,7 @@ def to_markdown( image_format: (str) use this image format. Choose a supported one. force_text: (bool) output text despite of image background. page_chunks: (bool) whether to segment output by page. + page_separators: (bool) whether to include page separators in output. margins: omit content overlapping margin areas. dpi: (int) desired resolution for generated images. page_width: (float) assumption if page layout is variable. 
@@ -381,7 +383,7 @@ def to_markdown( IGNORE_IMAGES = ignore_images IGNORE_GRAPHICS = ignore_graphics DETECT_BG_COLOR = detect_bg_color - if doc.is_form_pdf or doc.has_annots(): + if doc.is_form_pdf or (doc.is_pdf and doc.has_annots()): doc.bake() # for reflowable documents allow making 1 page for the whole document @@ -560,6 +562,7 @@ def write_text( ) parms.line_rects.extend(cells) parms.written_tables.append(i) + prev_hdr_string = None # ------------------------------------------------------------ # Pick up images / graphics ABOVE this text block @@ -592,6 +595,7 @@ def write_text( if not is_white(img_txt): out_string += img_txt parms.written_images.append(i) + prev_hdr_string = None parms.line_rects.append(lrect) # if line rect is far away from the previous one, add a line break @@ -751,7 +755,7 @@ def output_tables(parms, text_rect): ): if i in parms.written_tables: continue - this_md += parms.tabs[i].to_markdown(clean=False) + this_md += parms.tabs[i].to_markdown(clean=False) + "\n" if EXTRACT_WORDS: # for "words" extraction, add table cells as line rects cells = sorted( @@ -772,7 +776,7 @@ def output_tables(parms, text_rect): for i, trect in parms.tab_rects.items(): if i in parms.written_tables: continue - this_md += parms.tabs[i].to_markdown(clean=False) + this_md += parms.tabs[i].to_markdown(clean=False) + "\n" if EXTRACT_WORDS: # for "words" extraction, add table cells as line rects cells = sorted( @@ -954,7 +958,7 @@ def get_page_output( ) # accept invisible text # determine background color - parms.bg_color = get_bg_color(page) if DETECT_BG_COLOR else None + parms.bg_color = None if not DETECT_BG_COLOR else get_bg_color(page) left, top, right, bottom = margins parms.clip = page.rect + (left, top, -right, -bottom) @@ -994,12 +998,12 @@ def get_page_output( if img_info: img_max_size = abs(parms.clip) * 0.9 sane = [i for i in img_info if abs(i["bbox"] & parms.clip) < img_max_size] - if len(sane) < len(img_info): # found some - img_info = sane # use those 
images instead - # output full page image - name = save_image(parms, parms.clip, "full") - if name: - parms.md_string += GRAPHICS_TEXT % name + if len(sane) < len(img_info): # found some + img_info = sane # use those images instead + # output full page image + name = save_image(parms, parms.clip, "full") + if name: + parms.md_string += GRAPHICS_TEXT % name img_info = img_info[:30] # only accept the largest up to 30 images # run from back to front (= small to large) @@ -1024,31 +1028,31 @@ def get_page_output( # Locate all tables on page parms.written_tables = [] # stores already written tables omitted_table_rects = [] + parms.tabs = [] if IGNORE_GRAPHICS or not table_strategy: # do not try to extract tables - parms.tabs = None + pass else: - parms.tabs = page.find_tables(clip=parms.clip, strategy=table_strategy) - # remove tables with too few rows or columns - for i in range(len(parms.tabs.tables) - 1, -1, -1): - t = parms.tabs.tables[i] + tabs = page.find_tables(clip=parms.clip, strategy=table_strategy) + for t in tabs.tables: + # remove tables with too few rows or columns if t.row_count < 2 or t.col_count < 2: omitted_table_rects.append(pymupdf.Rect(t.bbox)) - del parms.tabs.tables[i] - parms.tabs.tables.sort(key=lambda t: (t.bbox[0], t.bbox[1])) + continue + parms.tabs.append(t) + parms.tabs.sort(key=lambda t: (t.bbox[0], t.bbox[1])) # Make a list of table boundary boxes. 
# Must include the header bbox (which may exist outside tab.bbox) tab_rects = {} - if parms.tabs is not None: - for i, t in enumerate(parms.tabs.tables): - tab_rects[i] = pymupdf.Rect(t.bbox) | pymupdf.Rect(t.header.bbox) - tab_dict = { - "bbox": tuple(tab_rects[i]), - "rows": t.row_count, - "columns": t.col_count, - } - parms.tables.append(tab_dict) + for i, t in enumerate(parms.tabs): + tab_rects[i] = pymupdf.Rect(t.bbox) | pymupdf.Rect(t.header.bbox) + tab_dict = { + "bbox": tuple(tab_rects[i]), + "rows": t.row_count, + "columns": t.col_count, + } + parms.tables.append(tab_dict) parms.tab_rects = tab_rects # list of table rectangles parms.tab_rects0 = list(tab_rects.values()) @@ -1064,15 +1068,12 @@ def get_page_output( and p["rect"].width < parms.clip.width and p["rect"].height < parms.clip.height and (p["rect"].width > 3 or p["rect"].height > 3) - and not (p["fill"] == parms.bg_color and p["fill"] != None) - and not intersects_rects( - p["rect"], parms.tab_rects0 + omitted_table_rects - ) + and not (p["type"] == "f" and p["fill"] == parms.bg_color) + and not intersects_rects(p["rect"], parms.tab_rects0) and not intersects_rects(p["rect"], parms.annot_rects) ] else: paths = [] - # catch too-many-graphics situation if GRAPHICS_LIMIT and len(paths) > GRAPHICS_LIMIT: paths = [] @@ -1168,6 +1169,9 @@ def get_page_output( else: words = [] parms.words = words + if page_separators: + # add page separators to output + parms.md_string += f"\n\n--- end of page={parms.page.number} ---\n\n" return parms if page_chunks is False: diff --git a/pymupdf4llm/pymupdf4llm/versions_file.py b/pymupdf4llm/pymupdf4llm/versions_file.py index 105a643e..5d422f48 100644 --- a/pymupdf4llm/pymupdf4llm/versions_file.py +++ b/pymupdf4llm/pymupdf4llm/versions_file.py @@ -1,2 +1,3 @@ +# Generated file - do not edit. 
MINIMUM_PYMUPDF_VERSION = (1, 26, 3) -VERSION = '0.0.26' +VERSION = '0.0.27' diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py index ab8608db..8770f1ab 100644 --- a/pymupdf4llm/setup.py +++ b/pymupdf4llm/setup.py @@ -14,12 +14,12 @@ "Topic :: Utilities", ] -version = "0.0.26" +version = "0.0.27" requires = ["pymupdf>=1.26.3"] text = requires[0].split("=")[1] text = tuple(map(int, text.split("."))) -text = f"MINIMUM_PYMUPDF_VERSION = {text}\nVERSION = '{version}'\n" +text = f"# Generated file - do not edit.\nMINIMUM_PYMUPDF_VERSION = {text}\nVERSION = '{version}'\n" Path("pymupdf4llm/versions_file.py").write_text(text) setuptools.setup( @@ -37,12 +37,13 @@ url="https://github.com/pymupdf/RAG", classifiers=classifiers, package_data={ - "pymupdf4llm": ["LICENSE", "helpers/*.py", "llama/*.py"], + "pymupdf4llm": ["helpers/*.py", "llama/*.py"], }, project_urls={ "Documentation": "https://pymupdf.readthedocs.io/", "Source": "https://github.com/pymupdf/RAG/tree/main/pymupdf4llm/pymupdf4llm", "Tracker": "https://github.com/pymupdf/RAG/issues", "Changelog": "https://github.com/pymupdf/RAG/blob/main/CHANGES.md", + "License": "https://github.com/pymupdf/RAG/blob/main/LICENSE", }, )