diff --git a/CHANGES.md b/CHANGES.md index 0193004b..f0c6bb06 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,23 @@ # Change Log +## Changes in version 0.0.20 + +### Fixes: + +* [171](https://github.com/pymupdf/RAG/issues/171) - Text rects overlap with tables and images that should be excluded. +* [189](https://github.com/pymupdf/RAG/issues/189) - The position of the extracted image is incorrect +* [238](https://github.com/pymupdf/RAG/issues/238) - When text is laid out around the picture, text extraction is missing. + +### Other Changes: + +* Added **_new parameter_** `ignore_images`: (bool), optional. `True` will not consider images in any way. May be useful for pages where a plethora of images prevents meaningful layout analysis. Typical examples are PowerPoint slides and derived / similar pages. + +* Added **_new parameter_** `ignore_graphics`: (bool), optional. `True` will not consider graphics except for table detection. May be useful for pages where a plethora of vector graphics prevents meaningful layout analysis. Typical examples are PowerPoint slides and derived / similar pages. + +* Added **_new parameter_** to class `IdentifyHeaders`: Use `max_levels` (integer from 1 to 6, default 6) to limit the generation of header tag levels. For example, `headers = pymupdf4llm.IdentifyHeaders(doc, max_levels=3)` ensures that only up to 3 header levels will ever be generated. Any text with a font size smaller than that of the level-3 header (`###`) will be body text. In this case, the markdown generation itself would be coded as `md = pymupdf4llm.to_markdown(doc, hdr_info=headers, ...)`. + +* Changed parameter `table_strategy`: When specifying `None`, no effort to detect tables will be made. This can be useful when tables are of no interest or known to not exist in a given file. This will speed up processing significantly. Be prepared to see more changes and extensions here. 
+ ## Changes in version 0.0.19 diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py index 66e6a4e0..8285d9f6 100644 --- a/pdf4llm/setup.py +++ b/pdf4llm/setup.py @@ -13,11 +13,11 @@ "Programming Language :: Python :: 3", "Topic :: Utilities", ] -requires = ["pymupdf4llm>=0.0.19"] +requires = ["pymupdf4llm==0.0.20"] setuptools.setup( name="pdf4llm", - version="0.0.19", + version="0.0.20", author="Artifex", author_email="support@artifex.com", description="PyMuPDF Utilities for LLM/RAG", diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py index 5e613733..9a78817c 100644 --- a/pymupdf4llm/pymupdf4llm/__init__.py +++ b/pymupdf4llm/pymupdf4llm/__init__.py @@ -1,6 +1,6 @@ from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown -__version__ = "0.0.19" +__version__ = "0.0.20" version = __version__ version_tuple = tuple(map(int, version.split("."))) diff --git a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py index f8eb01ce..a4ee573b 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py +++ b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py @@ -76,6 +76,7 @@ def column_boxes( textpage=None, paths=None, avoid=None, + ignore_images=False, ): """Determine bboxes which wrap a column on the page. 
@@ -261,7 +262,9 @@ def join_rects_phase3(bboxes, path_rects, cache): continue # do not join different backgrounds - if in_bbox_using_cache(prect0, path_rects, cache) != in_bbox_using_cache(prect1, path_rects, cache): + if in_bbox_using_cache( + prect0, path_rects, cache + ) != in_bbox_using_cache(prect1, path_rects, cache): continue temp = prect0 | prect1 test = set( @@ -333,11 +336,12 @@ def join_rects_phase3(bboxes, path_rects, cache): clip.y1 -= footer_margin # Remove footer area clip.y0 += header_margin # Remove header area - paths = [ - p - for p in page.get_drawings() - if p["rect"].width < clip.width and p["rect"].height < clip.height - ] + if paths is None: + paths = [ + p + for p in page.get_drawings() + if p["rect"].width < clip.width and p["rect"].height < clip.height + ] if textpage is None: textpage = page.get_textpage(clip=clip, flags=pymupdf.TEXTFLAGS_TEXT) @@ -371,8 +375,9 @@ def join_rects_phase3(bboxes, path_rects, cache): path_rects.sort(key=lambda b: (b.y0, b.x0)) # bboxes of images on page, no need to sort them - for item in page.get_images(): - img_bboxes.extend(page.get_image_rects(item[0])) + if ignore_images is False: + for item in page.get_images(): + img_bboxes.extend(page.get_image_rects(item[0])) # blocks of text on page blocks = textpage.extractDICT()["blocks"] @@ -433,7 +438,9 @@ def join_rects_phase3(bboxes, path_rects, cache): continue # never join across different background colors - if in_bbox_using_cache(nbb, path_rects, cache) != in_bbox_using_cache(bb, path_rects, cache): + if in_bbox_using_cache(nbb, path_rects, cache) != in_bbox_using_cache( + bb, path_rects, cache + ): continue temp = bb | nbb # temporary extension of new block diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index 2741fb03..c8cb38e2 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -18,7 +18,7 @@ Dependencies ------------- -PyMuPDF 
v1.25.4 or later +PyMuPDF v1.25.5 or later Copyright and License ---------------------- @@ -47,6 +47,7 @@ from collections import defaultdict pymupdf.TOOLS.unset_quad_corrections(True) + # Characters recognized as bullets when starting a line. bullet = tuple( [ @@ -85,7 +86,8 @@ def __init__( self, doc: str, pages: list = None, - body_limit: float = 12, + body_limit: float = 11, # default if no text found + max_levels: int = 6, # accept this many header levels ): """Read all text and make a dictionary of fontsizes. @@ -94,6 +96,8 @@ def __init__( pages: consider these page numbers only body_limit: treat text with larger font size as a header """ + if not isinstance(max_levels, int) or max_levels not in range(1, 7): + raise ValueError("max_levels must be an integer between 1 and 6") if isinstance(doc, pymupdf.Document): mydoc = doc else: @@ -113,8 +117,8 @@ def __init__( for s in l["spans"] if not is_white(s["text"]) ]: - fontsz = round(span["size"]) - fontsizes[fontsz] += len(span["text"].strip()) + fontsz = round(span["size"]) # # compute rounded fontsize + fontsizes[fontsz] += len(span["text"].strip()) # add character count if mydoc != doc: # if opened here, close it now @@ -124,15 +128,14 @@ def __init__( self.header_id = {} # If not provided, choose the most frequent font size as body text. - # If no text at all on all pages, just use 12. + # If no text at all on all pages, just use body_limit. 
# In any case all fonts not exceeding temp = sorted( - [(k, v) for k, v in fontsizes.items()], - key=lambda i: i[1], - reverse=True, + [(k, v) for k, v in fontsizes.items()], key=lambda i: (i[1], i[0]) ) if temp: - self.body_limit = min(body_limit, temp[0][0]) + # most frequent font size + self.body_limit = min(body_limit, temp[-1][0]) else: self.body_limit = body_limit @@ -140,11 +143,12 @@ def __init__( sizes = sorted( [f for f in fontsizes.keys() if f > self.body_limit], reverse=True, - )[:6] + )[:max_levels] + self.body_limit = min(self.body_limit, sizes[-1] - 1 if sizes else body_limit) # make the header tag dictionary - for i, size in enumerate(sizes): - self.header_id[size] = "#" * (i + 1) + " " + for i, size in enumerate(sizes, start=1): + self.header_id[size] = "#" * i + " " def get_header_id(self, span: dict, page=None) -> str: """Return appropriate markdown header prefix. @@ -153,12 +157,7 @@ def get_header_id(self, span: dict, page=None) -> str: markdown header prefix string of 0 to n concatenated '#' characters. """ fontsize = round(span["size"]) # compute fontsize - if fontsize <= self.body_limit: # shortcut for body text - return "" hdr_id = self.header_id.get(fontsize, "") - # If no header but larger than body text, assign
. - if not hdr_id and fontsize > self.body_limit: - hdr_id = "###### " return hdr_id @@ -238,6 +237,8 @@ def to_markdown( hdr_info=None, write_images=False, embed_images=False, + ignore_images=False, + ignore_graphics=False, image_path="", image_format="png", image_size_limit=0.05, @@ -304,6 +305,8 @@ def to_markdown( FILENAME = doc.name if filename is None else filename GRAPHICS_LIMIT = graphics_limit FONTSIZE_LIMIT = fontsize_limit + IGNORE_IMAGES = ignore_images + IGNORE_GRAPHICS = ignore_graphics # for reflowable documents allow making 1 page for the whole document if doc.is_reflowable: @@ -425,19 +428,13 @@ def write_text( clip = parms.clip out_string = "" # This is a list of tuples (linerect, spanlist) + + nlines = get_raw_lines(parms.textpage, clip=clip, tolerance=3) nlines = [ - l - for l in get_raw_lines(parms.textpage, clip=clip, tolerance=3) - if not intersects_rects(l[0], parms.tab_rects.values()) + l for l in nlines if not intersects_rects(l[0], parms.tab_rects.values()) ] - parms.line_rects.extend( - [ - l[0] - for l in nlines - if not intersects_rects(l[0], parms.tab_rects.values()) - ] - ) # store line rectangles + parms.line_rects.extend([l[0] for l in nlines]) # store line rectangles prev_lrect = None # previous line rectangle prev_bno = -1 # previous block number of line @@ -463,6 +460,7 @@ def write_text( 0 or lrect.x0 <= tab_rect.x0 < lrect.x1 or lrect.x0 < tab_rect.x1 <= lrect.x1 + or tab_rect.x0 <= lrect.x0 < lrect.x1 <= tab_rect.x1 ) ], key=lambda j: (j[1].y1, j[1].x0), @@ -493,7 +491,12 @@ def write_text( if i in parms.deleted_images: continue r = parms.img_rects[i] - if r.y1 <= lrect.y0 and not (r & lrect).is_empty: + if r.y1 <= lrect.y0 and ( + 0 + or lrect.x0 <= r.x0 < lrect.x1 + or lrect.x0 < r.x1 <= lrect.x1 + or r.x0 <= lrect.x0 < lrect.x1 <= r.x1 + ): pathname = save_image(parms, r, i) if pathname: out_string += GRAPHICS_TEXT % pathname @@ -565,6 +568,8 @@ def write_text( prev_hdr_string = hdr_string continue + prev_hdr_string = 
hdr_string + span0 = spans[0] bno = span0["block"] # block number of line if bno != prev_bno: @@ -620,12 +625,8 @@ def write_text( else: text = f"{hdr_string}{prefix}{s['text'].strip()}{suffix} " if text.startswith(bullet): - text = text[1:] - if len(text) > 1 and text[1] == " ": - t = "-" - else: - t = "- " - text = t + text[1:] + text = "- " + text[1:] + text = text.replace(" ", " ") dist = span0["bbox"][0] - clip.x0 cwidth = (span0["bbox"][2] - span0["bbox"][0]) / len(span0["text"]) if cwidth == 0.0: @@ -831,7 +832,9 @@ def sort_words(words: list) -> list: nwords.extend(line) return nwords - def get_page_output(doc, pno, margins, textflags, FILENAME): + def get_page_output( + doc, pno, margins, textflags, FILENAME, IGNORE_IMAGES, IGNORE_GRAPHICS + ): """Process one page. Args: @@ -868,17 +871,20 @@ def get_page_output(doc, pno, margins, textflags, FILENAME): parms.textpage = page.get_textpage(flags=textflags, clip=parms.clip) # extract images on page - img_info = page.get_image_info() + if not IGNORE_IMAGES: + img_info = page.get_image_info() + else: + img_info = [] for i in range(len(img_info)): - item = img_info[i] - bbox = pymupdf.Rect(item["bbox"]) & parms.clip - item["bbox"] = +bbox - img_info[i] = item + img_info[i]["bbox"] = pymupdf.Rect(img_info[i]["bbox"]) img_info = [ i for i in img_info if i["bbox"].width >= image_size_limit * parms.clip.width and i["bbox"].height >= image_size_limit * parms.clip.height + and i["bbox"] in parms.clip + and i["bbox"].width > 3 + and i["bbox"].height > 3 ] # sort descending by image area size img_info.sort(key=lambda i: abs(i["bbox"]), reverse=True) @@ -898,7 +904,10 @@ def get_page_output(doc, pno, margins, textflags, FILENAME): parms.img_rects = [i["bbox"] for i in parms.images] # Locate all tables on page - parms.tabs = page.find_tables(clip=parms.clip, strategy=table_strategy) + if table_strategy is None: + parms.tabs = [] + else: + parms.tabs = page.find_tables(clip=parms.clip, strategy=table_strategy) # Make a list 
of table boundary boxes. # Must include the header bbox (which may exist outside tab.bbox) tab_rects = {} @@ -917,15 +926,18 @@ def get_page_output(doc, pno, margins, textflags, FILENAME): # Select paths not intersecting any table. # Ignore full page graphics. # Ignore fill paths having the background color. - paths = [ - p - for p in page.get_drawings() - if not intersects_rects(p["rect"], parms.tab_rects0) - and p["rect"] in parms.clip - and 3 < p["rect"].width < parms.clip.width - and 3 < p["rect"].height < parms.clip.height - and not (p["type"] == "f" and p["fill"] == parms.bg_color) - ] + if not IGNORE_GRAPHICS: + paths = [ + p + for p in page.get_drawings() + if not intersects_rects(p["rect"], parms.tab_rects0) + and p["rect"] in parms.clip + and 3 < p["rect"].width < parms.clip.width + and 3 < p["rect"].height < parms.clip.height + and not (p["type"] == "f" and p["fill"] == parms.bg_color) + ] + else: + paths = [] # catch too-many-graphics situation if GRAPHICS_LIMIT and len(paths) > GRAPHICS_LIMIT: @@ -959,11 +971,12 @@ def get_page_output(doc, pno, margins, textflags, FILENAME): text_rects = column_boxes( parms.page, paths=parms.actual_paths, - no_image_text=True, + no_image_text=not force_text, textpage=parms.textpage, avoid=parms.tab_rects0 + parms.vg_clusters0, footer_margin=margins[3], header_margin=margins[1], + ignore_images=IGNORE_IMAGES, ) """ @@ -1039,6 +1052,7 @@ def get_page_output(doc, pno, margins, textflags, FILENAME): 0 | mupdf.FZ_STEXT_CLIP | mupdf.FZ_STEXT_ACCURATE_BBOXES + | mupdf.FZ_STEXT_IGNORE_ACTUALTEXT | 32768 # mupdf.FZ_STEXT_COLLECT_STYLES ) # optionally replace 0xFFFD by glyph number @@ -1049,7 +1063,9 @@ def get_page_output(doc, pno, margins, textflags, FILENAME): print(f"Processing {FILENAME}...") pages = ProgressBar(pages) for pno in pages: - parms = get_page_output(doc, pno, margins, textflags, FILENAME) + parms = get_page_output( + doc, pno, margins, textflags, FILENAME, IGNORE_IMAGES, IGNORE_GRAPHICS + ) if page_chunks is 
False: document_output += parms.md_string else: @@ -1137,7 +1153,7 @@ def extract_images_on_page_simple_drop(page, parms, image_size_limit): import time try: - filename = "markdown.pdf" + filename = "slide12.pdf" except IndexError: print(f"Usage:\npython {os.path.basename(__file__)} input.pdf") sys.exit() @@ -1168,8 +1184,11 @@ def extract_images_on_page_simple_drop(page, parms, image_size_limit): md_string = to_markdown( doc, pages=pages, - write_images=True, - force_text=False, + # write_images=True, + force_text=True, + ignore_images=True, + ignore_graphics=True, + table_strategy=None, ) FILENAME = doc.name # output to a text file with extension ".md" diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py index af4342ad..34059308 100644 --- a/pymupdf4llm/setup.py +++ b/pymupdf4llm/setup.py @@ -13,11 +13,11 @@ "Programming Language :: Python :: 3", "Topic :: Utilities", ] -requires = ["pymupdf>=1.25.4"] +requires = ["pymupdf>=1.25.5"] setuptools.setup( name="pymupdf4llm", - version="0.0.19", + version="0.0.20", author="Artifex", author_email="support@artifex.com", description="PyMuPDF Utilities for LLM/RAG",