diff --git a/CHANGES.md b/CHANGES.md index 0193004b..f0c6bb06 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,23 @@ # Change Log +## Changes in version 0.0.20 + +### Fixes: + +* [171](https://github.com/pymupdf/RAG/issues/171) - Text rects overlap with tables and images that should be excluded. +* [189](https://github.com/pymupdf/RAG/issues/189) - The position of the extracted image is incorrect. +* [238](https://github.com/pymupdf/RAG/issues/238) - When text is laid out around the picture, text extraction is missing. + +### Other Changes: + +* Added **_new parameter_** `ignore_images`: (bool), optional. `True` will not consider images in any way. May be useful for pages where a plethora of images prevents meaningful layout analysis. Typical examples are PowerPoint slides and derived / similar pages. + +* Added **_new parameter_** `ignore_graphics`: (bool), optional. `True` will not consider graphics except for table detection. May be useful for pages where a plethora of vector graphics prevents meaningful layout analysis. Typical examples are PowerPoint slides and derived / similar pages. + +* Added **_new parameter_** to class `IdentifyHeaders`: Use `max_levels` (integer from 1 to 6, default 6) to limit the generation of header tag levels. For example, `headers = pymupdf4llm.IdentifyHeaders(doc, max_levels=3)` ensures that only up to 3 header levels will ever be generated. Any text with a font size less than the value of `###` will be body text. In this case, the markdown generation itself would be coded as `md = pymupdf4llm.to_markdown(doc, hdr_info=headers, ...)`. + +* Changed parameter `table_strategy`: When specifying `None`, no effort to detect tables will be made. This can be useful when tables are of no interest or known to not exist in a given file. This will speed up processing significantly. Be prepared to see more changes and extensions here. 
+ ## Changes in version 0.0.19 diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py index 66e6a4e0..8285d9f6 100644 --- a/pdf4llm/setup.py +++ b/pdf4llm/setup.py @@ -13,11 +13,11 @@ "Programming Language :: Python :: 3", "Topic :: Utilities", ] -requires = ["pymupdf4llm>=0.0.19"] +requires = ["pymupdf4llm==0.0.20"] setuptools.setup( name="pdf4llm", - version="0.0.19", + version="0.0.20", author="Artifex", author_email="support@artifex.com", description="PyMuPDF Utilities for LLM/RAG", diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py index 5e613733..9a78817c 100644 --- a/pymupdf4llm/pymupdf4llm/__init__.py +++ b/pymupdf4llm/pymupdf4llm/__init__.py @@ -1,6 +1,6 @@ from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown -__version__ = "0.0.19" +__version__ = "0.0.20" version = __version__ version_tuple = tuple(map(int, version.split("."))) diff --git a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py index f8eb01ce..a4ee573b 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py +++ b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py @@ -76,6 +76,7 @@ def column_boxes( textpage=None, paths=None, avoid=None, + ignore_images=False, ): """Determine bboxes which wrap a column on the page. 
@@ -261,7 +262,9 @@ def join_rects_phase3(bboxes, path_rects, cache): continue # do not join different backgrounds - if in_bbox_using_cache(prect0, path_rects, cache) != in_bbox_using_cache(prect1, path_rects, cache): + if in_bbox_using_cache( + prect0, path_rects, cache + ) != in_bbox_using_cache(prect1, path_rects, cache): continue temp = prect0 | prect1 test = set( @@ -333,11 +336,12 @@ def join_rects_phase3(bboxes, path_rects, cache): clip.y1 -= footer_margin # Remove footer area clip.y0 += header_margin # Remove header area - paths = [ - p - for p in page.get_drawings() - if p["rect"].width < clip.width and p["rect"].height < clip.height - ] + if paths is None: + paths = [ + p + for p in page.get_drawings() + if p["rect"].width < clip.width and p["rect"].height < clip.height + ] if textpage is None: textpage = page.get_textpage(clip=clip, flags=pymupdf.TEXTFLAGS_TEXT) @@ -371,8 +375,9 @@ def join_rects_phase3(bboxes, path_rects, cache): path_rects.sort(key=lambda b: (b.y0, b.x0)) # bboxes of images on page, no need to sort them - for item in page.get_images(): - img_bboxes.extend(page.get_image_rects(item[0])) + if ignore_images is False: + for item in page.get_images(): + img_bboxes.extend(page.get_image_rects(item[0])) # blocks of text on page blocks = textpage.extractDICT()["blocks"] @@ -433,7 +438,9 @@ def join_rects_phase3(bboxes, path_rects, cache): continue # never join across different background colors - if in_bbox_using_cache(nbb, path_rects, cache) != in_bbox_using_cache(bb, path_rects, cache): + if in_bbox_using_cache(nbb, path_rects, cache) != in_bbox_using_cache( + bb, path_rects, cache + ): continue temp = bb | nbb # temporary extension of new block diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index 2741fb03..c8cb38e2 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -18,7 +18,7 @@ Dependencies ------------- -PyMuPDF 
v1.25.4 or later +PyMuPDF v1.25.5 or later Copyright and License ---------------------- @@ -47,6 +47,7 @@ from collections import defaultdict pymupdf.TOOLS.unset_quad_corrections(True) + # Characters recognized as bullets when starting a line. bullet = tuple( [ @@ -85,7 +86,8 @@ def __init__( self, doc: str, pages: list = None, - body_limit: float = 12, + body_limit: float = 11, # default if no text found + max_levels: int = 6, # accept this many header levels ): """Read all text and make a dictionary of fontsizes. @@ -94,6 +96,8 @@ def __init__( pages: consider these page numbers only body_limit: treat text with larger font size as a header """ + if not isinstance(max_levels, int) or max_levels not in range(1, 7): + raise ValueError("max_levels must be an integer between 1 and 6") if isinstance(doc, pymupdf.Document): mydoc = doc else: @@ -113,8 +117,8 @@ def __init__( for s in l["spans"] if not is_white(s["text"]) ]: - fontsz = round(span["size"]) - fontsizes[fontsz] += len(span["text"].strip()) + fontsz = round(span["size"]) # # compute rounded fontsize + fontsizes[fontsz] += len(span["text"].strip()) # add character count if mydoc != doc: # if opened here, close it now @@ -124,15 +128,14 @@ def __init__( self.header_id = {} # If not provided, choose the most frequent font size as body text. - # If no text at all on all pages, just use 12. + # If no text at all on all pages, just use body_limit. 
# In any case all fonts not exceeding temp = sorted( - [(k, v) for k, v in fontsizes.items()], - key=lambda i: i[1], - reverse=True, + [(k, v) for k, v in fontsizes.items()], key=lambda i: (i[1], i[0]) ) if temp: - self.body_limit = min(body_limit, temp[0][0]) + # most frequent font size + self.body_limit = min(body_limit, temp[-1][0]) else: self.body_limit = body_limit @@ -140,11 +143,12 @@ def __init__( sizes = sorted( [f for f in fontsizes.keys() if f > self.body_limit], reverse=True, - )[:6] + )[:max_levels] + self.body_limit = min(self.body_limit, sizes[-1] - 1 if sizes else body_limit) # make the header tag dictionary - for i, size in enumerate(sizes): - self.header_id[size] = "#" * (i + 1) + " " + for i, size in enumerate(sizes, start=1): + self.header_id[size] = "#" * i + " " def get_header_id(self, span: dict, page=None) -> str: """Return appropriate markdown header prefix. @@ -153,12 +157,7 @@ def get_header_id(self, span: dict, page=None) -> str: markdown header prefix string of 0 to n concatenated '#' characters. """ fontsize = round(span["size"]) # compute fontsize - if fontsize <= self.body_limit: # shortcut for body text - return "" hdr_id = self.header_id.get(fontsize, "") - # If no header but larger than body text, assign