diff --git a/CHANGES.md b/CHANGES.md index 3f6271f0..6277e35e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,16 @@ # Change Log +## Changes in version 0.2.6 + +### Fixes: + +* [Forum](https://forum.mupdf.com/t/bug-pymupdf4llm-list-index-out-of-range-in-document-layout-py-2/216) - Fixed "list index out of range" raised when a detected table's bbox contains no text lines; such tables are now treated as `table-fallback`. + +### Other Changes: + + +------ + ## Changes in version 0.2.5 ### Fixes: diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py index b3c962d6..71c771f0 100644 --- a/pdf4llm/setup.py +++ b/pdf4llm/setup.py @@ -6,7 +6,7 @@ with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f: readme = f.read() -version = "0.2.4" # must always equal the pymupdf4llm version +version = "0.2.6" # must always equal the pymupdf4llm version classifiers = [ "Development Status :: 5 - Production/Stable", diff --git a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py index 2a278332..1b922a1d 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py +++ b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py @@ -548,7 +548,7 @@ def fallback_text_to_md(textlines, ignore_code: bool = False, clip=None): for tl in textlines: ltext = "|" + "|".join([s["text"].strip() for s in tl["spans"]]) + "|\n" output += ltext - output += "**----- End of picture text -----**
\n" + output += "\n**----- End of picture text -----**
\n" return output + "\n\n" @@ -631,7 +631,7 @@ def to_markdown( continue # pictures and formulas: either write image file or embed - if btype in ("picture", "formula", "fallback"): + if btype in ("picture", "formula", "table-fallback"): if isinstance(box.image, str): output += GRAPHICS_TEXT % box.image + "\n\n" elif isinstance(box.image, bytes): @@ -650,7 +650,7 @@ def to_markdown( ignore_code=ignore_code or page.full_ocred, clip=clip, ) - elif btype == "fallback": + elif btype == "table-fallback": output += fallback_text_to_md( box.textlines, ignore_code=ignore_code or page.full_ocred, @@ -741,7 +741,7 @@ def to_text( continue if btype == "page-footer" and footer is False: continue - if btype in ("picture", "formula", "fallback"): + if btype in ("picture", "formula", "table-fallback"): output += f"==> picture [{clip.width} x {clip.height}] <==\n\n" if box.textlines: if btype == "picture": @@ -750,7 +750,7 @@ def to_text( ignore_code=ignore_code or page.full_ocred, clip=clip, ) - elif btype == "fallback": + elif btype == "table-fallback": output += fallback_text_to_text( box.textlines, ignore_code=ignore_code or page.full_ocred, @@ -1018,7 +1018,7 @@ def parse_document( except Exception as e: # print(f"table detection error '{e}' on page {page.number+1}") - layoutbox.boxclass = "fallback" + layoutbox.boxclass = "table-fallback" # table structure not detected: treat like an image if document.embed_images or document.write_images: pix = page.get_pixmap(clip=clip, dpi=document.image_dpi) diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py index 4f3cc890..1a3a6546 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py +++ b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py @@ -101,7 +101,7 @@ def sanitize_spans(line): ): continue # no joining # We need to join bbox and text of two consecutive spans - # On occasion, spans may also be duplicated. + # Sometimes, spans may also be duplicated. 
if s0["text"] != s1["text"] or s0["bbox"] != s1["bbox"]: s0["text"] += s1["text"] s0["bbox"] |= s1["bbox"] # join boundary boxes @@ -131,7 +131,8 @@ def sanitize_spans(line): continue for sno, s in enumerate(line["spans"]): # the numered spans sbbox = pymupdf.Rect(s["bbox"]) # span bbox as a Rect - if is_white(s["text"]): # ignore white text + if is_white(s["text"]): + # ignore white text if not a Type3 font continue # Ignore invisible text. Type 3 font text is never invisible. if ( diff --git a/pymupdf4llm/pymupdf4llm/helpers/utils.py b/pymupdf4llm/pymupdf4llm/helpers/utils.py index f6e261a8..c9beb15d 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/utils.py +++ b/pymupdf4llm/pymupdf4llm/helpers/utils.py @@ -258,6 +258,10 @@ def clean_tables(page, blocks): l for b in blocks if b["type"] == 0 for l in b["lines"] if l["bbox"] in bbox ] y_vals0 = sorted(set(round(l["bbox"][3]) for l in lines)) + if not y_vals0: + # no text lines in the table bbox + page.layout_information[i][4] = "table-fallback" + continue y_vals = [y_vals0[0]] for y in y_vals0[1:]: if y - y_vals[-1] > 3: diff --git a/pymupdf4llm/pymupdf4llm/versions_file.py b/pymupdf4llm/pymupdf4llm/versions_file.py index f68a2df4..1e8e3c90 100644 --- a/pymupdf4llm/pymupdf4llm/versions_file.py +++ b/pymupdf4llm/pymupdf4llm/versions_file.py @@ -1,3 +1,3 @@ # Generated file - do not edit. MINIMUM_PYMUPDF_VERSION = (1, 26, 6) -VERSION = '0.2.5' +VERSION = '0.2.6' diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py index 79c08042..18ebd2a6 100644 --- a/pymupdf4llm/setup.py +++ b/pymupdf4llm/setup.py @@ -11,7 +11,7 @@ "Topic :: Utilities", ] -version = "0.2.5" +version = "0.2.6" pymupdf_version = "1.26.6" pymupdf_version_tuple = tuple(int(x) for x in pymupdf_version.split(".")) requires = [f"pymupdf>={pymupdf_version}", "tabulate"]