Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
# Change Log

## Changes in version 0.2.6

### Fixes:

* [Forum](https://forum.mupdf.com/t/bug-pymupdf4llm-list-index-out-of-range-in-document-layout-py-2/216) - Fixed "List index out of range" error when processing tables with no text lines.
Copy link

Copilot AI Dec 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The description for the fix is incomplete. It ends with an ellipsis ("...") without providing a complete explanation of what was fixed. Consider providing a complete description such as: "List index out of range error when processing tables with no text lines"

Suggested change
* [Forum](https://forum.mupdf.com/t/bug-pymupdf4llm-list-index-out-of-range-in-document-layout-py-2/216) - List index out of range ...
* [Forum](https://forum.mupdf.com/t/bug-pymupdf4llm-list-index-out-of-range-in-document-layout-py-2/216) - Fixed "List index out of range" error when processing tables with no text lines.

Copilot uses AI. Check for mistakes.

### Other Changes:


------

## Changes in version 0.2.5

### Fixes:
Expand Down
2 changes: 1 addition & 1 deletion pdf4llm/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
readme = f.read()

version = "0.2.4" # must always equal the pymupdf4llm version
version = "0.2.6" # must always equal the pymupdf4llm version

classifiers = [
"Development Status :: 5 - Production/Stable",
Expand Down
12 changes: 6 additions & 6 deletions pymupdf4llm/pymupdf4llm/helpers/document_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,7 +548,7 @@ def fallback_text_to_md(textlines, ignore_code: bool = False, clip=None):
for tl in textlines:
ltext = "|" + "|".join([s["text"].strip() for s in tl["spans"]]) + "|\n"
output += ltext
output += "**----- End of picture text -----**<br>\n"
output += "\n**----- End of picture text -----**<br>\n"
return output + "\n\n"


Expand Down Expand Up @@ -631,7 +631,7 @@ def to_markdown(
continue

# pictures and formulas: either write image file or embed
if btype in ("picture", "formula", "fallback"):
if btype in ("picture", "formula", "table-fallback"):
if isinstance(box.image, str):
output += GRAPHICS_TEXT % box.image + "\n\n"
elif isinstance(box.image, bytes):
Expand All @@ -650,7 +650,7 @@ def to_markdown(
ignore_code=ignore_code or page.full_ocred,
clip=clip,
)
elif btype == "fallback":
elif btype == "table-fallback":
output += fallback_text_to_md(
box.textlines,
ignore_code=ignore_code or page.full_ocred,
Expand Down Expand Up @@ -741,7 +741,7 @@ def to_text(
continue
if btype == "page-footer" and footer is False:
continue
if btype in ("picture", "formula", "fallback"):
if btype in ("picture", "formula", "table-fallback"):
output += f"==> picture [{clip.width} x {clip.height}] <==\n\n"
if box.textlines:
if btype == "picture":
Expand All @@ -750,7 +750,7 @@ def to_text(
ignore_code=ignore_code or page.full_ocred,
clip=clip,
)
elif btype == "fallback":
elif btype == "table-fallback":
output += fallback_text_to_text(
box.textlines,
ignore_code=ignore_code or page.full_ocred,
Expand Down Expand Up @@ -1018,7 +1018,7 @@ def parse_document(

except Exception as e:
# print(f"table detection error '{e}' on page {page.number+1}")
layoutbox.boxclass = "fallback"
layoutbox.boxclass = "table-fallback"
# table structure not detected: treat like an image
if document.embed_images or document.write_images:
pix = page.get_pixmap(clip=clip, dpi=document.image_dpi)
Expand Down
5 changes: 3 additions & 2 deletions pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def sanitize_spans(line):
):
continue # no joining
# We need to join bbox and text of two consecutive spans
# On occasion, spans may also be duplicated.
# Sometimes, spans may also be duplicated.
if s0["text"] != s1["text"] or s0["bbox"] != s1["bbox"]:
s0["text"] += s1["text"]
s0["bbox"] |= s1["bbox"] # join boundary boxes
Expand Down Expand Up @@ -131,7 +131,8 @@ def sanitize_spans(line):
continue
for sno, s in enumerate(line["spans"]): # the numbered spans
sbbox = pymupdf.Rect(s["bbox"]) # span bbox as a Rect
if is_white(s["text"]): # ignore white text
if is_white(s["text"]):
# ignore white text if not a Type3 font
continue
# Ignore invisible text. Type 3 font text is never invisible.
if (
Expand Down
4 changes: 4 additions & 0 deletions pymupdf4llm/pymupdf4llm/helpers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,10 @@ def clean_tables(page, blocks):
l for b in blocks if b["type"] == 0 for l in b["lines"] if l["bbox"] in bbox
]
y_vals0 = sorted(set(round(l["bbox"][3]) for l in lines))
if not y_vals0:
# no text lines in the table bbox
page.layout_information[i][4] = "table-fallback"
continue
y_vals = [y_vals0[0]]
for y in y_vals0[1:]:
if y - y_vals[-1] > 3:
Expand Down
2 changes: 1 addition & 1 deletion pymupdf4llm/pymupdf4llm/versions_file.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Generated file - do not edit.
MINIMUM_PYMUPDF_VERSION = (1, 26, 6)
VERSION = '0.2.5'
VERSION = '0.2.6'
2 changes: 1 addition & 1 deletion pymupdf4llm/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"Topic :: Utilities",
]

version = "0.2.5"
version = "0.2.6"
pymupdf_version = "1.26.6"
pymupdf_version_tuple = tuple(int(x) for x in pymupdf_version.split("."))
requires = [f"pymupdf>={pymupdf_version}", "tabulate"]
Expand Down