diff --git a/CHANGES.md b/CHANGES.md
index 3f6271f0..6277e35e 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,16 @@
# Change Log
+## Changes in version 0.2.6
+
+### Fixes:
+
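+* [Forum](https://forum.mupdf.com/t/bug-pymupdf4llm-list-index-out-of-range-in-document-layout-py-2/216) - "List index out of range" error when a detected table region contains no text lines. Such regions are now handled as `"table-fallback"` boxes.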
+
+### Other Changes:
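+* Layout boxes of tables whose structure cannot be determined are now classified as `"table-fallback"` (previously `"fallback"`).
+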
+------
+
## Changes in version 0.2.5
### Fixes:
diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py
index b3c962d6..71c771f0 100644
--- a/pdf4llm/setup.py
+++ b/pdf4llm/setup.py
@@ -6,7 +6,7 @@
with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
readme = f.read()
-version = "0.2.4" # must always equal the pymupdf4llm version
+version = "0.2.6" # must always equal the pymupdf4llm version
classifiers = [
"Development Status :: 5 - Production/Stable",
diff --git a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py
index 2a278332..1b922a1d 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py
@@ -548,7 +548,7 @@ def fallback_text_to_md(textlines, ignore_code: bool = False, clip=None):
for tl in textlines:
ltext = "|" + "|".join([s["text"].strip() for s in tl["spans"]]) + "|\n"
output += ltext
- output += "**----- End of picture text -----**
\n"
+ output += "\n**----- End of picture text -----**
\n"
return output + "\n\n"
@@ -631,7 +631,7 @@ def to_markdown(
continue
# pictures and formulas: either write image file or embed
- if btype in ("picture", "formula", "fallback"):
+ if btype in ("picture", "formula", "table-fallback"):
if isinstance(box.image, str):
output += GRAPHICS_TEXT % box.image + "\n\n"
elif isinstance(box.image, bytes):
@@ -650,7 +650,7 @@ def to_markdown(
ignore_code=ignore_code or page.full_ocred,
clip=clip,
)
- elif btype == "fallback":
+ elif btype == "table-fallback":
output += fallback_text_to_md(
box.textlines,
ignore_code=ignore_code or page.full_ocred,
@@ -741,7 +741,7 @@ def to_text(
continue
if btype == "page-footer" and footer is False:
continue
- if btype in ("picture", "formula", "fallback"):
+ if btype in ("picture", "formula", "table-fallback"):
output += f"==> picture [{clip.width} x {clip.height}] <==\n\n"
if box.textlines:
if btype == "picture":
@@ -750,7 +750,7 @@ def to_text(
ignore_code=ignore_code or page.full_ocred,
clip=clip,
)
- elif btype == "fallback":
+ elif btype == "table-fallback":
output += fallback_text_to_text(
box.textlines,
ignore_code=ignore_code or page.full_ocred,
@@ -1018,7 +1018,7 @@ def parse_document(
except Exception as e:
# print(f"table detection error '{e}' on page {page.number+1}")
- layoutbox.boxclass = "fallback"
+ layoutbox.boxclass = "table-fallback"
# table structure not detected: treat like an image
if document.embed_images or document.write_images:
pix = page.get_pixmap(clip=clip, dpi=document.image_dpi)
diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
index 4f3cc890..1a3a6546 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
@@ -101,7 +101,7 @@ def sanitize_spans(line):
):
continue # no joining
# We need to join bbox and text of two consecutive spans
- # On occasion, spans may also be duplicated.
+ # Sometimes, spans may also be duplicated.
if s0["text"] != s1["text"] or s0["bbox"] != s1["bbox"]:
s0["text"] += s1["text"]
s0["bbox"] |= s1["bbox"] # join boundary boxes
@@ -131,7 +131,8 @@ def sanitize_spans(line):
continue
for sno, s in enumerate(line["spans"]): # the numered spans
sbbox = pymupdf.Rect(s["bbox"]) # span bbox as a Rect
- if is_white(s["text"]): # ignore white text
+ if is_white(s["text"]):
+# ignore white text (NOTE(review): this condition does not check for Type3 fonts - confirm intent)
continue
# Ignore invisible text. Type 3 font text is never invisible.
if (
diff --git a/pymupdf4llm/pymupdf4llm/helpers/utils.py b/pymupdf4llm/pymupdf4llm/helpers/utils.py
index f6e261a8..c9beb15d 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/utils.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/utils.py
@@ -258,6 +258,10 @@ def clean_tables(page, blocks):
l for b in blocks if b["type"] == 0 for l in b["lines"] if l["bbox"] in bbox
]
y_vals0 = sorted(set(round(l["bbox"][3]) for l in lines))
+ if not y_vals0:
+ # no text lines in the table bbox
+ page.layout_information[i][4] = "table-fallback"
+ continue
y_vals = [y_vals0[0]]
for y in y_vals0[1:]:
if y - y_vals[-1] > 3:
diff --git a/pymupdf4llm/pymupdf4llm/versions_file.py b/pymupdf4llm/pymupdf4llm/versions_file.py
index f68a2df4..1e8e3c90 100644
--- a/pymupdf4llm/pymupdf4llm/versions_file.py
+++ b/pymupdf4llm/pymupdf4llm/versions_file.py
@@ -1,3 +1,3 @@
# Generated file - do not edit.
MINIMUM_PYMUPDF_VERSION = (1, 26, 6)
-VERSION = '0.2.5'
+VERSION = '0.2.6'
diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py
index 79c08042..18ebd2a6 100644
--- a/pymupdf4llm/setup.py
+++ b/pymupdf4llm/setup.py
@@ -11,7 +11,7 @@
"Topic :: Utilities",
]
-version = "0.2.5"
+version = "0.2.6"
pymupdf_version = "1.26.6"
pymupdf_version_tuple = tuple(int(x) for x in pymupdf_version.split("."))
requires = [f"pymupdf>={pymupdf_version}", "tabulate"]