diff --git a/CHANGES.md b/CHANGES.md
index 11325052..7e0d5289 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,15 +1,53 @@
# Change Log
-## Changes in version 0.0.28
+## Changes in version 0.2.1
### Fixes:
-* [xxx](https://github.com/pymupdf/RAG/issues/xxx) -
+* [320](https://github.com/pymupdf/RAG/issues/320) - [Bug] ValueError: min() iterable argument is empty ...
+* [319](https://github.com/pymupdf/RAG/issues/319) - [Bug] ValueError: min() arg is an empty sequence
### Other Changes:
-* xxx
+* OCR invocation now differentiates between full-page OCR and text-only OCR: If the page does contain text but the percentage of unreadable characters exceeds a certain threshold (90%), we only OCR text span boundary boxes and replace span text with OCR'ed text where necessary.
+------
+
+## Changes in version 0.2.0
+
+This version introduces full support of the [PyMuPDF-Layout](https://pypi.org/project/pymupdf-layout/) package. This entails a radically new approach for detecting the layout of document pages using the AI-based features of the layout package.
+
+Improvements include:
+
+* Greatly improved table detection
+* Support of list item hierarchy levels
+* Detection of page headers and footers
+* Improved detection of text paragraphs, titles and section headers
+* New output options beyond Markdown: plain text and JSON
+* Automatically detect whether a page needs OCR and invoke Tesseract if both Tesseract is installed and OpenCV (package [opencv-python](https://pypi.org/project/opencv-python/)) is available. Invocation criteria include absence of readable text, full-page coverage with images, and presence of many character-sized vector graphics.
+
+The PyMuPDF-Layout package is not open-source and has its own license, which is different from PyMuPDF4LLM. It also is dependent on a number of other, fairly large packages like [onnxruntime](https://pypi.org/project/onnxruntime/), [numpy](https://pypi.org/project/numpy/), [sympy](https://pypi.org/project/sympy/) and [OpenCV](https://pypi.org/project/opencv-python/), which each in turn have their own dependencies.
+
+We therefore keep the use of the layout feature optional. To activate PyMuPDF-Layout support, the following import statement **_must be included before_** importing PyMuPDF4LLM itself:
+
+```python
+import pymupdf.layout
+import pymupdf4llm
+```
+
+Thereafter, PyMuPDF's namespace is available. The known method `pymupdf4llm.to_markdown()` automatically works with AI-based empowerment.
+In addition, two new methods become available:
+* `pymupdf4llm.to_text()` - which works much like markdown output but produces plain text.
+* `pymupdf4llm.to_json()` - which outputs the document's metadata and the selected pages in JSON format.
+
+### Fixes:
+
+
+### Other Changes:
+
+* If `show_progress=True`, Python package [tqdm](https://pypi.org/project/tqdm/) is automatically used when available to display a progress bar. If tqdm is not installed, our own text-based progress bar is used.
+
+------
## Changes in version 0.0.27
diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py
index ba82a62c..2e434a8b 100644
--- a/pdf4llm/setup.py
+++ b/pdf4llm/setup.py
@@ -6,7 +6,7 @@
with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
readme = f.read()
-version = "0.2.0"
+version = "0.2.1"
classifiers = [
"Development Status :: 5 - Production/Stable",
diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py
index 8ca76f53..d8139ddc 100644
--- a/pymupdf4llm/pymupdf4llm/__init__.py
+++ b/pymupdf4llm/pymupdf4llm/__init__.py
@@ -26,6 +26,9 @@ def parse_document(
image_format="png",
image_path="",
pages=None,
+ output_images=True,
+ show_progress=False,
+ force_text=True,
):
return DL.parse_document(
doc,
@@ -34,6 +37,9 @@ def parse_document(
image_format=image_format,
image_path=image_path,
pages=pages,
+ output_images=output_images,
+ show_progress=show_progress,
+ force_text=force_text,
)
def to_markdown(
@@ -75,6 +81,9 @@ def to_markdown(
image_format=image_format,
image_path=image_path,
pages=pages,
+ output_images=embed_images or write_images,
+ show_progress=show_progress,
+ force_text=force_text,
)
return parsed_doc.to_markdown(
header=header,
@@ -82,6 +91,7 @@ def to_markdown(
write_images=write_images,
embed_images=embed_images,
ignore_code=ignore_code,
+ show_progress=show_progress,
)
def to_json(
@@ -92,6 +102,9 @@ def to_json(
image_format="png",
image_path="",
pages=None,
+ output_images=False,
+ show_progress=False,
+ force_text=True,
):
parsed_doc = parse_document(
doc,
@@ -99,6 +112,9 @@ def to_json(
image_format=image_format,
image_path=image_path,
pages=pages,
+ output_images=output_images,
+ show_progress=show_progress,
+ force_text=force_text,
)
return parsed_doc.to_json()
@@ -109,6 +125,8 @@ def to_text(
footer=True,
pages=None,
ignore_code=False,
+ show_progress=False,
+ force_text=True,
):
parsed_doc = parse_document(
doc,
@@ -117,11 +135,15 @@ def to_text(
image_format="png",
image_path="",
pages=pages,
+ output_images=False,
+ show_progress=show_progress,
+ force_text=force_text,
)
return parsed_doc.to_text(
header=header,
footer=footer,
ignore_code=ignore_code,
+ show_progress=show_progress,
)
diff --git a/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py b/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py
index f9ad27f8..37c658e6 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py
@@ -1,28 +1,42 @@
-import pymupdf # PyMuPDF
-import numpy as np
import cv2
+import numpy as np
+import pymupdf # PyMuPDF
+from pymupdf4llm.helpers.utils import WHITE_CHARS
-WHITE_CHARS = set(
- [chr(i) for i in range(33)]
- + [
- "\u00a0", # Non-breaking space
- "\u2000", # En quad
- "\u2001", # Em quad
- "\u2002", # En space
- "\u2003", # Em space
- "\u2004", # Three-per-em space
- "\u2005", # Four-per-em space
- "\u2006", # Six-per-em space
- "\u2007", # Figure space
- "\u2008", # Punctuation space
- "\u2009", # Thin space
- "\u200a", # Hair space
- "\u202f", # Narrow no-break space
- "\u205f", # Medium mathematical space
- "\u3000", # Ideographic space
- ]
-)
+def get_tessocr(page, bbox, dpi=300):
+    """Return the text that Tesseract OCR recognizes inside a page area.
+
+    Args:
+        page: pymupdf Page
+        bbox: pymupdf Rect or its sequence; the page area to OCR
+        dpi: resolution of the image that is rendered for OCR
+    Returns:
+        The OCR-ed text of the bbox as one line: line breaks are replaced
+        by spaces and leading / trailing white space is removed.
+    """
+    # Render only the bbox area, at the requested resolution.
+    pix = page.get_pixmap(dpi=dpi, clip=bbox)
+    # OCR the image; the result is opened as a PDF whose first page
+    # carries the recognized text.
+    ocrpdf = pymupdf.open("pdf", pix.pdfocr_tobytes())
+    try:
+        text = ocrpdf[0].get_text()
+    finally:
+        ocrpdf.close()  # always discard the temporary OCR PDF
+    return text.replace("\n", " ").strip()
+
+
+def repair_blocks(input_blocks, page):
+    """Replace unreadable span text by OCR-ed text.
+
+    Every span of a text block (type 0) whose text contains the Unicode
+    replacement character U+FFFD gets its text replaced by the OCR result
+    of the span's bbox. Spans are modified in place.
+
+    Args:
+        input_blocks: list of block dictionaries
+        page: pymupdf Page the blocks were extracted from
+    Returns:
+        A new list containing all input blocks in their original order.
+    """
+    unreadable = chr(0xFFFD)
+    result = []
+    for blk in input_blocks:
+        result.append(blk)  # every block is kept, repaired or not
+        if blk["type"] != 0:  # only text blocks can be repaired
+            continue
+        for textline in blk["lines"]:
+            for sp in textline["spans"]:
+                if unreadable in sp["text"]:
+                    sp["text"] = get_tessocr(page, sp["bbox"])
+    return result
def detect_qr_codes(img):
@@ -152,23 +166,38 @@ def should_ocr_page(
# Check for text
text = page.get_text(flags=0)
decision["has_text"] = not WHITE_CHARS.issuperset(text)
- if decision["has_text"]:
- not_readable_count = len([c for c in text if c == chr(0xFFFD)])
- readability = 1 - not_readable_count / len(text)
- decision["readable_text"] = readability >= text_readability_thresh
all_text_bboxes = [b for b in page.get_bboxlog() if "text" in b[0]]
ocr_text_bboxes = [b for b in all_text_bboxes if b[0] == "ignore-text"]
decision["has_ocr_text"] = bool(ocr_text_bboxes)
+
+ if decision["has_text"]:
+ unreadable_count = len([c for c in text if c == chr(0xFFFD)])
+ readability = 1 - unreadable_count / len(text)
+ decision["readable_text"] = readability >= text_readability_thresh
+
+ if decision["has_text"] and not decision["readable_text"]:
+ decision["should_ocr"] = True
+ decision["image"], decision["transform"], decision["pixmap"] = get_page_image(
+ page, dpi=dpi
+ )
+
+ if decision["has_text"]:
+ # early exit if any text exists
+ print(
+ f"{decision['has_text']=}, {decision['readable_text']=}, {decision['should_ocr']=}"
+ )
+ return decision
+
# Check for image coverage
- image_rects=[page_rect&img["bbox"] for img in page.get_image_info()]
- image_rect=pymupdf.EMPTY_RECT()
+ image_rects = [page_rect & img["bbox"] for img in page.get_image_info()]
+ image_rect = pymupdf.EMPTY_RECT()
for r in image_rects:
- image_rect|=r
- image_area=abs(image_rect)
+ image_rect |= r
+ image_area = abs(image_rect)
if image_area:
images_cover = image_area / page_area
- else:
+ else:
images_cover = 0.0
decision["image_covers_page"] = images_cover >= image_coverage_thresh
@@ -189,16 +218,11 @@ def should_ocr_page(
# Final decision
if (
- 1
- and not decision["has_text"]
- and not decision["readable_text"]
- and (
- 0
- or decision["image_covers_page"]
- or decision["has_vector_drawings"]
- or decision["edge_density"] > edge_thresh
- )
+ 0
+ or decision["image_covers_page"]
+ or decision["has_vector_drawings"]
+ or decision["edge_density"] > edge_thresh
):
decision["should_ocr"] = True
-
+
return decision
diff --git a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py
index 22ca33a3..a69f5dfe 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py
@@ -1,15 +1,19 @@
import base64
import json
import os
-from binascii import b2a_base64
from dataclasses import dataclass
-from typing import Dict, List, Optional, Union
from pathlib import Path
+from typing import Dict, List, Optional, Union
+
import pymupdf
import tabulate
-from pymupdf4llm.helpers.get_text_lines import get_raw_lines
from pymupdf4llm.helpers import utils
+from pymupdf4llm.helpers.get_text_lines import get_raw_lines
+try:
+ from tqdm import tqdm as ProgressBar
+except ImportError:
+ from pymupdf4llm.helpers.progress import ProgressBar
try:
import cv2
from pymupdf4llm.helpers import check_ocr
@@ -33,7 +37,7 @@
def omit_if_pua_char(text):
"""Check if character is in the Private Use Area (PUA) of Unicode."""
- if len(text) > 1:
+ if len(text) > 1: # only single characters are checked
return text
o = ord(text)
if (
@@ -46,8 +50,7 @@ def omit_if_pua_char(text):
def create_list_item_levels(layout_info):
- """Create a dictionary that maps the box number of each list-item to
- its hierarchy level.
+ """Map the layout box number of each list-item to its hierarchy level.
Args:
layout_info (list): the bbox list "page.layout_information"
@@ -59,10 +62,10 @@ def create_list_item_levels(layout_info):
segment = [] # current segment
# Create segments of contiguous list items. Each non-list-item finishes
- # the current segment. Also, if two list-items belong to different page
- # text columns ends a segment.
+ # the current segment. Also, two list-items in a row belonging to different
+ # page text columns end the segment after the first item.
for i, item in enumerate(layout_info):
- if item.boxclass != "list-item": # bbox class is not list-item
+ if item.boxclass != "list-item": # bbox class is no list-item
if segment: # end and save the current segment
segments.append(segment)
segment = []
@@ -83,7 +86,7 @@ def create_list_item_levels(layout_info):
# walk through segments and assign levels
for i, s in enumerate(segments):
- if not s:
+ if not s: # skip empty segments
continue
s.sort(key=lambda x: x[1].x0) # sort by x0 coordinate of the bbox
@@ -119,16 +122,15 @@ def is_monospaced(textlines):
def is_superscripted(line):
+ """Return True if the first span of the line has flag bit 0 set, or has a smaller origin y and a smaller font size than the second span."""
spans = line["spans"]
+ line_bbox = line["bbox"]
if not spans:
return False
- if spans[0]["flags"] & 1: # check for superscript
+ span0 = spans[0]
+ if span0["flags"] & 1: # check for superscript flag
return True
- if len(spans) < 2:
+ if len(spans) < 2: # single span line: skip
return False
- if (
- spans[0]["origin"][1] < spans[1]["origin"][1]
- and spans[0]["size"] < spans[1]["size"]
- ):
+ if span0["origin"][1] < spans[1]["origin"][1] and span0["size"] < spans[1]["size"]:
return True
return False
@@ -138,18 +140,18 @@ def get_plain_text(spans):
Parameter is a list of span dictionaries. The spans may come from
one or more original "textlines" items.
Returns the text string of the boundary box.
- The text string always ends with the suffix and a space
"""
output = ""
for i, s in enumerate(spans):
- span_text = s["text"].strip() # remove leading/trailing spaces
superscript = s["flags"] & 1
- span_text = s["text"].strip()
+ span_text = s["text"].strip() # remove leading/trailing spaces
if superscript:
+ # enclose superscripted text in brackets if first span
if i == 0:
span_text = f"[{span_text}] "
elif output.endswith(" "):
output = output[:-1]
+ # resolve hyphenation
if output.endswith("- ") and len(output.split()[-1]) > 2:
output = output[:-2]
output += span_text + " "
@@ -160,7 +162,7 @@ def list_item_to_text(textlines, level):
"""
Convert "list-item" bboxes to text.
"""
- indent = " " * (level - 1)
+ indent = " " * (level - 1) # indentation based on level
output = indent
line = textlines[0]
x0 = line["bbox"][0] # left of first line
@@ -195,11 +197,13 @@ def footnote_to_text(textlines):
"""
Convert "footnote" bboxes to text.
"""
+ # we render footnotes as blockquotes
output = "> "
line = textlines[0]
spans = line["spans"]
for line in textlines[1:]:
+ # superscripted line starts a new footnote line
if is_superscripted(line):
line_output = get_plain_text(spans)
output += line_output
@@ -214,7 +218,10 @@ def footnote_to_text(textlines):
def code_block_to_text(textlines):
- """Output a code block in plain text format."""
+ """Output a code block in plain text format.
+
+ Basic difference is that lines are separated by line breaks.
+ """
output = ""
for line in textlines:
line_text = ""
@@ -228,15 +235,17 @@ def code_block_to_text(textlines):
def text_to_text(textlines, ignore_code: bool = False):
"""
- Convert "text" bboxes to plain text, as well as other boxclasses
+ Convert "text" bboxes to plain text, as well as boxclasses
not specifically handled elsewhere.
- The line text is written without line breaks. At the end,
- two newlines are added to separate from the next block.
+ The text of all spans of all lines is written without line breaks.
+ At the end, two newlines are added to separate from the next block.
"""
if not textlines:
return ""
if is_superscripted(textlines[0]): # check for superscript
+ # handle mis-classified text boundary box
return footnote_to_text(textlines)
+ # handle completely monospaced textlines as code block
if not ignore_code and is_monospaced(textlines):
return code_block_to_text(textlines)
@@ -249,6 +258,47 @@ def text_to_text(textlines, ignore_code: bool = False):
return output + "\n\n"
+def picture_text_to_text(textlines, ignore_code: bool = False, clip=None):
+    """
+    Render the text lines found inside a picture as plain text.
+
+    The span texts of each line are joined by a space, stripped of
+    trailing white space and written on a line of their own. The result
+    is framed by a start and an end marker line and ends with an empty
+    line. Parameters "ignore_code" and "clip" are accepted but not used.
+    """
+    parts = ["----- Start of picture text -----\n"]
+    for textline in textlines:
+        joined = " ".join(span["text"] for span in textline["spans"])
+        parts.append(joined.rstrip() + "\n")
+    parts.append("----- End of picture text -----\n")
+    parts.append("\n")
+    return "".join(parts)
+
+
+def fallback_text_to_text(textlines, ignore_code: bool = False, clip=None):
+    """Convert text extracted from unrecognized tables to a text grid.
+
+    We hope for some sort of table structure being present in the text spans:
+    The maximum span count in the lines is assumed to equal column count.
+
+    Args:
+        textlines: list of line dictionaries, each with a "spans" list
+        ignore_code: accepted for signature compatibility, not used
+        clip: rectangle-like sequence of the box; only its left coordinate
+            is used. If None, no line is shifted to the second column.
+    Returns:
+        The table in tabulate's "grid" format followed by an empty line,
+        or "" if there is no span at all.
+    """
+    span_count = max((len(tl["spans"]) for tl in textlines), default=0)
+    if span_count == 0:  # nothing to tabulate (also avoids division by zero)
+        return ""
+
+    rows = []
+    for tl in textlines:
+        spans = tl["spans"]
+        # prepare a row with empty strings in each cell
+        row = [""] * span_count
+        # Heuristic: a line with fewer spans than columns that starts more
+        # than 10 units right of the box's left edge presumably has an
+        # empty first cell, so its spans are shifted by one column.
+        shifted = (
+            clip is not None
+            and bool(spans)
+            and len(spans) < span_count
+            and spans[0]["bbox"][0] > clip[0] + 10
+        )
+        for j, s in enumerate(spans, start=1 if shifted else 0):
+            row[j] = s["text"].strip()
+        rows.append(row)
+
+    tab_text = tabulate.tabulate(
+        rows,
+        tablefmt="grid",
+        maxcolwidths=int(100 / span_count),
+    )
+    return tab_text + "\n\n"
+
+
def get_styled_text(spans):
"""Output text with markdown style codes based on font properties.
Parameter is a list of span dictionaries. The spans may come from
@@ -325,7 +375,7 @@ def list_item_to_md(textlines, level):
This post-layout heuristics helps cover cases where more than
one list item is contained in a single bbox.
"""
- indent = " " * (level - 1)
+ indent = " " * (level - 1) # indentation based on level
line = textlines[0]
x0 = line["bbox"][0] # left of first line
spans = line["spans"]
@@ -337,6 +387,7 @@ def list_item_to_md(textlines, level):
starter = "1. "
if not omit_if_pua_char(span0["text"].strip()):
+ # bullet was a PUA char: remove it
spans.pop(0)
if spans:
x0 = spans[0]["bbox"][0]
@@ -452,6 +503,33 @@ def text_to_md(textlines, ignore_code: bool = False):
return output + "\n\n"
+def picture_text_to_md(textlines, ignore_code: bool = False, clip=None):
+    """
+    Convert text extracted from images to markdown format.
+
+    The span texts of each line are joined by a space and stripped of
+    trailing white space. The lines are framed by a bold start and a bold
+    end marker. Parameters "ignore_code" and "clip" are accepted but not
+    used.
+    """
+    # NOTE(review): in the patch text these literals were broken across
+    # lines; they are restored here as HTML line breaks "<br>" - confirm
+    # against the upstream source.
+    output = "**----- Start of picture text -----**<br>\n"
+    for tl in textlines:
+        line_text = " ".join([s["text"] for s in tl["spans"]])
+        output += line_text.rstrip() + "<br>"
+    output += "**----- End of picture text -----**<br>\n"
+    return output + "\n\n"
+
+
+def fallback_text_to_md(textlines, ignore_code: bool = False, clip=None):
+    """
+    Convert text extracted from unrecognized tables to a markdown table.
+
+    The maximum span count of the lines is taken as the column count.
+    An empty header row and a separator row are written, then one table
+    row per line with one cell per span. The table is framed by a bold
+    start and a bold end marker. Parameters "ignore_code" and "clip" are
+    accepted but not used. Returns "" if there are no text lines.
+    """
+    if not textlines:  # max() below would fail on an empty sequence
+        return ""
+    span_count = max(len(tl["spans"]) for tl in textlines)
+    # NOTE(review): in the patch text the marker literals were broken
+    # across lines; they are restored here as HTML line breaks "<br>" -
+    # confirm against the upstream source.
+    output = "**----- Start of picture text -----**<br>\n"
+    output += "|" * (span_count + 1) + "\n"  # header row with empty cells
+    output += "|" + "|".join(["---"] * span_count) + "|\n"
+    for tl in textlines:
+        ltext = "|" + "|".join([s["text"].strip() for s in tl["spans"]]) + "|\n"
+        output += ltext
+    output += "**----- End of picture text -----**<br>\n"
+    return output + "\n\n"
+
+
@dataclass
class LayoutBox:
x0: float
@@ -502,42 +580,68 @@ def to_markdown(
write_images: bool = False,
embed_images: bool = False,
ignore_code: bool = False,
+ show_progress: bool = False,
) -> str:
"""
Serialize ParsedDocument to markdown text.
"""
output = ""
- for page in self.pages:
+ if show_progress and len(self.pages) > 5:
+ print(f"Generating markdown text...")
+ this_iterator = ProgressBar(self.pages)
+ else:
+ this_iterator = self.pages
+ for page in this_iterator:
- # make mapping: box number to list item level
+ # Make a mapping: box number -> list item hierarchy level
list_item_levels = create_list_item_levels(page.boxes)
+
for i, box in enumerate(page.boxes):
clip = pymupdf.IRect(box.x0, box.y0, box.x1, box.y1)
btype = box.boxclass
+
+ # skip headers/footers if requested
if btype == "page-header" and header is False:
continue
if btype == "page-footer" and footer is False:
continue
- if btype in ("picture", "formula") and box.image:
- img_filename = f"{self.filename}-{page.page_number:04d}-{i:02d}.{self.image_format}"
- if write_images:
- filename = os.path.basename(self.filename).replace(" ", "-")
- image_filename = os.path.join(
- self.image_path,
- f"{filename}-{page.page_number:04d}-{i:02d}.{self.image_format}",
- )
- Path(image_filename).write_bytes(box.image)
-
- output += GRAPHICS_TEXT % img_filename
-
- elif embed_images:
- # make a base64 encoded string of the image
- data = b2a_base64(box.image).decode()
- data = f"data:image/{self.image_format};base64," + data
- output += GRAPHICS_TEXT % data + "\n\n"
+ # pictures and formulas: either write image file or embed
+ if btype in ("picture", "formula", "fallback"):
+ if box.image:
+ if write_images:
+ img_filename = f"{self.filename}-{page.page_number:04d}-{i:02d}.{self.image_format}"
+ filename = os.path.basename(self.filename).replace(" ", "-")
+ image_filename = os.path.join(
+ self.image_path,
+ f"{filename}-{page.page_number:04d}-{i:02d}.{self.image_format}",
+ )
+ Path(image_filename).write_bytes(box.image)
+
+ output += GRAPHICS_TEXT % img_filename
+
+ elif embed_images:
+ # make a base64 encoded string of the image
+ data = base64.b64encode(box.image).decode()
+ data = f"data:image/{self.image_format};base64," + data
+ output += GRAPHICS_TEXT % data + "\n\n"
else:
- output += f"**==> {btype} [{clip.width} x {clip.height}] intentionally omitted <==**\n\n"
+ output += f"**==> picture [{clip.width} x {clip.height}] intentionally omitted <==**\n\n"
+
+ # output text in image if requested
+ if box.textlines:
+ if btype == "picture":
+ output += picture_text_to_md(
+ box.textlines,
+ ignore_code=ignore_code or page.ocrpage,
+ clip=clip,
+ )
+ elif btype == "fallback":
+ output += fallback_text_to_md(
+ box.textlines,
+ ignore_code=ignore_code or page.ocrpage,
+ clip=clip,
+ )
continue
if btype == "table":
output += box.table["markdown"] + "\n\n"
@@ -564,7 +668,7 @@ def to_markdown(
return output
- def to_json(self) -> str:
+ def to_json(self, show_progress=False) -> str:
# Serialize to JSON
class LayoutEncoder(json.JSONEncoder):
def default(self, s):
@@ -593,13 +697,19 @@ def to_text(
header: bool = True,
footer: bool = True,
ignore_code: bool = False,
+ show_progress: bool = False,
) -> str:
"""
Serialize ParsedDocument to plain text. Optionally omit page headers or footers.
"""
# Flatten all text boxes into plain text
output = ""
- for page in self.pages:
+ if show_progress and len(self.pages) > 5:
+ print(f"Generating plain text ..")
+ this_iterator = ProgressBar(self.pages)
+ else:
+ this_iterator = self.pages
+ for page in this_iterator:
list_item_levels = create_list_item_levels(page.boxes)
for i, box in enumerate(page.boxes):
clip = pymupdf.IRect(box.x0, box.y0, box.x1, box.y1)
@@ -608,8 +718,21 @@ def to_text(
continue
if btype == "page-footer" and footer is False:
continue
- if btype in ("picture", "formula"):
- output += f"==> {btype} [{clip.width} x {clip.height}] <==\n\n"
+ if btype in ("picture", "formula", "fallback"):
+ output += f"==> picture [{clip.width} x {clip.height}] <==\n\n"
+ if box.textlines:
+ if btype == "picture":
+ output += picture_text_to_text(
+ box.textlines,
+ ignore_code=ignore_code or page.ocrpage,
+ clip=clip,
+ )
+ elif btype == "fallback":
+ output += fallback_text_to_text(
+ box.textlines,
+ ignore_code=ignore_code or page.ocrpage,
+ clip=clip,
+ )
continue
if btype == "table":
output += (
@@ -637,6 +760,9 @@ def parse_document(
image_format="png",
image_path="",
pages=None,
+ show_progress=False,
+ output_images=True,
+ force_text=False,
) -> ParsedDocument:
if isinstance(doc, pymupdf.Document):
mydoc = doc
@@ -651,6 +777,7 @@ def parse_document(
document.image_format = image_format
document.image_path = image_path
document.pages = []
+ document.force_text = force_text
try:
reason = "OpenCV not installed"
assert cv2 is not None
@@ -677,6 +804,9 @@ def parse_document(
raise ValueError(
"'pages' parameter must be None, int, or a sequence of ints less than page count"
)
+ if show_progress and len(page_filter) > 5:
+ print(f"Parsing {len(page_filter)} pages of '{document.filename}'...")
+ page_filter = ProgressBar(page_filter)
for pno in page_filter:
page = mydoc.load_page(pno)
@@ -687,28 +817,36 @@ def parse_document(
decision = {"should_ocr": False}
if decision["should_ocr"]:
print(f"Performing OCR on {page.number=}[{page.number+1}]...")
- pix = decision["pixmap"] # retrieve the Pixmap
- pdf_data = pix.pdfocr_tobytes() # OCR it
- ocr_pdf = pymupdf.open("pdf", pdf_data) # get the OCR'd PDF
- ocrpage = ocr_pdf[0] # this is its OCR'd page
- # remove everything except the text
- ocrpage.add_redact_annot(ocrpage.rect)
- ocrpage.apply_redactions(
- images=pymupdf.PDF_REDACT_IMAGE_REMOVE,
- graphics=pymupdf.PDF_REDACT_LINE_ART_NONE,
- text=pymupdf.PDF_REDACT_TEXT_NONE,
- )
- # copy text over to original page
- page.show_pdf_page(page.rect, ocr_pdf, 0)
- ocr_pdf.close() # discard temporary OCR PDF
- del ocr_pdf
+ if not decision.get("has_text"):
+ pix = decision["pixmap"] # retrieve the Pixmap
+ pdf_data = pix.pdfocr_tobytes() # OCR it
+ ocr_pdf = pymupdf.open("pdf", pdf_data) # get the OCR'd PDF
+ ocrpage = ocr_pdf[0] # this is its OCR'd page
+ # remove everything except the text
+ ocrpage.add_redact_annot(ocrpage.rect)
+ ocrpage.apply_redactions(
+ images=pymupdf.PDF_REDACT_IMAGE_REMOVE,
+ graphics=pymupdf.PDF_REDACT_LINE_ART_NONE,
+ text=pymupdf.PDF_REDACT_TEXT_NONE,
+ )
+ # copy text over to original page
+ page.show_pdf_page(page.rect, ocr_pdf, 0)
+ ocr_pdf.close() # discard temporary OCR PDF
+ del ocr_pdf
+ textpage = page.get_textpage(flags=FLAGS)
+ blocks = textpage.extractDICT()["blocks"]
+ else:
+ textpage = page.get_textpage(flags=FLAGS)
+ blocks = textpage.extractDICT()["blocks"]
+ blocks = check_ocr.repair_blocks(blocks, page)
+ else:
+ textpage = page.get_textpage(flags=FLAGS)
+ blocks = textpage.extractDICT()["blocks"]
bboxlog = page.get_bboxlog()
- ocrpage = decision["should_ocr"] or (
+ ocrpage = (
set([b[0] for b in bboxlog if b[0] == "ignore-text"]) == CHECK_OCR_TEXT
)
- textpage = page.get_textpage(flags=FLAGS)
- blocks = textpage.extractDICT()["blocks"]
page.get_layout()
utils.clean_pictures(page, blocks)
utils.add_image_orphans(page, blocks)
@@ -749,8 +887,23 @@ def parse_document(
clip = pymupdf.Rect(box[:4])
if layoutbox.boxclass in ("picture", "formula"):
- pix = page.get_pixmap(clip=clip, dpi=document.image_dpi)
- layoutbox.image = pix.tobytes(document.image_format)
+ if output_images:
+ pix = page.get_pixmap(clip=clip, dpi=document.image_dpi)
+ layoutbox.image = pix.tobytes(document.image_format)
+ else:
+ layoutbox.image = None
+ if layoutbox.boxclass == "picture" and document.force_text:
+ # extract any text within the image box
+ layoutbox.textlines = [
+ {"bbox": l[0], "spans": l[1]}
+ for l in get_raw_lines(
+ textpage=None,
+ blocks=pagelayout.fulltext,
+ clip=clip,
+ ignore_invisible=not ocrpage,
+ only_horizontal=False,
+ )
+ ]
elif layoutbox.boxclass == "table":
# This is either a table detected by native TableFinder or by
@@ -791,11 +944,23 @@ def parse_document(
)
except Exception as e:
- print(f"table detection error '{e}' on page {page.number+1}")
+ # print(f"table detection error '{e}' on page {page.number+1}")
+ layoutbox.boxclass = "fallback"
# table structure not detected: treat like an image
- pix = page.get_pixmap(clip=clip, dpi=document.image_dpi)
- layoutbox.image = pix.tobytes(document.image_format)
- layoutbox.boxclass = "picture"
+ if output_images:
+ pix = page.get_pixmap(clip=clip, dpi=document.image_dpi)
+ layoutbox.image = pix.tobytes(document.image_format)
+ else:
+ layoutbox.image = None
+ layoutbox.textlines = [
+ {"bbox": l[0], "spans": l[1]}
+ for l in get_raw_lines(
+ textpage=None,
+ blocks=pagelayout.fulltext,
+ clip=clip,
+ ignore_invisible=not ocrpage,
+ )
+ ]
else:
# Handle text-like box classes:
# Extract text line information within the box.
diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
index f3ef2c94..4f3cc890 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
@@ -12,17 +12,16 @@
License GNU Affero GPL 3.0
"""
-import string
import sys
import pymupdf
+from pymupdf4llm.helpers.utils import WHITE_CHARS
-WHITE = set(string.whitespace)
TYPE3_FONT_NAME = "Unnamed-T3"
def is_white(text):
- return WHITE.issuperset(text)
+ return WHITE_CHARS.issuperset(text)
def get_raw_lines(
@@ -31,6 +30,7 @@ def get_raw_lines(
clip=None,
tolerance=3,
ignore_invisible=True,
+ only_horizontal=True,
):
"""Extract the text spans from a TextPage in natural reading sequence.
@@ -124,7 +124,10 @@ def sanitize_spans(line):
spans = [] # all spans in TextPage here
for bno, b in enumerate(blocks): # the numbered blocks
for lno, line in enumerate(b["lines"]): # the numbered lines
- if abs(1 - line["dir"][0]) > 1e-3: # only accept horizontal text
+ line_dir = line["dir"]
+ if (
+ only_horizontal and abs(1 - line_dir[0]) > 1e-3
+ ): # only accept horizontal text
continue
for sno, s in enumerate(line["spans"]): # the numered spans
sbbox = pymupdf.Rect(s["bbox"]) # span bbox as a Rect
@@ -150,12 +153,13 @@ def sanitize_spans(line):
# include line/block numbers to facilitate separator insertion
s["line"] = lno
s["block"] = bno
+ s["dir"] = line_dir
spans.append(s)
if not spans: # no text at all
return []
- spans.sort(key=lambda s: s["bbox"].y1) # sort spans by bottom coord
+ spans.sort(key=lambda s: (-s["dir"][0], s["bbox"].y1)) # sort spans by bottom coord
nlines = [] # final result
line = [spans[0]] # collects spans with fitting vertical coordinates
lrect = spans[0]["bbox"] # rectangle joined from span rectangles
diff --git a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py
index 4cdd8097..63b966c7 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py
@@ -60,9 +60,8 @@
License GNU Affero GPL 3.0
"""
-import string
-
import pymupdf
+from pymupdf4llm.helpers.utils import WHITE_CHARS
pymupdf.TOOLS.unset_quad_corrections(True)
@@ -88,11 +87,10 @@ def column_boxes(
paths: use these drawings instead of extracting here
avoid: ignore text in any of these areas
"""
- WHITE = set(string.whitespace)
def is_white(text):
"""Check for relevant text."""
- return WHITE.issuperset(text)
+ return WHITE_CHARS.issuperset(text)
def in_bbox(bb, bboxes):
"""Return 1-based number if a bbox contains bb, else return 0."""
diff --git a/pymupdf4llm/pymupdf4llm/helpers/progress.py b/pymupdf4llm/pymupdf4llm/helpers/progress.py
index e71e601a..7cbb2eac 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/progress.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/progress.py
@@ -29,13 +29,9 @@ def __init__(self, items: List[Any], progress_width: int = 40):
self._increment = self._progress_width / self._len if self._len else 1
# Init progress bar
- sys.stdout.write(
- "[%s] (0/%d)" % (" " * self._progress_width, self._len)
- )
+ sys.stdout.write("[%s] (0/%d)" % (" " * self._progress_width, self._len))
sys.stdout.flush()
- sys.stdout.write(
- "\b" * (self._progress_width + len(str(self._len)) + 6)
- )
+ sys.stdout.write("\b" * (self._progress_width + len(str(self._len)) + 6))
def __iter__(self):
return self
@@ -61,9 +57,7 @@ def __next__(self):
# Update the numerical progress
padded_index = str(self._current_index).rjust(self._len_digits)
progress_info = f" ({padded_index}/{self._len})"
- sys.stdout.write(
- "\b" * (self._progress_width + len(progress_info) + 1)
- )
+ sys.stdout.write("\b" * (self._progress_width + len(progress_info) + 1))
sys.stdout.write("[")
sys.stdout.write(
"=" * int(self._current_index * self._progress_width / self._len)
diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
index f0c76e6b..d0f2e9aa 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
@@ -45,26 +45,17 @@
from pymupdf import mupdf
from pymupdf4llm.helpers.get_text_lines import get_raw_lines, is_white
from pymupdf4llm.helpers.multi_column import column_boxes
-from pymupdf4llm.helpers.progress import ProgressBar
+from pymupdf4llm.helpers.utils import BULLETS
+
+try:
+ from tqdm import tqdm as ProgressBar
+except ImportError:
+ from pymupdf4llm.helpers.progress import ProgressBar
pymupdf.TOOLS.unset_quad_corrections(True)
-# Characters recognized as bullets when starting a line.
-bullet = tuple(
- [
- "- ",
- "* ",
- "> ",
- chr(0xB6),
- chr(0xB7),
- chr(8224),
- chr(8225),
- chr(8226),
- chr(0xF0A7),
- chr(0xF0B7),
- ]
- + list(map(chr, range(9632, 9680)))
-)
+# Characters assumed as bullets when starting a line.
+bullet = tuple(BULLETS | {"- ", "* ", "> "})
GRAPHICS_TEXT = "\n\n"
@@ -1116,16 +1107,16 @@ def get_page_output(
# layout analysis. Treat whole page as one text block.
text_rects = [parms.clip]
else:
- text_rects = column_boxes(
- parms.page,
- paths=parms.actual_paths,
- no_image_text=not force_text,
- textpage=parms.textpage,
- avoid=parms.tab_rects0 + parms.vg_clusters0,
- footer_margin=margins[3],
- header_margin=margins[1],
- ignore_images=IGNORE_IMAGES,
- )
+ text_rects = column_boxes(
+ parms.page,
+ paths=parms.actual_paths,
+ no_image_text=not force_text,
+ textpage=parms.textpage,
+ avoid=parms.tab_rects0 + parms.vg_clusters0,
+ footer_margin=margins[3],
+ header_margin=margins[1],
+ ignore_images=IGNORE_IMAGES,
+ )
"""
------------------------------------------------------------------
diff --git a/pymupdf4llm/pymupdf4llm/helpers/utils.py b/pymupdf4llm/pymupdf4llm/helpers/utils.py
index 9df5a3e0..f25fadb1 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/utils.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/utils.py
@@ -1,6 +1,38 @@
import pymupdf
-white_spaces = set([chr(i) for i in range(33)]) | {0xA0, 0x2002, 0x2003, 0x2009, 0x202F}
+WHITE_CHARS = set(
+ [chr(i) for i in range(33)]
+ + [
+ "\u00a0", # Non-breaking space
+ "\u2000", # En quad
+ "\u2001", # Em quad
+ "\u2002", # En space
+ "\u2003", # Em space
+ "\u2004", # Three-per-em space
+ "\u2005", # Four-per-em space
+ "\u2006", # Six-per-em space
+ "\u2007", # Figure space
+ "\u2008", # Punctuation space
+ "\u2009", # Thin space
+ "\u200a", # Hair space
+ "\u202f", # Narrow no-break space
+ "\u205f", # Medium mathematical space
+ "\u3000", # Ideographic space
+ ]
+)
+
+BULLETS = set(
+ [
+ chr(0xB6),
+ chr(0xB7),
+ chr(0x2020),
+ chr(0x2021),
+ chr(0x2022),
+ chr(0xF0A7),
+ chr(0xF0B7),
+ ]
+ + list(map(chr, range(0x25A0, 0x2600)))
+)
def table_cleaner(page, blocks, tbbox):
@@ -156,19 +188,29 @@ def add_image_orphans(page, blocks):
images = []
for img in page.get_image_info():
r = page.rect & img["bbox"]
+ if r.width <= 3 or r.height <= 3:
+ continue
if r.is_empty or abs(r) >= area_limit:
continue
images.append(r)
paths = []
- for b in blocks:
- if b["type"] != 3:
- continue
- r = page.rect & b["bbox"]
+ vectors = sorted(
+ [
+ page.rect & b["bbox"]
+ for b in blocks
+ if b["type"] == 3
+ and b["bbox"][3] - b["bbox"][1] > 3
+ and b["bbox"][2] - b["bbox"][0] > 3
+ ],
+ key=lambda v: abs(v),
+ reverse=True,
+ )
+ vectors = vectors[:500]
+
+ for r in vectors:
if abs(r) >= area_limit:
continue
- if r.width < 3 and r.height < 3:
- continue
r_low_limit = 0.1 * abs(r)
r_hi_limit = 0.8 * abs(r)
@@ -186,7 +228,7 @@ def add_image_orphans(page, blocks):
# resolve mutual containment of images and vectors
imgs = sorted(images + vectors, key=lambda r: abs(r), reverse=True)
-
+ imgs = imgs[:500]
filtered_imgs = []
for r in imgs:
if not any(r in fr for fr in filtered_imgs):
@@ -405,27 +447,35 @@ def filter_contained(boxes) -> list:
body_boxes.append(box)
# compute joined boxes of body
- joined_boxes = pymupdf.Rect(
- min(b[0] for b in body_boxes),
- min(b[1] for b in body_boxes),
- max(b[2] for b in body_boxes),
- max(b[3] for b in body_boxes),
- )
+ if not body_boxes:
+ joined_boxes = pymupdf.EMPTY_RECT()
+ else:
+ joined_boxes = pymupdf.Rect(
+ min(b[0] for b in body_boxes),
+ min(b[1] for b in body_boxes),
+ max(b[2] for b in body_boxes),
+ max(b[3] for b in body_boxes),
+ )
# extract vectors contained in the TextPage
- min_bbox_height = min(b[3] - b[1] for b in body_boxes)
- vectors = [
- pymupdf.Rect(b["bbox"])
- for b in blocks
- if b["bbox"][3] - b["bbox"][1] >= min_bbox_height and b["bbox"] in joined_boxes
- ]
- # bring body into reading order
- ordered = compute_reading_order(
- body_boxes,
- joined_boxes,
- vectors,
- vertical_gap=this_vertical_gap,
- )
+ if not joined_boxes.is_empty:
+ min_bbox_height = min(b[3] - b[1] for b in body_boxes)
+ vectors = [
+ pymupdf.Rect(b["bbox"])
+ for b in blocks
+ if b["bbox"][3] - b["bbox"][1] >= min_bbox_height
+ and b["bbox"] in joined_boxes
+ ]
+ # bring body into reading order
+ ordered = compute_reading_order(
+ body_boxes,
+ joined_boxes,
+ vectors,
+ vertical_gap=this_vertical_gap,
+ )
+ else:
+ ordered = []
+
# Final full boxes list. We do simple sorts for non-body boxes.
final = (
sorted(page_headers, key=lambda r: (r[1], r[0]))
@@ -633,7 +683,7 @@ def outside_cell(bbox, cell):
bbox = pymupdf.Rect(char["bbox"])
if abs(bbox & cell) > 0.5 * abs(bbox):
span_text += this_char
- elif this_char in white_spaces:
+ elif this_char in WHITE_CHARS:
span_text += " "
if not span_text:
diff --git a/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py b/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py
index b178d996..e2217498 100644
--- a/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py
+++ b/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py
@@ -21,9 +21,7 @@ class PDFMarkdownReader(BaseReader):
def __init__(
self,
- meta_filter: Optional[
- Callable[[Dict[str, Any]], Dict[str, Any]]
- ] = None,
+ meta_filter: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
):
self.meta_filter = meta_filter
@@ -79,15 +77,14 @@ def _process_doc_page(
**load_kwargs: Any,
):
"""Processes a single page of a PDF document."""
- extra_info = self._process_doc_meta(
- doc, file_path, page_number, extra_info
- )
+ extra_info = self._process_doc_meta(doc, file_path, page_number, extra_info)
if self.meta_filter:
extra_info = self.meta_filter(extra_info)
text = to_markdown(
- doc, pages=[page_number],
+ doc,
+ pages=[page_number],
hdr_info=hdr_info,
**load_kwargs,
)
diff --git a/pymupdf4llm/pymupdf4llm/versions_file.py b/pymupdf4llm/pymupdf4llm/versions_file.py
index afbe9821..f6a11125 100644
--- a/pymupdf4llm/pymupdf4llm/versions_file.py
+++ b/pymupdf4llm/pymupdf4llm/versions_file.py
@@ -1,3 +1,3 @@
# Generated file - do not edit.
MINIMUM_PYMUPDF_VERSION = (1, 26, 6)
-VERSION = '0.2.0'
+VERSION = '0.2.1'
diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py
index 5c7cd2c6..ca3ca28a 100644
--- a/pymupdf4llm/setup.py
+++ b/pymupdf4llm/setup.py
@@ -14,7 +14,7 @@
"Topic :: Utilities",
]
-version = "0.2.0"
+version = "0.2.1"
requires = ["pymupdf>=1.26.6", "tabulate"]
text = requires[0].split("=")[1]