Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 41 additions & 3 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,53 @@
# Change Log

## Changes in version 0.0.28
## Changes in version 0.2.1

### Fixes:

* [xxx](https://github.com/pymupdf/RAG/issues/xxx) -
* [320](https://github.com/pymupdf/RAG/issues/320) - [Bug] ValueError: min() iterable argument is empty ...
* [319](https://github.com/pymupdf/RAG/issues/319) - [Bug] ValueError: min() arg is an empty sequence

### Other Changes:

* xxx
* OCR invocation now differentiates between full-page OCR and text-only OCR: if the page does contain text but the percentage of unreadable characters exceeds a certain threshold (90%), we only OCR the bounding boxes of text spans and replace span text with OCR'ed text where necessary.

------

## Changes in version 0.2.0

This version introduces full support of the [PyMuPDF-Layout](https://pypi.org/project/pymupdf-layout/) package. This entails a radically new approach for detecting the layout of document pages using the AI-based features of the layout package.

Improvements include:

* Greatly improved table detection
* Support of list item hierarchy levels
* Detection of page headers and footers
* Improved detection of text paragraphs, titles and section headers
* New output options beyond Markdown: plain text and JSON
* Automatically detect whether a page needs OCR and invoke Tesseract if both Tesseract is installed and OpenCV (package [opencv-python](https://pypi.org/project/opencv-python/)) is available. Invocation criteria include the absence of readable text, full-page coverage with images, and the presence of many character-sized vector graphics.

The PyMuPDF-Layout package is not open-source and has its own license, which is different from PyMuPDF4LLM. It also is dependent on a number of other, fairly large packages like [onnxruntime](https://pypi.org/project/onnxruntime/), [numpy](https://pypi.org/project/numpy/), [sympy](https://pypi.org/project/sympy/) and [OpenCV](https://pypi.org/project/opencv-python/), which each in turn have their own dependencies.

We therefore keep the use of the layout feature optional. To activate PyMuPDF-Layout support the following import statement **_must be included before_** importing PyMuPDF4LLM itself:

```python
import pymupdf.layout
import pymupdf4llm
```

Thereafter, PyMuPDF's namespace is available. The known method `pymupdf4llm.to_markdown()` automatically works with AI-based empowerment.
In addition, two new methods become available:
* `pymupdf4llm.to_text()` - which works much like markdown output but produces plain text.
* `pymupdf4llm.to_json()` - which outputs the document's metadata and the selected pages in JSON format.

### Fixes:


### Other Changes:

* If `show_progress=True`, Python package [tqdm](https://pypi.org/project/tqdm/) is automatically used when available to display a progress bar. If tqdm is not installed, our own text-based progress bar is used.

------

## Changes in version 0.0.27

Expand Down
2 changes: 1 addition & 1 deletion pdf4llm/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
readme = f.read()

version = "0.2.0"
version = "0.2.1"

classifiers = [
"Development Status :: 5 - Production/Stable",
Expand Down
22 changes: 22 additions & 0 deletions pymupdf4llm/pymupdf4llm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ def parse_document(
image_format="png",
image_path="",
pages=None,
output_images=True,
show_progress=False,
force_text=True,
):
return DL.parse_document(
doc,
Expand All @@ -34,6 +37,9 @@ def parse_document(
image_format=image_format,
image_path=image_path,
pages=pages,
output_images=output_images,
show_progress=show_progress,
force_text=force_text,
)

def to_markdown(
Expand Down Expand Up @@ -75,13 +81,17 @@ def to_markdown(
image_format=image_format,
image_path=image_path,
pages=pages,
output_images=embed_images or write_images,
show_progress=show_progress,
force_text=force_text,
)
return parsed_doc.to_markdown(
header=header,
footer=footer,
write_images=write_images,
embed_images=embed_images,
ignore_code=ignore_code,
show_progress=show_progress,
)

def to_json(
Expand All @@ -92,13 +102,19 @@ def to_json(
image_format="png",
image_path="",
pages=None,
output_images=False,
show_progress=False,
force_text=True,
):
parsed_doc = parse_document(
doc,
image_dpi=image_dpi,
image_format=image_format,
image_path=image_path,
pages=pages,
output_images=output_images,
show_progress=show_progress,
force_text=force_text,
)
return parsed_doc.to_json()

Expand All @@ -109,6 +125,8 @@ def to_text(
footer=True,
pages=None,
ignore_code=False,
show_progress=False,
force_text=True,
):
parsed_doc = parse_document(
doc,
Expand All @@ -117,11 +135,15 @@ def to_text(
image_format="png",
image_path="",
pages=pages,
output_images=False,
show_progress=show_progress,
force_text=force_text,
)
return parsed_doc.to_text(
header=header,
footer=footer,
ignore_code=ignore_code,
show_progress=show_progress,
)


Expand Down
106 changes: 65 additions & 41 deletions pymupdf4llm/pymupdf4llm/helpers/check_ocr.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,42 @@
import pymupdf # PyMuPDF
import numpy as np
import cv2
import numpy as np
import pymupdf # PyMuPDF
from pymupdf4llm.helpers.utils import WHITE_CHARS


# Characters regarded as white space: every code point from 0 through 32
# (ASCII control characters and the plain space) plus the Unicode space
# characters enumerated below.
WHITE_CHARS = {chr(code) for code in range(33)} | {
    "\u00a0",  # Non-breaking space
    "\u2000",  # En quad
    "\u2001",  # Em quad
    "\u2002",  # En space
    "\u2003",  # Em space
    "\u2004",  # Three-per-em space
    "\u2005",  # Four-per-em space
    "\u2006",  # Six-per-em space
    "\u2007",  # Figure space
    "\u2008",  # Punctuation space
    "\u2009",  # Thin space
    "\u200a",  # Hair space
    "\u202f",  # Narrow no-break space
    "\u205f",  # Medium mathematical space
    "\u3000",  # Ideographic space
}
def get_tessocr(page, bbox, dpi=300):
    """Return OCR-ed span text using Tesseract.

    Args:
        page: pymupdf Page
        bbox: pymupdf Rect or its sequence
        dpi: resolution for OCR image
    Returns:
        The OCR-ed text of the bbox as a single line: newlines are replaced
        by spaces and leading / trailing white space is stripped.
    """
    # Render only the bbox area of the page at the requested resolution.
    pix = page.get_pixmap(dpi=dpi, clip=bbox)
    # Let the pixmap produce an OCR-ed PDF in memory and read back the text
    # of its first page. The context manager closes the temporary document
    # again, so repeated calls (one per span) do not accumulate open
    # documents.
    with pymupdf.open("pdf", pix.pdfocr_tobytes()) as ocrpdf:
        text = ocrpdf[0].get_text()
    return text.replace("\n", " ").strip()


def repair_blocks(input_blocks, page):
    """Replace unreadable span text in text blocks with OCR-ed text.

    A span counts as unreadable if its text contains the Unicode
    replacement character U+FFFD. The text of such a span is replaced by
    the result of running OCR over the span's bbox on the given page.

    Args:
        input_blocks: iterable of block dictionaries. Blocks with
            ``block["type"] == 0`` are text blocks and must provide
            ``"lines"`` -> ``"spans"`` -> ``"text"`` / ``"bbox"``.
        page: the page the blocks belong to; passed on to the OCR helper.
    Returns:
        A new list containing the same block objects in the same order.
        Note that affected spans are modified in place, so the input
        blocks reflect the repaired text as well.
    """
    unreadable = chr(0xFFFD)
    repaired_blocks = []
    for block in input_blocks:
        if block["type"] == 0:  # text block
            for line in block["lines"]:
                for span in line["spans"]:
                    if unreadable not in span["text"]:
                        continue  # span is readable: keep its text
                    span["text"] = get_tessocr(page, span["bbox"])
        # every block is passed through, whether repaired or not
        repaired_blocks.append(block)
    return repaired_blocks


def detect_qr_codes(img):
Expand Down Expand Up @@ -152,23 +166,38 @@ def should_ocr_page(
# Check for text
text = page.get_text(flags=0)
decision["has_text"] = not WHITE_CHARS.issuperset(text)
if decision["has_text"]:
not_readable_count = len([c for c in text if c == chr(0xFFFD)])
readability = 1 - not_readable_count / len(text)
decision["readable_text"] = readability >= text_readability_thresh

all_text_bboxes = [b for b in page.get_bboxlog() if "text" in b[0]]
ocr_text_bboxes = [b for b in all_text_bboxes if b[0] == "ignore-text"]
decision["has_ocr_text"] = bool(ocr_text_bboxes)

if decision["has_text"]:
unreadable_count = len([c for c in text if c == chr(0xFFFD)])
readability = 1 - unreadable_count / len(text)
decision["readable_text"] = readability >= text_readability_thresh

if decision["has_text"] and not decision["readable_text"]:
decision["should_ocr"] = True
decision["image"], decision["transform"], decision["pixmap"] = get_page_image(
page, dpi=dpi
)

if decision["has_text"]:
# early exit if any text exists
print(
f"{decision['has_text']=}, {decision['readable_text']=}, {decision['should_ocr']=}"
)
return decision

# Check for image coverage
image_rects=[page_rect&img["bbox"] for img in page.get_image_info()]
image_rect=pymupdf.EMPTY_RECT()
image_rects = [page_rect & img["bbox"] for img in page.get_image_info()]
image_rect = pymupdf.EMPTY_RECT()
for r in image_rects:
image_rect|=r
image_area=abs(image_rect)
image_rect |= r
image_area = abs(image_rect)
if image_area:
images_cover = image_area / page_area
else:
else:
images_cover = 0.0
decision["image_covers_page"] = images_cover >= image_coverage_thresh

Expand All @@ -189,16 +218,11 @@ def should_ocr_page(

# Final decision
if (
1
and not decision["has_text"]
and not decision["readable_text"]
and (
0
or decision["image_covers_page"]
or decision["has_vector_drawings"]
or decision["edge_density"] > edge_thresh
)
0
or decision["image_covers_page"]
or decision["has_vector_drawings"]
or decision["edge_density"] > edge_thresh
):
decision["should_ocr"] = True

return decision
Loading