Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions pdf4llm/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,26 +6,29 @@
with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
readme = f.read()

version = "0.1.7"

classifiers = [
"Development Status :: 5 - Production/Stable",
"Environment :: Console",
"Intended Audience :: Developers",
"Programming Language :: Python :: 3",
"Topic :: Utilities",
]
requires = ["pymupdf4llm==0.0.28"]

requires = [f"pymupdf4llm=={version}"]

setuptools.setup(
name="pdf4llm",
version="0.0.28",
version=version,
author="Artifex",
author_email="support@artifex.com",
description="PyMuPDF Utilities for LLM/RAG",
packages=setuptools.find_packages(),
long_description=readme,
long_description_content_type="text/markdown",
install_requires=requires,
python_requires=">=3.9",
python_requires=">=3.10",
license="Dual Licensed - GNU AFFERO GPL 3.0 or Artifex Commercial License",
url="https://github.com/pymupdf/RAG",
classifiers=classifiers,
Expand Down
124 changes: 121 additions & 3 deletions pymupdf4llm/pymupdf4llm/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,132 @@
import pymupdf
from .helpers.pymupdf_rag import IdentifyHeaders, TocHeaders, to_markdown
try:
import pymupdf.layout
except ImportError:
import pymupdf

from .versions_file import MINIMUM_PYMUPDF_VERSION, VERSION

if tuple(map(int, pymupdf.__version__.split("."))) < MINIMUM_PYMUPDF_VERSION:
raise ImportError(f"Requires PyMuPDF v. {MINIMUM_PYMUPDF_VERSION}, but you have {pymupdf.__version__}")
raise ImportError(
f"Requires PyMuPDF v. {MINIMUM_PYMUPDF_VERSION}, but you have {pymupdf.__version__}"
)

__version__ = VERSION
version = VERSION
version_tuple = tuple(map(int, version.split(".")))

if not callable(pymupdf._get_layout):
from .helpers.pymupdf_rag import IdentifyHeaders, TocHeaders, to_markdown

pymupdf._warn_layout_once() # recommend pymupdf_layout

else:
from .helpers import document_layout as DL

def parse_document(
doc,
filename="",
image_dpi=150,
image_format="png",
image_path="",
pages=None,
):
return DL.parse_document(
doc,
filename=filename,
image_dpi=image_dpi,
image_format=image_format,
image_path=image_path,
pages=pages,
)

def to_markdown(
doc,
*,
header=True,
footer=True,
pages=None,
hdr_info=None,
write_images=False,
embed_images=False,
ignore_images=False,
ignore_graphics=False,
detect_bg_color=True,
image_path="",
image_format="png",
image_size_limit=0.05,
filename="",
force_text=True,
page_chunks=False,
page_separators=False,
margins=0,
dpi=150,
page_width=612,
page_height=None,
table_strategy="lines_strict",
graphics_limit=None,
fontsize_limit=3,
ignore_code=False,
extract_words=False,
show_progress=False,
use_glyphs=False,
ignore_alpha=False,
):
parsed_doc = parse_document(
doc,
filename=filename,
image_dpi=dpi,
image_format=image_format,
image_path=image_path,
pages=pages,
)
return parsed_doc.to_markdown(
header=header,
footer=footer,
write_images=write_images,
embed_images=embed_images,
ignore_code=ignore_code,
)

def to_json(
doc,
header=True,
footer=True,
image_dpi=150,
image_format="png",
image_path="",
pages=None,
):
parsed_doc = parse_document(
doc,
image_dpi=image_dpi,
image_format=image_format,
image_path=image_path,
pages=pages,
)
return parsed_doc.to_json()

def to_text(
doc,
filename="",
header=True,
footer=True,
pages=None,
ignore_code=False,
):
parsed_doc = parse_document(
doc,
filename=filename,
image_dpi=150,
image_format="png",
image_path="",
pages=pages,
)
return parsed_doc.to_text(
header=header,
footer=footer,
ignore_code=ignore_code,
)


def LlamaMarkdownReader(*args, **kwargs):
from .llama import pdf_markdown_reader
Expand Down
204 changes: 204 additions & 0 deletions pymupdf4llm/pymupdf4llm/helpers/check_ocr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
import pymupdf # PyMuPDF
import numpy as np
import cv2


WHITE_CHARS = set(
[chr(i) for i in range(33)]
+ [
"\u00a0", # Non-breaking space
"\u2000", # En quad
"\u2001", # Em quad
"\u2002", # En space
"\u2003", # Em space
"\u2004", # Three-per-em space
"\u2005", # Four-per-em space
"\u2006", # Six-per-em space
"\u2007", # Figure space
"\u2008", # Punctuation space
"\u2009", # Thin space
"\u200a", # Hair space
"\u202f", # Narrow no-break space
"\u205f", # Medium mathematical space
"\u3000", # Ideographic space
]
)


def detect_qr_codes(img):
detector = cv2.QRCodeDetector()
data, points, _ = detector.detectAndDecode(img)

if points is not None and data:
pts = points[0].astype(int)
return {"data": data, "bbox": pts.tolist()}
return None


def detect_barcodes(img):
try:
from pyzbar.pyzbar import decode as barcode_decode
except ImportError:
raise ImportError("pyzbar is required for barcode detection")
gray = img
barcodes = barcode_decode(gray)
results = []

for barcode in barcodes:
results.append(
{
"type": barcode.type,
"data": barcode.data.decode("utf-8"),
"bbox": [(p.x, p.y) for p in barcode.polygon],
}
)
return results


def get_page_image(page, dpi=150):
pix = page.get_pixmap(dpi=dpi)
matrix = pymupdf.Rect(pix.irect).torect(page.rect)
img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
pix.height, pix.width, pix.n
)
gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
return gray, matrix, pix


def detect_lines(img, min_length=50, max_gap=10, matrix=pymupdf.Identity):
gray = img
edges = cv2.Canny(gray, 50, 150, apertureSize=3)
pix_lines = cv2.HoughLinesP(
edges,
1,
np.pi / 180,
threshold=100,
minLineLength=min_length,
maxLineGap=max_gap,
)
lines = []
for np_linesr in pix_lines:
for r in np_linesr:
p0 = pymupdf.Point(r[0], r[1]) * matrix
p1 = pymupdf.Point(r[2], r[3]) * matrix
lines.append((p0, p1))
return lines # array of (point1, point2)


def detect_curves(img, matrix=pymupdf.Identity):
gray = img
_, thresh = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY_INV)
contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)

curves = []
for cnt in contours:
if len(cnt) > 5:
ellipse = cv2.fitEllipse(cnt)
curves.append(ellipse)
return curves


def detect_rectangles(img, min_area=1000, matrix=pymupdf.Identity):
gray
_, thresh = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY_INV)
contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

rectangles = []
for cnt in contours:
approx = cv2.approxPolyDP(cnt, 0.02 * cv2.arcLength(cnt, True), True)
if len(approx) == 4 and cv2.contourArea(cnt) > min_area:
r = pymupdf.Rect(approx) * matrix
rectangles.append(r)
return rectangles


def should_ocr_page(
page,
dpi=150,
edge_thresh=0.015,
vector_thresh=500,
image_coverage_thresh=0.9,
text_readability_thresh=0.9,
):
"""
Decide whether a PyMuPDF page should be OCR'd.

Parameters:
page: PyMuPDF page object
dpi: DPI used for rasterization
edge_thresh: minimum edge density to suggest text presence
vector_thresh: minimum number of vector paths to suggest glyph simulation
image_coverage_thresh: fraction of page area covered by images to trigger OCR
text_readability_thresh: fraction of readable characters to skip OCR

Returns:
dict with decision and diagnostic flags
"""
decision = {
"should_ocr": False,
"has_ocr_text": False,
"has_text": False,
"readable_text": False,
"image_covers_page": False,
"has_vector_drawings": False,
"transform": pymupdf.Identity,
"pixmap": None,
"image": None,
"edge_density": 0.0,
"vector_count": 0,
}
page_rect = page.rect
page_area = abs(page_rect) # size of the full page
# Check for text
text = page.get_text(flags=0)
decision["has_text"] = not WHITE_CHARS.issuperset(text)
if decision["has_text"]:
not_readable_count = len([c for c in text if c == chr(0xFFFD)])
readability = 1 - not_readable_count / len(text)
decision["readable_text"] = readability >= text_readability_thresh

all_text_bboxes = [b for b in page.get_bboxlog() if "text" in b[0]]
ocr_text_bboxes = [b for b in all_text_bboxes if b[0] == "ignore-text"]
decision["has_ocr_text"] = bool(ocr_text_bboxes)
# Check for image coverage
image_rects=[page_rect&img["bbox"] for img in page.get_image_info()]
image_rect=pymupdf.EMPTY_RECT()
for r in image_rects:
image_rect|=r
image_area=abs(image_rect)
if image_area:
images_cover = image_area / page_area
else:
images_cover = 0.0
decision["image_covers_page"] = images_cover >= image_coverage_thresh

# Check vector drawings
drawings = [
p for p in page.get_drawings() if p["rect"].width > 3 or p["rect"].height > 3
]
decision["vector_count"] = len(drawings)
decision["has_vector_drawings"] = len(drawings) >= vector_thresh

# Rasterize and analyze edge density
img, matrix, pix = get_page_image(page, dpi=dpi)
decision["transform"] = matrix
decision["pixmap"] = pix
decision["image"] = img
edges = cv2.Canny(img, 100, 200)
decision["edge_density"] = np.sum(edges > 0) / edges.size

# Final decision
if (
1
and not decision["has_text"]
and not decision["readable_text"]
and (
0
or decision["image_covers_page"]
or decision["has_vector_drawings"]
or decision["edge_density"] > edge_thresh
)
):
decision["should_ocr"] = True

return decision
Loading