From 8ea59e6d67dd5d937720535099010da9b94b5e39 Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Mon, 3 Nov 2025 05:51:39 -0400 Subject: [PATCH] Version 0.1.7 more updates --- pdf4llm/setup.py | 9 +- pymupdf4llm/pymupdf4llm/__init__.py | 124 ++- pymupdf4llm/pymupdf4llm/helpers/check_ocr.py | 204 +++++ .../pymupdf4llm/helpers/document_layout.py | 818 ++++++++++++++++++ .../pymupdf4llm/helpers/get_text_lines.py | 26 +- .../pymupdf4llm/helpers/pymupdf_rag.py | 30 +- pymupdf4llm/pymupdf4llm/helpers/utils.py | 637 ++++++++++++++ pymupdf4llm/pymupdf4llm/versions_file.py | 4 +- pymupdf4llm/setup.py | 6 +- 9 files changed, 1833 insertions(+), 25 deletions(-) create mode 100644 pymupdf4llm/pymupdf4llm/helpers/check_ocr.py create mode 100644 pymupdf4llm/pymupdf4llm/helpers/document_layout.py create mode 100644 pymupdf4llm/pymupdf4llm/helpers/utils.py diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py index 8edcc3a3..45638cc9 100644 --- a/pdf4llm/setup.py +++ b/pdf4llm/setup.py @@ -6,6 +6,8 @@ with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f: readme = f.read() +version = "0.1.7" + classifiers = [ "Development Status :: 5 - Production/Stable", "Environment :: Console", @@ -13,11 +15,12 @@ "Programming Language :: Python :: 3", "Topic :: Utilities", ] -requires = ["pymupdf4llm==0.0.28"] + +requires = [f"pymupdf4llm=={version}"] setuptools.setup( name="pdf4llm", - version="0.0.28", + version=version, author="Artifex", author_email="support@artifex.com", description="PyMuPDF Utilities for LLM/RAG", @@ -25,7 +28,7 @@ long_description=readme, long_description_content_type="text/markdown", install_requires=requires, - python_requires=">=3.9", + python_requires=">=3.10", license="Dual Licensed - GNU AFFERO GPL 3.0 or Artifex Commercial License", url="https://github.com/pymupdf/RAG", classifiers=classifiers, diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py index 50055ece..4b41313a 100644 --- 
a/pymupdf4llm/pymupdf4llm/__init__.py +++ b/pymupdf4llm/pymupdf4llm/__init__.py @@ -1,14 +1,132 @@ -import pymupdf -from .helpers.pymupdf_rag import IdentifyHeaders, TocHeaders, to_markdown +try: + import pymupdf.layout +except ImportError: + import pymupdf + from .versions_file import MINIMUM_PYMUPDF_VERSION, VERSION if tuple(map(int, pymupdf.__version__.split("."))) < MINIMUM_PYMUPDF_VERSION: - raise ImportError(f"Requires PyMuPDF v. {MINIMUM_PYMUPDF_VERSION}, but you have {pymupdf.__version__}") + raise ImportError( + f"Requires PyMuPDF v. {MINIMUM_PYMUPDF_VERSION}, but you have {pymupdf.__version__}" + ) __version__ = VERSION version = VERSION version_tuple = tuple(map(int, version.split("."))) +if not callable(pymupdf._get_layout): + from .helpers.pymupdf_rag import IdentifyHeaders, TocHeaders, to_markdown + + pymupdf._warn_layout_once() # recommend pymupdf_layout + +else: + from .helpers import document_layout as DL + + def parse_document( + doc, + filename="", + image_dpi=150, + image_format="png", + image_path="", + pages=None, + ): + return DL.parse_document( + doc, + filename=filename, + image_dpi=image_dpi, + image_format=image_format, + image_path=image_path, + pages=pages, + ) + + def to_markdown( + doc, + *, + header=True, + footer=True, + pages=None, + hdr_info=None, + write_images=False, + embed_images=False, + ignore_images=False, + ignore_graphics=False, + detect_bg_color=True, + image_path="", + image_format="png", + image_size_limit=0.05, + filename="", + force_text=True, + page_chunks=False, + page_separators=False, + margins=0, + dpi=150, + page_width=612, + page_height=None, + table_strategy="lines_strict", + graphics_limit=None, + fontsize_limit=3, + ignore_code=False, + extract_words=False, + show_progress=False, + use_glyphs=False, + ignore_alpha=False, + ): + parsed_doc = parse_document( + doc, + filename=filename, + image_dpi=dpi, + image_format=image_format, + image_path=image_path, + pages=pages, + ) + return parsed_doc.to_markdown( + 
header=header, + footer=footer, + write_images=write_images, + embed_images=embed_images, + ignore_code=ignore_code, + ) + + def to_json( + doc, + header=True, + footer=True, + image_dpi=150, + image_format="png", + image_path="", + pages=None, + ): + parsed_doc = parse_document( + doc, + image_dpi=image_dpi, + image_format=image_format, + image_path=image_path, + pages=pages, + ) + return parsed_doc.to_json() + + def to_text( + doc, + filename="", + header=True, + footer=True, + pages=None, + ignore_code=False, + ): + parsed_doc = parse_document( + doc, + filename=filename, + image_dpi=150, + image_format="png", + image_path="", + pages=pages, + ) + return parsed_doc.to_text( + header=header, + footer=footer, + ignore_code=ignore_code, + ) + def LlamaMarkdownReader(*args, **kwargs): from .llama import pdf_markdown_reader diff --git a/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py b/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py new file mode 100644 index 00000000..f9ad27f8 --- /dev/null +++ b/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py @@ -0,0 +1,204 @@ +import pymupdf # PyMuPDF +import numpy as np +import cv2 + + +WHITE_CHARS = set( + [chr(i) for i in range(33)] + + [ + "\u00a0", # Non-breaking space + "\u2000", # En quad + "\u2001", # Em quad + "\u2002", # En space + "\u2003", # Em space + "\u2004", # Three-per-em space + "\u2005", # Four-per-em space + "\u2006", # Six-per-em space + "\u2007", # Figure space + "\u2008", # Punctuation space + "\u2009", # Thin space + "\u200a", # Hair space + "\u202f", # Narrow no-break space + "\u205f", # Medium mathematical space + "\u3000", # Ideographic space + ] +) + + +def detect_qr_codes(img): + detector = cv2.QRCodeDetector() + data, points, _ = detector.detectAndDecode(img) + + if points is not None and data: + pts = points[0].astype(int) + return {"data": data, "bbox": pts.tolist()} + return None + + +def detect_barcodes(img): + try: + from pyzbar.pyzbar import decode as barcode_decode + except ImportError: + raise 
ImportError("pyzbar is required for barcode detection") + gray = img + barcodes = barcode_decode(gray) + results = [] + + for barcode in barcodes: + results.append( + { + "type": barcode.type, + "data": barcode.data.decode("utf-8"), + "bbox": [(p.x, p.y) for p in barcode.polygon], + } + ) + return results + + +def get_page_image(page, dpi=150): + pix = page.get_pixmap(dpi=dpi) + matrix = pymupdf.Rect(pix.irect).torect(page.rect) + img = np.frombuffer(pix.samples, dtype=np.uint8).reshape( + pix.height, pix.width, pix.n + ) + gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) + return gray, matrix, pix + + +def detect_lines(img, min_length=50, max_gap=10, matrix=pymupdf.Identity): + gray = img + edges = cv2.Canny(gray, 50, 150, apertureSize=3) + pix_lines = cv2.HoughLinesP( + edges, + 1, + np.pi / 180, + threshold=100, + minLineLength=min_length, + maxLineGap=max_gap, + ) + lines = [] + for np_linesr in pix_lines: + for r in np_linesr: + p0 = pymupdf.Point(r[0], r[1]) * matrix + p1 = pymupdf.Point(r[2], r[3]) * matrix + lines.append((p0, p1)) + return lines # array of (point1, point2) + + +def detect_curves(img, matrix=pymupdf.Identity): + gray = img + _, thresh = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY_INV) + contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE) + + curves = [] + for cnt in contours: + if len(cnt) > 5: + ellipse = cv2.fitEllipse(cnt) + curves.append(ellipse) + return curves + + +def detect_rectangles(img, min_area=1000, matrix=pymupdf.Identity): + gray + _, thresh = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY_INV) + contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + rectangles = [] + for cnt in contours: + approx = cv2.approxPolyDP(cnt, 0.02 * cv2.arcLength(cnt, True), True) + if len(approx) == 4 and cv2.contourArea(cnt) > min_area: + r = pymupdf.Rect(approx) * matrix + rectangles.append(r) + return rectangles + + +def should_ocr_page( + page, + dpi=150, + edge_thresh=0.015, + 
vector_thresh=500, + image_coverage_thresh=0.9, + text_readability_thresh=0.9, +): + """ + Decide whether a PyMuPDF page should be OCR'd. + + Parameters: + page: PyMuPDF page object + dpi: DPI used for rasterization + edge_thresh: minimum edge density to suggest text presence + vector_thresh: minimum number of vector paths to suggest glyph simulation + image_coverage_thresh: fraction of page area covered by images to trigger OCR + text_readability_thresh: fraction of readable characters to skip OCR + + Returns: + dict with decision and diagnostic flags + """ + decision = { + "should_ocr": False, + "has_ocr_text": False, + "has_text": False, + "readable_text": False, + "image_covers_page": False, + "has_vector_drawings": False, + "transform": pymupdf.Identity, + "pixmap": None, + "image": None, + "edge_density": 0.0, + "vector_count": 0, + } + page_rect = page.rect + page_area = abs(page_rect) # size of the full page + # Check for text + text = page.get_text(flags=0) + decision["has_text"] = not WHITE_CHARS.issuperset(text) + if decision["has_text"]: + not_readable_count = len([c for c in text if c == chr(0xFFFD)]) + readability = 1 - not_readable_count / len(text) + decision["readable_text"] = readability >= text_readability_thresh + + all_text_bboxes = [b for b in page.get_bboxlog() if "text" in b[0]] + ocr_text_bboxes = [b for b in all_text_bboxes if b[0] == "ignore-text"] + decision["has_ocr_text"] = bool(ocr_text_bboxes) + # Check for image coverage + image_rects=[page_rect&img["bbox"] for img in page.get_image_info()] + image_rect=pymupdf.EMPTY_RECT() + for r in image_rects: + image_rect|=r + image_area=abs(image_rect) + if image_area: + images_cover = image_area / page_area + else: + images_cover = 0.0 + decision["image_covers_page"] = images_cover >= image_coverage_thresh + + # Check vector drawings + drawings = [ + p for p in page.get_drawings() if p["rect"].width > 3 or p["rect"].height > 3 + ] + decision["vector_count"] = len(drawings) + 
decision["has_vector_drawings"] = len(drawings) >= vector_thresh + + # Rasterize and analyze edge density + img, matrix, pix = get_page_image(page, dpi=dpi) + decision["transform"] = matrix + decision["pixmap"] = pix + decision["image"] = img + edges = cv2.Canny(img, 100, 200) + decision["edge_density"] = np.sum(edges > 0) / edges.size + + # Final decision + if ( + 1 + and not decision["has_text"] + and not decision["readable_text"] + and ( + 0 + or decision["image_covers_page"] + or decision["has_vector_drawings"] + or decision["edge_density"] > edge_thresh + ) + ): + decision["should_ocr"] = True + + return decision diff --git a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py new file mode 100644 index 00000000..9074981c --- /dev/null +++ b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py @@ -0,0 +1,818 @@ +import base64 +import json +import os +from binascii import b2a_base64 +from dataclasses import dataclass +from typing import Dict, List, Optional, Union +from pathlib import Path +import pymupdf +import tabulate +from pymupdf4llm.helpers.get_text_lines import get_raw_lines +from pymupdf4llm.helpers import utils, check_ocr + +try: + import cv2 +except ImportError: + cv2 = None + +pymupdf.TOOLS.unset_quad_corrections(True) + +GRAPHICS_TEXT = "\n![](%s)\n" +CHECK_OCR_TEXT = {"ignore-text"} +OCR_FONTNAME = "GlyphLessFont" # if encountered do not use "code" style +FLAGS = ( + 0 + | pymupdf.TEXT_COLLECT_STYLES + | pymupdf.TEXT_COLLECT_VECTORS + | pymupdf.TEXT_PRESERVE_IMAGES + | pymupdf.TEXT_ACCURATE_BBOXES + | pymupdf.TEXT_MEDIABOX_CLIP +) + + +def omit_if_pua_char(text): + """Check if character is in the Private Use Area (PUA) of Unicode.""" + if len(text) > 1: + return text + o = ord(text) + if ( + (0xE000 <= o <= 0xF8FF) + or (0xF0000 <= o <= 0xFFFFD) + or (0x100000 <= o <= 0x10FFFD) + ): + return "" + return text + + +def create_list_item_levels(layout_info): + """Create a dictionary that maps the box 
number of each list-item to + its hierarchy level. + + Args: + layout_info (list): the bbox list "page.layout_information" + + Returns: + dict: {bbox sequence number: level} where level is 1 for top-level. + """ + segments = [] # list of item segments + segment = [] # current segment + + # Create segments of contiguous list items. Each non-list-item finishes + # the current segment. Also, if two list-items belong to different page + # text columns ends a segment. + for i, item in enumerate(layout_info): + if item.boxclass != "list-item": # bbox class is not list-item + if segment: # end and save the current segment + segments.append(segment) + segment = [] + continue + if segment: # check if we need to end the current segment + _, prev_item = segment[-1] + if item.x0 > prev_item.x1 or item.y1 < prev_item.y0: + # end and save the current segment + segments.append(segment) + segment = [] + segment.append((i, item)) # append item to segment + if segment: + segments.append(segment) # append last segment + + item_dict = {} # dictionary of item index -> (level + if not segments: # no list items found + return item_dict + + # walk through segments and assign levels + for i, s in enumerate(segments): + if not s: + continue + s.sort(key=lambda x: x[1].x0) # sort by x0 coordinate of the bbox + + # list of leveled items in the segment: (idx, bbox, level) + # first item has level 1 + leveled_items = [(s[0][0], s[0][1], 1)] + for idx, bbox in s[1:]: + prev_idx, prev_bbox, prev_lvl = leveled_items[-1] + # x0 coordinate increased by more than 10 points: increase level + if bbox.x0 > prev_bbox.x0 + 10: + curr_lvl = prev_lvl + 1 + leveled_items.append((idx, bbox, curr_lvl)) + else: + leveled_items.append((idx, bbox, prev_lvl)) + for idx, bbox, lvl in leveled_items: + item_dict[idx] = lvl + return item_dict + + +def is_monospaced(textlines): + """Detect text bboxes with all mono-spaced lines.""" + line_count = len(textlines) + mono = 0 + + for l in textlines: + all_mono = all( + 
bool(s["flags"] & 8 and s["font"] != OCR_FONTNAME) for s in l["spans"] + ) + if all_mono: + mono += 1 + return mono == line_count + + +def is_superscripted(line): + spans = line["spans"] + if not spans: + return False + if spans[0]["flags"] & 1: # check for superscript + return True + if len(spans) < 2: + return False + if ( + spans[0]["origin"][1] < spans[1]["origin"][1] + and spans[0]["size"] < spans[1]["size"] + ): + return True + return False + + +def get_plain_text(spans): + """Output text without any markdown or other styling. + Parameter is a list of span dictionaries. The spans may come from + one or more original "textlines" items. + Returns the text string of the boundary box. + The text string always ends with the suffix and a space + """ + output = "" + for i, s in enumerate(spans): + span_text = s["text"].strip() # remove leading/trailing spaces + superscript = s["flags"] & 1 + span_text = s["text"].strip() + if superscript: + if i == 0: + span_text = f"[{span_text}] " + elif output.endswith(" "): + output = output[:-1] + if output.endswith("- ") and len(output.split()[-1]) > 2: + output = output[:-2] + output += span_text + " " + return output + + +def list_item_to_text(textlines, level): + """ + Convert "list-item" bboxes to text. 
+ """ + indent = " " * (level - 1) + output = indent + line = textlines[0] + x0 = line["bbox"][0] # left of first line + spans = line["spans"] + span0 = line["spans"][0] + span0_text = span0["text"].strip() + + if not omit_if_pua_char(span0_text): + spans.pop(0) + if spans: + x0 = spans[0]["bbox"][0] + + for line in textlines[1:]: + this_x0 = line["bbox"][0] + if this_x0 < x0 - 2: + line_output = get_plain_text(spans) + output += line_output + output = output.rstrip() + f"\n\n{indent}" + spans = line["spans"] + if not omit_if_pua_char(spans[0]["text"].strip()): + spans.pop(0) + else: + spans.extend(line["spans"]) + x0 = this_x0 # store this left coordinate + line_output = get_plain_text(spans) + output += line_output + + return output.rstrip() + "\n\n" + + +def footnote_to_text(textlines): + """ + Convert "footnote" bboxes to text. + """ + output = "> " + line = textlines[0] + spans = line["spans"] + + for line in textlines[1:]: + if is_superscripted(line): + line_output = get_plain_text(spans) + output += line_output + output = output.rstrip() + "\n\n> " + spans = line["spans"] + else: + spans.extend(line["spans"]) + line_output = get_plain_text(spans) + output += line_output + + return output.rstrip() + "\n\n" + + +def code_block_to_text(textlines): + """Output a code block in plain text format.""" + output = "" + for line in textlines: + line_text = "" + for s in line["spans"]: + span_text = s["text"] + line_text += span_text + output += line_text.rstrip() + "\n" + output += "\n\n" + return output + + +def text_to_text(textlines, ignore_code: bool = False): + """ + Convert "text" bboxes to plain text, as well as other boxclasses + not specifically handled elsewhere. + The line text is written without line breaks. At the end, + two newlines are added to separate from the next block. 
+ """ + if not textlines: + return "" + if is_superscripted(textlines[0]): # check for superscript + return footnote_to_text(textlines) + if not ignore_code and is_monospaced(textlines): + return code_block_to_text(textlines) + + spans = [] + for l in textlines: + for s in l["spans"]: + assert isinstance(s, dict) + spans.append(s) + output = get_plain_text(spans) + return output + "\n\n" + + +def get_styled_text(spans): + """Output text with markdown style codes based on font properties. + Parameter is a list of span dictionaries. The spans may come from + one or more original "textlines" items. + Returns the text string and the suffix for continuing styles. + The text string always ends with the suffix and a space + """ + output = "" + old_line = 0 + old_block = 0 + suffix = "" + for i, s in enumerate(spans): + # decode font properties + prefix = "" + superscript = s["flags"] & 1 + mono = s["flags"] & 8 and s["font"] != OCR_FONTNAME + bold = s["flags"] & 16 or s["char_flags"] & 8 + italic = s["flags"] & 2 + strikeout = s["char_flags"] & 1 + + # compute styling prefix and suffix + if mono: + prefix = "`" + prefix + if bold: + prefix = "**" + prefix + if italic: + prefix = "_" + prefix + if strikeout: + prefix = "~~" + prefix + + suffix = "".join(reversed(prefix)) # reverse of prefix + + span_text = s["text"].strip() # remove leading/trailing spaces + # convert intersecting link to markdown syntax + # ltext = resolve_links(parms.links, s) + ltext = "" # TODO: implement link resolution + if ltext: + text = f"{hdr_string}{prefix}{ltext}{suffix} " + else: + text = f"{prefix}{span_text}{suffix} " + + # Extend output string taking care of styles staying the same. 
+ if output.endswith(f"{suffix} "): + output = output[: -len(suffix) - 1] + # resolve hyphenation if old_block and old_line are not the same + if ( + 1 + and (old_block, old_line) != (s["block"], s["line"]) + and output.endswith("-") + and len(output.split()[-1]) > 2 + ): + output = output[:-1] + text = span_text + suffix + " " + elif superscript: + text = span_text + suffix + " " + else: + text = " " + span_text + suffix + " " + + old_line = s["line"] + old_block = s["block"] + output += text + return output, suffix + + +def list_item_to_md(textlines, level): + """ + Convert "list-item" bboxes to markdown. + The first line is prefixed with "- ". Subsequent lines are appended + without line break if their rectangle does not start to the left + of the previous line. + Otherwise, a linebreak and "- " are added to the output string. + 2 units of tolerance is used to avoid spurious line breaks. + + This post-layout heuristics helps cover cases where more than + one list item is contained in a single bbox. + """ + indent = " " * (level - 1) + line = textlines[0] + x0 = line["bbox"][0] # left of first line + spans = line["spans"] + span0 = line["spans"][0] + span0_text = span0["text"].strip() + + starter = "- " + if span0_text.endswith(".") and span0_text[:-1].isdigit(): + starter = "1. " + + if not omit_if_pua_char(span0["text"].strip()): + spans.pop(0) + if spans: + x0 = spans[0]["bbox"][0] + + output = indent + starter + for line in textlines[1:]: + this_x0 = line["bbox"][0] + if this_x0 < x0 - 2: + line_output, suffix = get_styled_text(spans) + output += line_output + f"\n\n{indent}{starter}" + spans = line["spans"] + if not omit_if_pua_char(spans[0]["text"].strip()): + spans.pop(0) + else: + spans.extend(line["spans"]) + x0 = this_x0 # store this left coordinate + line_output, suffix = get_styled_text(spans) + output += line_output + + return output + "\n\n" + + +def footnote_to_md(textlines): + """ + Convert "footnote" bboxes to markdown. 
+ The first line is prefixed with "> ". Subsequent lines are appended + without line break if they do not start with a superscript. + Otherwise, a linebreak and "> " are added to the output string. + + This post-layout heuristics helps cover cases where more than + one list item is contained in a single bbox. + """ + line = textlines[0] + spans = line["spans"] + output = "> " + for line in textlines[1:]: + if is_superscripted(line): + line_output, suffix = get_styled_text(spans) + output += line_output + "\n\n> " + spans = line["spans"] + else: + spans.extend(line["spans"]) + line_output, suffix = get_styled_text(spans) + output += line_output + + return output + "\n\n" + + +def section_hdr_to_md(textlines): + """ + Convert "section-header" bboxes to markdown. + This is treated as a level 2 header (##). + The line text itself is handled like normal text. + """ + spans = [] + for l in textlines: + for s in l["spans"]: + assert isinstance(s, dict) + spans.append(s) + output, suffix = get_styled_text(spans) + return f"## {output}\n\n" + + +def title_to_md(textlines): + """ + Convert "title" bboxes to markdown. + This is treated as a level 1 header (#). + The line text itself is handled like normal text. + """ + spans = [] + for l in textlines: + for s in l["spans"]: + assert isinstance(s, dict) + spans.append(s) + output, suffix = get_styled_text(spans) + return f"# {output}\n\n" + + +def code_block_to_md(textlines): + """Output a code block in markdown format.""" + output = "```\n" + for line in textlines: + line_text = "" + for s in line["spans"]: + span_text = s["text"] + line_text += span_text + output += line_text.rstrip() + "\n" + output += "```\n\n" + return output + + +def text_to_md(textlines, ignore_code: bool = False): + """ + Convert "text" bboxes to markdown, as well as other boxclasses + not specifically handled elsewhere. + The line text is written without line breaks. At the end, + two newlines are added to separate from the next block. 
+ """ + if not textlines: + return "" + if is_superscripted(textlines[0]): + # exec advanced superscript detector + return footnote_to_md(textlines) + if not ignore_code and is_monospaced(textlines): + return code_block_to_md(textlines) + + spans = [] + for l in textlines: + for s in l["spans"]: + assert isinstance(s, dict) + spans.append(s) + output, suffix = get_styled_text(spans) + return output + "\n\n" + + +@dataclass +class LayoutBox: + x0: float + y0: float + x1: float + y1: float + boxclass: str # e.g. 'text', 'picture', 'table', etc. + + # if boxclass == 'picture' or 'formula', store image bytes + image: Optional[bytes] = None + + # if boxclass == 'table' + table: Optional[Dict] = None + + # text line information for text-type boxclasses + textlines: Optional[List[Dict]] = None + + +@dataclass +class PageLayout: + page_number: int + width: float + height: float + boxes: List[LayoutBox] + ocrpage: bool = False # whether the page is an OCR page + fulltext: Optional[List[Dict]] = None # full page text in extractDICT format + words: Optional[List[Dict]] = None # list of words with bbox + links: Optional[List[Dict]] = None + + +@dataclass +class ParsedDocument: + filename: Optional[str] = None # source file name + page_count: int = None + toc: Optional[List[List]] = None # e.g. [{'title': 'Intro', 'page': 1}] + pages: List[PageLayout] = None + metadata: Optional[Dict] = None + from_bytes: bool = False # whether loaded from bytes + image_dpi: int = 150 # image resolution + image_format: str = "png" # 'png' or 'jpg' + image_path: str = "" # path to save images + use_ocr: bool = True # whether to invoke OCR if beneficial + + def to_markdown( + self, + header: bool = True, + footer: bool = True, + write_images: bool = False, + embed_images: bool = False, + ignore_code: bool = False, + ) -> str: + """ + Serialize ParsedDocument to markdown text. 
+ """ + output = "" + for page in self.pages: + + # make mapping: box number to list item level + list_item_levels = create_list_item_levels(page.boxes) + for i, box in enumerate(page.boxes): + clip = pymupdf.IRect(box.x0, box.y0, box.x1, box.y1) + btype = box.boxclass + if btype == "page-header" and header is False: + continue + if btype == "page-footer" and footer is False: + continue + if btype in ("picture", "formula") and box.image: + img_filename = f"{self.filename}-{page.page_number:04d}-{i:02d}.{self.image_format}" + if write_images: + filename = os.path.basename(self.filename).replace(" ", "-") + image_filename = os.path.join( + self.image_path, + f"{filename}-{page.page_number:04d}-{i:02d}.{self.image_format}", + ) + Path(image_filename).write_bytes(box.image) + + output += GRAPHICS_TEXT % img_filename + + elif embed_images: + # make a base64 encoded string of the image + data = b2a_base64(box.image).decode() + data = f"data:image/{self.image_format};base64," + data + output += GRAPHICS_TEXT % data + "\n\n" + + else: + output += f"**==> {btype} [{clip.width} x {clip.height}] intentionally omitted <==**\n\n" + continue + if btype == "table": + output += box.table["markdown"] + "\n\n" + continue + if not hasattr(box, "textlines"): + print(f"Warning: box {btype} has no textlines") + continue + if btype == "title": + output += title_to_md(box.textlines) + elif btype == "section-header": + output += section_hdr_to_md(box.textlines) + elif btype == "list-item": + output += list_item_to_md(box.textlines, list_item_levels[i]) + elif btype == "footnote": + output += footnote_to_md(box.textlines) + elif not header and btype == "page-header": + continue + elif not footer and btype == "page-footer": + continue + else: # treat as normal MD text + output += text_to_md( + box.textlines, ignore_code=ignore_code or page.ocrpage + ) + + return output + + def to_json(self) -> str: + # Serialize to JSON + class LayoutEncoder(json.JSONEncoder): + def default(self, s): + if 
isinstance(s, (bytes, bytearray)): + return base64.b64encode(s).decode() + if isinstance( + s, + ( + pymupdf.Rect, + pymupdf.Point, + pymupdf.Matrix, + pymupdf.IRect, + pymupdf.Quad, + ), + ): + return list(s) + if hasattr(s, "__dict__"): + return s.__dict__ + return self.super().default(s) + + js = json.dumps(self, cls=LayoutEncoder, indent=1) + return js + + def to_text( + self, + header: bool = True, + footer: bool = True, + ignore_code: bool = False, + ) -> str: + """ + Serialize ParsedDocument to plain text. Optionally omit page headers or footers. + """ + # Flatten all text boxes into plain text + output = "" + for page in self.pages: + list_item_levels = create_list_item_levels(page.boxes) + for i, box in enumerate(page.boxes): + clip = pymupdf.IRect(box.x0, box.y0, box.x1, box.y1) + btype = box.boxclass + if btype == "page-header" and header is False: + continue + if btype == "page-footer" and footer is False: + continue + if btype in ("picture", "formula"): + output += f"==> {btype} [{clip.width} x {clip.height}] <==\n\n" + continue + if btype == "table": + output += ( + tabulate.tabulate(box.table["extract"], tablefmt="grid") + + "\n\n" + ) + continue + if btype == "list-item": + output += list_item_to_text(box.textlines, list_item_levels[i]) + continue + if btype == "footnote": + output += footnote_to_text(box.textlines) + continue + output += text_to_text( + box.textlines, ignore_code=ignore_code or page.ocrpage + ) + continue + return output + + +def parse_document( + doc, + filename="", + image_dpi=150, + image_format="png", + image_path="", + pages=None, +) -> ParsedDocument: + if isinstance(doc, pymupdf.Document): + mydoc = doc + else: + mydoc = pymupdf.open(doc) + document = ParsedDocument() + document.filename = mydoc.name if mydoc.name else filename + document.toc = mydoc.get_toc(simple=True) + document.page_count = mydoc.page_count + document.metadata = mydoc.metadata + document.image_dpi = image_dpi + document.image_format = image_format + 
document.image_path = image_path + document.pages = [] + try: + reason = "OpenCV not installed" + assert cv2 is not None + reason = "Tesseract language data not found" + assert pymupdf.get_tessdata() + document.use_ocr = True + except Exception as e: + print(f"{reason}. Disabling OCR.") + document.use_ocr = False + if pages is None: + page_filter = range(mydoc.page_count) + elif isinstance(pages, int): + while pages < 0: + pages += mydoc.page_count + page_filter = [pages] + elif not hasattr(pages, "__getitem__"): + raise ValueError("'pages' parameter must be an int, or a sequence of ints") + else: + page_filter = sorted(set(pages)) + if ( + not all(isinstance(p, int) for p in page_filter) + or page_filter[-1] >= mydoc.page_count + ): + raise ValueError( + "'pages' parameter must be None, int, or a sequence of ints less than page count" + ) + for pno in page_filter: + page = mydoc.load_page(pno) + + # check if this page should be OCR'd + if document.use_ocr: + decision = check_ocr.should_ocr_page(page, dpi=600) + else: + decision = {"should_ocr": False} + if decision["should_ocr"]: + print(f"Performing OCR on {page.number=}[{page.number+1}]...") + pix = decision["pixmap"] # retrieve the Pixmap + pdf_data = pix.pdfocr_tobytes() # OCR it + ocr_pdf = pymupdf.open("pdf", pdf_data) # get the OCR'd PDF + ocrpage = ocr_pdf[0] # this is its OCR'd page + # remove everything except the text + ocrpage.add_redact_annot(ocrpage.rect) + ocrpage.apply_redactions( + images=pymupdf.PDF_REDACT_IMAGE_REMOVE, + graphics=pymupdf.PDF_REDACT_LINE_ART_NONE, + text=pymupdf.PDF_REDACT_TEXT_NONE, + ) + # copy text over to original page + page.show_pdf_page(page.rect, ocr_pdf, 0) + ocr_pdf.close() # discard temporary OCR PDF + del ocr_pdf + + bboxlog = page.get_bboxlog() + ocrpage = decision["should_ocr"] or ( + set([b[0] for b in bboxlog if b[0] == "ignore-text"]) == CHECK_OCR_TEXT + ) + textpage = page.get_textpage(flags=FLAGS) + blocks = textpage.extractDICT()["blocks"] + page.get_layout() 
+ utils.clean_pictures(page, blocks) + utils.add_image_orphans(page, blocks) + utils.clean_tables(page, blocks) + page.layout_information = utils.find_reading_order(page.layout_information) + + # identify vector graphics to help find tables + all_lines, all_boxes = utils.complete_table_structure(page) + tbf = page.find_tables( + strategy="lines_strict", add_lines=all_lines, add_boxes=all_boxes + ) + fulltext = [b for b in blocks if b["type"] == 0] + words = [ + { + "bbox": pymupdf.Rect(w[:4]), + "text": w[4], + "block_n": w[5], + "line_n": w[6], + "word_n": w[7], + } + for w in textpage.extractWORDS() + ] + links = page.get_links() + pagelayout = PageLayout( + page_number=page.number + 1, + width=page.rect.width, + height=page.rect.height, + boxes=[], + ocrpage=ocrpage, + fulltext=fulltext, + words=words, + links=links, + ) + for box in page.layout_information: + layoutbox = LayoutBox(*box) + clip = pymupdf.Rect(box[:4]) + + if layoutbox.boxclass in ("picture", "formula"): + pix = page.get_pixmap(clip=clip, dpi=document.image_dpi) + layoutbox.image = pix.tobytes(document.image_format) + + elif layoutbox.boxclass == "table": + # This is either a table detected by native TableFinder or by + # MuPDF's table structure recognition (which may fail). + # If the structure was not detected, we output an image. + # A table is represented as a dict with bbox, row_count, + # col_count, cells, extract (2D list of cell texts), and the + # markdown string. 
+ + try: # guard against table structure detection failure + table = [ + tab + for tab in tbf.tables + if pymupdf.table._iou(tab.bbox, clip) > 0.6 + ][0] + cells = [[c for c in row.cells] for row in table.rows] + + if table.header.external: # if the header is outside table + cells.insert(0, table.header.cells) # insert a row + table.row_count += 1 # increase row count + + layoutbox.table = { + "bbox": list(table.bbox), + "row_count": table.row_count, + "col_count": table.col_count, + "cells": cells, + "extract": table.extract(), + } + layoutbox.table["markdown"] = utils.table_to_markdown( + textpage, layoutbox, markdown=True + ) + except Exception as e: + print(f"table detection error '{e}'") + # table structure not detected: treat like an image + pix = page.get_pixmap(clip=clip, dpi=document.image_dpi) + layoutbox.image = pix.tobytes(document.image_format) + layoutbox.boxclass = "picture" + else: + # Handle text-like box classes: + # Extract text line information within the box. + # Each line is represented as its bbox and a list of spans. 
+ layoutbox.textlines = [ + {"bbox": l[0], "spans": l[1]} + for l in get_raw_lines( + textpage=None, + blocks=pagelayout.fulltext, + clip=clip, + ignore_invisible=not ocrpage, + ) + ] + pagelayout.boxes.append(layoutbox) + document.pages.append(pagelayout) + if mydoc != doc: + mydoc.close() + return document + + +if __name__ == "__main__": + # Example usage + import sys + from pathlib import Path + + filename = sys.argv[1] + pdoc = parse_document(filename) + # Path(filename).with_suffix(".json").write_text(pdoc.to_json()) + # Path(filename).with_suffix(".txt").write_text(pdoc.to_text(footer=False)) + md = pdoc.to_markdown(write_images=True, header=False, footer=False) + Path(filename).with_suffix(".md").write_text(md) diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py index 0769dd00..e1a02da9 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py +++ b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py @@ -26,7 +26,8 @@ def is_white(text): def get_raw_lines( - textpage, + textpage=None, + blocks=None, clip=None, tolerance=3, ignore_invisible=True, @@ -44,7 +45,10 @@ def get_raw_lines( formats like Markdown or JSON. Args: - textpage: (mandatory) TextPage object + textpage: TextPage object. Can be None if blocks are given. + blocks: (list) if given, use these blocks instead of extracting them + from the TextPage. This allows to re-use blocks extracted + by the caller. clip: (Rect) specifies a sub-rectangle of the textpage rect (which in turn may be based on a sub-rectangle of the full page). tolerance: (float) put spans on the same line if their top or bottom @@ -71,7 +75,7 @@ def sanitize_spans(line): left to right. 
Arg: - A list of spans - as drived from TextPage.extractDICT() + A list of spans - as derived from TextPage.extractDICT() Returns: A list of sorted, and potentially cleaned-up spans """ @@ -101,14 +105,18 @@ def sanitize_spans(line): line[i - 1] = s0 # update the span return line + if not isinstance(textpage, pymupdf.TextPage) and blocks is None: + raise ValueError("Either textpage or blocks must be provided.") + if clip is None: # use TextPage rect if not provided clip = textpage.rect # extract text blocks - if bbox is not empty - blocks = [ - b - for b in textpage.extractDICT()["blocks"] - if b["type"] == 0 and not pymupdf.Rect(b["bbox"]).is_empty - ] + if blocks is None: + blocks = [ + b + for b in textpage.extractDICT()["blocks"] + if b["type"] == 0 and not pymupdf.Rect(b["bbox"]).is_empty + ] spans = [] # all spans in TextPage here for bno, b in enumerate(blocks): # the numbered blocks for lno, line in enumerate(b["lines"]): # the numbered lines @@ -127,7 +135,7 @@ def sanitize_spans(line): continue if abs(sbbox & clip) < abs(sbbox) * 0.8: # if not in clip continue - if s["flags"] & 1 == 1: # if a superscript, modify bbox + if s["flags"] & 1: # if a superscript, modify bbox # with that of the preceding or following span i = 1 if sno == 0 else sno - 1 if len(line["spans"]) > i: diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index ad6ca0c1..f0c76e6b 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -38,13 +38,14 @@ import os import string from binascii import b2a_base64 +from collections import defaultdict +from dataclasses import dataclass + import pymupdf from pymupdf import mupdf from pymupdf4llm.helpers.get_text_lines import get_raw_lines, is_white from pymupdf4llm.helpers.multi_column import column_boxes from pymupdf4llm.helpers.progress import ProgressBar -from dataclasses import dataclass -from collections import defaultdict 
pymupdf.TOOLS.unset_quad_corrections(True) @@ -572,7 +573,7 @@ def write_text( if i in parms.written_images: continue r = parms.img_rects[i] - if r.y1 <= lrect.y0 and ( + if max(r.y0, lrect.y0) < min(r.y1, lrect.y1) and ( 0 or lrect.x0 <= r.x0 < lrect.x1 or lrect.x0 < r.x1 <= lrect.x1 @@ -1024,6 +1025,9 @@ def get_page_output( graphics_count = len([b for b in page.get_bboxlog() if "path" in b[0]]) if GRAPHICS_LIMIT and graphics_count > GRAPHICS_LIMIT: IGNORE_GRAPHICS = True + too_many_graphics = True + else: + too_many_graphics = False # Locate all tables on page parms.written_tables = [] # stores already written tables @@ -1075,7 +1079,7 @@ def get_page_output( else: paths = [] # catch too-many-graphics situation - if GRAPHICS_LIMIT and len(paths) > GRAPHICS_LIMIT: + if IGNORE_GRAPHICS: paths = [] # We also ignore vector graphics that only represent @@ -1101,7 +1105,17 @@ def get_page_output( parms.vg_clusters0 = refine_boxes(vg_clusters0) parms.vg_clusters = dict((i, r) for i, r in enumerate(parms.vg_clusters0)) + block_count = len(parms.textpage.extractBLOCKS()) + if block_count > 0: + char_density = len(parms.textpage.extractTEXT()) / block_count + else: + char_density = 0 # identify text bboxes on page, avoiding tables, images and graphics + if too_many_graphics and char_density < 20: + # This page has too many isolated text pieces for meaningful + # layout analysis. Treat whole page as one text block. 
+ text_rects = [parms.clip] + else: text_rects = column_boxes( parms.page, paths=parms.actual_paths, @@ -1200,7 +1214,13 @@ def get_page_output( pages = ProgressBar(pages) for pno in pages: parms = get_page_output( - doc, pno, margins, textflags, FILENAME, IGNORE_IMAGES, IGNORE_GRAPHICS + doc, + pno, + margins, + textflags, + FILENAME, + IGNORE_IMAGES, + IGNORE_GRAPHICS, ) if page_chunks is False: document_output += parms.md_string diff --git a/pymupdf4llm/pymupdf4llm/helpers/utils.py b/pymupdf4llm/pymupdf4llm/helpers/utils.py new file mode 100644 index 00000000..f9de20e0 --- /dev/null +++ b/pymupdf4llm/pymupdf4llm/helpers/utils.py @@ -0,0 +1,637 @@ +import pymupdf + +white_spaces = set([chr(i) for i in range(33)]) | {0xA0, 0x2002, 0x2003, 0x2009, 0x202F} + + +def table_cleaner(page, blocks, tbbox): + """Clean the table bbox 'tbbox'. + + 'blocks' is the TextPage.extractDict()["blocks"] list. + + This function must be used AFTER clean_pictures() so we know that tbbox + is complete in terms of includable vectors. + + We check whether the table bbox contains non-rect ("tilted") vectors + and determine which part of tbbox they cover. If this is too large, we + re-classify tbbox as a picture. + Else we check whether the tilted vectors only cover some upper part of the + result. In that case we separate the top part as a picture and keep + the remaining area as a table. + """ + bbox = pymupdf.Rect(tbbox[:4]) + + # All vectors inside tbbox. Checking for the top-left corner is enough. 
+ all_vectors = [ + (pymupdf.IRect(b["bbox"]), b["isrect"]) + for b in blocks + if b["type"] == 3 and b["bbox"][:2] in bbox + ] + tilt_vectors = [v for v in all_vectors if not v[1]] + # Early exit if no tilted vectors + if not tilt_vectors: + return None, None + + y0 = min([b[0].y0 for b in tilt_vectors]) + y1 = max([b[0].y1 for b in tilt_vectors]) + x0 = min([b[0].x0 for b in tilt_vectors]) + x1 = max([b[0].x1 for b in tilt_vectors]) + + # Rectangle containing all non-rectangle vectors inside the table bbox + tilted = pymupdf.Rect(x0, y0, x1, y1) + + # if it covers most of the table bbox, we convert to picture + if tilted.width >= bbox.width * 0.8 and tilted.height >= bbox.height * 0.8: + return tbbox[:4] + ["picture"], None + + # Extract text spans. Needed for completing the potential picture area. + span_rects = [ + s["bbox"] + for b in blocks + if b["type"] == 0 + for l in b["lines"] + for s in l["spans"] + if s["bbox"] in bbox + ] + + # Check if non-rect vectors cover some acceptable upper part of tbbox. 
+ if ( + 1 + and tilted.y1 - bbox.y0 <= bbox.height * 0.3 # 30% of tbbox height + and tilted.width >= bbox.width * 0.7 # at least 70% of tbbox width + ): + tilted.y1 += 2 # add some buffer at the bottom + + # include any text that is part of the picture area + for r in span_rects: + if tilted.intersects(r): + tilted |= r + + picture_box = [bbox.x0, bbox.y0, bbox.x1, tilted.y1, "picture"] + table_box = [bbox.x0, tilted.y1 + 1, bbox.x1, bbox.y1, "table"] + return picture_box, table_box + return None, None + + +def clean_tables(page, blocks): + for i in range(len(page.layout_information)): + if page.layout_information[i][4] != "table": + continue + # re-classify some corner cases as "text" + # the layout bbox as a Rect + bbox = pymupdf.Rect(page.layout_information[i][:4]) + + # lines in this bbox + lines = [ + l for b in blocks if b["type"] == 0 for l in b["lines"] if l["bbox"] in bbox + ] + y_vals0 = sorted(set(round(l["bbox"][3]) for l in lines)) + y_vals = [y_vals0[0]] + for y in y_vals0[1:]: + if y - y_vals[-1] > 3: + y_vals.append(y) + if len(y_vals) < 2: # too few distinct line bottoms + # too few text lines to be a table + page.layout_information[i][4] = "text" + continue + # our table minimum dimension, rows x cols, is 2 x 2 + mx_same_baseline = 1 + for y in y_vals: + count = len([l for l in lines if abs(y - l["bbox"][3]) <= 3]) + if count > mx_same_baseline: + mx_same_baseline = count + break + if mx_same_baseline < 2: + # too few text columns to be a table + page.layout_information[i][4] = "text" + continue + rc1, rc2 = table_cleaner(page, blocks, page.layout_information[i]) + if rc1: + if not rc2: + page.layout_information[i] = rc1 + else: + page.layout_information[i] = rc2 + page.layout_information.insert(i, rc1) + i += 1 + return + + +def clean_pictures(page, blocks): + """Extend picture / formula / table bboxes. + + Join layout boxes with intersecting text, image, vectors. + + 'blocks' is the TextPage.extractDict()["blocks"] list. 
+ """ + # all layout boxes + all_bboxes = [pymupdf.Rect(b[:4]) for b in page.layout_information] + + for i in range(len(all_bboxes)): + if page.layout_information[i][4] not in ("picture", "formula", "table"): + # no eligible layout box + continue + + # get its Rect object + bbox = pymupdf.Rect(page.layout_information[i][:4]) + for b in blocks: + if b["type"] not in (0, 1, 3): + continue + block_bbox = pymupdf.IRect(b["bbox"]) + if b["type"] == 3 and block_bbox.is_empty: + block_bbox += (-1, -1, 1, 1) + if bbox.intersects(block_bbox) and not any( + bb.intersects(block_bbox) for j, bb in enumerate(all_bboxes) if j != i + ): + bbox |= block_bbox + page.layout_information[i] = list(bbox) + [page.layout_information[i][4]] + + +def add_image_orphans(page, blocks): + """Add orphan images as layout boxes of class 'picture'. + + 'blocks' is the TextPage.extractDict()["blocks"] list. + """ + # all layout boxes + all_bboxes = [pymupdf.Rect(b[:4]) for b in page.layout_information] + area_limit = abs(page.rect) * 0.9 + images = [] + for img in page.get_image_info(): + r = page.rect & img["bbox"] + if r.is_empty or abs(r) >= area_limit: + continue + images.append(r) + + paths = [] + for b in blocks: + if b["type"] != 3: + continue + r = page.rect & b["bbox"] + if abs(r) >= area_limit: + continue + if r.width < 3 and r.height < 3: + continue + r_low_limit = 0.1 * abs(r) + r_hi_limit = 0.8 * abs(r) + + # ignore vectors that significantly overlap layout bboxes + if any(abs(r & bb) > min(r_low_limit, abs(bb) * 0.1) for bb in all_bboxes): + continue + # ignore vectors that are mostly covered by images + if any(abs(r & i) > r_hi_limit for i in images): + continue + paths.append({"rect": r}) + + # make vector clusters, select only sufficiently large ones + vectors = page.cluster_drawings(drawings=paths, x_tolerance=20, y_tolerance=20) + vectors = [v for v in vectors if v.width > 30 and v.height > 30] + + # resolve mutual containment of images and vectors + imgs = sorted(images + 
vectors, key=lambda r: abs(r), reverse=True) + + filtered_imgs = [] + for r in imgs: + if not any(r in fr for fr in filtered_imgs): + filtered_imgs.append(r) + + for r in filtered_imgs: + # add picture orphans that do not significantly overlap layout boxes + if not any(abs(r & bbox) > 0.1 * min(abs(r), abs(bbox)) for bbox in all_bboxes): + page.layout_information.append(list(r) + ["picture"]) + all_bboxes.append(r) + return + + +""" +Determine reading order of layout boxes on a document page. + +Layout boxes are defined as classified bounding boxes, with class info as +provided by pymupdf_layout. Each box is a tuple (x0, y0, x1, y1, "class"). + +The main function is "find_reading_order()". +""" + + +def cluster_stripes(boxes, vertical_gap: float = 12): + """ + Divide page into horizontal stripes based on vertical gaps. + + Args: + boxes (list): List of bounding boxes, each defined as (x0, y0, x1, y1). + vertical_gap (float): Minimum vertical gap to separate stripes. + + Returns: + List of disjoint horizontal stripes. Each stripe is a list of boxes. + """ + # Sort top to bottom + sorted_boxes = sorted(boxes, key=lambda b: b[1]) + stripes = [] + if not sorted_boxes: + return stripes + current_stripe = [sorted_boxes[0]] + + for box in sorted_boxes[1:]: + prev_bottom = max(b[3] for b in current_stripe) + if box[1] - prev_bottom > vertical_gap: + stripes.append(current_stripe) + current_stripe = [box] + else: + current_stripe.append(box) + + stripes.append(current_stripe) + return stripes + + +def cluster_columns_in_stripe(stripe: list): + """ + Within a stripe, group boxes into columns based on horizontal proximity. + + Args: + stripe (list): List of boxes within a stripe. + + Returns: + list: List of columns, each column is a list of boxes. 
+ """ + # Sort left to right + sorted_boxes = sorted(stripe, key=lambda b: b[0]) + columns = [] + current_column = [sorted_boxes[0]] + + for box in sorted_boxes[1:]: + prev_right = max([b[2] for b in current_column]) + if box[0] - prev_right >= -1: + columns.append(sorted(current_column, key=lambda b: b[3])) + current_column = [box] + else: + current_column.append(box) + + columns.append(sorted(current_column, key=lambda b: b[3])) + return columns + + +def compute_reading_order(boxes, vertical_gap: float = 12): + """ + Compute reading order of boxes delivered by PyMuPDF-Layout. + + Args: + boxes (list): List of bounding boxes. + vertical_gap (float): Minimum vertical gap to separate stripes. + + Returns: + list: List of boxes in reading order. + """ + # compute adequate vertical_gap based on height of the union of bboxes + temp = pymupdf.EMPTY_RECT() + for b in boxes: + temp |= pymupdf.Rect(b[:4]) + this_vertical_gap = vertical_gap * temp.height / 800 + stripes = cluster_stripes(boxes, vertical_gap=this_vertical_gap) + ordered = [] + for stripe in stripes: + columns = cluster_columns_in_stripe(stripe) + for col in columns: + ordered.extend(col) + return ordered + + +def find_reading_order(boxes, vertical_gap: float = 12) -> list: + """Given page layout information, return the boxes in reading order. + + Args: + boxes: List of classified bounding boxes with class info as defined + by pymupdf_layout: (x0, y0, x1, y1, "class"). + vertical_gap: Minimum vertical gap to separate stripes. The default + value of 12 works well for most documents. + + Returns: + List of boxes in reading order. 
+ """ + + def is_contained(inner, outer) -> bool: + """Check if inner box is fully contained within outer box.""" + return ( + 1 + and outer[0] <= inner[0] + and outer[1] <= inner[1] + and outer[2] >= inner[2] + and outer[3] >= inner[3] + and inner != outer + ) + + def filter_contained(boxes) -> list: + """Remove boxes that are fully contained within another box.""" + # Sort boxes by descending area + sorted_boxes = sorted( + boxes, key=lambda r: (r[2] - r[0]) * (r[3] - r[1]), reverse=True + ) + result = [] + for r in sorted_boxes: + if not any(is_contained(r, other) for other in result): + result.append(r) + return result + + """ + We expect being passed raw 'layout_information' as provided by + pymupdf_layout. We separate page headers and footers from the + body, bring body boxes into reading order and concatenate the final list. + """ + filtered = filter_contained(boxes) # remove nested boxes first + page_headers = [] # for page headers + page_footers = [] # for page footers + body_boxes = [] # for main body boxes + + # separate boxes by type + for box in filtered: + x0, y0, x1, y1, bclass = box + if bclass == "page-header": + page_headers.append(box) + elif bclass == "page-footer": + page_footers.append(box) + else: + body_boxes.append(box) + + # bring body into reading order + ordered = compute_reading_order(body_boxes, vertical_gap=vertical_gap) + + # Final full boxes list. We do simple sorts for non-body boxes. 
+ final = ( + sorted(page_headers, key=lambda r: (r[1], r[0])) + + ordered + + sorted(page_footers, key=lambda r: (r[1], r[0])) + ) + return final + + +def simplify_vectors(vectors): + new_vectors = [] + if not vectors: + return new_vectors + new_vectors = [vectors[0]] + for v in vectors[1:]: + last_v = new_vectors[-1] + if ( + 1 + and abs(v["bbox"][1] - last_v["bbox"][1]) < 1 + and abs(v["bbox"][3] - last_v["bbox"][3]) < 1 + and v["bbox"][0] <= last_v["bbox"][2] + 1 + ): + # merge horizontally + new_bbox = [ + min(v["bbox"][0], last_v["bbox"][0]), + min(v["bbox"][1], last_v["bbox"][1]), + max(v["bbox"][2], last_v["bbox"][2]), + max(v["bbox"][3], last_v["bbox"][3]), + ] + last_v["bbox"] = new_bbox + else: + new_vectors.append(v) + return new_vectors + + +def find_virtual_lines(page, table_bbox, words, vectors, link_rects): + """Return virtual lines for a given table bbox.""" + + def make_vertical(table_bbox, line_bbox, word_boxes): + # default top and bottom point of vertical line + top = line_bbox.tl - (2, 0) + bottom = pymupdf.Point(top.x, table_bbox.y1) + + # check if this cuts through any word boxes below and adjust bottom y + my_wboxes = sorted( + [ + wr + for wr in word_boxes + if wr.y0 >= top.y and wr.y1 <= bottom.y and wr.x0 < top.x < wr.x1 + ], + key=lambda r: r.y1, + ) + if my_wboxes: # if so, adjust bottom y + bottom.y = my_wboxes[0].y0 + + # same check above + my_wboxes = sorted( + [ + wr + for wr in word_boxes + if wr.y0 >= table_bbox.y0 and wr.y1 <= top.y and wr.x0 < top.x < wr.x1 + ], + key=lambda r: r.y1, + ) + if my_wboxes: # if so, adjust top y + top.y = my_wboxes[-1].y1 + else: # else we can start at top of table + top.y = table_bbox.y0 + + # extender = [((table_bbox.x0, top.y), (table_bbox.x1, top.y)), (top, bottom)] + extender = [(top, bottom)] + return extender + + word_boxes = sorted( + [ + pymupdf.Rect(w[:4]) + for w in words + if (w[3] - w[1]) > 5 and table_bbox.contains(w[:4]) + ], + key=lambda r: r.y1, + ) + + all_lines = [] + all_boxes = 
[] + for v in vectors: + vbbox = pymupdf.Rect(v["bbox"]).normalize() + vbbox += (0, -0.5, 0, 0.5) # expand vertically a bit + vbbox &= table_bbox + if vbbox.is_empty: + continue + if not v["stroked"] and vbbox.height >= 5 and vbbox.width > 20: + all_lines.append((vbbox.tl, vbbox.tr)) + all_lines.append((vbbox.bl, vbbox.br)) + continue + if ( + vbbox.width > 20 + and vbbox.height <= 3 + and not any(vbbox.intersects(lr) for lr in link_rects) + ): # horizontal line + lines = make_vertical(table_bbox, vbbox, word_boxes) + for line in lines: + all_lines.append(line) + + return all_lines, all_boxes + + +def complete_table_structure(page): + """Add virtual lines for "table" layout bboxes + + Iterate through all "table" layout boxes on the page's layout_information + and return virtual lines and boxes that can help detect table structures. + + Returns: + lists of virtual lines and boxes for the page's TableFinder. + """ + all_lines = [] + all_boxes = [] + textpage = page.get_textpage( + flags=pymupdf.TEXT_ACCURATE_BBOXES + | pymupdf.TEXT_COLLECT_VECTORS + | pymupdf.TEXT_COLLECT_STYLES + ) + words = page.get_text("words", textpage=textpage) + vectors = sorted( + [b for b in textpage.extractDICT()["blocks"] if b["type"] == 3 and b["isrect"]], + key=lambda v: (v["bbox"][3], v["bbox"][0]), + ) + vectors = simplify_vectors(vectors) + link_rects = [l["from"] for l in page.get_links()] + for b in page.layout_information: + if b[-1] != "table": + continue + table_bbox = pymupdf.Rect(b[:4]) + all_boxes.append(table_bbox) + lines, boxes = find_virtual_lines( + page, + table_bbox, + words, + vectors, + link_rects, + ) + all_lines.extend(lines) + all_boxes.extend(boxes) + + return all_lines, all_boxes + + +def extract_cells(textpage, cell, markdown=False): + """Extract text from a rect-like 'cell' as plain or MD styled text. + + This function should ultimately be used to extract text from a table cell. 
+ Markdown output will only work correctly if extraction flag bit + TEXT_COLLECT_STYLES is set. + + Args: + textpage: A PyMuPDF TextPage object. Must have been created with + TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES. + cell: A tuple (x0, y0, x1, y1) defining the cell's bbox. + markdown: If True, return text formatted for Markdown. + + Returns: + A string with the text extracted from the cell. + """ + text = "" + for block in textpage.extractRAWDICT()["blocks"]: + if block["type"] != 0: + continue + for line in block["lines"]: + new_line = True + if text: # must be a new line in the cell + if text.endswith("$"): + text += " " + elif text.endswith("$ "): + pass + else: + text += "
" if markdown else "\n" + + # strikeout detection only works with horizontal text + horizontal = line["dir"] == (0, 1) or line["dir"] == (1, 0) + + for span in line["spans"]: + sbbox = span["bbox"] + # only include chars with more than 50% bbox overlap + span_text = "" + for char in span["chars"]: + this_char = char["c"] + bbox = pymupdf.Rect(char["bbox"]) + if abs(bbox & cell) > 0.5 * abs(bbox): + span_text += this_char + elif this_char in white_spaces: + span_text += " " + + if not span_text: + continue # skip empty span + + if not markdown: # no MD styling + text += span_text + continue + + prefix = "" + suffix = "" + if horizontal and span["char_flags"] & pymupdf.mupdf.FZ_STEXT_STRIKEOUT: + prefix += "~~" + suffix = "~~" + suffix + if span["char_flags"] & pymupdf.mupdf.FZ_STEXT_BOLD: + prefix += "**" + suffix = "**" + suffix + if span["flags"] & pymupdf.TEXT_FONT_ITALIC: + prefix += "_" + suffix = "_" + suffix + if span["flags"] & pymupdf.TEXT_FONT_MONOSPACED: + prefix += "`" + suffix = "`" + suffix + + if len(span["chars"]) > 2: + span_text = span_text.rstrip() + + # if span continues previous styling: extend cell text + if (ls := len(suffix)) and text.endswith(suffix): + text = text[:-ls] + span_text + suffix + else: # append the span with new styling + if not span_text.strip(): + text += " " + else: + text += prefix + span_text + suffix + + return text.strip() + + +def table_to_markdown(textpage, table_item, markdown=True): + output = "" + table = table_item.table + row_count = table["row_count"] + col_count = table["col_count"] + cell_boxes = table["cells"] + # make empty cell text list + cells = [[None for i in range(col_count)] for j in range(row_count)] + + # fill None cells with extracted text + # for rows, copy content from left to right + for j in range(row_count): + for i in range(col_count - 1): + if cells[j][i + 1] is None: + cells[j][i + 1] = cells[j][i] + + # for columns, copy top to bottom + for i in range(col_count): + for j in range(row_count 
- 1): + if cells[j + 1][i] is None: + cells[j + 1][i] = cells[j][i] + + for i, row in enumerate(cell_boxes): + for j, cell in enumerate(row): + if cell is not None: + cells[i][j] = extract_cells( + textpage, cell_boxes[i][j], markdown=markdown + ) + for i, name in enumerate(cells[0]): + if name is None: + if i > 0: + cells[0][i] = cells[0][i - 1] + else: + cells[0][i] = "" + + header = "|" + "|".join(cells[0]) + "|\n" + output += header + # insert GitHub header line separator + output += "|" + "|".join("---" for i in range(col_count)) + "|\n" + + # skip first row in details if header is part of the table + j = 1 # if self.header.external else 1 + + # iterate over detail rows + for row in cells[j:]: + line = "|" + for i, cell in enumerate(row): + # replace None cells with empty string + # use HTML line break tag + if cell is None: + cell = "" + line += cell + "|" + line += "\n" + output += line + return output + "\n" diff --git a/pymupdf4llm/pymupdf4llm/versions_file.py b/pymupdf4llm/pymupdf4llm/versions_file.py index 5d422f48..e4963f8a 100644 --- a/pymupdf4llm/pymupdf4llm/versions_file.py +++ b/pymupdf4llm/pymupdf4llm/versions_file.py @@ -1,3 +1,3 @@ # Generated file - do not edit. -MINIMUM_PYMUPDF_VERSION = (1, 26, 3) -VERSION = '0.0.27' +MINIMUM_PYMUPDF_VERSION = (1, 26, 6) +VERSION = '0.1.7' diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py index 7051a8c7..7e435093 100644 --- a/pymupdf4llm/setup.py +++ b/pymupdf4llm/setup.py @@ -14,8 +14,8 @@ "Topic :: Utilities", ] -version = "0.0.28" -requires = ["pymupdf>=1.26.3"] +version = "0.1.7" +requires = ["pymupdf>=1.26.6", "tabulate"] text = requires[0].split("=")[1] text = tuple(map(int, text.split("."))) @@ -32,7 +32,7 @@ long_description=readme, long_description_content_type="text/markdown", install_requires=requires, - python_requires=">=3.9", + python_requires=">=3.10", license="Dual Licensed - GNU AFFERO GPL 3.0 or Artifex Commercial License", url="https://github.com/pymupdf/RAG", classifiers=classifiers,