diff --git a/CHANGES.md b/CHANGES.md index 11325052..7e0d5289 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,15 +1,53 @@ # Change Log -## Changes in version 0.0.28 +## Changes in version 0.2.1 ### Fixes: -* [xxx](https://github.com/pymupdf/RAG/issues/xxx) - +* [320](https://github.com/pymupdf/RAG/issues/320) - [Bug] ValueError: min() iterable argument is empty ... +* [319](https://github.com/pymupdf/RAG/issues/319) - [Bug] ValueError: min() arg is an empty sequence ### Other Changes: -* xxx +* OCR invocation now differentiates between full-page OCR and text-only OCR: If the page does contain text but the percentage of unreadable characters exceeds a certain threshold (90%), we only OCR the bounding boxes of text spans and replace span text with OCR'ed text where necessary. +------ + +## Changes in version 0.2.0 + +This version introduces full support of the [PyMuPDF-Layout](https://pypi.org/project/pymupdf-layout/) package. This entails a radically new approach for detecting the layout of document pages using the AI-based features of the layout package. + +Improvements include: + +* Greatly improved table detection +* Support of list item hierarchy levels +* Detection of page headers and footers +* Improved detection of text paragraphs, titles and section headers +* New output options beyond Markdown: plain text and JSON +* Automatically detect whether a page needs OCR and invoke Tesseract if both Tesseract is installed and OpenCV (package [opencv-python](https://pypi.org/project/opencv-python/)) is available. Invocation criteria include absence of readable text, full-page coverage with images, and the presence of many character-sized vector graphics. + +The PyMuPDF-Layout package is not open-source and has its own license, which is different from PyMuPDF4LLM. 
It also depends on a number of other, fairly large packages like [onnxruntime](https://pypi.org/project/onnxruntime/), [numpy](https://pypi.org/project/numpy/), [sympy](https://pypi.org/project/sympy/) and [OpenCV](https://pypi.org/project/opencv-python/), each of which in turn has its own dependencies. + +We therefore keep the use of the layout feature optional. To activate PyMuPDF-Layout support, the following import statement **_must be included before_** importing PyMuPDF4LLM itself: + +```python +import pymupdf.layout +import pymupdf4llm +``` + +Thereafter, PyMuPDF's namespace is available. The existing method `pymupdf4llm.to_markdown()` then automatically uses the AI-based layout detection. +In addition, two new methods become available: +* `pymupdf4llm.to_text()` - which works much like markdown output but produces plain text. +* `pymupdf4llm.to_json()` - which outputs the document's metadata and the selected pages in JSON format. + +### Fixes: + + +### Other Changes: + +* If `show_progress=True`, Python package [tqdm](https://pypi.org/project/tqdm/) is automatically used when available to display a progress bar. If tqdm is not installed, our own text-based progress bar is used. 
+ +------ ## Changes in version 0.0.27 diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py index ba82a62c..2e434a8b 100644 --- a/pdf4llm/setup.py +++ b/pdf4llm/setup.py @@ -6,7 +6,7 @@ with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f: readme = f.read() -version = "0.2.0" +version = "0.2.1" classifiers = [ "Development Status :: 5 - Production/Stable", diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py index 8ca76f53..d8139ddc 100644 --- a/pymupdf4llm/pymupdf4llm/__init__.py +++ b/pymupdf4llm/pymupdf4llm/__init__.py @@ -26,6 +26,9 @@ def parse_document( image_format="png", image_path="", pages=None, + output_images=True, + show_progress=False, + force_text=True, ): return DL.parse_document( doc, @@ -34,6 +37,9 @@ def parse_document( image_format=image_format, image_path=image_path, pages=pages, + output_images=output_images, + show_progress=show_progress, + force_text=force_text, ) def to_markdown( @@ -75,6 +81,9 @@ def to_markdown( image_format=image_format, image_path=image_path, pages=pages, + output_images=embed_images or write_images, + show_progress=show_progress, + force_text=force_text, ) return parsed_doc.to_markdown( header=header, @@ -82,6 +91,7 @@ def to_markdown( write_images=write_images, embed_images=embed_images, ignore_code=ignore_code, + show_progress=show_progress, ) def to_json( @@ -92,6 +102,9 @@ def to_json( image_format="png", image_path="", pages=None, + output_images=False, + show_progress=False, + force_text=True, ): parsed_doc = parse_document( doc, @@ -99,6 +112,9 @@ def to_json( image_format=image_format, image_path=image_path, pages=pages, + output_images=output_images, + show_progress=show_progress, + force_text=force_text, ) return parsed_doc.to_json() @@ -109,6 +125,8 @@ def to_text( footer=True, pages=None, ignore_code=False, + show_progress=False, + force_text=True, ): parsed_doc = parse_document( doc, @@ -117,11 +135,15 @@ def to_text( image_format="png", image_path="", 
pages=pages, + output_images=False, + show_progress=show_progress, + force_text=force_text, ) return parsed_doc.to_text( header=header, footer=footer, ignore_code=ignore_code, + show_progress=show_progress, ) diff --git a/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py b/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py index f9ad27f8..37c658e6 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py +++ b/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py @@ -1,28 +1,42 @@ -import pymupdf # PyMuPDF -import numpy as np import cv2 +import numpy as np +import pymupdf # PyMuPDF +from pymupdf4llm.helpers.utils import WHITE_CHARS -WHITE_CHARS = set( - [chr(i) for i in range(33)] - + [ - "\u00a0", # Non-breaking space - "\u2000", # En quad - "\u2001", # Em quad - "\u2002", # En space - "\u2003", # Em space - "\u2004", # Three-per-em space - "\u2005", # Four-per-em space - "\u2006", # Six-per-em space - "\u2007", # Figure space - "\u2008", # Punctuation space - "\u2009", # Thin space - "\u200a", # Hair space - "\u202f", # Narrow no-break space - "\u205f", # Medium mathematical space - "\u3000", # Ideographic space - ] -) +def get_tessocr(page, bbox, dpi=300): + """Return OCR-ed span text using Tesseract. + + Args: + page: pymupdf Page + bbox: pymupdf Rect or its sequence + dpi: resolution for OCR image + Returns: + The OCR-ed text of the bbox. + """ + # Step 1: Make a high-resolution image of the bbox. 
+ pix = page.get_pixmap(dpi=dpi, clip=bbox) + ocrpdf = pymupdf.open("pdf", pix.pdfocr_tobytes()) + ocrpage = ocrpdf[0] + text = ocrpage.get_text() + text = text.replace("\n", " ").strip() + return text + + +def repair_blocks(input_blocks, page): + repaired_blocks = [] + for block in input_blocks: + if block["type"] == 0: # text block + for line in block["lines"]: + for span in line["spans"]: + if not chr(0xFFFD) in span["text"]: + continue + text = get_tessocr(page, span["bbox"]) + span["text"] = text + repaired_blocks.append(block) + else: + repaired_blocks.append(block) + return repaired_blocks def detect_qr_codes(img): @@ -152,23 +166,38 @@ def should_ocr_page( # Check for text text = page.get_text(flags=0) decision["has_text"] = not WHITE_CHARS.issuperset(text) - if decision["has_text"]: - not_readable_count = len([c for c in text if c == chr(0xFFFD)]) - readability = 1 - not_readable_count / len(text) - decision["readable_text"] = readability >= text_readability_thresh all_text_bboxes = [b for b in page.get_bboxlog() if "text" in b[0]] ocr_text_bboxes = [b for b in all_text_bboxes if b[0] == "ignore-text"] decision["has_ocr_text"] = bool(ocr_text_bboxes) + + if decision["has_text"]: + unreadable_count = len([c for c in text if c == chr(0xFFFD)]) + readability = 1 - unreadable_count / len(text) + decision["readable_text"] = readability >= text_readability_thresh + + if decision["has_text"] and not decision["readable_text"]: + decision["should_ocr"] = True + decision["image"], decision["transform"], decision["pixmap"] = get_page_image( + page, dpi=dpi + ) + + if decision["has_text"]: + # early exit if any text exists + print( + f"{decision['has_text']=}, {decision['readable_text']=}, {decision['should_ocr']=}" + ) + return decision + # Check for image coverage - image_rects=[page_rect&img["bbox"] for img in page.get_image_info()] - image_rect=pymupdf.EMPTY_RECT() + image_rects = [page_rect & img["bbox"] for img in page.get_image_info()] + image_rect = 
pymupdf.EMPTY_RECT() for r in image_rects: - image_rect|=r - image_area=abs(image_rect) + image_rect |= r + image_area = abs(image_rect) if image_area: images_cover = image_area / page_area - else: + else: images_cover = 0.0 decision["image_covers_page"] = images_cover >= image_coverage_thresh @@ -189,16 +218,11 @@ def should_ocr_page( # Final decision if ( - 1 - and not decision["has_text"] - and not decision["readable_text"] - and ( - 0 - or decision["image_covers_page"] - or decision["has_vector_drawings"] - or decision["edge_density"] > edge_thresh - ) + 0 + or decision["image_covers_page"] + or decision["has_vector_drawings"] + or decision["edge_density"] > edge_thresh ): decision["should_ocr"] = True - + return decision diff --git a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py index 22ca33a3..a69f5dfe 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py +++ b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py @@ -1,15 +1,19 @@ import base64 import json import os -from binascii import b2a_base64 from dataclasses import dataclass -from typing import Dict, List, Optional, Union from pathlib import Path +from typing import Dict, List, Optional, Union + import pymupdf import tabulate -from pymupdf4llm.helpers.get_text_lines import get_raw_lines from pymupdf4llm.helpers import utils +from pymupdf4llm.helpers.get_text_lines import get_raw_lines +try: + from tqdm import tqdm as ProgressBar +except ImportError: + from pymupdf4llm.helpers.progress import ProgressBar try: import cv2 from pymupdf4llm.helpers import check_ocr @@ -33,7 +37,7 @@ def omit_if_pua_char(text): """Check if character is in the Private Use Area (PUA) of Unicode.""" - if len(text) > 1: + if len(text) > 1: # only single characters are checked return text o = ord(text) if ( @@ -46,8 +50,7 @@ def omit_if_pua_char(text): def create_list_item_levels(layout_info): - """Create a dictionary that maps the box number of each list-item to - 
its hierarchy level. + """Map the layout box number of each list-item to its hierarchy level. Args: layout_info (list): the bbox list "page.layout_information" @@ -59,10 +62,10 @@ def create_list_item_levels(layout_info): segment = [] # current segment # Create segments of contiguous list items. Each non-list-item finishes - # the current segment. Also, if two list-items belong to different page - # text columns ends a segment. + # the current segment. Also, two list-items in a row belonging to different + # page text columns end the segment after the first item. for i, item in enumerate(layout_info): - if item.boxclass != "list-item": # bbox class is not list-item + if item.boxclass != "list-item": # bbox class is not a list-item if segment: # end and save the current segment segments.append(segment) segment = [] @@ -83,7 +86,7 @@ def create_list_item_levels(layout_info): # walk through segments and assign levels for i, s in enumerate(segments): - if not s: + if not s: # skip empty segments continue s.sort(key=lambda x: x[1].x0) # sort by x0 coordinate of the bbox @@ -119,16 +122,15 @@ def is_monospaced(textlines): def is_superscripted(line): spans = line["spans"] + line_bbox = line["bbox"] if not spans: return False - if spans[0]["flags"] & 1: # check for superscript + span0 = spans[0] + if span0["flags"] & 1: # check for superscript flag return True - if len(spans) < 2: + if len(spans) < 2: # single span line: skip return False - if ( - spans[0]["origin"][1] < spans[1]["origin"][1] - and spans[0]["size"] < spans[1]["size"] - ): + if span0["origin"][1] < spans[1]["origin"][1] and span0["size"] < spans[1]["size"]: return True return False @@ -138,18 +140,18 @@ def get_plain_text(spans): Parameter is a list of span dictionaries. The spans may come from one or more original "textlines" items. Returns the text string of the boundary box. 
- The text string always ends with the suffix and a space """ output = "" for i, s in enumerate(spans): - span_text = s["text"].strip() # remove leading/trailing spaces superscript = s["flags"] & 1 - span_text = s["text"].strip() + span_text = s["text"].strip() # remove leading/trailing spaces if superscript: + # enclose superscripted text in brackets if first span if i == 0: span_text = f"[{span_text}] " elif output.endswith(" "): output = output[:-1] + # resolve hyphenation if output.endswith("- ") and len(output.split()[-1]) > 2: output = output[:-2] output += span_text + " " @@ -160,7 +162,7 @@ def list_item_to_text(textlines, level): """ Convert "list-item" bboxes to text. """ - indent = " " * (level - 1) + indent = " " * (level - 1) # indentation based on level output = indent line = textlines[0] x0 = line["bbox"][0] # left of first line @@ -195,11 +197,13 @@ def footnote_to_text(textlines): """ Convert "footnote" bboxes to text. """ + # we render footnotes as blockquotes output = "> " line = textlines[0] spans = line["spans"] for line in textlines[1:]: + # superscripted line starts a new footnote line if is_superscripted(line): line_output = get_plain_text(spans) output += line_output @@ -214,7 +218,10 @@ def footnote_to_text(textlines): def code_block_to_text(textlines): - """Output a code block in plain text format.""" + """Output a code block in plain text format. + + Basic difference is that lines are separated by line breaks. + """ output = "" for line in textlines: line_text = "" @@ -228,15 +235,17 @@ def code_block_to_text(textlines): def text_to_text(textlines, ignore_code: bool = False): """ - Convert "text" bboxes to plain text, as well as other boxclasses + Convert "text" bboxes to plain text, as well as boxclasses not specifically handled elsewhere. - The line text is written without line breaks. At the end, - two newlines are added to separate from the next block. + The text of all spans of all lines is written without line breaks. 
+ At the end, two newlines are added to separate from the next block. """ if not textlines: return "" if is_superscripted(textlines[0]): # check for superscript + # handle mis-classified text boundary box return footnote_to_text(textlines) + # handle completely mnonospaced textlines as code block if not ignore_code and is_monospaced(textlines): return code_block_to_text(textlines) @@ -249,6 +258,47 @@ def text_to_text(textlines, ignore_code: bool = False): return output + "\n\n" +def picture_text_to_text(textlines, ignore_code: bool = False, clip=None): + """ + Convert text extracted from images to plain text format. + """ + output = "----- Start of picture text -----\n" + for tl in textlines: + line_text = " ".join([s["text"] for s in tl["spans"]]) + output += line_text.rstrip() + "\n" + output += "----- End of picture text -----\n" + return output + "\n" + + +def fallback_text_to_text(textlines, ignore_code: bool = False, clip=None): + """Convert text extracted from unrecognized tables. + + We hope for some sort of table structure being present in the text spans: + The maximum span count in the lines is assumed to equal column count. + """ + span_count = max(len(tl["spans"]) for tl in textlines) + lines = [] + output = "" + for tl in textlines: + spans = tl["spans"] + # prepare a row with empty strings in each cell + line = [""] * span_count + if len(spans) < span_count and spans[0]["bbox"][0] > clip[0] + 10: + i = 1 + else: + i = 0 + for j, s in enumerate(spans, start=i): + line[j] = s["text"].strip() + lines.append(line) + tab_text = tabulate.tabulate( + lines, + tablefmt="grid", + maxcolwidths=int(100 / span_count), + ) + output += tab_text + "\n" + return output + "\n" + + def get_styled_text(spans): """Output text with markdown style codes based on font properties. Parameter is a list of span dictionaries. 
The spans may come from @@ -325,7 +375,7 @@ def list_item_to_md(textlines, level): This post-layout heuristics helps cover cases where more than one list item is contained in a single bbox. """ - indent = " " * (level - 1) + indent = " " * (level - 1) # indentation based on level line = textlines[0] x0 = line["bbox"][0] # left of first line spans = line["spans"] @@ -337,6 +387,7 @@ def list_item_to_md(textlines, level): starter = "1. " if not omit_if_pua_char(span0["text"].strip()): + # bullet was a PUA char: remove it spans.pop(0) if spans: x0 = spans[0]["bbox"][0] @@ -452,6 +503,33 @@ def text_to_md(textlines, ignore_code: bool = False): return output + "\n\n" +def picture_text_to_md(textlines, ignore_code: bool = False, clip=None): + """ + Convert text extracted from images to markdown format. + """ + output = "**----- Start of picture text -----**
\n" + for tl in textlines: + line_text = " ".join([s["text"] for s in tl["spans"]]) + output += line_text.rstrip() + "
" + output += "**----- End of picture text -----**
\n" + return output + "\n\n" + + +def fallback_text_to_md(textlines, ignore_code: bool = False, clip=None): + """ + Convert text of a table whose structure was not detected to a markdown table. + """ + span_count = max(len(tl["spans"]) for tl in textlines) + output = "**----- Start of picture text -----**
\n" + output += "|" * (span_count + 1) + "\n" + output += "|" + "|".join(["---"] * span_count) + "|\n" + for tl in textlines: + ltext = "|" + "|".join([s["text"].strip() for s in tl["spans"]]) + "|\n" + output += ltext + output += "**----- End of picture text -----**
\n" + return output + "\n\n" + + @dataclass class LayoutBox: x0: float @@ -502,42 +580,68 @@ def to_markdown( write_images: bool = False, embed_images: bool = False, ignore_code: bool = False, + show_progress: bool = False, ) -> str: """ Serialize ParsedDocument to markdown text. """ output = "" - for page in self.pages: + if show_progress and len(self.pages) > 5: + print(f"Generating markdown text...") + this_iterator = ProgressBar(self.pages) + else: + this_iterator = self.pages + for page in this_iterator: - # make mapping: box number to list item level + # Make a mapping: box number -> list item hierarchy level list_item_levels = create_list_item_levels(page.boxes) + for i, box in enumerate(page.boxes): clip = pymupdf.IRect(box.x0, box.y0, box.x1, box.y1) btype = box.boxclass + + # skip headers/footers if requested if btype == "page-header" and header is False: continue if btype == "page-footer" and footer is False: continue - if btype in ("picture", "formula") and box.image: - img_filename = f"{self.filename}-{page.page_number:04d}-{i:02d}.{self.image_format}" - if write_images: - filename = os.path.basename(self.filename).replace(" ", "-") - image_filename = os.path.join( - self.image_path, - f"{filename}-{page.page_number:04d}-{i:02d}.{self.image_format}", - ) - Path(image_filename).write_bytes(box.image) - - output += GRAPHICS_TEXT % img_filename - - elif embed_images: - # make a base64 encoded string of the image - data = b2a_base64(box.image).decode() - data = f"data:image/{self.image_format};base64," + data - output += GRAPHICS_TEXT % data + "\n\n" + # pictures and formulas: either write image file or embed + if btype in ("picture", "formula", "fallback"): + if box.image: + if write_images: + img_filename = f"{self.filename}-{page.page_number:04d}-{i:02d}.{self.image_format}" + filename = os.path.basename(self.filename).replace(" ", "-") + image_filename = os.path.join( + self.image_path, + 
f"{filename}-{page.page_number:04d}-{i:02d}.{self.image_format}", + ) + Path(image_filename).write_bytes(box.image) + + output += GRAPHICS_TEXT % img_filename + + elif embed_images: + # make a base64 encoded string of the image + data = base64.b64encode(box.image).decode() + data = f"data:image/{self.image_format};base64," + data + output += GRAPHICS_TEXT % data + "\n\n" else: - output += f"**==> {btype} [{clip.width} x {clip.height}] intentionally omitted <==**\n\n" + output += f"**==> picture [{clip.width} x {clip.height}] intentionally omitted <==**\n\n" + + # output text in image if requested + if box.textlines: + if btype == "picture": + output += picture_text_to_md( + box.textlines, + ignore_code=ignore_code or page.ocrpage, + clip=clip, + ) + elif btype == "fallback": + output += fallback_text_to_md( + box.textlines, + ignore_code=ignore_code or page.ocrpage, + clip=clip, + ) continue if btype == "table": output += box.table["markdown"] + "\n\n" @@ -564,7 +668,7 @@ def to_markdown( return output - def to_json(self) -> str: + def to_json(self, show_progress=False) -> str: # Serialize to JSON class LayoutEncoder(json.JSONEncoder): def default(self, s): @@ -593,13 +697,19 @@ def to_text( header: bool = True, footer: bool = True, ignore_code: bool = False, + show_progress: bool = False, ) -> str: """ Serialize ParsedDocument to plain text. Optionally omit page headers or footers. 
""" # Flatten all text boxes into plain text output = "" - for page in self.pages: + if show_progress and len(self.pages) > 5: + print(f"Generating plain text ..") + this_iterator = ProgressBar(self.pages) + else: + this_iterator = self.pages + for page in this_iterator: list_item_levels = create_list_item_levels(page.boxes) for i, box in enumerate(page.boxes): clip = pymupdf.IRect(box.x0, box.y0, box.x1, box.y1) @@ -608,8 +718,21 @@ def to_text( continue if btype == "page-footer" and footer is False: continue - if btype in ("picture", "formula"): - output += f"==> {btype} [{clip.width} x {clip.height}] <==\n\n" + if btype in ("picture", "formula", "fallback"): + output += f"==> picture [{clip.width} x {clip.height}] <==\n\n" + if box.textlines: + if btype == "picture": + output += picture_text_to_text( + box.textlines, + ignore_code=ignore_code or page.ocrpage, + clip=clip, + ) + elif btype == "fallback": + output += fallback_text_to_text( + box.textlines, + ignore_code=ignore_code or page.ocrpage, + clip=clip, + ) continue if btype == "table": output += ( @@ -637,6 +760,9 @@ def parse_document( image_format="png", image_path="", pages=None, + show_progress=False, + output_images=True, + force_text=False, ) -> ParsedDocument: if isinstance(doc, pymupdf.Document): mydoc = doc @@ -651,6 +777,7 @@ def parse_document( document.image_format = image_format document.image_path = image_path document.pages = [] + document.force_text = force_text try: reason = "OpenCV not installed" assert cv2 is not None @@ -677,6 +804,9 @@ def parse_document( raise ValueError( "'pages' parameter must be None, int, or a sequence of ints less than page count" ) + if show_progress and len(page_filter) > 5: + print(f"Parsing {len(page_filter)} pages of '{document.filename}'...") + page_filter = ProgressBar(page_filter) for pno in page_filter: page = mydoc.load_page(pno) @@ -687,28 +817,36 @@ def parse_document( decision = {"should_ocr": False} if decision["should_ocr"]: print(f"Performing OCR 
on {page.number=}[{page.number+1}]...") - pix = decision["pixmap"] # retrieve the Pixmap - pdf_data = pix.pdfocr_tobytes() # OCR it - ocr_pdf = pymupdf.open("pdf", pdf_data) # get the OCR'd PDF - ocrpage = ocr_pdf[0] # this is its OCR'd page - # remove everything except the text - ocrpage.add_redact_annot(ocrpage.rect) - ocrpage.apply_redactions( - images=pymupdf.PDF_REDACT_IMAGE_REMOVE, - graphics=pymupdf.PDF_REDACT_LINE_ART_NONE, - text=pymupdf.PDF_REDACT_TEXT_NONE, - ) - # copy text over to original page - page.show_pdf_page(page.rect, ocr_pdf, 0) - ocr_pdf.close() # discard temporary OCR PDF - del ocr_pdf + if not decision.get("has_text"): + pix = decision["pixmap"] # retrieve the Pixmap + pdf_data = pix.pdfocr_tobytes() # OCR it + ocr_pdf = pymupdf.open("pdf", pdf_data) # get the OCR'd PDF + ocrpage = ocr_pdf[0] # this is its OCR'd page + # remove everything except the text + ocrpage.add_redact_annot(ocrpage.rect) + ocrpage.apply_redactions( + images=pymupdf.PDF_REDACT_IMAGE_REMOVE, + graphics=pymupdf.PDF_REDACT_LINE_ART_NONE, + text=pymupdf.PDF_REDACT_TEXT_NONE, + ) + # copy text over to original page + page.show_pdf_page(page.rect, ocr_pdf, 0) + ocr_pdf.close() # discard temporary OCR PDF + del ocr_pdf + textpage = page.get_textpage(flags=FLAGS) + blocks = textpage.extractDICT()["blocks"] + else: + textpage = page.get_textpage(flags=FLAGS) + blocks = textpage.extractDICT()["blocks"] + blocks = check_ocr.repair_blocks(blocks, page) + else: + textpage = page.get_textpage(flags=FLAGS) + blocks = textpage.extractDICT()["blocks"] bboxlog = page.get_bboxlog() - ocrpage = decision["should_ocr"] or ( + ocrpage = ( set([b[0] for b in bboxlog if b[0] == "ignore-text"]) == CHECK_OCR_TEXT ) - textpage = page.get_textpage(flags=FLAGS) - blocks = textpage.extractDICT()["blocks"] page.get_layout() utils.clean_pictures(page, blocks) utils.add_image_orphans(page, blocks) @@ -749,8 +887,23 @@ def parse_document( clip = pymupdf.Rect(box[:4]) if layoutbox.boxclass in 
("picture", "formula"): - pix = page.get_pixmap(clip=clip, dpi=document.image_dpi) - layoutbox.image = pix.tobytes(document.image_format) + if output_images: + pix = page.get_pixmap(clip=clip, dpi=document.image_dpi) + layoutbox.image = pix.tobytes(document.image_format) + else: + layoutbox.image = None + if layoutbox.boxclass == "picture" and document.force_text: + # extract any text within the image box + layoutbox.textlines = [ + {"bbox": l[0], "spans": l[1]} + for l in get_raw_lines( + textpage=None, + blocks=pagelayout.fulltext, + clip=clip, + ignore_invisible=not ocrpage, + only_horizontal=False, + ) + ] elif layoutbox.boxclass == "table": # This is either a table detected by native TableFinder or by @@ -791,11 +944,23 @@ def parse_document( ) except Exception as e: - print(f"table detection error '{e}' on page {page.number+1}") + # print(f"table detection error '{e}' on page {page.number+1}") + layoutbox.boxclass = "fallback" # table structure not detected: treat like an image - pix = page.get_pixmap(clip=clip, dpi=document.image_dpi) - layoutbox.image = pix.tobytes(document.image_format) - layoutbox.boxclass = "picture" + if output_images: + pix = page.get_pixmap(clip=clip, dpi=document.image_dpi) + layoutbox.image = pix.tobytes(document.image_format) + else: + layoutbox.image = None + layoutbox.textlines = [ + {"bbox": l[0], "spans": l[1]} + for l in get_raw_lines( + textpage=None, + blocks=pagelayout.fulltext, + clip=clip, + ignore_invisible=not ocrpage, + ) + ] else: # Handle text-like box classes: # Extract text line information within the box. 
diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py index f3ef2c94..4f3cc890 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py +++ b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py @@ -12,17 +12,16 @@ License GNU Affero GPL 3.0 """ -import string import sys import pymupdf +from pymupdf4llm.helpers.utils import WHITE_CHARS -WHITE = set(string.whitespace) TYPE3_FONT_NAME = "Unnamed-T3" def is_white(text): - return WHITE.issuperset(text) + return WHITE_CHARS.issuperset(text) def get_raw_lines( @@ -31,6 +30,7 @@ def get_raw_lines( clip=None, tolerance=3, ignore_invisible=True, + only_horizontal=True, ): """Extract the text spans from a TextPage in natural reading sequence. @@ -124,7 +124,10 @@ def sanitize_spans(line): spans = [] # all spans in TextPage here for bno, b in enumerate(blocks): # the numbered blocks for lno, line in enumerate(b["lines"]): # the numbered lines - if abs(1 - line["dir"][0]) > 1e-3: # only accept horizontal text + line_dir = line["dir"] + if ( + only_horizontal and abs(1 - line_dir[0]) > 1e-3 + ): # only accept horizontal text continue for sno, s in enumerate(line["spans"]): # the numered spans sbbox = pymupdf.Rect(s["bbox"]) # span bbox as a Rect @@ -150,12 +153,13 @@ def sanitize_spans(line): # include line/block numbers to facilitate separator insertion s["line"] = lno s["block"] = bno + s["dir"] = line_dir spans.append(s) if not spans: # no text at all return [] - spans.sort(key=lambda s: s["bbox"].y1) # sort spans by bottom coord + spans.sort(key=lambda s: (-s["dir"][0], s["bbox"].y1)) # sort spans by bottom coord nlines = [] # final result line = [spans[0]] # collects spans with fitting vertical coordinates lrect = spans[0]["bbox"] # rectangle joined from span rectangles diff --git a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py index 4cdd8097..63b966c7 100644 --- 
a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py +++ b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py @@ -60,9 +60,8 @@ License GNU Affero GPL 3.0 """ -import string - import pymupdf +from pymupdf4llm.helpers.utils import WHITE_CHARS pymupdf.TOOLS.unset_quad_corrections(True) @@ -88,11 +87,10 @@ def column_boxes( paths: use these drawings instead of extracting here avoid: ignore text in any of these areas """ - WHITE = set(string.whitespace) def is_white(text): """Check for relevant text.""" - return WHITE.issuperset(text) + return WHITE_CHARS.issuperset(text) def in_bbox(bb, bboxes): """Return 1-based number if a bbox contains bb, else return 0.""" diff --git a/pymupdf4llm/pymupdf4llm/helpers/progress.py b/pymupdf4llm/pymupdf4llm/helpers/progress.py index e71e601a..7cbb2eac 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/progress.py +++ b/pymupdf4llm/pymupdf4llm/helpers/progress.py @@ -29,13 +29,9 @@ def __init__(self, items: List[Any], progress_width: int = 40): self._increment = self._progress_width / self._len if self._len else 1 # Init progress bar - sys.stdout.write( - "[%s] (0/%d)" % (" " * self._progress_width, self._len) - ) + sys.stdout.write("[%s] (0/%d)" % (" " * self._progress_width, self._len)) sys.stdout.flush() - sys.stdout.write( - "\b" * (self._progress_width + len(str(self._len)) + 6) - ) + sys.stdout.write("\b" * (self._progress_width + len(str(self._len)) + 6)) def __iter__(self): return self @@ -61,9 +57,7 @@ def __next__(self): # Update the numerical progress padded_index = str(self._current_index).rjust(self._len_digits) progress_info = f" ({padded_index}/{self._len})" - sys.stdout.write( - "\b" * (self._progress_width + len(progress_info) + 1) - ) + sys.stdout.write("\b" * (self._progress_width + len(progress_info) + 1)) sys.stdout.write("[") sys.stdout.write( "=" * int(self._current_index * self._progress_width / self._len) diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index 
f0c76e6b..d0f2e9aa 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -45,26 +45,17 @@ from pymupdf import mupdf from pymupdf4llm.helpers.get_text_lines import get_raw_lines, is_white from pymupdf4llm.helpers.multi_column import column_boxes -from pymupdf4llm.helpers.progress import ProgressBar +from pymupdf4llm.helpers.utils import BULLETS + +try: + from tqdm import tqdm as ProgressBar +except ImportError: + from pymupdf4llm.helpers.progress import ProgressBar pymupdf.TOOLS.unset_quad_corrections(True) -# Characters recognized as bullets when starting a line. -bullet = tuple( - [ - "- ", - "* ", - "> ", - chr(0xB6), - chr(0xB7), - chr(8224), - chr(8225), - chr(8226), - chr(0xF0A7), - chr(0xF0B7), - ] - + list(map(chr, range(9632, 9680))) -) +# Characters assumed as bullets when starting a line. +bullet = tuple(BULLETS | {"- ", "* ", "> "}) GRAPHICS_TEXT = "\n![](%s)\n" @@ -1116,16 +1107,16 @@ def get_page_output( # layout analysis. Treat whole page as one text block. 
text_rects = [parms.clip] else: - text_rects = column_boxes( - parms.page, - paths=parms.actual_paths, - no_image_text=not force_text, - textpage=parms.textpage, - avoid=parms.tab_rects0 + parms.vg_clusters0, - footer_margin=margins[3], - header_margin=margins[1], - ignore_images=IGNORE_IMAGES, - ) + text_rects = column_boxes( + parms.page, + paths=parms.actual_paths, + no_image_text=not force_text, + textpage=parms.textpage, + avoid=parms.tab_rects0 + parms.vg_clusters0, + footer_margin=margins[3], + header_margin=margins[1], + ignore_images=IGNORE_IMAGES, + ) """ ------------------------------------------------------------------ diff --git a/pymupdf4llm/pymupdf4llm/helpers/utils.py b/pymupdf4llm/pymupdf4llm/helpers/utils.py index 9df5a3e0..f25fadb1 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/utils.py +++ b/pymupdf4llm/pymupdf4llm/helpers/utils.py @@ -1,6 +1,38 @@ import pymupdf -white_spaces = set([chr(i) for i in range(33)]) | {0xA0, 0x2002, 0x2003, 0x2009, 0x202F} +WHITE_CHARS = set( + [chr(i) for i in range(33)] + + [ + "\u00a0", # Non-breaking space + "\u2000", # En quad + "\u2001", # Em quad + "\u2002", # En space + "\u2003", # Em space + "\u2004", # Three-per-em space + "\u2005", # Four-per-em space + "\u2006", # Six-per-em space + "\u2007", # Figure space + "\u2008", # Punctuation space + "\u2009", # Thin space + "\u200a", # Hair space + "\u202f", # Narrow no-break space + "\u205f", # Medium mathematical space + "\u3000", # Ideographic space + ] +) + +BULLETS = set( + [ + chr(0xB6), + chr(0xB7), + chr(0x2020), + chr(0x2021), + chr(0x2022), + chr(0xF0A7), + chr(0xF0B7), + ] + + list(map(chr, range(0x25A0, 0x2600))) +) def table_cleaner(page, blocks, tbbox): @@ -156,19 +188,29 @@ def add_image_orphans(page, blocks): images = [] for img in page.get_image_info(): r = page.rect & img["bbox"] + if r.width <= 3 or r.height <= 3: + continue if r.is_empty or abs(r) >= area_limit: continue images.append(r) paths = [] - for b in blocks: - if b["type"] != 3: - 
continue - r = page.rect & b["bbox"] + vectors = sorted( + [ + page.rect & b["bbox"] + for b in blocks + if b["type"] == 3 + and b["bbox"][3] - b["bbox"][1] > 3 + and b["bbox"][2] - b["bbox"][0] > 3 + ], + key=lambda v: abs(v), + reverse=True, + ) + vectors = vectors[:500] + + for r in vectors: if abs(r) >= area_limit: continue - if r.width < 3 and r.height < 3: - continue r_low_limit = 0.1 * abs(r) r_hi_limit = 0.8 * abs(r) @@ -186,7 +228,7 @@ def add_image_orphans(page, blocks): # resolve mutual containment of images and vectors imgs = sorted(images + vectors, key=lambda r: abs(r), reverse=True) - + imgs = imgs[:500] filtered_imgs = [] for r in imgs: if not any(r in fr for fr in filtered_imgs): @@ -405,27 +447,35 @@ def filter_contained(boxes) -> list: body_boxes.append(box) # compute joined boxes of body - joined_boxes = pymupdf.Rect( - min(b[0] for b in body_boxes), - min(b[1] for b in body_boxes), - max(b[2] for b in body_boxes), - max(b[3] for b in body_boxes), - ) + if not body_boxes: + joined_boxes = pymupdf.EMPTY_RECT() + else: + joined_boxes = pymupdf.Rect( + min(b[0] for b in body_boxes), + min(b[1] for b in body_boxes), + max(b[2] for b in body_boxes), + max(b[3] for b in body_boxes), + ) # extract vectors contained in the TextPage - min_bbox_height = min(b[3] - b[1] for b in body_boxes) - vectors = [ - pymupdf.Rect(b["bbox"]) - for b in blocks - if b["bbox"][3] - b["bbox"][1] >= min_bbox_height and b["bbox"] in joined_boxes - ] - # bring body into reading order - ordered = compute_reading_order( - body_boxes, - joined_boxes, - vectors, - vertical_gap=this_vertical_gap, - ) + if not joined_boxes.is_empty: + min_bbox_height = min(b[3] - b[1] for b in body_boxes) + vectors = [ + pymupdf.Rect(b["bbox"]) + for b in blocks + if b["bbox"][3] - b["bbox"][1] >= min_bbox_height + and b["bbox"] in joined_boxes + ] + # bring body into reading order + ordered = compute_reading_order( + body_boxes, + joined_boxes, + vectors, + vertical_gap=this_vertical_gap, + ) + 
else: + ordered = [] + # Final full boxes list. We do simple sorts for non-body boxes. final = ( sorted(page_headers, key=lambda r: (r[1], r[0])) @@ -633,7 +683,7 @@ def outside_cell(bbox, cell): bbox = pymupdf.Rect(char["bbox"]) if abs(bbox & cell) > 0.5 * abs(bbox): span_text += this_char - elif this_char in white_spaces: + elif this_char in WHITE_CHARS: span_text += " " if not span_text: diff --git a/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py b/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py index b178d996..e2217498 100644 --- a/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py +++ b/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py @@ -21,9 +21,7 @@ class PDFMarkdownReader(BaseReader): def __init__( self, - meta_filter: Optional[ - Callable[[Dict[str, Any]], Dict[str, Any]] - ] = None, + meta_filter: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, ): self.meta_filter = meta_filter @@ -79,15 +77,14 @@ def _process_doc_page( **load_kwargs: Any, ): """Processes a single page of a PDF document.""" - extra_info = self._process_doc_meta( - doc, file_path, page_number, extra_info - ) + extra_info = self._process_doc_meta(doc, file_path, page_number, extra_info) if self.meta_filter: extra_info = self.meta_filter(extra_info) text = to_markdown( - doc, pages=[page_number], + doc, + pages=[page_number], hdr_info=hdr_info, **load_kwargs, ) diff --git a/pymupdf4llm/pymupdf4llm/versions_file.py b/pymupdf4llm/pymupdf4llm/versions_file.py index afbe9821..f6a11125 100644 --- a/pymupdf4llm/pymupdf4llm/versions_file.py +++ b/pymupdf4llm/pymupdf4llm/versions_file.py @@ -1,3 +1,3 @@ # Generated file - do not edit. 
MINIMUM_PYMUPDF_VERSION = (1, 26, 6) -VERSION = '0.2.0' +VERSION = '0.2.1' diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py index 5c7cd2c6..ca3ca28a 100644 --- a/pymupdf4llm/setup.py +++ b/pymupdf4llm/setup.py @@ -14,7 +14,7 @@ "Topic :: Utilities", ] -version = "0.2.0" +version = "0.2.1" requires = ["pymupdf>=1.26.6", "tabulate"] text = requires[0].split("=")[1]