From 9df602b98a3477d7e49b77449f1321baefbd1cb9 Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Tue, 23 Jul 2024 08:47:29 -0400 Subject: [PATCH] Some fixes * Extend the list of known bullet point Unicodes * Fix typo for detecting a "quad" drawing --- docs/src/changes.rst | 15 +++++++++++++ pymupdf4llm/README.md | 8 +++---- .../pymupdf4llm/helpers/get_text_lines.py | 21 ++++++++++--------- .../pymupdf4llm/helpers/pymupdf_rag.py | 8 +++---- 4 files changed, 34 insertions(+), 18 deletions(-) diff --git a/docs/src/changes.rst b/docs/src/changes.rst index 1fb9436a..17091a46 100644 --- a/docs/src/changes.rst +++ b/docs/src/changes.rst @@ -4,6 +4,21 @@ Change Log =========================================================================== +Changes in version 0.0.11 +-------------------------- + +Fixes: +~~~~~~~ + +* `90 `_ "'Quad' object has no attribute 'tl'" +* `88 `_ "Bug in is_significant function" + + +Improvements: +~~~~~~~~~~~~~~ +* Extended the list of known bullet point characters. + + Changes in version 0.0.10 -------------------------- diff --git a/pymupdf4llm/README.md b/pymupdf4llm/README.md index 823424e5..724ce422 100644 --- a/pymupdf4llm/README.md +++ b/pymupdf4llm/README.md @@ -33,15 +33,15 @@ pathlib.Path("output.md").write_bytes(md_text.encode()) Instead of the filename string as above, one can also provide a PyMuPDF `Document`. By default, all pages in the PDF will be processed. If desired, the parameter `pages=[...]` can be used to provide a list of zero-based page numbers to consider. -**New features as of v0.0.2:** +**Feature Overview:** * Support for pages with **_multiple text columns_**. * Support for **_image and vector graphics extraction_**: 1. Specify `pymupdf4llm.to_markdown("input.pdf", write_images=True)`. Default is `False`. - 2. Each image or vector graphic on the page will be extracted and stored as a PNG image named `"input.pdf-pno-index.png"` in the folder of `"input.pdf"`. 
Where `pno` is the 0-based page number and `index` is some sequence number. - 3. The image files will have width and height equal to the values on the page. - 4. Any text contained in the images or graphics will not be extracted, but become visible as image parts. + 2. Each image or vector graphic on the page will be extracted and stored as an image named `"input.pdf-pno-index.extension"` in a folder of your choice. The image `extension` can be chosen to represent a PyMuPDF-supported image format (for instance "png" or "jpg"), `pno` is the 0-based page number and `index` is some sequence number. + 3. The image files will have width and height equal to the values on the page. The desired resolution can be chosen via parameter `dpi` (default: `dpi=150`). + 4. Any text contained in the images or graphics will be extracted and **also become visible as part of the generated image**. This behavior can be changed via `force_text=False` (text only appears as part of the image). * Support for **page chunks**: Instead of returning one large string for the whole document, a list of dictionaries can be generated: one for each page. Specify `data = pymupdf4llm.to_markdown("input.pdf", page_chunks=True)`. Then, for instance the first item, `data[0]` will contain a dictionary for the first page with the text and some metadata. 
diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py index 6b972113..333f77ca 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py +++ b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py @@ -69,7 +69,9 @@ def sanitize_spans(line): Returns: A list of sorted, and potentially cleaned-up spans """ - line.sort(key=lambda s: s["bbox"].x0) # sort left to right + # sort ascending horizontally + line.sort(key=lambda s: s["bbox"].x0) + # join spans, delete duplicates for i in range(len(line) - 1, 0, -1): # iterate back to front s0 = line[i - 1] s1 = line[i] @@ -78,13 +80,17 @@ def sanitize_spans(line): delta = s1["size"] * 0.1 if s0["bbox"].x1 + delta < s1["bbox"].x0: continue # all good: no joining neded + + # We need to join bbox and text of two consecutive spans + # On occasion, spans may also be duplicated. + if s0["text"] != s1["text"] or s0["bbox"] != s1["bbox"]: + s0["text"] += s1["text"] s0["bbox"] |= s1["bbox"] # join boundary boxes - s0["text"] += s1["text"] # join the text del line[i] # delete the joined-in span line[i - 1] = s0 # update the span return line - if clip is None: # use TextPage if not provided + if clip is None: # use TextPage rect if not provided clip = textpage.rect # extract text blocks - if bbox is not empty blocks = [ @@ -126,10 +132,7 @@ def sanitize_spans(line): sbbox = s["bbox"] # this bbox sbbox0 = line[-1]["bbox"] # previous bbox # if any of top or bottom coordinates are close enough, join... 
- if ( - abs(sbbox.y1 - sbbox0.y1) <= y_delta - or abs(sbbox.y0 - sbbox0.y0) <= y_delta - ): + if abs(sbbox.y1 - sbbox0.y1) <= y_delta or abs(sbbox.y0 - sbbox0.y0) <= y_delta: line.append(s) # append to this line lrect |= sbbox # extend line rectangle continue @@ -150,9 +153,7 @@ def sanitize_spans(line): return nlines -def get_text_lines( - page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr=False -): +def get_text_lines(page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr=False): """Extract text by line keeping natural reading sequence. Notes: diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index d60d0f7b..05413d86 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -40,15 +40,15 @@ if fitz.pymupdf_version_tuple < (1, 24, 2): raise NotImplementedError("PyMuPDF version 1.24.2 or later is needed.") -bullet = ( +bullet = [ "- ", "* ", chr(0xF0A7), chr(0xF0B7), chr(0xB7), chr(8226), - chr(9679), -) +] + list(map(chr, range(9642, 9680))) + GRAPHICS_TEXT = "\n![](%s)\n" @@ -193,7 +193,7 @@ def is_significant(box, paths): for itm in p["items"]: if itm[0] in ("l", "c"): # line or curve points.extend(itm[1:]) # append all the points - elif itm[0] == "q": # quad + elif itm[0] == "qu": # quad q = itm[1] # follow corners anti-clockwise points.extend([q.ul, q.ll, q.lr, q.ur, q.ul])