pymupdf · JorjMcKie · Aug 31, 2024 · Aug 31, 2024
diff --git a/docs/src/changes.rst b/docs/src/changes.rst
@@ -4,6 +4,22 @@
 Change Log
 ===========================================================================
 
+Changes in version 0.0.13
+--------------------------
+
+Fixes:
+~~~~~~~
+
+* `112 <https://github.com/pymupdf/RAG/issues/112>`_ "Invalid bandwriter header dimensions/setup"
+
+
+Improvements:
+~~~~~~~~~~~~~~
+* New parameter ``ignore_code`` suppresses special formatting of text in mono-spaced fonts.
+* New parameter ``extract_words`` enforces ``page_chunks=True`` and adds a "words" list to each page dictionary.
+
+
+
 Changes in version 0.0.11
 --------------------------
 

diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py
@@ -1,6 +1,6 @@
 from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
 
-__version__ = "0.0.12"
+__version__ = "0.0.13"
 version = __version__
 version_tuple = tuple(map(int, version.split(".")))
 

diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
@@ -15,10 +15,7 @@
 import string
 import sys
 
-try:
-    import pymupdf as fitz  # available with v1.24.3
-except ImportError:
-    import fitz
+import pymupdf
 
 WHITE = set(string.whitespace)
 
@@ -96,13 +93,13 @@ def sanitize_spans(line):
     blocks = [
         b
         for b in textpage.extractDICT()["blocks"]
-        if b["type"] == 0 and not fitz.Rect(b["bbox"]).is_empty
+        if b["type"] == 0 and not pymupdf.Rect(b["bbox"]).is_empty
     ]
     spans = []  # all spans in TextPage here
     for bno, b in enumerate(blocks):  # the numbered blocks
         for lno, line in enumerate(b["lines"]):  # the numbered lines
             for sno, s in enumerate(line["spans"]):  # the numered spans
-                sbbox = fitz.Rect(s["bbox"])  # span bbox as a Rect
+                sbbox = pymupdf.Rect(s["bbox"])  # span bbox as a Rect
                 mpoint = (sbbox.tl + sbbox.br) / 2  # middle point
                 if mpoint not in clip:
                     continue
@@ -165,16 +162,16 @@ def get_text_lines(page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr
         cases of text replaced by way of redaction annotations.
 
     Args:
-        page: (fitz.Page)
+        page: (pymupdf.Page)
         textpage: (TextPage) if None a temporary one is created.
         clip: (rect-like) only consider spans inside this area
         sep: (str) use this string when joining multiple MuPDF lines.
     Returns:
         String of plain text in reading sequence.
     """
-    textflags = fitz.TEXT_MEDIABOX_CLIP
+    textflags = pymupdf.TEXT_MEDIABOX_CLIP
     page.remove_rotation()
-    prect = page.rect if not clip else fitz.Rect(clip)  # area to consider
+    prect = page.rect if not clip else pymupdf.Rect(clip)  # area to consider
 
     xsep = sep if sep == "|" else ""
 
@@ -255,7 +252,7 @@ def get_text_lines(page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr
     import pathlib
 
     filename = sys.argv[1]
-    doc = fitz.open(filename)
+    doc = pymupdf.open(filename)
     text = ""
     for page in doc:
         text += get_text_lines(page, sep=" ") + "\n" + chr(12) + "\n"

diff --git a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py
@@ -43,7 +43,7 @@
   # for each page execute
   bboxes = column_boxes(page, footer_margin=50, no_image_text=True)
 
-  bboxes is a list of fitz.IRect objects, that are sorted ascending by their
+  bboxes is a list of pymupdf.IRect objects that are sorted ascending by their
   y0, then x0 coordinates. Their text content can be extracted by all PyMuPDF
   get_text() variants, like for instance the following:
   for rect in bboxes:
@@ -62,10 +62,7 @@
 
 import string
 
-try:
-    import pymupdf as fitz
-except ImportError:
-    import fitz
+import pymupdf
 
 
 def column_boxes(
@@ -103,7 +100,7 @@ def is_white(text):
         paths = page.get_drawings()
 
     if textpage is None:
-        textpage = page.get_textpage(clip=clip, flags=fitz.TEXTFLAGS_TEXT)
+        textpage = page.get_textpage(clip=clip, flags=pymupdf.TEXTFLAGS_TEXT)
 
     bboxes = []
 
@@ -151,44 +148,6 @@ def can_extend(temp, bb, bboxlist, vert_bboxes):
 
         return True
 
-    # def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
-    #     """Extend a bbox to the right page border.
-
-    #     Whenever there is no text to the right of a bbox, enlarge it up
-    #     to the right page border.
-
-    #     Args:
-    #         bboxes: (list[IRect]) bboxes to check
-    #         width: (int) page width
-    #         path_bboxes: (list[IRect]) bboxes with a background color
-    #         vert_bboxes: (list[IRect]) bboxes with vertical text
-    #         img_bboxes: (list[IRect]) bboxes of images
-    #     Returns:
-    #         Potentially modified bboxes.
-    #     """
-    #     for i, bb in enumerate(bboxes):
-    #         # do not extend text with background color
-    #         if in_bbox(bb, path_bboxes):
-    #             continue
-
-    #         # do not extend text in images
-    #         if in_bbox(bb, img_bboxes):
-    #             continue
-
-    #         # temp extends bb to the right page border
-    #         temp = +bb
-    #         temp.x1 = width
-
-    #         # do not cut through colored background or images
-    #         if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes):
-    #             continue
-
-    #         # also, do not intersect other text bboxes
-    #         check = can_extend(temp, bb, bboxes, vert_bboxes)
-    #         if check:
-    #             bboxes[i] = temp  # replace with enlarged bbox
-
-    #     return [b for b in bboxes if b != None]
 
     def join_rects_phase1(bboxes):
         """Postprocess identified text blocks, phase 1.
@@ -336,7 +295,7 @@ def clean_nblocks(nblocks):
 
     # Make block rectangles, ignoring non-horizontal text
     for b in blocks:
-        bbox = fitz.IRect(b["bbox"])  # bbox of the block
+        bbox = pymupdf.IRect(b["bbox"])  # bbox of the block
 
         # ignore text written upon images
         if no_image_text and in_bbox(bbox, img_bboxes):
@@ -352,9 +311,9 @@ def clean_nblocks(nblocks):
             vert_bboxes.append(bbox)
             continue
 
-        srect = fitz.EMPTY_IRECT()
+        srect = pymupdf.EMPTY_IRECT()
         for line in b["lines"]:
-            lbbox = fitz.IRect(line["bbox"])
+            lbbox = pymupdf.IRect(line["bbox"])
             text = "".join([s["text"].strip() for s in line["spans"]])
             if len(text) > 1:
                 srect |= lbbox
@@ -435,7 +394,7 @@ def clean_nblocks(nblocks):
     """
     import sys
 
-    RED = fitz.pdfcolor["red"]
+    RED = pymupdf.pdfcolor["red"]
     # get the file name
     filename = sys.argv[1]
 
@@ -452,7 +411,7 @@ def clean_nblocks(nblocks):
         header_margin = 50
 
     # open document
-    doc = fitz.open(filename)
+    doc = pymupdf.open(filename)
 
     # iterate over the pages
     for page in doc: