From 8ea59e6d67dd5d937720535099010da9b94b5e39 Mon Sep 17 00:00:00 2001
From: "Jorj X. McKie" <jorj.x.mckie@outlook.de>
Date: Mon, 3 Nov 2025 05:51:39 -0400
Subject: [PATCH] Version 0.1.7

more updates
---
 pdf4llm/setup.py                              |   9 +-
 pymupdf4llm/pymupdf4llm/__init__.py           | 124 ++-
 pymupdf4llm/pymupdf4llm/helpers/check_ocr.py  | 204 +++++
 .../pymupdf4llm/helpers/document_layout.py    | 818 ++++++++++++++++++
 .../pymupdf4llm/helpers/get_text_lines.py     |  26 +-
 .../pymupdf4llm/helpers/pymupdf_rag.py        |  30 +-
 pymupdf4llm/pymupdf4llm/helpers/utils.py      | 637 ++++++++++++++
 pymupdf4llm/pymupdf4llm/versions_file.py      |   4 +-
 pymupdf4llm/setup.py                          |   6 +-
 9 files changed, 1833 insertions(+), 25 deletions(-)
 create mode 100644 pymupdf4llm/pymupdf4llm/helpers/check_ocr.py
 create mode 100644 pymupdf4llm/pymupdf4llm/helpers/document_layout.py
 create mode 100644 pymupdf4llm/pymupdf4llm/helpers/utils.py

diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py
index 8edcc3a3..45638cc9 100644
--- a/pdf4llm/setup.py
+++ b/pdf4llm/setup.py
@@ -6,6 +6,8 @@
 with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
     readme = f.read()
 
+version = "0.1.7"
+
 classifiers = [
     "Development Status :: 5 - Production/Stable",
     "Environment :: Console",
@@ -13,11 +15,12 @@
     "Programming Language :: Python :: 3",
     "Topic :: Utilities",
 ]
-requires = ["pymupdf4llm==0.0.28"]
+
+requires = [f"pymupdf4llm=={version}"]
 
 setuptools.setup(
     name="pdf4llm",
-    version="0.0.28",
+    version=version,
     author="Artifex",
     author_email="support@artifex.com",
     description="PyMuPDF Utilities for LLM/RAG",
@@ -25,7 +28,7 @@
     long_description=readme,
     long_description_content_type="text/markdown",
     install_requires=requires,
-    python_requires=">=3.9",
+    python_requires=">=3.10",
     license="Dual Licensed - GNU AFFERO GPL 3.0 or Artifex Commercial License",
     url="https://github.com/pymupdf/RAG",
     classifiers=classifiers,
diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py
index 50055ece..4b41313a 100644
--- a/pymupdf4llm/pymupdf4llm/__init__.py
+++ b/pymupdf4llm/pymupdf4llm/__init__.py
@@ -1,14 +1,132 @@
-import pymupdf
-from .helpers.pymupdf_rag import IdentifyHeaders, TocHeaders, to_markdown
+try:
+    import pymupdf.layout
+except ImportError:
+    import pymupdf
+
 from .versions_file import MINIMUM_PYMUPDF_VERSION, VERSION
 
 if tuple(map(int, pymupdf.__version__.split("."))) < MINIMUM_PYMUPDF_VERSION:
-    raise ImportError(f"Requires PyMuPDF v. {MINIMUM_PYMUPDF_VERSION}, but you have {pymupdf.__version__}")
+    raise ImportError(
+        f"Requires PyMuPDF v. {MINIMUM_PYMUPDF_VERSION}, but you have {pymupdf.__version__}"
+    )
 
 __version__ = VERSION
 version = VERSION
 version_tuple = tuple(map(int, version.split(".")))
 
+if not callable(pymupdf._get_layout):
+    from .helpers.pymupdf_rag import IdentifyHeaders, TocHeaders, to_markdown
+
+    pymupdf._warn_layout_once()  # recommend pymupdf_layout
+
+else:
+    from .helpers import document_layout as DL
+
+    def parse_document(
+        doc,
+        filename="",
+        image_dpi=150,
+        image_format="png",
+        image_path="",
+        pages=None,
+    ):
+        return DL.parse_document(
+            doc,
+            filename=filename,
+            image_dpi=image_dpi,
+            image_format=image_format,
+            image_path=image_path,
+            pages=pages,
+        )
+
+    def to_markdown(
+        doc,
+        *,
+        header=True,
+        footer=True,
+        pages=None,
+        hdr_info=None,
+        write_images=False,
+        embed_images=False,
+        ignore_images=False,
+        ignore_graphics=False,
+        detect_bg_color=True,
+        image_path="",
+        image_format="png",
+        image_size_limit=0.05,
+        filename="",
+        force_text=True,
+        page_chunks=False,
+        page_separators=False,
+        margins=0,
+        dpi=150,
+        page_width=612,
+        page_height=None,
+        table_strategy="lines_strict",
+        graphics_limit=None,
+        fontsize_limit=3,
+        ignore_code=False,
+        extract_words=False,
+        show_progress=False,
+        use_glyphs=False,
+        ignore_alpha=False,
+    ):
+        parsed_doc = parse_document(
+            doc,
+            filename=filename,
+            image_dpi=dpi,
+            image_format=image_format,
+            image_path=image_path,
+            pages=pages,
+        )
+        return parsed_doc.to_markdown(
+            header=header,
+            footer=footer,
+            write_images=write_images,
+            embed_images=embed_images,
+            ignore_code=ignore_code,
+        )
+
+    def to_json(
+        doc,
+        header=True,
+        footer=True,
+        image_dpi=150,
+        image_format="png",
+        image_path="",
+        pages=None,
+    ):
+        parsed_doc = parse_document(
+            doc,
+            image_dpi=image_dpi,
+            image_format=image_format,
+            image_path=image_path,
+            pages=pages,
+        )
+        return parsed_doc.to_json()
+
+    def to_text(
+        doc,
+        filename="",
+        header=True,
+        footer=True,
+        pages=None,
+        ignore_code=False,
+    ):
+        parsed_doc = parse_document(
+            doc,
+            filename=filename,
+            image_dpi=150,
+            image_format="png",
+            image_path="",
+            pages=pages,
+        )
+        return parsed_doc.to_text(
+            header=header,
+            footer=footer,
+            ignore_code=ignore_code,
+        )
+
 
 def LlamaMarkdownReader(*args, **kwargs):
     from .llama import pdf_markdown_reader
diff --git a/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py b/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py
new file mode 100644
index 00000000..f9ad27f8
--- /dev/null
+++ b/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py
@@ -0,0 +1,204 @@
+import pymupdf  # PyMuPDF
+import numpy as np
+import cv2
+
+
+WHITE_CHARS = set(
+    [chr(i) for i in range(33)]
+    + [
+        "\u00a0",  # Non-breaking space
+        "\u2000",  # En quad
+        "\u2001",  # Em quad
+        "\u2002",  # En space
+        "\u2003",  # Em space
+        "\u2004",  # Three-per-em space
+        "\u2005",  # Four-per-em space
+        "\u2006",  # Six-per-em space
+        "\u2007",  # Figure space
+        "\u2008",  # Punctuation space
+        "\u2009",  # Thin space
+        "\u200a",  # Hair space
+        "\u202f",  # Narrow no-break space
+        "\u205f",  # Medium mathematical space
+        "\u3000",  # Ideographic space
+    ]
+)
+
+
+def detect_qr_codes(img):
+    """Detect and decode a QR code in ``img``; return data and bbox points, or None."""
+    text, corners, _ = cv2.QRCodeDetector().detectAndDecode(img)
+    if corners is None or not text:
+        # nothing detected, or detected but not decodable
+        return None
+    quad = corners[0].astype(int)
+    return {"data": text, "bbox": quad.tolist()}
+
+
+def detect_barcodes(img):
+    """Decode all barcodes found in ``img`` using pyzbar.
+    Returns a list of dicts with the keys "type", "data" (UTF-8 decoded)
+    and "bbox" (list of (x, y) tuples of the barcode polygon).
+    Raises ImportError if pyzbar cannot be imported.
+    """
+    try:
+        from pyzbar.pyzbar import decode as barcode_decode
+    except ImportError:
+        raise ImportError("pyzbar is required for barcode detection")
+    return [
+        {
+            "type": found.type,
+            "data": found.data.decode("utf-8"),
+            "bbox": [(corner.x, corner.y) for corner in found.polygon],
+        }
+        for found in barcode_decode(img)
+    ]
+
+
+def get_page_image(page, dpi=150):
+    """Render the page; return (grayscale array, pixmap-to-page matrix, pixmap)."""
+    pix = page.get_pixmap(dpi=dpi)
+    shape = (pix.height, pix.width, pix.n)
+    rgb = np.frombuffer(pix.samples, dtype=np.uint8).reshape(shape)
+    # matrix transforms the pixmap rectangle to the page rectangle
+    matrix = pymupdf.Rect(pix.irect).torect(page.rect)
+    return cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY), matrix, pix
+
+
+def detect_lines(img, min_length=50, max_gap=10, matrix=pymupdf.Identity):
+    """Detect straight line segments; return a list of (point1, point2) tuples.
+
+    Points are pixel coordinates multiplied by ``matrix``. Returns an empty
+    list when the Hough transform finds nothing (it then returns None).
+    """
+    edges = cv2.Canny(img, 50, 150, apertureSize=3)
+    found = cv2.HoughLinesP(
+        edges, 1, np.pi / 180, threshold=100,
+        minLineLength=min_length, maxLineGap=max_gap,
+    )
+    if found is None:  # no segments detected
+        return []
+    lines = []
+    for x0, y0, x1, y1 in found.reshape(-1, 4):
+        p0, p1 = pymupdf.Point(x0, y0) * matrix, pymupdf.Point(x1, y1) * matrix
+        lines.append((p0, p1))
+    return lines
+
+
+def detect_curves(img, matrix=pymupdf.Identity):
+    """Fit an ellipse to every contour of more than 5 points in ``img``.
+
+    The image is binarized (inverted, threshold 180) before contour search.
+    ``matrix`` is accepted but not used: results stay in pixel coordinates.
+    """
+    _, binary = cv2.threshold(img, 180, 255, cv2.THRESH_BINARY_INV)
+    outlines, _ = cv2.findContours(binary, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)
+    # cv2.fitEllipse needs at least 5 points
+    fitted = [cv2.fitEllipse(outline) for outline in outlines if len(outline) > 5]
+    return fitted
+
+
+def detect_rectangles(img, min_area=1000, matrix=pymupdf.Identity):
+    """Return bounding Rects (transformed by ``matrix``) of 4-cornered contours."""
+    _, thresh = cv2.threshold(img, 180, 255, cv2.THRESH_BINARY_INV)
+    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+
+    rectangles = []
+    for cnt in contours:
+        approx = cv2.approxPolyDP(cnt, 0.02 * cv2.arcLength(cnt, True), True)
+        if len(approx) == 4 and cv2.contourArea(cnt) > min_area:
+            x, y, w, h = cv2.boundingRect(approx)  # approx is (4, 1, 2): not a Rect
+            rectangles.append(pymupdf.Rect(x, y, x + w, y + h) * matrix)
+    return rectangles
+
+
+def should_ocr_page(
+    page,
+    dpi=150,
+    edge_thresh=0.015,
+    vector_thresh=500,
+    image_coverage_thresh=0.9,
+    text_readability_thresh=0.9,
+):
+    """
+    Decide whether a PyMuPDF page should be OCR'd.
+
+    Parameters:
+        page: PyMuPDF page object
+        dpi: DPI used for rasterization
+        edge_thresh: edge density (share of Canny edge pixels) above which OCR is suggested
+        vector_thresh: minimum number of vector paths to suggest glyph simulation
+        image_coverage_thresh: fraction of page area covered by images to trigger OCR
+        text_readability_thresh: minimum share of characters other than U+FFFD for "readable_text"
+
+    Returns:
+        dict with "should_ocr", diagnostic flags, and the rendered "pixmap", "image", "transform"
+    """
+    decision = {
+        "should_ocr": False,
+        "has_ocr_text": False,
+        "has_text": False,
+        "readable_text": False,
+        "image_covers_page": False,
+        "has_vector_drawings": False,
+        "transform": pymupdf.Identity,
+        "pixmap": None,
+        "image": None,
+        "edge_density": 0.0,
+        "vector_count": 0,
+    }
+    page_rect = page.rect
+    page_area = abs(page_rect)  # size of the full page
+    # Check for text: any character outside WHITE_CHARS counts as text
+    text = page.get_text(flags=0)
+    decision["has_text"] = not WHITE_CHARS.issuperset(text)
+    if decision["has_text"]:
+        not_readable_count = len([c for c in text if c == chr(0xFFFD)])
+        readability = 1 - not_readable_count / len(text)
+        decision["readable_text"] = readability >= text_readability_thresh
+    # bboxlog entries of type "ignore-text" are treated as text of a previous OCR
+    all_text_bboxes = [b for b in page.get_bboxlog() if "text" in b[0]]
+    ocr_text_bboxes = [b for b in all_text_bboxes if b[0] == "ignore-text"]
+    decision["has_ocr_text"] = bool(ocr_text_bboxes)
+    # Check image coverage: area of the joined bbox of all images (clipped to the page)
+    image_rects=[page_rect&img["bbox"] for img in page.get_image_info()]
+    image_rect=pymupdf.EMPTY_RECT()
+    for r in image_rects:
+        image_rect|=r
+    image_area=abs(image_rect)
+    if image_area:
+        images_cover = image_area / page_area
+    else:
+        images_cover = 0.0
+    decision["image_covers_page"] = images_cover >= image_coverage_thresh
+
+    # Count vector drawings that are wider or higher than 3 units
+    drawings = [
+        p for p in page.get_drawings() if p["rect"].width > 3 or p["rect"].height > 3
+    ]
+    decision["vector_count"] = len(drawings)
+    decision["has_vector_drawings"] = len(drawings) >= vector_thresh
+
+    # Rasterize and analyze edge density
+    img, matrix, pix = get_page_image(page, dpi=dpi)
+    decision["transform"] = matrix
+    decision["pixmap"] = pix
+    decision["image"] = img
+    edges = cv2.Canny(img, 100, 200)
+    decision["edge_density"] = np.sum(edges > 0) / edges.size
+
+    # Final decision: OCR only pages without text that show images, drawings or edges
+    if (
+        1  # neutral element: lets every condition start with "and"
+        and not decision["has_text"]
+        and not decision["readable_text"]
+        and (
+            0  # neutral element: lets every alternative start with "or"
+            or decision["image_covers_page"]
+            or decision["has_vector_drawings"]
+            or decision["edge_density"] > edge_thresh
+        )
+    ):
+        decision["should_ocr"] = True
+    # NOTE(review): "readable_text" is only set if "has_text", so it never changes the result
+    return decision
diff --git a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py
new file mode 100644
index 00000000..9074981c
--- /dev/null
+++ b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py
@@ -0,0 +1,818 @@
+import base64
+import json
+import os
+from binascii import b2a_base64
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Union
+from pathlib import Path
+import pymupdf
+import tabulate
+from pymupdf4llm.helpers.get_text_lines import get_raw_lines
+from pymupdf4llm.helpers import utils, check_ocr
+
+try:
+    import cv2
+except ImportError:
+    cv2 = None
+
+pymupdf.TOOLS.unset_quad_corrections(True)
+
+GRAPHICS_TEXT = "\n![](%s)\n"
+CHECK_OCR_TEXT = {"ignore-text"}
+OCR_FONTNAME = "GlyphLessFont"  # if encountered do not use "code" style
+FLAGS = (
+    0
+    | pymupdf.TEXT_COLLECT_STYLES
+    | pymupdf.TEXT_COLLECT_VECTORS
+    | pymupdf.TEXT_PRESERVE_IMAGES
+    | pymupdf.TEXT_ACCURATE_BBOXES
+    | pymupdf.TEXT_MEDIABOX_CLIP
+)
+
+
+def omit_if_pua_char(text):
+    """Return "" for a single Private Use Area (PUA) character, else text unchanged."""
+    if len(text) != 1:  # empty or multi-char strings: ord() would raise on ""
+        return text
+    o = ord(text)
+    if (
+        (0xE000 <= o <= 0xF8FF)
+        or (0xF0000 <= o <= 0xFFFFD)
+        or (0x100000 <= o <= 0x10FFFD)
+    ):
+        return ""
+    return text
+
+
+def create_list_item_levels(layout_info):
+    """Create a dictionary that maps the box number of each list-item to
+    its hierarchy level.
+
+    Args:
+        layout_info (list): the bbox list "page.layout_information"
+
+    Returns:
+        dict: {bbox sequence number: level} where level is 1 for top-level.
+    """
+    segments = []  # list of item segments
+    segment = []  # current segment
+
+    # Create segments of contiguous list items. Each non-list-item ends the
+    # current segment. So does a list-item that starts right of the previous
+    # item's right edge or ends above its top (taken as another text column).
+    for i, item in enumerate(layout_info):
+        if item.boxclass != "list-item":  # bbox class is not list-item
+            if segment:  # end and save the current segment
+                segments.append(segment)
+                segment = []
+            continue
+        if segment:  # check if we need to end the current segment
+            _, prev_item = segment[-1]
+            if item.x0 > prev_item.x1 or item.y1 < prev_item.y0:
+                # end and save the current segment
+                segments.append(segment)
+                segment = []
+        segment.append((i, item))  # append item to segment
+    if segment:
+        segments.append(segment)  # append last segment
+
+    item_dict = {}  # dictionary: item index -> level
+    if not segments:  # no list items found
+        return item_dict
+
+    # walk through segments and assign levels
+    for i, s in enumerate(segments):
+        if not s:
+            continue
+        s.sort(key=lambda x: x[1].x0)  # sort by x0 coordinate of the bbox
+        # NOTE: levels are assigned in this x0-sorted order, not in reading order
+        # list of leveled items in the segment: (idx, bbox, level)
+        # first item has level 1
+        leveled_items = [(s[0][0], s[0][1], 1)]
+        for idx, bbox in s[1:]:
+            prev_idx, prev_bbox, prev_lvl = leveled_items[-1]
+            # x0 coordinate increased by more than 10 points: increase level
+            if bbox.x0 > prev_bbox.x0 + 10:
+                curr_lvl = prev_lvl + 1
+                leveled_items.append((idx, bbox, curr_lvl))
+            else:
+                leveled_items.append((idx, bbox, prev_lvl))
+        for idx, bbox, lvl in leveled_items:
+            item_dict[idx] = lvl
+    return item_dict
+
+
+def is_monospaced(textlines):
+    """Tell whether every line consists of mono-spaced, non-OCR spans only.
+
+    A span counts as mono-spaced if flag value 8 is set in its "flags" and
+    its font is not OCR_FONTNAME. Empty input yields True.
+    """
+
+    def span_is_mono(span):
+        return bool(span["flags"] & 8 and span["font"] != OCR_FONTNAME)
+
+    # True exactly when no line contains a span failing the test
+    return all(span_is_mono(s) for line in textlines for s in line["spans"])
+
+
+def is_superscripted(line):
+    """Tell whether the line starts with a superscript.
+
+    True if the first span has flag value 1 set, or if it has both a
+    smaller y-origin and a smaller font size than the second span.
+    """
+    spans = line["spans"]
+    if not spans:
+        return False
+    if spans[0]["flags"] & 1 or len(spans) < 2:
+        return bool(spans[0]["flags"] & 1)
+    # no superscript flag: compare position and size of the first two spans
+    first, second = spans[:2]
+    return first["origin"][1] < second["origin"][1] and first["size"] < second["size"]
+
+
+def get_plain_text(spans):
+    """Output text without any markdown or other styling.
+    Parameter is a list of span dictionaries. The spans may come from
+    one or more original "textlines" items.
+    Returns the stripped span texts, each followed by one space.
+    A superscript in the first span is wrapped in brackets; later ones are
+    attached to the preceding text. A word-final hyphen is dropped.
+    """
+    result = ""
+    for idx, span in enumerate(spans):
+        piece = span["text"].strip()  # remove leading/trailing spaces
+        if span["flags"] & 1:  # superscript
+            if idx == 0:
+                piece = f"[{piece}] "
+            elif result.endswith(" "):
+                result = result[:-1]
+        # resolve hyphenation: drop "- " if it ends a token longer than 2 chars
+        if result.endswith("- ") and len(result.split()[-1]) > 2:
+            result = result[:-2]
+        result += piece + " "
+    return result
+
+
+def list_item_to_text(textlines, level):
+    """
+    Convert "list-item" bboxes to text.
+    The span lists of the input textlines are not modified.
+    """
+    indent = "   " * (level - 1)
+    output = indent
+    line = textlines[0]
+    x0 = line["bbox"][0]  # left of first line
+    spans = list(line["spans"])  # copy: pop / extend must not touch the input
+    span0_text = spans[0]["text"].strip() if spans else ""
+
+    # drop a leading bullet given as a single Private Use Area character
+    if span0_text and not omit_if_pua_char(span0_text):
+        spans.pop(0)
+        if spans:
+            x0 = spans[0]["bbox"][0]
+
+    for line in textlines[1:]:
+        this_x0 = line["bbox"][0]
+        if this_x0 < x0 - 2:  # starts left of the previous line: next item
+            output += get_plain_text(spans)
+            output = output.rstrip() + f"\n\n{indent}"
+            spans = list(line["spans"])
+            bullet = spans[0]["text"].strip() if spans else ""
+            if bullet and not omit_if_pua_char(bullet):
+                spans.pop(0)
+        else:
+            spans.extend(line["spans"])
+        x0 = this_x0  # store this left coordinate
+    output += get_plain_text(spans)
+
+    return output.rstrip() + "\n\n"
+
+
+def footnote_to_text(textlines):
+    """
+    Convert "footnote" bboxes to text. The input textlines are not modified.
+    """
+    output = "> "
+    line = textlines[0]
+    spans = list(line["spans"])  # copy: extend() below must not alter the input
+
+    for line in textlines[1:]:
+        if is_superscripted(line):  # a new footnote starts here
+            line_output = get_plain_text(spans)
+            output += line_output
+            output = output.rstrip() + "\n\n> "
+            spans = list(line["spans"])
+        else:
+            spans.extend(line["spans"])
+    line_output = get_plain_text(spans)
+    output += line_output
+
+    return output.rstrip() + "\n\n"
+
+
+def code_block_to_text(textlines):
+    """Output a code block in plain text format.
+
+    Span texts of a line are concatenated unchanged; trailing whitespace
+    of each line is removed. Two extra newlines terminate the block.
+    """
+    rows = []
+    for line in textlines:
+        rows.append("".join(s["text"] for s in line["spans"]).rstrip() + "\n")
+    rows.append("\n\n")
+    return "".join(rows)
+
+
+def text_to_text(textlines, ignore_code: bool = False):
+    """
+    Convert "text" bboxes to plain text, as well as other boxclasses
+    not specifically handled elsewhere.
+    The line text is written without line breaks. At the end,
+    two newlines are added to separate from the next block.
+    A box starting with a superscript is output as footnote, an entirely
+    mono-spaced box as code block (unless ignore_code is true).
+    """
+    if not textlines:
+        return ""
+    if is_superscripted(textlines[0]):  # check for superscript
+        return footnote_to_text(textlines)
+    if not ignore_code and is_monospaced(textlines):
+        return code_block_to_text(textlines)
+
+    # flatten the spans of all lines into one list
+    all_spans = [span for line in textlines for span in line["spans"]]
+    assert all(isinstance(span, dict) for span in all_spans)
+    plain = get_plain_text(all_spans)
+    return plain + "\n\n"
+
+
+def get_styled_text(spans):
+    """Output text with markdown style codes based on font properties.
+    Parameter is a list of span dictionaries. The spans may come from
+    one or more original "textlines" items.
+    Returns the text string and the style suffix of the last span.
+    The text string always ends with the suffix and a space
+    """
+    output = ""
+    old_line = 0
+    old_block = 0
+    suffix = ""
+    for s in spans:
+        # decode font properties
+        prefix = ""
+        superscript = s["flags"] & 1
+        mono = s["flags"] & 8 and s["font"] != OCR_FONTNAME
+        bold = s["flags"] & 16 or s["char_flags"] & 8
+        italic = s["flags"] & 2
+        strikeout = s["char_flags"] & 1
+
+        # compute styling prefix and suffix
+        if mono:
+            prefix = "`" + prefix
+        if bold:
+            prefix = "**" + prefix
+        if italic:
+            prefix = "_" + prefix
+        if strikeout:
+            prefix = "~~" + prefix
+
+        suffix = "".join(reversed(prefix))  # reverse of prefix
+
+        span_text = s["text"].strip()  # remove leading/trailing spaces
+        # convert intersecting link to markdown syntax
+        # ltext = resolve_links(parms.links, s)
+        ltext = ""  # TODO: implement link resolution
+        if ltext:  # (was using the undefined name "hdr_string")
+            text = f"{prefix}{ltext}{suffix} "
+        else:
+            text = f"{prefix}{span_text}{suffix} "
+
+        # Extend output string taking care of styles staying the same.
+        if output.endswith(f"{suffix} "):
+            output = output[: -len(suffix) - 1]
+            # resolve hyphenation if old_block and old_line are not the same
+            if (
+                1
+                and (old_block, old_line) != (s["block"], s["line"])
+                and output.endswith("-")
+                and len(output.split()[-1]) > 2
+            ):
+                output = output[:-1]
+                text = span_text + suffix + " "
+            elif superscript:
+                text = span_text + suffix + " "
+            else:
+                text = " " + span_text + suffix + " "
+
+        old_line = s["line"]
+        old_block = s["block"]
+        output += text
+    return output, suffix
+
+
+def list_item_to_md(textlines, level):
+    """
+    Convert "list-item" bboxes to markdown.
+    The first line is prefixed with "- ". Subsequent lines are appended
+    without line break if their rectangle does not start to the left
+    of the previous line.
+    Otherwise, a linebreak and "- " are added to the output string.
+    2 units of tolerance is used to avoid spurious line breaks.
+    The span lists of the input textlines are not modified.
+    This post-layout heuristics helps cover cases where more than
+    one list item is contained in a single bbox.
+    """
+    indent = "   " * (level - 1)
+    line = textlines[0]
+    x0 = line["bbox"][0]  # left of first line
+    spans = list(line["spans"])  # copy: pop / extend must not touch the input
+    span0_text = spans[0]["text"].strip() if spans else ""
+
+    starter = "- "
+    if span0_text.endswith(".") and span0_text[:-1].isdigit():
+        starter = "1. "
+
+    if span0_text and not omit_if_pua_char(span0_text):  # drop PUA bullet
+        spans.pop(0)
+        if spans:
+            x0 = spans[0]["bbox"][0]
+
+    output = indent + starter
+    for line in textlines[1:]:
+        this_x0 = line["bbox"][0]
+        if this_x0 < x0 - 2:
+            line_output, suffix = get_styled_text(spans)
+            output += line_output + f"\n\n{indent}{starter}"
+            spans = list(line["spans"])
+            bullet = spans[0]["text"].strip() if spans else ""
+            if bullet and not omit_if_pua_char(bullet):
+                spans.pop(0)
+        else:
+            spans.extend(line["spans"])
+        x0 = this_x0  # store this left coordinate
+    line_output, suffix = get_styled_text(spans)
+    output += line_output
+
+    return output + "\n\n"
+
+
+def footnote_to_md(textlines):
+    """
+    Convert "footnote" bboxes to markdown.
+    The first line is prefixed with "> ". Subsequent lines are appended
+    without line break if they do not start with a superscript.
+    Otherwise, a linebreak and "> " are added to the output string.
+    The span lists of the input textlines are not modified.
+    This post-layout heuristics helps cover cases where more than
+    one footnote is contained in a single bbox.
+    """
+    line = textlines[0]
+    spans = list(line["spans"])  # copy: extend() below must not alter the input
+    output = "> "
+    for line in textlines[1:]:
+        if is_superscripted(line):
+            line_output, suffix = get_styled_text(spans)
+            output += line_output + "\n\n> "
+            spans = list(line["spans"])
+        else:
+            spans.extend(line["spans"])
+    line_output, suffix = get_styled_text(spans)
+    output += line_output
+
+    return output + "\n\n"
+
+
+def section_hdr_to_md(textlines):
+    """
+    Convert "section-header" bboxes to markdown.
+    This is treated as a level 2 header (##).
+    The line text itself is handled like normal text.
+    """
+    # flatten the spans of all lines into one list
+    header_spans = [span for line in textlines for span in line["spans"]]
+    assert all(isinstance(span, dict) for span in header_spans)
+    # the style suffix returned as second item is not needed here
+    styled, _ = get_styled_text(header_spans)
+    # level 2 header, followed by an empty line
+    return "## " + styled + "\n\n"
+
+
+def title_to_md(textlines):
+    """
+    Convert "title" bboxes to markdown.
+    This is treated as a level 1 header (#).
+    The line text itself is handled like normal text.
+    """
+    # flatten the spans of all lines into one list
+    title_spans = [span for line in textlines for span in line["spans"]]
+    assert all(isinstance(span, dict) for span in title_spans)
+    # the style suffix returned as second item is not needed here
+    styled, _ = get_styled_text(title_spans)
+    # level 1 header, followed by an empty line
+    return "# " + styled + "\n\n"
+
+
+def code_block_to_md(textlines):
+    """Output a code block in markdown format.
+
+    Each line's span texts are concatenated unchanged and right-stripped.
+    The lines are wrapped in a code fence followed by an empty line.
+    """
+    fenced = ["```\n"]
+    for line in textlines:
+        fenced.append("".join(s["text"] for s in line["spans"]).rstrip() + "\n")
+    fenced.append("```\n\n")
+    return "".join(fenced)
+
+
+def text_to_md(textlines, ignore_code: bool = False):
+    """
+    Convert "text" bboxes to markdown, as well as other boxclasses
+    not specifically handled elsewhere.
+    The line text is written without line breaks. At the end,
+    two newlines are added to separate from the next block.
+    A box starting with a superscript is output as footnote, an entirely
+    mono-spaced box as code block (unless ignore_code is true).
+    """
+    if not textlines:
+        return ""
+    if is_superscripted(textlines[0]):
+        return footnote_to_md(textlines)
+    if not ignore_code and is_monospaced(textlines):
+        return code_block_to_md(textlines)
+
+    # flatten the spans of all lines into one list
+    all_spans = [span for line in textlines for span in line["spans"]]
+    assert all(isinstance(span, dict) for span in all_spans)
+    # the style suffix returned as second item is not needed here
+    styled, _ = get_styled_text(all_spans)
+    return styled + "\n\n"
+
+
+@dataclass
+class LayoutBox:
+    x0: float  # x0, y0, x1, y1: box rectangle, used as pymupdf.IRect(x0, y0, x1, y1)
+    y0: float
+    x1: float
+    y1: float
+    boxclass: str  # e.g. 'text', 'picture', 'table', etc.
+    # NOTE: the fields below all default to None, i.e. the attributes always exist.
+    # if boxclass == 'picture' or 'formula', store image bytes
+    image: Optional[bytes] = None
+    # if boxclass == 'table': dict; the serializers in this file read its
+    # items "markdown" (to_markdown) and "extract" (to_text)
+    table: Optional[Dict] = None
+    # text line information for text-type boxclasses: list of dicts, each
+    # with a "spans" list (list items also need a "bbox")
+    textlines: Optional[List[Dict]] = None
+
+
+@dataclass
+class PageLayout:
+    page_number: int  # also used in the names of image files written by to_markdown()
+    width: float
+    height: float
+    boxes: List[LayoutBox]  # layout boxes; serialized in list order
+    ocrpage: bool = False  # whether the page is an OCR page (then no code block detection)
+    fulltext: Optional[List[Dict]] = None  # full page text in extractDICT format
+    words: Optional[List[Dict]] = None  # list of words with bbox
+    links: Optional[List[Dict]] = None
+
+
+@dataclass
+class ParsedDocument:
+    filename: Optional[str] = None  # source file name
+    page_count: int = None
+    toc: Optional[List[List]] = None  # e.g. [{'title': 'Intro', 'page': 1}]
+    pages: List[PageLayout] = None
+    metadata: Optional[Dict] = None
+    from_bytes: bool = False  # whether loaded from bytes
+    image_dpi: int = 150  # image resolution
+    image_format: str = "png"  # 'png' or 'jpg'
+    image_path: str = ""  # path to save images
+    use_ocr: bool = True  # whether to invoke OCR if beneficial
+
+    def to_markdown(
+        self,
+        header: bool = True,
+        footer: bool = True,
+        write_images: bool = False,
+        embed_images: bool = False,
+        ignore_code: bool = False,
+    ) -> str:
+        """
+        Serialize ParsedDocument to markdown text.
+
+        header / footer: if false, "page-header" / "page-footer" boxes are omitted.
+        write_images: store pictures and formulas as files in self.image_path
+            and reference them; else embed_images: include them base64-encoded.
+        ignore_code: do not output mono-spaced text boxes as code blocks.
+        """
+        output = ""
+        for page in self.pages:
+            # make mapping: box number to list item level
+            list_item_levels = create_list_item_levels(page.boxes)
+            for i, box in enumerate(page.boxes):
+                clip = pymupdf.IRect(box.x0, box.y0, box.x1, box.y1)
+                btype = box.boxclass
+                if btype == "page-header" and not header:
+                    continue
+                if btype == "page-footer" and not footer:
+                    continue
+                if btype in ("picture", "formula") and box.image:
+                    if write_images:
+                        filename = os.path.basename(self.filename or "").replace(" ", "-")
+                        image_filename = os.path.join(
+                            self.image_path,
+                            f"{filename}-{page.page_number:04d}-{i:02d}.{self.image_format}",
+                        )
+                        Path(image_filename).write_bytes(box.image)
+                        # reference the file that was actually written
+                        output += GRAPHICS_TEXT % image_filename
+                    elif embed_images:
+                        # make a base64 encoded string of the image
+                        data = b2a_base64(box.image).decode()
+                        data = f"data:image/{self.image_format};base64," + data
+                        output += GRAPHICS_TEXT % data + "\n\n"
+
+                    else:
+                        output += f"**==> {btype} [{clip.width} x {clip.height}] intentionally omitted <==**\n\n"
+                    continue
+                if btype == "table":
+                    output += box.table["markdown"] + "\n\n"
+                    continue
+                if not box.textlines:
+                    # hasattr() was always true for a dataclass field; test the value
+                    if box.textlines is None and btype not in ("picture", "formula"):
+                        print(f"Warning: box {btype} has no textlines")
+                    continue
+                if btype == "title":
+                    output += title_to_md(box.textlines)
+                elif btype == "section-header":
+                    output += section_hdr_to_md(box.textlines)
+                elif btype == "list-item":
+                    output += list_item_to_md(box.textlines, list_item_levels[i])
+                elif btype == "footnote":
+                    output += footnote_to_md(box.textlines)
+                else:  # treat as normal MD text
+                    output += text_to_md(
+                        box.textlines, ignore_code=ignore_code or page.ocrpage
+                    )
+
+        return output
+
+    def to_json(self) -> str:
+        """Serialize the ParsedDocument to a JSON string (indent=1)."""
+        class LayoutEncoder(json.JSONEncoder):
+            def default(self, s):
+                if isinstance(s, (bytes, bytearray)):
+                    return base64.b64encode(s).decode()
+                if isinstance(
+                    s,
+                    (
+                        pymupdf.Rect,
+                        pymupdf.Point,
+                        pymupdf.Matrix,
+                        pymupdf.IRect,
+                        pymupdf.Quad,
+                    ),
+                ):
+                    return list(s)
+                if hasattr(s, "__dict__"):
+                    return s.__dict__
+                return super().default(s)  # raises TypeError for unsupported types
+
+        js = json.dumps(self, cls=LayoutEncoder, indent=1)
+        return js
+
+    def to_text(
+        self,
+        header: bool = True,
+        footer: bool = True,
+        ignore_code: bool = False,
+    ) -> str:
+        """
+        Serialize ParsedDocument to plain text. Optionally omit page headers or footers.
+        """
+        # Flatten all text boxes into plain text
+        output = ""
+        for page in self.pages:
+            list_item_levels = create_list_item_levels(page.boxes)
+            for i, box in enumerate(page.boxes):
+                clip = pymupdf.IRect(box.x0, box.y0, box.x1, box.y1)
+                btype = box.boxclass
+                if btype == "page-header" and header is False:
+                    continue
+                if btype == "page-footer" and footer is False:
+                    continue
+                if btype in ("picture", "formula"):
+                    output += f"==> {btype} [{clip.width} x {clip.height}] <==\n\n"
+                    continue
+                if btype == "table":
+                    output += (
+                        tabulate.tabulate(box.table["extract"], tablefmt="grid")
+                        + "\n\n"
+                    )
+                    continue
+                if btype == "list-item":
+                    output += list_item_to_text(box.textlines, list_item_levels[i])
+                    continue
+                if btype == "footnote":
+                    output += footnote_to_text(box.textlines)
+                    continue
+                output += text_to_text(
+                    box.textlines, ignore_code=ignore_code or page.ocrpage
+                )
+                continue
+        return output
+
+
+def parse_document(
+    doc,
+    filename="",
+    image_dpi=150,
+    image_format="png",
+    image_path="",
+    pages=None,
+) -> ParsedDocument:
+    """Parse 'doc' (a pymupdf.Document or something pymupdf.open() accepts) into
+    a ParsedDocument. 'pages' selects 0-based pages: None (all), an int or a
+    sequence of ints. Pages that seem to need OCR are OCR'd if possible."""
+    mydoc = doc if isinstance(doc, pymupdf.Document) else pymupdf.open(doc)
+    document = ParsedDocument()
+    document.filename = mydoc.name if mydoc.name else filename
+    document.toc = mydoc.get_toc(simple=True)
+    document.page_count = mydoc.page_count
+    document.metadata = mydoc.metadata
+    document.image_dpi = image_dpi
+    document.image_format = image_format
+    document.image_path = image_path
+    document.pages = []
+    reason = "OpenCV not installed"
+    try:  # no asserts here: they would be skipped under "python -O"
+        document.use_ocr = cv2 is not None
+        if document.use_ocr:
+            reason = "Tesseract language data not found"
+            document.use_ocr = bool(pymupdf.get_tessdata())
+    except Exception:
+        document.use_ocr = False
+    if not document.use_ocr:
+        print(f"{reason}. Disabling OCR.")
+    if pages is None:
+        page_filter = range(mydoc.page_count)
+    elif isinstance(pages, int):
+        # a negative int counts from the end of the document
+        while pages < 0:
+            pages += mydoc.page_count
+        page_filter = [pages]
+    elif not hasattr(pages, "__getitem__"):
+        raise ValueError("'pages' parameter must be an int, or a sequence of ints")
+    else:
+        page_filter = sorted(set(pages))
+    # all() also holds for an empty selection, which simply yields no pages
+    if not all(isinstance(p, int) and p < mydoc.page_count for p in page_filter):
+        raise ValueError(
+            "'pages' parameter must be None, int, or a sequence of ints less than page count"
+        )
+    for pno in page_filter:
+        page = mydoc.load_page(pno)
+
+        # check if this page should be OCR'd
+        if document.use_ocr:
+            decision = check_ocr.should_ocr_page(page, dpi=600)
+        else:
+            decision = {"should_ocr": False}
+        if decision["should_ocr"]:
+            print(f"Performing OCR on {page.number=}[{page.number+1}]...")
+            pix = decision["pixmap"]  # retrieve the Pixmap
+            pdf_data = pix.pdfocr_tobytes()  # OCR it
+            ocr_pdf = pymupdf.open("pdf", pdf_data)  # get the OCR'd PDF
+            ocrpage = ocr_pdf[0]  # this is its OCR'd page
+            # remove everything except the text
+            ocrpage.add_redact_annot(ocrpage.rect)
+            ocrpage.apply_redactions(
+                images=pymupdf.PDF_REDACT_IMAGE_REMOVE,
+                graphics=pymupdf.PDF_REDACT_LINE_ART_NONE,
+                text=pymupdf.PDF_REDACT_TEXT_NONE,
+            )
+            # copy text over to original page
+            page.show_pdf_page(page.rect, ocr_pdf, 0)
+            ocr_pdf.close()  # discard temporary OCR PDF
+            del ocr_pdf
+
+        bboxlog = page.get_bboxlog()
+        ocrpage = decision["should_ocr"] or (
+            set([b[0] for b in bboxlog if b[0] == "ignore-text"]) == CHECK_OCR_TEXT
+        )
+        textpage = page.get_textpage(flags=FLAGS)
+        blocks = textpage.extractDICT()["blocks"]
+        page.get_layout()
+        utils.clean_pictures(page, blocks)
+        utils.add_image_orphans(page, blocks)
+        utils.clean_tables(page, blocks)
+        page.layout_information = utils.find_reading_order(page.layout_information)
+
+        # identify vector graphics to help find tables
+        all_lines, all_boxes = utils.complete_table_structure(page)
+        tbf = page.find_tables(
+            strategy="lines_strict", add_lines=all_lines, add_boxes=all_boxes
+        )
+        fulltext = [b for b in blocks if b["type"] == 0]
+        words = [
+            {
+                "bbox": pymupdf.Rect(w[:4]),
+                "text": w[4],
+                "block_n": w[5],
+                "line_n": w[6],
+                "word_n": w[7],
+            }
+            for w in textpage.extractWORDS()
+        ]
+        links = page.get_links()
+        pagelayout = PageLayout(
+            page_number=page.number + 1,
+            width=page.rect.width,
+            height=page.rect.height,
+            boxes=[],
+            ocrpage=ocrpage,
+            fulltext=fulltext,
+            words=words,
+            links=links,
+        )
+        for box in page.layout_information:
+            layoutbox = LayoutBox(*box)
+            clip = pymupdf.Rect(box[:4])
+
+            if layoutbox.boxclass in ("picture", "formula"):
+                pix = page.get_pixmap(clip=clip, dpi=document.image_dpi)
+                layoutbox.image = pix.tobytes(document.image_format)
+
+            elif layoutbox.boxclass == "table":
+                # This is either a table detected by native TableFinder or by
+                # MuPDF's table structure recognition (which may fail).
+                # If the structure was not detected, we output an image.
+                # A table is represented as a dict with bbox, row_count,
+                # col_count, cells, extract (2D list of cell texts), and the
+                # markdown string.
+
+                try:  # guard against table structure detection failure
+                    table = [
+                        tab
+                        for tab in tbf.tables
+                        if pymupdf.table._iou(tab.bbox, clip) > 0.6
+                    ][0]
+                    cells = [[c for c in row.cells] for row in table.rows]
+
+                    if table.header.external:  # if the header is outside the table
+                        cells.insert(0, table.header.cells)  # insert a row
+                        table.row_count += 1  # increase row count
+
+                    layoutbox.table = {
+                        "bbox": list(table.bbox),
+                        "row_count": table.row_count,
+                        "col_count": table.col_count,
+                        "cells": cells,
+                        "extract": table.extract(),
+                    }
+                    layoutbox.table["markdown"] = utils.table_to_markdown(
+                        textpage, layoutbox, markdown=True
+                    )
+                except Exception as e:
+                    print(f"table detection error '{e}'")
+                    # table structure not detected: treat like an image
+                    pix = page.get_pixmap(clip=clip, dpi=document.image_dpi)
+                    layoutbox.image = pix.tobytes(document.image_format)
+                    layoutbox.boxclass = "picture"
+            else:
+                # Handle text-like box classes:
+                # Extract text line information within the box.
+                # Each line is represented as its bbox and a list of spans.
+                layoutbox.textlines = [
+                    {"bbox": l[0], "spans": l[1]}
+                    for l in get_raw_lines(
+                        textpage=None,
+                        blocks=pagelayout.fulltext,
+                        clip=clip,
+                        ignore_invisible=not ocrpage,
+                    )
+                ]
+            pagelayout.boxes.append(layoutbox)
+        document.pages.append(pagelayout)
+    if mydoc is not doc:  # close only a document that was opened here
+        mydoc.close()
+    return document
+
+
+if __name__ == "__main__":
+    # Example usage: convert the file named on the command line to Markdown
+    import sys
+    from pathlib import Path
+
+    filename = sys.argv[1]
+    pdoc = parse_document(filename)
+    # Path(filename).with_suffix(".json").write_text(pdoc.to_json())
+    # Path(filename).with_suffix(".txt").write_text(pdoc.to_text(footer=False))
+    md = pdoc.to_markdown(write_images=True, header=False, footer=False)
+    Path(filename).with_suffix(".md").write_text(md, encoding="utf-8")
diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
index 0769dd00..e1a02da9 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
@@ -26,7 +26,8 @@ def is_white(text):
 
 
 def get_raw_lines(
-    textpage,
+    textpage=None,
+    blocks=None,
     clip=None,
     tolerance=3,
     ignore_invisible=True,
@@ -44,7 +45,10 @@ def get_raw_lines(
     formats like Markdown or JSON.
 
     Args:
-        textpage: (mandatory) TextPage object
+        textpage: TextPage object. Can be None if blocks are given.
+        blocks: (list) if given, use these blocks instead of extracting them
+              from the TextPage. This allows re-using blocks extracted
+              by the caller.
         clip: (Rect) specifies a sub-rectangle of the textpage rect (which in
               turn may be based on a sub-rectangle of the full page).
         tolerance: (float) put spans on the same line if their top or bottom
@@ -71,7 +75,7 @@ def sanitize_spans(line):
         left to right.
 
         Arg:
-            A list of spans - as drived from TextPage.extractDICT()
+            A list of spans - as derived from TextPage.extractDICT()
         Returns:
             A list of sorted, and potentially cleaned-up spans
         """
@@ -101,14 +105,18 @@ def sanitize_spans(line):
             line[i - 1] = s0  # update the span
         return line
 
+    if not isinstance(textpage, pymupdf.TextPage) and blocks is None:
+        raise ValueError("Either textpage or blocks must be provided.")
+
     if clip is None:  # use TextPage rect if not provided
         clip = textpage.rect
     # extract text blocks - if bbox is not empty
-    blocks = [
-        b
-        for b in textpage.extractDICT()["blocks"]
-        if b["type"] == 0 and not pymupdf.Rect(b["bbox"]).is_empty
-    ]
+    if blocks is None:
+        blocks = [
+            b
+            for b in textpage.extractDICT()["blocks"]
+            if b["type"] == 0 and not pymupdf.Rect(b["bbox"]).is_empty
+        ]
     spans = []  # all spans in TextPage here
     for bno, b in enumerate(blocks):  # the numbered blocks
         for lno, line in enumerate(b["lines"]):  # the numbered lines
@@ -127,7 +135,7 @@ def sanitize_spans(line):
                     continue
                 if abs(sbbox & clip) < abs(sbbox) * 0.8:  # if not in clip
                     continue
-                if s["flags"] & 1 == 1:  # if a superscript, modify bbox
+                if s["flags"] & 1:  # if a superscript, modify bbox
                     # with that of the preceding or following span
                     i = 1 if sno == 0 else sno - 1
                     if len(line["spans"]) > i:
diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
index ad6ca0c1..f0c76e6b 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
@@ -38,13 +38,14 @@
 import os
 import string
 from binascii import b2a_base64
+from collections import defaultdict
+from dataclasses import dataclass
+
 import pymupdf
 from pymupdf import mupdf
 from pymupdf4llm.helpers.get_text_lines import get_raw_lines, is_white
 from pymupdf4llm.helpers.multi_column import column_boxes
 from pymupdf4llm.helpers.progress import ProgressBar
-from dataclasses import dataclass
-from collections import defaultdict
 
 pymupdf.TOOLS.unset_quad_corrections(True)
 
@@ -572,7 +573,7 @@ def write_text(
                     if i in parms.written_images:
                         continue
                     r = parms.img_rects[i]
-                    if r.y1 <= lrect.y0 and (
+                    if max(r.y0, lrect.y0) < min(r.y1, lrect.y1) and (
                         0
                         or lrect.x0 <= r.x0 < lrect.x1
                         or lrect.x0 < r.x1 <= lrect.x1
@@ -1024,6 +1025,9 @@ def get_page_output(
         graphics_count = len([b for b in page.get_bboxlog() if "path" in b[0]])
         if GRAPHICS_LIMIT and graphics_count > GRAPHICS_LIMIT:
             IGNORE_GRAPHICS = True
+            too_many_graphics = True
+        else:
+            too_many_graphics = False
 
         # Locate all tables on page
         parms.written_tables = []  # stores already written tables
@@ -1075,7 +1079,7 @@ def get_page_output(
         else:
             paths = []
         # catch too-many-graphics situation
-        if GRAPHICS_LIMIT and len(paths) > GRAPHICS_LIMIT:
+        if IGNORE_GRAPHICS:
             paths = []
 
         # We also ignore vector graphics that only represent
@@ -1101,7 +1105,17 @@ def get_page_output(
         parms.vg_clusters0 = refine_boxes(vg_clusters0)
 
         parms.vg_clusters = dict((i, r) for i, r in enumerate(parms.vg_clusters0))
+        block_count = len(parms.textpage.extractBLOCKS())
+        if block_count > 0:
+            char_density = len(parms.textpage.extractTEXT()) / block_count
+        else:
+            char_density = 0
         # identify text bboxes on page, avoiding tables, images and graphics
+        if too_many_graphics and char_density < 20:
+            # This page has too many isolated text pieces for meaningful
+            # layout analysis. Treat whole page as one text block.
+            text_rects = [parms.clip]
+        else:
         text_rects = column_boxes(
             parms.page,
             paths=parms.actual_paths,
@@ -1200,7 +1214,13 @@ def get_page_output(
         pages = ProgressBar(pages)
     for pno in pages:
         parms = get_page_output(
-            doc, pno, margins, textflags, FILENAME, IGNORE_IMAGES, IGNORE_GRAPHICS
+            doc,
+            pno,
+            margins,
+            textflags,
+            FILENAME,
+            IGNORE_IMAGES,
+            IGNORE_GRAPHICS,
         )
         if page_chunks is False:
             document_output += parms.md_string
diff --git a/pymupdf4llm/pymupdf4llm/helpers/utils.py b/pymupdf4llm/pymupdf4llm/helpers/utils.py
new file mode 100644
index 00000000..f9de20e0
--- /dev/null
+++ b/pymupdf4llm/pymupdf4llm/helpers/utils.py
@@ -0,0 +1,637 @@
+import pymupdf
+
+white_spaces = {chr(i) for i in (*range(33), 0xA0, 0x2002, 0x2003, 0x2009, 0x202F)}  # chars
+
+
+def table_cleaner(page, blocks, tbbox):
+    """Clean the table bbox 'tbbox'.
+
+    'blocks' is the TextPage.extractDICT()["blocks"] list.
+
+    This function must be used AFTER clean_pictures() so we know that tbbox
+    is complete in terms of includable vectors.
+
+    We check whether the table bbox contains non-rect ("tilted") vectors
+    and determine which part of tbbox they cover. If this is too large, we
+    re-classify tbbox as a picture.
+    Else we check whether the tilted vectors only cover some upper part of the
+    result. In that case we separate the top part as a picture and keep
+    the remaining area as a table.
+    """
+    bbox = pymupdf.Rect(tbbox[:4])
+
+    # All vectors inside tbbox. Checking for the top-left corner is enough.
+    all_vectors = [
+        (pymupdf.IRect(b["bbox"]), b["isrect"])
+        for b in blocks
+        if b["type"] == 3 and b["bbox"][:2] in bbox
+    ]
+    tilt_vectors = [v for v in all_vectors if not v[1]]
+    # Early exit if no tilted vectors
+    if not tilt_vectors:
+        return None, None
+
+    y0 = min([b[0].y0 for b in tilt_vectors])
+    y1 = max([b[0].y1 for b in tilt_vectors])
+    x0 = min([b[0].x0 for b in tilt_vectors])
+    x1 = max([b[0].x1 for b in tilt_vectors])
+
+    # Rectangle containing all non-rectangle vectors inside the table bbox
+    tilted = pymupdf.Rect(x0, y0, x1, y1)
+
+    # if it covers most of the table bbox, we convert to picture
+    if tilted.width >= bbox.width * 0.8 and tilted.height >= bbox.height * 0.8:
+        return tbbox[:4] + ["picture"], None
+
+    # Extract text spans. Needed for completing the potential picture area.
+    span_rects = [
+        s["bbox"]
+        for b in blocks
+        if b["type"] == 0
+        for l in b["lines"]
+        for s in l["spans"]
+        if s["bbox"] in bbox
+    ]
+
+    # Check if non-rect vectors cover some acceptable upper part of tbbox.
+    if (
+        1
+        and tilted.y1 - bbox.y0 <= bbox.height * 0.3  # at most 30% of tbbox height
+        and tilted.width >= bbox.width * 0.7  # at least 70% of tbbox width
+    ):
+        tilted.y1 += 2  # add some buffer at the bottom
+
+        # include any text that is part of the picture area
+        for r in span_rects:
+            if tilted.intersects(r):
+                tilted |= r
+
+        picture_box = [bbox.x0, bbox.y0, bbox.x1, tilted.y1, "picture"]
+        table_box = [bbox.x0, tilted.y1 + 1, bbox.x1, bbox.y1, "table"]
+        return picture_box, table_box
+    return None, None
+
+
+def clean_tables(page, blocks):
+    """Re-classify or split implausible "table" boxes in page.layout_information.
+
+    'blocks' is the TextPage.extractDICT()["blocks"] list. A table box with
+    fewer than 2 text rows or 2 text columns becomes "text"; the rest is
+    handed to table_cleaner(), which may turn it into a picture or split it.
+    """
+    i = 0
+    # index loop: a split inserts a box, so the list may grow while we iterate
+    while i < len(page.layout_information):
+        box = page.layout_information[i]
+        i += 1  # 'i' now is the index of the next box
+        if box[4] != "table":
+            continue
+        bbox = pymupdf.Rect(box[:4])  # the layout bbox as a Rect
+
+        # text lines in this bbox
+        lines = [
+            l for b in blocks if b["type"] == 0 for l in b["lines"] if l["bbox"] in bbox
+        ]
+
+        # distinct line bottoms: values within 3 units count as the same row
+        y_vals = []  # stays empty if the box contains no text line at all
+        for y in sorted(set(round(l["bbox"][3]) for l in lines)):
+            if not y_vals or y - y_vals[-1] > 3:
+                y_vals.append(y)
+        # our table minimum dimension, rows x cols, is 2 x 2
+        has_rows = len(y_vals) >= 2
+        has_cols = any(
+            len([l for l in lines if abs(y - l["bbox"][3]) <= 3]) >= 2 for y in y_vals
+        )
+        if not (has_rows and has_cols):  # too few text rows or columns
+            box[4] = "text"
+            continue
+
+        rc1, rc2 = table_cleaner(page, blocks, box)
+        if rc1 and rc2:  # split into picture (top) and table (bottom)
+            page.layout_information[i - 1 : i] = [rc1, rc2]
+            i += 1  # do not re-examine the table part
+        elif rc1:  # the whole box was re-classified
+            page.layout_information[i - 1] = rc1
+
+
+def clean_pictures(page, blocks):
+    """Extend picture / formula / table bboxes.
+
+    Join layout boxes with intersecting text, image, vectors.
+
+    'blocks' is the TextPage.extractDict()["blocks"] list.
+    """
+    # all layout boxes
+    all_bboxes = [pymupdf.Rect(b[:4]) for b in page.layout_information]
+
+    for i in range(len(all_bboxes)):
+        if page.layout_information[i][4] not in ("picture", "formula", "table"):
+            # no eligible layout box
+            continue
+
+        # get its Rect object
+        bbox = pymupdf.Rect(page.layout_information[i][:4])
+        for b in blocks:
+            if b["type"] not in (0, 1, 3):
+                continue
+            block_bbox = pymupdf.IRect(b["bbox"])
+            if b["type"] == 3 and block_bbox.is_empty:
+                block_bbox += (-1, -1, 1, 1)
+            if bbox.intersects(block_bbox) and not any(
+                bb.intersects(block_bbox) for j, bb in enumerate(all_bboxes) if j != i
+            ):
+                bbox |= block_bbox
+        page.layout_information[i] = list(bbox) + [page.layout_information[i][4]]
+
+
+def add_image_orphans(page, blocks):
+    """Add orphan images as layout boxes of class 'picture'.
+
+    'blocks' is the TextPage.extractDict()["blocks"] list.
+    """
+    # all layout boxes
+    all_bboxes = [pymupdf.Rect(b[:4]) for b in page.layout_information]
+    area_limit = abs(page.rect) * 0.9
+    images = []
+    for img in page.get_image_info():
+        r = page.rect & img["bbox"]
+        if r.is_empty or abs(r) >= area_limit:
+            continue
+        images.append(r)
+
+    paths = []
+    for b in blocks:
+        if b["type"] != 3:
+            continue
+        r = page.rect & b["bbox"]
+        if abs(r) >= area_limit:
+            continue
+        if r.width < 3 and r.height < 3:
+            continue
+        r_low_limit = 0.1 * abs(r)
+        r_hi_limit = 0.8 * abs(r)
+
+        # ignore vectors that significantly overlap layout bboxes
+        if any(abs(r & bb) > min(r_low_limit, abs(bb) * 0.1) for bb in all_bboxes):
+            continue
+        # ignore vectors that are mostly covered by images
+        if any(abs(r & i) > r_hi_limit for i in images):
+            continue
+        paths.append({"rect": r})
+
+    # make vector clusters, select only sufficiently large ones
+    vectors = page.cluster_drawings(drawings=paths, x_tolerance=20, y_tolerance=20)
+    vectors = [v for v in vectors if v.width > 30 and v.height > 30]
+
+    # resolve mutual containment of images and vectors
+    imgs = sorted(images + vectors, key=lambda r: abs(r), reverse=True)
+
+    filtered_imgs = []
+    for r in imgs:
+        if not any(r in fr for fr in filtered_imgs):
+            filtered_imgs.append(r)
+
+    for r in filtered_imgs:
+        # add picture orphans that do not significantly overlap layout boxes
+        if not any(abs(r & bbox) > 0.1 * min(abs(r), abs(bbox)) for bbox in all_bboxes):
+            page.layout_information.append(list(r) + ["picture"])
+            all_bboxes.append(r)
+    return
+
+
+"""
+Determine reading order of layout boxes on a document page.
+
+Layout boxes are defined as classified bounding boxes, with class info as
+provided by pymupdf_layout. Each box is a tuple (x0, y0, x1, y1, "class").
+
+The main function is "find_reading_order()".
+"""
+
+
+def cluster_stripes(boxes, vertical_gap: float = 12):
+    """
+    Divide page into horizontal stripes based on vertical gaps.
+
+    Args:
+        boxes (list): List of bounding boxes, each defined as (x0, y0, x1, y1).
+        vertical_gap (float): Minimum vertical gap to separate stripes.
+
+    Returns:
+        List of disjoint horizontal stripes. Each stripe is a list of boxes.
+    """
+    # Sort top to bottom
+    sorted_boxes = sorted(boxes, key=lambda b: b[1])
+    stripes = []
+    if not sorted_boxes:
+        return stripes
+    current_stripe = [sorted_boxes[0]]
+
+    for box in sorted_boxes[1:]:
+        prev_bottom = max(b[3] for b in current_stripe)
+        if box[1] - prev_bottom > vertical_gap:
+            stripes.append(current_stripe)
+            current_stripe = [box]
+        else:
+            current_stripe.append(box)
+
+    stripes.append(current_stripe)
+    return stripes
+
+
+def cluster_columns_in_stripe(stripe: list):
+    """
+    Within a stripe, group boxes into columns based on horizontal proximity.
+
+    Args:
+        stripe (list): List of boxes within a stripe.
+
+    Returns:
+        list: List of columns, each column is a list of boxes.
+    """
+    # Sort left to right
+    sorted_boxes = sorted(stripe, key=lambda b: b[0])
+    columns = []
+    current_column = [sorted_boxes[0]]
+
+    for box in sorted_boxes[1:]:
+        prev_right = max([b[2] for b in current_column])
+        if box[0] - prev_right >= -1:
+            columns.append(sorted(current_column, key=lambda b: b[3]))
+            current_column = [box]
+        else:
+            current_column.append(box)
+
+    columns.append(sorted(current_column, key=lambda b: b[3]))
+    return columns
+
+
+def compute_reading_order(boxes, vertical_gap: float = 12):
+    """
+    Compute reading order of boxes delivered by PyMuPDF-Layout.
+
+    Args:
+        boxes (list): List of bounding boxes.
+        vertical_gap (float): Minimum vertical gap to separate stripes.
+
+    Returns:
+        list: List of boxes in reading order.
+    """
+    # scale vertical_gap by the height of the union of all bboxes, relative to 800
+    temp = pymupdf.EMPTY_RECT()
+    for b in boxes:
+        temp |= pymupdf.Rect(b[:4])
+    this_vertical_gap = vertical_gap * temp.height / 800
+    stripes = cluster_stripes(boxes, vertical_gap=this_vertical_gap)
+    ordered = []
+    for stripe in stripes:
+        columns = cluster_columns_in_stripe(stripe)
+        for col in columns:
+            ordered.extend(col)
+    return ordered
+
+
+def find_reading_order(boxes, vertical_gap: float = 12) -> list:
+    """Given page layout information, return the boxes in reading order.
+
+    Args:
+        boxes: List of classified bounding boxes with class info as defined
+               by pymupdf_layout: (x0, y0, x1, y1, "class").
+        vertical_gap: Minimum vertical gap to separate stripes. The default
+                      value of 12 works well for most documents.
+
+    Returns:
+        List of boxes in reading order.
+    """
+
+    def is_contained(inner, outer) -> bool:
+        """Check if inner box is fully contained within outer box."""
+        return (
+            1
+            and outer[0] <= inner[0]
+            and outer[1] <= inner[1]
+            and outer[2] >= inner[2]
+            and outer[3] >= inner[3]
+            and inner != outer
+        )
+
+    def filter_contained(boxes) -> list:
+        """Remove boxes that are fully contained within another box."""
+        # Sort boxes by descending area
+        sorted_boxes = sorted(
+            boxes, key=lambda r: (r[2] - r[0]) * (r[3] - r[1]), reverse=True
+        )
+        result = []
+        for r in sorted_boxes:
+            if not any(is_contained(r, other) for other in result):
+                result.append(r)
+        return result
+
+    """
+    We expect being passed raw 'layout_information' as provided by
+    pymupdf_layout. We separate page headers and footers from the
+    body, bring body boxes into reading order and concatenate the final list.
+    """
+    filtered = filter_contained(boxes)  # remove nested boxes first
+    page_headers = []  # for page headers
+    page_footers = []  # for page footers
+    body_boxes = []  # for main body boxes
+
+    # separate boxes by type
+    for box in filtered:
+        x0, y0, x1, y1, bclass = box
+        if bclass == "page-header":
+            page_headers.append(box)
+        elif bclass == "page-footer":
+            page_footers.append(box)
+        else:
+            body_boxes.append(box)
+
+    # bring body into reading order
+    ordered = compute_reading_order(body_boxes, vertical_gap=vertical_gap)
+
+    # Final full boxes list. We do simple sorts for non-body boxes.
+    final = (
+        sorted(page_headers, key=lambda r: (r[1], r[0]))
+        + ordered
+        + sorted(page_footers, key=lambda r: (r[1], r[0]))
+    )
+    return final
+
+
+def simplify_vectors(vectors):
+    new_vectors = []
+    if not vectors:
+        return new_vectors
+    new_vectors = [vectors[0]]
+    for v in vectors[1:]:
+        last_v = new_vectors[-1]
+        if (
+            1
+            and abs(v["bbox"][1] - last_v["bbox"][1]) < 1
+            and abs(v["bbox"][3] - last_v["bbox"][3]) < 1
+            and v["bbox"][0] <= last_v["bbox"][2] + 1
+        ):
+            # merge horizontally
+            new_bbox = [
+                min(v["bbox"][0], last_v["bbox"][0]),
+                min(v["bbox"][1], last_v["bbox"][1]),
+                max(v["bbox"][2], last_v["bbox"][2]),
+                max(v["bbox"][3], last_v["bbox"][3]),
+            ]
+            last_v["bbox"] = new_bbox
+        else:
+            new_vectors.append(v)
+    return new_vectors
+
+
+def find_virtual_lines(page, table_bbox, words, vectors, link_rects):
+    """Return virtual lines for a given table bbox."""
+
+    def make_vertical(table_bbox, line_bbox, word_boxes):
+        # default top and bottom point of vertical line
+        top = line_bbox.tl - (2, 0)
+        bottom = pymupdf.Point(top.x, table_bbox.y1)
+
+        # check if this cuts through any word boxes below and adjust bottom y
+        my_wboxes = sorted(
+            [
+                wr
+                for wr in word_boxes
+                if wr.y0 >= top.y and wr.y1 <= bottom.y and wr.x0 < top.x < wr.x1
+            ],
+            key=lambda r: r.y1,
+        )
+        if my_wboxes:  # if so, adjust bottom y
+            bottom.y = my_wboxes[0].y0
+
+        # same check above
+        my_wboxes = sorted(
+            [
+                wr
+                for wr in word_boxes
+                if wr.y0 >= table_bbox.y0 and wr.y1 <= top.y and wr.x0 < top.x < wr.x1
+            ],
+            key=lambda r: r.y1,
+        )
+        if my_wboxes:  # if so, adjust top y
+            top.y = my_wboxes[-1].y1
+        else:  # else we can start at top of table
+            top.y = table_bbox.y0
+
+        # extender = [((table_bbox.x0, top.y), (table_bbox.x1, top.y)), (top, bottom)]
+        extender = [(top, bottom)]
+        return extender
+
+    word_boxes = sorted(
+        [
+            pymupdf.Rect(w[:4])
+            for w in words
+            if (w[3] - w[1]) > 5 and table_bbox.contains(w[:4])
+        ],
+        key=lambda r: r.y1,
+    )
+
+    all_lines = []
+    all_boxes = []
+    for v in vectors:
+        vbbox = pymupdf.Rect(v["bbox"]).normalize()
+        vbbox += (0, -0.5, 0, 0.5)  # expand vertically a bit
+        vbbox &= table_bbox
+        if vbbox.is_empty:
+            continue
+        if not v["stroked"] and vbbox.height >= 5 and vbbox.width > 20:
+            all_lines.append((vbbox.tl, vbbox.tr))
+            all_lines.append((vbbox.bl, vbbox.br))
+            continue
+        if (
+            vbbox.width > 20
+            and vbbox.height <= 3
+            and not any(vbbox.intersects(lr) for lr in link_rects)
+        ):  # horizontal line
+            lines = make_vertical(table_bbox, vbbox, word_boxes)
+            for line in lines:
+                all_lines.append(line)
+
+    return all_lines, all_boxes
+
+
+def complete_table_structure(page):
+    """Collect virtual lines and boxes for the page's "table" layout boxes.
+
+    Words, rectangular vector blocks and link rectangles are read from the
+    page once; find_virtual_lines() is then called for every entry of
+    page.layout_information whose last item is "table".
+
+    Returns:
+        Tuple (lines, boxes) of virtual lines and boxes for all table boxes.
+    """
+    flags = (
+        pymupdf.TEXT_ACCURATE_BBOXES
+        | pymupdf.TEXT_COLLECT_VECTORS
+        | pymupdf.TEXT_COLLECT_STYLES
+    )
+    textpage = page.get_textpage(flags=flags)
+    words = page.get_text("words", textpage=textpage)
+    # rectangle-like vector blocks, ordered by bottom edge, then left edge
+    rect_blocks = [
+        blk
+        for blk in textpage.extractDICT()["blocks"]
+        if blk["type"] == 3 and blk["isrect"]
+    ]
+    rect_blocks.sort(key=lambda blk: (blk["bbox"][3], blk["bbox"][0]))
+    vectors = simplify_vectors(rect_blocks)
+    link_rects = [link["from"] for link in page.get_links()]
+
+    virtual_lines, virtual_boxes = [], []
+    for item in page.layout_information:
+        if item[-1] == "table":
+            table_bbox = pymupdf.Rect(item[:4])
+            virtual_boxes.append(table_bbox)  # the table box is itself a box
+            lines, boxes = find_virtual_lines(
+                page, table_bbox, words, vectors, link_rects
+            )
+            virtual_lines.extend(lines)
+            virtual_boxes.extend(boxes)
+
+    return virtual_lines, virtual_boxes
+
+
+def extract_cells(textpage, cell, markdown=False):
+    """Extract text from a rect-like 'cell' as plain or MD styled text.
+
+    This function should ultimately be used to extract text from a table cell.
+    Markdown output will only work correctly if extraction flag bit
+    TEXT_COLLECT_STYLES is set. Text lines that do not touch the cell are
+    ignored, so they cannot add line breaks or blanks to the result.
+
+    Args:
+        textpage: A PyMuPDF TextPage object. Must have been created with
+            TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES.
+        cell: A tuple (x0, y0, x1, y1) defining the cell's bbox.
+        markdown: If True, return text formatted for Markdown.
+
+    Returns:
+        A string with the text extracted from the cell.
+    """
+    cx0, cy0, cx1, cy1 = cell[0], cell[1], cell[2], cell[3]
+    text = ""
+    for block in textpage.extractRAWDICT()["blocks"]:
+        if block["type"] != 0:
+            continue
+        for line in block["lines"]:
+            # skip lines outside the cell: they would only add stray separators
+            lx0, ly0, lx1, ly1 = line["bbox"]
+            if lx0 > cx1 or lx1 < cx0 or ly0 > cy1 or ly1 < cy0:
+                continue
+            if text:  # must be a new line in the cell
+                if text.endswith("$"):  # after "$" continue with a blank only
+                    text += " "
+                elif not text.endswith("$ "):
+                    text += "<br>" if markdown else "\n"
+
+            # strikeout detection only works with horizontal text
+            horizontal = line["dir"] == (0, 1) or line["dir"] == (1, 0)
+
+            for span in line["spans"]:
+                # only include chars with more than 50% bbox overlap
+                span_text = ""
+                for char in span["chars"]:
+                    this_char = char["c"]
+                    bbox = pymupdf.Rect(char["bbox"])
+                    if abs(bbox & cell) > 0.5 * abs(bbox):
+                        span_text += this_char
+                    elif this_char in white_spaces:
+                        span_text += " "
+
+                if not span_text:
+                    continue  # skip empty span
+
+                if not markdown:  # no MD styling
+                    text += span_text
+                    continue
+
+                prefix = suffix = ""
+                if horizontal and span["char_flags"] & pymupdf.mupdf.FZ_STEXT_STRIKEOUT:
+                    prefix += "~~"
+                    suffix = "~~" + suffix
+                if span["char_flags"] & pymupdf.mupdf.FZ_STEXT_BOLD:
+                    prefix += "**"
+                    suffix = "**" + suffix
+                if span["flags"] & pymupdf.TEXT_FONT_ITALIC:
+                    prefix += "_"
+                    suffix = "_" + suffix
+                if span["flags"] & pymupdf.TEXT_FONT_MONOSPACED:
+                    prefix += "`"
+                    suffix = "`" + suffix
+
+                if len(span["chars"]) > 2:
+                    span_text = span_text.rstrip()
+
+                # if span continues previous styling: extend cell text
+                if (ls := len(suffix)) and text.endswith(suffix):
+                    text = text[:-ls] + span_text + suffix
+                elif not span_text.strip():  # whitespace only: add no styling
+                    text += " "
+                else:  # append the span with new styling
+                    text += prefix + span_text + suffix
+
+    return text.strip()
+
+
+def table_to_markdown(textpage, table_item, markdown=True):
+    """Render one table as a GitHub-style Markdown table string.
+
+    The first row is used as header. Cells without a bbox (None) are rendered
+    empty, except in the header row, where they repeat the text of their left
+    neighbor. Newlines inside cell text are written as "<br>" because a raw
+    newline would end the table row.
+
+    Args:
+        textpage: TextPage handed to extract_cells() for reading cell text.
+        table_item: Object whose 'table' attribute is a dict with the keys
+            "row_count", "col_count" and "cells" (rows of cell bboxes or None).
+        markdown: Passed to extract_cells(); if True, cell text is MD styled.
+
+    Returns:
+        The Markdown table followed by an empty line, or an empty string if
+        the table has no rows.
+    """
+    table = table_item.table
+    row_count = table["row_count"]
+    col_count = table["col_count"]
+    cell_boxes = table["cells"]
+    if row_count < 1:  # nothing to render, and no header row to read
+        return ""
+
+    # NOTE(review): left-to-right / top-to-bottom "fill None cells" passes used
+    # to run here, before any text was extracted. Every cell was still None at
+    # that point, so they never changed anything and have been removed. If
+    # cells without a bbox should repeat a neighbor's text, such a fill must
+    # run after the extraction loop below.
+    cells = [[None for i in range(col_count)] for j in range(row_count)]
+    for i, row in enumerate(cell_boxes):
+        for j, cell in enumerate(row):
+            if cell is not None:
+                text = extract_cells(textpage, cell, markdown=markdown)
+                cells[i][j] = text.replace("\n", "<br>")
+
+    # header cells without text repeat their left neighbor (first one: empty)
+    header = cells[0]
+    for i, name in enumerate(header):
+        if name is None:
+            header[i] = header[i - 1] if i > 0 else ""
+
+    lines = ["|" + "|".join(header) + "|"]
+    # insert GitHub header line separator
+    lines.append("|" + "|".join("---" for i in range(col_count)) + "|")
+
+    # detail rows: every row after the header
+    for row in cells[1:]:
+        # replace None cells with empty string
+        texts = ["" if cell is None else cell for cell in row]
+        lines.append("|" + "".join(text + "|" for text in texts))
+
+    # every line ends with a newline; one empty line closes the table
+    return "\n".join(lines) + "\n\n"
diff --git a/pymupdf4llm/pymupdf4llm/versions_file.py b/pymupdf4llm/pymupdf4llm/versions_file.py
index 5d422f48..e4963f8a 100644
--- a/pymupdf4llm/pymupdf4llm/versions_file.py
+++ b/pymupdf4llm/pymupdf4llm/versions_file.py
@@ -1,3 +1,3 @@
 # Generated file - do not edit.
-MINIMUM_PYMUPDF_VERSION = (1, 26, 3)
-VERSION = '0.0.27'
+MINIMUM_PYMUPDF_VERSION = (1, 26, 6)
+VERSION = '0.1.7'
diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py
index 7051a8c7..7e435093 100644
--- a/pymupdf4llm/setup.py
+++ b/pymupdf4llm/setup.py
@@ -14,8 +14,8 @@
     "Topic :: Utilities",
 ]
 
-version = "0.0.28"
-requires = ["pymupdf>=1.26.3"]
+version = "0.1.7"
+requires = ["pymupdf>=1.26.6", "tabulate"]
 
 text = requires[0].split("=")[1]
 text = tuple(map(int, text.split(".")))
@@ -32,7 +32,7 @@
     long_description=readme,
     long_description_content_type="text/markdown",
     install_requires=requires,
-    python_requires=">=3.9",
+    python_requires=">=3.10",
     license="Dual Licensed - GNU AFFERO GPL 3.0 or Artifex Commercial License",
     url="https://github.com/pymupdf/RAG",
     classifiers=classifiers,