diff --git a/CHANGES.md b/CHANGES.md
index 11325052..7e0d5289 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,15 +1,53 @@
 # Change Log
 
-## Changes in version 0.0.28
+## Changes in version 0.2.1
 
 ### Fixes:
 
-* [xxx](https://github.com/pymupdf/RAG/issues/xxx) - 
+* [320](https://github.com/pymupdf/RAG/issues/320) - [Bug] ValueError: min() iterable argument is empty ...
+* [319](https://github.com/pymupdf/RAG/issues/319) - [Bug] ValueError: min() arg is an empty sequence 
 
 ### Other Changes:
 
-* xxx
+* OCR invocation now differentiates between full-page OCR and text-only OCR: If the page does contain text but the percentage of unreadable characters exceeds a certain threshold (90%), we only OCR text span boundary boxes and replace span text with OCR'ed text where necessary.
 
+------
+
+## Changes in version 0.2.0
+
+This version introduces full support of the [PyMuPDF-Layout](https://pypi.org/project/pymupdf-layout/) package. This entails a radically new approach for detecting the layout of document pages using the AI-based features of the layout package.
+
+Improvements include:
+
+* Greatly improved table detection
+* Support of list item hierarchy levels
+* Detection of page headers and footers
+* Improved detection of text paragraphs, titles and section headers
+* New output options beyond Markdown: plain text and JSON
+* Automatically detect whether a page needs OCR and invoke Tesseract if both Tesseract is installed and OpenCV (package [opencv-python](https://pypi.org/project/opencv-python/)) is available. Invocation criteria include absence of readable text, full-page coverage with images, and presence of many character-sized vector graphics.
+
+The PyMuPDF-Layout package is not open-source and has its own license, which is different from PyMuPDF4LLM. It also is dependent on a number of other, fairly large packages like [onnxruntime](https://pypi.org/project/onnxruntime/), [numpy](https://pypi.org/project/numpy/), [sympy](https://pypi.org/project/sympy/) and [OpenCV](https://pypi.org/project/opencv-python/), which each in turn have their own dependencies.
+
+We therefore keep the use of the layout feature optional. To activate PyMuPDF-Layout support the following import statement **_must be included before_** importing PyMuPDF4LLM itself:
+
+```python
+import pymupdf.layout
+import pymupdf4llm
+```
+
+Thereafter, PyMuPDF's namespace is available. The known method `pymupdf4llm.to_markdown()` automatically works with AI-based empowerment.
+In addition, two new methods become available:
+* `pymupdf4llm.to_text()` - which works much like markdown output but produces plain text.
+* `pymupdf4llm.to_json()` - which outputs the document's metadata and the selected pages in JSON format.
+
+### Fixes:
+
+
+### Other Changes:
+
+* If `show_progress=True`, Python package [tqdm](https://pypi.org/project/tqdm/) is automatically used when available to display a progress bar. If tqdm is not installed, our own text-based progress bar is used.
+
+------
 
 ## Changes in version 0.0.27
 
diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py
index ba82a62c..2e434a8b 100644
--- a/pdf4llm/setup.py
+++ b/pdf4llm/setup.py
@@ -6,7 +6,7 @@
 with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
     readme = f.read()
 
-version = "0.2.0"
+version = "0.2.1"
 
 classifiers = [
     "Development Status :: 5 - Production/Stable",
diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py
index 8ca76f53..d8139ddc 100644
--- a/pymupdf4llm/pymupdf4llm/__init__.py
+++ b/pymupdf4llm/pymupdf4llm/__init__.py
@@ -26,6 +26,9 @@ def parse_document(
         image_format="png",
         image_path="",
         pages=None,
+        output_images=True,
+        show_progress=False,
+        force_text=True,
     ):
         return DL.parse_document(
             doc,
@@ -34,6 +37,9 @@ def parse_document(
             image_format=image_format,
             image_path=image_path,
             pages=pages,
+            output_images=output_images,
+            show_progress=show_progress,
+            force_text=force_text,
         )
 
     def to_markdown(
@@ -75,6 +81,9 @@ def to_markdown(
             image_format=image_format,
             image_path=image_path,
             pages=pages,
+            output_images=embed_images or write_images,
+            show_progress=show_progress,
+            force_text=force_text,
         )
         return parsed_doc.to_markdown(
             header=header,
@@ -82,6 +91,7 @@ def to_markdown(
             write_images=write_images,
             embed_images=embed_images,
             ignore_code=ignore_code,
+            show_progress=show_progress,
         )
 
     def to_json(
@@ -92,6 +102,9 @@ def to_json(
         image_format="png",
         image_path="",
         pages=None,
+        output_images=False,
+        show_progress=False,
+        force_text=True,
     ):
         parsed_doc = parse_document(
             doc,
@@ -99,6 +112,9 @@ def to_json(
             image_format=image_format,
             image_path=image_path,
             pages=pages,
+            output_images=output_images,
+            show_progress=show_progress,
+            force_text=force_text,
         )
         return parsed_doc.to_json()
 
@@ -109,6 +125,8 @@ def to_text(
         footer=True,
         pages=None,
         ignore_code=False,
+        show_progress=False,
+        force_text=True,
     ):
         parsed_doc = parse_document(
             doc,
@@ -117,11 +135,15 @@ def to_text(
             image_format="png",
             image_path="",
             pages=pages,
+            output_images=False,
+            show_progress=show_progress,
+            force_text=force_text,
         )
         return parsed_doc.to_text(
             header=header,
             footer=footer,
             ignore_code=ignore_code,
+            show_progress=show_progress,
         )
 
 
diff --git a/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py b/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py
index f9ad27f8..37c658e6 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py
@@ -1,28 +1,42 @@
-import pymupdf  # PyMuPDF
-import numpy as np
 import cv2
+import numpy as np
+import pymupdf  # PyMuPDF
+from pymupdf4llm.helpers.utils import WHITE_CHARS
 
 
-WHITE_CHARS = set(
-    [chr(i) for i in range(33)]
-    + [
-        "\u00a0",  # Non-breaking space
-        "\u2000",  # En quad
-        "\u2001",  # Em quad
-        "\u2002",  # En space
-        "\u2003",  # Em space
-        "\u2004",  # Three-per-em space
-        "\u2005",  # Four-per-em space
-        "\u2006",  # Six-per-em space
-        "\u2007",  # Figure space
-        "\u2008",  # Punctuation space
-        "\u2009",  # Thin space
-        "\u200a",  # Hair space
-        "\u202f",  # Narrow no-break space
-        "\u205f",  # Medium mathematical space
-        "\u3000",  # Ideographic space
-    ]
-)
+def get_tessocr(page, bbox, dpi=300):
+    """Return OCR-ed span text using Tesseract.
+
+    Args:
+        page: pymupdf Page
+        bbox: pymupdf Rect or its sequence
+        dpi: resolution for OCR image
+    Returns:
+        The OCR-ed text of the bbox as a single, stripped line.
+    """
+    # Make a high-resolution image of the bbox and OCR it into a
+    # temporary PDF.
+    pix = page.get_pixmap(dpi=dpi, clip=bbox)
+    with pymupdf.open("pdf", pix.pdfocr_tobytes()) as ocrpdf:
+        # the context manager closes the temporary PDF again
+        text = ocrpdf[0].get_text()
+    return text.replace("\n", " ").strip()
+
+
+def repair_blocks(input_blocks, page):
+    """Replace the text of spans containing U+FFFD with Tesseract OCR text.
+
+    Spans are updated in place; a new list of the same blocks is returned.
+    """
+    repaired_blocks = list(input_blocks)
+    for block in repaired_blocks:
+        if block["type"] != 0:  # only text blocks are searched for spans
+            continue
+        for line in block["lines"]:
+            for span in line["spans"]:
+                if chr(0xFFFD) in span["text"]:
+                    span["text"] = get_tessocr(page, span["bbox"])
+    return repaired_blocks
 
 
 def detect_qr_codes(img):
@@ -152,23 +166,38 @@ def should_ocr_page(
     # Check for text
     text = page.get_text(flags=0)
     decision["has_text"] = not WHITE_CHARS.issuperset(text)
-    if decision["has_text"]:
-        not_readable_count = len([c for c in text if c == chr(0xFFFD)])
-        readability = 1 - not_readable_count / len(text)
-        decision["readable_text"] = readability >= text_readability_thresh
 
     all_text_bboxes = [b for b in page.get_bboxlog() if "text" in b[0]]
     ocr_text_bboxes = [b for b in all_text_bboxes if b[0] == "ignore-text"]
     decision["has_ocr_text"] = bool(ocr_text_bboxes)
+
+    if decision["has_text"]:
+        unreadable_count = len([c for c in text if c == chr(0xFFFD)])
+        readability = 1 - unreadable_count / len(text)
+        decision["readable_text"] = readability >= text_readability_thresh
+
+    if decision["has_text"] and not decision["readable_text"]:
+        decision["should_ocr"] = True
+        decision["image"], decision["transform"], decision["pixmap"] = get_page_image(
+            page, dpi=dpi
+        )
+
+    if decision["has_text"]:
+        # early exit if any text exists
+        print(
+            f"{decision['has_text']=}, {decision['readable_text']=}, {decision['should_ocr']=}"
+        )
+        return decision
+
     # Check for image coverage
-    image_rects=[page_rect&img["bbox"] for img in page.get_image_info()]
-    image_rect=pymupdf.EMPTY_RECT()
+    image_rects = [page_rect & img["bbox"] for img in page.get_image_info()]
+    image_rect = pymupdf.EMPTY_RECT()
     for r in image_rects:
-        image_rect|=r
-    image_area=abs(image_rect)
+        image_rect |= r
+    image_area = abs(image_rect)
     if image_area:
         images_cover = image_area / page_area
-    else:        
+    else:
         images_cover = 0.0
     decision["image_covers_page"] = images_cover >= image_coverage_thresh
 
@@ -189,16 +218,11 @@ def should_ocr_page(
 
     # Final decision
     if (
-        1
-        and not decision["has_text"]
-        and not decision["readable_text"]
-        and (
-            0
-            or decision["image_covers_page"]
-            or decision["has_vector_drawings"]
-            or decision["edge_density"] > edge_thresh
-        )
+        0
+        or decision["image_covers_page"]
+        or decision["has_vector_drawings"]
+        or decision["edge_density"] > edge_thresh
     ):
         decision["should_ocr"] = True
-    
+
     return decision
diff --git a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py
index 22ca33a3..a69f5dfe 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py
@@ -1,15 +1,19 @@
 import base64
 import json
 import os
-from binascii import b2a_base64
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Union
 from pathlib import Path
+from typing import Dict, List, Optional, Union
+
 import pymupdf
 import tabulate
-from pymupdf4llm.helpers.get_text_lines import get_raw_lines
 from pymupdf4llm.helpers import utils
+from pymupdf4llm.helpers.get_text_lines import get_raw_lines
 
+try:
+    from tqdm import tqdm as ProgressBar
+except ImportError:
+    from pymupdf4llm.helpers.progress import ProgressBar
 try:
     import cv2
     from pymupdf4llm.helpers import check_ocr
@@ -33,7 +37,7 @@
 
 def omit_if_pua_char(text):
     """Check if character is in the Private Use Area (PUA) of Unicode."""
-    if len(text) > 1:
+    if len(text) > 1:  # only single characters are checked
         return text
     o = ord(text)
     if (
@@ -46,8 +50,7 @@ def omit_if_pua_char(text):
 
 
 def create_list_item_levels(layout_info):
-    """Create a dictionary that maps the box number of each list-item to
-    its hierarchy level.
+    """Map the layout box number of each list-item to its hierarchy level.
 
     Args:
         layout_info (list): the bbox list "page.layout_information"
@@ -59,10 +62,10 @@ def create_list_item_levels(layout_info):
     segment = []  # current segment
 
     # Create segments of contiguous list items. Each non-list-item finishes
-    # the current segment. Also, if two list-items belong to different page
-    # text columns ends a segment.
+    # the current segment. Also, two list-items in a row belonging to different
+    # page text columns end the segment after the first item.
     for i, item in enumerate(layout_info):
-        if item.boxclass != "list-item":  # bbox class is not list-item
+        if item.boxclass != "list-item":  # bbox class is no list-item
             if segment:  # end and save the current segment
                 segments.append(segment)
                 segment = []
@@ -83,7 +86,7 @@ def create_list_item_levels(layout_info):
 
     # walk through segments and assign levels
     for i, s in enumerate(segments):
-        if not s:
+        if not s:  # skip empty segments
             continue
         s.sort(key=lambda x: x[1].x0)  # sort by x0 coordinate of the bbox
 
@@ -119,16 +122,15 @@ def is_monospaced(textlines):
 
 def is_superscripted(line):
     spans = line["spans"]
+    # superscript: flag bit 0 set, or first span smaller with a lesser origin y than the second
     if not spans:
         return False
-    if spans[0]["flags"] & 1:  # check for superscript
+    span0 = spans[0]
+    if span0["flags"] & 1:  # check for superscript flag
         return True
-    if len(spans) < 2:
+    if len(spans) < 2:  # single span line: skip
         return False
-    if (
-        spans[0]["origin"][1] < spans[1]["origin"][1]
-        and spans[0]["size"] < spans[1]["size"]
-    ):
+    if span0["origin"][1] < spans[1]["origin"][1] and span0["size"] < spans[1]["size"]:
         return True
     return False
 
@@ -138,18 +140,18 @@ def get_plain_text(spans):
     Parameter is a list of span dictionaries. The spans may come from
     one or more original "textlines" items.
     Returns the text string of the boundary box.
-    The text string always ends with the suffix and a space
     """
     output = ""
     for i, s in enumerate(spans):
-        span_text = s["text"].strip()  # remove leading/trailing spaces
         superscript = s["flags"] & 1
-        span_text = s["text"].strip()
+        span_text = s["text"].strip()  # remove leading/trailing spaces
         if superscript:
+            # enclose superscripted text in brackets if first span
             if i == 0:
                 span_text = f"[{span_text}] "
             elif output.endswith(" "):
                 output = output[:-1]
+        # resolve hyphenation
         if output.endswith("- ") and len(output.split()[-1]) > 2:
             output = output[:-2]
         output += span_text + " "
@@ -160,7 +162,7 @@ def list_item_to_text(textlines, level):
     """
     Convert "list-item" bboxes to text.
     """
-    indent = "   " * (level - 1)
+    indent = "   " * (level - 1)  # indentation based on level
     output = indent
     line = textlines[0]
     x0 = line["bbox"][0]  # left of first line
@@ -195,11 +197,13 @@ def footnote_to_text(textlines):
     """
     Convert "footnote" bboxes to text.
     """
+    # we render footnotes as blockquotes
     output = "> "
     line = textlines[0]
     spans = line["spans"]
 
     for line in textlines[1:]:
+        # superscripted line starts a new footnote line
         if is_superscripted(line):
             line_output = get_plain_text(spans)
             output += line_output
@@ -214,7 +218,10 @@ def footnote_to_text(textlines):
 
 
 def code_block_to_text(textlines):
-    """Output a code block in plain text format."""
+    """Output a code block in plain text format.
+
+    Basic difference is that lines are separated by line breaks.
+    """
     output = ""
     for line in textlines:
         line_text = ""
@@ -228,15 +235,17 @@ def code_block_to_text(textlines):
 
 def text_to_text(textlines, ignore_code: bool = False):
     """
-    Convert "text" bboxes to plain text, as well as other boxclasses
+    Convert "text" bboxes to plain text, as well as boxclasses
     not specifically handled elsewhere.
-    The line text is written without line breaks. At the end,
-    two newlines are added to separate from the next block.
+    The text of all spans of all lines is written without line breaks.
+    At the end, two newlines are added to separate from the next block.
     """
     if not textlines:
         return ""
     if is_superscripted(textlines[0]):  # check for superscript
+        # handle mis-classified text boundary box
         return footnote_to_text(textlines)
+    # handle completely monospaced textlines as code block
     if not ignore_code and is_monospaced(textlines):
         return code_block_to_text(textlines)
 
@@ -249,6 +258,47 @@ def text_to_text(textlines, ignore_code: bool = False):
     return output + "\n\n"
 
 
+def picture_text_to_text(textlines, ignore_code: bool = False, clip=None):
+    """
+    Render the text lines found inside a picture box as plain text.
+    """
+    pieces = ["----- Start of picture text -----\n"]
+    for textline in textlines:
+        joined = " ".join(span["text"] for span in textline["spans"])
+        pieces.append(joined.rstrip() + "\n")
+    pieces.append("----- End of picture text -----\n")
+    return "".join(pieces) + "\n"
+
+
+def fallback_text_to_text(textlines, ignore_code: bool = False, clip=None):
+    """Convert text extracted from unrecognized tables.
+
+    We hope for some sort of table structure being present in the text spans:
+    The maximum span count in the lines is assumed to equal column count.
+    A shorter line whose first span starts more than 10 units right of the
+    left edge of "clip" is assumed to begin in the second column.
+    """
+    span_count = max((len(tl["spans"]) for tl in textlines), default=0)
+    if span_count == 0:  # no spans at all: nothing to tabulate
+        return ""
+    left = clip[0] if clip is not None else None  # left edge of the box
+    lines = []
+    for tl in textlines:
+        spans = tl["spans"]
+        # prepare a row with empty strings in each cell
+        line = [""] * span_count
+        # a short row that starts indented begins in the second column
+        indented = left is not None and spans and spans[0]["bbox"][0] > left + 10
+        i = 1 if len(spans) < span_count and indented else 0
+        for j, s in enumerate(spans, start=i):
+            line[j] = s["text"].strip()
+        lines.append(line)
+    # split a budget of 100 characters across the columns, at least 1 each
+    max_width = max(1, 100 // span_count)
+    tab_text = tabulate.tabulate(lines, tablefmt="grid", maxcolwidths=max_width)
+    return tab_text + "\n\n"
+
+
 def get_styled_text(spans):
     """Output text with markdown style codes based on font properties.
     Parameter is a list of span dictionaries. The spans may come from
@@ -325,7 +375,7 @@ def list_item_to_md(textlines, level):
     This post-layout heuristics helps cover cases where more than
     one list item is contained in a single bbox.
     """
-    indent = "   " * (level - 1)
+    indent = "   " * (level - 1)  # indentation based on level
     line = textlines[0]
     x0 = line["bbox"][0]  # left of first line
     spans = line["spans"]
@@ -337,6 +387,7 @@ def list_item_to_md(textlines, level):
         starter = "1. "
 
     if not omit_if_pua_char(span0["text"].strip()):
+        # bullet was a PUA char: remove it
         spans.pop(0)
         if spans:
             x0 = spans[0]["bbox"][0]
@@ -452,6 +503,33 @@ def text_to_md(textlines, ignore_code: bool = False):
     return output + "\n\n"
 
 
+def picture_text_to_md(textlines, ignore_code: bool = False, clip=None):
+    """
+    Render the text lines found inside a picture box as markdown.
+    """
+    pieces = ["**----- Start of picture text -----**<br>\n"]
+    for textline in textlines:
+        joined = " ".join(span["text"] for span in textline["spans"])
+        pieces.append(joined.rstrip() + "<br>")
+    pieces.append("**----- End of picture text -----**<br>\n")
+    return "".join(pieces) + "\n\n"
+
+
+def fallback_text_to_md(textlines, ignore_code: bool = False, clip=None):
+    """
+    Convert text of a "fallback" box (unrecognized table) to a markdown table.
+    """
+    column_count = max(len(textline["spans"]) for textline in textlines)
+    pieces = ["**----- Start of picture text -----**<br>\n"]
+    pieces.append("|" * (column_count + 1) + "\n")
+    pieces.append("|" + "|".join(["---"] * column_count) + "|\n")
+    for textline in textlines:
+        cells = [span["text"].strip() for span in textline["spans"]]
+        pieces.append("|" + "|".join(cells) + "|\n")
+    pieces.append("**----- End of picture text -----**<br>\n")
+    return "".join(pieces) + "\n\n"
+
+
 @dataclass
 class LayoutBox:
     x0: float
@@ -502,42 +580,68 @@ def to_markdown(
         write_images: bool = False,
         embed_images: bool = False,
         ignore_code: bool = False,
+        show_progress: bool = False,
     ) -> str:
         """
         Serialize ParsedDocument to markdown text.
         """
         output = ""
-        for page in self.pages:
+        if show_progress and len(self.pages) > 5:
+            print(f"Generating markdown text...")
+            this_iterator = ProgressBar(self.pages)
+        else:
+            this_iterator = self.pages
+        for page in this_iterator:
 
-            # make mapping: box number to list item level
+            # Make a mapping: box number -> list item hierarchy level
             list_item_levels = create_list_item_levels(page.boxes)
+
             for i, box in enumerate(page.boxes):
                 clip = pymupdf.IRect(box.x0, box.y0, box.x1, box.y1)
                 btype = box.boxclass
+
+                # skip headers/footers if requested
                 if btype == "page-header" and header is False:
                     continue
                 if btype == "page-footer" and footer is False:
                     continue
-                if btype in ("picture", "formula") and box.image:
-                    img_filename = f"{self.filename}-{page.page_number:04d}-{i:02d}.{self.image_format}"
-                    if write_images:
-                        filename = os.path.basename(self.filename).replace(" ", "-")
-                        image_filename = os.path.join(
-                            self.image_path,
-                            f"{filename}-{page.page_number:04d}-{i:02d}.{self.image_format}",
-                        )
-                        Path(image_filename).write_bytes(box.image)
-
-                        output += GRAPHICS_TEXT % img_filename
-
-                    elif embed_images:
-                        # make a base64 encoded string of the image
-                        data = b2a_base64(box.image).decode()
-                        data = f"data:image/{self.image_format};base64," + data
-                        output += GRAPHICS_TEXT % data + "\n\n"
 
+                # pictures and formulas: either write image file or embed
+                if btype in ("picture", "formula", "fallback"):
+                    if box.image:
+                        if write_images:
+                            img_filename = f"{self.filename}-{page.page_number:04d}-{i:02d}.{self.image_format}"
+                            filename = os.path.basename(self.filename).replace(" ", "-")
+                            image_filename = os.path.join(
+                                self.image_path,
+                                f"{filename}-{page.page_number:04d}-{i:02d}.{self.image_format}",
+                            )
+                            Path(image_filename).write_bytes(box.image)
+
+                            output += GRAPHICS_TEXT % img_filename
+
+                        elif embed_images:
+                            # make a base64 encoded string of the image
+                            data = base64.b64encode(box.image).decode()
+                            data = f"data:image/{self.image_format};base64," + data
+                            output += GRAPHICS_TEXT % data + "\n\n"
                     else:
-                        output += f"**==> {btype} [{clip.width} x {clip.height}] intentionally omitted <==**\n\n"
+                        output += f"**==> picture [{clip.width} x {clip.height}] intentionally omitted <==**\n\n"
+
+                    # output text in image if requested
+                    if box.textlines:
+                        if btype == "picture":
+                            output += picture_text_to_md(
+                                box.textlines,
+                                ignore_code=ignore_code or page.ocrpage,
+                                clip=clip,
+                            )
+                        elif btype == "fallback":
+                            output += fallback_text_to_md(
+                                box.textlines,
+                                ignore_code=ignore_code or page.ocrpage,
+                                clip=clip,
+                            )
                     continue
                 if btype == "table":
                     output += box.table["markdown"] + "\n\n"
@@ -564,7 +668,7 @@ def to_markdown(
 
         return output
 
-    def to_json(self) -> str:
+    def to_json(self, show_progress=False) -> str:
         # Serialize to JSON
         class LayoutEncoder(json.JSONEncoder):
             def default(self, s):
@@ -593,13 +697,19 @@ def to_text(
         header: bool = True,
         footer: bool = True,
         ignore_code: bool = False,
+        show_progress: bool = False,
     ) -> str:
         """
         Serialize ParsedDocument to plain text. Optionally omit page headers or footers.
         """
         # Flatten all text boxes into plain text
         output = ""
-        for page in self.pages:
+        if show_progress and len(self.pages) > 5:
+            print(f"Generating plain text ..")
+            this_iterator = ProgressBar(self.pages)
+        else:
+            this_iterator = self.pages
+        for page in this_iterator:
             list_item_levels = create_list_item_levels(page.boxes)
             for i, box in enumerate(page.boxes):
                 clip = pymupdf.IRect(box.x0, box.y0, box.x1, box.y1)
@@ -608,8 +718,21 @@ def to_text(
                     continue
                 if btype == "page-footer" and footer is False:
                     continue
-                if btype in ("picture", "formula"):
-                    output += f"==> {btype} [{clip.width} x {clip.height}] <==\n\n"
+                if btype in ("picture", "formula", "fallback"):
+                    output += f"==> picture [{clip.width} x {clip.height}] <==\n\n"
+                    if box.textlines:
+                        if btype == "picture":
+                            output += picture_text_to_text(
+                                box.textlines,
+                                ignore_code=ignore_code or page.ocrpage,
+                                clip=clip,
+                            )
+                        elif btype == "fallback":
+                            output += fallback_text_to_text(
+                                box.textlines,
+                                ignore_code=ignore_code or page.ocrpage,
+                                clip=clip,
+                            )
                     continue
                 if btype == "table":
                     output += (
@@ -637,6 +760,9 @@ def parse_document(
     image_format="png",
     image_path="",
     pages=None,
+    show_progress=False,
+    output_images=True,
+    force_text=False,
 ) -> ParsedDocument:
     if isinstance(doc, pymupdf.Document):
         mydoc = doc
@@ -651,6 +777,7 @@ def parse_document(
     document.image_format = image_format
     document.image_path = image_path
     document.pages = []
+    document.force_text = force_text
     try:
         reason = "OpenCV not installed"
         assert cv2 is not None
@@ -677,6 +804,9 @@ def parse_document(
         raise ValueError(
             "'pages' parameter must be None, int, or a sequence of ints less than page count"
         )
+    if show_progress and len(page_filter) > 5:
+        print(f"Parsing {len(page_filter)} pages of '{document.filename}'...")
+        page_filter = ProgressBar(page_filter)
     for pno in page_filter:
         page = mydoc.load_page(pno)
 
@@ -687,28 +817,36 @@ def parse_document(
             decision = {"should_ocr": False}
         if decision["should_ocr"]:
             print(f"Performing OCR on {page.number=}[{page.number+1}]...")
-            pix = decision["pixmap"]  # retrieve the Pixmap
-            pdf_data = pix.pdfocr_tobytes()  # OCR it
-            ocr_pdf = pymupdf.open("pdf", pdf_data)  # get the OCR'd PDF
-            ocrpage = ocr_pdf[0]  # this is its OCR'd page
-            # remove everything except the text
-            ocrpage.add_redact_annot(ocrpage.rect)
-            ocrpage.apply_redactions(
-                images=pymupdf.PDF_REDACT_IMAGE_REMOVE,
-                graphics=pymupdf.PDF_REDACT_LINE_ART_NONE,
-                text=pymupdf.PDF_REDACT_TEXT_NONE,
-            )
-            # copy text over to original page
-            page.show_pdf_page(page.rect, ocr_pdf, 0)
-            ocr_pdf.close()  # discard temporary OCR PDF
-            del ocr_pdf
+            if not decision.get("has_text"):
+                pix = decision["pixmap"]  # retrieve the Pixmap
+                pdf_data = pix.pdfocr_tobytes()  # OCR it
+                ocr_pdf = pymupdf.open("pdf", pdf_data)  # get the OCR'd PDF
+                ocrpage = ocr_pdf[0]  # this is its OCR'd page
+                # remove everything except the text
+                ocrpage.add_redact_annot(ocrpage.rect)
+                ocrpage.apply_redactions(
+                    images=pymupdf.PDF_REDACT_IMAGE_REMOVE,
+                    graphics=pymupdf.PDF_REDACT_LINE_ART_NONE,
+                    text=pymupdf.PDF_REDACT_TEXT_NONE,
+                )
+                # copy text over to original page
+                page.show_pdf_page(page.rect, ocr_pdf, 0)
+                ocr_pdf.close()  # discard temporary OCR PDF
+                del ocr_pdf
+                textpage = page.get_textpage(flags=FLAGS)
+                blocks = textpage.extractDICT()["blocks"]
+            else:
+                textpage = page.get_textpage(flags=FLAGS)
+                blocks = textpage.extractDICT()["blocks"]
+                blocks = check_ocr.repair_blocks(blocks, page)
+        else:
+            textpage = page.get_textpage(flags=FLAGS)
+            blocks = textpage.extractDICT()["blocks"]
 
         bboxlog = page.get_bboxlog()
-        ocrpage = decision["should_ocr"] or (
+        ocrpage = (
             set([b[0] for b in bboxlog if b[0] == "ignore-text"]) == CHECK_OCR_TEXT
         )
-        textpage = page.get_textpage(flags=FLAGS)
-        blocks = textpage.extractDICT()["blocks"]
         page.get_layout()
         utils.clean_pictures(page, blocks)
         utils.add_image_orphans(page, blocks)
@@ -749,8 +887,23 @@ def parse_document(
             clip = pymupdf.Rect(box[:4])
 
             if layoutbox.boxclass in ("picture", "formula"):
-                pix = page.get_pixmap(clip=clip, dpi=document.image_dpi)
-                layoutbox.image = pix.tobytes(document.image_format)
+                if output_images:
+                    pix = page.get_pixmap(clip=clip, dpi=document.image_dpi)
+                    layoutbox.image = pix.tobytes(document.image_format)
+                else:
+                    layoutbox.image = None
+                if layoutbox.boxclass == "picture" and document.force_text:
+                    # extract any text within the image box
+                    layoutbox.textlines = [
+                        {"bbox": l[0], "spans": l[1]}
+                        for l in get_raw_lines(
+                            textpage=None,
+                            blocks=pagelayout.fulltext,
+                            clip=clip,
+                            ignore_invisible=not ocrpage,
+                            only_horizontal=False,
+                        )
+                    ]
 
             elif layoutbox.boxclass == "table":
                 # This is either a table detected by native TableFinder or by
@@ -791,11 +944,23 @@ def parse_document(
                     )
 
                 except Exception as e:
-                    print(f"table detection error '{e}' on page {page.number+1}")
+                    # print(f"table detection error '{e}' on page {page.number+1}")
+                    layoutbox.boxclass = "fallback"
                     # table structure not detected: treat like an image
-                    pix = page.get_pixmap(clip=clip, dpi=document.image_dpi)
-                    layoutbox.image = pix.tobytes(document.image_format)
-                    layoutbox.boxclass = "picture"
+                    if output_images:
+                        pix = page.get_pixmap(clip=clip, dpi=document.image_dpi)
+                        layoutbox.image = pix.tobytes(document.image_format)
+                    else:
+                        layoutbox.image = None
+                    layoutbox.textlines = [
+                        {"bbox": l[0], "spans": l[1]}
+                        for l in get_raw_lines(
+                            textpage=None,
+                            blocks=pagelayout.fulltext,
+                            clip=clip,
+                            ignore_invisible=not ocrpage,
+                        )
+                    ]
             else:
                 # Handle text-like box classes:
                 # Extract text line information within the box.
diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
index f3ef2c94..4f3cc890 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
@@ -12,17 +12,16 @@
 License GNU Affero GPL 3.0
 """
 
-import string
 import sys
 
 import pymupdf
+from pymupdf4llm.helpers.utils import WHITE_CHARS
 
-WHITE = set(string.whitespace)
 TYPE3_FONT_NAME = "Unnamed-T3"
 
 
 def is_white(text):
-    return WHITE.issuperset(text)
+    return WHITE_CHARS.issuperset(text)
 
 
 def get_raw_lines(
@@ -31,6 +30,7 @@ def get_raw_lines(
     clip=None,
     tolerance=3,
     ignore_invisible=True,
+    only_horizontal=True,
 ):
     """Extract the text spans from a TextPage in natural reading sequence.
 
@@ -124,7 +124,10 @@ def sanitize_spans(line):
     spans = []  # all spans in TextPage here
     for bno, b in enumerate(blocks):  # the numbered blocks
         for lno, line in enumerate(b["lines"]):  # the numbered lines
-            if abs(1 - line["dir"][0]) > 1e-3:  # only accept horizontal text
+            line_dir = line["dir"]
+            if (
+                only_horizontal and abs(1 - line_dir[0]) > 1e-3
+            ):  # skip non-horizontal lines when only_horizontal is set
                 continue
             for sno, s in enumerate(line["spans"]):  # the numered spans
                 sbbox = pymupdf.Rect(s["bbox"])  # span bbox as a Rect
@@ -150,12 +153,13 @@ def sanitize_spans(line):
                 # include line/block numbers to facilitate separator insertion
                 s["line"] = lno
                 s["block"] = bno
+                s["dir"] = line_dir
                 spans.append(s)
 
     if not spans:  # no text at all
         return []
 
-    spans.sort(key=lambda s: s["bbox"].y1)  # sort spans by bottom coord
+    spans.sort(key=lambda s: (-s["dir"][0], s["bbox"].y1))  # descending dir[0] (horizontal first), then bottom coord
     nlines = []  # final result
     line = [spans[0]]  # collects spans with fitting vertical coordinates
     lrect = spans[0]["bbox"]  # rectangle joined from span rectangles
diff --git a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py
index 4cdd8097..63b966c7 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py
@@ -60,9 +60,8 @@
 License GNU Affero GPL 3.0
 """
 
-import string
-
 import pymupdf
+from pymupdf4llm.helpers.utils import WHITE_CHARS
 
 pymupdf.TOOLS.unset_quad_corrections(True)
 
@@ -88,11 +87,10 @@ def column_boxes(
         paths: use these drawings instead of extracting here
         avoid: ignore text in any of these areas
     """
-    WHITE = set(string.whitespace)
 
     def is_white(text):
         """Check for relevant text."""
-        return WHITE.issuperset(text)
+        return WHITE_CHARS.issuperset(text)
 
     def in_bbox(bb, bboxes):
         """Return 1-based number if a bbox contains bb, else return 0."""
diff --git a/pymupdf4llm/pymupdf4llm/helpers/progress.py b/pymupdf4llm/pymupdf4llm/helpers/progress.py
index e71e601a..7cbb2eac 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/progress.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/progress.py
@@ -29,13 +29,9 @@ def __init__(self, items: List[Any], progress_width: int = 40):
         self._increment = self._progress_width / self._len if self._len else 1
 
         # Init progress bar
-        sys.stdout.write(
-            "[%s] (0/%d)" % (" " * self._progress_width, self._len)
-        )
+        sys.stdout.write("[%s] (0/%d)" % (" " * self._progress_width, self._len))
         sys.stdout.flush()
-        sys.stdout.write(
-            "\b" * (self._progress_width + len(str(self._len)) + 6)
-        )
+        sys.stdout.write("\b" * (self._progress_width + len(str(self._len)) + 6))
 
     def __iter__(self):
         return self
@@ -61,9 +57,7 @@ def __next__(self):
         # Update the numerical progress
         padded_index = str(self._current_index).rjust(self._len_digits)
         progress_info = f" ({padded_index}/{self._len})"
-        sys.stdout.write(
-            "\b" * (self._progress_width + len(progress_info) + 1)
-        )
+        sys.stdout.write("\b" * (self._progress_width + len(progress_info) + 1))
         sys.stdout.write("[")
         sys.stdout.write(
             "=" * int(self._current_index * self._progress_width / self._len)
diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
index f0c76e6b..d0f2e9aa 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
@@ -45,26 +45,17 @@
 from pymupdf import mupdf
 from pymupdf4llm.helpers.get_text_lines import get_raw_lines, is_white
 from pymupdf4llm.helpers.multi_column import column_boxes
-from pymupdf4llm.helpers.progress import ProgressBar
+from pymupdf4llm.helpers.utils import BULLETS
+
+try:
+    from tqdm import tqdm as ProgressBar
+except ImportError:
+    from pymupdf4llm.helpers.progress import ProgressBar
 
 pymupdf.TOOLS.unset_quad_corrections(True)
 
-# Characters recognized as bullets when starting a line.
-bullet = tuple(
-    [
-        "- ",
-        "* ",
-        "> ",
-        chr(0xB6),
-        chr(0xB7),
-        chr(8224),
-        chr(8225),
-        chr(8226),
-        chr(0xF0A7),
-        chr(0xF0B7),
-    ]
-    + list(map(chr, range(9632, 9680)))
-)
+# Characters assumed as bullets when starting a line.
+bullet = tuple(BULLETS | {"- ", "* ", "> "})
 
 GRAPHICS_TEXT = "\n![](%s)\n"
 
@@ -1116,16 +1107,16 @@ def get_page_output(
             # layout analysis. Treat whole page as one text block.
             text_rects = [parms.clip]
         else:
-        text_rects = column_boxes(
-            parms.page,
-            paths=parms.actual_paths,
-            no_image_text=not force_text,
-            textpage=parms.textpage,
-            avoid=parms.tab_rects0 + parms.vg_clusters0,
-            footer_margin=margins[3],
-            header_margin=margins[1],
-            ignore_images=IGNORE_IMAGES,
-        )
+            text_rects = column_boxes(
+                parms.page,
+                paths=parms.actual_paths,
+                no_image_text=not force_text,
+                textpage=parms.textpage,
+                avoid=parms.tab_rects0 + parms.vg_clusters0,
+                footer_margin=margins[3],
+                header_margin=margins[1],
+                ignore_images=IGNORE_IMAGES,
+            )
 
         """
         ------------------------------------------------------------------
diff --git a/pymupdf4llm/pymupdf4llm/helpers/utils.py b/pymupdf4llm/pymupdf4llm/helpers/utils.py
index 9df5a3e0..f25fadb1 100644
--- a/pymupdf4llm/pymupdf4llm/helpers/utils.py
+++ b/pymupdf4llm/pymupdf4llm/helpers/utils.py
@@ -1,6 +1,38 @@
 import pymupdf
 
-white_spaces = set([chr(i) for i in range(33)]) | {0xA0, 0x2002, 0x2003, 0x2009, 0x202F}
+WHITE_CHARS = set(
+    [chr(i) for i in range(33)]
+    + [
+        "\u00a0",  # Non-breaking space
+        "\u2000",  # En quad
+        "\u2001",  # Em quad
+        "\u2002",  # En space
+        "\u2003",  # Em space
+        "\u2004",  # Three-per-em space
+        "\u2005",  # Four-per-em space
+        "\u2006",  # Six-per-em space
+        "\u2007",  # Figure space
+        "\u2008",  # Punctuation space
+        "\u2009",  # Thin space
+        "\u200a",  # Hair space
+        "\u202f",  # Narrow no-break space
+        "\u205f",  # Medium mathematical space
+        "\u3000",  # Ideographic space
+    ]
+)
+
+BULLETS = set(  # single characters assumed to be list bullets
+    [
+        chr(0xB6),  # U+00B6 pilcrow sign
+        chr(0xB7),  # U+00B7 middle dot
+        chr(0x2020),  # U+2020 dagger
+        chr(0x2021),  # U+2021 double dagger
+        chr(0x2022),  # U+2022 bullet
+        chr(0xF0A7),  # private-use code point; presumably a symbol-font bullet - TODO confirm
+        chr(0xF0B7),  # private-use code point; presumably a symbol-font bullet - TODO confirm
+    ]
+    + list(map(chr, range(0x25A0, 0x2600)))  # U+25A0..U+25FF (Geometric Shapes block)
+)
 
 
 def table_cleaner(page, blocks, tbbox):
@@ -156,19 +188,29 @@ def add_image_orphans(page, blocks):
     images = []
     for img in page.get_image_info():
         r = page.rect & img["bbox"]
+        if r.width <= 3 or r.height <= 3:
+            continue
         if r.is_empty or abs(r) >= area_limit:
             continue
         images.append(r)
 
     paths = []
-    for b in blocks:
-        if b["type"] != 3:
-            continue
-        r = page.rect & b["bbox"]
+    vectors = sorted(
+        [
+            page.rect & b["bbox"]
+            for b in blocks
+            if b["type"] == 3
+            and b["bbox"][3] - b["bbox"][1] > 3
+            and b["bbox"][2] - b["bbox"][0] > 3
+        ],
+        key=lambda v: abs(v),
+        reverse=True,
+    )
+    vectors = vectors[:500]
+
+    for r in vectors:
         if abs(r) >= area_limit:
             continue
-        if r.width < 3 and r.height < 3:
-            continue
         r_low_limit = 0.1 * abs(r)
         r_hi_limit = 0.8 * abs(r)
 
@@ -186,7 +228,7 @@ def add_image_orphans(page, blocks):
 
     # resolve mutual containment of images and vectors
     imgs = sorted(images + vectors, key=lambda r: abs(r), reverse=True)
-
+    imgs = imgs[:500]
     filtered_imgs = []
     for r in imgs:
         if not any(r in fr for fr in filtered_imgs):
@@ -405,27 +447,35 @@ def filter_contained(boxes) -> list:
             body_boxes.append(box)
 
     # compute joined boxes of body
-    joined_boxes = pymupdf.Rect(
-        min(b[0] for b in body_boxes),
-        min(b[1] for b in body_boxes),
-        max(b[2] for b in body_boxes),
-        max(b[3] for b in body_boxes),
-    )
+    if not body_boxes:
+        joined_boxes = pymupdf.EMPTY_RECT()
+    else:
+        joined_boxes = pymupdf.Rect(
+            min(b[0] for b in body_boxes),
+            min(b[1] for b in body_boxes),
+            max(b[2] for b in body_boxes),
+            max(b[3] for b in body_boxes),
+        )
 
     # extract vectors contained in the TextPage
-    min_bbox_height = min(b[3] - b[1] for b in body_boxes)
-    vectors = [
-        pymupdf.Rect(b["bbox"])
-        for b in blocks
-        if b["bbox"][3] - b["bbox"][1] >= min_bbox_height and b["bbox"] in joined_boxes
-    ]
-    # bring body into reading order
-    ordered = compute_reading_order(
-        body_boxes,
-        joined_boxes,
-        vectors,
-        vertical_gap=this_vertical_gap,
-    )
+    if not joined_boxes.is_empty:
+        min_bbox_height = min(b[3] - b[1] for b in body_boxes)
+        vectors = [
+            pymupdf.Rect(b["bbox"])
+            for b in blocks
+            if b["bbox"][3] - b["bbox"][1] >= min_bbox_height
+            and b["bbox"] in joined_boxes
+        ]
+        # bring body into reading order
+        ordered = compute_reading_order(
+            body_boxes,
+            joined_boxes,
+            vectors,
+            vertical_gap=this_vertical_gap,
+        )
+    else:
+        ordered = []
+
     # Final full boxes list. We do simple sorts for non-body boxes.
     final = (
         sorted(page_headers, key=lambda r: (r[1], r[0]))
@@ -633,7 +683,7 @@ def outside_cell(bbox, cell):
                     bbox = pymupdf.Rect(char["bbox"])
                     if abs(bbox & cell) > 0.5 * abs(bbox):
                         span_text += this_char
-                    elif this_char in white_spaces:
+                    elif this_char in WHITE_CHARS:
                         span_text += " "
 
                 if not span_text:
diff --git a/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py b/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py
index b178d996..e2217498 100644
--- a/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py
+++ b/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py
@@ -21,9 +21,7 @@ class PDFMarkdownReader(BaseReader):
 
     def __init__(
         self,
-        meta_filter: Optional[
-            Callable[[Dict[str, Any]], Dict[str, Any]]
-        ] = None,
+        meta_filter: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
     ):
         self.meta_filter = meta_filter
 
@@ -79,15 +77,14 @@ def _process_doc_page(
         **load_kwargs: Any,
     ):
         """Processes a single page of a PDF document."""
-        extra_info = self._process_doc_meta(
-            doc, file_path, page_number, extra_info
-        )
+        extra_info = self._process_doc_meta(doc, file_path, page_number, extra_info)
 
         if self.meta_filter:
             extra_info = self.meta_filter(extra_info)
 
         text = to_markdown(
-            doc, pages=[page_number], 
+            doc,
+            pages=[page_number],
             hdr_info=hdr_info,
             **load_kwargs,
         )
diff --git a/pymupdf4llm/pymupdf4llm/versions_file.py b/pymupdf4llm/pymupdf4llm/versions_file.py
index afbe9821..f6a11125 100644
--- a/pymupdf4llm/pymupdf4llm/versions_file.py
+++ b/pymupdf4llm/pymupdf4llm/versions_file.py
@@ -1,3 +1,3 @@
 # Generated file - do not edit.
 MINIMUM_PYMUPDF_VERSION = (1, 26, 6)
-VERSION = '0.2.0'
+VERSION = '0.2.1'
diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py
index 5c7cd2c6..ca3ca28a 100644
--- a/pymupdf4llm/setup.py
+++ b/pymupdf4llm/setup.py
@@ -14,7 +14,7 @@
     "Topic :: Utilities",
 ]
 
-version = "0.2.0"
+version = "0.2.1"
 requires = ["pymupdf>=1.26.6", "tabulate"]
 
 text = requires[0].split("=")[1]