Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pdf4llm/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@
"Programming Language :: Python :: 3",
"Topic :: Utilities",
]
requires = ["pymupdf4llm==0.0.22"]
requires = ["pymupdf4llm==0.0.23"]

setuptools.setup(
name="pdf4llm",
version="0.0.22",
version="0.0.23",
author="Artifex",
author_email="support@artifex.com",
description="PyMuPDF Utilities for LLM/RAG",
Expand Down
2 changes: 1 addition & 1 deletion pymupdf4llm/pymupdf4llm/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown

__version__ = "0.0.22"
__version__ = "0.0.23"
version = __version__
version_tuple = tuple(map(int, version.split(".")))

Expand Down
12 changes: 10 additions & 2 deletions pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,12 @@ def is_white(text):
return WHITE.issuperset(text)


def get_raw_lines(textpage, clip=None, tolerance=3):
def get_raw_lines(
textpage,
clip=None,
tolerance=3,
ignore_invisible=True,
):
"""Extract the text spans from a TextPage in natural reading sequence.

All spans roughly on the same line are joined to generate an improved line.
Expand All @@ -43,6 +48,8 @@ def get_raw_lines(textpage, clip=None, tolerance=3):
turn may be based on a sub-rectangle of the full page).
tolerance: (float) put spans on the same line if their top or bottom
coordinate differ by no more than this value.
ignore_invisible: (bool) if True, invisible text is ignored. This may
have been set to False for pages with OCR text.

Returns:
A sorted list of items (rect, [spans]), each representing one line. The
Expand Down Expand Up @@ -109,7 +116,8 @@ def sanitize_spans(line):
sbbox = pymupdf.Rect(s["bbox"]) # span bbox as a Rect
if is_white(s["text"]): # ignore white text
continue
if s["alpha"] == 0: # ignore invisible text
# ignore invisible text
if s["alpha"] == 0 and ignore_invisible:
continue
if abs(sbbox & clip) < abs(sbbox) * 0.8: # if not in clip
continue
Expand Down
50 changes: 36 additions & 14 deletions pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def __init__(
self,
doc: str,
pages: list = None,
body_limit: float = 11, # default if no text found
body_limit: float = 12, # force this to be body text
max_levels: int = 6, # accept this many header levels
):
"""Read all text and make a dictionary of fontsizes.
Expand Down Expand Up @@ -135,7 +135,7 @@ def __init__(
)
if temp:
# most frequent font size
self.body_limit = min(body_limit, temp[-1][0])
self.body_limit = max(body_limit, temp[-1][0])
else:
self.body_limit = body_limit

Expand Down Expand Up @@ -203,7 +203,7 @@ def get_header_id(self, span: dict, page=None) -> str:
return ""
# check if the span matches a TOC entry
text = span["text"].strip()
for t in toc:
for t in my_toc:
title = t[1].strip() # title of TOC entry
lvl = t[0] # level of TOC entry
if text.startswith(title) or title.startswith(text):
Expand Down Expand Up @@ -494,7 +494,12 @@ def write_text(
out_string = ""

# This is a list of tuples (linerect, spanlist)
nlines = get_raw_lines(parms.textpage, clip=clip, tolerance=3)
nlines = get_raw_lines(
parms.textpage,
clip=clip,
tolerance=3,
ignore_invisible=not parms.accept_invisible,
)
nlines = [
l for l in nlines if not intersects_rects(l[0], parms.tab_rects.values())
]
Expand Down Expand Up @@ -821,6 +826,16 @@ def output_images(parms, text_rect, force_text):

return this_md

def page_is_ocr(page):
    """Check whether the page exclusively contains OCR text.

    Every entry of ``page.get_bboxlog()`` whose type name (item 0) contains
    the substring "text" is considered. The page counts as OCR-only when
    the one and only such type is "ignore-text".

    Args:
        page: object providing ``get_bboxlog()``, returning items whose
            first element is the type string of the logged object.

    Returns:
        True if at least one text entry exists and all text entries are of
        type "ignore-text"; False otherwise (including when there is no
        text entry at all).
    """
    # Collect the distinct text-related type names; non-text entries
    # (those without "text" in their type name) do not influence the result.
    text_types = {b[0] for b in page.get_bboxlog() if "text" in b[0]}
    return text_types == {"ignore-text"}

def get_bg_color(page):
"""Determine the background color of the page.

Expand Down Expand Up @@ -919,6 +934,7 @@ def get_page_output(
parms.graphics = []
parms.words = []
parms.line_rects = []
parms.accept_invisible = page_is_ocr(page) # accept invisible text

# determine background color
parms.bg_color = get_bg_color(page)
Expand Down Expand Up @@ -968,11 +984,17 @@ def get_page_output(

parms.img_rects = [i["bbox"] for i in parms.images]

# catch too-many-graphics situation
graphics_count = len([b for b in page.get_bboxlog() if "path" in b[0]])
if GRAPHICS_LIMIT and graphics_count > GRAPHICS_LIMIT:
IGNORE_GRAPHICS = True
table_strategy = None

# Locate all tables on page
parms.written_tables = [] # stores already written tables
omitted_table_rects = []
if table_strategy is None:
parms.tabs = []
parms.tabs = None
else:
parms.tabs = page.find_tables(clip=parms.clip, strategy=table_strategy)
# remove tables with too few rows or columns
Expand All @@ -986,14 +1008,15 @@ def get_page_output(
# Make a list of table boundary boxes.
# Must include the header bbox (which may exist outside tab.bbox)
tab_rects = {}
for i, t in enumerate(parms.tabs.tables):
tab_rects[i] = pymupdf.Rect(t.bbox) | pymupdf.Rect(t.header.bbox)
tab_dict = {
"bbox": tuple(tab_rects[i]),
"rows": t.row_count,
"columns": t.col_count,
}
parms.tables.append(tab_dict)
if parms.tabs is not None:
for i, t in enumerate(parms.tabs.tables):
tab_rects[i] = pymupdf.Rect(t.bbox) | pymupdf.Rect(t.header.bbox)
tab_dict = {
"bbox": tuple(tab_rects[i]),
"rows": t.row_count,
"columns": t.col_count,
}
parms.tables.append(tab_dict)
parms.tab_rects = tab_rects
# list of table rectangles
parms.tab_rects0 = list(tab_rects.values())
Expand Down Expand Up @@ -1084,7 +1107,6 @@ def get_page_output(
parms.md_string += output_tables(parms, None)
parms.md_string += output_images(parms, None, force_text)

parms.md_string += "\n-----\n\n"
while parms.md_string.startswith("\n"):
parms.md_string = parms.md_string[1:]
parms.md_string = parms.md_string.replace(chr(0), chr(0xFFFD))
Expand Down
2 changes: 1 addition & 1 deletion pymupdf4llm/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

setuptools.setup(
name="pymupdf4llm",
version="0.0.22",
version="0.0.23",
author="Artifex",
author_email="support@artifex.com",
description="PyMuPDF Utilities for LLM/RAG",
Expand Down