Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,18 @@
# Change Log

## Changes in version 0.0.27

### Fixes:

* [296](https://github.com/pymupdf/RAG/issues/296) - [Bug] A specific diagram recognized as significant ...
* [294](https://github.com/pymupdf/RAG/issues/294) - Unable to extract images from Page
* [272](https://github.com/pymupdf/RAG/issues/272) - Disappeared page breaks

### Other Changes:

* Added a new parameter to `to_markdown`: `page_separators=False`. If `True` and `page_chunks=False`, a line like `--- end of page=nnn ---` is appended to each page's markdown text. The page number is 0-based. Intended for debugging purposes.


## Changes in version 0.0.26

### Fixes:
Expand All @@ -14,7 +27,7 @@

* The class `TocHeaders` is now a top-level import and can be used directly.

* Method `to_markdown` has a new parameter `detect_bg_color=True` which guesses the page's background color. If detection is successful, vectors having this fill color are ignored (default). Setting this to `False` will "fill" vectors to always be considered in vector graphics detection.
* Method `to_markdown` has a new parameter `detect_bg_color=True` (default) which guesses the page's background color. If a background is detected, fill-only vectors having this color are ignored. Setting it to `False` means that "fill" vectors are always considered in vector graphics detection.

* Text written with a `Type 3` font will now always be considered. Previously, this text was always treated as invisible and was hence suppressed.

Expand Down
9 changes: 4 additions & 5 deletions pdf4llm/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@
"Programming Language :: Python :: 3",
"Topic :: Utilities",
]
requires = ["pymupdf4llm==0.0.26"]
requires = ["pymupdf4llm==0.0.27"]

setuptools.setup(
name="pdf4llm",
version="0.0.26",
version="0.0.27",
author="Artifex",
author_email="support@artifex.com",
description="PyMuPDF Utilities for LLM/RAG",
Expand All @@ -29,13 +29,12 @@
license="Dual Licensed - GNU AFFERO GPL 3.0 or Artifex Commercial License",
url="https://github.com/pymupdf/RAG",
classifiers=classifiers,
package_data={
"pdf4llm": ["LICENSE"],
},
package_data={},
project_urls={
"Documentation": "https://pymupdf.readthedocs.io/",
"Source": "https://github.com/pymupdf/RAG/tree/main/pdf4llm/pdf4llm",
"Tracker": "https://github.com/pymupdf/RAG/issues",
"Changelog": "https://github.com/pymupdf/RAG/blob/main/CHANGES.md",
"License": "https://github.com/pymupdf/RAG/blob/main/LICENSE",
},
)
66 changes: 35 additions & 31 deletions pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ def to_markdown(
filename=None,
force_text=True,
page_chunks=False,
page_separators=False,
margins=0,
dpi=150,
page_width=612,
Expand All @@ -341,6 +342,7 @@ def to_markdown(
image_format: (str) use this image format. Choose a supported one.
force_text: (bool) output text despite of image background.
page_chunks: (bool) whether to segment output by page.
page_separators: (bool) whether to include page separators in output.
margins: omit content overlapping margin areas.
dpi: (int) desired resolution for generated images.
page_width: (float) assumption if page layout is variable.
Expand Down Expand Up @@ -381,7 +383,7 @@ def to_markdown(
IGNORE_IMAGES = ignore_images
IGNORE_GRAPHICS = ignore_graphics
DETECT_BG_COLOR = detect_bg_color
if doc.is_form_pdf or doc.has_annots():
if doc.is_form_pdf or (doc.is_pdf and doc.has_annots()):
doc.bake()

# for reflowable documents allow making 1 page for the whole document
Expand Down Expand Up @@ -560,6 +562,7 @@ def write_text(
)
parms.line_rects.extend(cells)
parms.written_tables.append(i)
prev_hdr_string = None

# ------------------------------------------------------------
# Pick up images / graphics ABOVE this text block
Expand Down Expand Up @@ -592,6 +595,7 @@ def write_text(
if not is_white(img_txt):
out_string += img_txt
parms.written_images.append(i)
prev_hdr_string = None

parms.line_rects.append(lrect)
# if line rect is far away from the previous one, add a line break
Expand Down Expand Up @@ -751,7 +755,7 @@ def output_tables(parms, text_rect):
):
if i in parms.written_tables:
continue
this_md += parms.tabs[i].to_markdown(clean=False)
this_md += parms.tabs[i].to_markdown(clean=False) + "\n"
if EXTRACT_WORDS:
# for "words" extraction, add table cells as line rects
cells = sorted(
Expand All @@ -772,7 +776,7 @@ def output_tables(parms, text_rect):
for i, trect in parms.tab_rects.items():
if i in parms.written_tables:
continue
this_md += parms.tabs[i].to_markdown(clean=False)
this_md += parms.tabs[i].to_markdown(clean=False) + "\n"
if EXTRACT_WORDS:
# for "words" extraction, add table cells as line rects
cells = sorted(
Expand Down Expand Up @@ -954,7 +958,7 @@ def get_page_output(
) # accept invisible text

# determine background color
parms.bg_color = get_bg_color(page) if DETECT_BG_COLOR else None
parms.bg_color = None if not DETECT_BG_COLOR else get_bg_color(page)

left, top, right, bottom = margins
parms.clip = page.rect + (left, top, -right, -bottom)
Expand Down Expand Up @@ -994,12 +998,12 @@ def get_page_output(
if img_info:
img_max_size = abs(parms.clip) * 0.9
sane = [i for i in img_info if abs(i["bbox"] & parms.clip) < img_max_size]
if len(sane) < len(img_info): # found some
img_info = sane # use those images instead
# output full page image
name = save_image(parms, parms.clip, "full")
if name:
parms.md_string += GRAPHICS_TEXT % name
if len(sane) < len(img_info): # found some
img_info = sane # use those images instead
# output full page image
name = save_image(parms, parms.clip, "full")
if name:
parms.md_string += GRAPHICS_TEXT % name

img_info = img_info[:30] # only accept the largest up to 30 images
# run from back to front (= small to large)
Expand All @@ -1024,31 +1028,31 @@ def get_page_output(
# Locate all tables on page
parms.written_tables = [] # stores already written tables
omitted_table_rects = []
parms.tabs = []
if IGNORE_GRAPHICS or not table_strategy:
# do not try to extract tables
parms.tabs = None
pass
else:
parms.tabs = page.find_tables(clip=parms.clip, strategy=table_strategy)
# remove tables with too few rows or columns
for i in range(len(parms.tabs.tables) - 1, -1, -1):
t = parms.tabs.tables[i]
tabs = page.find_tables(clip=parms.clip, strategy=table_strategy)
for t in tabs.tables:
# remove tables with too few rows or columns
if t.row_count < 2 or t.col_count < 2:
omitted_table_rects.append(pymupdf.Rect(t.bbox))
del parms.tabs.tables[i]
parms.tabs.tables.sort(key=lambda t: (t.bbox[0], t.bbox[1]))
continue
parms.tabs.append(t)
parms.tabs.sort(key=lambda t: (t.bbox[0], t.bbox[1]))

# Make a list of table boundary boxes.
# Must include the header bbox (which may exist outside tab.bbox)
tab_rects = {}
if parms.tabs is not None:
for i, t in enumerate(parms.tabs.tables):
tab_rects[i] = pymupdf.Rect(t.bbox) | pymupdf.Rect(t.header.bbox)
tab_dict = {
"bbox": tuple(tab_rects[i]),
"rows": t.row_count,
"columns": t.col_count,
}
parms.tables.append(tab_dict)
for i, t in enumerate(parms.tabs):
tab_rects[i] = pymupdf.Rect(t.bbox) | pymupdf.Rect(t.header.bbox)
tab_dict = {
"bbox": tuple(tab_rects[i]),
"rows": t.row_count,
"columns": t.col_count,
}
parms.tables.append(tab_dict)
parms.tab_rects = tab_rects
# list of table rectangles
parms.tab_rects0 = list(tab_rects.values())
Expand All @@ -1064,15 +1068,12 @@ def get_page_output(
and p["rect"].width < parms.clip.width
and p["rect"].height < parms.clip.height
and (p["rect"].width > 3 or p["rect"].height > 3)
and not (p["fill"] == parms.bg_color and p["fill"] != None)
and not intersects_rects(
p["rect"], parms.tab_rects0 + omitted_table_rects
)
and not (p["type"] == "f" and p["fill"] == parms.bg_color)
and not intersects_rects(p["rect"], parms.tab_rects0)
and not intersects_rects(p["rect"], parms.annot_rects)
]
else:
paths = []

# catch too-many-graphics situation
if GRAPHICS_LIMIT and len(paths) > GRAPHICS_LIMIT:
paths = []
Expand Down Expand Up @@ -1168,6 +1169,9 @@ def get_page_output(
else:
words = []
parms.words = words
if page_separators:
# add page separators to output
parms.md_string += f"\n\n--- end of page={parms.page.number} ---\n\n"
return parms

if page_chunks is False:
Expand Down
3 changes: 2 additions & 1 deletion pymupdf4llm/pymupdf4llm/versions_file.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
# Generated file - do not edit.
MINIMUM_PYMUPDF_VERSION = (1, 26, 3)
VERSION = '0.0.26'
VERSION = '0.0.27'
7 changes: 4 additions & 3 deletions pymupdf4llm/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@
"Topic :: Utilities",
]

version = "0.0.26"
version = "0.0.27"
requires = ["pymupdf>=1.26.3"]

text = requires[0].split("=")[1]
text = tuple(map(int, text.split(".")))
text = f"MINIMUM_PYMUPDF_VERSION = {text}\nVERSION = '{version}'\n"
text = f"# Generated file - do not edit.\nMINIMUM_PYMUPDF_VERSION = {text}\nVERSION = '{version}'\n"
Path("pymupdf4llm/versions_file.py").write_text(text)

setuptools.setup(
Expand All @@ -37,12 +37,13 @@
url="https://github.com/pymupdf/RAG",
classifiers=classifiers,
package_data={
"pymupdf4llm": ["LICENSE", "helpers/*.py", "llama/*.py"],
"pymupdf4llm": ["helpers/*.py", "llama/*.py"],
},
project_urls={
"Documentation": "https://pymupdf.readthedocs.io/",
"Source": "https://github.com/pymupdf/RAG/tree/main/pymupdf4llm/pymupdf4llm",
"Tracker": "https://github.com/pymupdf/RAG/issues",
"Changelog": "https://github.com/pymupdf/RAG/blob/main/CHANGES.md",
"License": "https://github.com/pymupdf/RAG/blob/main/LICENSE",
},
)