Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
# Change Log

## Changes in version 0.0.21

### Fixes:

* [116](https://github.com/pymupdf/RAG/issues/116) - Handling Graphical Images & Superscripts

### Other Changes:


## Changes in version 0.0.20

### Fixes:
Expand Down
2 changes: 1 addition & 1 deletion pymupdf4llm/pymupdf4llm/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown

__version__ = "0.0.20"
__version__ = "0.0.21"
version = __version__
version_tuple = tuple(map(int, version.split(".")))

Expand Down
18 changes: 9 additions & 9 deletions pymupdf4llm/pymupdf4llm/helpers/multi_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@

import pymupdf

pymupdf.TOOLS.set_small_glyph_heights(True)
pymupdf.TOOLS.unset_quad_corrections(True)


def column_boxes(
Expand Down Expand Up @@ -237,7 +237,7 @@ def join_rects_phase2(bboxes):
if (
abs(r.x0 - r0.x0) <= 3
and abs(r.x1 - r0.x1) <= 3
and abs(r0.y1 - r.y0) <= 12
and abs(r0.y1 - r.y0) <= 10
):
r0 |= r
new_rects[-1] = r0
Expand Down Expand Up @@ -344,7 +344,7 @@ def join_rects_phase3(bboxes, path_rects, cache):
]

if textpage is None:
textpage = page.get_textpage(clip=clip, flags=pymupdf.TEXTFLAGS_TEXT)
textpage = page.get_textpage(clip=clip, flags=pymupdf.TEXT_ACCURATE_BBOXES)

bboxes = []

Expand Down Expand Up @@ -417,7 +417,6 @@ def join_rects_phase3(bboxes, path_rects, cache):
# immediately return if no text found
if bboxes == []:
return []

# --------------------------------------------------------------------
# Join bboxes to establish some column structure
# --------------------------------------------------------------------
Expand Down Expand Up @@ -467,7 +466,8 @@ def join_rects_phase3(bboxes, path_rects, cache):
return nblocks

# several phases of rectangle joining
nblocks = join_rects_phase1(nblocks)
# TODO: disabled for now as too aggressive:
# nblocks = join_rects_phase1(nblocks)
nblocks = join_rects_phase2(nblocks)
nblocks = join_rects_phase3(nblocks, path_rects, cache)

Expand All @@ -491,14 +491,14 @@ def join_rects_phase3(bboxes, path_rects, cache):
# check if footer margin is given
if len(sys.argv) > 2:
footer_margin = int(sys.argv[2])
else: # use default vaue
footer_margin = 50
else:
footer_margin = 0

# check if header margin is given
if len(sys.argv) > 3:
header_margin = int(sys.argv[3])
else: # use default vaue
header_margin = 50
else:
header_margin = 0

# open document
doc = pymupdf.open(filename)
Expand Down
103 changes: 57 additions & 46 deletions pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,8 @@ def get_header_id(self, span: dict, page=None) -> str:
markdown header prefix string of 0 to n concatenated '#' characters.
"""
fontsize = round(span["size"]) # compute fontsize
if fontsize <= self.body_limit:
return ""
hdr_id = self.header_id.get(fontsize, "")
return hdr_id

Expand Down Expand Up @@ -278,7 +280,7 @@ def to_markdown(
ignore_code: (bool) suppress code-like formatting (mono-space fonts)
extract_words: (bool) include "words"-like output in page chunks
show_progress: (bool) print progress as each page is processed.
glyph_fallback: (bool) replace the Invalid Unicode by glyph number.
use_glyphs: (bool) replace invalid Unicode characters by glyph numbers.

"""
if write_images is False and embed_images is False and force_text is False:
Expand Down Expand Up @@ -427,8 +429,8 @@ def write_text(
if clip is None:
clip = parms.clip
out_string = ""
# This is a list of tuples (linerect, spanlist)

# This is a list of tuples (linerect, spanlist)
nlines = get_raw_lines(parms.textpage, clip=clip, tolerance=3)
nlines = [
l for l in nlines if not intersects_rects(l[0], parms.tab_rects.values())
Expand All @@ -450,21 +452,18 @@ def write_text(
# Pick up tables ABOVE this text block
# ------------------------------------------------------------
if tables:
tab_candidates = sorted(
[
(i, tab_rect)
for i, tab_rect in parms.tab_rects.items()
if tab_rect.y1 <= lrect.y0
and i not in parms.deleted_tables
and (
0
or lrect.x0 <= tab_rect.x0 < lrect.x1
or lrect.x0 < tab_rect.x1 <= lrect.x1
or tab_rect.x0 <= lrect.x0 < lrect.x1 <= tab_rect.x1
)
],
key=lambda j: (j[1].y1, j[1].x0),
)
tab_candidates = [
(i, tab_rect)
for i, tab_rect in parms.tab_rects.items()
if tab_rect.y1 <= lrect.y0
and i not in parms.written_tables
and (
0
or lrect.x0 <= tab_rect.x0 < lrect.x1
or lrect.x0 < tab_rect.x1 <= lrect.x1
or tab_rect.x0 <= lrect.x0 < lrect.x1 <= tab_rect.x1
)
]
for i, _ in tab_candidates:
out_string += "\n" + parms.tabs[i].to_markdown(clean=False) + "\n"
if EXTRACT_WORDS:
Expand All @@ -481,14 +480,14 @@ def write_text(
key=lambda c: (c.y1, c.x0),
)
parms.line_rects.extend(cells)
parms.deleted_tables.append(i)
parms.written_tables.append(i)

# ------------------------------------------------------------
# Pick up images / graphics ABOVE this text block
# ------------------------------------------------------------
if images:
for i in range(len(parms.img_rects)):
if i in parms.deleted_images:
if i in parms.written_images:
continue
r = parms.img_rects[i]
if r.y1 <= lrect.y0 and (
Expand All @@ -502,7 +501,7 @@ def write_text(
out_string += GRAPHICS_TEXT % pathname

# recursive invocation
if force_text:
if force_text is True:
img_txt = write_text(
parms,
r,
Expand All @@ -513,7 +512,7 @@ def write_text(

if not is_white(img_txt):
out_string += img_txt
parms.deleted_images.append(i)
parms.written_images.append(i)

parms.line_rects.append(lrect)

Expand Down Expand Up @@ -668,7 +667,7 @@ def output_tables(parms, text_rect):
[j for j in parms.tab_rects.items() if j[1].y1 <= text_rect.y0],
key=lambda j: (j[1].y1, j[1].x0),
):
if i in parms.deleted_tables:
if i in parms.written_tables:
continue
this_md += parms.tabs[i].to_markdown(clean=False)
if EXTRACT_WORDS:
Expand All @@ -685,14 +684,11 @@ def output_tables(parms, text_rect):
key=lambda c: (c.y1, c.x0),
)
parms.line_rects.extend(cells)
del parms.tab_rects[i] # do not touch this table twice
parms.written_tables.append(i) # do not touch this table twice

else: # output all remaining tables
for i, trect in sorted(
parms.tab_rects.items(),
key=lambda j: (j[1].y1, j[1].x0),
):
if i in parms.deleted_tables:
for i, trect in parms.tab_rects.items():
if i in parms.written_tables:
continue
this_md += parms.tabs[i].to_markdown(clean=False)
if EXTRACT_WORDS:
Expand All @@ -709,10 +705,10 @@ def output_tables(parms, text_rect):
key=lambda c: (c.y1, c.x0),
)
parms.line_rects.extend(cells)
del parms.tab_rects[i] # do not touch this table twice
parms.written_tables.append(i) # do not touch this table twice
return this_md

def output_images(parms, text_rect):
def output_images(parms, text_rect, force_text):
"""Output images and graphics above text rectangle."""
if not parms.img_rects:
return ""
Expand All @@ -723,10 +719,10 @@ def output_images(parms, text_rect):
continue
if img_rect.x0 >= text_rect.x1 or img_rect.x1 <= text_rect.x0:
continue
if i in parms.deleted_images:
if i in parms.written_images:
continue
pathname = save_image(parms, img_rect, i)
parms.deleted_images.append(i) # do not touch this image twice
parms.written_images.append(i) # do not touch this image twice
if pathname:
this_md += GRAPHICS_TEXT % pathname
if force_text:
Expand All @@ -741,10 +737,10 @@ def output_images(parms, text_rect):
this_md += img_txt
else: # output all remaining images
for i, img_rect in enumerate(parms.img_rects):
if i in parms.deleted_images:
if i in parms.written_images:
continue
pathname = save_image(parms, img_rect, i)
parms.deleted_images.append(i) # do not touch this image twice
parms.written_images.append(i) # do not touch this image twice
if pathname:
this_md += GRAPHICS_TEXT % pathname
if force_text:
Expand Down Expand Up @@ -867,6 +863,9 @@ def get_page_output(
# extract external links on page
parms.links = [l for l in page.get_links() if l["kind"] == pymupdf.LINK_URI]

# extract annotation rectangles on page
parms.annot_rects = [a.rect for a in page.annots()]

# make a TextPage for all later extractions
parms.textpage = page.get_textpage(flags=textflags, clip=parms.clip)

Expand Down Expand Up @@ -904,10 +903,20 @@ def get_page_output(
parms.img_rects = [i["bbox"] for i in parms.images]

# Locate all tables on page
parms.written_tables = [] # stores already written tables
if table_strategy is None:
parms.tabs = []
else:
parms.tabs = page.find_tables(clip=parms.clip, strategy=table_strategy)
del_this = []
for i, t in enumerate(parms.tabs):
if t.row_count < 2 or t.col_count < 2:
# ignore tables with too few rows or columns
del_this.append(i)
for i in sorted(del_this, reverse=True):
del parms.tabs.tables[i]
parms.tabs.tables.sort(key=lambda t: (t.bbox[0], t.bbox[1]))

# Make a list of table boundary boxes.
# Must include the header bbox (which may exist outside tab.bbox)
tab_rects = {}
Expand All @@ -930,11 +939,13 @@ def get_page_output(
paths = [
p
for p in page.get_drawings()
if not intersects_rects(p["rect"], parms.tab_rects0)
and p["rect"] in parms.clip
and 3 < p["rect"].width < parms.clip.width
and 3 < p["rect"].height < parms.clip.height
and not (p["type"] == "f" and p["fill"] == parms.bg_color)
if p["rect"] in parms.clip
and p["rect"].width < parms.clip.width
and p["rect"].height < parms.clip.height
and (p["rect"].width > 3 or p["rect"].height > 3)
and not (p["fill"] == parms.bg_color and p["fill"] != None)
and not intersects_rects(p["rect"], parms.tab_rects0)
and not intersects_rects(p["rect"], parms.annot_rects)
]
else:
paths = []
Expand All @@ -948,19 +959,19 @@ def get_page_output(
vg_clusters0 = [] # worthwhile vector graphics go here

# walk through all vector graphics outside any table
for bbox in refine_boxes(page.cluster_drawings(drawings=paths)):
clusters = page.cluster_drawings(drawings=paths)
for bbox in clusters:
if is_significant(bbox, paths):
vg_clusters0.append(bbox)

# remove paths that are not in some relevant graphic
parms.actual_paths = [p for p in paths if is_in_rects(p["rect"], vg_clusters0)]

# also add image rectangles to the list
# also add image rectangles to the list and vice versa
vg_clusters0.extend(parms.img_rects)
parms.img_rects.extend(vg_clusters0)
parms.img_rects = sorted(set(parms.img_rects), key=lambda r: (r.y1, r.x0))
parms.deleted_images = []
parms.deleted_tables = []
parms.written_images = []
# these may no longer be pairwise disjoint:
# remove area overlaps by joining into larger rects
parms.vg_clusters0 = refine_boxes(vg_clusters0)
Expand Down Expand Up @@ -989,7 +1000,7 @@ def get_page_output(
for text_rect in text_rects:
# output tables above this rectangle
parms.md_string += output_tables(parms, text_rect)
parms.md_string += output_images(parms, text_rect)
parms.md_string += output_images(parms, text_rect, force_text)

# output text inside this rectangle
parms.md_string += write_text(
Expand All @@ -1004,7 +1015,7 @@ def get_page_output(

# write any remaining tables and images
parms.md_string += output_tables(parms, None)
parms.md_string += output_images(parms, None)
parms.md_string += output_images(parms, None, force_text)

parms.md_string += "\n-----\n\n"
while parms.md_string.startswith("\n"):
Expand Down Expand Up @@ -1153,7 +1164,7 @@ def extract_images_on_page_simple_drop(page, parms, image_size_limit):
import time

try:
filename = "slide12.pdf"
filename = "sample_document.pdf"
except IndexError:
print(f"Usage:\npython {os.path.basename(__file__)} input.pdf")
sys.exit()
Expand Down
2 changes: 1 addition & 1 deletion pymupdf4llm/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

setuptools.setup(
name="pymupdf4llm",
version="0.0.20",
version="0.0.21",
author="Artifex",
author_email="support@artifex.com",
description="PyMuPDF Utilities for LLM/RAG",
Expand Down