Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions docs/src/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,22 @@
Change Log
===========================================================================

Changes in version 0.0.13
--------------------------

Fixes:
~~~~~~~

* `112 <https://github.com/pymupdf/RAG/issues/112>`_ "Invalid bandwriter header dimensions/setup"


Improvements:
~~~~~~~~~~~~~~
* New parameter ``ignore_code`` suppresses special formatting of text in mono-spaced fonts.
* New parameter ``extract_words`` enforces ``page_chunks=True`` and adds a "words" list to each page dictionary.



Changes in version 0.0.11
--------------------------

Expand Down
2 changes: 1 addition & 1 deletion pymupdf4llm/pymupdf4llm/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown

__version__ = "0.0.12"
__version__ = "0.0.13"
version = __version__
version_tuple = tuple(map(int, version.split(".")))

Expand Down
17 changes: 7 additions & 10 deletions pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,7 @@
import string
import sys

try:
import pymupdf as fitz # available with v1.24.3
except ImportError:
import fitz
import pymupdf

WHITE = set(string.whitespace)

Expand Down Expand Up @@ -96,13 +93,13 @@ def sanitize_spans(line):
blocks = [
b
for b in textpage.extractDICT()["blocks"]
if b["type"] == 0 and not fitz.Rect(b["bbox"]).is_empty
if b["type"] == 0 and not pymupdf.Rect(b["bbox"]).is_empty
]
spans = [] # all spans in TextPage here
for bno, b in enumerate(blocks): # the numbered blocks
for lno, line in enumerate(b["lines"]): # the numbered lines
for sno, s in enumerate(line["spans"]): # the numbered spans
sbbox = fitz.Rect(s["bbox"]) # span bbox as a Rect
sbbox = pymupdf.Rect(s["bbox"]) # span bbox as a Rect
mpoint = (sbbox.tl + sbbox.br) / 2 # middle point
if mpoint not in clip:
continue
Expand Down Expand Up @@ -165,16 +162,16 @@ def get_text_lines(page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr
cases of text replaced by way of redaction annotations.

Args:
page: (fitz.Page)
page: (pymupdf.Page)
textpage: (TextPage) if None a temporary one is created.
clip: (rect-like) only consider spans inside this area
sep: (str) use this string when joining multiple MuPDF lines.
Returns:
String of plain text in reading sequence.
"""
textflags = fitz.TEXT_MEDIABOX_CLIP
textflags = pymupdf.TEXT_MEDIABOX_CLIP
page.remove_rotation()
prect = page.rect if not clip else fitz.Rect(clip) # area to consider
prect = page.rect if not clip else pymupdf.Rect(clip) # area to consider

xsep = sep if sep == "|" else ""

Expand Down Expand Up @@ -255,7 +252,7 @@ def get_text_lines(page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr
import pathlib

filename = sys.argv[1]
doc = fitz.open(filename)
doc = pymupdf.open(filename)
text = ""
for page in doc:
text += get_text_lines(page, sep=" ") + "\n" + chr(12) + "\n"
Expand Down
57 changes: 8 additions & 49 deletions pymupdf4llm/pymupdf4llm/helpers/multi_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
# for each page execute
bboxes = column_boxes(page, footer_margin=50, no_image_text=True)

bboxes is a list of fitz.IRect objects, that are sorted ascending by their
bboxes is a list of pymupdf.IRect objects that are sorted ascending by their
y0, then x0 coordinates. Their text content can be extracted by all PyMuPDF
get_text() variants, like for instance the following:
for rect in bboxes:
Expand All @@ -62,10 +62,7 @@

import string

try:
import pymupdf as fitz
except ImportError:
import fitz
import pymupdf


def column_boxes(
Expand Down Expand Up @@ -103,7 +100,7 @@ def is_white(text):
paths = page.get_drawings()

if textpage is None:
textpage = page.get_textpage(clip=clip, flags=fitz.TEXTFLAGS_TEXT)
textpage = page.get_textpage(clip=clip, flags=pymupdf.TEXTFLAGS_TEXT)

bboxes = []

Expand Down Expand Up @@ -151,44 +148,6 @@ def can_extend(temp, bb, bboxlist, vert_bboxes):

return True

# def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
# """Extend a bbox to the right page border.

# Whenever there is no text to the right of a bbox, enlarge it up
# to the right page border.

# Args:
# bboxes: (list[IRect]) bboxes to check
# width: (int) page width
# path_bboxes: (list[IRect]) bboxes with a background color
# vert_bboxes: (list[IRect]) bboxes with vertical text
# img_bboxes: (list[IRect]) bboxes of images
# Returns:
# Potentially modified bboxes.
# """
# for i, bb in enumerate(bboxes):
# # do not extend text with background color
# if in_bbox(bb, path_bboxes):
# continue

# # do not extend text in images
# if in_bbox(bb, img_bboxes):
# continue

# # temp extends bb to the right page border
# temp = +bb
# temp.x1 = width

# # do not cut through colored background or images
# if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes):
# continue

# # also, do not intersect other text bboxes
# check = can_extend(temp, bb, bboxes, vert_bboxes)
# if check:
# bboxes[i] = temp # replace with enlarged bbox

# return [b for b in bboxes if b != None]

def join_rects_phase1(bboxes):
"""Postprocess identified text blocks, phase 1.
Expand Down Expand Up @@ -336,7 +295,7 @@ def clean_nblocks(nblocks):

# Make block rectangles, ignoring non-horizontal text
for b in blocks:
bbox = fitz.IRect(b["bbox"]) # bbox of the block
bbox = pymupdf.IRect(b["bbox"]) # bbox of the block

# ignore text written upon images
if no_image_text and in_bbox(bbox, img_bboxes):
Expand All @@ -352,9 +311,9 @@ def clean_nblocks(nblocks):
vert_bboxes.append(bbox)
continue

srect = fitz.EMPTY_IRECT()
srect = pymupdf.EMPTY_IRECT()
for line in b["lines"]:
lbbox = fitz.IRect(line["bbox"])
lbbox = pymupdf.IRect(line["bbox"])
text = "".join([s["text"].strip() for s in line["spans"]])
if len(text) > 1:
srect |= lbbox
Expand Down Expand Up @@ -435,7 +394,7 @@ def clean_nblocks(nblocks):
"""
import sys

RED = fitz.pdfcolor["red"]
RED = pymupdf.pdfcolor["red"]
# get the file name
filename = sys.argv[1]

Expand All @@ -452,7 +411,7 @@ def clean_nblocks(nblocks):
header_margin = 50

# open document
doc = fitz.open(filename)
doc = pymupdf.open(filename)

# iterate over the pages
for page in doc:
Expand Down
Loading