diff --git a/changes.txt b/changes.txt index f94e01d73..6122dd095 100644 --- a/changes.txt +++ b/changes.txt @@ -5,6 +5,7 @@ Change Log **Changes in version 1.24.4 (2024-05-16)** * **Fixed** `3418 `_: Re-introduced bug, text align add_redact_annot + * **Fixed** `3472 `_: insert_pdf gives SystemError * Other: diff --git a/scripts/test.py b/scripts/test.py index ed74c676c..c0d499e20 100755 --- a/scripts/test.py +++ b/scripts/test.py @@ -130,7 +130,7 @@ def main(argv): build_type = None build_mupdf = True gdb = False - test_pymupdf = True + test_fitz = True implementations = None test_names = list() venv = 2 @@ -157,7 +157,7 @@ def main(argv): elif arg == '-d': build_type = 'debug' elif arg == '-f': - test_pymupdf = int(next(args)) + test_fitz = int(next(args)) elif arg in ('-h', '--help'): show_help() return @@ -226,7 +226,7 @@ def do_test(): pytest_options=pytest_options, timeout=timeout, gdb=gdb, - test_pymupdf=test_pymupdf, + test_fitz=test_fitz, ) for command in commands: @@ -359,7 +359,7 @@ def test( pytest_options=None, timeout=None, gdb=False, - test_pymupdf=True, + test_fitz=True, ): ''' Args: @@ -375,7 +375,7 @@ def test( See top-level option `-p`. gdb: See top-level option `--gdb`. - test_pymupdf: + test_fitz: See top-level option `-f`. ''' pymupdf_dir_rel = gh_release.relpath(pymupdf_dir) @@ -433,7 +433,7 @@ def test( for p in glob.glob(f'{pymupdf_dir_rel}/tests/test_*_fitz.py'): print(f'Removing {p=}') os.remove(p) - if test_pymupdf: + if test_fitz: # Create copies of each test file, modified to use `pymupdf` # instead of `fitz`. for p in glob.glob(f'{pymupdf_dir_rel}/tests/test_*.py'): diff --git a/setup.py b/setup.py index c9e39e9aa..0449030ba 100755 --- a/setup.py +++ b/setup.py @@ -606,21 +606,26 @@ def add( ret, from_, to_): if path_so_leaf_b: # Add rebased implementation files. - add( ret_p, f'{g_root}/src/fitz___init__.py', 'fitz/__init__.py') # For `fitz` module alias. - add( ret_p, f'{g_root}/src/fitz_table.py', 'fitz/table.py') # For `fitz` module alias. - add( ret_p, f'{g_root}/src/fitz_utils.py', 'fitz/utils.py') # For `fitz` module alias. to_dir = 'pymupdf/' add( ret_p, f'{g_root}/src/__init__.py', to_dir) add( ret_p, f'{g_root}/src/__main__.py', to_dir) add( ret_p, f'{g_root}/src/pymupdf.py', to_dir) add( ret_p, f'{g_root}/src/table.py', to_dir) add( ret_p, f'{g_root}/src/utils.py', to_dir) + add( ret_p, f'{g_root}/src/_apply_pages.py', to_dir) add( ret_p, f'{g_root}/src/build/extra.py', to_dir) add( ret_p, f'{g_root}/src/build/{path_so_leaf_b}', to_dir) + # Add support for `fitz` backwards compatibility. + add( ret_p, f'{g_root}/src/fitz___init__.py', 'fitz/__init__.py') + add( ret_p, f'{g_root}/src/fitz_table.py', 'fitz/table.py') + add( ret_p, f'{g_root}/src/fitz_utils.py', 'fitz/utils.py') + if mupdf_local: + # Add MuPDF Python API. add( ret_p, f'{mupdf_build_dir}/mupdf.py', to_dir) + # Add MuPDF shared libraries. if windows: wp = pipcl.wdev.WindowsPython() add( ret_p, f'{mupdf_build_dir}/_mupdf.pyd', to_dir) diff --git a/src/__init__.py b/src/__init__.py index 78d38dcd7..d772644c8 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -20,6 +20,7 @@ import string import sys import tarfile +import time import typing import warnings import weakref @@ -61,8 +62,19 @@ def _set_stream(name, default): _g_out_log = _set_stream('PYMUPDF_LOG', sys.stdout) _g_out_message = _set_stream('PYMUPDF_MESSAGE', sys.stdout) -# Set to list() if we are in test suite. -_g_log_items = None +_g_log_items = list() +_g_log_items_active = False + +def _log_items(): + return _g_log_items + +def _log_items_active(active): + global _g_log_items_active + _g_log_items_active = active + +def _log_items_clear(): + del _g_log_items[:] + def log( text='', caller=1): ''' @@ -73,7 +85,7 @@ def log( text='', caller=1): line = frame_record.lineno function = frame_record.function text = f'{filename}:{line}:{function}: {text}' - if _g_log_items is not None: + if _g_log_items_active: _g_log_items.append(text) print(text, file=_g_out_log) _g_out_log.flush() @@ -20942,6 +20954,197 @@ def vdist(dir, a, b): return mupdf.fz_abs(dx * dir.y + dy * dir.x) +def apply_pages( + path, + pagefn, + *, + pagefn_args=(), + pagefn_kwargs=dict(), + initfn=None, + initfn_args=(), + initfn_kwargs=dict(), + pages=None, + method='single', + concurrency=None, + _stats=False, + ): + ''' + Returns list of results from `pagefn()`, optionally using concurrency for + speed. + + Args: + path: + Path of document. + pagefn: + Function to call for each page; is passed (page, *pagefn_args, + **pagefn_kwargs). Return value is added to list that we return. If + `method` is not 'single', must be a top-level function - nested + functions don't work with concurrency. + pagefn_args + pagefn_kwargs: + Additional args to pass to `pagefn`. Must be picklable. + initfn: + If true, called once in each worker process; is passed + (*initfn_args, **initfn_kwargs). + initfn_args + initfn_kwargs: + Args to pass to initfn. Must be picklable. + pages: + List of page numbers to process, or None to include all pages. + method: + 'single' + Do not use concurrency. + 'mp' + Operate concurrently using Python's `multiprocessing` module. + 'fork' + Operate concurrently using custom implementation with + `os.fork()`. Does not work on Windows. + concurrency: + Number of worker processes to use when operating concurrently. If + None, we use the number of available CPUs. + _stats: + Internal, may change or be removed. If true, we output simple + timing diagnostics. + + Note: We require a file path rather than a Document, because Document + instances do not work properly after a fork - internal file descriptor + offsets are shared between the parent and child processes. + ''' + if _stats: + t0 = time.time() + + if method == 'single': + if initfn: + initfn(*initfn_args, **initfn_kwargs) + ret = list() + document = Document(path) + for page in document: + r = pagefn(page, *pagefn_args, **initfn_kwargs) + ret.append(r) + + else: + # Use concurrency. + # + from . import _apply_pages + + if pages is None: + if _stats: + t = time.time() + with Document(path) as document: + num_pages = len(document) + pages = list(range(num_pages)) + if _stats: + t = time.time() - t + log(f'{t:.2f}s: count pages.') + + if _stats: + t = time.time() + + if method == 'mp': + ret = _apply_pages._multiprocessing( + path, + pages, + pagefn, + pagefn_args, + pagefn_kwargs, + initfn, + initfn_args, + initfn_kwargs, + concurrency, + _stats, + ) + + elif method == 'fork': + ret = _apply_pages._fork( + path, + pages, + pagefn, + pagefn_args, + pagefn_kwargs, + initfn, + initfn_args, + initfn_kwargs, + concurrency, + _stats, + ) + + else: + assert 0, f'Unrecognised {method=}.' + + if _stats: + t = time.time() - t + log(f'{t:.2f}s: work.') + + if _stats: + t = time.time() - t0 + log(f'{t:.2f}s: total.') + return ret + + +def get_text( + path, + *, + pages=None, + method='single', + concurrency=None, + + option='text', + clip=None, + flags=None, + textpage=None, + sort=False, + delimiters=None, + + _stats=False, + ): + ''' + Returns list of results from `Page.get_text()`, optionally using + concurrency for speed. + + Args: + path: + Path of document. + pages: + List of page numbers to process, or None to include all pages. + method: + 'single' + Do not use concurrency. + 'mp' + Operate concurrently using Python's `multiprocessing` module. + 'fork' + Operate concurrently using custom implementation with + `os.fork`. Does not work on Windows. + concurrency: + Number of worker processes to use when operating concurrently. If + None, we use the number of available CPUs. + option + clip + flags + textpage + sort + delimiters: + Passed to internal calls to `Page.get_text()`. + ''' + args_dict = dict( + option=option, + clip=clip, + flags=flags, + textpage=textpage, + sort=sort, + delimiters=delimiters, + ) + + return apply_pages( + path, + Page.get_text, + pagefn_kwargs=args_dict, + pages=pages, + method=method, + concurrency=concurrency, + _stats=_stats, + ) + + class TOOLS: ''' We use @staticmethod to avoid the need to create an instance of this class. diff --git a/src/_apply_pages.py b/src/_apply_pages.py new file mode 100644 index 000000000..73a702382 --- /dev/null +++ b/src/_apply_pages.py @@ -0,0 +1,253 @@ +import multiprocessing +import os +import time + +import pymupdf + + +# Support for concurrent processing of document pages. +# + +class _worker_State: + pass +_worker_state = _worker_State() + + +def _worker_init( + path, + initfn, + initfn_args, + initfn_kwargs, + pagefn, + pagefn_args, + pagefn_kwargs, + stats, + ): + # pylint: disable=attribute-defined-outside-init + _worker_state.path = path + _worker_state.pagefn = pagefn + _worker_state.pagefn_args = pagefn_args + _worker_state.pagefn_kwargs = pagefn_kwargs + _worker_state.stats = stats + _worker_state.document = None + if initfn: + initfn(*initfn_args, **initfn_kwargs) + + +def _stats_write(t, label): + t = time.time() - t + if t >= 0.1: + pymupdf.log(f'{os.getpid()=}: {t:2f}s: {label}.') + + +def _worker_fn(page_number): + # Create Document from filename if we haven't already done so. + if not _worker_state.document: + if _worker_state.stats: + t = time.time() + _worker_state.document = pymupdf.Document(_worker_state.path) # pylint: disable=attribute-defined-outside-init + if _worker_state.stats: + _stats_write(t, 'pymupdf.Document()') + + if _worker_state.stats: + t = time.time() + page = _worker_state.document[page_number] + if _worker_state.stats: + _stats_write(t, '_worker_state.document[page_number]') + + if _worker_state.stats: + t = time.time() + ret = _worker_state.pagefn( + page, + *_worker_state.pagefn_args, + **_worker_state.pagefn_kwargs, + ) + if _worker_state.stats: + _stats_write(t, '_worker_state.pagefn()') + + return ret + + +def _multiprocessing( + path, + pages, + pagefn, + pagefn_args, + pagefn_kwargs, + initfn, + initfn_args, + initfn_kwargs, + concurrency, + stats, + ): + #print(f'_worker_mp(): {concurrency=}', flush=1) + with multiprocessing.Pool( + concurrency, + _worker_init, + ( + path, + initfn, initfn_args, initfn_kwargs, + pagefn, pagefn_args, pagefn_kwargs, + stats, + ), + ) as pool: + result = pool.map_async(_worker_fn, pages) + return result.get() + + +def _fork( + path, + pages, + pagefn, + pagefn_args, + pagefn_kwargs, + initfn, + initfn_args, + initfn_kwargs, + concurrency, + stats, + ): + verbose = 0 + if concurrency is None: + concurrency = multiprocessing.cpu_count() + # We write page numbers to `queue_down` and read `(page_num, text)` from + # `queue_up`. Workers each repeatedly read the next available page number + # from `queue_down`, extract the text and write it onto `queue_up`. + # + # This is better than pre-allocating a subset of pages to each worker + # because it ensures there will never be idle workers until we are near the + # end with fewer pages left than workers. + # + queue_down = multiprocessing.Queue() + queue_up = multiprocessing.Queue() + def childfn(): + document = None + if verbose: + pymupdf.log(f'{os.getpid()=}: {initfn=} {initfn_args=}') + _worker_init( + path, + initfn, + initfn_args, + initfn_kwargs, + pagefn, + pagefn_args, + pagefn_kwargs, + stats, + ) + while 1: + if verbose: + pymupdf.log(f'{os.getpid()=}: calling get().') + page_num = queue_down.get() + if verbose: + pymupdf.log(f'{os.getpid()=}: {page_num=}.') + if page_num is None: + break + try: + if not document: + if stats: + t = time.time() + document = pymupdf.Document(path) + if stats: + _stats_write(t, 'pymupdf.Document(path)') + + if stats: + t = time.time() + page = document[page_num] + if stats: + _stats_write(t, 'document[page_num]') + + if verbose: + pymupdf.log(f'{os.getpid()=}: {_worker_state=}') + + if stats: + t = time.time() + ret = pagefn( + page, + *_worker_state.pagefn_args, + **_worker_state.pagefn_kwargs, + ) + if stats: + _stats_write(t, f'{page_num=} pagefn()') + except Exception as e: + if verbose: pymupdf.log(f'{os.getpid()=}: exception {e=}') + ret = e + if verbose: + pymupdf.log(f'{os.getpid()=}: sending {page_num=} {ret=}') + + queue_up.put( (page_num, ret) ) + + error = None + + pids = list() + try: + # Start child processes. + if stats: + t = time.time() + for i in range(concurrency): + p = os.fork() # pylint: disable=no-member + if p == 0: + # Child process. + try: + try: + childfn() + except Exception as e: + pymupdf.log(f'{os.getpid()=}: childfn() => {e=}') + raise + finally: + if verbose: + pymupdf.log(f'{os.getpid()=}: calling os._exit(0)') + os._exit(0) + pids.append(p) + if stats: + _stats_write(t, 'create child processes') + + # Send page numbers. + if stats: + t = time.time() + if verbose: + pymupdf.log(f'Sending page numbers.') + for page_num in range(len(pages)): + queue_down.put(page_num) + if stats: + _stats_write(t, 'Send page numbers') + + # Collect results. We give up if any worker sends an exception instead + # of text, but this hasn't been tested. + ret = [None] * len(pages) + for i in range(len(pages)): + page_num, text = queue_up.get() + if verbose: + pymupdf.log(f'{page_num=} {len(text)=}') + assert ret[page_num] is None + if isinstance(text, Exception): + if not error: + error = text + break + ret[page_num] = text + + # Close queue. This should cause exception in workers and terminate + # them, but on macos-arm64 this does not seem to happen, so we also + # send None, which makes workers terminate. + for i in range(concurrency): + queue_down.put(None) + if verbose: pymupdf.log(f'Closing queues.') + queue_down.close() + + if error: + raise error + if verbose: + pymupdf.log(f'After concurrent, returning {len(ret)=}') + return ret + + finally: + # Join all child proceses. + if stats: + t = time.time() + for pid in pids: + if verbose: + pymupdf.log(f'waiting for {pid=}.') + e = os.waitpid(pid, 0) + if verbose: + pymupdf.log(f'{pid=} => {e=}') + if stats: + _stats_write(t, 'Join all child proceses') diff --git a/src/extra.i b/src/extra.i index 0b8c0ea53..79efde30b 100644 --- a/src/extra.i +++ b/src/extra.i @@ -2216,41 +2216,7 @@ static int JM_rects_overlap(const fz_rect a, const fz_rect b) // void JM_append_rune(fz_buffer *buff, int ch); -void ll_JM_print_stext_page_as_text(fz_buffer *res, fz_stext_page *page) -{ - fz_stext_block *block; - fz_stext_line *line; - fz_stext_char *ch; - fz_rect rect = page->mediabox; - fz_rect chbbox; - int last_char = 0; - - for (block = page->first_block; block; block = block->next) - { - if (block->type == FZ_STEXT_BLOCK_TEXT) - { - for (line = block->u.t.first_line; line; line = line->next) - { - last_char = 0; - for (ch = line->first_char; ch; ch = ch->next) - { - chbbox = JM_char_bbox(line, ch); - if (mupdf::ll_fz_is_infinite_rect(rect) || - JM_rects_overlap(rect, chbbox)) - { - last_char = ch->c; - JM_append_rune(res, last_char); - } - } - if (last_char != 10 && last_char > 0) - { - mupdf::ll_fz_append_string(res, "\n"); - } - } - } - } -} //----------------------------------------------------------------------------- // Plain text output. An identical copy of fz_print_stext_page_as_text, // but lines within a block are concatenated by space instead a new-line @@ -2258,11 +2224,6 @@ void ll_JM_print_stext_page_as_text(fz_buffer *res, fz_stext_page *page) //----------------------------------------------------------------------------- void JM_print_stext_page_as_text(mupdf::FzBuffer& res, mupdf::FzStextPage& page) { - if (0) - { - return ll_JM_print_stext_page_as_text(res.m_internal, page.m_internal); - } - fz_rect rect = page.m_internal->mediabox; for (auto block: page) diff --git a/src/fitz___init__.py b/src/fitz___init__.py index e2f1b6980..36b562922 100644 --- a/src/fitz___init__.py +++ b/src/fitz___init__.py @@ -4,3 +4,6 @@ from pymupdf import _as_fz_page from pymupdf import _as_pdf_document from pymupdf import _as_pdf_page +from pymupdf import _log_items +from pymupdf import _log_items_active +from pymupdf import _log_items_clear diff --git a/tests/conftest.py b/tests/conftest.py index 76ef90ef7..34b086253 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,7 +12,8 @@ def wrap(*args, **kwargs): assert not wt, f'{wt=}' assert not pymupdf.TOOLS.set_small_glyph_heights() - pymupdf._g_log_items = list() + pymupdf._log_items_clear() + pymupdf._log_items_active(True) # Run the test. rep = yield @@ -26,4 +27,5 @@ def wrap(*args, **kwargs): assert not pymupdf.TOOLS.set_small_glyph_heights() - assert not pymupdf._g_log_items, f'log() was called; {len(pymupdf._g_log_items)=}.' + log_items = pymupdf._log_items() + assert not log_items, f'log() was called; {len(log_items)=}.' diff --git a/tests/test_pylint.py b/tests/test_pylint.py index 82e4305c2..ea099dbbb 100644 --- a/tests/test_pylint.py +++ b/tests/test_pylint.py @@ -35,6 +35,7 @@ def test_pylint(): W0622: Redefining built-in 'FileNotFoundError' (redefined-builtin) W0622: Redefining built-in 'open' (redefined-builtin) W1309: Using an f-string that does not have any interpolated variables (f-string-without-interpolation) + R1734: Consider using [] instead of list() (use-list-literal) ''' ) @@ -80,6 +81,7 @@ def test_pylint(): W0718: Catching too general exception Exception (broad-exception-caught) W0719: Raising too general exception: Exception (broad-exception-raised) C3001: Lambda expression assigned to a variable. Define a function using the "def" keyword instead. (unnecessary-lambda-assignment) + R0801: Similar lines in 2 files ''' ) ignores_list = list() @@ -110,6 +112,7 @@ def test_pylint(): leafs = [ '__init__.py', '__main__.py', + '_apply_pages.py', 'fitz___init__.py', 'fitz_table.py', 'fitz_utils.py', @@ -117,7 +120,7 @@ def test_pylint(): 'table.py', 'utils.py', ] - + leafs.sort() try: leafs_git = pipcl.git_items(directory) except Exception as e: diff --git a/tests/test_textextract.py b/tests/test_textextract.py index 22fc82917..5374e153f 100644 --- a/tests/test_textextract.py +++ b/tests/test_textextract.py @@ -267,3 +267,50 @@ def test_3197(): assert text_utf8 == text_utf8_expected[i] else: assert text_utf8 != text_utf8_expected[i] + + +def test_document_text(): + import platform + import time + + path = os.path.abspath(f'{__file__}/../../tests/resources/mupdf_explored.pdf') + concurrency = None + + def llen(texts): + l = 0 + for text in texts: + l += len(text) if isinstance(text, str) else text + return l + + results = dict() + _stats = 1 + + print('') + method = 'single' + t = time.time() + document = pymupdf.Document(path) + texts0 = pymupdf.get_text(path, _stats=_stats) + t0 = time.time() - t + print(f'{method}: {t0=} {llen(texts0)=}', flush=1) + + # Dummy run seems to avoid misleading stats with slow first run. + method = 'mp' + texts = pymupdf.get_text(path, concurrency=concurrency, method=method, _stats=_stats) + + method = 'mp' + t = time.time() + texts = pymupdf.get_text(path, concurrency=concurrency, method=method, _stats=_stats) + t = time.time() - t + print(f'{method}: {concurrency=} {t=} ({t0/t:.2f}x) {llen(texts)=}', flush=1) + assert texts == texts0 + + if platform.system() != 'Windows': + method = 'fork' + t = time.time() + texts = pymupdf.get_text(path, concurrency=concurrency, method='fork', _stats=_stats) + t = time.time() - t + print(f'{method}: {concurrency=} {t=} ({t0/t:.2f}x) {llen(texts)=}', flush=1) + assert texts == texts0 + + if _stats: + pymupdf._log_items_clear()