From a3210b3a6893727fe575e656752787933d1a2462 Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Thu, 12 Dec 2024 12:26:37 +0000 Subject: [PATCH 1/3] src/__init__.py: simplified some bool tests. --- src/__init__.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/__init__.py b/src/__init__.py index 7ed758abd..96f4c811e 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -4513,7 +4513,7 @@ def get_page_fonts(self, pno: int, full: bool =False) -> list: exception_info() raise ValueError("need a Page or page number") val = self._getPageInfo(pno, 1) - if full is False: + if not full: return [v[:-1] for v in val] return val @@ -4525,7 +4525,7 @@ def get_page_images(self, pno: int, full: bool =False) -> list: if not self.is_pdf: return () val = self._getPageInfo(pno, 2) - if full is False: + if not full: return [v[:-1] for v in val] return val @@ -6720,7 +6720,7 @@ def __bool__(self): def __eq__(self, mat): if not hasattr(mat, "__len__"): return False - return len(mat) == 6 and bool(self - mat) is False + return len(mat) == 6 and not (self - mat) def __getitem__(self, i): return (self.a, self.b, self.c, self.d, self.e, self.f)[i] @@ -9277,7 +9277,7 @@ def get_image_bbox(self, name, transform=0): else: raise ValueError("found multiple images named '%s'." % name) xref = item[-1] - if xref != 0 or transform is True: + if xref != 0 or transform: try: return self.get_image_rects(item, transform=transform)[0] except Exception: @@ -10646,7 +10646,7 @@ def __bool__(self): def __eq__(self, p): if not hasattr(p, "__len__"): return False - return len(p) == 2 and bool(self - p) is False + return len(p) == 2 and not (self - p) def __getitem__(self, i): return (self.x, self.y)[i] @@ -10677,7 +10677,7 @@ def __init__(self, *args, x=None, y=None): self.x = l.x self.y = l.y else: - if hasattr(l, "__getitem__") is False: + if not hasattr(l, "__getitem__"): raise ValueError("Point: bad args") if len(l) != 2: raise ValueError("Point: bad seq len") @@ -10891,7 +10891,7 @@ def __init__(self, *args, ul=None, ur=None, ll=None, lr=None): if isinstance(l, mupdf.FzQuad): self.this = l self.ul, self.ur, self.ll, self.lr = Point(l.ul), Point(l.ur), Point(l.ll), Point(l.lr) - elif hasattr(l, "__getitem__") is False: + elif not hasattr(l, "__getitem__"): raise ValueError("Quad: bad args") elif len(l) != 4: raise ValueError("Quad: bad seq len") @@ -11092,7 +11092,7 @@ def __contains__(self, x): def __eq__(self, rect): if not hasattr(rect, "__len__"): return False - return len(rect) == 4 and bool(self - rect) is False + return len(rect) == 4 and not (self - rect) def __getitem__(self, i): return (self.x0, self.y0, self.x1, self.y1)[i] @@ -12593,7 +12593,7 @@ def extractDICT(self, cb=None, sort=False) -> dict: if cb is not None: val["width"] = cb.width val["height"] = cb.height - if sort is True: + if sort: blocks = val["blocks"] blocks.sort(key=lambda b: (b["bbox"][3], b["bbox"][0])) val["blocks"] = blocks @@ -12659,7 +12659,7 @@ def default(self, s): if cb is not None: val["width"] = cb.width val["height"] = cb.height - if sort is True: + if sort: blocks = val["blocks"] blocks.sort(key=lambda b: (b["bbox"][3], b["bbox"][0])) val["blocks"] = blocks @@ -12673,7 +12673,7 @@ def extractRAWDICT(self, cb=None, sort=False) -> dict: if cb is not None: val["width"] = cb.width val["height"] = cb.height - if sort is True: + if sort: blocks = val["blocks"] blocks.sort(key=lambda b: (b["bbox"][3], b["bbox"][0])) val["blocks"] = blocks @@ -12693,7 +12693,7 @@ def default(self,s): if cb is not None: val["width"] = cb.width val["height"] = cb.height - if sort is True: + if sort: blocks = val["blocks"] blocks.sort(key=lambda b: (b["bbox"][3], b["bbox"][0])) val["blocks"] = blocks @@ -12708,7 +12708,7 @@ def extractSelection(self, pointa, pointb): def extractText(self, sort=False) -> str: """Return simple, bare text on the page.""" - if sort is False: + if not sort: return self._extractText(0) blocks = self.extractBLOCKS()[:] blocks.sort(key=lambda b: (b[3], b[0])) From 769d5958d3908fb6dec983da3ab546b9c6530c9e Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Thu, 12 Dec 2024 12:41:13 +0000 Subject: [PATCH 2/3] src/utils.py: simplified some bool tests. --- src/utils.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/utils.py b/src/utils.py index 0d9195cd4..43cc97e50 100644 --- a/src/utils.py +++ b/src/utils.py @@ -494,7 +494,7 @@ def get_text_blocks( blocks = tp.extractBLOCKS() if textpage is None: del tp - if sort is True: + if sort: blocks.sort(key=lambda b: (b[3], b[0])) return blocks @@ -571,7 +571,7 @@ def sort_words(words): if textpage is None: del tp - if words and sort is True: + if words and sort: # advanced sort if any words found words = sort_words(words) @@ -771,7 +771,7 @@ def full_ocr(page, dpi, language, flags): return tpage # if OCR for the full page, OCR its pixmap @ desired dpi - if full is True: + if full: return full_ocr(page, dpi, language, flags) # For partial OCR, make a normal textpage, then extend it with text that @@ -948,7 +948,7 @@ def get_text( page, clip=clip, flags=flags, textpage=textpage, sort=sort ) - if option == "text" and sort is True: + if option == "text" and sort: return get_sorted_text( page, clip=clip, @@ -1227,7 +1227,7 @@ def recurse(olItem, liste, lvl): lvl = 1 liste = [] toc = recurse(olItem, liste, lvl) - if doc.is_pdf and simple is False: + if doc.is_pdf and not simple: doc._extend_toc_items(toc) return toc @@ -4561,7 +4561,7 @@ def remove_hidden(cont_lines): if doc.is_encrypted or doc.is_closed: raise ValueError("closed or encrypted doc") - if clean_pages is False: + if not clean_pages: hidden_text = False redactions = False @@ -4848,9 +4848,11 @@ def output_justify(start, line): nlines = len(new_lines) if nlines > max_lines: msg = "Only fitting %i of %i lines." % (max_lines, nlines) - if warn is True: + if warn is None: + pass + elif warn: pymupdf.message("Warning: " + msg) - elif warn is False: + else: raise ValueError(msg) start = pymupdf.Point() @@ -5561,7 +5563,7 @@ def subset_fonts(doc: pymupdf.Document, verbose: bool = False, fallback: bool = # Once the sets of used unicodes and glyphs are known, we compute a # smaller version of the buffer user package fontTools. - if fallback is False: # by default use MuPDF function + if not fallback: # by default use MuPDF function pdf = mupdf.pdf_document_from_fz_document(doc) mupdf.pdf_subset_fonts2(pdf, list(range(doc.page_count))) return From 9f2c6bace6e772d5585dc460ceb587776526475c Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Tue, 17 Dec 2024 11:59:59 +0000 Subject: [PATCH 3/3] src/ tests/ docs/: avoid crash if samples memoryview is used after Pixmap destruction. This addresses #4155. src/__init__.py: With .samples_mv, remember the memoryview and call its .release() method in Pixmap.__del__(). docs/pixmap.rst: Document improved behaviour of Pixmap.samples_mv. Also warn that Pixmap.samples_ptr is unsafe after destruction of the pixmap. tests/test_pixmap.py: Added test_4155(), check we get ValueError when accessing memoryview after Pixmap is destroyed. --- docs/pixmap.rst | 6 ++++++ src/__init__.py | 15 ++++++++++++++- tests/test_pixmap.py | 18 ++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/docs/pixmap.rst b/docs/pixmap.rst index 75251aeec..a6546618b 100644 --- a/docs/pixmap.rst +++ b/docs/pixmap.rst @@ -546,6 +546,9 @@ Have a look at the :ref:`FAQ` section to see some pixmap usage "at work". 367 ns ± 1.75 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each) In [4]: %timeit len(pix.samples) 3.52 ms ± 57.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) + + After the Pixmap has been destroyed, any attempt to use the memoryview + will fail with ValueError. :type: memoryview @@ -559,6 +562,9 @@ Have a look at the :ref:`FAQ` section to see some pixmap usage "at work". img = QtGui.QImage(pix.samples_ptr, pix.width, pix.height, format) # (2) Both of the above lead to the same Qt image, but (2) can be **many hundred times faster**, because it avoids an additional copy of the pixel area. + + Warning: after the Pixmap has been destroyed, the Python pointer will be + invalid and attempting to use it may crash the Python interpreter. :type: int diff --git a/src/__init__.py b/src/__init__.py index 96f4c811e..c69c9a276 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -10059,6 +10059,9 @@ def __init__(self, *args): # data. Doesn't seem to make much difference to Pixmap.set_pixel() so # not currently used. self._memory_view = None + + # Cache for property `self.samples_mv`. + self._samples_mv = None def __len__(self): return self.size @@ -10339,7 +10342,13 @@ def samples_mv(self): ''' Pixmap samples memoryview. ''' - return mupdf.fz_pixmap_samples_memoryview(self.this) + # We remember the returned memoryview so that our `__del__()` can + # release it; otherwise accessing it after we have been destructed will + # fail, possibly crashing Python; this is #4155. + # + if self._samples_mv is None: + self._samples_mv = mupdf.fz_pixmap_samples_memoryview(self.this) + return self._samples_mv @property def samples_ptr(self): @@ -10625,6 +10634,10 @@ def yres(self): width = w height = h + + def __del__(self): + if self._samples_mv: + self._samples_mv.release() del Point diff --git a/tests/test_pixmap.py b/tests/test_pixmap.py index 0c9b8eab3..82007c9fd 100644 --- a/tests/test_pixmap.py +++ b/tests/test_pixmap.py @@ -428,3 +428,21 @@ def test_3854(): assert rms < 1 else: assert rms == 0 + + +def test_4155(): + path = os.path.normpath(f'{__file__}/../../tests/resources/test_3854.pdf') + with pymupdf.open(path) as document: + page = document[0] + pixmap = page.get_pixmap() + mv = pixmap.samples_mv + mvb1 = mv.tobytes() + del page + del pixmap + try: + mvb2 = mv.tobytes() + except ValueError as e: + print(f'Received exception: {e}') + assert 'operation forbidden on released memoryview object' in str(e) + else: + assert 0, f'Did not receive expected exception when using defunct memoryview.'