pymupdf · julian-smith-artifex-com · Dec 17, 2024 · Dec 12, 2024 · Dec 12, 2024 · Dec 17, 2024
diff --git a/docs/pixmap.rst b/docs/pixmap.rst
@@ -546,6 +546,9 @@ Have a look at the :ref:`FAQ` section to see some pixmap usage "at work".
          367 ns ± 1.75 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
          In [4]: %timeit len(pix.samples)
          3.52 ms ± 57.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
+
+      After the Pixmap has been destroyed (garbage-collected), any attempt to use
+      the memoryview will raise ``ValueError``.
 
       :type: memoryview
 
@@ -559,6 +562,9 @@ Have a look at the :ref:`FAQ` section to see some pixmap usage "at work".
          img = QtGui.QImage(pix.samples_ptr, pix.width, pix.height, format) # (2)
 
       Both of the above lead to the same Qt image, but (2) can be **many hundred times faster**, because it avoids an additional copy of the pixel area.
+
+      Warning: after the Pixmap has been destroyed, the Python pointer will be
+      invalid and attempting to use it may crash the Python interpreter.
 
       :type: int
 

diff --git a/src/__init__.py b/src/__init__.py
@@ -4513,7 +4513,7 @@ def get_page_fonts(self, pno: int, full: bool =False) -> list:
                 exception_info()
                 raise ValueError("need a Page or page number")
         val = self._getPageInfo(pno, 1)
-        if full is False:
+        if not full:
             return [v[:-1] for v in val]
         return val
 
@@ -4525,7 +4525,7 @@ def get_page_images(self, pno: int, full: bool =False) -> list:
         if not self.is_pdf:
             return ()
         val = self._getPageInfo(pno, 2)
-        if full is False:
+        if not full:
             return [v[:-1] for v in val]
         return val
 
@@ -6720,7 +6720,7 @@ def __bool__(self):
     def __eq__(self, mat):
         if not hasattr(mat, "__len__"):
             return False
-        return len(mat) == 6 and bool(self - mat) is False
+        return len(mat) == 6 and not (self - mat)
 
     def __getitem__(self, i):
         return (self.a, self.b, self.c, self.d, self.e, self.f)[i]
@@ -9277,7 +9277,7 @@ def get_image_bbox(self, name, transform=0):
             else:
                 raise ValueError("found multiple images named '%s'." % name)
         xref = item[-1]
-        if xref != 0 or transform is True:
+        if xref != 0 or transform:
             try:
                 return self.get_image_rects(item, transform=transform)[0]
             except Exception:
@@ -10059,6 +10059,9 @@ def __init__(self, *args):
         # data.  Doesn't seem to make much difference to Pixmap.set_pixel() so
         # not currently used.
         self._memory_view = None
+
+        # Cache for property `self.samples_mv`.
+        self._samples_mv = None
 
     def __len__(self):
         return self.size
@@ -10339,7 +10342,13 @@ def samples_mv(self):
         '''
         Pixmap samples memoryview.
         '''
-        return mupdf.fz_pixmap_samples_memoryview(self.this)
+        # We remember the returned memoryview so that our `__del__()` can
+        # release it; otherwise accessing it after we have been destructed will
+        # fail, possibly crashing Python; this is #4155.
+        #
+        if self._samples_mv is None:
+            self._samples_mv = mupdf.fz_pixmap_samples_memoryview(self.this)
+        return self._samples_mv
 
     @property
     def samples_ptr(self):
@@ -10625,6 +10634,10 @@ def yres(self):
 
     width  = w
     height = h
+
+    def __del__(self):  # Release the cached `samples_mv` so later use raises ValueError (#4155).
+        if getattr(self, '_samples_mv', None) is not None:  # Unset if `__init__()` raised; bool() fails on a released view.
+            self._samples_mv.release()
 
 
 del Point
@@ -10646,7 +10659,7 @@ def __bool__(self):
     def __eq__(self, p):
         if not hasattr(p, "__len__"):
             return False
-        return len(p) == 2 and bool(self - p) is False
+        return len(p) == 2 and not (self - p)
 
     def __getitem__(self, i):
         return (self.x, self.y)[i]
@@ -10677,7 +10690,7 @@ def __init__(self, *args, x=None, y=None):
                 self.x = l.x
                 self.y = l.y
             else:
-                if hasattr(l, "__getitem__") is False:
+                if not hasattr(l, "__getitem__"):
                     raise ValueError("Point: bad args")
                 if len(l) != 2:
                     raise ValueError("Point: bad seq len")
@@ -10891,7 +10904,7 @@ def __init__(self, *args, ul=None, ur=None, ll=None, lr=None):
             if isinstance(l, mupdf.FzQuad):
                 self.this = l
                 self.ul, self.ur, self.ll, self.lr = Point(l.ul), Point(l.ur), Point(l.ll), Point(l.lr)
-            elif hasattr(l, "__getitem__") is False:
+            elif not hasattr(l, "__getitem__"):
                 raise ValueError("Quad: bad args")
             elif len(l) != 4:
                 raise ValueError("Quad: bad seq len")
@@ -11092,7 +11105,7 @@ def __contains__(self, x):
     def __eq__(self, rect):
         if not hasattr(rect, "__len__"):
             return False
-        return len(rect) == 4 and bool(self - rect) is False
+        return len(rect) == 4 and not (self - rect)
 
     def __getitem__(self, i):
         return (self.x0, self.y0, self.x1, self.y1)[i]
@@ -12593,7 +12606,7 @@ def extractDICT(self, cb=None, sort=False) -> dict:
         if cb is not None:
             val["width"] = cb.width
             val["height"] = cb.height
-        if sort is True:
+        if sort:
             blocks = val["blocks"]
             blocks.sort(key=lambda b: (b["bbox"][3], b["bbox"][0]))
             val["blocks"] = blocks
@@ -12659,7 +12672,7 @@ def default(self, s):
         if cb is not None:
             val["width"] = cb.width
             val["height"] = cb.height
-        if sort is True:
+        if sort:
             blocks = val["blocks"]
             blocks.sort(key=lambda b: (b["bbox"][3], b["bbox"][0]))
             val["blocks"] = blocks
@@ -12673,7 +12686,7 @@ def extractRAWDICT(self, cb=None, sort=False) -> dict:
         if cb is not None:
             val["width"] = cb.width
             val["height"] = cb.height
-        if sort is True:
+        if sort:
             blocks = val["blocks"]
             blocks.sort(key=lambda b: (b["bbox"][3], b["bbox"][0]))
             val["blocks"] = blocks
@@ -12693,7 +12706,7 @@ def default(self,s):
         if cb is not None:
             val["width"] = cb.width
             val["height"] = cb.height
-        if sort is True:
+        if sort:
             blocks = val["blocks"]
             blocks.sort(key=lambda b: (b["bbox"][3], b["bbox"][0]))
             val["blocks"] = blocks
@@ -12708,7 +12721,7 @@ def extractSelection(self, pointa, pointb):
 
     def extractText(self, sort=False) -> str:
         """Return simple, bare text on the page."""
-        if sort is False:
+        if not sort:
             return self._extractText(0)
         blocks = self.extractBLOCKS()[:]
         blocks.sort(key=lambda b: (b[3], b[0]))

diff --git a/src/utils.py b/src/utils.py
@@ -494,7 +494,7 @@ def get_text_blocks(
     blocks = tp.extractBLOCKS()
     if textpage is None:
         del tp
-    if sort is True:
+    if sort:
         blocks.sort(key=lambda b: (b[3], b[0]))
     return blocks
 
@@ -571,7 +571,7 @@ def sort_words(words):
 
     if textpage is None:
         del tp
-    if words and sort is True:
+    if words and sort:
         # advanced sort if any words found
         words = sort_words(words)
 
@@ -771,7 +771,7 @@ def full_ocr(page, dpi, language, flags):
         return tpage
 
     # if OCR for the full page, OCR its pixmap @ desired dpi
-    if full is True:
+    if full:
         return full_ocr(page, dpi, language, flags)
 
     # For partial OCR, make a normal textpage, then extend it with text that
@@ -948,7 +948,7 @@ def get_text(
             page, clip=clip, flags=flags, textpage=textpage, sort=sort
         )
 
-    if option == "text" and sort is True:
+    if option == "text" and sort:
         return get_sorted_text(
             page,
             clip=clip,
@@ -1227,7 +1227,7 @@ def recurse(olItem, liste, lvl):
     lvl = 1
     liste = []
     toc = recurse(olItem, liste, lvl)
-    if doc.is_pdf and simple is False:
+    if doc.is_pdf and not simple:
         doc._extend_toc_items(toc)
     return toc
 
@@ -4561,7 +4561,7 @@ def remove_hidden(cont_lines):
     if doc.is_encrypted or doc.is_closed:
         raise ValueError("closed or encrypted doc")
 
-    if clean_pages is False:
+    if not clean_pages:
         hidden_text = False
         redactions = False
 
@@ -4848,9 +4848,11 @@ def output_justify(start, line):
     nlines = len(new_lines)
     if nlines > max_lines:
         msg = "Only fitting %i of %i lines." % (max_lines, nlines)
-        if warn is True:
+        if warn is None:
+            pass
+        elif warn:
             pymupdf.message("Warning: " + msg)
-        elif warn is False:
+        else:
             raise ValueError(msg)
 
     start = pymupdf.Point()
@@ -5561,7 +5563,7 @@ def subset_fonts(doc: pymupdf.Document, verbose: bool = False, fallback: bool =
     # Once the sets of used unicodes and glyphs are known, we compute a
     # smaller version of the buffer user package fontTools.
 
-    if fallback is False:  # by default use MuPDF function
+    if not fallback:  # by default use MuPDF function
         pdf = mupdf.pdf_document_from_fz_document(doc)
         mupdf.pdf_subset_fonts2(pdf, list(range(doc.page_count)))
         return

diff --git a/tests/test_pixmap.py b/tests/test_pixmap.py
@@ -428,3 +428,21 @@ def test_3854():
         assert rms < 1
     else:
         assert rms == 0
+
+
+def test_4155():  # Using `samples_mv` after its Pixmap is destroyed must raise ValueError.
+    path = os.path.normpath(f'{__file__}/../../tests/resources/test_3854.pdf')
+    with pymupdf.open(path) as document:
+        page = document[0]
+        pixmap = page.get_pixmap()
+        mv = pixmap.samples_mv
+        mv.tobytes()  # Must succeed while the Pixmap is still alive.
+    del page
+    del pixmap  # Expected to trigger `Pixmap.__del__()`, releasing the memoryview.
+    try:
+        mv.tobytes()
+    except ValueError as e:
+        print(f'Received exception: {e}')
+        assert 'operation forbidden on released memoryview object' in str(e)
+    else:
+        assert 0, 'Did not receive expected exception when using defunct memoryview.'