diff --git a/rendercanvas/contexts/bitmapcontext.py b/rendercanvas/contexts/bitmapcontext.py
index 3a5a5c0..879d1d7 100644
--- a/rendercanvas/contexts/bitmapcontext.py
+++ b/rendercanvas/contexts/bitmapcontext.py
@@ -36,7 +36,7 @@ def set_bitmap(self, bitmap):
         """Set the rendered bitmap image.
 
         Call this in the draw event. The bitmap must be an object that can be
-        conveted to a memoryview, like a numpy array. It must represent a 2D
+        converted to a memoryview, like a numpy array. It must represent a 2D
         image in either grayscale or rgba format, with uint8 values
         """
 
diff --git a/rendercanvas/contexts/wgpucontext.py b/rendercanvas/contexts/wgpucontext.py
index 8d9f96a..745172f 100644
--- a/rendercanvas/contexts/wgpucontext.py
+++ b/rendercanvas/contexts/wgpucontext.py
@@ -1,3 +1,4 @@
+import time
 from typing import Sequence
 
 from .basecontext import BaseContext
@@ -99,7 +100,6 @@ def configure(
             # "tone_mapping": tone_mapping,
             "alpha_mode": alpha_mode,
         }
-        # Let subclass finnish the configuration, then store the config
         self._configure(config)
         self._config = config
 
@@ -189,11 +189,28 @@ def __init__(self, present_info: dict):
         # The last used texture
         self._texture = None
 
+        # A ring-buffer to download the rendered images to the CPU/RAM. The
+        # image is first copied from the texture to an available copy-buffer.
+        # This is very fast (which is why we don't have a ring of textures).
+        # Mapping the buffers to RAM takes time, and we want to wait for this
+        # asynchronously.
+        #
+        # I feel that using just one buffer is sufficient. Adding more costs
+        # memory, and does not necessarily improve the FPS. It can actually
+        # strain the GPU more, because it would be busy mapping multiple buffers
+        # at once. I leave the ring-mechanism in-place for now, so we can
+        # experiment with it.
+        self._downloaders = [None]  # One None per copy-buffer you want in the ring
+
     def _get_capabilities(self):
         """Get dict of capabilities and cache the result."""
         import wgpu
 
+        # Store usage flags now that we have the wgpu namespace
+        self._our_texture_usage = wgpu.TextureUsage.COPY_SRC
+        self._our_buffer_usage = wgpu.BufferUsage.COPY_DST | wgpu.BufferUsage.MAP_READ
+
         capabilities = {}
 
         # Query format capabilities from the info provided by the canvas
@@ -260,8 +277,14 @@ def _configure(self, config: dict):
                 f"Configure: unsupported alpha-mode: {alpha_mode} not in {cap_alpha_modes}"
             )
 
+        # (re)create downloaders
+        self._downloaders[:] = [
+            ImageDownloader(config["device"], self._our_buffer_usage)
+        ]
+
     def _unconfigure(self) -> None:
         self._drop_texture()
+        self._downloaders[:] = [None for _ in self._downloaders]
 
     def _get_current_texture(self):
         # When the texture is active right now, we could either:
@@ -271,8 +294,6 @@ def _get_current_texture(self):
         # Right now we return the existing texture, so user can retrieve it in different render passes that write to the same frame.
 
         if self._texture is None:
-            import wgpu
-
             width, height = self.physical_size
             width, height = max(width, 1), max(height, 1)
 
@@ -283,7 +304,7 @@
                 label="present",
                 size=(width, height, 1),
                 format=self._config["format"],
-                usage=self._config["usage"] | wgpu.TextureUsage.COPY_SRC,
+                usage=self._config["usage"] | self._our_texture_usage,
             )
 
         return self._texture
@@ -292,17 +313,56 @@ def _rc_present(self) -> None:
         if not self._texture:
             return {"method": "skip"}
 
-        bitmap = self._get_bitmap()
+        # TODO: in some cases, like offscreen backend, we don't want to skip the first frame!
+
+        # Get bitmap from oldest downloader
+        bitmap = None
+        downloader = self._downloaders.pop(0)
+        try:
+            bitmap = downloader.get_bitmap()
+        finally:
+            self._downloaders.append(downloader)
+
+        # Select new downloader
+        downloader = self._downloaders[-1]
+        downloader.initiate_download(self._texture)
+
         self._drop_texture()
 
-        return {"method": "bitmap", "format": "rgba-u8", "data": bitmap}
+        if bitmap is None:
+            return {"method": "skip"}
+        else:
+            return {"method": "bitmap", "format": "rgba-u8", "data": bitmap}
+
+    def _rc_close(self):
+        self._drop_texture()
+
+
+class ImageDownloader:
+    """A helper class that wraps a copy-buffer to async-download an image from a texture."""
+
+    def __init__(self, device, buffer_usage):
+        self._device = device
+        self._buffer_usage = buffer_usage
+        self._buffer = None
+        self._time = 0
+
+    def initiate_download(self, texture):
+        # TODO: assert not waiting
-    def _get_bitmap(self):
-        texture = self._texture
-        device = texture._device
+        self._parse_texture_metadata(texture)
+        nbytes = self._padded_stride * self._texture_size[1]
+        self._ensure_size(nbytes)
+        self._copy_texture(texture)
+        # Note: the buffer.map_async() method by default also does a flush, to hide a bug in wgpu-core (https://github.com/gfx-rs/wgpu/issues/5173).
+        # That bug does not affect this use-case, so we use a special (undocumented :/) map-mode to prevent wgpu-py from doing its sync thing.
+        self._awaitable = self._buffer.map_async("READ_NOSYNC", 0, nbytes)
+
+    def _parse_texture_metadata(self, texture):
         size = texture.size
         format = texture.format
         nchannels = 4  # we expect rgba or bgra
+
         if not format.startswith(("rgba", "bgra")):
             raise RuntimeError(f"Image present unsupported texture format {format}.")
         if "8" in format:
@@ -316,21 +376,6 @@ def _get_bitmap(self):
                 f"Image present unsupported texture format bitdepth {format}."
             )
 
-        data = device.queue.read_texture(
-            {
-                "texture": texture,
-                "mip_level": 0,
-                "origin": (0, 0, 0),
-            },
-            {
-                "offset": 0,
-                "bytes_per_row": bytes_per_pixel * size[0],
-                "rows_per_image": size[1],
-            },
-            size,
-        )
-
-        # Derive struct dtype from wgpu texture format
         memoryview_type = "B"
         if "float" in format:
             memoryview_type = "e" if "16" in format else "f"
@@ -344,10 +389,107 @@
         if "sint" in format:
             memoryview_type = memoryview_type.lower()
 
-        # Represent as memory object to avoid numpy dependency
-        # Equivalent: np.frombuffer(data, np.uint8).reshape(size[1], size[0], nchannels)
+        plain_stride = bytes_per_pixel * size[0]
+        extra_stride = (256 - plain_stride % 256) % 256
+        padded_stride = plain_stride + extra_stride
+
+        self._memoryview_type = memoryview_type
+        self._nchannels = nchannels
+        self._plain_stride = plain_stride
+        self._padded_stride = padded_stride
+        self._texture_size = size
+
+    def _ensure_size(self, required_size):
+        # Get buffer and decide whether we can still use it
+        buffer = self._buffer
+        if buffer is None:
+            pass  # No buffer
+        elif required_size > buffer.size:
+            buffer = None  # Buffer too small
+        elif required_size < 0.25 * buffer.size:
+            buffer = None  # Buffer too large
+        elif required_size > 0.75 * buffer.size:
+            self._time = time.perf_counter()  # Size is fine
+        elif time.perf_counter() - self._time > 5.0:
+            buffer = None  # Too large for too long
+
+        # Create a new buffer if we need one
+        if buffer is None:
+            buffer_size = required_size
+            buffer_size += (4096 - buffer_size % 4096) % 4096
+            self._buffer = self._device.create_buffer(
+                label="copy-buffer", size=buffer_size, usage=self._buffer_usage
+            )
-
-        return data.cast(memoryview_type, (size[1], size[0], nchannels))
+
+    def _copy_texture(self, texture):
+        source = {
+            "texture": texture,
+            "mip_level": 0,
+            "origin": (0, 0, 0),
+        }
-
-    def _rc_close(self):
-        self._drop_texture()
+
+        destination = {
+            "buffer": self._buffer,
+            "offset": 0,
+            "bytes_per_row": self._padded_stride,
+            "rows_per_image": self._texture_size[1],
+        }
+
+        # Copy data to temp buffer
+        encoder = self._device.create_command_encoder()
+        encoder.copy_texture_to_buffer(source, destination, texture.size)
+        command_buffer = encoder.finish()
+        self._device.queue.submit([command_buffer])
+
+    def get_bitmap(self):
+        if self._buffer is None:  # todo: more explicit state tracking
+            return None
+
+        memoryview_type = self._memoryview_type
+        plain_stride = self._plain_stride
+        padded_stride = self._padded_stride
+
+        nbytes = plain_stride * self._texture_size[1]
+        plain_shape = (self._texture_size[1], self._texture_size[0], self._nchannels)
+
+        # Download from mappable buffer
+        # Because we use ``copy=False``, we *must* copy the data.
+        if self._buffer.map_state == "pending":
+            self._awaitable.sync_wait()
+        mapped_data = self._buffer.read_mapped(copy=False)
+
+        # Copy the data
+        if padded_stride > plain_stride:
+            # Copy per row
+            data = memoryview(bytearray(nbytes)).cast(mapped_data.format)
+            i_start = 0
+            for i in range(self._texture_size[1]):
+                row = mapped_data[i * padded_stride : i * padded_stride + plain_stride]
+                data[i_start : i_start + plain_stride] = row
+                i_start += plain_stride
+        else:
+            # Copy as a whole
+            data = memoryview(bytearray(mapped_data)).cast(mapped_data.format)
+
+        # Alternative copy solution using Numpy.
+        # I expected this to be faster, but does not really seem to be. Seems not worth it
+        # since we technically don't depend on Numpy. Leaving here for reference.
+        # import numpy as np
+        # mapped_data = np.asarray(mapped_data)[:data_length]
+        # data = np.empty(nbytes, dtype=mapped_data.dtype)
+        # mapped_data.shape = -1, padded_stride
+        # data.shape = -1, plain_stride
+        # data[:] = mapped_data[:, :plain_stride]
+        # data.shape = -1
+        # data = memoryview(data)
+
+        # Since we use read_mapped(copy=False), we must unmap it *after* we've copied the data.
+        self._buffer.unmap()
+
+        # Derive struct dtype from wgpu texture format
+
+        # Represent as memory object to avoid numpy dependency
+        # Equivalent: np.frombuffer(data, np.uint8).reshape(plain_shape)
+        data = data.cast(memoryview_type, plain_shape)
+
+        return data
diff --git a/rendercanvas/stub.py b/rendercanvas/stub.py
index 2986fb8..31a6857 100644
--- a/rendercanvas/stub.py
+++ b/rendercanvas/stub.py
@@ -42,7 +42,7 @@ def _rc_call_later(self, delay, callback):
 
 class StubCanvasGroup(BaseCanvasGroup):
     """
-    The ``CanvasGroup`` representss a group of canvas objects from the same class, that share a loop.
+    The ``CanvasGroup`` represents a group of canvas objects from the same class, that share a loop.
 
     The initial/default loop is passed when the ``CanvasGroup`` is instantiated.
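
Note (added for review, not part of the patch): the padded-stride arithmetic in _parse_texture_metadata and _ensure_size follows from WebGPU's rule that bytes_per_row in a texture-to-buffer copy must be a multiple of 256, plus the patch's choice to round the copy-buffer up to a multiple of 4096 bytes. A small worked example, using a hypothetical 300x200 rgba8 texture:

    # Worked example of the row padding that ImageDownloader computes.
    width, height, bytes_per_pixel = 300, 200, 4

    plain_stride = bytes_per_pixel * width           # 1200 bytes of real pixel data per row
    extra_stride = (256 - plain_stride % 256) % 256  # 80 bytes of padding per row
    padded_stride = plain_stride + extra_stride      # 1280 bytes per row in the copy-buffer
    assert padded_stride % 256 == 0

    required_size = padded_stride * height                              # 256000 bytes for the whole image
    buffer_size = required_size + (4096 - required_size % 4096) % 4096  # 258048, as in _ensure_size

get_bitmap() later strips the 80 padding bytes again by copying only the first plain_stride bytes of each row.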
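Note (added for review, not part of the patch): a minimal, self-contained sketch of the frame-to-frame flow that the downloader ring gives _rc_present. DummyDownloader is a hypothetical stand-in that only mimics the two ImageDownloader methods used here; the point is that each present returns the bitmap initiated on the previous call, so with a single buffer the very first frame is skipped (the TODO in the diff):

    class DummyDownloader:
        """Stand-in mimicking initiate_download()/get_bitmap() of ImageDownloader."""

        def __init__(self):
            self._pending = None

        def initiate_download(self, texture):
            self._pending = f"bitmap-of-{texture}"  # real code: copy_texture_to_buffer + map_async

        def get_bitmap(self):
            bitmap, self._pending = self._pending, None  # real code: wait for the map, de-pad rows
            return bitmap


    downloaders = [DummyDownloader()]  # one entry, like self._downloaders in the patch

    def present(texture):
        downloader = downloaders.pop(0)        # oldest downloader
        try:
            bitmap = downloader.get_bitmap()   # previous frame's image, or None
        finally:
            downloaders.append(downloader)
        downloaders[-1].initiate_download(texture)  # start downloading the current frame
        return {"method": "skip"} if bitmap is None else {"method": "bitmap", "data": bitmap}

    print(present("frame-1"))  # {'method': 'skip'}
    print(present("frame-2"))  # returns frame-1's bitmap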