diff --git a/rendercanvas/contexts/bitmapcontext.py b/rendercanvas/contexts/bitmapcontext.py
index 3a5a5c0..879d1d7 100644
--- a/rendercanvas/contexts/bitmapcontext.py
+++ b/rendercanvas/contexts/bitmapcontext.py
@@ -36,7 +36,7 @@ def set_bitmap(self, bitmap):
         """Set the rendered bitmap image.
 
         Call this in the draw event. The bitmap must be an object that can be
-        conveted to a memoryview, like a numpy array. It must represent a 2D
+        converted to a memoryview, like a numpy array. It must represent a 2D
         image in either grayscale or rgba format, with uint8 values
         """
 
diff --git a/rendercanvas/contexts/wgpucontext.py b/rendercanvas/contexts/wgpucontext.py
index 8d9f96a..745172f 100644
--- a/rendercanvas/contexts/wgpucontext.py
+++ b/rendercanvas/contexts/wgpucontext.py
@@ -1,3 +1,4 @@
+import time
 from typing import Sequence
 
 from .basecontext import BaseContext
@@ -99,7 +100,6 @@ def configure(
             # "tone_mapping": tone_mapping,
             "alpha_mode": alpha_mode,
         }
-        # Let subclass finnish the configuration, then store the config
         self._configure(config)
         self._config = config
 
@@ -189,11 +189,28 @@ def __init__(self, present_info: dict):
         # The last used texture
         self._texture = None
 
+        # A ring-buffer to download the rendered images to the CPU/RAM. The
+        # image is first copied from the texture to an available copy-buffer.
+        # This is very fast (which is why we don't have a ring of textures).
+        # Mapping the buffers to RAM takes time, and we want to wait for this
+        # asynchronously.
+        #
+        # I feel that using just one buffer is sufficient. Adding more costs
+        # memory, and does not necessarily improve the FPS. It can actually
+        # strain the GPU more, because it would be busy mapping multiple buffers
+        # at once. I leave the ring-mechanism in-place for now, so we can
+        # experiment with it.
+        self._downloaders = [None]  # One None per copy-buffer you want in the ring
+
     def _get_capabilities(self):
         """Get dict of capabilities and cache the result."""
         import wgpu
 
+        # Store usage flags now that we have the wgpu namespace
+        self._our_texture_usage = wgpu.TextureUsage.COPY_SRC
+        self._our_buffer_usage = wgpu.BufferUsage.COPY_DST | wgpu.BufferUsage.MAP_READ
+
         capabilities = {}
 
         # Query format capabilities from the info provided by the canvas
@@ -260,8 +277,14 @@ def _configure(self, config: dict):
                 f"Configure: unsupported alpha-mode: {alpha_mode} not in {cap_alpha_modes}"
             )
 
+        # (re)create downloaders
+        self._downloaders[:] = [
+            ImageDownloader(config["device"], self._our_buffer_usage)
+        ]
+
     def _unconfigure(self) -> None:
         self._drop_texture()
+        self._downloaders[:] = [None for _ in self._downloaders]
 
     def _get_current_texture(self):
         # When the texture is active right now, we could either:
@@ -271,8 +294,6 @@ def _get_current_texture(self):
         # Right now we return the existing texture, so user can retrieve it in different render passes that write to the same frame.
 
         if self._texture is None:
-            import wgpu
-
             width, height = self.physical_size
             width, height = max(width, 1), max(height, 1)
 
@@ -283,7 +304,7 @@
                 label="present",
                 size=(width, height, 1),
                 format=self._config["format"],
-                usage=self._config["usage"] | wgpu.TextureUsage.COPY_SRC,
+                usage=self._config["usage"] | self._our_texture_usage,
             )
 
         return self._texture
@@ -292,17 +313,56 @@ def _rc_present(self) -> None:
         if not self._texture:
             return {"method": "skip"}
 
-        bitmap = self._get_bitmap()
+        # TODO: in some cases, like offscreen backend, we don't want to skip the first frame!
+
+        # Get bitmap from oldest downloader
+        bitmap = None
+        downloader = self._downloaders.pop(0)
+        try:
+            bitmap = downloader.get_bitmap()
+        finally:
+            self._downloaders.append(downloader)
+
+        # Select new downloader
+        downloader = self._downloaders[-1]
+        downloader.initiate_download(self._texture)
+
         self._drop_texture()
 
-        return {"method": "bitmap", "format": "rgba-u8", "data": bitmap}
+        if bitmap is None:
+            return {"method": "skip"}
+        else:
+            return {"method": "bitmap", "format": "rgba-u8", "data": bitmap}
+
+    def _rc_close(self):
+        self._drop_texture()
+
+
+class ImageDownloader:
+    """A helper class that wraps a copy-buffer to async-download an image from a texture."""
+
+    def __init__(self, device, buffer_usage):
+        self._device = device
+        self._buffer_usage = buffer_usage
+        self._buffer = None
+        self._time = 0
+
+    def initiate_download(self, texture):
+        # TODO: assert not waiting
-    def _get_bitmap(self):
-        texture = self._texture
-        device = texture._device
+        self._parse_texture_metadata(texture)
+        nbytes = self._padded_stride * self._texture_size[1]
+        self._ensure_size(nbytes)
+        self._copy_texture(texture)
+        # Note: the buffer.map_async() method by default also does a flush, to hide a bug in wgpu-core (https://github.com/gfx-rs/wgpu/issues/5173).
+        # That bug does not affect this use-case, so we use a special (undocumented :/) map-mode to prevent wgpu-py from doing its sync thing.
+        self._awaitable = self._buffer.map_async("READ_NOSYNC", 0, nbytes)
+
+    def _parse_texture_metadata(self, texture):
         size = texture.size
         format = texture.format
         nchannels = 4  # we expect rgba or bgra
+
         if not format.startswith(("rgba", "bgra")):
             raise RuntimeError(f"Image present unsupported texture format {format}.")
         if "8" in format:
@@ -316,21 +376,6 @@ def _get_bitmap(self):
                 f"Image present unsupported texture format bitdepth {format}."
             )
 
-        data = device.queue.read_texture(
-            {
-                "texture": texture,
-                "mip_level": 0,
-                "origin": (0, 0, 0),
-            },
-            {
-                "offset": 0,
-                "bytes_per_row": bytes_per_pixel * size[0],
-                "rows_per_image": size[1],
-            },
-            size,
-        )
-
-        # Derive struct dtype from wgpu texture format
         memoryview_type = "B"
         if "float" in format:
             memoryview_type = "e" if "16" in format else "f"
@@ -344,10 +389,107 @@
         if "sint" in format:
             memoryview_type = memoryview_type.lower()
 
-        # Represent as memory object to avoid numpy dependency
-        # Equivalent: np.frombuffer(data, np.uint8).reshape(size[1], size[0], nchannels)
+        plain_stride = bytes_per_pixel * size[0]
+        extra_stride = (256 - plain_stride % 256) % 256
+        padded_stride = plain_stride + extra_stride
+
+        self._memoryview_type = memoryview_type
+        self._nchannels = nchannels
+        self._plain_stride = plain_stride
+        self._padded_stride = padded_stride
+        self._texture_size = size
+
+    def _ensure_size(self, required_size):
+        # Get buffer and decide whether we can still use it
+        buffer = self._buffer
+        if buffer is None:
+            pass  # No buffer
+        elif required_size > buffer.size:
+            buffer = None  # Buffer too small
+        elif required_size < 0.25 * buffer.size:
+            buffer = None  # Buffer too large
+        elif required_size > 0.75 * buffer.size:
+            self._time = time.perf_counter()  # Size is fine
+        elif time.perf_counter() - self._time > 5.0:
+            buffer = None  # Too large for too long
+
+        # Create a new buffer if we need one
+        if buffer is None:
+            buffer_size = required_size
+            buffer_size += (4096 - buffer_size % 4096) % 4096
+            self._buffer = self._device.create_buffer(
+                label="copy-buffer", size=buffer_size, usage=self._buffer_usage
+            )
-
-        return data.cast(memoryview_type, (size[1], size[0], nchannels))
+
+    def _copy_texture(self, texture):
+        source = {
+            "texture": texture,
+            "mip_level": 0,
+            "origin": (0, 0, 0),
+        }
-
-    def _rc_close(self):
-        self._drop_texture()
+
+        destination = {
+            "buffer": self._buffer,
+            "offset": 0,
+            "bytes_per_row": self._padded_stride,
+            "rows_per_image": self._texture_size[1],
+        }
+
+        # Copy data to temp buffer
+        encoder = self._device.create_command_encoder()
+        encoder.copy_texture_to_buffer(source, destination, texture.size)
+        command_buffer = encoder.finish()
+        self._device.queue.submit([command_buffer])
+
+    def get_bitmap(self):
+        if self._buffer is None:  # todo: more explicit state tracking
+            return None
+
+        memoryview_type = self._memoryview_type
+        plain_stride = self._plain_stride
+        padded_stride = self._padded_stride
+
+        nbytes = plain_stride * self._texture_size[1]
+        plain_shape = (self._texture_size[1], self._texture_size[0], self._nchannels)
+
+        # Download from mappable buffer
+        # Because we use ``copy=False``, we *must* copy the data.
+        if self._buffer.map_state == "pending":
+            self._awaitable.sync_wait()
+        mapped_data = self._buffer.read_mapped(copy=False)
+
+        # Copy the data
+        if padded_stride > plain_stride:
+            # Copy per row
+            data = memoryview(bytearray(nbytes)).cast(mapped_data.format)
+            i_start = 0
+            for i in range(self._texture_size[1]):
+                row = mapped_data[i * padded_stride : i * padded_stride + plain_stride]
+                data[i_start : i_start + plain_stride] = row
+                i_start += plain_stride
+        else:
+            # Copy as a whole
+            data = memoryview(bytearray(mapped_data)).cast(mapped_data.format)
+
+        # Alternative copy solution using Numpy.
+        # I expected this to be faster, but does not really seem to be. Seems not worth it
+        # since we technically don't depend on Numpy. Leaving here for reference.
+        # import numpy as np
+        # mapped_data = np.asarray(mapped_data)[:data_length]
+        # data = np.empty(nbytes, dtype=mapped_data.dtype)
+        # mapped_data.shape = -1, padded_stride
+        # data.shape = -1, plain_stride
+        # data[:] = mapped_data[:, :plain_stride]
+        # data.shape = -1
+        # data = memoryview(data)
+
+        # Since we use read_mapped(copy=False), we must unmap it *after* we've copied the data.
+        self._buffer.unmap()
+
+        # Derive struct dtype from wgpu texture format
+
+        # Represent as memory object to avoid numpy dependency
+        # Equivalent: np.frombuffer(data, np.uint8).reshape(plain_shape)
+        data = data.cast(memoryview_type, plain_shape)
+
+        return data
diff --git a/rendercanvas/stub.py b/rendercanvas/stub.py
index 2986fb8..31a6857 100644
--- a/rendercanvas/stub.py
+++ b/rendercanvas/stub.py
@@ -42,7 +42,7 @@ def _rc_call_later(self, delay, callback):
 
 class StubCanvasGroup(BaseCanvasGroup):
     """
-    The ``CanvasGroup`` representss a group of canvas objects from the same class, that share a loop.
+    The ``CanvasGroup`` represents a group of canvas objects from the same class, that share a loop.
 
     The initial/default loop is passed when the ``CanvasGroup`` is instantiated.
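
Note (added for review, not part of the patch): the padded-stride arithmetic in _parse_texture_metadata and _ensure_size follows from WebGPU's rule that bytes_per_row in a texture-to-buffer copy must be a multiple of 256, plus the patch's choice to round the copy-buffer up to a multiple of 4096 bytes. A small worked example, using a hypothetical 300x200 rgba8 texture:

    # Worked example of the row padding that ImageDownloader computes.
    width, height, bytes_per_pixel = 300, 200, 4

    plain_stride = bytes_per_pixel * width           # 1200 bytes of real pixel data per row
    extra_stride = (256 - plain_stride % 256) % 256  # 80 bytes of padding per row
    padded_stride = plain_stride + extra_stride      # 1280 bytes per row in the copy-buffer
    assert padded_stride % 256 == 0

    required_size = padded_stride * height                              # 256000 bytes for the whole image
    buffer_size = required_size + (4096 - required_size % 4096) % 4096  # 258048, as in _ensure_size

get_bitmap() later strips the 80 padding bytes again by copying only the first plain_stride bytes of each row.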
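Note (added for review, not part of the patch): a minimal, self-contained sketch of the frame-to-frame flow that the downloader ring gives _rc_present. DummyDownloader is a hypothetical stand-in that only mimics the two ImageDownloader methods used here; the point is that each present returns the bitmap initiated on the previous call, so with a single buffer the very first frame is skipped (the TODO in the diff):

    class DummyDownloader:
        """Stand-in mimicking initiate_download()/get_bitmap() of ImageDownloader."""

        def __init__(self):
            self._pending = None

        def initiate_download(self, texture):
            self._pending = f"bitmap-of-{texture}"  # real code: copy_texture_to_buffer + map_async

        def get_bitmap(self):
            bitmap, self._pending = self._pending, None  # real code: wait for the map, de-pad rows
            return bitmap


    downloaders = [DummyDownloader()]  # one entry, like self._downloaders in the patch

    def present(texture):
        downloader = downloaders.pop(0)        # oldest downloader
        try:
            bitmap = downloader.get_bitmap()   # previous frame's image, or None
        finally:
            downloaders.append(downloader)
        downloaders[-1].initiate_download(texture)  # start downloading the current frame
        return {"method": "skip"} if bitmap is None else {"method": "bitmap", "data": bitmap}

    print(present("frame-1"))  # {'method': 'skip'}
    print(present("frame-2"))  # returns frame-1's bitmap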