From e100b069da39a1f98796749fb5a499e9aa199021 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 29 Jan 2026 12:27:45 +0000 Subject: [PATCH 1/2] perf: use downcasting --- pyproject.toml | 2 +- src/annbatch/io.py | 78 ++++++++++++++++++++-------------------- tests/test_preshuffle.py | 3 ++ 3 files changed, 43 insertions(+), 40 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7f30ebb9..a63cac86 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ classifiers = [ "Programming Language :: Python :: 3.13", ] dependencies = [ - "anndata[lazy]>=0.12.6", + "anndata[lazy]>=0.12.9", "dask>=2025.9", "pandas>=2.2.2", "scipy>1.15", diff --git a/src/annbatch/io.py b/src/annbatch/io.py index 1649912d..a22aa5d8 100644 --- a/src/annbatch/io.py +++ b/src/annbatch/io.py @@ -92,47 +92,47 @@ def write_sharded( key The key to which this object should be written - by default the root, in which case the *entire* store (not just the group) is cleared first. """ - ad.settings.zarr_write_format = 3 - - def callback( - write_func: ad.experimental.Write, - store: zarr.Group, - elem_name: str, - elem: ad.typing.RWAble, - dataset_kwargs: Mapping[str, Any], - *, - iospec: ad.experimental.IOSpec, - ): - # Ensure we're not overriding anything here - dataset_kwargs = dataset_kwargs.copy() - if iospec.encoding_type in {"array"} and ( - any(n in store.name for n in {"obsm", "layers", "obsp"}) or "X" == elem_name + with ad.settings.override(zarr_write_format=3, write_csr_csc_indices_with_min_possible_dtype=True): + + def callback( + write_func: ad.experimental.Write, + store: zarr.Group, + elem_name: str, + elem: ad.typing.RWAble, + dataset_kwargs: Mapping[str, Any], + *, + iospec: ad.experimental.IOSpec, ): - # Get either the desired size or the next multiple down to ensure divisibility of chunks and shards - shard_size = min(dense_shard_size, _round_down(elem.shape[0], dense_chunk_size)) - chunk_size = min(dense_chunk_size, _round_down(elem.shape[0], dense_chunk_size)) - 
# If the shape is less than the computed size (impossible given rounds?) or the rounding caused created a 0-size chunk, then error - if elem.shape[0] < chunk_size or chunk_size == 0: - raise ValueError( - f"Choose a dense shard obs {dense_shard_size} and chunk obs {dense_chunk_size} with non-zero size less than the number of observations {elem.shape[0]}" - ) - dataset_kwargs = { - **dataset_kwargs, - "shards": (shard_size,) + elem.shape[1:], # only shard over 1st dim - "chunks": (chunk_size,) + elem.shape[1:], # only chunk over 1st dim - "compressors": compressors, - } - elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}: - dataset_kwargs = { - **dataset_kwargs, - "shards": (sparse_shard_size,), - "chunks": (sparse_chunk_size,), - "compressors": compressors, - } - write_func(store, elem_name, elem, dataset_kwargs=dataset_kwargs) + # Ensure we're not overriding anything here + dataset_kwargs = dataset_kwargs.copy() + if iospec.encoding_type in {"array"} and ( + any(n in store.name for n in {"obsm", "layers", "obsp"}) or "X" == elem_name + ): + # Get either the desired size or the next multiple down to ensure divisibility of chunks and shards + shard_size = min(dense_shard_size, _round_down(elem.shape[0], dense_chunk_size)) + chunk_size = min(dense_chunk_size, _round_down(elem.shape[0], dense_chunk_size)) + # If the shape is less than the computed size (impossible given rounds?) 
or the rounding caused a 0-size chunk, then error + if elem.shape[0] < chunk_size or chunk_size == 0: + raise ValueError( + f"Choose a dense shard obs {dense_shard_size} and chunk obs {dense_chunk_size} with non-zero size less than the number of observations {elem.shape[0]}" + ) + dataset_kwargs = { + **dataset_kwargs, + "shards": (shard_size,) + elem.shape[1:], # only shard over 1st dim + "chunks": (chunk_size,) + elem.shape[1:], # only chunk over 1st dim + "compressors": compressors, + } + elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}: + dataset_kwargs = { + **dataset_kwargs, + "shards": (sparse_shard_size,), + "chunks": (sparse_chunk_size,), + "compressors": compressors, + } + write_func(store, elem_name, elem, dataset_kwargs=dataset_kwargs) - ad.experimental.write_dispatched(group, "/" if key is None else key, adata, callback=callback) - zarr.consolidate_metadata(group.store) + ad.experimental.write_dispatched(group, "/" if key is None else key, adata, callback=callback) + zarr.consolidate_metadata(group.store) def _check_for_mismatched_keys[T: zarr.Group | h5py.Group | PathLike[str] | str]( diff --git a/tests/test_preshuffle.py b/tests/test_preshuffle.py index b891e45d..680d6fe1 100644 --- a/tests/test_preshuffle.py +++ b/tests/test_preshuffle.py @@ -258,8 +258,11 @@ def test_store_creation( pd.testing.assert_frame_equal(adata.obs, adata_orig.obs) z = zarr.open(output_path / "dataset_0") + # assert chunk behavior assert z["obsm"]["arr"].chunks[0] == 5, z["obsm"]["arr"] assert z["X"]["indices"].chunks[0] == 10 + # ensure proper downcasting + assert z["X"]["indices"].dtype == (np.uint16 if adata.X.shape[1] >= 256 else np.uint8) def _read_lazy_x_and_obs_only_from_raw(path) -> ad.AnnData: From e9b28d34a590505f7148a5def88b509181b2cccb Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Thu, 29 Jan 2026 13:29:54 +0100 Subject: [PATCH 2/2] chore: changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md 
b/CHANGELOG.md index ee5db57b..73a5789f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,10 @@ and this project adheres to [Semantic Versioning][]. [keep a changelog]: https://keepachangelog.com/en/1.0.0/ [semantic versioning]: https://semver.org/spec/v2.0.0.html +## [0.0.8] + +- Downcast `indices` of sparse matrices if possible when writing to disk via {attr}`anndata.settings.write_csr_csc_indices_with_min_possible_dtype` + ## [0.0.7] - Make the in-memory concatenation strategy configurable for {meth}`annbatch.Loader.__iter__` via a `concat_strategy` argument to `__init__` - sparse on-disk will concatenated then shuffled/yielded (faster, higher memory usage) but dense will be shuffled and then concated/yielded (lower memory usage).