4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -8,6 +8,10 @@ and this project adheres to [Semantic Versioning][].
[keep a changelog]: https://keepachangelog.com/en/1.0.0/
[semantic versioning]: https://semver.org/spec/v2.0.0.html

## [0.0.8]

- Downcast the `indices` of sparse matrices to a smaller dtype where possible when writing to disk, controlled via {attr}`anndata.settings.write_csr_csc_indices_with_min_possible_dtype` (see the sketch below)
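
  A minimal sketch of the new setting in action, assuming anndata >= 0.12.9; the array sizes and the `example.zarr` path are illustrative only:

  ```python
  import anndata as ad
  import scipy.sparse as sp

  # 500 columns means every CSR column index fits in 16 bits, so with the
  # setting enabled the on-disk `indices` array can be stored smaller than int64.
  adata = ad.AnnData(X=sp.random(1_000, 500, density=0.1, format="csr"))
  with ad.settings.override(write_csr_csc_indices_with_min_possible_dtype=True):
      adata.write_zarr("example.zarr")
  ```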

## [0.0.7]

- Make the in-memory concatenation strategy configurable for {meth}`annbatch.Loader.__iter__` via a `concat_strategy` argument to `__init__`: sparse on-disk data is concatenated and then shuffled/yielded (faster, higher memory usage), while dense data is shuffled and then concatenated/yielded (lower memory usage); see the sketch below
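
  A hypothetical usage sketch: only the `concat_strategy` argument name comes from the changelog entry; the value shown and the bare constructor call are assumptions, not the documented API:

  ```python
  from annbatch import Loader

  # Hypothetical: the strategy value and missing dataset arguments below
  # are illustrative assumptions, not annbatch's documented interface.
  loader = Loader(concat_strategy="concat_then_shuffle")
  for batch in loader:  # __iter__ applies the configured strategy
      ...
  ```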
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -23,7 +23,7 @@ classifiers = [
"Programming Language :: Python :: 3.13",
]
dependencies = [
"anndata[lazy]>=0.12.6",
"anndata[lazy]>=0.12.9",
"dask>=2025.9",
"pandas>=2.2.2",
"scipy>1.15",
78 changes: 39 additions & 39 deletions src/annbatch/io.py
@@ -92,47 +92,47 @@ def write_sharded(
key
The key to which this object should be written - by default the root, in which case the *entire* store (not just the group) is cleared first.
"""
ad.settings.zarr_write_format = 3

def callback(
write_func: ad.experimental.Write,
store: zarr.Group,
elem_name: str,
elem: ad.typing.RWAble,
dataset_kwargs: Mapping[str, Any],
*,
iospec: ad.experimental.IOSpec,
):
# Ensure we're not overriding anything here
dataset_kwargs = dataset_kwargs.copy()
if iospec.encoding_type in {"array"} and (
any(n in store.name for n in {"obsm", "layers", "obsp"}) or "X" == elem_name
with ad.settings.override(zarr_write_format=3, write_csr_csc_indices_with_min_possible_dtype=True):

def callback(
write_func: ad.experimental.Write,
store: zarr.Group,
elem_name: str,
elem: ad.typing.RWAble,
dataset_kwargs: Mapping[str, Any],
*,
iospec: ad.experimental.IOSpec,
):
# Get either the desired size or the next multiple down to ensure divisibility of chunks and shards
shard_size = min(dense_shard_size, _round_down(elem.shape[0], dense_chunk_size))
chunk_size = min(dense_chunk_size, _round_down(elem.shape[0], dense_chunk_size))
# If the shape is smaller than the computed chunk size (should be impossible given the rounding) or the rounding produced a 0-size chunk, raise an error
if elem.shape[0] < chunk_size or chunk_size == 0:
raise ValueError(
f"Choose a dense shard obs {dense_shard_size} and chunk obs {dense_chunk_size} with non-zero size less than the number of observations {elem.shape[0]}"
)
dataset_kwargs = {
**dataset_kwargs,
"shards": (shard_size,) + elem.shape[1:], # only shard over 1st dim
"chunks": (chunk_size,) + elem.shape[1:], # only chunk over 1st dim
"compressors": compressors,
}
elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}:
dataset_kwargs = {
**dataset_kwargs,
"shards": (sparse_shard_size,),
"chunks": (sparse_chunk_size,),
"compressors": compressors,
}
write_func(store, elem_name, elem, dataset_kwargs=dataset_kwargs)
# Ensure we're not overriding anything here
dataset_kwargs = dataset_kwargs.copy()
if iospec.encoding_type in {"array"} and (
any(n in store.name for n in {"obsm", "layers", "obsp"}) or "X" == elem_name
):
# Get either the desired size or the next multiple down to ensure divisibility of chunks and shards
shard_size = min(dense_shard_size, _round_down(elem.shape[0], dense_chunk_size))
chunk_size = min(dense_chunk_size, _round_down(elem.shape[0], dense_chunk_size))
# If the shape is smaller than the computed chunk size (should be impossible given the rounding) or the rounding produced a 0-size chunk, raise an error
if elem.shape[0] < chunk_size or chunk_size == 0:
raise ValueError(
f"Choose a dense shard obs {dense_shard_size} and chunk obs {dense_chunk_size} with non-zero size less than the number of observations {elem.shape[0]}"
)
dataset_kwargs = {
**dataset_kwargs,
"shards": (shard_size,) + elem.shape[1:], # only shard over 1st dim
"chunks": (chunk_size,) + elem.shape[1:], # only chunk over 1st dim
"compressors": compressors,
}
elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}:
dataset_kwargs = {
**dataset_kwargs,
"shards": (sparse_shard_size,),
"chunks": (sparse_chunk_size,),
"compressors": compressors,
}
write_func(store, elem_name, elem, dataset_kwargs=dataset_kwargs)

ad.experimental.write_dispatched(group, "/" if key is None else key, adata, callback=callback)
zarr.consolidate_metadata(group.store)
ad.experimental.write_dispatched(group, "/" if key is None else key, adata, callback=callback)
zarr.consolidate_metadata(group.store)


def _check_for_mismatched_keys[T: zarr.Group | h5py.Group | PathLike[str] | str](
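A minimal sketch (not the library's code) of how the callback above sizes dense chunks and shards along the observation axis; `_round_down(n, k)` is assumed to round `n` down to the nearest multiple of `k`:

```python
def _round_down(n: int, k: int) -> int:
    # Nearest multiple of k that is <= n
    return (n // k) * k

def pick_dense_sizes(n_obs: int, dense_chunk_size: int, dense_shard_size: int) -> tuple[int, int]:
    # Round down to the row count's nearest multiple of the chunk size so
    # chunks and shards still divide evenly (this assumes dense_shard_size
    # is itself a multiple of dense_chunk_size).
    shard_size = min(dense_shard_size, _round_down(n_obs, dense_chunk_size))
    chunk_size = min(dense_chunk_size, _round_down(n_obs, dense_chunk_size))
    if n_obs < chunk_size or chunk_size == 0:
        raise ValueError("chunk size must be non-zero and no larger than n_obs")
    return shard_size, chunk_size

# 1_000 observations with chunks of 300: _round_down(1_000, 300) == 900,
# so the shard shrinks to 900 while the chunk stays 300 (900 % 300 == 0).
assert pick_dense_sizes(1_000, 300, 900) == (900, 300)
```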
3 changes: 3 additions & 0 deletions tests/test_preshuffle.py
@@ -258,8 +258,11 @@ def test_store_creation(

pd.testing.assert_frame_equal(adata.obs, adata_orig.obs)
z = zarr.open(output_path / "dataset_0")
# assert chunk behavior
assert z["obsm"]["arr"].chunks[0] == 5, z["obsm"]["arr"]
assert z["X"]["indices"].chunks[0] == 10
# ensure proper downcasting
assert z["X"]["indices"].dtype == (np.uint16 if adata.X.shape[1] >= 256 else np.uint8)


def _read_lazy_x_and_obs_only_from_raw(path) -> ad.AnnData:
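A short sketch of the invariant behind the new dtype assertion above: for a CSR matrix, `indices` stores column positions, so the column count bounds its values; the assertion implies anndata sizes the dtype by the dimension itself, i.e. uint8 only while `shape[1] < 256`:

```python
import numpy as np
import scipy.sparse as sp

x = sp.random(100, 300, density=0.1, format="csr")
assert x.indices.max() < x.shape[1]  # column indices never reach shape[1]
# Smallest unsigned dtype that can hold the dimension itself:
print(np.min_scalar_type(x.shape[1]))  # uint16 for 300 columns
```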