Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes for MuData 0.3 release #53

Merged
merged 14 commits into from
Jun 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.8, 3.9, "3.10"]
python-version: [3.9, "3.10"]

steps:
- uses: actions/checkout@v2
Expand Down
20 changes: 20 additions & 0 deletions docs/source/io/input.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,23 @@ Omics data
When data fromats specific to genomics are of interest, specialised readers can be found in analysis frameworks such as `muon <https://muon.readthedocs.io/>`_. These functions, including the ones for Cell Ranger count matrices as well as Snap files, `are described here <https://muon.readthedocs.io/en/latest/io/input.html>`_.


Remote storage
--------------

MuData objects can be read and cached from remote locations including via HTTP(S) or from S3 buckets. This is achieved via [`fsspec`](https://github.com/fsspec/filesystem_spec). For example, to read a MuData object from a remote server:
::
import fsspec

fname = "https://github.com/gtca/h5xx-datasets/raw/main/datasets/minipbcite.h5mu?download="
with fsspec.open(fname) as f:
mdata = mudata.read_h5mu(f)


A caching layer can be added in the following way:
::
fname_cached = "filecache::" + fname
with fsspec.open(fname_cached, filecache={'cache_storage': '/tmp/'}):
mdata = mudata.read_h5mu(f)


For more `fsspec` usage examples see [its documentation](https://filesystem-spec.readthedocs.io/).
189 changes: 145 additions & 44 deletions mudata/_core/io.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from __future__ import annotations
from typing import TYPE_CHECKING
from typing import Any, Callable, Optional, Union, TYPE_CHECKING

if TYPE_CHECKING:
import zarr
import fsspec

from typing import Union
from os import PathLike
import os
from warnings import warn
Expand All @@ -14,12 +14,12 @@
import h5py
import anndata as ad
from anndata import AnnData
from anndata.compat import _read_attr

# from anndata.compat import _read_hdf5_attribute # 0.8
from pathlib import Path
from scipy import sparse

from mudata import MuData
from .mudata import ModDict, MuData
from .file_backing import MuDataFileManager, AnnDataFileManager

#
Expand All @@ -34,13 +34,13 @@ def _write_h5mu(file: h5py.File, mdata: MuData, write_data=True, **kwargs):
write_elem(
file,
"obs",
mdata.strings_to_categoricals(mdata._shrink_attr("obs", inplace=False)),
mdata.strings_to_categoricals(mdata._shrink_attr("obs", inplace=False).copy()),
dataset_kwargs=kwargs,
)
write_elem(
file,
"var",
mdata.strings_to_categoricals(mdata._shrink_attr("var", inplace=False)),
mdata.strings_to_categoricals(mdata._shrink_attr("var", inplace=False).copy()),
dataset_kwargs=kwargs,
)
write_elem(file, "obsm", dict(mdata.obsm), dataset_kwargs=kwargs)
Expand Down Expand Up @@ -127,13 +127,13 @@ def write_zarr(
write_elem(
file,
"obs",
mdata.strings_to_categoricals(mdata._shrink_attr("obs", inplace=False)),
mdata.strings_to_categoricals(mdata._shrink_attr("obs", inplace=False).copy()),
dataset_kwargs=kwargs,
)
write_elem(
file,
"var",
mdata.strings_to_categoricals(mdata._shrink_attr("var", inplace=False)),
mdata.strings_to_categoricals(mdata._shrink_attr("var", inplace=False).copy()),
dataset_kwargs=kwargs,
)
write_elem(file, "obsm", dict(mdata.obsm), dataset_kwargs=kwargs)
Expand Down Expand Up @@ -193,6 +193,8 @@ def write_zarr(
# Restore top-level annotation
if not mdata.is_view or not mdata.isbacked:
mdata.update()
else:
raise TypeError("Expected MuData or AnnData object")


def write_h5mu(filename: PathLike, mdata: MuData, **kwargs):
Expand Down Expand Up @@ -306,7 +308,7 @@ def write(filename: PathLike, data: Union[MuData, AnnData]):
else:
assert isinstance(data, AnnData), "Only MuData and AnnData objects are accepted"

m = re.search("^(.+)\.(h5mu)[/]?([^/]*)[/]?(.*)$", str(filename))
m = re.search("^(.+).(h5mu)[/]?([^/]*)[/]?(.*)$", str(filename))
if m is not None:
m = m.groups()
else:
Expand Down Expand Up @@ -338,6 +340,54 @@ def write(filename: PathLike, data: Union[MuData, AnnData]):
#


def _validate_h5mu(filename: PathLike) -> (str, Optional[Callable]):
fname: [str, Path, "fsspec.core.io.BufferedReader", "fsspec.core.OpenFile"] = filename
callback = None

try:
with open(filename, "rb") as f:
ish5mu = f.read(6) == b"MuData"
except TypeError as e:
# Support for fsspec
#
# Namely, opening remote files should work via
# with fsspec.open("s3://bucket/file.h5mu") as f:
# mdata = read_h5mu(f)
# or
# mdata = read_h5mu(fsspec.open("s3://bucket/file.h5mu")
if (
filename.__class__.__name__ == "BufferedReader"
or filename.__class__.__name__ == "OpenFile"
):
try:
from fsspec.core import OpenFile

if isinstance(filename, OpenFile):
fname = filename.__enter__()
callback = lambda: fname.__exit__()
ish5mu = fname.read(6) == b"MuData"
except ImportError as e:
raise ImportError(
"To read from remote storage or cache, install fsspec: pip install fsspec"
) from e
else:
ish5mu = False
raise e

if not ish5mu:
if isinstance(filename, str) or isinstance(filename, Path):
if h5py.is_hdf5(filename):
warn(
"The HDF5 file was not created by muon/mudata, we can't guarantee that everything will work correctly"
)
else:
raise ValueError("The file is not an HDF5 file")
else:
warn("Cannot verify that the (remote) file is a valid H5MU file")

return fname, callback


def read_h5mu(filename: PathLike, backed: Union[str, bool, None] = None):
"""
Read MuData object from HDF5 file
Expand All @@ -358,33 +408,24 @@ def read_h5mu(filename: PathLike, backed: Union[str, bool, None] = None):
else:
mode = backed
manager = MuDataFileManager(filename, mode) if backed else MuDataFileManager()
with open(filename, "rb") as f:
ish5mu = f.read(6) == b"MuData"
if not ish5mu:
if h5py.is_hdf5(filename):
warn(
"The HDF5 file was not created by muon, we can't guarantee that everything will work correctly"
)
else:
raise ValueError("The file is not an HDF5 file")

with h5py.File(filename, mode) as f:
fname, callback = _validate_h5mu(filename)

with h5py.File(fname, mode) as f:
d = {}
for k in f.keys():
if k in ["obs", "var"]:
d[k] = read_dataframe(f[k])
if k == "mod":
mods = {}
mods = ModDict()
gmods = f[k]
for m in gmods.keys():
ad = _read_h5mu_mod(gmods[m], manager, backed not in (None, False))
mods[m] = ad

mod_order = None
if "mod-order" in gmods.attrs:
mod_order = gmods.attrs["mod-order"]
# TODO: use in v0.8
# mod_order = _read_hdf5_attribute(k, "mod-order")
mod_order = _read_attr(gmods.attrs, "mod-order")
if mod_order is not None and all([m in gmods for m in mod_order]):
mods = {k: mods[k] for k in mod_order}

Expand All @@ -395,6 +436,9 @@ def read_h5mu(filename: PathLike, backed: Union[str, bool, None] = None):
if "axis" in f.attrs:
d["axis"] = f.attrs["axis"]

if callback is not None:
callback()

mu = MuData._init_from_dict_(**d)
mu.file = manager
return mu
Expand Down Expand Up @@ -435,6 +479,13 @@ def read_zarr(store: Union[str, Path, MutableMapping, zarr.Group]):
for m in gmods.keys():
ad = _read_zarr_mod(gmods[m], manager)
mods[m] = ad

mod_order = None
if "mod-order" in gmods.attrs:
mod_order = _read_attr(gmods.attrs, "mod-order")
if mod_order is not None and all([m in gmods for m in mod_order]):
mods = {k: mods[k] for k in mod_order}

d[k] = mods
else: # Base case
d[k] = read_elem(f[k])
Expand All @@ -458,13 +509,14 @@ def _read_zarr_mod(g: zarr.Group, manager: MuDataFileManager = None, backed: boo
d[k] = read_dataframe(g[k])
elif k == "X":
X = g["X"]
if isinstance(X, zarr.Group):
dtype = X["data"].dtype
elif hasattr(X, "dtype"):
dtype = X.dtype
else:
raise ValueError()
d["dtype"] = dtype
# No more dtype in anndata >=0.10
# if isinstance(X, zarr.Group):
# dtype = X["data"].dtype
# elif hasattr(X, "dtype"):
# dtype = X.dtype
# else:
# raise ValueError()
# d["dtype"] = dtype
if not backed:
d["X"] = read_elem(X)
elif k != "raw":
Expand Down Expand Up @@ -499,13 +551,14 @@ def _read_h5mu_mod(
d[k] = read_dataframe(g[k])
elif k == "X":
X = g["X"]
if isinstance(X, h5py.Group):
dtype = X["data"].dtype
elif hasattr(X, "dtype"):
dtype = X.dtype
else:
raise ValueError()
d["dtype"] = dtype
# No more dtype in anndata >=0.10
# if isinstance(X, h5py.Group):
# dtype = X["data"].dtype
# elif hasattr(X, "dtype"):
# dtype = X.dtype
# else:
# raise ValueError()
# d["dtype"] = dtype
if not backed:
d["X"] = read_elem(X)
elif k != "raw":
Expand All @@ -522,12 +575,12 @@ def _read_h5mu_mod(

def read_h5ad(
filename: PathLike,
mod: str,
mod: Optional[str],
backed: Union[str, bool, None] = None,
) -> AnnData:
"""
Read AnnData object from inside a .h5mu file
or from a standalone .h5ad file
or from a standalone .h5ad file (mod=None)

Currently replicates and modifies anndata._io.h5ad.read_h5ad.
Matrices are loaded as they are in the file (sparse or dense).
Expand All @@ -542,10 +595,37 @@ def read_h5ad(
"r+",
], "Argument `backed` should be boolean, or r/r+, or None"

from anndata import read_h5ad
from anndata._io.specs.registry import read_elem
from anndata._io.h5ad import read_dataframe, _read_raw

d = {}
if mod is None:
try:
return read_h5ad(filename, backed=backed)
except TypeError as e:
fname, callback = filename, None
# Support fsspec
if (
filename.__class__.__name__ == "BufferedReader"
or filename.__class__.__name__ == "OpenFile"
):
try:
from fsspec.core import OpenFile

if isinstance(filename, OpenFile):
fname = filename.__enter__()
callback = lambda: fname.__exit__()
except ImportError as e:
raise ImportError(
"To read from remote storage or cache, install fsspec: pip install fsspec"
) from e

adata = read_h5ad(fname, backed=backed)
if callable is not None:
callback()
return adata
else:
raise e

hdf5_mode = "r"
if backed not in {None, False}:
Expand Down Expand Up @@ -579,22 +659,43 @@ def read(filename: PathLike, **kwargs) -> Union[MuData, AnnData]:
- FILE.h5mu/MODALITY
- FILE.h5mu/mod/MODALITY
- FILE.h5ad

OpenFile and BufferedReader from fsspec are supported for remote storage, e.g.:
- mdata = read(fsspec.open("s3://bucket/file.h5mu")))
- with fsspec.open("s3://bucket/file.h5mu") as f:
mdata = read(f)
- with fsspec.open("https://server/file.h5ad") as f:
adata = read(f)
"""
import re

m = re.search("^(.+)\.(h5mu)[/]?([^/]*)[/]?(.*)$", str(filename))
if filename.__class__.__name__ == "BufferedReader":
raise TypeError(
"Use format-specific functions (read_h5mu, read_zarr) to read from BufferedReader or provide an OpenFile instance."
)
elif filename.__class__.__name__ == "OpenFile":
fname = filename.path
else:
fname = str(filename)

m = re.search("^(.+).(h5mu)[/]?([^/]*)[/]?(.*)$", fname)
if m is not None:
m = m.groups()
else:
if filename.endswith(".h5ad"):
if fname.endswith(".h5ad"):
m = [filename[:-5], "h5ad", "", ""]
else:
raise ValueError("Expected non-empty .h5ad or .h5mu file name")

filepath = ".".join([m[0], m[1]])
if isinstance(filename, str) or isinstance(filename, Path):
pathstrlike = True
filepath = ".".join([m[0], m[1]])
else:
pathstrlike = False
filepath = filename

if m[1] == "h5mu":
if all(i == 0 for i in map(len, m[2:])):
if all(i == 0 for i in map(len, m[2:])) or not pathstrlike:
# Ends with .h5mu
return read_h5mu(filepath, **kwargs)
elif m[3] == "":
Expand Down
Loading
Loading