scverse · gtca · Jun 30, 2024 · Aug 15, 2023 · Sep 7, 2023 · Sep 8, 2023
diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml
@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.8, 3.9, "3.10"]
+        python-version: [3.9, "3.10"]
 
     steps:
     - uses: actions/checkout@v2

diff --git a/docs/source/io/input.rst b/docs/source/io/input.rst
@@ -46,3 +46,23 @@ Omics data
 When data fromats specific to genomics are of interest, specialised readers can be found in analysis frameworks such as `muon <https://muon.readthedocs.io/>`_. These functions, including the ones for Cell Ranger count matrices as well as Snap files, `are described here <https://muon.readthedocs.io/en/latest/io/input.html>`_.
 
 
+Remote storage
+--------------
+
+MuData objects can be read and cached from remote locations, including via HTTP(S) or from S3 buckets. This is achieved via `fsspec <https://github.com/fsspec/filesystem_spec>`_. For example, to read a MuData object from a remote server:
+::
+   import fsspec
+
+   fname = "https://github.com/gtca/h5xx-datasets/raw/main/datasets/minipbcite.h5mu?download="
+   with fsspec.open(fname) as f:
+      mdata = mudata.read_h5mu(f)
+
+
+A caching layer can be added in the following way:
+::
+   fname_cached = "filecache::" + fname
+   with fsspec.open(fname_cached, filecache={'cache_storage': '/tmp/'}) as f:
+      mdata = mudata.read_h5mu(f)
+
+
+For more ``fsspec`` usage examples see `its documentation <https://filesystem-spec.readthedocs.io/>`_.
diff --git a/mudata/_core/io.py b/mudata/_core/io.py
@@ -1,10 +1,10 @@
 from __future__ import annotations
-from typing import TYPE_CHECKING
+from typing import Any, Callable, Optional, Union, TYPE_CHECKING
 
 if TYPE_CHECKING:
     import zarr
+    import fsspec
 
-from typing import Union
 from os import PathLike
 import os
 from warnings import warn
@@ -14,12 +14,12 @@
 import h5py
 import anndata as ad
 from anndata import AnnData
+from anndata.compat import _read_attr
 
-# from anndata.compat import _read_hdf5_attribute  # 0.8
 from pathlib import Path
 from scipy import sparse
 
-from mudata import MuData
+from .mudata import ModDict, MuData
 from .file_backing import MuDataFileManager, AnnDataFileManager
 
 #
@@ -34,13 +34,13 @@ def _write_h5mu(file: h5py.File, mdata: MuData, write_data=True, **kwargs):
     write_elem(
         file,
         "obs",
-        mdata.strings_to_categoricals(mdata._shrink_attr("obs", inplace=False)),
+        mdata.strings_to_categoricals(mdata._shrink_attr("obs", inplace=False).copy()),
         dataset_kwargs=kwargs,
     )
     write_elem(
         file,
         "var",
-        mdata.strings_to_categoricals(mdata._shrink_attr("var", inplace=False)),
+        mdata.strings_to_categoricals(mdata._shrink_attr("var", inplace=False).copy()),
         dataset_kwargs=kwargs,
     )
     write_elem(file, "obsm", dict(mdata.obsm), dataset_kwargs=kwargs)
@@ -127,13 +127,13 @@ def write_zarr(
         write_elem(
             file,
             "obs",
-            mdata.strings_to_categoricals(mdata._shrink_attr("obs", inplace=False)),
+            mdata.strings_to_categoricals(mdata._shrink_attr("obs", inplace=False).copy()),
             dataset_kwargs=kwargs,
         )
         write_elem(
             file,
             "var",
-            mdata.strings_to_categoricals(mdata._shrink_attr("var", inplace=False)),
+            mdata.strings_to_categoricals(mdata._shrink_attr("var", inplace=False).copy()),
             dataset_kwargs=kwargs,
         )
         write_elem(file, "obsm", dict(mdata.obsm), dataset_kwargs=kwargs)
@@ -193,6 +193,8 @@ def write_zarr(
         # Restore top-level annotation
         if not mdata.is_view or not mdata.isbacked:
             mdata.update()
+    else:
+        raise TypeError("Expected MuData or AnnData object")
 
 
 def write_h5mu(filename: PathLike, mdata: MuData, **kwargs):
@@ -306,7 +308,7 @@ def write(filename: PathLike, data: Union[MuData, AnnData]):
     else:
         assert isinstance(data, AnnData), "Only MuData and AnnData objects are accepted"
 
-        m = re.search("^(.+)\.(h5mu)[/]?([^/]*)[/]?(.*)$", str(filename))
+        m = re.search("^(.+).(h5mu)[/]?([^/]*)[/]?(.*)$", str(filename))
         if m is not None:
             m = m.groups()
         else:
@@ -338,6 +340,54 @@ def write(filename: PathLike, data: Union[MuData, AnnData]):
 #
 
 
+def _validate_h5mu(filename: PathLike) -> tuple[Any, Optional[Callable]]:
+    """
+    Check the MuData signature of a file and prepare it for h5py.File.
+
+    Returns the object to pass to h5py.File and a no-argument callback that
+    closes a file opened here (None if nothing was opened). Raises ValueError
+    when a local path is not an HDF5 file.
+    """
+    fname: Union[str, Path, "fsspec.core.io.BufferedReader", "fsspec.core.OpenFile"] = filename
+    callback = None
+
+    try:
+        with open(filename, "rb") as f:
+            ish5mu = f.read(6) == b"MuData"
+    except TypeError:
+        # open() rejects file-like objects: handle fsspec readers, e.g.
+        #   with fsspec.open("s3://bucket/file.h5mu") as f: read_h5mu(f)
+        #   read_h5mu(fsspec.open("s3://bucket/file.h5mu"))
+        kind = type(filename).__name__
+        if kind not in ("BufferedReader", "OpenFile"):
+            raise  # bare raise keeps the original traceback
+        if kind == "OpenFile":
+            try:
+                from fsspec.core import OpenFile
+            except ImportError as e:
+                raise ImportError(
+                    "To read from remote storage or cache, install fsspec: pip install fsspec"
+                ) from e
+            if isinstance(filename, OpenFile):
+                fname = filename.__enter__()
+                # OpenFile.close() releases every wrapped file object
+                callback = filename.close
+        ish5mu = fname.read(6) == b"MuData"
+
+    if not ish5mu:
+        if isinstance(filename, (str, Path)):
+            if h5py.is_hdf5(filename):
+                warn(
+                    "The HDF5 file was not created by muon/mudata, we can't guarantee that everything will work correctly"
+                )
+            else:
+                raise ValueError("The file is not an HDF5 file")
+        else:
+            warn("Cannot verify that the (remote) file is a valid H5MU file")
+
+    return fname, callback
+
+
 def read_h5mu(filename: PathLike, backed: Union[str, bool, None] = None):
     """
     Read MuData object from HDF5 file
@@ -358,33 +408,24 @@ def read_h5mu(filename: PathLike, backed: Union[str, bool, None] = None):
     else:
         mode = backed
     manager = MuDataFileManager(filename, mode) if backed else MuDataFileManager()
-    with open(filename, "rb") as f:
-        ish5mu = f.read(6) == b"MuData"
-    if not ish5mu:
-        if h5py.is_hdf5(filename):
-            warn(
-                "The HDF5 file was not created by muon, we can't guarantee that everything will work correctly"
-            )
-        else:
-            raise ValueError("The file is not an HDF5 file")
 
-    with h5py.File(filename, mode) as f:
+    fname, callback = _validate_h5mu(filename)
+
+    with h5py.File(fname, mode) as f:
         d = {}
         for k in f.keys():
             if k in ["obs", "var"]:
                 d[k] = read_dataframe(f[k])
             if k == "mod":
-                mods = {}
+                mods = ModDict()
                 gmods = f[k]
                 for m in gmods.keys():
                     ad = _read_h5mu_mod(gmods[m], manager, backed not in (None, False))
                     mods[m] = ad
 
                 mod_order = None
                 if "mod-order" in gmods.attrs:
-                    mod_order = gmods.attrs["mod-order"]
-                # TODO: use in v0.8
-                # mod_order = _read_hdf5_attribute(k, "mod-order")
+                    mod_order = _read_attr(gmods.attrs, "mod-order")
                 if mod_order is not None and all([m in gmods for m in mod_order]):
                     mods = {k: mods[k] for k in mod_order}
 
@@ -395,6 +436,9 @@ def read_h5mu(filename: PathLike, backed: Union[str, bool, None] = None):
         if "axis" in f.attrs:
             d["axis"] = f.attrs["axis"]
 
+        if callback is not None:
+            callback()
+
     mu = MuData._init_from_dict_(**d)
     mu.file = manager
     return mu
@@ -435,6 +479,13 @@ def read_zarr(store: Union[str, Path, MutableMapping, zarr.Group]):
             for m in gmods.keys():
                 ad = _read_zarr_mod(gmods[m], manager)
                 mods[m] = ad
+
+            mod_order = None
+            if "mod-order" in gmods.attrs:
+                mod_order = _read_attr(gmods.attrs, "mod-order")
+            if mod_order is not None and all([m in gmods for m in mod_order]):
+                mods = {k: mods[k] for k in mod_order}
+
             d[k] = mods
         else:  # Base case
             d[k] = read_elem(f[k])
@@ -458,13 +509,14 @@ def _read_zarr_mod(g: zarr.Group, manager: MuDataFileManager = None, backed: boo
             d[k] = read_dataframe(g[k])
         elif k == "X":
             X = g["X"]
-            if isinstance(X, zarr.Group):
-                dtype = X["data"].dtype
-            elif hasattr(X, "dtype"):
-                dtype = X.dtype
-            else:
-                raise ValueError()
-            d["dtype"] = dtype
+            # No more dtype in anndata >=0.10
+            # if isinstance(X, zarr.Group):
+            #     dtype = X["data"].dtype
+            # elif hasattr(X, "dtype"):
+            #     dtype = X.dtype
+            # else:
+            #     raise ValueError()
+            # d["dtype"] = dtype
             if not backed:
                 d["X"] = read_elem(X)
         elif k != "raw":
@@ -499,13 +551,14 @@ def _read_h5mu_mod(
             d[k] = read_dataframe(g[k])
         elif k == "X":
             X = g["X"]
-            if isinstance(X, h5py.Group):
-                dtype = X["data"].dtype
-            elif hasattr(X, "dtype"):
-                dtype = X.dtype
-            else:
-                raise ValueError()
-            d["dtype"] = dtype
+            # No more dtype in anndata >=0.10
+            # if isinstance(X, h5py.Group):
+            #     dtype = X["data"].dtype
+            # elif hasattr(X, "dtype"):
+            #     dtype = X.dtype
+            # else:
+            #     raise ValueError()
+            # d["dtype"] = dtype
             if not backed:
                 d["X"] = read_elem(X)
         elif k != "raw":
@@ -522,12 +575,12 @@ def _read_h5mu_mod(
 
 def read_h5ad(
     filename: PathLike,
-    mod: str,
+    mod: Optional[str],
     backed: Union[str, bool, None] = None,
 ) -> AnnData:
     """
     Read AnnData object from inside a .h5mu file
-    or from a standalone .h5ad file
+    or from a standalone .h5ad file (mod=None)
 
     Currently replicates and modifies anndata._io.h5ad.read_h5ad.
     Matrices are loaded as they are in the file (sparse or dense).
@@ -542,10 +595,37 @@ def read_h5ad(
         "r+",
     ], "Argument `backed` should be boolean, or r/r+, or None"
 
+    from anndata import read_h5ad
     from anndata._io.specs.registry import read_elem
     from anndata._io.h5ad import read_dataframe, _read_raw
 
-    d = {}
+    if mod is None:
+        try:
+            return read_h5ad(filename, backed=backed)
+        except TypeError as e:
+            fname, callback = filename, None
+            # Support fsspec
+            if (
+                filename.__class__.__name__ == "BufferedReader"
+                or filename.__class__.__name__ == "OpenFile"
+            ):
+                try:
+                    from fsspec.core import OpenFile
+
+                    if isinstance(filename, OpenFile):
+                        fname = filename.__enter__()
+                        callback = lambda: fname.__exit__()
+                except ImportError as e:
+                    raise ImportError(
+                        "To read from remote storage or cache, install fsspec: pip install fsspec"
+                    ) from e
+
+                adata = read_h5ad(fname, backed=backed)
+                if callable is not None:
+                    callback()
+                return adata
+            else:
+                raise e
 
     hdf5_mode = "r"
     if backed not in {None, False}:
@@ -579,22 +659,43 @@ def read(filename: PathLike, **kwargs) -> Union[MuData, AnnData]:
       - FILE.h5mu/MODALITY
       - FILE.h5mu/mod/MODALITY
       - FILE.h5ad
+
+    OpenFile and BufferedReader from fsspec are supported for remote storage, e.g.:
+      - mdata = read(fsspec.open("s3://bucket/file.h5mu")))
+      - with fsspec.open("s3://bucket/file.h5mu") as f:
+            mdata = read(f)
+      - with fsspec.open("https://server/file.h5ad") as f:
+            adata = read(f)
     """
     import re
 
-    m = re.search("^(.+)\.(h5mu)[/]?([^/]*)[/]?(.*)$", str(filename))
+    if filename.__class__.__name__ == "BufferedReader":
+        raise TypeError(
+            "Use format-specific functions (read_h5mu, read_zarr) to read from BufferedReader or provide an OpenFile instance."
+        )
+    elif filename.__class__.__name__ == "OpenFile":
+        fname = filename.path
+    else:
+        fname = str(filename)
+
+    m = re.search("^(.+).(h5mu)[/]?([^/]*)[/]?(.*)$", fname)
     if m is not None:
         m = m.groups()
     else:
-        if filename.endswith(".h5ad"):
+        if fname.endswith(".h5ad"):
             m = [filename[:-5], "h5ad", "", ""]
         else:
             raise ValueError("Expected non-empty .h5ad or .h5mu file name")
 
-    filepath = ".".join([m[0], m[1]])
+    if isinstance(filename, str) or isinstance(filename, Path):
+        pathstrlike = True
+        filepath = ".".join([m[0], m[1]])
+    else:
+        pathstrlike = False
+        filepath = filename
 
     if m[1] == "h5mu":
-        if all(i == 0 for i in map(len, m[2:])):
+        if all(i == 0 for i in map(len, m[2:])) or not pathstrlike:
             # Ends with .h5mu
             return read_h5mu(filepath, **kwargs)
         elif m[3] == "":