# On-Disk Concatenation of AnnData Files

**Author:** Selman Özleyen

## Initializing

First, let's do our imports and initialize the adata objects with the help of the `adata_with_dask` function defined below.

In [1]:
%load_ext memory_profiler


import numpy as np
from scipy import sparse
import pandas as pd
from anndata.tests.helpers import gen_typed_df
from anndata.experimental import write_elem
import zarr
import anndata
from pathlib import Path
import glob
import tempfile
import dask.distributed as dd

import anndata
from anndata._core.merge import concat
import dask.array as da
import zarr

from anndata.experimental import read_dispatched, read_elem, concat_on_disk


# Directory that receives the generated Zarr stores and the concatenated output.
OUTDIR = Path("tmpdata")





# Parameter grid for the generated test datasets.
shapes = ["fat", "tall", "square"]  # "fat" shrinks the row count, "tall" the column count
sizes = [10_000]  # base dimension used for both rows and columns
densities = [0.1, 1]  # passed to sparse.random; a density of 1 is stored as a dense array
num_runs = 1  # how many times the whole grid is generated


def create_adata(shape, X):
    """Wrap ``X`` in an AnnData object with generated obs/var annotations.

    Parameters
    ----------
    shape
        ``(n_obs, n_vars)`` pair; sets how many rows the generated ``obs``
        and ``var`` frames have.
    X
        Data matrix handed to ``anndata.AnnData`` unchanged.

    Returns
    -------
    anndata.AnnData
        Object whose ``obs`` and ``var`` frames come from ``gen_typed_df`` and
        are indexed ``cell0, cell1, ...`` and ``gene0, gene1, ...``.
    """
    n_obs, n_vars = shape
    obs = gen_typed_df(n_obs, pd.Index([f"cell{i}" for i in range(n_obs)]))
    var = gen_typed_df(n_vars, pd.Index([f"gene{i}" for i in range(n_vars)]))
    # For #147
    obs = obs.rename(columns={"cat": "obs_cat"})
    var = var.rename(columns={"cat": "var_cat"})
    return anndata.AnnData(X, obs=obs, var=var)



In [95]:
# Write one Zarr store per (run, shape, size, density, array format) combination.
file_id = 1
for _ in range(num_runs):
    for shape in shapes:
        for size in sizes:
            for density in densities:
                # A density of 1 is stored as a dense NumPy array; anything
                # lower is written once as CSC and once as CSR.
                is_dense = density == 1
                if is_dense:
                    array_names = ["np"]
                    array_funcs = [lambda x: x.toarray()]
                else:
                    array_names = ["csc", "csr"]
                    array_funcs = [sparse.csc_matrix, sparse.csr_matrix]

                for array_func, array_name in zip(array_funcs, array_names):
                    # Non-square shapes shrink one dimension to 70-90% of
                    # `size`: rows for "fat", columns for "tall".
                    M = N = size
                    if shape != "square":
                        other_size = int(size * np.random.uniform(0.7, 0.9))
                        if shape == "fat":
                            M = other_size
                        elif shape == "tall":
                            N = other_size

                    # The random CSC matrix is kept as an inline temporary so
                    # it can be released as soon as it has been converted.
                    X = array_func(
                        sparse.random(M, N, density=density, format="csc")
                    )
                    adata = create_adata((M, N), X)

                    fname = f"{OUTDIR}/{file_id:02d}_{shape}_{array_name}"
                    file_id += 1
                    print(f"wrote {M}x{N}_density={density:0.1f}_{array_name} -> {fname}")

                    output_zarr_path = f"{fname}.zarr"
                    if is_dense:
                        # Dense data is written through write_elem on an
                        # explicitly opened group, then its metadata is
                        # consolidated.
                        z = zarr.open_group(output_zarr_path)
                        write_elem(z, "/", adata)
                        zarr.consolidate_metadata(z.store)
                    else:
                        adata.write_zarr(output_zarr_path)


wrote 7475x10000_density=0.1_csc -> tmpdata/01_fat_csc
wrote 7346x10000_density=0.1_csr -> tmpdata/02_fat_csr
wrote 7401x10000_density=1.0_np -> tmpdata/03_fat_np
wrote 10000x7039_density=0.1_csc -> tmpdata/04_tall_csc
wrote 10000x7889_density=0.1_csr -> tmpdata/05_tall_csr
wrote 10000x8142_density=1.0_np -> tmpdata/06_tall_np
wrote 10000x10000_density=0.1_csc -> tmpdata/07_square_csc
wrote 10000x10000_density=0.1_csr -> tmpdata/08_square_csr
wrote 10000x10000_density=1.0_np -> tmpdata/09_square_np


In [2]:
# Group the generated stores by array format and by shape, based on the tag
# embedded in each store's file name.
nps, csrs, cscs, fats, talls, squares = (
    set(glob.glob(f"{OUTDIR}/*{tag}*"))
    for tag in ("np", "csr", "csc", "fat", "tall", "square")
)


In [3]:
nps

{'tmpdata/03_fat_np.zarr',
 'tmpdata/06_tall_np.zarr',
 'tmpdata/09_square_np.zarr'}

In [17]:
def concat_on_disk_wrapper(filepaths, writepth, axis):
    """Run ``concat_on_disk`` while a memory-capped local Dask cluster is active.

    A ``LocalCluster`` with one worker, one thread and a 400MB memory limit is
    started, and a ``Client`` is connected to it for the duration of the call.
    Both are managed by the ``with`` statement, so they are closed again when
    the call finishes or raises.

    Parameters
    ----------
    filepaths
        Input stores, passed through to ``concat_on_disk``.
    writepth
        Output location; ``overwrite=True`` is passed, so an existing result
        is replaced.
    axis
        Concatenation axis forwarded to ``concat_on_disk``.
    """
    # The client only needs to exist while the concatenation runs, so it is
    # not bound to a name.
    with dd.LocalCluster(
        memory_limit="400MB", n_workers=1, threads_per_worker=1
    ) as cluster, dd.Client(cluster):
        concat_on_disk(filepaths, writepth, axis=axis, overwrite=True)


In [18]:
# Map each benchmark case to the stores that are concatenated and the axis
# used. The axis is fully determined by the case name, so every case is listed
# once with its axis instead of looping over axes.
data = dict()
for fileset, (candidates, axis) in {
    "csrs": (csrs, 0),
    "nps-0": (nps, 0),
    "nps-1": (nps, 1),
    "cscs": (cscs, 1),
}.items():
    # Axis 0 keeps only the "fat" and "square" stores, axis 1 only the "tall"
    # and "square" ones.
    if axis == 0:
        filepaths = candidates.intersection(fats.union(squares))
    else:
        filepaths = candidates.intersection(talls.union(squares))

    data[fileset] = filepaths, axis


Here is what our adata looks like:

In [19]:
# Destination Zarr store for the concatenated output.
writepth = OUTDIR / "out.zarr"

In [20]:
# Pick the "nps-1" case: the dense (NumPy) stores together with their axis.
filepaths, axis = data["nps-1"]

In [21]:
# Run the on-disk concatenation under the memory-limited local Dask cluster.
concat_on_disk_wrapper(filepaths, writepth, axis=axis)

2023-07-24 15:37:24,507 - distributed.protocol.pickle - ERROR - Failed to serialize <ToPickle: HighLevelGraph with 1 layers.
<dask.highlevelgraph.HighLevelGraph object at 0x7f59aa12c6d0>
 0. 140023082158464
>.
Traceback (most recent call last):
  File "/home/sel/mambaforge/envs/dask/lib/python3.9/site-packages/distributed/protocol/pickle.py", line 63, in dumps
    result = pickle.dumps(x, **dump_kwargs)
TypeError: cannot pickle '_thread.lock' object

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/sel/mambaforge/envs/dask/lib/python3.9/site-packages/distributed/protocol/pickle.py", line 68, in dumps
    pickler.dump(x)
TypeError: cannot pickle '_thread.lock' object

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/sel/mambaforge/envs/dask/lib/python3.9/site-packages/distributed/protocol/pickle.py", line 81, in dumps
    result = cloudpickle.dumps(x,

TypeError: ('Could not serialize object of type HighLevelGraph', '<ToPickle: HighLevelGraph with 1 layers.\n<dask.highlevelgraph.HighLevelGraph object at 0x7f59aa12c6d0>\n 0. 140023082158464\n>')

Above error raised while writing key 'X' of <class 'zarr.hierarchy.Group'> to <zarr.storage.DirectoryStore object at 0x7f59aa13b610>

In [77]:
filepaths

{'tmpdata/02_fat_csr.zarr', 'tmpdata/08_square_csr.zarr'}