From 1d4bfd9532d56982b6080b1dc048a57c49d5d170 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 3 Aug 2023 11:08:10 +0200
Subject: [PATCH 01/89] (chore): migrate `anndata` PR

---
 scanpy/get/__init__.py       |   2 +
 scanpy/get/groupby.py        | 543 +++++++++++++++++++++++++++++++++++
 scanpy/tests/test_groupby.py | 108 +++++++
 3 files changed, 653 insertions(+)
 create mode 100644 scanpy/get/groupby.py
 create mode 100644 scanpy/tests/test_groupby.py

diff --git a/scanpy/get/__init__.py b/scanpy/get/__init__.py
index 9716b416b..243669b0f 100644
--- a/scanpy/get/__init__.py
+++ b/scanpy/get/__init__.py
@@ -3,3 +3,5 @@
 # Private
 from .get import _get_obs_rep, _set_obs_rep
+
+from .groupby import GroupBy
diff --git a/scanpy/get/groupby.py b/scanpy/get/groupby.py
new file mode 100644
index 000000000..38328fcaa
--- /dev/null
+++ b/scanpy/get/groupby.py
@@ -0,0 +1,543 @@
+from collections import defaultdict
+from typing import (
+    Optional,
+    Iterable,
+    AbstractSet,
+    Sequence,
+    Collection,
+    Tuple,
+    Union,
+    NamedTuple,
+    Literal,
+)
+
+from anndata import AnnData, utils
+import numpy as np
+import pandas as pd
+import collections.abc as cabc
+from scipy.sparse import coo_matrix, dia_matrix
+
+
+class CountMeanVar(NamedTuple):
+    count: Optional[pd.Series] = None
+    mean: pd.DataFrame = None
+    var: Optional[pd.DataFrame] = None
+
+    @classmethod
+    def from_df(cls, df: pd.DataFrame) -> "CountMeanVar":
+        return CountMeanVar(
+            count=df["count"] if "count" in df.columns else None,
+            mean=df["mean"],
+            var=df["var"] if "var" in df.columns else None,
+        )
+
+    def map(self, f=lambda v: v, **fs) -> "CountMeanVar":
+        fs = defaultdict(lambda: f, fs)
+        return CountMeanVar(
+            **{
+                stat: fs[stat](val)
+                for stat, val in self._asdict().items()
+                if val is not None
+            }
+        )
+
+
+Score = Literal[
+    "diff-score", "fold-score", "t-score", "v-score", "t-score-pooled", "v-score-pooled"
+]
+
+
+class GroupBy:
+    """
+    Functionality for grouping and aggregating AnnData observations by key, per variable.
+
+    There is currently support for count, sum, mean, and variance per group, and for scores
+    derived from these per pair of groups.
+
+    Set `weight` for weighted sum, mean, and variance.
+
+    Set `explode` to True and use a key of type tuple to assign observations to multiple groups.
+    In this case, repetition of a key confers multiplicity of the observation in the group.
+
+    Set `key_set` to a list of keys to most efficiently compute results for a subset of groups.
+
+    NaN values propagate, with the exception that `score_pairs` sets non-finite scores to 0 by
+    default. Use the pd_* methods to instead mask NaN values. These slower methods convert data
+    to dense format and do not currently support weight, explode, or key_set.
+
+    **Implementation**
+
+    Moments are computed using weighted sum aggregation of AnnData observations per variable
+    (i.e., feature) via multiplication by a sparse coordinate matrix A, exposed by
+    `sparse_aggregator`. The approach works with data in ndarray or scipy sparse formats, with
+    no view or copy overhead on runtime or memory, even when filtering keys.
+
+    Runtime is effectively computation of the product A * X, i.e. the count of (non-zero)
+    entries in X with multiplicity the number of group memberships for that entry. This is
+    O(data) for partitions (each observation belonging to exactly one group), independent of
+    the number of groups.
+
+    To compute scores, first statistics are computed for each group in at least one pair, and
+    then scores are computed for each pair using the statistics.
Runtime is dominated by the + former, so is effectively independent of the number of pairs. + + Params + ------ + adata + key + Group key field in adata.obs. + weight + Weight field in adata.obs of type float. + explode + If False, each observation is assigned to the group keyed by adata.obs[key]. + If True, each observation is assigned to all groups in tuple adata.obs[key]. + key_set + Subset of keys to which to filter. + """ + + adata: AnnData + key: str + weight: Optional[str] + explode: bool + key_set: AbstractSet[str] + _key_index: Optional[np.ndarray] # caution, may be stale if attributes are updated + + def __init__( + self, + adata: AnnData, + key: str, + *, + weight: Optional[str] = None, + explode: bool = False, + key_set: Optional[Iterable[str]] = None, + ): + self.adata = adata + self.key = key + self.weight = weight + self.explode = explode + self.key_set = None if key_set is None else dict.fromkeys(key_set).keys() + self._key_index = None + + def count(self) -> pd.Series: + """ + Count the number of observations in each group. + + Returns + ------- + Series of counts indexed by key. + """ + keys, key_index, _, _ = self._extract_indices() + count_ = np.bincount(key_index) + return pd.Series( + data=count_, + index=pd.Index(keys, name=self.key, tupleize_cols=False), + name="count", + ) + + def sum(self) -> pd.DataFrame: + """ + Compute the sum per feature per group of observations. + + Returns + ------- + DataFrame of sums indexed by key with columns from adata. + """ + A, keys = self.sparse_aggregator(normalize=False) + X = self.adata.X + return pd.DataFrame( + index=pd.Index(keys, name=self.key, tupleize_cols=False), + columns=self.adata.var_names.copy(), + data=utils.asarray(A * X), + ) + + def mean(self) -> pd.DataFrame: + """ + Compute the mean per feature per group of observations. + + Returns + ------- + DataFrame of means indexed by key with columns from adata. + """ + A, keys = self.sparse_aggregator(normalize=True) + X = self.adata.X + return pd.DataFrame( + index=pd.Index(keys, name=self.key, tupleize_cols=False), + columns=self.adata.var_names.copy(), + data=utils.asarray(A * X), + ) + + def var(self, dof: int = 1) -> pd.DataFrame: + """ + Compute the variance per feature per group of observations. + + See also count_mean_var, which is more efficient when the mean is also desired. + + Params + ------ + dof + Degrees of freedom for variance. + + Returns + ------- + DataFrame of variances indexed by key with columns from adata. + """ + return self.count_mean_var(dof).var + + def count_mean_var(self, dof: int = 1) -> CountMeanVar: + """ + Compute the count, as well as mean and variance per feature, per group of observations. + + The formula `Var(X) = E(X^2) - E(X)^2` suffers loss of precision when the variance is a + very small fraction of the squared mean. In particular, when X is constant, the formula may + nonetheless be non-zero. By default, our implementation resets the variance to exactly zero + when the computed variance, relative to the squared mean, nears limit of precision of the + floating-point significand. + + Params + ------ + dof + Degrees of freedom for variance. + + Returns + ------- + Dictionary with keys (count, mean, var) and values the corresponding Series and DataFrames. 
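+
+        Example (illustrative sketch; ``adata`` and the ``"louvain"`` key are
+        placeholders, not part of this patch)::
+
+            stats = GroupBy(adata, key="louvain").count_mean_var()
+            stats.count  # pd.Series: number of observations per group
+            stats.mean   # pd.DataFrame: per-group means, groups x variables
+            stats.var    # pd.DataFrame: per-group variances, groups x variables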
+ """ + assert dof >= 0 + A, keys = self.sparse_aggregator(normalize=True) + X = self.adata.X + count_ = np.bincount(self._key_index) + mean_ = utils.asarray(A @ X) + mean_sq = utils.asarray(A @ _power(X, 2)) + if self.weight is None: + sq_mean = mean_ ** 2 + else: + A_unweighted, _ = GroupBy( + self.adata, self.key, explode=self.explode, key_set=self.key_set + ).sparse_aggregator() + mean_unweighted = utils.asarray(A_unweighted * X) + sq_mean = 2 * mean_ * mean_unweighted + mean_unweighted ** 2 + var_ = mean_sq - sq_mean + precision = 2 << (42 if X.dtype == np.float64 else 20) + # detects loss of precision in mean_sq - sq_mean, which suggests variance is 0 + var_[precision * var_ < sq_mean] = 0 + if dof != 0: + var_ *= (count_ / (count_ - dof))[:, np.newaxis] + + index = pd.Index(keys, name=self.key, tupleize_cols=False) + count_sr = pd.Series(index=index, data=count_, name="count") + mean_df = pd.DataFrame( + index=index.copy(), columns=self.adata.var_names.copy(), data=mean_ + ) + var_df = pd.DataFrame( + index=index.copy(), columns=self.adata.var_names.copy(), data=var_ + ) + return CountMeanVar(count=count_sr, mean=mean_df, var=var_df) + + def score_pairs( + self, + score: Score, + pairs: Collection[Tuple[str, str]], + *, + return_stats: bool = False, + nan_to_zero: bool = True, + ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, CountMeanVar]]: + """ + Compute scores per feature for pairs of groups of observations. + + First, summary statistics are computed for each group key present in at least one pair. + Second, scores are computed from the summary statistics for each pair of keys in pairs. + + Each pair has control/source/vehicle first and case/target/perturbation second. + + Summary statistics are a subset of count (`n`), mean (`m`), and variance (`v`). + + Available scoring functions: + + - 'diff-score': `m1 - m0` + - 'fold-score': `m1 / m0` + - 't-score': `(m1 - m0) / sqrt(v0/n0 + v1/n1)` + - 'v-score': `(m1 - m0) / sqrt(v0 + v1)` + - 't-score-pooled': `(m1 - m0) / sqrt(v_pooled * (1/n0 + 1/n1))` + - 'v-score-pooled': `(m1 - m0) / sqrt(v_pooled)` + + where `v_pooled = ((n0 - 1) * v0 + (n1 - 1) * v1) / (n0 + n1 - 2)`. + + If weight is provided, then mean and variance are weighted. + + Pairs are dropped if either group has no observations. + + By default, all non-finite scores are reset to zero without warning. + Set `nan_to_zero=False` to investigate non-finite values more closely. + + Params + ------ + score + One of diff-score, fold-score, t-score, v-score, t-score-pooled, v-score-pooled. + pairs + List of ordered pairs of keys in adata.obs['key']. + return_stats + If True, also return dictionary of summary stats via tuple (scores, stats). + nan_to_zero: bool + If True, ignore divide-by-zero warnings and reset non-finite scores to zero. + + Returns + ------- + scores + DataFrame of scores indexed by key_0 and key_1 from each pair. 
+ stats + If `return_stats=True` was specified, a dict of stat name to feature and observation + """ + scores = { + "diff-score": ( + lambda *args, **kwargs: CountMeanVar(mean=self.mean(*args, **kwargs)), + GroupBy._diff_score, + ), + "fold-score": ( + lambda *args, **kwargs: CountMeanVar(mean=self.mean(*args, **kwargs)), + GroupBy._fold_score, + ), + "t-score": (self.count_mean_var, GroupBy._t_score), + "t-score-pooled": (self.count_mean_var, GroupBy._t_score_pooled), + "v-score": (self.count_mean_var, GroupBy._v_score), + "v-score-pooled": (self.count_mean_var, GroupBy._v_score_pooled), + } + + stat_func, score_func = scores[score] + # key_set = set(k for p in pairs for k in p) + stats: CountMeanVar = stat_func() + # pairs = sorted(pairs) + i0, i1 = map(list, zip(*pairs)) + with np.errstate(divide=("ignore" if nan_to_zero else "warn")): + data = score_func( + stats.map( + lambda df: df.loc[i0].values, + count=lambda df: df.loc[i0].values[:, np.newaxis], + ), + stats.map( + lambda df: df.loc[i1].values, + count=lambda df: df.loc[i1].values[:, np.newaxis], + ), + ) + if nan_to_zero: + data[~np.isfinite(data)] = 0 + index = pd.MultiIndex.from_tuples( + pairs, names=[self.key + "_0", self.key + "_1"] + ) + df = pd.DataFrame(index=index, columns=self.adata.var_names.copy(), data=data) + if return_stats: + return df, stats + else: + return df + + def sparse_aggregator( + self, normalize: bool = False + ) -> Tuple[coo_matrix, np.ndarray]: + """ + Form a coordinate-sparse matrix A such that rows of A * X + are weighted sums of groups of rows of X. + + A[i, j] = w includes X[j,:] in group i with weight w. + + Params + ------ + normalize + If true, weights for each group are normalized to sum to 1.0, + corresponding to (weighted) mean. + + Returns + ------- + A + weighted sums of groups of rows of X. + keys + An ndarray with keys[i] the group key corresponding to row i of A. 
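+
+        Example (illustrative sketch; ``adata`` and the ``"louvain"`` key are
+        placeholders)::
+
+            A, keys = GroupBy(adata, key="louvain").sparse_aggregator(normalize=True)
+            group_means = A @ adata.X  # row i is the mean profile of group keys[i]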
+ """ + keys, key_index, obs_index, weight_value = self._extract_indices() + if obs_index is None: + obs_index = np.arange(len(key_index)) + if self.weight is None: + weight_value = np.ones(len(key_index)) + A = coo_matrix( + (weight_value, (key_index, obs_index)), + shape=(len(keys), self.adata.X.shape[0]), + ) + if normalize: + n_row = A.shape[0] + row_sums = np.asarray(A.sum(axis=1)) + D = dia_matrix(((row_sums.T ** -1), [0]), shape=(n_row, n_row)) + A = D * A + return A, keys + + def _extract_indices(self): + def _filter_indices(key_set, keys, key_index, obs_index, weight_value=None): + keep = [i for i, k in enumerate(keys) if k in set(key_set)] + if len(keep) == 0: + raise ValueError("No keys in key_set found in adata.obs[key].") + elif len(keep) < len(keys): + mask = np.in1d(key_index, keep) + remap = np.zeros(len(keys), dtype=np.int64) + for i, j in enumerate(keep): + remap[j] = i + keys = [keys[j] for j in keep] + key_index = np.array( + [remap[i] for i in key_index[mask]], dtype=np.int64 + ) + obs_index = obs_index[mask] + if weight_value is not None: + weight_value = weight_value[mask] + return keys, key_index, obs_index, weight_value + + key_value = self.adata.obs[self.key] + if self.explode: + assert isinstance( + key_value.iloc[0], tuple + ), "key type must be tuple to explode" + keys, key_index = np.unique( + _ndarray_from_seq([k for ks in key_value for k in ks]), + return_inverse=True, + ) + obs_index = np.array([i for i, ks in enumerate(key_value) for _ in ks]) + else: + keys, key_index = np.unique( + _ndarray_from_seq(key_value), return_inverse=True + ) + obs_index = np.arange(len(key_index)) + if self.weight is None: + weight_value = None + else: + weight_value = self.adata.obs[self.weight].values[obs_index] + if self.key_set is not None: + keys, key_index, obs_index, weight_value = _filter_indices( + self.key_set, keys, key_index, obs_index, weight_value + ) + self._key_index = key_index # passed to count and count_mean_var to avoid re-extracting in the latter + return keys, key_index, obs_index, weight_value + + def pd_mean(self) -> pd.DataFrame: + """ + Slower implementation of mean that masks NaN values. + """ + assert ( + (self.weight is None) and (self.explode is False) and (self.key_set is None) + ) + df = pd.DataFrame( + index=self.adata.obs[self.key], + columns=self.adata.var_names, + data=utils.asarray(self.adata.X), + ) + return df.groupby(self.key).mean() + + def _pd_count_mean_var_df(self) -> pd.DataFrame: + assert ( + (self.weight is None) and (self.explode is False) and (self.key_set is None) + ) + aggs = ["count", "mean", "var"] + df = pd.DataFrame( + index=self.adata.obs[self.key], + columns=self.adata.var_names, + data=utils.asarray(self.adata.X), + ) + return df.groupby(self.key).agg(aggs).swaplevel(axis=1).sort_index(axis=1) + + def pd_count_mean_var(self) -> CountMeanVar: + """ + Slower implementation of count_mean_var that masks NaN values. + """ + return CountMeanVar.from_df(self._pd_count_mean_var_df()) + + def pd_score_pairs( + self, score: Score, pairs: Collection[Tuple[str, str]] + ) -> pd.DataFrame: + """ + Slower implementation of score_pairs that masks NaN values. 
+ """ + assert ( + (self.weight is None) and (self.explode is False) and (self.key_set is None) + ) + scores = { + "diff-score": GroupBy._diff_score, + "fold-score": GroupBy._fold_score, + "t-score": GroupBy._t_score, + "t-score-pooled": GroupBy._t_score_pooled, + "v-score": GroupBy._v_score, + "v-score-pooled": GroupBy._v_score_pooled, + } + mean_only = score == "diff-score" or score == "fold-score" + if mean_only: + df = self.pd_mean() + else: + df = self._pd_count_mean_var_df() + dfs = df.loc[[p[0] for p in pairs]], df.loc[[p[1] for p in pairs]] + score_func = scores[score] + data = score_func( + *( + CountMeanVar( + count=None if mean_only else df["count"].values, + mean=df.values if mean_only else df["mean"].values, + var=None if mean_only else df["var"].values, + ) + for df in dfs + ) + ) + return pd.DataFrame( + index=pd.MultiIndex.from_tuples( + pairs, names=[self.key + "_0", self.key + "_1"] + ), + columns=dfs[0].columns if mean_only else dfs[1]["count"].columns, + data=data, + )[self.adata.var_names] + + # score functions + + @staticmethod + def _diff_score(cmv0: CountMeanVar, cmv1: CountMeanVar): + return cmv1.mean - cmv0.mean + + @staticmethod + def _fold_score(cmv0: CountMeanVar, cmv1: CountMeanVar): + return cmv1.mean / cmv0.mean + + @staticmethod + def _t_score(cmv0: CountMeanVar, cmv1: CountMeanVar): + std = np.sqrt(cmv0.var / cmv0.count + cmv1.var / cmv1.count) + ( + cmv0.var + cmv1.var == 0 + ) + return (cmv1.mean - cmv0.mean) / std + + @staticmethod + def _t_score_pooled(cmv0: CountMeanVar, cmv1: CountMeanVar): + var_pooled = GroupBy._var_pooled(cmv0, cmv1) + return (cmv1.mean - cmv0.mean) / np.sqrt( + var_pooled * (1 / cmv0.count + 1 / cmv1.count) + ) + + @staticmethod + def _v_score(cmv0: CountMeanVar, cmv1: CountMeanVar): + return (cmv1.mean - cmv0.mean) / ( + np.sqrt(cmv0.var + cmv1.var) + (cmv0.var + cmv1.var == 0) + ) + + @staticmethod + def _v_score_pooled(cmv0: CountMeanVar, cmv1: CountMeanVar): + var_pooled = GroupBy._var_pooled(cmv0, cmv1) + return (cmv1.mean - cmv0.mean) / np.sqrt(var_pooled) + + @staticmethod + def _var_pooled(cmv0: CountMeanVar, cmv1: CountMeanVar): + return ((cmv0.count - 1) * cmv0.var + (cmv1.count - 1) * cmv1.var) / ( + cmv0.count + cmv1.count - 2 + ) + + +def _power(X, power): + return X ** power if isinstance(X, np.ndarray) else X.power(power) + + +def _ndarray_from_seq(lst: Sequence): + # prevents expansion of iterables as axis + n = len(lst) + if n > 0 and isinstance(lst[0], cabc.Iterable): + arr = np.empty(n, dtype=object) + arr[:] = lst + else: + arr = np.array(lst) + return arr diff --git a/scanpy/tests/test_groupby.py b/scanpy/tests/test_groupby.py new file mode 100644 index 000000000..ee4d10e7a --- /dev/null +++ b/scanpy/tests/test_groupby.py @@ -0,0 +1,108 @@ +import anndata as ad +import scanpy as sc +import numpy as np +import pandas as pd +from scipy.sparse import csr_matrix + + +def test_groupby(): + genes = ["A", "B"] + cells = [ + "v0", + "v1", + "v2", + "w0", + "w1", + "a1", + "a2", + "a3", + "b1", + "b2", + "c1", + "c2", + "d0", + ] + pairs = [("v", "b"), ("v", "a"), ("w", "c"), ("w", "d"), ("w", "v")] + + obs = pd.DataFrame(index=pd.Index(cells, name="cell")) + obs["key"] = pd.Categorical([c[0] for c in cells]) + obs["tuple_key"] = pd.Categorical([(c[0],) for c in cells]) + obs["weight"] = 2.0 + + var = pd.DataFrame(index=genes) + + X = np.array( + [ + [0, -2], + [1, 13], + [2, 1], # v + [3, 12], + [4, 2], # w + [5, 11], + [6, 3], + [7, 10], # a + [8, 4], + [9, 9], # b + [10, 5], + [11, 8], # c + [12, 6], # d + ], + 
dtype=np.float32, + ) + + adata_sparse = ad.AnnData(obs=obs, var=var, X=csr_matrix(X)) + adata_dense = ad.AnnData(obs=obs, var=var, X=X) + + gb = sc.get.GroupBy(adata_sparse, key="key") + stats_sparse = gb.count_mean_var() + stats_dense = sc.get.GroupBy(adata_dense, key="key").count_mean_var() + stats_pd = sc.get.GroupBy(adata_dense, key="key").pd_count_mean_var() + + assert stats_sparse.count.equals(stats_dense.count) + assert np.allclose(stats_sparse.mean, stats_dense.mean) + assert np.allclose(stats_sparse.var, stats_dense.var, equal_nan=True) + + assert stats_sparse.count.equals(stats_pd.count["A"]) + assert np.allclose(stats_sparse.mean, stats_pd.mean) + assert np.allclose(stats_sparse.var, stats_pd.var, equal_nan=True) + + gb_weight = sc.get.GroupBy(adata_sparse, key="key", weight="weight") + stats_weight = gb_weight.count_mean_var() + sum_ = gb.sum() + sum_weight = gb_weight.sum() + + assert np.allclose(2 * sum_, sum_weight) + assert np.allclose(stats_sparse.mean, stats_weight.mean) + assert np.allclose(stats_sparse.var, stats_dense.var, equal_nan=True) + + key_set = ["v", "w"] + mean_key_set = sc.get.GroupBy(adata_sparse, key="key", key_set=key_set).mean() + assert np.allclose(stats_sparse.mean.loc[key_set], mean_key_set) + + gb_explode = sc.get.GroupBy(adata_sparse, key="tuple_key", explode=True) + stats_explode = gb_explode.count_mean_var() + + assert stats_sparse.count.equals(stats_explode.count) + assert np.allclose(stats_sparse.mean, stats_explode.mean) + assert np.allclose(stats_sparse.var, stats_explode.var, equal_nan=True) + + for score in [ + "diff-score", + "fold-score", + "t-score", + "t-score-pooled", + "v-score", + "v-score-pooled", + ]: + score_sparse = gb.score_pairs(score, pairs=pairs, nan_to_zero=False) + score_explode = gb_explode.score_pairs(score, pairs, nan_to_zero=False) + score_pd = gb.pd_score_pairs(score, pairs=pairs) + + assert np.allclose(score_sparse, score_explode, equal_nan=True) + assert np.allclose(score_sparse, score_pd, equal_nan=True) + + score_nan = gb.score_pairs("t-score", pairs=pairs, nan_to_zero=False) + assert not np.all(np.isfinite(score_nan)) + + score_nan[~np.isfinite(score_nan)] = 0 + assert np.allclose(score_nan, gb.score_pairs("t-score", pairs=pairs)) From 9eb1993476a589ebb19d4aec7f6f90d4a3d3fa5d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 3 Aug 2023 11:21:17 +0200 Subject: [PATCH 02/89] (feat): add option for custom data --- scanpy/get/groupby.py | 28 +++++++++++++++------------- scanpy/tests/test_groupby.py | 29 ++++++++++++++++++----------- 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/scanpy/get/groupby.py b/scanpy/get/groupby.py index 38328fcaa..f97daa959 100644 --- a/scanpy/get/groupby.py +++ b/scanpy/get/groupby.py @@ -15,7 +15,7 @@ import numpy as np import pandas as pd import collections.abc as cabc -from scipy.sparse import coo_matrix, dia_matrix +from scipy.sparse import coo_matrix, dia_matrix, spmatrix class CountMeanVar(NamedTuple): @@ -86,6 +86,8 @@ class GroupBy: adata key Group key field in adata.obs. + data + Element of the AnnData to aggregate (default None yields adata.X) weight Weight field in adata.obs of type float. 
explode @@ -97,6 +99,7 @@ class GroupBy: adata: AnnData key: str + data: Union[np.ndarray, spmatrix] weight: Optional[str] explode: bool key_set: AbstractSet[str] @@ -107,11 +110,13 @@ def __init__( adata: AnnData, key: str, *, + data: Union[np.ndarray, spmatrix, None] = None, weight: Optional[str] = None, explode: bool = False, key_set: Optional[Iterable[str]] = None, ): self.adata = adata + self.data = adata.X if data is None else data self.key = key self.weight = weight self.explode = explode @@ -143,11 +148,10 @@ def sum(self) -> pd.DataFrame: DataFrame of sums indexed by key with columns from adata. """ A, keys = self.sparse_aggregator(normalize=False) - X = self.adata.X return pd.DataFrame( index=pd.Index(keys, name=self.key, tupleize_cols=False), columns=self.adata.var_names.copy(), - data=utils.asarray(A * X), + data=utils.asarray(A * self.data), ) def mean(self) -> pd.DataFrame: @@ -159,11 +163,10 @@ def mean(self) -> pd.DataFrame: DataFrame of means indexed by key with columns from adata. """ A, keys = self.sparse_aggregator(normalize=True) - X = self.adata.X return pd.DataFrame( index=pd.Index(keys, name=self.key, tupleize_cols=False), columns=self.adata.var_names.copy(), - data=utils.asarray(A * X), + data=utils.asarray(A * self.data), ) def var(self, dof: int = 1) -> pd.DataFrame: @@ -204,20 +207,19 @@ def count_mean_var(self, dof: int = 1) -> CountMeanVar: """ assert dof >= 0 A, keys = self.sparse_aggregator(normalize=True) - X = self.adata.X count_ = np.bincount(self._key_index) - mean_ = utils.asarray(A @ X) - mean_sq = utils.asarray(A @ _power(X, 2)) + mean_ = utils.asarray(A @ self.data) + mean_sq = utils.asarray(A @ _power(self.data, 2)) if self.weight is None: sq_mean = mean_ ** 2 else: A_unweighted, _ = GroupBy( self.adata, self.key, explode=self.explode, key_set=self.key_set ).sparse_aggregator() - mean_unweighted = utils.asarray(A_unweighted * X) + mean_unweighted = utils.asarray(A_unweighted * self.data) sq_mean = 2 * mean_ * mean_unweighted + mean_unweighted ** 2 var_ = mean_sq - sq_mean - precision = 2 << (42 if X.dtype == np.float64 else 20) + precision = 2 << (42 if self.data.dtype == np.float64 else 20) # detects loss of precision in mean_sq - sq_mean, which suggests variance is 0 var_[precision * var_ < sq_mean] = 0 if dof != 0: @@ -358,7 +360,7 @@ def sparse_aggregator( weight_value = np.ones(len(key_index)) A = coo_matrix( (weight_value, (key_index, obs_index)), - shape=(len(keys), self.adata.X.shape[0]), + shape=(len(keys), self.data.shape[0]), ) if normalize: n_row = A.shape[0] @@ -422,7 +424,7 @@ def pd_mean(self) -> pd.DataFrame: df = pd.DataFrame( index=self.adata.obs[self.key], columns=self.adata.var_names, - data=utils.asarray(self.adata.X), + data=utils.asarray(self.data), ) return df.groupby(self.key).mean() @@ -434,7 +436,7 @@ def _pd_count_mean_var_df(self) -> pd.DataFrame: df = pd.DataFrame( index=self.adata.obs[self.key], columns=self.adata.var_names, - data=utils.asarray(self.adata.X), + data=utils.asarray(self.data), ) return df.groupby(self.key).agg(aggs).swaplevel(axis=1).sort_index(axis=1) diff --git a/scanpy/tests/test_groupby.py b/scanpy/tests/test_groupby.py index ee4d10e7a..1ef17f1f0 100644 --- a/scanpy/tests/test_groupby.py +++ b/scanpy/tests/test_groupby.py @@ -3,9 +3,16 @@ import numpy as np import pandas as pd from scipy.sparse import csr_matrix - - -def test_groupby(): +import pytest + +@pytest.mark.parametrize( + 'use_layers', + [ + False, + True, + ], +) +def test_groupby(use_layers): genes = ["A", "B"] cells = [ "v0", @@ -50,13 
+57,13 @@ def test_groupby(): dtype=np.float32, ) - adata_sparse = ad.AnnData(obs=obs, var=var, X=csr_matrix(X)) - adata_dense = ad.AnnData(obs=obs, var=var, X=X) + adata_sparse = ad.AnnData(obs=obs, var=var, X=csr_matrix(X), layers={ 'test': csr_matrix(X) }) + adata_dense = ad.AnnData(obs=obs, var=var, X=X, layers={ 'test': X.copy() }) # .copy needed? - gb = sc.get.GroupBy(adata_sparse, key="key") + gb = sc.get.GroupBy(adata_sparse, data=(adata_sparse.layers['test'] if use_layers else adata_sparse.X), key="key") stats_sparse = gb.count_mean_var() - stats_dense = sc.get.GroupBy(adata_dense, key="key").count_mean_var() - stats_pd = sc.get.GroupBy(adata_dense, key="key").pd_count_mean_var() + stats_dense = sc.get.GroupBy(adata_dense, data=(adata_dense.layers['test'] if use_layers else adata_dense.X), key="key").count_mean_var() + stats_pd = sc.get.GroupBy(adata_dense, data=(adata_dense.layers['test'] if use_layers else adata_dense.X), key="key").pd_count_mean_var() assert stats_sparse.count.equals(stats_dense.count) assert np.allclose(stats_sparse.mean, stats_dense.mean) @@ -66,7 +73,7 @@ def test_groupby(): assert np.allclose(stats_sparse.mean, stats_pd.mean) assert np.allclose(stats_sparse.var, stats_pd.var, equal_nan=True) - gb_weight = sc.get.GroupBy(adata_sparse, key="key", weight="weight") + gb_weight = sc.get.GroupBy(adata_sparse, data=(adata_sparse.layers['test'] if use_layers else adata_sparse.X), key="key", weight="weight") stats_weight = gb_weight.count_mean_var() sum_ = gb.sum() sum_weight = gb_weight.sum() @@ -76,10 +83,10 @@ def test_groupby(): assert np.allclose(stats_sparse.var, stats_dense.var, equal_nan=True) key_set = ["v", "w"] - mean_key_set = sc.get.GroupBy(adata_sparse, key="key", key_set=key_set).mean() + mean_key_set = sc.get.GroupBy(adata_sparse, data=(adata_sparse.layers['test'] if use_layers else adata_sparse.X), key="key", key_set=key_set).mean() assert np.allclose(stats_sparse.mean.loc[key_set], mean_key_set) - gb_explode = sc.get.GroupBy(adata_sparse, key="tuple_key", explode=True) + gb_explode = sc.get.GroupBy(adata_sparse, data=(adata_sparse.layers['test'] if use_layers else adata_sparse.X), key="tuple_key", explode=True) stats_explode = gb_explode.count_mean_var() assert stats_sparse.count.equals(stats_explode.count) From e1c7eef7723fd877ad4c9d62c60f67252ac3c38a Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 3 Aug 2023 11:49:30 +0200 Subject: [PATCH 03/89] (chore): remove pair scoring --- scanpy/get/groupby.py | 180 ----------------------------------- scanpy/tests/test_groupby.py | 21 ---- 2 files changed, 201 deletions(-) diff --git a/scanpy/get/groupby.py b/scanpy/get/groupby.py index f97daa959..0230685c5 100644 --- a/scanpy/get/groupby.py +++ b/scanpy/get/groupby.py @@ -4,7 +4,6 @@ Iterable, AbstractSet, Sequence, - Collection, Tuple, Union, NamedTuple, @@ -235,102 +234,6 @@ def count_mean_var(self, dof: int = 1) -> CountMeanVar: ) return CountMeanVar(count=count_sr, mean=mean_df, var=var_df) - def score_pairs( - self, - score: Score, - pairs: Collection[Tuple[str, str]], - *, - return_stats: bool = False, - nan_to_zero: bool = True, - ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, CountMeanVar]]: - """ - Compute scores per feature for pairs of groups of observations. - - First, summary statistics are computed for each group key present in at least one pair. - Second, scores are computed from the summary statistics for each pair of keys in pairs. - - Each pair has control/source/vehicle first and case/target/perturbation second. 
- - Summary statistics are a subset of count (`n`), mean (`m`), and variance (`v`). - - Available scoring functions: - - - 'diff-score': `m1 - m0` - - 'fold-score': `m1 / m0` - - 't-score': `(m1 - m0) / sqrt(v0/n0 + v1/n1)` - - 'v-score': `(m1 - m0) / sqrt(v0 + v1)` - - 't-score-pooled': `(m1 - m0) / sqrt(v_pooled * (1/n0 + 1/n1))` - - 'v-score-pooled': `(m1 - m0) / sqrt(v_pooled)` - - where `v_pooled = ((n0 - 1) * v0 + (n1 - 1) * v1) / (n0 + n1 - 2)`. - - If weight is provided, then mean and variance are weighted. - - Pairs are dropped if either group has no observations. - - By default, all non-finite scores are reset to zero without warning. - Set `nan_to_zero=False` to investigate non-finite values more closely. - - Params - ------ - score - One of diff-score, fold-score, t-score, v-score, t-score-pooled, v-score-pooled. - pairs - List of ordered pairs of keys in adata.obs['key']. - return_stats - If True, also return dictionary of summary stats via tuple (scores, stats). - nan_to_zero: bool - If True, ignore divide-by-zero warnings and reset non-finite scores to zero. - - Returns - ------- - scores - DataFrame of scores indexed by key_0 and key_1 from each pair. - stats - If `return_stats=True` was specified, a dict of stat name to feature and observation - """ - scores = { - "diff-score": ( - lambda *args, **kwargs: CountMeanVar(mean=self.mean(*args, **kwargs)), - GroupBy._diff_score, - ), - "fold-score": ( - lambda *args, **kwargs: CountMeanVar(mean=self.mean(*args, **kwargs)), - GroupBy._fold_score, - ), - "t-score": (self.count_mean_var, GroupBy._t_score), - "t-score-pooled": (self.count_mean_var, GroupBy._t_score_pooled), - "v-score": (self.count_mean_var, GroupBy._v_score), - "v-score-pooled": (self.count_mean_var, GroupBy._v_score_pooled), - } - - stat_func, score_func = scores[score] - # key_set = set(k for p in pairs for k in p) - stats: CountMeanVar = stat_func() - # pairs = sorted(pairs) - i0, i1 = map(list, zip(*pairs)) - with np.errstate(divide=("ignore" if nan_to_zero else "warn")): - data = score_func( - stats.map( - lambda df: df.loc[i0].values, - count=lambda df: df.loc[i0].values[:, np.newaxis], - ), - stats.map( - lambda df: df.loc[i1].values, - count=lambda df: df.loc[i1].values[:, np.newaxis], - ), - ) - if nan_to_zero: - data[~np.isfinite(data)] = 0 - index = pd.MultiIndex.from_tuples( - pairs, names=[self.key + "_0", self.key + "_1"] - ) - df = pd.DataFrame(index=index, columns=self.adata.var_names.copy(), data=data) - if return_stats: - return df, stats - else: - return df - def sparse_aggregator( self, normalize: bool = False ) -> Tuple[coo_matrix, np.ndarray]: @@ -446,89 +349,6 @@ def pd_count_mean_var(self) -> CountMeanVar: """ return CountMeanVar.from_df(self._pd_count_mean_var_df()) - def pd_score_pairs( - self, score: Score, pairs: Collection[Tuple[str, str]] - ) -> pd.DataFrame: - """ - Slower implementation of score_pairs that masks NaN values. 
- """ - assert ( - (self.weight is None) and (self.explode is False) and (self.key_set is None) - ) - scores = { - "diff-score": GroupBy._diff_score, - "fold-score": GroupBy._fold_score, - "t-score": GroupBy._t_score, - "t-score-pooled": GroupBy._t_score_pooled, - "v-score": GroupBy._v_score, - "v-score-pooled": GroupBy._v_score_pooled, - } - mean_only = score == "diff-score" or score == "fold-score" - if mean_only: - df = self.pd_mean() - else: - df = self._pd_count_mean_var_df() - dfs = df.loc[[p[0] for p in pairs]], df.loc[[p[1] for p in pairs]] - score_func = scores[score] - data = score_func( - *( - CountMeanVar( - count=None if mean_only else df["count"].values, - mean=df.values if mean_only else df["mean"].values, - var=None if mean_only else df["var"].values, - ) - for df in dfs - ) - ) - return pd.DataFrame( - index=pd.MultiIndex.from_tuples( - pairs, names=[self.key + "_0", self.key + "_1"] - ), - columns=dfs[0].columns if mean_only else dfs[1]["count"].columns, - data=data, - )[self.adata.var_names] - - # score functions - - @staticmethod - def _diff_score(cmv0: CountMeanVar, cmv1: CountMeanVar): - return cmv1.mean - cmv0.mean - - @staticmethod - def _fold_score(cmv0: CountMeanVar, cmv1: CountMeanVar): - return cmv1.mean / cmv0.mean - - @staticmethod - def _t_score(cmv0: CountMeanVar, cmv1: CountMeanVar): - std = np.sqrt(cmv0.var / cmv0.count + cmv1.var / cmv1.count) + ( - cmv0.var + cmv1.var == 0 - ) - return (cmv1.mean - cmv0.mean) / std - - @staticmethod - def _t_score_pooled(cmv0: CountMeanVar, cmv1: CountMeanVar): - var_pooled = GroupBy._var_pooled(cmv0, cmv1) - return (cmv1.mean - cmv0.mean) / np.sqrt( - var_pooled * (1 / cmv0.count + 1 / cmv1.count) - ) - - @staticmethod - def _v_score(cmv0: CountMeanVar, cmv1: CountMeanVar): - return (cmv1.mean - cmv0.mean) / ( - np.sqrt(cmv0.var + cmv1.var) + (cmv0.var + cmv1.var == 0) - ) - - @staticmethod - def _v_score_pooled(cmv0: CountMeanVar, cmv1: CountMeanVar): - var_pooled = GroupBy._var_pooled(cmv0, cmv1) - return (cmv1.mean - cmv0.mean) / np.sqrt(var_pooled) - - @staticmethod - def _var_pooled(cmv0: CountMeanVar, cmv1: CountMeanVar): - return ((cmv0.count - 1) * cmv0.var + (cmv1.count - 1) * cmv1.var) / ( - cmv0.count + cmv1.count - 2 - ) - def _power(X, power): return X ** power if isinstance(X, np.ndarray) else X.power(power) diff --git a/scanpy/tests/test_groupby.py b/scanpy/tests/test_groupby.py index 1ef17f1f0..e8711e43e 100644 --- a/scanpy/tests/test_groupby.py +++ b/scanpy/tests/test_groupby.py @@ -92,24 +92,3 @@ def test_groupby(use_layers): assert stats_sparse.count.equals(stats_explode.count) assert np.allclose(stats_sparse.mean, stats_explode.mean) assert np.allclose(stats_sparse.var, stats_explode.var, equal_nan=True) - - for score in [ - "diff-score", - "fold-score", - "t-score", - "t-score-pooled", - "v-score", - "v-score-pooled", - ]: - score_sparse = gb.score_pairs(score, pairs=pairs, nan_to_zero=False) - score_explode = gb_explode.score_pairs(score, pairs, nan_to_zero=False) - score_pd = gb.pd_score_pairs(score, pairs=pairs) - - assert np.allclose(score_sparse, score_explode, equal_nan=True) - assert np.allclose(score_sparse, score_pd, equal_nan=True) - - score_nan = gb.score_pairs("t-score", pairs=pairs, nan_to_zero=False) - assert not np.all(np.isfinite(score_nan)) - - score_nan[~np.isfinite(score_nan)] = 0 - assert np.allclose(score_nan, gb.score_pairs("t-score", pairs=pairs)) From 31beb0d64b2dc5d6289e065a2e5534dc98466bd5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 3 Aug 2023 11:59:19 +0200 
Subject: [PATCH 04/89] (feat): change return types to `AnnData` --- scanpy/get/groupby.py | 133 ++++++++++------------------------- scanpy/tests/test_groupby.py | 25 +++---- 2 files changed, 46 insertions(+), 112 deletions(-) diff --git a/scanpy/get/groupby.py b/scanpy/get/groupby.py index 0230685c5..39abbdc48 100644 --- a/scanpy/get/groupby.py +++ b/scanpy/get/groupby.py @@ -16,31 +16,6 @@ import collections.abc as cabc from scipy.sparse import coo_matrix, dia_matrix, spmatrix - -class CountMeanVar(NamedTuple): - count: Optional[pd.Series] = None - mean: pd.DataFrame = None - var: Optional[pd.DataFrame] = None - - @classmethod - def from_df(cls, df: pd.DataFrame) -> "CountMeanVar": - return CountMeanVar( - count=df["count"] if "count" in df.columns else None, - mean=df["mean"], - var=df["var"] if "var" in df.columns else None, - ) - - def map(self, f=lambda v: v, **fs) -> "CountMeanVar": - fs = defaultdict(lambda: f, fs) - return CountMeanVar( - **{ - stat: fs[stat](val) - for stat, val in self._asdict().items() - if val is not None - } - ) - - Score = Literal[ "diff-score", "fold-score", "t-score", "v-score", "t-score-pooled", "v-score-pooled" ] @@ -138,54 +113,46 @@ def count(self) -> pd.Series: name="count", ) - def sum(self) -> pd.DataFrame: + def sum(self) -> AnnData: """ Compute the sum per feature per group of observations. Returns ------- - DataFrame of sums indexed by key with columns from adata. + AnnData with sum in X indexed on obs by key with var from adata. """ A, keys = self.sparse_aggregator(normalize=False) - return pd.DataFrame( - index=pd.Index(keys, name=self.key, tupleize_cols=False), - columns=self.adata.var_names.copy(), - data=utils.asarray(A * self.data), + + return AnnData( + obs=pd.DataFrame( + index=pd.Index(keys, name=self.key), + ), + var=pd.DataFrame( + index=pd.Index(self.adata.var_names.copy(), name=self.key), + ), + X=utils.asarray(A * self.data) ) - def mean(self) -> pd.DataFrame: + def mean(self) -> AnnData: """ Compute the mean per feature per group of observations. Returns ------- - DataFrame of means indexed by key with columns from adata. + AnnData with means in X indexed on obs by key with var from adata. """ A, keys = self.sparse_aggregator(normalize=True) - return pd.DataFrame( - index=pd.Index(keys, name=self.key, tupleize_cols=False), - columns=self.adata.var_names.copy(), - data=utils.asarray(A * self.data), + return AnnData( + obs=pd.DataFrame( + index=pd.Index(keys, name=self.key), + ), + var=pd.DataFrame( + index=pd.Index(self.adata.var_names.copy(), name=self.key), + ), + X=utils.asarray(A * self.data) ) - def var(self, dof: int = 1) -> pd.DataFrame: - """ - Compute the variance per feature per group of observations. - - See also count_mean_var, which is more efficient when the mean is also desired. - - Params - ------ - dof - Degrees of freedom for variance. - - Returns - ------- - DataFrame of variances indexed by key with columns from adata. - """ - return self.count_mean_var(dof).var - - def count_mean_var(self, dof: int = 1) -> CountMeanVar: + def count_mean_var(self, dof: int = 1) -> AnnData: """ Compute the count, as well as mean and variance per feature, per group of observations. @@ -202,7 +169,7 @@ def count_mean_var(self, dof: int = 1) -> CountMeanVar: Returns ------- - Dictionary with keys (count, mean, var) and values the corresponding Series and DataFrames. + AnnData with mean and var in layers indexed on obs by key with var from adata. Counts are in obs under counts. 
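+
+        Example (illustrative sketch; ``adata`` and the ``"louvain"`` key are
+        placeholders)::
+
+            stats = GroupBy(adata, key="louvain").count_mean_var()
+            stats.obs["count"]    # observations per group
+            stats.layers["mean"]  # per-group means, groups x variables
+            stats.layers["var"]   # per-group variances, groups x variables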
""" assert dof >= 0 A, keys = self.sparse_aggregator(normalize=True) @@ -224,15 +191,20 @@ def count_mean_var(self, dof: int = 1) -> CountMeanVar: if dof != 0: var_ *= (count_ / (count_ - dof))[:, np.newaxis] - index = pd.Index(keys, name=self.key, tupleize_cols=False) - count_sr = pd.Series(index=index, data=count_, name="count") - mean_df = pd.DataFrame( - index=index.copy(), columns=self.adata.var_names.copy(), data=mean_ - ) - var_df = pd.DataFrame( - index=index.copy(), columns=self.adata.var_names.copy(), data=var_ + return AnnData( + obs=pd.DataFrame( + index=pd.Index(keys, name=self.key), + columns=['count'], + data=count_ + ), + var=pd.DataFrame( + index=pd.Index(self.adata.var_names.copy(), name=self.key), + ), + layers={ + 'mean': mean_, + 'var': var_ + } ) - return CountMeanVar(count=count_sr, mean=mean_df, var=var_df) def sparse_aggregator( self, normalize: bool = False @@ -317,39 +289,6 @@ def _filter_indices(key_set, keys, key_index, obs_index, weight_value=None): self._key_index = key_index # passed to count and count_mean_var to avoid re-extracting in the latter return keys, key_index, obs_index, weight_value - def pd_mean(self) -> pd.DataFrame: - """ - Slower implementation of mean that masks NaN values. - """ - assert ( - (self.weight is None) and (self.explode is False) and (self.key_set is None) - ) - df = pd.DataFrame( - index=self.adata.obs[self.key], - columns=self.adata.var_names, - data=utils.asarray(self.data), - ) - return df.groupby(self.key).mean() - - def _pd_count_mean_var_df(self) -> pd.DataFrame: - assert ( - (self.weight is None) and (self.explode is False) and (self.key_set is None) - ) - aggs = ["count", "mean", "var"] - df = pd.DataFrame( - index=self.adata.obs[self.key], - columns=self.adata.var_names, - data=utils.asarray(self.data), - ) - return df.groupby(self.key).agg(aggs).swaplevel(axis=1).sort_index(axis=1) - - def pd_count_mean_var(self) -> CountMeanVar: - """ - Slower implementation of count_mean_var that masks NaN values. 
- """ - return CountMeanVar.from_df(self._pd_count_mean_var_df()) - - def _power(X, power): return X ** power if isinstance(X, np.ndarray) else X.power(power) diff --git a/scanpy/tests/test_groupby.py b/scanpy/tests/test_groupby.py index e8711e43e..b9e5cfbd8 100644 --- a/scanpy/tests/test_groupby.py +++ b/scanpy/tests/test_groupby.py @@ -63,32 +63,27 @@ def test_groupby(use_layers): gb = sc.get.GroupBy(adata_sparse, data=(adata_sparse.layers['test'] if use_layers else adata_sparse.X), key="key") stats_sparse = gb.count_mean_var() stats_dense = sc.get.GroupBy(adata_dense, data=(adata_dense.layers['test'] if use_layers else adata_dense.X), key="key").count_mean_var() - stats_pd = sc.get.GroupBy(adata_dense, data=(adata_dense.layers['test'] if use_layers else adata_dense.X), key="key").pd_count_mean_var() - assert stats_sparse.count.equals(stats_dense.count) - assert np.allclose(stats_sparse.mean, stats_dense.mean) - assert np.allclose(stats_sparse.var, stats_dense.var, equal_nan=True) - - assert stats_sparse.count.equals(stats_pd.count["A"]) - assert np.allclose(stats_sparse.mean, stats_pd.mean) - assert np.allclose(stats_sparse.var, stats_pd.var, equal_nan=True) + assert np.allclose(stats_sparse.obs['count'], stats_dense.obs['count']) + assert np.allclose(stats_sparse.layers['mean'], stats_dense.layers['mean']) + assert np.allclose(stats_sparse.layers['var'], stats_dense.layers['var'], equal_nan=True) gb_weight = sc.get.GroupBy(adata_sparse, data=(adata_sparse.layers['test'] if use_layers else adata_sparse.X), key="key", weight="weight") stats_weight = gb_weight.count_mean_var() sum_ = gb.sum() sum_weight = gb_weight.sum() - assert np.allclose(2 * sum_, sum_weight) - assert np.allclose(stats_sparse.mean, stats_weight.mean) - assert np.allclose(stats_sparse.var, stats_dense.var, equal_nan=True) + assert np.allclose(2 * sum_.X, sum_weight.X) + assert np.allclose(stats_sparse.layers['mean'], stats_weight.layers['mean']) + assert np.allclose(stats_sparse.layers['var'], stats_dense.layers['var'], equal_nan=True) key_set = ["v", "w"] mean_key_set = sc.get.GroupBy(adata_sparse, data=(adata_sparse.layers['test'] if use_layers else adata_sparse.X), key="key", key_set=key_set).mean() - assert np.allclose(stats_sparse.mean.loc[key_set], mean_key_set) + assert np.allclose(stats_sparse[stats_sparse.obs.index.isin(key_set), :].layers['mean'], mean_key_set.X) gb_explode = sc.get.GroupBy(adata_sparse, data=(adata_sparse.layers['test'] if use_layers else adata_sparse.X), key="tuple_key", explode=True) stats_explode = gb_explode.count_mean_var() - assert stats_sparse.count.equals(stats_explode.count) - assert np.allclose(stats_sparse.mean, stats_explode.mean) - assert np.allclose(stats_sparse.var, stats_explode.var, equal_nan=True) + assert np.allclose(stats_sparse.obs['count'], stats_explode.obs['count']) + assert np.allclose(stats_sparse.layers['mean'], stats_explode.layers['mean']) + assert np.allclose(stats_sparse.layers['var'], stats_explode.layers['var'], equal_nan=True) From 668e725c69b816b70affee0dd658e244516955c3 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 3 Aug 2023 15:24:53 +0200 Subject: [PATCH 05/89] (feat): keep `superset` columns. 
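
Here a "superset" column is any other column of the grouped dataframe whose
value is constant within each group of the groupby key, e.g. a batch column in
which every group falls into a single batch. Such columns can be carried over
into the aggregated result unambiguously, whereas "subset" columns
(finer-grained than the key) cannot and are dropped.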
--- scanpy/get/groupby.py | 59 +++++++++++++++++++++++------------- scanpy/tests/test_groupby.py | 16 +++++----- 2 files changed, 45 insertions(+), 30 deletions(-) diff --git a/scanpy/get/groupby.py b/scanpy/get/groupby.py index 39abbdc48..4443fe0bd 100644 --- a/scanpy/get/groupby.py +++ b/scanpy/get/groupby.py @@ -1,4 +1,4 @@ -from collections import defaultdict +from functools import cached_property from typing import ( Optional, Iterable, @@ -6,8 +6,8 @@ Sequence, Tuple, Union, - NamedTuple, Literal, + List ) from anndata import AnnData, utils @@ -97,6 +97,31 @@ def __init__( self.key_set = None if key_set is None else dict.fromkeys(key_set).keys() self._key_index = None + @cached_property + def _superset_columns(self) -> List[str]: + """Find all columns which are a superset of the key column. + + Returns: + List[str]: Superset columns. + """ + columns = [] + groupy_key_codes = self.adata.obs[self.key].astype('category') + for key in self.adata.obs: + if key != self.key: + key_codes = self.adata.obs[key].astype('category') + if all([key_codes[groupy_key_codes == group_key_code].nunique() == 1 for group_key_code in groupy_key_codes]): + columns += [key] + return columns + + @cached_property + def df_grouped(self) -> pd.DataFrame: + df = self.adata.obs.copy() + if self.key_set is not None: + df = df[df[self.key].isin(self.key_set)] + if df[self.key].dtype.name == 'category': + df[self.key] = df[self.key].cat.remove_unused_categories() + return df.groupby(self.key).first()[self._superset_columns] + def count(self) -> pd.Series: """ Count the number of observations in each group. @@ -121,14 +146,11 @@ def sum(self) -> AnnData: ------- AnnData with sum in X indexed on obs by key with var from adata. """ - A, keys = self.sparse_aggregator(normalize=False) - + A, _ = self.sparse_aggregator(normalize=False) return AnnData( - obs=pd.DataFrame( - index=pd.Index(keys, name=self.key), - ), + obs=self.df_grouped, var=pd.DataFrame( - index=pd.Index(self.adata.var_names.copy(), name=self.key), + index=pd.Index(self.adata.var_names.copy()), ), X=utils.asarray(A * self.data) ) @@ -141,13 +163,11 @@ def mean(self) -> AnnData: ------- AnnData with means in X indexed on obs by key with var from adata. """ - A, keys = self.sparse_aggregator(normalize=True) + A, _ = self.sparse_aggregator(normalize=True) return AnnData( - obs=pd.DataFrame( - index=pd.Index(keys, name=self.key), - ), + obs=self.df_grouped, var=pd.DataFrame( - index=pd.Index(self.adata.var_names.copy(), name=self.key), + index=pd.Index(self.adata.var_names.copy()), ), X=utils.asarray(A * self.data) ) @@ -172,7 +192,7 @@ def count_mean_var(self, dof: int = 1) -> AnnData: AnnData with mean and var in layers indexed on obs by key with var from adata. Counts are in obs under counts. 
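
        Concretely, a computed group variance smaller than the squared group
        mean divided by 2**21 (for float32 data; 2**43 for float64) is reset
        to exactly 0.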
""" assert dof >= 0 - A, keys = self.sparse_aggregator(normalize=True) + A, _ = self.sparse_aggregator(normalize=True) count_ = np.bincount(self._key_index) mean_ = utils.asarray(A @ self.data) mean_sq = utils.asarray(A @ _power(self.data, 2)) @@ -190,15 +210,12 @@ def count_mean_var(self, dof: int = 1) -> AnnData: var_[precision * var_ < sq_mean] = 0 if dof != 0: var_ *= (count_ / (count_ - dof))[:, np.newaxis] - + obs = self.df_grouped + obs['count'] = count_ return AnnData( - obs=pd.DataFrame( - index=pd.Index(keys, name=self.key), - columns=['count'], - data=count_ - ), + obs=obs, var=pd.DataFrame( - index=pd.Index(self.adata.var_names.copy(), name=self.key), + index=pd.Index(self.adata.var_names.copy()), ), layers={ 'mean': mean_, diff --git a/scanpy/tests/test_groupby.py b/scanpy/tests/test_groupby.py index b9e5cfbd8..f466aa0bf 100644 --- a/scanpy/tests/test_groupby.py +++ b/scanpy/tests/test_groupby.py @@ -29,11 +29,11 @@ def test_groupby(use_layers): "c2", "d0", ] - pairs = [("v", "b"), ("v", "a"), ("w", "c"), ("w", "d"), ("w", "v")] obs = pd.DataFrame(index=pd.Index(cells, name="cell")) obs["key"] = pd.Categorical([c[0] for c in cells]) - obs["tuple_key"] = pd.Categorical([(c[0],) for c in cells]) + obs["key_superset"] = pd.Categorical([c[0] for c in cells]).map({'v': 'v', 'w': 'v', 'a': 'a', 'b': 'a', 'c': 'a', 'd': 'a'}) + obs["key_subset"] = pd.Categorical([c[1] for c in cells]) obs["weight"] = 2.0 var = pd.DataFrame(index=genes) @@ -64,6 +64,10 @@ def test_groupby(use_layers): stats_sparse = gb.count_mean_var() stats_dense = sc.get.GroupBy(adata_dense, data=(adata_dense.layers['test'] if use_layers else adata_dense.X), key="key").count_mean_var() + # superset columns can be kept but not subsets + assert 'key_superset' in stats_dense.obs + assert 'key_subset' not in stats_dense.obs + assert np.allclose(stats_sparse.obs['count'], stats_dense.obs['count']) assert np.allclose(stats_sparse.layers['mean'], stats_dense.layers['mean']) assert np.allclose(stats_sparse.layers['var'], stats_dense.layers['var'], equal_nan=True) @@ -80,10 +84,4 @@ def test_groupby(use_layers): key_set = ["v", "w"] mean_key_set = sc.get.GroupBy(adata_sparse, data=(adata_sparse.layers['test'] if use_layers else adata_sparse.X), key="key", key_set=key_set).mean() assert np.allclose(stats_sparse[stats_sparse.obs.index.isin(key_set), :].layers['mean'], mean_key_set.X) - - gb_explode = sc.get.GroupBy(adata_sparse, data=(adata_sparse.layers['test'] if use_layers else adata_sparse.X), key="tuple_key", explode=True) - stats_explode = gb_explode.count_mean_var() - - assert np.allclose(stats_sparse.obs['count'], stats_explode.obs['count']) - assert np.allclose(stats_sparse.layers['mean'], stats_explode.layers['mean']) - assert np.allclose(stats_sparse.layers['var'], stats_explode.layers['var'], equal_nan=True) + \ No newline at end of file From b23dd9c4407e69a56676fe2990718f238d6f0d45 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 3 Aug 2023 15:27:53 +0200 Subject: [PATCH 06/89] (chore): remove `explode` option (i.e., tuples) --- scanpy/get/groupby.py | 37 +++++-------------------------------- 1 file changed, 5 insertions(+), 32 deletions(-) diff --git a/scanpy/get/groupby.py b/scanpy/get/groupby.py index 4443fe0bd..1c6707574 100644 --- a/scanpy/get/groupby.py +++ b/scanpy/get/groupby.py @@ -16,11 +16,6 @@ import collections.abc as cabc from scipy.sparse import coo_matrix, dia_matrix, spmatrix -Score = Literal[ - "diff-score", "fold-score", "t-score", "v-score", "t-score-pooled", "v-score-pooled" -] - - class 
GroupBy: """ Functionality for grouping and aggregating AnnData observations by key, per variable. @@ -30,15 +25,9 @@ class GroupBy: Set `weight` for weighted sum, mean, and variance. - Set `explode` to True and use a key of type tuple to assign observations to multiple groups. - In this case, repetition of a key confers multiplicity of the observation in the group. Set `key_set` to a list of keys to most efficiently compute results for a subset of groups. - NaN values propagate, with the exception that `score_pairs` sets non-finite scores to 0 by - default. Use the pd_* methods to instead mask NaN values. These slower methods convert data - to dense format and do not currently support weight, explode, or key_set. - **Implementation** Moments are computed using weighted sum aggregation of AnnData obsevations per variable @@ -64,9 +53,6 @@ class GroupBy: Element of the AnnData to aggregate (default None yields adata.X) weight Weight field in adata.obs of type float. - explode - If False, each observation is assigned to the group keyed by adata.obs[key]. - If True, each observation is assigned to all groups in tuple adata.obs[key]. key_set Subset of keys to which to filter. """ @@ -75,7 +61,6 @@ class GroupBy: key: str data: Union[np.ndarray, spmatrix] weight: Optional[str] - explode: bool key_set: AbstractSet[str] _key_index: Optional[np.ndarray] # caution, may be stale if attributes are updated @@ -86,14 +71,12 @@ def __init__( *, data: Union[np.ndarray, spmatrix, None] = None, weight: Optional[str] = None, - explode: bool = False, key_set: Optional[Iterable[str]] = None, ): self.adata = adata self.data = adata.X if data is None else data self.key = key self.weight = weight - self.explode = explode self.key_set = None if key_set is None else dict.fromkeys(key_set).keys() self._key_index = None @@ -200,7 +183,7 @@ def count_mean_var(self, dof: int = 1) -> AnnData: sq_mean = mean_ ** 2 else: A_unweighted, _ = GroupBy( - self.adata, self.key, explode=self.explode, key_set=self.key_set + self.adata, self.key, key_set=self.key_set ).sparse_aggregator() mean_unweighted = utils.asarray(A_unweighted * self.data) sq_mean = 2 * mean_ * mean_unweighted + mean_unweighted ** 2 @@ -281,20 +264,10 @@ def _filter_indices(key_set, keys, key_index, obs_index, weight_value=None): return keys, key_index, obs_index, weight_value key_value = self.adata.obs[self.key] - if self.explode: - assert isinstance( - key_value.iloc[0], tuple - ), "key type must be tuple to explode" - keys, key_index = np.unique( - _ndarray_from_seq([k for ks in key_value for k in ks]), - return_inverse=True, - ) - obs_index = np.array([i for i, ks in enumerate(key_value) for _ in ks]) - else: - keys, key_index = np.unique( - _ndarray_from_seq(key_value), return_inverse=True - ) - obs_index = np.arange(len(key_index)) + keys, key_index = np.unique( + _ndarray_from_seq(key_value), return_inverse=True + ) + obs_index = np.arange(len(key_index)) if self.weight is None: weight_value = None else: From 6177857dcd1d095abf335aa59814f8cbb0227760 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 3 Aug 2023 17:02:29 +0200 Subject: [PATCH 07/89] (feat): first pass at `var`/`obs` --- scanpy/get/groupby.py | 88 +++++++++++++++++++++--------------- scanpy/tests/test_groupby.py | 75 +++++++++++++++++++++--------- 2 files changed, 105 insertions(+), 58 deletions(-) diff --git a/scanpy/get/groupby.py b/scanpy/get/groupby.py index 1c6707574..82afafc91 100644 --- a/scanpy/get/groupby.py +++ b/scanpy/get/groupby.py @@ -55,6 +55,8 @@ class GroupBy: Weight 
field in adata.obs of type float. key_set Subset of keys to which to filter. + df_key + One of 'obs' or 'var' on which to groupby """ adata: AnnData @@ -63,21 +65,27 @@ class GroupBy: weight: Optional[str] key_set: AbstractSet[str] _key_index: Optional[np.ndarray] # caution, may be stale if attributes are updated + _df_key: str def __init__( self, adata: AnnData, key: str, *, - data: Union[np.ndarray, spmatrix, None] = None, + data: Optional[Union[np.ndarray, spmatrix]] = None, weight: Optional[str] = None, key_set: Optional[Iterable[str]] = None, + df_key: str = 'obs', ): self.adata = adata self.data = adata.X if data is None else data + if df_key == 'var' and data is None: + self.data = self.data.T self.key = key self.weight = weight self.key_set = None if key_set is None else dict.fromkeys(key_set).keys() + self._df_key = df_key + self._base_key = 'obs' if df_key != 'obs' else 'var' self._key_index = None @cached_property @@ -88,22 +96,35 @@ def _superset_columns(self) -> List[str]: List[str]: Superset columns. """ columns = [] - groupy_key_codes = self.adata.obs[self.key].astype('category') - for key in self.adata.obs: + groupy_key_codes = getattr(self.adata, self._df_key)[self.key].astype('category') + for key in getattr(self.adata, self._df_key): if key != self.key: - key_codes = self.adata.obs[key].astype('category') + key_codes = getattr(self.adata, self._df_key)[key].astype('category') if all([key_codes[groupy_key_codes == group_key_code].nunique() == 1 for group_key_code in groupy_key_codes]): columns += [key] return columns @cached_property def df_grouped(self) -> pd.DataFrame: - df = self.adata.obs.copy() + df = getattr(self.adata, self._df_key).copy() if self.key_set is not None: df = df[df[self.key].isin(self.key_set)] if df[self.key].dtype.name == 'category': df[self.key] = df[self.key].cat.remove_unused_categories() return df.groupby(self.key).first()[self._superset_columns] + + @cached_property + def base_axis_indices(self) -> pd.Index: + return pd.DataFrame( + index=pd.Index(getattr(self.adata, f'{self._base_key}_names').copy()) + ) + + @cached_property + def obs_var_dict(self) -> dict: + return { + self._df_key: self.df_grouped, + self._base_key: self.base_axis_indices + } def count(self) -> pd.Series: """ @@ -130,12 +151,10 @@ def sum(self) -> AnnData: AnnData with sum in X indexed on obs by key with var from adata. """ A, _ = self.sparse_aggregator(normalize=False) + X = utils.asarray(A * self.data) return AnnData( - obs=self.df_grouped, - var=pd.DataFrame( - index=pd.Index(self.adata.var_names.copy()), - ), - X=utils.asarray(A * self.data) + **self.obs_var_dict, + X=X if self._df_key == 'obs' else X.T ) def mean(self) -> AnnData: @@ -147,12 +166,10 @@ def mean(self) -> AnnData: AnnData with means in X indexed on obs by key with var from adata. 
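
        Example (illustrative sketch; ``adata`` and the ``"louvain"`` key are
        placeholders)::

            means = GroupBy(adata, key="louvain", df_key="obs").mean()
            means.X    # groups x variables matrix of means
            means.obs  # one row per group, plus any "superset" columns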
""" A, _ = self.sparse_aggregator(normalize=True) + X = utils.asarray(A * self.data) return AnnData( - obs=self.df_grouped, - var=pd.DataFrame( - index=pd.Index(self.adata.var_names.copy()), - ), - X=utils.asarray(A * self.data) + **self.obs_var_dict, + X=X if self._df_key == 'obs' else X.T ) def count_mean_var(self, dof: int = 1) -> AnnData: @@ -183,7 +200,7 @@ def count_mean_var(self, dof: int = 1) -> AnnData: sq_mean = mean_ ** 2 else: A_unweighted, _ = GroupBy( - self.adata, self.key, key_set=self.key_set + self.adata, self.key, key_set=self.key_set, df_key=self._df_key ).sparse_aggregator() mean_unweighted = utils.asarray(A_unweighted * self.data) sq_mean = 2 * mean_ * mean_unweighted + mean_unweighted ** 2 @@ -193,16 +210,13 @@ def count_mean_var(self, dof: int = 1) -> AnnData: var_[precision * var_ < sq_mean] = 0 if dof != 0: var_ *= (count_ / (count_ - dof))[:, np.newaxis] - obs = self.df_grouped - obs['count'] = count_ + obs_var_dict = self.obs_var_dict + obs_var_dict[self._df_key]['count'] = count_ return AnnData( - obs=obs, - var=pd.DataFrame( - index=pd.Index(self.adata.var_names.copy()), - ), + **obs_var_dict, layers={ - 'mean': mean_, - 'var': var_ + 'mean': mean_ if self._df_key == 'obs' else mean_.T, + 'var': var_ if self._df_key == 'obs' else var_.T } ) @@ -228,13 +242,13 @@ def sparse_aggregator( keys An ndarray with keys[i] the group key corresponding to row i of A. """ - keys, key_index, obs_index, weight_value = self._extract_indices() - if obs_index is None: - obs_index = np.arange(len(key_index)) + keys, key_index, df_index, weight_value = self._extract_indices() + if df_index is None: + df_index = np.arange(len(key_index)) if self.weight is None: weight_value = np.ones(len(key_index)) A = coo_matrix( - (weight_value, (key_index, obs_index)), + (weight_value, (key_index, df_index)), shape=(len(keys), self.data.shape[0]), ) if normalize: @@ -245,7 +259,7 @@ def sparse_aggregator( return A, keys def _extract_indices(self): - def _filter_indices(key_set, keys, key_index, obs_index, weight_value=None): + def _filter_indices(key_set, keys, key_index, df_index, weight_value=None): keep = [i for i, k in enumerate(keys) if k in set(key_set)] if len(keep) == 0: raise ValueError("No keys in key_set found in adata.obs[key].") @@ -258,26 +272,26 @@ def _filter_indices(key_set, keys, key_index, obs_index, weight_value=None): key_index = np.array( [remap[i] for i in key_index[mask]], dtype=np.int64 ) - obs_index = obs_index[mask] + df_index = df_index[mask] if weight_value is not None: weight_value = weight_value[mask] - return keys, key_index, obs_index, weight_value + return keys, key_index, df_index, weight_value - key_value = self.adata.obs[self.key] + key_value = getattr(self.adata, self._df_key)[self.key] keys, key_index = np.unique( _ndarray_from_seq(key_value), return_inverse=True ) - obs_index = np.arange(len(key_index)) + df_index = np.arange(len(key_index)) if self.weight is None: weight_value = None else: - weight_value = self.adata.obs[self.weight].values[obs_index] + weight_value = getattr(self.adata, self._df_key)[self.weight].values[df_index] if self.key_set is not None: - keys, key_index, obs_index, weight_value = _filter_indices( - self.key_set, keys, key_index, obs_index, weight_value + keys, key_index, df_index, weight_value = _filter_indices( + self.key_set, keys, key_index, df_index, weight_value ) self._key_index = key_index # passed to count and count_mean_var to avoid re-extracting in the latter - return keys, key_index, obs_index, weight_value + return 
keys, key_index, df_index, weight_value def _power(X, power): return X ** power if isinstance(X, np.ndarray) else X.power(power) diff --git a/scanpy/tests/test_groupby.py b/scanpy/tests/test_groupby.py index f466aa0bf..5d5fc05df 100644 --- a/scanpy/tests/test_groupby.py +++ b/scanpy/tests/test_groupby.py @@ -12,9 +12,16 @@ True, ], ) -def test_groupby(use_layers): - genes = ["A", "B"] - cells = [ +@pytest.mark.parametrize( + 'df_key', + [ + 'obs', + 'var', + ], +) +def test_groupby(use_layers, df_key): + ax_base = ["A", "B"] + ax_groupby = [ "v0", "v1", "v2", @@ -30,13 +37,13 @@ def test_groupby(use_layers): "d0", ] - obs = pd.DataFrame(index=pd.Index(cells, name="cell")) - obs["key"] = pd.Categorical([c[0] for c in cells]) - obs["key_superset"] = pd.Categorical([c[0] for c in cells]).map({'v': 'v', 'w': 'v', 'a': 'a', 'b': 'a', 'c': 'a', 'd': 'a'}) - obs["key_subset"] = pd.Categorical([c[1] for c in cells]) - obs["weight"] = 2.0 + df_groupby = pd.DataFrame(index=pd.Index(ax_groupby, name="cell")) + df_groupby["key"] = pd.Categorical([c[0] for c in ax_groupby]) + df_groupby["key_superset"] = pd.Categorical([c[0] for c in ax_groupby]).map({'v': 'v', 'w': 'v', 'a': 'a', 'b': 'a', 'c': 'a', 'd': 'a'}) + df_groupby["key_subset"] = pd.Categorical([c[1] for c in ax_groupby]) + df_groupby["weight"] = 2.0 - var = pd.DataFrame(index=genes) + df_base = pd.DataFrame(index=ax_base) X = np.array( [ @@ -56,23 +63,31 @@ def test_groupby(use_layers): ], dtype=np.float32, ) + if df_key == 'obs': + adata_sparse = ad.AnnData(obs=df_groupby, var=df_base, X=csr_matrix(X), layers={ 'test': csr_matrix(X) }) + adata_dense = ad.AnnData(obs=df_groupby, var=df_base, X=X, layers={ 'test': X.copy() }) # .copy needed? + else: + adata_sparse = ad.AnnData(obs=df_base, var=df_groupby, X=csr_matrix(X.T), layers={ 'test': csr_matrix(X.T) }) + adata_dense = ad.AnnData(obs=df_base, var=df_groupby, X=X.T, layers={ 'test': X.T.copy() }) # .copy needed? - adata_sparse = ad.AnnData(obs=obs, var=var, X=csr_matrix(X), layers={ 'test': csr_matrix(X) }) - adata_dense = ad.AnnData(obs=obs, var=var, X=X, layers={ 'test': X.copy() }) # .copy needed? 
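For orientation, the call pattern this parametrized test drives can be sketched outside the diff. The snippet below is a minimal, illustrative sketch and not part of the patch; the toy AnnData, its `key` column, and the variable names are all invented here. It assumes the `GroupBy` signature as it stands at this point in the series, with `df_key` selecting the axis whose annotation column is grouped on:

    import numpy as np
    import pandas as pd
    import anndata as ad
    import scanpy as sc

    # toy AnnData: 4 observations x 3 variables, grouping key stored on `var`
    adata = ad.AnnData(
        X=np.arange(12, dtype=np.float32).reshape(4, 3),
        var=pd.DataFrame(
            {"key": pd.Categorical(["a", "a", "b"])},
            index=["g1", "g2", "g3"],
        ),
    )

    # df_key='var' groups the columns of X by adata.var['key'];
    # internally the data matrix is transposed once so the groupby axis comes first
    means = sc.get.GroupBy(adata, key="key", df_key="var").mean()
    # the result is an AnnData with the original observations on obs
    # and one var entry per group ('a', 'b')

This is also the design choice the test is pinning down: the same X is fed in directly for df_key='obs' and transposed for df_key='var', so both parametrizations must agree on the aggregated values.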
- - gb = sc.get.GroupBy(adata_sparse, data=(adata_sparse.layers['test'] if use_layers else adata_sparse.X), key="key") + data_sparse = adata_sparse.layers['test'] if use_layers else None + if df_key == 'var' and use_layers: + data_sparse = data_sparse.T + gb = sc.get.GroupBy(adata_sparse, key="key", data=data_sparse, df_key=df_key) stats_sparse = gb.count_mean_var() - stats_dense = sc.get.GroupBy(adata_dense, data=(adata_dense.layers['test'] if use_layers else adata_dense.X), key="key").count_mean_var() + data_dense = adata_dense.layers['test'] if use_layers else None + if df_key == 'var' and use_layers: + data_dense = data_dense.T + stats_dense = sc.get.GroupBy(adata_dense, key="key", data=data_dense, df_key=df_key).count_mean_var() # superset columns can be kept but not subsets - assert 'key_superset' in stats_dense.obs - assert 'key_subset' not in stats_dense.obs + assert 'key_superset' in getattr(stats_sparse, df_key) + assert 'key_subset' not in getattr(stats_sparse, df_key) - assert np.allclose(stats_sparse.obs['count'], stats_dense.obs['count']) + assert np.allclose(getattr(stats_sparse, df_key)['count'], getattr(stats_sparse, df_key)['count']) assert np.allclose(stats_sparse.layers['mean'], stats_dense.layers['mean']) assert np.allclose(stats_sparse.layers['var'], stats_dense.layers['var'], equal_nan=True) - - gb_weight = sc.get.GroupBy(adata_sparse, data=(adata_sparse.layers['test'] if use_layers else adata_sparse.X), key="key", weight="weight") + gb_weight = sc.get.GroupBy(adata_sparse, key="key", data=data_sparse, weight="weight", df_key=df_key) stats_weight = gb_weight.count_mean_var() sum_ = gb.sum() sum_weight = gb_weight.sum() @@ -82,6 +97,24 @@ def test_groupby(use_layers): assert np.allclose(stats_sparse.layers['var'], stats_dense.layers['var'], equal_nan=True) key_set = ["v", "w"] - mean_key_set = sc.get.GroupBy(adata_sparse, data=(adata_sparse.layers['test'] if use_layers else adata_sparse.X), key="key", key_set=key_set).mean() - assert np.allclose(stats_sparse[stats_sparse.obs.index.isin(key_set), :].layers['mean'], mean_key_set.X) + mean_key_set = sc.get.GroupBy(adata_sparse, key="key", data=data_sparse, key_set=key_set, df_key=df_key).mean() + subset_idx = getattr(stats_sparse, df_key).index.isin(key_set) + subset = stats_sparse[subset_idx, :] if df_key == 'obs' else stats_sparse[:, subset_idx] + assert np.allclose(subset.layers['mean'], mean_key_set.X) + + df = pd.DataFrame( + index=getattr(adata_dense, df_key)["key"], + columns=getattr(adata_dense, f"{'var' if df_key == 'obs' else 'obs'}_names"), + data=adata_dense.X if df_key == 'obs' else adata_dense.X.T, + ) + grouped_agg_df = df.groupby('key').agg(["count", "mean", "var"]).swaplevel(axis=1).sort_index(axis=1) + mean = stats_dense.layers['mean'] + if df_key == 'var': + mean = mean.T + assert np.allclose(mean, grouped_agg_df['mean'].values) + var = stats_dense.layers['var'] + if df_key == 'var': + var = var.T + assert np.allclose(var, grouped_agg_df['var'].values, equal_nan=True) + assert np.allclose(getattr(stats_dense, df_key)['count'], grouped_agg_df['count']['A'].values) # returns for both columns but counts only needs one because it is the same \ No newline at end of file From b9d75f9d538995065fffed24cdd7cf2dbe10ebb7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 3 Aug 2023 17:06:56 +0200 Subject: [PATCH 08/89] (chore): add temporary note for now --- scanpy/get/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/get/groupby.py b/scanpy/get/groupby.py index 
82afafc91..814dbd3b1 100644 --- a/scanpy/get/groupby.py +++ b/scanpy/get/groupby.py @@ -50,7 +50,7 @@ class GroupBy: key Group key field in adata.obs. data - Element of the AnnData to aggregate (default None yields adata.X) + Element of the AnnData to aggregate (default None yields adata.X). Should have the same dimensions as the AnnData object. weight Weight field in adata.obs of type float. key_set From 43b5d3f9034dfbd84539c5f9240d8f104bff1357 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 3 Aug 2023 17:08:20 +0200 Subject: [PATCH 09/89] (chore): change `df_key` -> `groupby_df_key` --- scanpy/get/groupby.py | 39 ++++++++++++++++++----------------- scanpy/tests/test_groupby.py | 40 ++++++++++++++++++------------------ 2 files changed, 40 insertions(+), 39 deletions(-) diff --git a/scanpy/get/groupby.py b/scanpy/get/groupby.py index 814dbd3b1..19b7004b1 100644 --- a/scanpy/get/groupby.py +++ b/scanpy/get/groupby.py @@ -55,7 +55,7 @@ class GroupBy: Weight field in adata.obs of type float. key_set Subset of keys to which to filter. - df_key + groupby_df_key One of 'obs' or 'var' on which to groupby """ @@ -65,7 +65,8 @@ class GroupBy: weight: Optional[str] key_set: AbstractSet[str] _key_index: Optional[np.ndarray] # caution, may be stale if attributes are updated - _df_key: str + _groupby_df_key: str + _base_key: str def __init__( self, @@ -75,17 +76,17 @@ def __init__( data: Optional[Union[np.ndarray, spmatrix]] = None, weight: Optional[str] = None, key_set: Optional[Iterable[str]] = None, - df_key: str = 'obs', + groupby_df_key: str = 'obs', ): self.adata = adata self.data = adata.X if data is None else data - if df_key == 'var' and data is None: + if groupby_df_key == 'var' and data is None: self.data = self.data.T self.key = key self.weight = weight self.key_set = None if key_set is None else dict.fromkeys(key_set).keys() - self._df_key = df_key - self._base_key = 'obs' if df_key != 'obs' else 'var' + self._groupby_df_key = groupby_df_key + self._base_key = 'obs' if groupby_df_key != 'obs' else 'var' self._key_index = None @cached_property @@ -96,17 +97,17 @@ def _superset_columns(self) -> List[str]: List[str]: Superset columns. 
""" columns = [] - groupy_key_codes = getattr(self.adata, self._df_key)[self.key].astype('category') - for key in getattr(self.adata, self._df_key): + groupy_key_codes = getattr(self.adata, self._groupby_df_key)[self.key].astype('category') + for key in getattr(self.adata, self._groupby_df_key): if key != self.key: - key_codes = getattr(self.adata, self._df_key)[key].astype('category') + key_codes = getattr(self.adata, self._groupby_df_key)[key].astype('category') if all([key_codes[groupy_key_codes == group_key_code].nunique() == 1 for group_key_code in groupy_key_codes]): columns += [key] return columns @cached_property def df_grouped(self) -> pd.DataFrame: - df = getattr(self.adata, self._df_key).copy() + df = getattr(self.adata, self._groupby_df_key).copy() if self.key_set is not None: df = df[df[self.key].isin(self.key_set)] if df[self.key].dtype.name == 'category': @@ -122,7 +123,7 @@ def base_axis_indices(self) -> pd.Index: @cached_property def obs_var_dict(self) -> dict: return { - self._df_key: self.df_grouped, + self._groupby_df_key: self.df_grouped, self._base_key: self.base_axis_indices } @@ -154,7 +155,7 @@ def sum(self) -> AnnData: X = utils.asarray(A * self.data) return AnnData( **self.obs_var_dict, - X=X if self._df_key == 'obs' else X.T + X=X if self._groupby_df_key == 'obs' else X.T ) def mean(self) -> AnnData: @@ -169,7 +170,7 @@ def mean(self) -> AnnData: X = utils.asarray(A * self.data) return AnnData( **self.obs_var_dict, - X=X if self._df_key == 'obs' else X.T + X=X if self._groupby_df_key == 'obs' else X.T ) def count_mean_var(self, dof: int = 1) -> AnnData: @@ -200,7 +201,7 @@ def count_mean_var(self, dof: int = 1) -> AnnData: sq_mean = mean_ ** 2 else: A_unweighted, _ = GroupBy( - self.adata, self.key, key_set=self.key_set, df_key=self._df_key + self.adata, self.key, key_set=self.key_set, groupby_df_key=self._groupby_df_key ).sparse_aggregator() mean_unweighted = utils.asarray(A_unweighted * self.data) sq_mean = 2 * mean_ * mean_unweighted + mean_unweighted ** 2 @@ -211,12 +212,12 @@ def count_mean_var(self, dof: int = 1) -> AnnData: if dof != 0: var_ *= (count_ / (count_ - dof))[:, np.newaxis] obs_var_dict = self.obs_var_dict - obs_var_dict[self._df_key]['count'] = count_ + obs_var_dict[self._groupby_df_key]['count'] = count_ return AnnData( **obs_var_dict, layers={ - 'mean': mean_ if self._df_key == 'obs' else mean_.T, - 'var': var_ if self._df_key == 'obs' else var_.T + 'mean': mean_ if self._groupby_df_key == 'obs' else mean_.T, + 'var': var_ if self._groupby_df_key == 'obs' else var_.T } ) @@ -277,7 +278,7 @@ def _filter_indices(key_set, keys, key_index, df_index, weight_value=None): weight_value = weight_value[mask] return keys, key_index, df_index, weight_value - key_value = getattr(self.adata, self._df_key)[self.key] + key_value = getattr(self.adata, self._groupby_df_key)[self.key] keys, key_index = np.unique( _ndarray_from_seq(key_value), return_inverse=True ) @@ -285,7 +286,7 @@ def _filter_indices(key_set, keys, key_index, df_index, weight_value=None): if self.weight is None: weight_value = None else: - weight_value = getattr(self.adata, self._df_key)[self.weight].values[df_index] + weight_value = getattr(self.adata, self._groupby_df_key)[self.weight].values[df_index] if self.key_set is not None: keys, key_index, df_index, weight_value = _filter_indices( self.key_set, keys, key_index, df_index, weight_value diff --git a/scanpy/tests/test_groupby.py b/scanpy/tests/test_groupby.py index 5d5fc05df..870aee692 100644 --- a/scanpy/tests/test_groupby.py +++ 
b/scanpy/tests/test_groupby.py @@ -13,13 +13,13 @@ ], ) @pytest.mark.parametrize( - 'df_key', + 'groupby_df_key', [ 'obs', 'var', ], ) -def test_groupby(use_layers, df_key): +def test_groupby(use_layers, groupby_df_key): ax_base = ["A", "B"] ax_groupby = [ "v0", @@ -63,7 +63,7 @@ def test_groupby(use_layers, df_key): ], dtype=np.float32, ) - if df_key == 'obs': + if groupby_df_key == 'obs': adata_sparse = ad.AnnData(obs=df_groupby, var=df_base, X=csr_matrix(X), layers={ 'test': csr_matrix(X) }) adata_dense = ad.AnnData(obs=df_groupby, var=df_base, X=X, layers={ 'test': X.copy() }) # .copy needed? else: @@ -71,23 +71,23 @@ def test_groupby(use_layers, df_key): adata_dense = ad.AnnData(obs=df_base, var=df_groupby, X=X.T, layers={ 'test': X.T.copy() }) # .copy needed? data_sparse = adata_sparse.layers['test'] if use_layers else None - if df_key == 'var' and use_layers: + if groupby_df_key == 'var' and use_layers: data_sparse = data_sparse.T - gb = sc.get.GroupBy(adata_sparse, key="key", data=data_sparse, df_key=df_key) + gb = sc.get.GroupBy(adata_sparse, key="key", data=data_sparse, groupby_df_key=groupby_df_key) stats_sparse = gb.count_mean_var() data_dense = adata_dense.layers['test'] if use_layers else None - if df_key == 'var' and use_layers: + if groupby_df_key == 'var' and use_layers: data_dense = data_dense.T - stats_dense = sc.get.GroupBy(adata_dense, key="key", data=data_dense, df_key=df_key).count_mean_var() + stats_dense = sc.get.GroupBy(adata_dense, key="key", data=data_dense, groupby_df_key=groupby_df_key).count_mean_var() # superset columns can be kept but not subsets - assert 'key_superset' in getattr(stats_sparse, df_key) - assert 'key_subset' not in getattr(stats_sparse, df_key) + assert 'key_superset' in getattr(stats_sparse, groupby_df_key) + assert 'key_subset' not in getattr(stats_sparse, groupby_df_key) - assert np.allclose(getattr(stats_sparse, df_key)['count'], getattr(stats_sparse, df_key)['count']) + assert np.allclose(getattr(stats_sparse, groupby_df_key)['count'], getattr(stats_sparse, groupby_df_key)['count']) assert np.allclose(stats_sparse.layers['mean'], stats_dense.layers['mean']) assert np.allclose(stats_sparse.layers['var'], stats_dense.layers['var'], equal_nan=True) - gb_weight = sc.get.GroupBy(adata_sparse, key="key", data=data_sparse, weight="weight", df_key=df_key) + gb_weight = sc.get.GroupBy(adata_sparse, key="key", data=data_sparse, weight="weight", groupby_df_key=groupby_df_key) stats_weight = gb_weight.count_mean_var() sum_ = gb.sum() sum_weight = gb_weight.sum() @@ -97,24 +97,24 @@ def test_groupby(use_layers, df_key): assert np.allclose(stats_sparse.layers['var'], stats_dense.layers['var'], equal_nan=True) key_set = ["v", "w"] - mean_key_set = sc.get.GroupBy(adata_sparse, key="key", data=data_sparse, key_set=key_set, df_key=df_key).mean() - subset_idx = getattr(stats_sparse, df_key).index.isin(key_set) - subset = stats_sparse[subset_idx, :] if df_key == 'obs' else stats_sparse[:, subset_idx] + mean_key_set = sc.get.GroupBy(adata_sparse, key="key", data=data_sparse, key_set=key_set, groupby_df_key=groupby_df_key).mean() + subset_idx = getattr(stats_sparse, groupby_df_key).index.isin(key_set) + subset = stats_sparse[subset_idx, :] if groupby_df_key == 'obs' else stats_sparse[:, subset_idx] assert np.allclose(subset.layers['mean'], mean_key_set.X) df = pd.DataFrame( - index=getattr(adata_dense, df_key)["key"], - columns=getattr(adata_dense, f"{'var' if df_key == 'obs' else 'obs'}_names"), - data=adata_dense.X if df_key == 'obs' else adata_dense.X.T, 
+ index=getattr(adata_dense, groupby_df_key)["key"], + columns=getattr(adata_dense, f"{'var' if groupby_df_key == 'obs' else 'obs'}_names"), + data=adata_dense.X if groupby_df_key == 'obs' else adata_dense.X.T, ) grouped_agg_df = df.groupby('key').agg(["count", "mean", "var"]).swaplevel(axis=1).sort_index(axis=1) mean = stats_dense.layers['mean'] - if df_key == 'var': + if groupby_df_key == 'var': mean = mean.T assert np.allclose(mean, grouped_agg_df['mean'].values) var = stats_dense.layers['var'] - if df_key == 'var': + if groupby_df_key == 'var': var = var.T assert np.allclose(var, grouped_agg_df['var'].values, equal_nan=True) - assert np.allclose(getattr(stats_dense, df_key)['count'], grouped_agg_df['count']['A'].values) # returns for both columns but counts only needs one because it is the same + assert np.allclose(getattr(stats_dense, groupby_df_key)['count'], grouped_agg_df['count']['A'].values) # returns for both columns but counts only needs one because it is the same \ No newline at end of file From 2399a5c2032321d4de5059175d249da5019a0d1f Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Aug 2023 09:49:14 +0200 Subject: [PATCH 10/89] (chore): clean up public/private methods and do some renaming --- scanpy/get/groupby.py | 112 +++++++++++++++++++++--------------------- 1 file changed, 56 insertions(+), 56 deletions(-) diff --git a/scanpy/get/groupby.py b/scanpy/get/groupby.py index 19b7004b1..3db6256ba 100644 --- a/scanpy/get/groupby.py +++ b/scanpy/get/groupby.py @@ -32,7 +32,7 @@ class GroupBy: Moments are computed using weighted sum aggregation of AnnData obsevations per variable (i.e., feature) via multiplication by a sparse coordinate matrix A, exposed by - `sparse_aggregator`. The approach works with data in ndarray or scipy sparse formats, with + `_sparse_aggregator`. The approach works with data in ndarray or scipy sparse formats, with no view or copy overhead on runtime or memory, even when filtering keys. Runtime is effectively computation of the product A * X, i.e. the count of (non-zero) @@ -59,14 +59,14 @@ class GroupBy: One of 'obs' or 'var' on which to groupby """ - adata: AnnData - key: str - data: Union[np.ndarray, spmatrix] - weight: Optional[str] - key_set: AbstractSet[str] + _adata: AnnData + _key: str + _data: Union[np.ndarray, spmatrix] + _weight: Optional[str] + _key_set: AbstractSet[str] _key_index: Optional[np.ndarray] # caution, may be stale if attributes are updated - _groupby_df_key: str - _base_key: str + _groupby_df_key: Literal['obs', 'var'] + _fixed_df_key: str def __init__( self, @@ -76,17 +76,17 @@ def __init__( data: Optional[Union[np.ndarray, spmatrix]] = None, weight: Optional[str] = None, key_set: Optional[Iterable[str]] = None, - groupby_df_key: str = 'obs', + groupby_df_key: Literal['obs', 'var'] = 'obs', ): - self.adata = adata - self.data = adata.X if data is None else data + self._adata = adata + self._data = adata.X if data is None else data if groupby_df_key == 'var' and data is None: - self.data = self.data.T - self.key = key - self.weight = weight - self.key_set = None if key_set is None else dict.fromkeys(key_set).keys() + self._data = self._data.T # transposing once here is easier than transposing this everywhere. first axis should be the groupby axis. 
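The new comment is the crux of the refactor's data layout: with one transpose up front, every aggregation can assume axis 0 of `self._data` is the groupby axis. The identity being relied on, that grouping the var axis of X equals grouping the obs axis of X.T, can be checked in a few lines of numpy; this sketch is illustrative only and not part of the patch:

    import numpy as np

    X = np.arange(6).reshape(2, 3)   # 2 observations x 3 variables
    groups = np.array([0, 0, 1])     # grouping over the var axis

    # per-group sums taken over the columns of X ...
    by_var = np.stack([X[:, groups == g].sum(axis=1) for g in (0, 1)], axis=1)
    # ... equal per-group sums taken over the rows of X.T
    by_obs_of_t = np.stack([X.T[groups == g].sum(axis=0) for g in (0, 1)], axis=1)
    assert np.array_equal(by_var, by_obs_of_t)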
+ self._key = key + self._weight = weight + self._key_set = None if key_set is None else dict.fromkeys(key_set).keys() self._groupby_df_key = groupby_df_key - self._base_key = 'obs' if groupby_df_key != 'obs' else 'var' + self._fixed_df_key = 'obs' if groupby_df_key != 'obs' else 'var' self._key_index = None @cached_property @@ -97,34 +97,34 @@ def _superset_columns(self) -> List[str]: List[str]: Superset columns. """ columns = [] - groupy_key_codes = getattr(self.adata, self._groupby_df_key)[self.key].astype('category') - for key in getattr(self.adata, self._groupby_df_key): - if key != self.key: - key_codes = getattr(self.adata, self._groupby_df_key)[key].astype('category') + groupy_key_codes = getattr(self._adata, self._groupby_df_key)[self._key].astype('category') + for key in getattr(self._adata, self._groupby_df_key): + if key != self._key: + key_codes = getattr(self._adata, self._groupby_df_key)[key].astype('category') if all([key_codes[groupy_key_codes == group_key_code].nunique() == 1 for group_key_code in groupy_key_codes]): columns += [key] return columns @cached_property - def df_grouped(self) -> pd.DataFrame: - df = getattr(self.adata, self._groupby_df_key).copy() - if self.key_set is not None: - df = df[df[self.key].isin(self.key_set)] - if df[self.key].dtype.name == 'category': - df[self.key] = df[self.key].cat.remove_unused_categories() - return df.groupby(self.key).first()[self._superset_columns] + def _df_grouped(self) -> pd.DataFrame: + df = getattr(self._adata, self._groupby_df_key).copy() + if self._key_set is not None: + df = df[df[self._key].isin(self._key_set)] + if df[self._key].dtype.name == 'category': + df[self._key] = df[self._key].cat.remove_unused_categories() + return df.groupby(self._key).first()[self._superset_columns] @cached_property - def base_axis_indices(self) -> pd.Index: + def _base_axis_indices(self) -> pd.Index: return pd.DataFrame( - index=pd.Index(getattr(self.adata, f'{self._base_key}_names').copy()) + index=pd.Index(getattr(self._adata, f'{self._fixed_df_key}_names').copy()) ) @cached_property - def obs_var_dict(self) -> dict: + def _obs_var_dict(self) -> dict: return { - self._groupby_df_key: self.df_grouped, - self._base_key: self.base_axis_indices + self._groupby_df_key: self._df_grouped, + self._fixed_df_key: self._base_axis_indices } def count(self) -> pd.Series: @@ -139,7 +139,7 @@ def count(self) -> pd.Series: count_ = np.bincount(key_index) return pd.Series( data=count_, - index=pd.Index(keys, name=self.key, tupleize_cols=False), + index=pd.Index(keys, name=self._key, tupleize_cols=False), name="count", ) @@ -151,10 +151,10 @@ def sum(self) -> AnnData: ------- AnnData with sum in X indexed on obs by key with var from adata. """ - A, _ = self.sparse_aggregator(normalize=False) - X = utils.asarray(A * self.data) + A, _ = self._sparse_aggregator(normalize=False) + X = utils.asarray(A * self._data) return AnnData( - **self.obs_var_dict, + **self._obs_var_dict, X=X if self._groupby_df_key == 'obs' else X.T ) @@ -166,10 +166,10 @@ def mean(self) -> AnnData: ------- AnnData with means in X indexed on obs by key with var from adata. 
""" - A, _ = self.sparse_aggregator(normalize=True) - X = utils.asarray(A * self.data) + A, _ = self._sparse_aggregator(normalize=True) + X = utils.asarray(A * self._data) return AnnData( - **self.obs_var_dict, + **self._obs_var_dict, X=X if self._groupby_df_key == 'obs' else X.T ) @@ -193,25 +193,25 @@ def count_mean_var(self, dof: int = 1) -> AnnData: AnnData with mean and var in layers indexed on obs by key with var from adata. Counts are in obs under counts. """ assert dof >= 0 - A, _ = self.sparse_aggregator(normalize=True) + A, _ = self._sparse_aggregator(normalize=True) count_ = np.bincount(self._key_index) - mean_ = utils.asarray(A @ self.data) - mean_sq = utils.asarray(A @ _power(self.data, 2)) - if self.weight is None: + mean_ = utils.asarray(A @ self._data) + mean_sq = utils.asarray(A @ _power(self._data, 2)) + if self._weight is None: sq_mean = mean_ ** 2 else: A_unweighted, _ = GroupBy( - self.adata, self.key, key_set=self.key_set, groupby_df_key=self._groupby_df_key - ).sparse_aggregator() - mean_unweighted = utils.asarray(A_unweighted * self.data) + self._adata, self._key, key_set=self._key_set, groupby_df_key=self._groupby_df_key + )._sparse_aggregator() + mean_unweighted = utils.asarray(A_unweighted * self._data) sq_mean = 2 * mean_ * mean_unweighted + mean_unweighted ** 2 var_ = mean_sq - sq_mean - precision = 2 << (42 if self.data.dtype == np.float64 else 20) + precision = 2 << (42 if self._data.dtype == np.float64 else 20) # detects loss of precision in mean_sq - sq_mean, which suggests variance is 0 var_[precision * var_ < sq_mean] = 0 if dof != 0: var_ *= (count_ / (count_ - dof))[:, np.newaxis] - obs_var_dict = self.obs_var_dict + obs_var_dict = self._obs_var_dict obs_var_dict[self._groupby_df_key]['count'] = count_ return AnnData( **obs_var_dict, @@ -221,7 +221,7 @@ def count_mean_var(self, dof: int = 1) -> AnnData: } ) - def sparse_aggregator( + def _sparse_aggregator( self, normalize: bool = False ) -> Tuple[coo_matrix, np.ndarray]: """ @@ -246,11 +246,11 @@ def sparse_aggregator( keys, key_index, df_index, weight_value = self._extract_indices() if df_index is None: df_index = np.arange(len(key_index)) - if self.weight is None: + if self._weight is None: weight_value = np.ones(len(key_index)) A = coo_matrix( (weight_value, (key_index, df_index)), - shape=(len(keys), self.data.shape[0]), + shape=(len(keys), self._data.shape[0]), ) if normalize: n_row = A.shape[0] @@ -278,18 +278,18 @@ def _filter_indices(key_set, keys, key_index, df_index, weight_value=None): weight_value = weight_value[mask] return keys, key_index, df_index, weight_value - key_value = getattr(self.adata, self._groupby_df_key)[self.key] + key_value = getattr(self._adata, self._groupby_df_key)[self._key] keys, key_index = np.unique( _ndarray_from_seq(key_value), return_inverse=True ) df_index = np.arange(len(key_index)) - if self.weight is None: + if self._weight is None: weight_value = None else: - weight_value = getattr(self.adata, self._groupby_df_key)[self.weight].values[df_index] - if self.key_set is not None: + weight_value = getattr(self._adata, self._groupby_df_key)[self._weight].values[df_index] + if self._key_set is not None: keys, key_index, df_index, weight_value = _filter_indices( - self.key_set, keys, key_index, df_index, weight_value + self._key_set, keys, key_index, df_index, weight_value ) self._key_index = key_index # passed to count and count_mean_var to avoid re-extracting in the latter return keys, key_index, df_index, weight_value From f02dacca16d6ab50fde17aa789410481c7f4aec0 
Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Aug 2023 09:51:30 +0200 Subject: [PATCH 11/89] (chore): `black` --- scanpy/get/groupby.py | 64 ++++++++++++++++----------- scanpy/tests/test_groupby.py | 84 ++++++++++++++++++++++++++++-------- 2 files changed, 106 insertions(+), 42 deletions(-) diff --git a/scanpy/get/groupby.py b/scanpy/get/groupby.py index 3db6256ba..ac5a753bc 100644 --- a/scanpy/get/groupby.py +++ b/scanpy/get/groupby.py @@ -7,7 +7,7 @@ Tuple, Union, Literal, - List + List, ) from anndata import AnnData, utils @@ -16,6 +16,7 @@ import collections.abc as cabc from scipy.sparse import coo_matrix, dia_matrix, spmatrix + class GroupBy: """ Functionality for grouping and aggregating AnnData observations by key, per variable. @@ -81,7 +82,9 @@ def __init__( self._adata = adata self._data = adata.X if data is None else data if groupby_df_key == 'var' and data is None: - self._data = self._data.T # transposing once here is easier than transposing this everywhere. first axis should be the groupby axis. + self._data = ( + self._data.T + ) # transposing once here is easier than transposing this everywhere. first axis should be the groupby axis. self._key = key self._weight = weight self._key_set = None if key_set is None else dict.fromkeys(key_set).keys() @@ -97,14 +100,23 @@ def _superset_columns(self) -> List[str]: List[str]: Superset columns. """ columns = [] - groupy_key_codes = getattr(self._adata, self._groupby_df_key)[self._key].astype('category') + groupy_key_codes = getattr(self._adata, self._groupby_df_key)[self._key].astype( + 'category' + ) for key in getattr(self._adata, self._groupby_df_key): if key != self._key: - key_codes = getattr(self._adata, self._groupby_df_key)[key].astype('category') - if all([key_codes[groupy_key_codes == group_key_code].nunique() == 1 for group_key_code in groupy_key_codes]): + key_codes = getattr(self._adata, self._groupby_df_key)[key].astype( + 'category' + ) + if all( + [ + key_codes[groupy_key_codes == group_key_code].nunique() == 1 + for group_key_code in groupy_key_codes + ] + ): columns += [key] return columns - + @cached_property def _df_grouped(self) -> pd.DataFrame: df = getattr(self._adata, self._groupby_df_key).copy() @@ -113,18 +125,18 @@ def _df_grouped(self) -> pd.DataFrame: if df[self._key].dtype.name == 'category': df[self._key] = df[self._key].cat.remove_unused_categories() return df.groupby(self._key).first()[self._superset_columns] - + @cached_property def _base_axis_indices(self) -> pd.Index: return pd.DataFrame( - index=pd.Index(getattr(self._adata, f'{self._fixed_df_key}_names').copy()) + index=pd.Index(getattr(self._adata, f'{self._fixed_df_key}_names').copy()) ) - + @cached_property def _obs_var_dict(self) -> dict: return { self._groupby_df_key: self._df_grouped, - self._fixed_df_key: self._base_axis_indices + self._fixed_df_key: self._base_axis_indices, } def count(self) -> pd.Series: @@ -154,8 +166,7 @@ def sum(self) -> AnnData: A, _ = self._sparse_aggregator(normalize=False) X = utils.asarray(A * self._data) return AnnData( - **self._obs_var_dict, - X=X if self._groupby_df_key == 'obs' else X.T + **self._obs_var_dict, X=X if self._groupby_df_key == 'obs' else X.T ) def mean(self) -> AnnData: @@ -169,8 +180,7 @@ def mean(self) -> AnnData: A, _ = self._sparse_aggregator(normalize=True) X = utils.asarray(A * self._data) return AnnData( - **self._obs_var_dict, - X=X if self._groupby_df_key == 'obs' else X.T + **self._obs_var_dict, X=X if self._groupby_df_key == 'obs' else X.T ) def count_mean_var(self, dof: int 
= 1) -> AnnData: @@ -198,13 +208,16 @@ def count_mean_var(self, dof: int = 1) -> AnnData: mean_ = utils.asarray(A @ self._data) mean_sq = utils.asarray(A @ _power(self._data, 2)) if self._weight is None: - sq_mean = mean_ ** 2 + sq_mean = mean_**2 else: A_unweighted, _ = GroupBy( - self._adata, self._key, key_set=self._key_set, groupby_df_key=self._groupby_df_key + self._adata, + self._key, + key_set=self._key_set, + groupby_df_key=self._groupby_df_key, )._sparse_aggregator() mean_unweighted = utils.asarray(A_unweighted * self._data) - sq_mean = 2 * mean_ * mean_unweighted + mean_unweighted ** 2 + sq_mean = 2 * mean_ * mean_unweighted + mean_unweighted**2 var_ = mean_sq - sq_mean precision = 2 << (42 if self._data.dtype == np.float64 else 20) # detects loss of precision in mean_sq - sq_mean, which suggests variance is 0 @@ -214,11 +227,11 @@ def count_mean_var(self, dof: int = 1) -> AnnData: obs_var_dict = self._obs_var_dict obs_var_dict[self._groupby_df_key]['count'] = count_ return AnnData( - **obs_var_dict, + **obs_var_dict, layers={ 'mean': mean_ if self._groupby_df_key == 'obs' else mean_.T, - 'var': var_ if self._groupby_df_key == 'obs' else var_.T - } + 'var': var_ if self._groupby_df_key == 'obs' else var_.T, + }, ) def _sparse_aggregator( @@ -255,7 +268,7 @@ def _sparse_aggregator( if normalize: n_row = A.shape[0] row_sums = np.asarray(A.sum(axis=1)) - D = dia_matrix(((row_sums.T ** -1), [0]), shape=(n_row, n_row)) + D = dia_matrix(((row_sums.T**-1), [0]), shape=(n_row, n_row)) A = D * A return A, keys @@ -279,14 +292,14 @@ def _filter_indices(key_set, keys, key_index, df_index, weight_value=None): return keys, key_index, df_index, weight_value key_value = getattr(self._adata, self._groupby_df_key)[self._key] - keys, key_index = np.unique( - _ndarray_from_seq(key_value), return_inverse=True - ) + keys, key_index = np.unique(_ndarray_from_seq(key_value), return_inverse=True) df_index = np.arange(len(key_index)) if self._weight is None: weight_value = None else: - weight_value = getattr(self._adata, self._groupby_df_key)[self._weight].values[df_index] + weight_value = getattr(self._adata, self._groupby_df_key)[ + self._weight + ].values[df_index] if self._key_set is not None: keys, key_index, df_index, weight_value = _filter_indices( self._key_set, keys, key_index, df_index, weight_value @@ -294,6 +307,7 @@ def _filter_indices(key_set, keys, key_index, df_index, weight_value=None): self._key_index = key_index # passed to count and count_mean_var to avoid re-extracting in the latter return keys, key_index, df_index, weight_value + def _power(X, power): return X ** power if isinstance(X, np.ndarray) else X.power(power) diff --git a/scanpy/tests/test_groupby.py b/scanpy/tests/test_groupby.py index 870aee692..5d3a739ba 100644 --- a/scanpy/tests/test_groupby.py +++ b/scanpy/tests/test_groupby.py @@ -5,6 +5,7 @@ from scipy.sparse import csr_matrix import pytest + @pytest.mark.parametrize( 'use_layers', [ @@ -39,7 +40,9 @@ def test_groupby(use_layers, groupby_df_key): df_groupby = pd.DataFrame(index=pd.Index(ax_groupby, name="cell")) df_groupby["key"] = pd.Categorical([c[0] for c in ax_groupby]) - df_groupby["key_superset"] = pd.Categorical([c[0] for c in ax_groupby]).map({'v': 'v', 'w': 'v', 'a': 'a', 'b': 'a', 'c': 'a', 'd': 'a'}) + df_groupby["key_superset"] = pd.Categorical([c[0] for c in ax_groupby]).map( + {'v': 'v', 'w': 'v', 'a': 'a', 'b': 'a', 'c': 'a', 'd': 'a'} + ) df_groupby["key_subset"] = pd.Categorical([c[1] for c in ax_groupby]) df_groupby["weight"] = 2.0 @@ -64,50 +67,95 
@@ def test_groupby(use_layers, groupby_df_key): dtype=np.float32, ) if groupby_df_key == 'obs': - adata_sparse = ad.AnnData(obs=df_groupby, var=df_base, X=csr_matrix(X), layers={ 'test': csr_matrix(X) }) - adata_dense = ad.AnnData(obs=df_groupby, var=df_base, X=X, layers={ 'test': X.copy() }) # .copy needed? + adata_sparse = ad.AnnData( + obs=df_groupby, var=df_base, X=csr_matrix(X), layers={'test': csr_matrix(X)} + ) + adata_dense = ad.AnnData( + obs=df_groupby, var=df_base, X=X, layers={'test': X.copy()} + ) # .copy needed? else: - adata_sparse = ad.AnnData(obs=df_base, var=df_groupby, X=csr_matrix(X.T), layers={ 'test': csr_matrix(X.T) }) - adata_dense = ad.AnnData(obs=df_base, var=df_groupby, X=X.T, layers={ 'test': X.T.copy() }) # .copy needed? + adata_sparse = ad.AnnData( + obs=df_base, + var=df_groupby, + X=csr_matrix(X.T), + layers={'test': csr_matrix(X.T)}, + ) + adata_dense = ad.AnnData( + obs=df_base, var=df_groupby, X=X.T, layers={'test': X.T.copy()} + ) # .copy needed? data_sparse = adata_sparse.layers['test'] if use_layers else None if groupby_df_key == 'var' and use_layers: data_sparse = data_sparse.T - gb = sc.get.GroupBy(adata_sparse, key="key", data=data_sparse, groupby_df_key=groupby_df_key) + gb = sc.get.GroupBy( + adata_sparse, key="key", data=data_sparse, groupby_df_key=groupby_df_key + ) stats_sparse = gb.count_mean_var() data_dense = adata_dense.layers['test'] if use_layers else None if groupby_df_key == 'var' and use_layers: data_dense = data_dense.T - stats_dense = sc.get.GroupBy(adata_dense, key="key", data=data_dense, groupby_df_key=groupby_df_key).count_mean_var() + stats_dense = sc.get.GroupBy( + adata_dense, key="key", data=data_dense, groupby_df_key=groupby_df_key + ).count_mean_var() # superset columns can be kept but not subsets assert 'key_superset' in getattr(stats_sparse, groupby_df_key) assert 'key_subset' not in getattr(stats_sparse, groupby_df_key) - assert np.allclose(getattr(stats_sparse, groupby_df_key)['count'], getattr(stats_sparse, groupby_df_key)['count']) + assert np.allclose( + getattr(stats_sparse, groupby_df_key)['count'], + getattr(stats_sparse, groupby_df_key)['count'], + ) assert np.allclose(stats_sparse.layers['mean'], stats_dense.layers['mean']) - assert np.allclose(stats_sparse.layers['var'], stats_dense.layers['var'], equal_nan=True) - gb_weight = sc.get.GroupBy(adata_sparse, key="key", data=data_sparse, weight="weight", groupby_df_key=groupby_df_key) + assert np.allclose( + stats_sparse.layers['var'], stats_dense.layers['var'], equal_nan=True + ) + gb_weight = sc.get.GroupBy( + adata_sparse, + key="key", + data=data_sparse, + weight="weight", + groupby_df_key=groupby_df_key, + ) stats_weight = gb_weight.count_mean_var() sum_ = gb.sum() sum_weight = gb_weight.sum() assert np.allclose(2 * sum_.X, sum_weight.X) assert np.allclose(stats_sparse.layers['mean'], stats_weight.layers['mean']) - assert np.allclose(stats_sparse.layers['var'], stats_dense.layers['var'], equal_nan=True) + assert np.allclose( + stats_sparse.layers['var'], stats_dense.layers['var'], equal_nan=True + ) key_set = ["v", "w"] - mean_key_set = sc.get.GroupBy(adata_sparse, key="key", data=data_sparse, key_set=key_set, groupby_df_key=groupby_df_key).mean() + mean_key_set = sc.get.GroupBy( + adata_sparse, + key="key", + data=data_sparse, + key_set=key_set, + groupby_df_key=groupby_df_key, + ).mean() subset_idx = getattr(stats_sparse, groupby_df_key).index.isin(key_set) - subset = stats_sparse[subset_idx, :] if groupby_df_key == 'obs' else stats_sparse[:, subset_idx] + 
subset = ( + stats_sparse[subset_idx, :] + if groupby_df_key == 'obs' + else stats_sparse[:, subset_idx] + ) assert np.allclose(subset.layers['mean'], mean_key_set.X) df = pd.DataFrame( index=getattr(adata_dense, groupby_df_key)["key"], - columns=getattr(adata_dense, f"{'var' if groupby_df_key == 'obs' else 'obs'}_names"), + columns=getattr( + adata_dense, f"{'var' if groupby_df_key == 'obs' else 'obs'}_names" + ), data=adata_dense.X if groupby_df_key == 'obs' else adata_dense.X.T, ) - grouped_agg_df = df.groupby('key').agg(["count", "mean", "var"]).swaplevel(axis=1).sort_index(axis=1) + grouped_agg_df = ( + df.groupby('key') + .agg(["count", "mean", "var"]) + .swaplevel(axis=1) + .sort_index(axis=1) + ) mean = stats_dense.layers['mean'] if groupby_df_key == 'var': mean = mean.T @@ -116,5 +164,7 @@ def test_groupby(use_layers, groupby_df_key): if groupby_df_key == 'var': var = var.T assert np.allclose(var, grouped_agg_df['var'].values, equal_nan=True) - assert np.allclose(getattr(stats_dense, groupby_df_key)['count'], grouped_agg_df['count']['A'].values) # returns for both columns but counts only needs one because it is the same - \ No newline at end of file + assert np.allclose( + getattr(stats_dense, groupby_df_key)['count'], + grouped_agg_df['count']['A'].values, + ) # returns for both columns but counts only needs one because it is the same From a50ea3b1a0edd6a2de8422e860dc7429ecbfbd0d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Aug 2023 12:14:58 +0200 Subject: [PATCH 12/89] (feat): refactor to allow for functional API --- scanpy/get/__init__.py | 2 +- scanpy/get/groupby.py | 140 +++++++++++------ scanpy/tests/test_groupby.py | 288 +++++++++++++++++++++++++++++------ 3 files changed, 331 insertions(+), 99 deletions(-) diff --git a/scanpy/get/__init__.py b/scanpy/get/__init__.py index 243669b0f..7a4398879 100644 --- a/scanpy/get/__init__.py +++ b/scanpy/get/__init__.py @@ -4,4 +4,4 @@ # Private from .get import _get_obs_rep, _set_obs_rep -from .groupby import GroupBy +from .groupby import aggregated diff --git a/scanpy/get/groupby.py b/scanpy/get/groupby.py index ac5a753bc..85d63e7e7 100644 --- a/scanpy/get/groupby.py +++ b/scanpy/get/groupby.py @@ -56,8 +56,8 @@ class GroupBy: Weight field in adata.obs of type float. key_set Subset of keys to which to filter. - groupby_df_key - One of 'obs' or 'var' on which to groupby + data_loc_key + One of 'obsm', 'layers', or 'varm' where data should be written """ _adata: AnnData @@ -66,31 +66,24 @@ class GroupBy: _weight: Optional[str] _key_set: AbstractSet[str] _key_index: Optional[np.ndarray] # caution, may be stale if attributes are updated - _groupby_df_key: Literal['obs', 'var'] - _fixed_df_key: str def __init__( self, adata: AnnData, key: str, + data: Union[np.ndarray, spmatrix], *, - data: Optional[Union[np.ndarray, spmatrix]] = None, weight: Optional[str] = None, key_set: Optional[Iterable[str]] = None, - groupby_df_key: Literal['obs', 'var'] = 'obs', + data_loc_key: Optional[Literal['obsm']] = None, ): self._adata = adata - self._data = adata.X if data is None else data - if groupby_df_key == 'var' and data is None: - self._data = ( - self._data.T - ) # transposing once here is easier than transposing this everywhere. first axis should be the groupby axis. 
+ self._data = data self._key = key self._weight = weight self._key_set = None if key_set is None else dict.fromkeys(key_set).keys() - self._groupby_df_key = groupby_df_key - self._fixed_df_key = 'obs' if groupby_df_key != 'obs' else 'var' self._key_index = None + self._data_loc_key = data_loc_key @cached_property def _superset_columns(self) -> List[str]: @@ -100,14 +93,10 @@ def _superset_columns(self) -> List[str]: List[str]: Superset columns. """ columns = [] - groupy_key_codes = getattr(self._adata, self._groupby_df_key)[self._key].astype( - 'category' - ) - for key in getattr(self._adata, self._groupby_df_key): + groupy_key_codes = self._adata.obs[self._key].astype('category') + for key in self._adata.obs: if key != self._key: - key_codes = getattr(self._adata, self._groupby_df_key)[key].astype( - 'category' - ) + key_codes = self._adata.obs[key].astype('category') if all( [ key_codes[groupy_key_codes == group_key_code].nunique() == 1 @@ -119,7 +108,7 @@ def _superset_columns(self) -> List[str]: @cached_property def _df_grouped(self) -> pd.DataFrame: - df = getattr(self._adata, self._groupby_df_key).copy() + df = self._adata.obs.copy() if self._key_set is not None: df = df[df[self._key].isin(self._key_set)] if df[self._key].dtype.name == 'category': @@ -128,18 +117,16 @@ def _df_grouped(self) -> pd.DataFrame: @cached_property def _base_axis_indices(self) -> pd.Index: - return pd.DataFrame( - index=pd.Index(getattr(self._adata, f'{self._fixed_df_key}_names').copy()) - ) + return pd.DataFrame(index=pd.Index(self._adata.var_names).copy()) @cached_property def _obs_var_dict(self) -> dict: return { - self._groupby_df_key: self._df_grouped, - self._fixed_df_key: self._base_axis_indices, + 'obs': self._df_grouped, + 'var': self._base_axis_indices, } - def count(self) -> pd.Series: + def count(self) -> AnnData: """ Count the number of observations in each group. @@ -147,13 +134,11 @@ def count(self) -> pd.Series: ------- Series of counts indexed by key. 
""" - keys, key_index, _, _ = self._extract_indices() + _, key_index, _, _ = self._extract_indices() count_ = np.bincount(key_index) - return pd.Series( - data=count_, - index=pd.Index(keys, name=self._key, tupleize_cols=False), - name="count", - ) + obs_var_dict = self._obs_var_dict + obs_var_dict['obs']['count'] = count_ + return AnnData(**obs_var_dict) def sum(self) -> AnnData: """ @@ -165,9 +150,11 @@ def sum(self) -> AnnData: """ A, _ = self._sparse_aggregator(normalize=False) X = utils.asarray(A * self._data) - return AnnData( - **self._obs_var_dict, X=X if self._groupby_df_key == 'obs' else X.T + data_loc_key = self._data_loc_key if self._data_loc_key is not None else 'X' + data_dict = ( + {data_loc_key: X} if data_loc_key == 'X' else {data_loc_key: {'sum': X}} ) + return AnnData(**{**self._obs_var_dict, **data_dict}) def mean(self) -> AnnData: """ @@ -179,9 +166,11 @@ def mean(self) -> AnnData: """ A, _ = self._sparse_aggregator(normalize=True) X = utils.asarray(A * self._data) - return AnnData( - **self._obs_var_dict, X=X if self._groupby_df_key == 'obs' else X.T + data_loc_key = self._data_loc_key if self._data_loc_key is not None else 'X' + data_dict = ( + {data_loc_key: X} if data_loc_key == 'X' else {data_loc_key: {'mean': X}} ) + return AnnData(**{**self._obs_var_dict, **data_dict}) def count_mean_var(self, dof: int = 1) -> AnnData: """ @@ -211,10 +200,11 @@ def count_mean_var(self, dof: int = 1) -> AnnData: sq_mean = mean_**2 else: A_unweighted, _ = GroupBy( - self._adata, - self._key, + adata=self._adata, + data=self._data, + key=self._key, key_set=self._key_set, - groupby_df_key=self._groupby_df_key, + data_loc_key=self._data_loc_key, )._sparse_aggregator() mean_unweighted = utils.asarray(A_unweighted * self._data) sq_mean = 2 * mean_ * mean_unweighted + mean_unweighted**2 @@ -225,13 +215,18 @@ def count_mean_var(self, dof: int = 1) -> AnnData: if dof != 0: var_ *= (count_ / (count_ - dof))[:, np.newaxis] obs_var_dict = self._obs_var_dict - obs_var_dict[self._groupby_df_key]['count'] = count_ + obs_var_dict['obs']['count'] = count_ + data_loc_key = ( + self._data_loc_key if self._data_loc_key is not None else 'layers' + ) return AnnData( - **obs_var_dict, - layers={ - 'mean': mean_ if self._groupby_df_key == 'obs' else mean_.T, - 'var': var_ if self._groupby_df_key == 'obs' else var_.T, - }, + **{ + **obs_var_dict, + data_loc_key: { + 'mean': mean_, + 'var': var_, + }, + } ) def _sparse_aggregator( @@ -291,15 +286,13 @@ def _filter_indices(key_set, keys, key_index, df_index, weight_value=None): weight_value = weight_value[mask] return keys, key_index, df_index, weight_value - key_value = getattr(self._adata, self._groupby_df_key)[self._key] + key_value = self._adata.obs[self._key] keys, key_index = np.unique(_ndarray_from_seq(key_value), return_inverse=True) df_index = np.arange(len(key_index)) if self._weight is None: weight_value = None else: - weight_value = getattr(self._adata, self._groupby_df_key)[ - self._weight - ].values[df_index] + weight_value = self._adata.obs[self._weight].values[df_index] if self._key_set is not None: keys, key_index, df_index, weight_value = _filter_indices( self._key_set, keys, key_index, df_index, weight_value @@ -321,3 +314,50 @@ def _ndarray_from_seq(lst: Sequence): else: arr = np.array(lst) return arr + + +def aggregated( + adata: AnnData, + by: str, + how: Literal['count', 'mean', 'sum', 'count_mean_var'], + df_key: Literal['obs', 'var'] = 'obs', + weight: Optional[str] = None, + key_set: Optional[Iterable[str]] = None, + dof: int = 1, 
+ layer=None, + obsm=None, + varm=None, +): + data = adata.X + data_loc_key = None + if varm is not None: + data = adata.varm[varm] + data_loc_key = 'obsm' + elif obsm is not None: + data = adata.obsm[obsm] + data_loc_key = 'obsm' + elif layer is not None: + data = adata.layers[layer] + if df_key == 'var': + data = data.T + elif df_key == 'var': + data = data.T + groupby = GroupBy( + adata=adata if df_key == 'obs' else adata.T, + data=data, + key=by, + weight=weight, + key_set=key_set, + data_loc_key=data_loc_key, + ) + if how == 'count': + data = groupby.count() + elif how == 'mean': + data = groupby.mean() + elif how == 'sum': + data = groupby.sum() + else: + data = groupby.count_mean_var(dof) + if df_key == 'var': + return data.T + return data diff --git a/scanpy/tests/test_groupby.py b/scanpy/tests/test_groupby.py index 5d3a739ba..8fc8343ab 100644 --- a/scanpy/tests/test_groupby.py +++ b/scanpy/tests/test_groupby.py @@ -7,11 +7,8 @@ @pytest.mark.parametrize( - 'use_layers', - [ - False, - True, - ], + 'data_key', + ['layers', 'obsm', 'varm'], ) @pytest.mark.parametrize( 'groupby_df_key', @@ -20,7 +17,11 @@ 'var', ], ) -def test_groupby(use_layers, groupby_df_key): +def test_groupby_different_data_locations(data_key, groupby_df_key): + if (data_key == 'varm' and groupby_df_key == 'obs') or ( + data_key == 'obsm' and groupby_df_key == 'var' + ): + pytest.skip("invalid parameter combination") ax_base = ["A", "B"] ax_groupby = [ "v0", @@ -66,37 +67,43 @@ def test_groupby(use_layers, groupby_df_key): ], dtype=np.float32, ) + data_dense = X if groupby_df_key == 'obs': + data_sparse_mat_dict = {data_key: {'test': csr_matrix(X)}} adata_sparse = ad.AnnData( - obs=df_groupby, var=df_base, X=csr_matrix(X), layers={'test': csr_matrix(X)} + **{'obs': df_groupby, 'var': df_base, **data_sparse_mat_dict} ) + data_dense_mat_dict = {data_key: {'test': X}} adata_dense = ad.AnnData( - obs=df_groupby, var=df_base, X=X, layers={'test': X.copy()} - ) # .copy needed? + **{'obs': df_groupby, 'var': df_base, **data_dense_mat_dict} + ) else: + if data_key != 'varm': + data_dense = X.T + data_sparse_mat_dict = {data_key: {'test': csr_matrix(data_dense)}} adata_sparse = ad.AnnData( - obs=df_base, - var=df_groupby, - X=csr_matrix(X.T), - layers={'test': csr_matrix(X.T)}, + **{'obs': df_base, 'var': df_groupby, **data_sparse_mat_dict} ) + data_dense_mat_dict = {data_key: {'test': data_dense}} adata_dense = ad.AnnData( - obs=df_base, var=df_groupby, X=X.T, layers={'test': X.T.copy()} - ) # .copy needed? 
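With `aggregated` in place, the class-based calls from the earlier patches collapse into a single functional entry point, and the test around this point is rewritten accordingly. As a quick orientation, a hedged usage sketch of `aggregated` as defined in this patch; the AnnData and its values are invented toy inputs, not taken from the test:

    import numpy as np
    import pandas as pd
    import anndata as ad
    import scanpy as sc

    adata = ad.AnnData(
        X=np.arange(12, dtype=np.float32).reshape(4, 3),
        obs=pd.DataFrame(
            {"key": pd.Categorical(list("aabb"))},
            index=[f"cell{i}" for i in range(4)],
        ),
    )

    # per-group means over obs; the result is an AnnData with one obs row per group
    means = sc.get.aggregated(adata, by="key", how="mean")

    # count, mean and var in one pass: mean/var land in .layers, counts in .obs['count']
    stats = sc.get.aggregated(adata, by="key", how="count_mean_var")

Passing `layer=`, `obsm=`, or `varm=` instead aggregates that element, and `df_key='var'` flips the grouped axis, as exercised by the parametrized tests that follow.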
- - data_sparse = adata_sparse.layers['test'] if use_layers else None - if groupby_df_key == 'var' and use_layers: - data_sparse = data_sparse.T - gb = sc.get.GroupBy( - adata_sparse, key="key", data=data_sparse, groupby_df_key=groupby_df_key - ) - stats_sparse = gb.count_mean_var() - data_dense = adata_dense.layers['test'] if use_layers else None - if groupby_df_key == 'var' and use_layers: - data_dense = data_dense.T - stats_dense = sc.get.GroupBy( - adata_dense, key="key", data=data_dense, groupby_df_key=groupby_df_key - ).count_mean_var() + **{'obs': df_base, 'var': df_groupby, **data_dense_mat_dict} + ) + + data_dict = {(data_key if data_key != 'layers' else 'layer'): 'test'} + stats_sparse = sc.get.aggregated( + adata=adata_sparse, + by="key", + df_key=groupby_df_key, + how='count_mean_var', + **data_dict, + ) + stats_dense = sc.get.aggregated( + adata=adata_dense, + by="key", + df_key=groupby_df_key, + how='count_mean_var', + **data_dict, + ) # superset columns can be kept but not subsets assert 'key_superset' in getattr(stats_sparse, groupby_df_key) @@ -106,20 +113,203 @@ def test_groupby(use_layers, groupby_df_key): getattr(stats_sparse, groupby_df_key)['count'], getattr(stats_sparse, groupby_df_key)['count'], ) - assert np.allclose(stats_sparse.layers['mean'], stats_dense.layers['mean']) assert np.allclose( - stats_sparse.layers['var'], stats_dense.layers['var'], equal_nan=True + getattr(stats_sparse, data_key)['mean'], getattr(stats_dense, data_key)['mean'] ) - gb_weight = sc.get.GroupBy( - adata_sparse, - key="key", - data=data_sparse, + assert np.allclose( + getattr(stats_sparse, data_key)['var'], + getattr(stats_dense, data_key)['var'], + equal_nan=True, + ) + + stats_weight = sc.get.aggregated( + adata=adata_dense, + by="key", + df_key=groupby_df_key, + how='count_mean_var', + weight="weight", + **data_dict, + ) + sum_ = sc.get.aggregated( + adata=adata_sparse, by="key", df_key=groupby_df_key, how='sum', **data_dict + ) + sum_weight = sc.get.aggregated( + adata=adata_dense, + by="key", + df_key=groupby_df_key, + how='sum', + weight="weight", + **data_dict, + ) + + def get_single_agg(adata, key, agg): + if key == 'obsm' or key == 'varm': + return getattr(adata, key)[agg] + return adata.X + + assert np.allclose( + 2 * get_single_agg(sum_, data_key, 'sum'), + get_single_agg(sum_weight, data_key, 'sum'), + ) + assert np.allclose( + getattr(stats_sparse, data_key)['mean'], getattr(stats_weight, data_key)['mean'] + ) + assert np.allclose( + getattr(stats_sparse, data_key)['var'], + getattr(stats_dense, data_key)['var'], + equal_nan=True, + ) + + key_set = ["v", "w"] + mean_key_set_adata = sc.get.aggregated( + adata=adata_dense, + by="key", + df_key=groupby_df_key, + how='mean', + key_set=key_set, + **data_dict, + ) + subset_idx = getattr(stats_sparse, groupby_df_key).index.isin(key_set) + subset_adata = ( + stats_sparse[subset_idx, :] + if groupby_df_key == 'obs' + else stats_sparse[:, subset_idx] + ) + subset_mean = getattr(subset_adata, data_key)['mean'] + key_set_mean = get_single_agg(mean_key_set_adata, data_key, 'mean') + assert np.allclose(subset_mean, key_set_mean) + + df = pd.DataFrame( + index=getattr(adata_dense, groupby_df_key)["key"], + columns=getattr( + adata_dense, f"{'var' if groupby_df_key == 'obs' else 'obs'}_names" + ), + data=data_dense.T + if groupby_df_key == 'var' and data_key != 'varm' + else data_dense, + ) + grouped_agg_df = ( + df.groupby('key') + .agg(["count", "mean", "var"]) + .swaplevel(axis=1) + .sort_index(axis=1) + ) + mean = getattr(stats_dense, 
data_key)['mean'] + if groupby_df_key == 'var' and data_key != 'varm': + mean = mean.T + assert np.allclose(mean, grouped_agg_df['mean'].values) + var = getattr(stats_dense, data_key)['var'] + if groupby_df_key == 'var' and data_key != 'varm': + var = var.T + assert np.allclose(var, grouped_agg_df['var'].values, equal_nan=True) + assert np.allclose( + getattr(stats_dense, groupby_df_key)['count'], + grouped_agg_df['count']['A'].values, + ) # returns for both columns but counts only needs one because it is the same + + +@pytest.mark.parametrize( + 'groupby_df_key', + [ + 'obs', + 'var', + ], +) +def test_groupby_X(groupby_df_key): + ax_base = ["A", "B"] + ax_groupby = [ + "v0", + "v1", + "v2", + "w0", + "w1", + "a1", + "a2", + "a3", + "b1", + "b2", + "c1", + "c2", + "d0", + ] + + df_groupby = pd.DataFrame(index=pd.Index(ax_groupby, name="cell")) + df_groupby["key"] = pd.Categorical([c[0] for c in ax_groupby]) + df_groupby["key_superset"] = pd.Categorical([c[0] for c in ax_groupby]).map( + {'v': 'v', 'w': 'v', 'a': 'a', 'b': 'a', 'c': 'a', 'd': 'a'} + ) + df_groupby["key_subset"] = pd.Categorical([c[1] for c in ax_groupby]) + df_groupby["weight"] = 2.0 + + df_base = pd.DataFrame(index=ax_base) + + X = np.array( + [ + [0, -2], + [1, 13], + [2, 1], # v + [3, 12], + [4, 2], # w + [5, 11], + [6, 3], + [7, 10], # a + [8, 4], + [9, 9], # b + [10, 5], + [11, 8], # c + [12, 6], # d + ], + dtype=np.float32, + ) + data_dense = X + if groupby_df_key == 'obs': + adata_sparse = ad.AnnData(obs=df_groupby, var=df_base, X=csr_matrix(X)) + adata_dense = ad.AnnData(obs=df_groupby, var=df_base, X=X) + else: + adata_sparse = ad.AnnData(obs=df_base, var=df_groupby, X=data_dense.T) + adata_dense = ad.AnnData(obs=df_base, var=df_groupby, X=csr_matrix(X).T) + + stats_sparse = sc.get.aggregated( + adata=adata_sparse, + by="key", + df_key=groupby_df_key, + how='count_mean_var', + ) + stats_dense = sc.get.aggregated( + adata=adata_dense, + by="key", + df_key=groupby_df_key, + how='count_mean_var', + ) + + # superset columns can be kept but not subsets + assert 'key_superset' in getattr(stats_sparse, groupby_df_key) + assert 'key_subset' not in getattr(stats_sparse, groupby_df_key) + + assert np.allclose( + getattr(stats_sparse, groupby_df_key)['count'], + getattr(stats_sparse, groupby_df_key)['count'], + ) + assert np.allclose(stats_sparse.layers['mean'], stats_dense.layers['mean']) + assert np.allclose(stats_sparse.layers['var'], stats_dense.layers['var'], equal_nan=True) + + stats_weight = sc.get.aggregated( + adata=adata_dense, + by="key", + df_key=groupby_df_key, + how='count_mean_var', + weight="weight", + ) + sum_ = sc.get.aggregated( + adata=adata_sparse, by="key", df_key=groupby_df_key, how='sum' + ) + sum_weight = sc.get.aggregated( + adata=adata_dense, + by="key", + df_key=groupby_df_key, + how='sum', weight="weight", - groupby_df_key=groupby_df_key, ) - stats_weight = gb_weight.count_mean_var() - sum_ = gb.sum() - sum_weight = gb_weight.sum() assert np.allclose(2 * sum_.X, sum_weight.X) assert np.allclose(stats_sparse.layers['mean'], stats_weight.layers['mean']) @@ -128,27 +318,29 @@ def test_groupby(use_layers, groupby_df_key): ) key_set = ["v", "w"] - mean_key_set = sc.get.GroupBy( - adata_sparse, - key="key", - data=data_sparse, + mean_key_set_adata = sc.get.aggregated( + adata=adata_dense, + by="key", + df_key=groupby_df_key, + how='mean', key_set=key_set, - groupby_df_key=groupby_df_key, - ).mean() + ) subset_idx = getattr(stats_sparse, groupby_df_key).index.isin(key_set) - subset = ( + subset_adata = 
( stats_sparse[subset_idx, :] if groupby_df_key == 'obs' else stats_sparse[:, subset_idx] ) - assert np.allclose(subset.layers['mean'], mean_key_set.X) + subset_mean = subset_adata.layers['mean'] + key_set_mean = mean_key_set_adata.X + assert np.allclose(subset_mean, key_set_mean) df = pd.DataFrame( index=getattr(adata_dense, groupby_df_key)["key"], columns=getattr( adata_dense, f"{'var' if groupby_df_key == 'obs' else 'obs'}_names" ), - data=adata_dense.X if groupby_df_key == 'obs' else adata_dense.X.T, + data=data_dense ) grouped_agg_df = ( df.groupby('key') From 0060e0e00aaccdec6829f20e6793c70be428ea5e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Aug 2023 12:44:49 +0200 Subject: [PATCH 13/89] (style): use `bool` for writing to `obsm` --- scanpy/get/groupby.py | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/scanpy/get/groupby.py b/scanpy/get/groupby.py index 85d63e7e7..d69d9d7fa 100644 --- a/scanpy/get/groupby.py +++ b/scanpy/get/groupby.py @@ -19,14 +19,14 @@ class GroupBy: """ - Functionality for grouping and aggregating AnnData observations by key, per variable. + Functionality for grouping and aggregating AnnData observations by key on `obs` + and if you want to do `var`, transpose the AnnData object first. There is currently support for count, sum, mean, and varience per group, and for scores derived from these per pair of groups. Set `weight` for weighted sum, mean, and variance. - Set `key_set` to a list of keys to most efficiently compute results for a subset of groups. **Implementation** @@ -56,8 +56,8 @@ class GroupBy: Weight field in adata.obs of type float. key_set Subset of keys to which to filter. - data_loc_key - One of 'obsm', 'layers', or 'varm' where data should be written + write_to_obsm + Whether to write to `obsm` or not """ _adata: AnnData @@ -66,6 +66,7 @@ class GroupBy: _weight: Optional[str] _key_set: AbstractSet[str] _key_index: Optional[np.ndarray] # caution, may be stale if attributes are updated + _write_to_obsm: bool def __init__( self, @@ -75,7 +76,7 @@ def __init__( *, weight: Optional[str] = None, key_set: Optional[Iterable[str]] = None, - data_loc_key: Optional[Literal['obsm']] = None, + write_to_obsm: bool = False, ): self._adata = adata self._data = data @@ -83,7 +84,7 @@ def __init__( self._weight = weight self._key_set = None if key_set is None else dict.fromkeys(key_set).keys() self._key_index = None - self._data_loc_key = data_loc_key + self._write_to_obsm = write_to_obsm @cached_property def _superset_columns(self) -> List[str]: @@ -150,9 +151,8 @@ def sum(self) -> AnnData: """ A, _ = self._sparse_aggregator(normalize=False) X = utils.asarray(A * self._data) - data_loc_key = self._data_loc_key if self._data_loc_key is not None else 'X' data_dict = ( - {data_loc_key: X} if data_loc_key == 'X' else {data_loc_key: {'sum': X}} + {'obsm': {'sum': X}} if self._write_to_obsm else { 'X': X } ) return AnnData(**{**self._obs_var_dict, **data_dict}) @@ -166,9 +166,8 @@ def mean(self) -> AnnData: """ A, _ = self._sparse_aggregator(normalize=True) X = utils.asarray(A * self._data) - data_loc_key = self._data_loc_key if self._data_loc_key is not None else 'X' data_dict = ( - {data_loc_key: X} if data_loc_key == 'X' else {data_loc_key: {'mean': X}} + {'obsm': {'mean': X}} if self._write_to_obsm else { 'X': X } ) return AnnData(**{**self._obs_var_dict, **data_dict}) @@ -204,7 +203,7 @@ def count_mean_var(self, dof: int = 1) -> AnnData: data=self._data, key=self._key, key_set=self._key_set, - 
data_loc_key=self._data_loc_key, + write_to_obsm=self._write_to_obsm, )._sparse_aggregator() mean_unweighted = utils.asarray(A_unweighted * self._data) sq_mean = 2 * mean_ * mean_unweighted + mean_unweighted**2 @@ -216,13 +215,11 @@ def count_mean_var(self, dof: int = 1) -> AnnData: var_ *= (count_ / (count_ - dof))[:, np.newaxis] obs_var_dict = self._obs_var_dict obs_var_dict['obs']['count'] = count_ - data_loc_key = ( - self._data_loc_key if self._data_loc_key is not None else 'layers' - ) + write_to_obsm = 'obsm' if self._write_to_obsm else 'layers' return AnnData( **{ **obs_var_dict, - data_loc_key: { + write_to_obsm: { 'mean': mean_, 'var': var_, }, @@ -329,13 +326,13 @@ def aggregated( varm=None, ): data = adata.X - data_loc_key = None + write_to_obsm = None if varm is not None: data = adata.varm[varm] - data_loc_key = 'obsm' + write_to_obsm = True # the data will have to be transposed so this is accurate elif obsm is not None: data = adata.obsm[obsm] - data_loc_key = 'obsm' + write_to_obsm = True elif layer is not None: data = adata.layers[layer] if df_key == 'var': @@ -348,7 +345,7 @@ def aggregated( key=by, weight=weight, key_set=key_set, - data_loc_key=data_loc_key, + write_to_obsm=write_to_obsm, ) if how == 'count': data = groupby.count() From 5a56c6a89002424b14278df72f0a6fd990e962ab Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Aug 2023 15:04:49 +0200 Subject: [PATCH 14/89] (refactor): `AnnData` object separate from groupby --- scanpy/get/groupby.py | 189 ++++++++++++++++++----------------- scanpy/tests/test_groupby.py | 50 ++++----- 2 files changed, 121 insertions(+), 118 deletions(-) diff --git a/scanpy/get/groupby.py b/scanpy/get/groupby.py index d69d9d7fa..cd1f53a24 100644 --- a/scanpy/get/groupby.py +++ b/scanpy/get/groupby.py @@ -1,4 +1,4 @@ -from functools import cached_property +from functools import singledispatch from typing import ( Optional, Iterable, @@ -16,6 +16,8 @@ import collections.abc as cabc from scipy.sparse import coo_matrix, dia_matrix, spmatrix +Array = Union[np.ndarray, spmatrix] + class GroupBy: """ @@ -62,70 +64,26 @@ class GroupBy: _adata: AnnData _key: str - _data: Union[np.ndarray, spmatrix] + _data: Array _weight: Optional[str] _key_set: AbstractSet[str] _key_index: Optional[np.ndarray] # caution, may be stale if attributes are updated - _write_to_obsm: bool def __init__( self, - adata: AnnData, + df: pd.DataFrame, key: str, - data: Union[np.ndarray, spmatrix], + data: Array, *, weight: Optional[str] = None, key_set: Optional[Iterable[str]] = None, - write_to_obsm: bool = False, ): - self._adata = adata + self._df = df self._data = data self._key = key self._weight = weight self._key_set = None if key_set is None else dict.fromkeys(key_set).keys() self._key_index = None - self._write_to_obsm = write_to_obsm - - @cached_property - def _superset_columns(self) -> List[str]: - """Find all columns which are a superset of the key column. - - Returns: - List[str]: Superset columns. 
- """ - columns = [] - groupy_key_codes = self._adata.obs[self._key].astype('category') - for key in self._adata.obs: - if key != self._key: - key_codes = self._adata.obs[key].astype('category') - if all( - [ - key_codes[groupy_key_codes == group_key_code].nunique() == 1 - for group_key_code in groupy_key_codes - ] - ): - columns += [key] - return columns - - @cached_property - def _df_grouped(self) -> pd.DataFrame: - df = self._adata.obs.copy() - if self._key_set is not None: - df = df[df[self._key].isin(self._key_set)] - if df[self._key].dtype.name == 'category': - df[self._key] = df[self._key].cat.remove_unused_categories() - return df.groupby(self._key).first()[self._superset_columns] - - @cached_property - def _base_axis_indices(self) -> pd.Index: - return pd.DataFrame(index=pd.Index(self._adata.var_names).copy()) - - @cached_property - def _obs_var_dict(self) -> dict: - return { - 'obs': self._df_grouped, - 'var': self._base_axis_indices, - } def count(self) -> AnnData: """ @@ -137,11 +95,9 @@ def count(self) -> AnnData: """ _, key_index, _, _ = self._extract_indices() count_ = np.bincount(key_index) - obs_var_dict = self._obs_var_dict - obs_var_dict['obs']['count'] = count_ - return AnnData(**obs_var_dict) + return count_ - def sum(self) -> AnnData: + def sum(self) -> Array: """ Compute the sum per feature per group of observations. @@ -150,13 +106,9 @@ def sum(self) -> AnnData: AnnData with sum in X indexed on obs by key with var from adata. """ A, _ = self._sparse_aggregator(normalize=False) - X = utils.asarray(A * self._data) - data_dict = ( - {'obsm': {'sum': X}} if self._write_to_obsm else { 'X': X } - ) - return AnnData(**{**self._obs_var_dict, **data_dict}) + return utils.asarray(A * self._data) - def mean(self) -> AnnData: + def mean(self) -> Array: """ Compute the mean per feature per group of observations. @@ -165,13 +117,9 @@ def mean(self) -> AnnData: AnnData with means in X indexed on obs by key with var from adata. """ A, _ = self._sparse_aggregator(normalize=True) - X = utils.asarray(A * self._data) - data_dict = ( - {'obsm': {'mean': X}} if self._write_to_obsm else { 'X': X } - ) - return AnnData(**{**self._obs_var_dict, **data_dict}) + return utils.asarray(A * self._data) - def count_mean_var(self, dof: int = 1) -> AnnData: + def count_mean_var(self, dof: int = 1) -> dict: """ Compute the count, as well as mean and variance per feature, per group of observations. 
@@ -199,11 +147,10 @@ def count_mean_var(self, dof: int = 1) -> AnnData: sq_mean = mean_**2 else: A_unweighted, _ = GroupBy( - adata=self._adata, + df=self._df, data=self._data, key=self._key, key_set=self._key_set, - write_to_obsm=self._write_to_obsm, )._sparse_aggregator() mean_unweighted = utils.asarray(A_unweighted * self._data) sq_mean = 2 * mean_ * mean_unweighted + mean_unweighted**2 @@ -213,18 +160,7 @@ def count_mean_var(self, dof: int = 1) -> AnnData: var_[precision * var_ < sq_mean] = 0 if dof != 0: var_ *= (count_ / (count_ - dof))[:, np.newaxis] - obs_var_dict = self._obs_var_dict - obs_var_dict['obs']['count'] = count_ - write_to_obsm = 'obsm' if self._write_to_obsm else 'layers' - return AnnData( - **{ - **obs_var_dict, - write_to_obsm: { - 'mean': mean_, - 'var': var_, - }, - } - ) + return {'mean': mean_, 'var': var_, 'count': count_} def _sparse_aggregator( self, normalize: bool = False @@ -283,13 +219,13 @@ def _filter_indices(key_set, keys, key_index, df_index, weight_value=None): weight_value = weight_value[mask] return keys, key_index, df_index, weight_value - key_value = self._adata.obs[self._key] + key_value = self._df[self._key] keys, key_index = np.unique(_ndarray_from_seq(key_value), return_inverse=True) df_index = np.arange(len(key_index)) if self._weight is None: weight_value = None else: - weight_value = self._adata.obs[self._weight].values[df_index] + weight_value = self._df[self._weight].values[df_index] if self._key_set is not None: keys, key_index, df_index, weight_value = _filter_indices( self._key_set, keys, key_index, df_index, weight_value @@ -313,11 +249,42 @@ def _ndarray_from_seq(lst: Sequence): return arr +def _superset_columns(df: pd.DataFrame, groupby_key: str) -> List[str]: + """Find all columns which are a superset of the key column. + + Returns: + List[str]: Superset columns. 
+ """ + columns = [] + groupy_key_codes = df[groupby_key].astype('category') + for key in df: + if key != groupby_key: + key_codes = df[key].astype('category') + if all( + [ + key_codes[groupy_key_codes == group_key_code].nunique() == 1 + for group_key_code in groupy_key_codes + ] + ): + columns += [key] + return columns + + +def _df_grouped(df: pd.DataFrame, key: str, key_set: List[str]) -> pd.DataFrame: + df = df.copy() + if key_set is not None: + df = df[df[key].isin(key_set)] + if df[key].dtype.name == 'category': + df[key] = df[key].cat.remove_unused_categories() + return df.groupby(key).first()[_superset_columns(df, key)] + + +@singledispatch def aggregated( adata: AnnData, by: str, - how: Literal['count', 'mean', 'sum', 'count_mean_var'], - df_key: Literal['obs', 'var'] = 'obs', + how: Literal['count', 'mean', 'sum', 'count_mean_var'] = 'count_mean_var', + groupby_df_key: Literal['obs', 'var'] = 'obs', weight: Optional[str] = None, key_set: Optional[Iterable[str]] = None, dof: int = 1, @@ -335,26 +302,60 @@ def aggregated( write_to_obsm = True elif layer is not None: data = adata.layers[layer] - if df_key == 'var': + if groupby_df_key == 'var': data = data.T - elif df_key == 'var': + elif groupby_df_key == 'var': data = data.T + return aggregated_from_array( + data, + groupby_df=getattr(adata, groupby_df_key), + groupby_df_key=groupby_df_key, + no_groupby_df=getattr(adata, 'var' if groupby_df_key == 'obs' else 'obs'), + by=by, + weight=weight, + key_set=key_set, + how=how, + dof=dof, + write_to_obsm=write_to_obsm, + ) + + +@aggregated.register(Array) +def aggregated_from_array( + data, + groupby_df: pd.DataFrame, + groupby_df_key: str, + no_groupby_df: pd.DataFrame, + by: str, + write_to_obsm: bool, + weight: Optional[str] = None, + key_set: Optional[Iterable[str]] = None, + how: Literal['count', 'mean', 'sum', 'count_mean_var'] = 'count_mean_var', + dof: int = 1, +): groupby = GroupBy( - adata=adata if df_key == 'obs' else adata.T, + df=groupby_df, data=data, key=by, weight=weight, key_set=key_set, - write_to_obsm=write_to_obsm, ) + obs_var_dict = {'obs': _df_grouped(groupby_df, by, key_set), 'var': no_groupby_df} + data_dict = {} if how == 'count': - data = groupby.count() + obs_var_dict['obs']['count'] = groupby.count() elif how == 'mean': - data = groupby.mean() + agg = groupby.mean() + data_dict = {'obsm': {'mean': agg}} if write_to_obsm else {'X': agg} elif how == 'sum': - data = groupby.sum() + agg = groupby.sum() + data_dict = {'obsm': {'sum': agg}} if write_to_obsm else {'X': agg} else: - data = groupby.count_mean_var(dof) - if df_key == 'var': - return data.T - return data + agg = groupby.count_mean_var(dof) + write_to_obsm = 'obsm' if write_to_obsm else 'layers' + obs_var_dict['obs']['count'] = agg['count'] + data_dict = {write_to_obsm: {'mean': agg['mean'], 'var': agg['var']}} + adata_agg = AnnData(**{**data_dict, **obs_var_dict}) + if groupby_df_key == 'var': + return adata_agg.T + return adata_agg diff --git a/scanpy/tests/test_groupby.py b/scanpy/tests/test_groupby.py index 8fc8343ab..11b95aa9d 100644 --- a/scanpy/tests/test_groupby.py +++ b/scanpy/tests/test_groupby.py @@ -91,16 +91,16 @@ def test_groupby_different_data_locations(data_key, groupby_df_key): data_dict = {(data_key if data_key != 'layers' else 'layer'): 'test'} stats_sparse = sc.get.aggregated( - adata=adata_sparse, + adata_sparse, by="key", - df_key=groupby_df_key, + groupby_df_key=groupby_df_key, how='count_mean_var', **data_dict, ) stats_dense = sc.get.aggregated( - adata=adata_dense, + adata_dense, 
by="key", - df_key=groupby_df_key, + groupby_df_key=groupby_df_key, how='count_mean_var', **data_dict, ) @@ -123,20 +123,20 @@ def test_groupby_different_data_locations(data_key, groupby_df_key): ) stats_weight = sc.get.aggregated( - adata=adata_dense, + adata_dense, by="key", - df_key=groupby_df_key, + groupby_df_key=groupby_df_key, how='count_mean_var', weight="weight", **data_dict, ) sum_ = sc.get.aggregated( - adata=adata_sparse, by="key", df_key=groupby_df_key, how='sum', **data_dict + adata_sparse, by="key", groupby_df_key=groupby_df_key, how='sum', **data_dict ) sum_weight = sc.get.aggregated( - adata=adata_dense, + adata_dense, by="key", - df_key=groupby_df_key, + groupby_df_key=groupby_df_key, how='sum', weight="weight", **data_dict, @@ -162,9 +162,9 @@ def get_single_agg(adata, key, agg): key_set = ["v", "w"] mean_key_set_adata = sc.get.aggregated( - adata=adata_dense, + adata_dense, by="key", - df_key=groupby_df_key, + groupby_df_key=groupby_df_key, how='mean', key_set=key_set, **data_dict, @@ -270,15 +270,15 @@ def test_groupby_X(groupby_df_key): adata_dense = ad.AnnData(obs=df_base, var=df_groupby, X=csr_matrix(X).T) stats_sparse = sc.get.aggregated( - adata=adata_sparse, + adata_sparse, by="key", - df_key=groupby_df_key, + groupby_df_key=groupby_df_key, how='count_mean_var', ) stats_dense = sc.get.aggregated( - adata=adata_dense, + adata_dense, by="key", - df_key=groupby_df_key, + groupby_df_key=groupby_df_key, how='count_mean_var', ) @@ -291,22 +291,24 @@ def test_groupby_X(groupby_df_key): getattr(stats_sparse, groupby_df_key)['count'], ) assert np.allclose(stats_sparse.layers['mean'], stats_dense.layers['mean']) - assert np.allclose(stats_sparse.layers['var'], stats_dense.layers['var'], equal_nan=True) + assert np.allclose( + stats_sparse.layers['var'], stats_dense.layers['var'], equal_nan=True + ) stats_weight = sc.get.aggregated( - adata=adata_dense, + adata_dense, by="key", - df_key=groupby_df_key, + groupby_df_key=groupby_df_key, how='count_mean_var', weight="weight", ) sum_ = sc.get.aggregated( - adata=adata_sparse, by="key", df_key=groupby_df_key, how='sum' + adata_sparse, by="key", groupby_df_key=groupby_df_key, how='sum' ) sum_weight = sc.get.aggregated( - adata=adata_dense, + adata_dense, by="key", - df_key=groupby_df_key, + groupby_df_key=groupby_df_key, how='sum', weight="weight", ) @@ -319,9 +321,9 @@ def test_groupby_X(groupby_df_key): key_set = ["v", "w"] mean_key_set_adata = sc.get.aggregated( - adata=adata_dense, + adata_dense, by="key", - df_key=groupby_df_key, + groupby_df_key=groupby_df_key, how='mean', key_set=key_set, ) @@ -340,7 +342,7 @@ def test_groupby_X(groupby_df_key): columns=getattr( adata_dense, f"{'var' if groupby_df_key == 'obs' else 'obs'}_names" ), - data=data_dense + data=data_dense, ) grouped_agg_df = ( df.groupby('key') From 61b934583407e4f0e81830e94e9d6c37c31e0138 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Aug 2023 15:05:07 +0200 Subject: [PATCH 15/89] (chore): export `aggregated_from_array` --- scanpy/get/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/get/__init__.py b/scanpy/get/__init__.py index 7a4398879..1789a3dad 100644 --- a/scanpy/get/__init__.py +++ b/scanpy/get/__init__.py @@ -4,4 +4,4 @@ # Private from .get import _get_obs_rep, _set_obs_rep -from .groupby import aggregated +from .groupby import aggregated, aggregated_from_array From 6408daf2d9845d684ffbe56cb202659e7cc08fb7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Aug 2023 15:37:03 +0200 Subject: [PATCH 16/89] 
(refactor): remove `GroupBy` dependence of `df` --- scanpy/get/groupby.py | 145 +++++++++++++++++++++++------------ scanpy/tests/test_groupby.py | 8 +- 2 files changed, 99 insertions(+), 54 deletions(-) diff --git a/scanpy/get/groupby.py b/scanpy/get/groupby.py index cd1f53a24..d4def7b5d 100644 --- a/scanpy/get/groupby.py +++ b/scanpy/get/groupby.py @@ -49,49 +49,42 @@ class GroupBy: Params ------ - adata - key - Group key field in adata.obs. - data - Element of the AnnData to aggregate (default None yields adata.X). Should have the same dimensions as the AnnData object. - weight - Weight field in adata.obs of type float. + _groupby + `Series` containing values for grouping by. + _data + Data matrix for aggregation. + _weight + Weights to be used for aggergation. key_set Subset of keys to which to filter. - write_to_obsm - Whether to write to `obsm` or not """ - _adata: AnnData - _key: str + _groupby: pd.Series _data: Array - _weight: Optional[str] + _weight: Union[pd.Series, Array] _key_set: AbstractSet[str] _key_index: Optional[np.ndarray] # caution, may be stale if attributes are updated def __init__( self, - df: pd.DataFrame, - key: str, + groupby: pd.Series, data: Array, - *, - weight: Optional[str] = None, + weight: Union[pd.Series, Array] = None, key_set: Optional[Iterable[str]] = None, ): - self._df = df + self._groupby = groupby self._data = data - self._key = key self._weight = weight self._key_set = None if key_set is None else dict.fromkeys(key_set).keys() self._key_index = None - def count(self) -> AnnData: + def count(self) -> np.ndarray: """ Count the number of observations in each group. Returns ------- - Series of counts indexed by key. + Array of counts. """ _, key_index, _, _ = self._extract_indices() count_ = np.bincount(key_index) @@ -103,7 +96,7 @@ def sum(self) -> Array: Returns ------- - AnnData with sum in X indexed on obs by key with var from adata. + Array of sum. """ A, _ = self._sparse_aggregator(normalize=False) return utils.asarray(A * self._data) @@ -114,7 +107,7 @@ def mean(self) -> Array: Returns ------- - AnnData with means in X indexed on obs by key with var from adata. + Array of mean. """ A, _ = self._sparse_aggregator(normalize=True) return utils.asarray(A * self._data) @@ -136,7 +129,7 @@ def count_mean_var(self, dof: int = 1) -> dict: Returns ------- - AnnData with mean and var in layers indexed on obs by key with var from adata. Counts are in obs under counts. + dict with mean, count, and var keys. 
""" assert dof >= 0 A, _ = self._sparse_aggregator(normalize=True) @@ -147,9 +140,9 @@ def count_mean_var(self, dof: int = 1) -> dict: sq_mean = mean_**2 else: A_unweighted, _ = GroupBy( - df=self._df, + groupby=self._groupby, data=self._data, - key=self._key, + weight=self._weight, key_set=self._key_set, )._sparse_aggregator() mean_unweighted = utils.asarray(A_unweighted * self._data) @@ -219,13 +212,13 @@ def _filter_indices(key_set, keys, key_index, df_index, weight_value=None): weight_value = weight_value[mask] return keys, key_index, df_index, weight_value - key_value = self._df[self._key] + key_value = self._groupby keys, key_index = np.unique(_ndarray_from_seq(key_value), return_inverse=True) df_index = np.arange(len(key_index)) if self._weight is None: weight_value = None else: - weight_value = self._df[self._weight].values[df_index] + weight_value = self._weight.values[df_index] if self._key_set is not None: keys, key_index, df_index, weight_value = _filter_indices( self._key_set, keys, key_index, df_index, weight_value @@ -252,6 +245,10 @@ def _ndarray_from_seq(lst: Sequence): def _superset_columns(df: pd.DataFrame, groupby_key: str) -> List[str]: """Find all columns which are a superset of the key column. + Args: + df (pd.DataFrame): DataFrame which contains candidate columns. + groupby_key (str): Key for column of which to find superset of columns. + Returns: List[str]: Superset columns. """ @@ -271,6 +268,16 @@ def _superset_columns(df: pd.DataFrame, groupby_key: str) -> List[str]: def _df_grouped(df: pd.DataFrame, key: str, key_set: List[str]) -> pd.DataFrame: + """Generate a grouped-by dataframe (no aggregation) by a key with columns that are supersets of the key column + + Args: + df (pd.DataFrame): DataFrame to be grouped. + key (str): Column to be grouped on. + key_set (List[str]): values in the `key` column to keep before groupby. + + Returns: + pd.DataFrame: Grouped-by Dataframe. + """ df = df.copy() if key_set is not None: df = df[df[key].isin(key_set)] @@ -285,38 +292,57 @@ def aggregated( by: str, how: Literal['count', 'mean', 'sum', 'count_mean_var'] = 'count_mean_var', groupby_df_key: Literal['obs', 'var'] = 'obs', - weight: Optional[str] = None, + weight_key: Optional[str] = None, key_set: Optional[Iterable[str]] = None, dof: int = 1, layer=None, obsm=None, varm=None, -): +) -> AnnData: + """Aggregate data based on one of the columns of one of the axes (`obs` or `var`). If none of `layer`, `obsm`, or `varm` are passed in, `X` will be used for aggregation data. + + Args: + adata (AnnData): AnnData to be aggregated. + by (str): Key of the column to be grouped-by. + how (Literal['count', 'mean', 'sum', 'count_mean_var'], optional): _description_. Defaults to 'count_mean_var'. + groupby_df_key (Literal['obs', 'var'], optional): _description_. Defaults to 'obs'. + weight_key (Optional[str], optional): _description_. Defaults to None. Key of the `groupby_df_key` containing weights for a weighted sum aggregation. + key_set (Optional[Iterable[str]], optional): _description_. Defaults to None. Subset of groupby_df_key on which to filter. + dof (int, optional): _description_. Defaults to 1. Degrees of freedom for variance. + layer (_type_, optional): _description_. Defaults to None. If not None, key for aggregation data. + obsm (_type_, optional): _description_. Defaults to None. If not None, key for aggregation data. + varm (_type_, optional): _description_. Defaults to None. If not None, key for aggregation data. 
+ + Returns: + _type_: _description_ + """ data = adata.X - write_to_obsm = None + write_to_xxxm = None if varm is not None: data = adata.varm[varm] - write_to_obsm = True # the data will have to be transposed so this is accurate + write_to_xxxm = True # the data will have to be transposed so this is accurate elif obsm is not None: data = adata.obsm[obsm] - write_to_obsm = True + write_to_xxxm = True elif layer is not None: data = adata.layers[layer] if groupby_df_key == 'var': data = data.T - elif groupby_df_key == 'var': + elif ( + groupby_df_key == 'var' + ): # i.e., all of `varm`, `obsm`, `layers` are None so we use `X` which must be transposed data = data.T - return aggregated_from_array( + return aggregated( data, groupby_df=getattr(adata, groupby_df_key), groupby_df_key=groupby_df_key, - no_groupby_df=getattr(adata, 'var' if groupby_df_key == 'obs' else 'obs'), by=by, - weight=weight, + write_to_xxxm=write_to_xxxm, + no_groupby_df=getattr(adata, 'var' if groupby_df_key == 'obs' else 'obs'), + weight_key=weight_key, key_set=key_set, how=how, dof=dof, - write_to_obsm=write_to_obsm, ) @@ -325,36 +351,55 @@ def aggregated_from_array( data, groupby_df: pd.DataFrame, groupby_df_key: str, - no_groupby_df: pd.DataFrame, by: str, - write_to_obsm: bool, - weight: Optional[str] = None, + write_to_xxxm: bool, + no_groupby_df: pd.DataFrame, + weight_key: Optional[str] = None, key_set: Optional[Iterable[str]] = None, how: Literal['count', 'mean', 'sum', 'count_mean_var'] = 'count_mean_var', dof: int = 1, -): +) -> AnnData: + """Aggregate data based on one of the columns of one of a DataFrame. + + Args: + data (Array): Data for aggregation. + groupby_df (pd.DataFrame): DataFrame with column to be grouped on. + groupby_df_key (str): Key of AnnData corresponding to the axis on which the grouped by data belongs. + by (str): Key of the groupby DataFrame for grouping. + write_to_xxxm (bool): Whether or not to write aggregation data to `varm` or `obsm` (based on `groupby_df_key`) + no_groupby_df (pd.DataFrame): DataFrame on the opposite axis of groupby_df_key. + weight_key (Optional[str], optional): _description_. Defaults to None. Key of the `groupby_df_key` containing weights for a weighted sum aggregation. + key_set (Optional[Iterable[str]], optional): _description_. Defaults to None. Subset of groupby_df_key on which to filter. + how (Literal['count', 'mean', 'sum', 'count_mean_var'], optional): _description_. Defaults to 'count_mean_var'. + dof (int, optional): _description_. Defaults to 1. Degrees of freedom for variance. 
+ + Returns: + AnnData: _description_ + """ groupby = GroupBy( - df=groupby_df, + groupby=groupby_df[by], data=data, - key=by, - weight=weight, + weight=groupby_df[weight_key] if weight_key is not None else None, key_set=key_set, ) + # groupby df is put in `obs`, nongroupby in `var` to be transposed later as appropriate obs_var_dict = {'obs': _df_grouped(groupby_df, by, key_set), 'var': no_groupby_df} data_dict = {} if how == 'count': - obs_var_dict['obs']['count'] = groupby.count() + obs_var_dict['obs']['count'] = groupby.count() # count goes in df elif how == 'mean': agg = groupby.mean() - data_dict = {'obsm': {'mean': agg}} if write_to_obsm else {'X': agg} + data_dict = {'obsm': {'mean': agg}} if write_to_xxxm else {'X': agg} elif how == 'sum': agg = groupby.sum() - data_dict = {'obsm': {'sum': agg}} if write_to_obsm else {'X': agg} + data_dict = {'obsm': {'sum': agg}} if write_to_xxxm else {'X': agg} else: agg = groupby.count_mean_var(dof) - write_to_obsm = 'obsm' if write_to_obsm else 'layers' - obs_var_dict['obs']['count'] = agg['count'] - data_dict = {write_to_obsm: {'mean': agg['mean'], 'var': agg['var']}} + write_key = 'obsm' if write_to_xxxm else 'layers' + obs_var_dict['obs']['count'] = agg['count'] # count in df + data_dict = { + write_key: {'mean': agg['mean'], 'var': agg['var']} + } # others in layers/obsm adata_agg = AnnData(**{**data_dict, **obs_var_dict}) if groupby_df_key == 'var': return adata_agg.T diff --git a/scanpy/tests/test_groupby.py b/scanpy/tests/test_groupby.py index 11b95aa9d..89eaf7c20 100644 --- a/scanpy/tests/test_groupby.py +++ b/scanpy/tests/test_groupby.py @@ -127,7 +127,7 @@ def test_groupby_different_data_locations(data_key, groupby_df_key): by="key", groupby_df_key=groupby_df_key, how='count_mean_var', - weight="weight", + weight_key="weight", **data_dict, ) sum_ = sc.get.aggregated( @@ -138,7 +138,7 @@ def test_groupby_different_data_locations(data_key, groupby_df_key): by="key", groupby_df_key=groupby_df_key, how='sum', - weight="weight", + weight_key="weight", **data_dict, ) @@ -300,7 +300,7 @@ def test_groupby_X(groupby_df_key): by="key", groupby_df_key=groupby_df_key, how='count_mean_var', - weight="weight", + weight_key="weight", ) sum_ = sc.get.aggregated( adata_sparse, by="key", groupby_df_key=groupby_df_key, how='sum' @@ -310,7 +310,7 @@ def test_groupby_X(groupby_df_key): by="key", groupby_df_key=groupby_df_key, how='sum', - weight="weight", + weight_key="weight", ) assert np.allclose(2 * sum_.X, sum_weight.X) From ed953734ee10bbfc4bf84bf461c7a41e42694813 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Aug 2023 15:37:30 +0200 Subject: [PATCH 17/89] (chore): `black` --- scanpy/get/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/get/groupby.py b/scanpy/get/groupby.py index d4def7b5d..a371b7988 100644 --- a/scanpy/get/groupby.py +++ b/scanpy/get/groupby.py @@ -320,7 +320,7 @@ def aggregated( write_to_xxxm = None if varm is not None: data = adata.varm[varm] - write_to_xxxm = True # the data will have to be transposed so this is accurate + write_to_xxxm = True # the data will have to be transposed so this is accurate elif obsm is not None: data = adata.obsm[obsm] write_to_xxxm = True From f1b9d4dcfb7da09ad3ac9fa763bb9ef4c46b7116 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Aug 2023 15:40:20 +0200 Subject: [PATCH 18/89] (chore): `g(G)roupby` -> `a(A)ggregated` --- scanpy/get/__init__.py | 2 +- scanpy/get/{groupby.py => aggregated.py} | 6 +++--- scanpy/tests/{test_groupby.py => test_aggregated.py} 
| 0 3 files changed, 4 insertions(+), 4 deletions(-) rename scanpy/get/{groupby.py => aggregated.py} (99%) rename scanpy/tests/{test_groupby.py => test_aggregated.py} (100%) diff --git a/scanpy/get/__init__.py b/scanpy/get/__init__.py index 1789a3dad..e4409e84d 100644 --- a/scanpy/get/__init__.py +++ b/scanpy/get/__init__.py @@ -4,4 +4,4 @@ # Private from .get import _get_obs_rep, _set_obs_rep -from .groupby import aggregated, aggregated_from_array +from .aggregated import aggregated, aggregated_from_array diff --git a/scanpy/get/groupby.py b/scanpy/get/aggregated.py similarity index 99% rename from scanpy/get/groupby.py rename to scanpy/get/aggregated.py index a371b7988..790540b56 100644 --- a/scanpy/get/groupby.py +++ b/scanpy/get/aggregated.py @@ -19,7 +19,7 @@ Array = Union[np.ndarray, spmatrix] -class GroupBy: +class Aggregate: """ Functionality for grouping and aggregating AnnData observations by key on `obs` and if you want to do `var`, transpose the AnnData object first. @@ -139,7 +139,7 @@ def count_mean_var(self, dof: int = 1) -> dict: if self._weight is None: sq_mean = mean_**2 else: - A_unweighted, _ = GroupBy( + A_unweighted, _ = Aggregate( groupby=self._groupby, data=self._data, weight=self._weight, @@ -376,7 +376,7 @@ def aggregated_from_array( Returns: AnnData: _description_ """ - groupby = GroupBy( + groupby = Aggregate( groupby=groupby_df[by], data=data, weight=groupby_df[weight_key] if weight_key is not None else None, diff --git a/scanpy/tests/test_groupby.py b/scanpy/tests/test_aggregated.py similarity index 100% rename from scanpy/tests/test_groupby.py rename to scanpy/tests/test_aggregated.py From 8faeec8c6bb0131f6f74449180e28a2bd2c96f87 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Aug 2023 15:44:56 +0200 Subject: [PATCH 19/89] (style): small docstring changes + export docs --- docs/api/get.md | 2 ++ scanpy/get/aggregated.py | 34 +++++++++++++++++----------------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/docs/api/get.md b/docs/api/get.md index 82f74ed41..224f74e31 100644 --- a/docs/api/get.md +++ b/docs/api/get.md @@ -19,5 +19,7 @@ useful formats. get.obs_df get.var_df get.rank_genes_groups_df + get.aggregated + get.aggregated_from_array ``` diff --git a/scanpy/get/aggregated.py b/scanpy/get/aggregated.py index 790540b56..a4e256526 100644 --- a/scanpy/get/aggregated.py +++ b/scanpy/get/aggregated.py @@ -295,26 +295,26 @@ def aggregated( weight_key: Optional[str] = None, key_set: Optional[Iterable[str]] = None, dof: int = 1, - layer=None, - obsm=None, - varm=None, + layer: Optional[str] = None, + obsm: Optional[str] = None, + varm: Optional[str] = None, ) -> AnnData: """Aggregate data based on one of the columns of one of the axes (`obs` or `var`). If none of `layer`, `obsm`, or `varm` are passed in, `X` will be used for aggregation data. Args: adata (AnnData): AnnData to be aggregated. by (str): Key of the column to be grouped-by. - how (Literal['count', 'mean', 'sum', 'count_mean_var'], optional): _description_. Defaults to 'count_mean_var'. - groupby_df_key (Literal['obs', 'var'], optional): _description_. Defaults to 'obs'. - weight_key (Optional[str], optional): _description_. Defaults to None. Key of the `groupby_df_key` containing weights for a weighted sum aggregation. - key_set (Optional[Iterable[str]], optional): _description_. Defaults to None. Subset of groupby_df_key on which to filter. - dof (int, optional): _description_. Defaults to 1. Degrees of freedom for variance. - layer (_type_, optional): _description_. 
Defaults to None. If not None, key for aggregation data. - obsm (_type_, optional): _description_. Defaults to None. If not None, key for aggregation data. - varm (_type_, optional): _description_. Defaults to None. If not None, key for aggregation data. + how (Literal['count', 'mean', 'sum', 'count_mean_var'], optional): How to aggregate. Defaults to 'count_mean_var'. + groupby_df_key (Literal['obs', 'var'], optional): Axis on which to find group by column. Defaults to 'obs'. + weight_key (Optional[str], optional): Key of the `groupby_df_key` containing weights for a weighted sum aggregation. Defaults to None. + key_set (Optional[Iterable[str]], optional): Subset of groupby_df_key on which to filter. Defaults to None. + dof (int, optional): Degrees of freedom for variance. Defaults to 1. + layer (str, optional): If not None, key for aggregation data. Defaults to None. + obsm (str, optional): If not None, key for aggregation data. Defaults to None. + varm (str, optional): If not None, key for aggregation data. Defaults to None. Returns: - _type_: _description_ + AnnData: Aggregated AnnData. """ data = adata.X write_to_xxxm = None @@ -368,13 +368,13 @@ def aggregated_from_array( by (str): Key of the groupby DataFrame for grouping. write_to_xxxm (bool): Whether or not to write aggregation data to `varm` or `obsm` (based on `groupby_df_key`) no_groupby_df (pd.DataFrame): DataFrame on the opposite axis of groupby_df_key. - weight_key (Optional[str], optional): _description_. Defaults to None. Key of the `groupby_df_key` containing weights for a weighted sum aggregation. - key_set (Optional[Iterable[str]], optional): _description_. Defaults to None. Subset of groupby_df_key on which to filter. - how (Literal['count', 'mean', 'sum', 'count_mean_var'], optional): _description_. Defaults to 'count_mean_var'. - dof (int, optional): _description_. Defaults to 1. Degrees of freedom for variance. + weight_key (Optional[str], optional): Key of the `groupby_df_key` containing weights for a weighted sum aggregation. Defaults to None. + key_set (Optional[Iterable[str]], optional): Defaults to None. Subset of groupby_df_key on which to filter. + how (Literal['count', 'mean', 'sum', 'count_mean_var'], optional): How to aggregate. Defaults to 'count_mean_var'. + dof (int, optional): Degrees of freedom for variance. Defaults to 1. Returns: - AnnData: _description_ + AnnData: Aggregated AnnData """ groupby = Aggregate( groupby=groupby_df[by], From 10b2056bead26d2d823d49c02a91ec112f36f99a Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 4 Aug 2023 15:46:33 +0200 Subject: [PATCH 20/89] (chore): small doc fix --- scanpy/get/aggregated.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scanpy/get/aggregated.py b/scanpy/get/aggregated.py index a4e256526..77cc33954 100644 --- a/scanpy/get/aggregated.py +++ b/scanpy/get/aggregated.py @@ -21,8 +21,7 @@ class Aggregate: """ - Functionality for grouping and aggregating AnnData observations by key on `obs` - and if you want to do `var`, transpose the AnnData object first. + Functionality for generic grouping and aggregating. There is currently support for count, sum, mean, and varience per group, and for scores derived from these per pair of groups. 
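
The `Aggregate` docstring being revised across these patches describes the core trick:
every aggregation is a multiplication by a sparse coordinate matrix A (built by
`_sparse_aggregator`). A is simply a group-indicator matrix, with one row per group and
one column per observation. The sketch below is not scanpy code; it is a minimal
standalone reconstruction of that strategy using only NumPy/SciPy, with illustrative
names (`A`, `X`, `keys`):

    import numpy as np
    import pandas as pd
    from scipy.sparse import coo_matrix

    X = np.array([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]])  # 3 observations x 2 features
    keys = pd.Categorical(["a", "b", "a"])  # group label per observation

    # Indicator matrix A (n_groups x n_obs): A[g, i] == 1 iff observation i is in group g.
    rows = keys.codes
    cols = np.arange(X.shape[0])
    A = coo_matrix(
        (np.ones(len(cols)), (rows, cols)),
        shape=(len(keys.categories), X.shape[0]),
    ).tocsr()

    sums = np.asarray(A @ X)            # per-group sums in a single sparse product
    counts = np.asarray(A.sum(axis=1))  # observations per group, shape (n_groups, 1)
    means = sums / counts               # row-normalizing A yields means directly
    # Per-group Var(X) = E[X^2] - E[X]^2, subject to the precision caveat noted
    # in count_mean_var's docstring.
    variances = np.asarray(A @ (X**2)) / counts - means**2

Weighting amounts to scaling A's nonzeros before the product, which is why sum, mean,
and variance all reduce to the same O(data) sparse multiply described in the docstring.
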
From 225ee790fdb658c5d04ba20fa01173071d0243a0 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 4 Aug 2023 15:55:16 +0200
Subject: [PATCH 21/89] (fix): replace `Union` in `singledispatch` with classes

---
 scanpy/get/aggregated.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scanpy/get/aggregated.py b/scanpy/get/aggregated.py
index 77cc33954..e6b3d998e 100644
--- a/scanpy/get/aggregated.py
+++ b/scanpy/get/aggregated.py
@@ -345,7 +345,8 @@ def aggregated(
 )


-@aggregated.register(Array)
+@aggregated.register(np.ndarray)
+@aggregated.register(spmatrix)
 def aggregated_from_array(

From b8e4fda1caaee7cf08dbcb25a2bac2fb9dbfd9b7 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 4 Aug 2023 16:23:40 +0200
Subject: [PATCH 22/89] (fix): remove final mentions of `score` and other small
 doc fixes

---
 scanpy/get/aggregated.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/scanpy/get/aggregated.py b/scanpy/get/aggregated.py
index e6b3d998e..893092701 100644
--- a/scanpy/get/aggregated.py
+++ b/scanpy/get/aggregated.py
@@ -23,8 +23,7 @@ class Aggregate:
     """
     Functionality for generic grouping and aggregating.

-    There is currently support for count, sum, mean, and varience per group, and for scores
-    derived from these per pair of groups.
+    There is currently support for count, sum, mean, and variance.

     Set `weight` for weighted sum, mean, and variance.

@@ -32,8 +31,8 @@ class Aggregate:

     **Implementation**

-    Moments are computed using weighted sum aggregation of AnnData obsevations per variable
-    (i.e., feature) via multiplication by a sparse coordinate matrix A, exposed by
+    Moments are computed using weighted sum aggregation of data by some feature
+    via multiplication by a sparse coordinate matrix A, exposed by
     `_sparse_aggregator`. The approach works with data in ndarray or scipy sparse formats, with
     no view or copy overhead on runtime or memory, even when filtering keys.

@@ -42,10 +41,6 @@ class Aggregate:
     O(data) for partitions (each observation belonging to exactly one group), independent of
     the number of groups.

-    To compute scores, first statistics are computed for each group in at least one pair, and
-    then scores are computed for each pair using the statistics. Runtime is dominated by the
-    former, so is effectively independent of the number of pairs.
-
     Params
     ------
     _groupby

From c3a082390d440736e5baf26dc2cebe2174db3a97 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 8 Aug 2023 09:30:33 +0200
Subject: [PATCH 23/89] (fix): doc string

---
 scanpy/get/aggregated.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scanpy/get/aggregated.py b/scanpy/get/aggregated.py
index 893092701..11f13c5ee 100644
--- a/scanpy/get/aggregated.py
+++ b/scanpy/get/aggregated.py
@@ -49,7 +49,7 @@ class Aggregate:
         Data matrix for aggregation.
     _weight
         Weights to be used for aggergation.
-    key_set
+    _key_set
         Subset of keys to which to filter.
""" From 24b074967c0390e36fa35eed48c448f596ee5eb6 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Tue, 8 Aug 2023 09:36:01 +0200 Subject: [PATCH 24/89] Apply suggestions from code review Co-authored-by: Isaac Virshup --- scanpy/get/__init__.py | 2 +- scanpy/get/aggregated.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/scanpy/get/__init__.py b/scanpy/get/__init__.py index e4409e84d..98ab53314 100644 --- a/scanpy/get/__init__.py +++ b/scanpy/get/__init__.py @@ -4,4 +4,4 @@ # Private from .get import _get_obs_rep, _set_obs_rep -from .aggregated import aggregated, aggregated_from_array +from ._aggregated import aggregated diff --git a/scanpy/get/aggregated.py b/scanpy/get/aggregated.py index 11f13c5ee..3cd737932 100644 --- a/scanpy/get/aggregated.py +++ b/scanpy/get/aggregated.py @@ -275,7 +275,7 @@ def _df_grouped(df: pd.DataFrame, key: str, key_set: List[str]) -> pd.DataFrame: df = df.copy() if key_set is not None: df = df[df[key].isin(key_set)] - if df[key].dtype.name == 'category': + if pd.api.types.is_categorical_dtype(df[key]): df[key] = df[key].cat.remove_unused_categories() return df.groupby(key).first()[_superset_columns(df, key)] @@ -285,6 +285,7 @@ def aggregated( adata: AnnData, by: str, how: Literal['count', 'mean', 'sum', 'count_mean_var'] = 'count_mean_var', + *, groupby_df_key: Literal['obs', 'var'] = 'obs', weight_key: Optional[str] = None, key_set: Optional[Iterable[str]] = None, From cf5e9525806b9db8745097118c0b7e4eeeaf992f Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 8 Aug 2023 09:50:27 +0200 Subject: [PATCH 25/89] (chore): apply style/args changes --- scanpy/get/{aggregated.py => _aggregated.py} | 109 ++++++++++++------- scanpy/tests/test_aggregated.py | 86 +++++++-------- 2 files changed, 112 insertions(+), 83 deletions(-) rename scanpy/get/{aggregated.py => _aggregated.py} (81%) diff --git a/scanpy/get/aggregated.py b/scanpy/get/_aggregated.py similarity index 81% rename from scanpy/get/aggregated.py rename to scanpy/get/_aggregated.py index 3cd737932..7d218c562 100644 --- a/scanpy/get/aggregated.py +++ b/scanpy/get/_aggregated.py @@ -286,7 +286,7 @@ def aggregated( by: str, how: Literal['count', 'mean', 'sum', 'count_mean_var'] = 'count_mean_var', *, - groupby_df_key: Literal['obs', 'var'] = 'obs', + dim: Literal['obs', 'var'] = 'obs', weight_key: Optional[str] = None, key_set: Optional[Iterable[str]] = None, dof: int = 1, @@ -294,22 +294,37 @@ def aggregated( obsm: Optional[str] = None, varm: Optional[str] = None, ) -> AnnData: - """Aggregate data based on one of the columns of one of the axes (`obs` or `var`). If none of `layer`, `obsm`, or `varm` are passed in, `X` will be used for aggregation data. - - Args: - adata (AnnData): AnnData to be aggregated. - by (str): Key of the column to be grouped-by. - how (Literal['count', 'mean', 'sum', 'count_mean_var'], optional): How to aggregate. Defaults to 'count_mean_var'. - groupby_df_key (Literal['obs', 'var'], optional): Axis on which to find group by column. Defaults to 'obs'. - weight_key (Optional[str], optional): Key of the `groupby_df_key` containing weights for a weighted sum aggregation. Defaults to None. - key_set (Optional[Iterable[str]], optional): Subset of groupby_df_key on which to filter. Defaults to None. - dof (int, optional): Degrees of freedom for variance. Defaults to 1. - layer (str, optional): If not None, key for aggregation data. Defaults to None. - obsm (str, optional): If not None, key for aggregation data. Defaults to None. 
- varm (str, optional): If not None, key for aggregation data. Defaults to None. - - Returns: - AnnData: Aggregated AnnData. + """\ + Aggregate data based on one of the columns of one of the axes (`obs` or `var`). + If none of `layer`, `obsm`, or `varm` are passed in, `X` will be used for aggregation data. + + Parameters + ---------- + adata: + :class:`~anndata.AnnData` to be aggregated. + by: + Key of the column to be grouped-by. + how: + How to aggregate. Defaults to 'count_mean_var'. + dim: + Axis on which to find group by column. Defaults to 'obs'. + weight_key: + Key of the `dim` containing weights for a weighted sum aggregation. Defaults to None. + key_set: + Subset of dim on which to filter. Defaults to None. + dof: + Degrees of freedom for variance. Defaults to 1. + layer: + If not None, key for aggregation data. Defaults to None. + obsm: + If not None, key for aggregation data. Defaults to None. + varm: + If not None, key for aggregation data. Defaults to None. + + Returns + ------- + AnnData: + Aggregated :class:`~anndata.AnnData`. """ data = adata.X write_to_xxxm = None @@ -321,19 +336,19 @@ def aggregated( write_to_xxxm = True elif layer is not None: data = adata.layers[layer] - if groupby_df_key == 'var': + if dim == 'var': data = data.T elif ( - groupby_df_key == 'var' + dim == 'var' ): # i.e., all of `varm`, `obsm`, `layers` are None so we use `X` which must be transposed data = data.T return aggregated( data, - groupby_df=getattr(adata, groupby_df_key), - groupby_df_key=groupby_df_key, + groupby_df=getattr(adata, dim), + dim=dim, by=by, write_to_xxxm=write_to_xxxm, - no_groupby_df=getattr(adata, 'var' if groupby_df_key == 'obs' else 'obs'), + no_groupby_df=getattr(adata, 'var' if dim == 'obs' else 'obs'), weight_key=weight_key, key_set=key_set, how=how, @@ -346,7 +361,7 @@ def aggregated( def aggregated_from_array( data, groupby_df: pd.DataFrame, - groupby_df_key: str, + dim: str, by: str, write_to_xxxm: bool, no_groupby_df: pd.DataFrame, @@ -355,22 +370,36 @@ def aggregated_from_array( how: Literal['count', 'mean', 'sum', 'count_mean_var'] = 'count_mean_var', dof: int = 1, ) -> AnnData: - """Aggregate data based on one of the columns of one of a DataFrame. - - Args: - data (Array): Data for aggregation. - groupby_df (pd.DataFrame): DataFrame with column to be grouped on. - groupby_df_key (str): Key of AnnData corresponding to the axis on which the grouped by data belongs. - by (str): Key of the groupby DataFrame for grouping. - write_to_xxxm (bool): Whether or not to write aggregation data to `varm` or `obsm` (based on `groupby_df_key`) - no_groupby_df (pd.DataFrame): DataFrame on the opposite axis of groupby_df_key. - weight_key (Optional[str], optional): Key of the `groupby_df_key` containing weights for a weighted sum aggregation. Defaults to None. - key_set (Optional[Iterable[str]], optional): Defaults to None. Subset of groupby_df_key on which to filter. - how (Literal['count', 'mean', 'sum', 'count_mean_var'], optional): How to aggregate. Defaults to 'count_mean_var'. - dof (int, optional): Degrees of freedom for variance. Defaults to 1. - - Returns: - AnnData: Aggregated AnnData + """\ + Aggregate data based on one of the columns of one of a `~pd.DataFrame`. + + Parameters + ---------- + data: + Data for aggregation. + groupby_df: + `~pd.DataFrame` with column to be grouped on. + dim: + Key of AnnData corresponding to the dim on which the grouped by data belongs. + by: + Key of the groupby `~pd.DataFrame` for grouping. 
+ write_to_xxxm: + Whether or not to write aggregation data to `varm` or `obsm` (based on `dim`) + no_groupby_df: + `~pd.DataFrame` on the opposite dim of dim. + weight_key: + Key of the `dim` containing weights for a weighted sum aggregation. Defaults to None. + key_set: + Defaults to None. Subset of dim on which to filter. + how: + How to aggregate. Defaults to 'count_mean_var'. + dof: + Degrees of freedom for variance. Defaults to 1. + + Returns + ------- + AnnData: + Aggregated :class:`~anndata.AnnData`. """ groupby = Aggregate( groupby=groupby_df[by], @@ -397,6 +426,6 @@ def aggregated_from_array( write_key: {'mean': agg['mean'], 'var': agg['var']} } # others in layers/obsm adata_agg = AnnData(**{**data_dict, **obs_var_dict}) - if groupby_df_key == 'var': + if dim == 'var': return adata_agg.T return adata_agg diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index 89eaf7c20..3590e59a1 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -11,15 +11,15 @@ ['layers', 'obsm', 'varm'], ) @pytest.mark.parametrize( - 'groupby_df_key', + 'dim', [ 'obs', 'var', ], ) -def test_groupby_different_data_locations(data_key, groupby_df_key): - if (data_key == 'varm' and groupby_df_key == 'obs') or ( - data_key == 'obsm' and groupby_df_key == 'var' +def test_groupby_different_data_locations(data_key, dim): + if (data_key == 'varm' and dim == 'obs') or ( + data_key == 'obsm' and dim == 'var' ): pytest.skip("invalid parameter combination") ax_base = ["A", "B"] @@ -68,7 +68,7 @@ def test_groupby_different_data_locations(data_key, groupby_df_key): dtype=np.float32, ) data_dense = X - if groupby_df_key == 'obs': + if dim == 'obs': data_sparse_mat_dict = {data_key: {'test': csr_matrix(X)}} adata_sparse = ad.AnnData( **{'obs': df_groupby, 'var': df_base, **data_sparse_mat_dict} @@ -93,25 +93,25 @@ def test_groupby_different_data_locations(data_key, groupby_df_key): stats_sparse = sc.get.aggregated( adata_sparse, by="key", - groupby_df_key=groupby_df_key, + dim=dim, how='count_mean_var', **data_dict, ) stats_dense = sc.get.aggregated( adata_dense, by="key", - groupby_df_key=groupby_df_key, + dim=dim, how='count_mean_var', **data_dict, ) # superset columns can be kept but not subsets - assert 'key_superset' in getattr(stats_sparse, groupby_df_key) - assert 'key_subset' not in getattr(stats_sparse, groupby_df_key) + assert 'key_superset' in getattr(stats_sparse, dim) + assert 'key_subset' not in getattr(stats_sparse, dim) assert np.allclose( - getattr(stats_sparse, groupby_df_key)['count'], - getattr(stats_sparse, groupby_df_key)['count'], + getattr(stats_sparse, dim)['count'], + getattr(stats_sparse, dim)['count'], ) assert np.allclose( getattr(stats_sparse, data_key)['mean'], getattr(stats_dense, data_key)['mean'] @@ -125,18 +125,18 @@ def test_groupby_different_data_locations(data_key, groupby_df_key): stats_weight = sc.get.aggregated( adata_dense, by="key", - groupby_df_key=groupby_df_key, + dim=dim, how='count_mean_var', weight_key="weight", **data_dict, ) sum_ = sc.get.aggregated( - adata_sparse, by="key", groupby_df_key=groupby_df_key, how='sum', **data_dict + adata_sparse, by="key", dim=dim, how='sum', **data_dict ) sum_weight = sc.get.aggregated( adata_dense, by="key", - groupby_df_key=groupby_df_key, + dim=dim, how='sum', weight_key="weight", **data_dict, @@ -164,15 +164,15 @@ def get_single_agg(adata, key, agg): mean_key_set_adata = sc.get.aggregated( adata_dense, by="key", - groupby_df_key=groupby_df_key, + dim=dim, how='mean', 
key_set=key_set, **data_dict, ) - subset_idx = getattr(stats_sparse, groupby_df_key).index.isin(key_set) + subset_idx = getattr(stats_sparse, dim).index.isin(key_set) subset_adata = ( stats_sparse[subset_idx, :] - if groupby_df_key == 'obs' + if dim == 'obs' else stats_sparse[:, subset_idx] ) subset_mean = getattr(subset_adata, data_key)['mean'] @@ -180,12 +180,12 @@ def get_single_agg(adata, key, agg): assert np.allclose(subset_mean, key_set_mean) df = pd.DataFrame( - index=getattr(adata_dense, groupby_df_key)["key"], + index=getattr(adata_dense, dim)["key"], columns=getattr( - adata_dense, f"{'var' if groupby_df_key == 'obs' else 'obs'}_names" + adata_dense, f"{'var' if dim == 'obs' else 'obs'}_names" ), data=data_dense.T - if groupby_df_key == 'var' and data_key != 'varm' + if dim == 'var' and data_key != 'varm' else data_dense, ) grouped_agg_df = ( @@ -195,27 +195,27 @@ def get_single_agg(adata, key, agg): .sort_index(axis=1) ) mean = getattr(stats_dense, data_key)['mean'] - if groupby_df_key == 'var' and data_key != 'varm': + if dim == 'var' and data_key != 'varm': mean = mean.T assert np.allclose(mean, grouped_agg_df['mean'].values) var = getattr(stats_dense, data_key)['var'] - if groupby_df_key == 'var' and data_key != 'varm': + if dim == 'var' and data_key != 'varm': var = var.T assert np.allclose(var, grouped_agg_df['var'].values, equal_nan=True) assert np.allclose( - getattr(stats_dense, groupby_df_key)['count'], + getattr(stats_dense, dim)['count'], grouped_agg_df['count']['A'].values, ) # returns for both columns but counts only needs one because it is the same @pytest.mark.parametrize( - 'groupby_df_key', + 'dim', [ 'obs', 'var', ], ) -def test_groupby_X(groupby_df_key): +def test_groupby_X(dim): ax_base = ["A", "B"] ax_groupby = [ "v0", @@ -262,7 +262,7 @@ def test_groupby_X(groupby_df_key): dtype=np.float32, ) data_dense = X - if groupby_df_key == 'obs': + if dim == 'obs': adata_sparse = ad.AnnData(obs=df_groupby, var=df_base, X=csr_matrix(X)) adata_dense = ad.AnnData(obs=df_groupby, var=df_base, X=X) else: @@ -272,23 +272,23 @@ def test_groupby_X(groupby_df_key): stats_sparse = sc.get.aggregated( adata_sparse, by="key", - groupby_df_key=groupby_df_key, + dim=dim, how='count_mean_var', ) stats_dense = sc.get.aggregated( adata_dense, by="key", - groupby_df_key=groupby_df_key, + dim=dim, how='count_mean_var', ) # superset columns can be kept but not subsets - assert 'key_superset' in getattr(stats_sparse, groupby_df_key) - assert 'key_subset' not in getattr(stats_sparse, groupby_df_key) + assert 'key_superset' in getattr(stats_sparse, dim) + assert 'key_subset' not in getattr(stats_sparse, dim) assert np.allclose( - getattr(stats_sparse, groupby_df_key)['count'], - getattr(stats_sparse, groupby_df_key)['count'], + getattr(stats_sparse, dim)['count'], + getattr(stats_sparse, dim)['count'], ) assert np.allclose(stats_sparse.layers['mean'], stats_dense.layers['mean']) assert np.allclose( @@ -298,17 +298,17 @@ def test_groupby_X(groupby_df_key): stats_weight = sc.get.aggregated( adata_dense, by="key", - groupby_df_key=groupby_df_key, + dim=dim, how='count_mean_var', weight_key="weight", ) sum_ = sc.get.aggregated( - adata_sparse, by="key", groupby_df_key=groupby_df_key, how='sum' + adata_sparse, by="key", dim=dim, how='sum' ) sum_weight = sc.get.aggregated( adata_dense, by="key", - groupby_df_key=groupby_df_key, + dim=dim, how='sum', weight_key="weight", ) @@ -323,14 +323,14 @@ def test_groupby_X(groupby_df_key): mean_key_set_adata = sc.get.aggregated( adata_dense, by="key", - 
groupby_df_key=groupby_df_key, + dim=dim, how='mean', key_set=key_set, ) - subset_idx = getattr(stats_sparse, groupby_df_key).index.isin(key_set) + subset_idx = getattr(stats_sparse, dim).index.isin(key_set) subset_adata = ( stats_sparse[subset_idx, :] - if groupby_df_key == 'obs' + if dim == 'obs' else stats_sparse[:, subset_idx] ) subset_mean = subset_adata.layers['mean'] @@ -338,9 +338,9 @@ def test_groupby_X(groupby_df_key): assert np.allclose(subset_mean, key_set_mean) df = pd.DataFrame( - index=getattr(adata_dense, groupby_df_key)["key"], + index=getattr(adata_dense, dim)["key"], columns=getattr( - adata_dense, f"{'var' if groupby_df_key == 'obs' else 'obs'}_names" + adata_dense, f"{'var' if dim == 'obs' else 'obs'}_names" ), data=data_dense, ) @@ -351,14 +351,14 @@ def test_groupby_X(groupby_df_key): .sort_index(axis=1) ) mean = stats_dense.layers['mean'] - if groupby_df_key == 'var': + if dim == 'var': mean = mean.T assert np.allclose(mean, grouped_agg_df['mean'].values) var = stats_dense.layers['var'] - if groupby_df_key == 'var': + if dim == 'var': var = var.T assert np.allclose(var, grouped_agg_df['var'].values, equal_nan=True) assert np.allclose( - getattr(stats_dense, groupby_df_key)['count'], + getattr(stats_dense, dim)['count'], grouped_agg_df['count']['A'].values, ) # returns for both columns but counts only needs one because it is the same From b0a449be03db08a6abdeac593b13975a5df7d611 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 8 Aug 2023 10:13:12 +0200 Subject: [PATCH 26/89] (chore): Remove defaults. --- docs/api/get.md | 1 - scanpy/get/_aggregated.py | 22 +++++++++++----------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/docs/api/get.md b/docs/api/get.md index 224f74e31..74865d184 100644 --- a/docs/api/get.md +++ b/docs/api/get.md @@ -20,6 +20,5 @@ useful formats. get.var_df get.rank_genes_groups_df get.aggregated - get.aggregated_from_array ``` diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 7d218c562..489ab1d11 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -305,21 +305,21 @@ def aggregated( by: Key of the column to be grouped-by. how: - How to aggregate. Defaults to 'count_mean_var'. + How to aggregate. dim: - Axis on which to find group by column. Defaults to 'obs'. + Axis on which to find group by column. weight_key: - Key of the `dim` containing weights for a weighted sum aggregation. Defaults to None. + Key of the `dim` containing weights for a weighted sum aggregation. key_set: - Subset of dim on which to filter. Defaults to None. + Subset of dim on which to filter. dof: Degrees of freedom for variance. Defaults to 1. layer: - If not None, key for aggregation data. Defaults to None. + If not None, key for aggregation data. obsm: - If not None, key for aggregation data. Defaults to None. + If not None, key for aggregation data. varm: - If not None, key for aggregation data. Defaults to None. + If not None, key for aggregation data. Returns ------- @@ -384,15 +384,15 @@ def aggregated_from_array( by: Key of the groupby `~pd.DataFrame` for grouping. write_to_xxxm: - Whether or not to write aggregation data to `varm` or `obsm` (based on `dim`) + Whether or not to write aggregation data to `varm` or `obsm` (based on `dim`). no_groupby_df: `~pd.DataFrame` on the opposite dim of dim. weight_key: - Key of the `dim` containing weights for a weighted sum aggregation. Defaults to None. + Key of the `dim` containing weights for a weighted sum aggregation. key_set: - Defaults to None. 
Subset of dim on which to filter. + Subset of dim on which to filter. how: - How to aggregate. Defaults to 'count_mean_var'. + How to aggregate. dof: Degrees of freedom for variance. Defaults to 1. From 7c6727d8bae9a3f6e67929adf85c43157a0d998a Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 8 Aug 2023 10:34:45 +0200 Subject: [PATCH 27/89] (chore): `black` --- scanpy/tests/test_aggregated.py | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index 3590e59a1..d05b8af87 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -18,9 +18,7 @@ ], ) def test_groupby_different_data_locations(data_key, dim): - if (data_key == 'varm' and dim == 'obs') or ( - data_key == 'obsm' and dim == 'var' - ): + if (data_key == 'varm' and dim == 'obs') or (data_key == 'obsm' and dim == 'var'): pytest.skip("invalid parameter combination") ax_base = ["A", "B"] ax_groupby = [ @@ -130,9 +128,7 @@ def test_groupby_different_data_locations(data_key, dim): weight_key="weight", **data_dict, ) - sum_ = sc.get.aggregated( - adata_sparse, by="key", dim=dim, how='sum', **data_dict - ) + sum_ = sc.get.aggregated(adata_sparse, by="key", dim=dim, how='sum', **data_dict) sum_weight = sc.get.aggregated( adata_dense, by="key", @@ -171,9 +167,7 @@ def get_single_agg(adata, key, agg): ) subset_idx = getattr(stats_sparse, dim).index.isin(key_set) subset_adata = ( - stats_sparse[subset_idx, :] - if dim == 'obs' - else stats_sparse[:, subset_idx] + stats_sparse[subset_idx, :] if dim == 'obs' else stats_sparse[:, subset_idx] ) subset_mean = getattr(subset_adata, data_key)['mean'] key_set_mean = get_single_agg(mean_key_set_adata, data_key, 'mean') @@ -181,12 +175,8 @@ def get_single_agg(adata, key, agg): df = pd.DataFrame( index=getattr(adata_dense, dim)["key"], - columns=getattr( - adata_dense, f"{'var' if dim == 'obs' else 'obs'}_names" - ), - data=data_dense.T - if dim == 'var' and data_key != 'varm' - else data_dense, + columns=getattr(adata_dense, f"{'var' if dim == 'obs' else 'obs'}_names"), + data=data_dense.T if dim == 'var' and data_key != 'varm' else data_dense, ) grouped_agg_df = ( df.groupby('key') @@ -302,9 +292,7 @@ def test_groupby_X(dim): how='count_mean_var', weight_key="weight", ) - sum_ = sc.get.aggregated( - adata_sparse, by="key", dim=dim, how='sum' - ) + sum_ = sc.get.aggregated(adata_sparse, by="key", dim=dim, how='sum') sum_weight = sc.get.aggregated( adata_dense, by="key", @@ -329,9 +317,7 @@ def test_groupby_X(dim): ) subset_idx = getattr(stats_sparse, dim).index.isin(key_set) subset_adata = ( - stats_sparse[subset_idx, :] - if dim == 'obs' - else stats_sparse[:, subset_idx] + stats_sparse[subset_idx, :] if dim == 'obs' else stats_sparse[:, subset_idx] ) subset_mean = subset_adata.layers['mean'] key_set_mean = mean_key_set_adata.X @@ -339,9 +325,7 @@ def test_groupby_X(dim): df = pd.DataFrame( index=getattr(adata_dense, dim)["key"], - columns=getattr( - adata_dense, f"{'var' if dim == 'obs' else 'obs'}_names" - ), + columns=getattr(adata_dense, f"{'var' if dim == 'obs' else 'obs'}_names"), data=data_dense, ) grouped_agg_df = ( From 4ad8f14b06a860bbf8063cee7f1e5d97fc7ce524 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 8 Aug 2023 10:36:47 +0200 Subject: [PATCH 28/89] (chore): `how` -> `func` --- scanpy/get/_aggregated.py | 16 ++++++++-------- scanpy/tests/test_aggregated.py | 24 ++++++++++++------------ 2 files changed, 20 insertions(+), 20 deletions(-) diff 
--git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 489ab1d11..a4c9171bd 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -284,7 +284,7 @@ def _df_grouped(df: pd.DataFrame, key: str, key_set: List[str]) -> pd.DataFrame: def aggregated( adata: AnnData, by: str, - how: Literal['count', 'mean', 'sum', 'count_mean_var'] = 'count_mean_var', + func: Literal['count', 'mean', 'sum', 'count_mean_var'] = 'count_mean_var', *, dim: Literal['obs', 'var'] = 'obs', weight_key: Optional[str] = None, @@ -304,7 +304,7 @@ def aggregated( :class:`~anndata.AnnData` to be aggregated. by: Key of the column to be grouped-by. - how: + func: How to aggregate. dim: Axis on which to find group by column. @@ -351,7 +351,7 @@ def aggregated( no_groupby_df=getattr(adata, 'var' if dim == 'obs' else 'obs'), weight_key=weight_key, key_set=key_set, - how=how, + func=func, dof=dof, ) @@ -367,7 +367,7 @@ def aggregated_from_array( no_groupby_df: pd.DataFrame, weight_key: Optional[str] = None, key_set: Optional[Iterable[str]] = None, - how: Literal['count', 'mean', 'sum', 'count_mean_var'] = 'count_mean_var', + func: Literal['count', 'mean', 'sum', 'count_mean_var'] = 'count_mean_var', dof: int = 1, ) -> AnnData: """\ @@ -391,7 +391,7 @@ def aggregated_from_array( Key of the `dim` containing weights for a weighted sum aggregation. key_set: Subset of dim on which to filter. - how: + func: How to aggregate. dof: Degrees of freedom for variance. Defaults to 1. @@ -410,12 +410,12 @@ def aggregated_from_array( # groupby df is put in `obs`, nongroupby in `var` to be transposed later as appropriate obs_var_dict = {'obs': _df_grouped(groupby_df, by, key_set), 'var': no_groupby_df} data_dict = {} - if how == 'count': + if func == 'count': obs_var_dict['obs']['count'] = groupby.count() # count goes in df - elif how == 'mean': + elif func == 'mean': agg = groupby.mean() data_dict = {'obsm': {'mean': agg}} if write_to_xxxm else {'X': agg} - elif how == 'sum': + elif func == 'sum': agg = groupby.sum() data_dict = {'obsm': {'sum': agg}} if write_to_xxxm else {'X': agg} else: diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index d05b8af87..d5098be64 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -92,14 +92,14 @@ def test_groupby_different_data_locations(data_key, dim): adata_sparse, by="key", dim=dim, - how='count_mean_var', + func='count_mean_var', **data_dict, ) stats_dense = sc.get.aggregated( adata_dense, by="key", dim=dim, - how='count_mean_var', + func='count_mean_var', **data_dict, ) @@ -124,16 +124,16 @@ def test_groupby_different_data_locations(data_key, dim): adata_dense, by="key", dim=dim, - how='count_mean_var', + func='count_mean_var', weight_key="weight", **data_dict, ) - sum_ = sc.get.aggregated(adata_sparse, by="key", dim=dim, how='sum', **data_dict) + sum_ = sc.get.aggregated(adata_sparse, by="key", dim=dim, func='sum', **data_dict) sum_weight = sc.get.aggregated( adata_dense, by="key", dim=dim, - how='sum', + func='sum', weight_key="weight", **data_dict, ) @@ -161,7 +161,7 @@ def get_single_agg(adata, key, agg): adata_dense, by="key", dim=dim, - how='mean', + func='mean', key_set=key_set, **data_dict, ) @@ -263,13 +263,13 @@ def test_groupby_X(dim): adata_sparse, by="key", dim=dim, - how='count_mean_var', + func='count_mean_var', ) stats_dense = sc.get.aggregated( adata_dense, by="key", dim=dim, - how='count_mean_var', + func='count_mean_var', ) # superset columns can be kept but not subsets @@ -289,15 +289,15 @@ 
def test_groupby_X(dim): adata_dense, by="key", dim=dim, - how='count_mean_var', + func='count_mean_var', weight_key="weight", ) - sum_ = sc.get.aggregated(adata_sparse, by="key", dim=dim, how='sum') + sum_ = sc.get.aggregated(adata_sparse, by="key", dim=dim, func='sum') sum_weight = sc.get.aggregated( adata_dense, by="key", dim=dim, - how='sum', + func='sum', weight_key="weight", ) @@ -312,7 +312,7 @@ def test_groupby_X(dim): adata_dense, by="key", dim=dim, - how='mean', + func='mean', key_set=key_set, ) subset_idx = getattr(stats_sparse, dim).index.isin(key_set) From d159166c43f8e2c73c0b6c053e516851336b3a1c Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 8 Aug 2023 11:04:07 +0200 Subject: [PATCH 29/89] (feat): allow for list of `AggType`s --- scanpy/get/_aggregated.py | 61 +++++++++++++++++++++++---------- scanpy/tests/test_aggregated.py | 12 +++---- 2 files changed, 49 insertions(+), 24 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index a4c9171bd..768c4f2f2 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -17,6 +17,7 @@ from scipy.sparse import coo_matrix, dia_matrix, spmatrix Array = Union[np.ndarray, spmatrix] +AggType = Literal['count', 'mean', 'sum', 'var'] class Aggregate: @@ -284,7 +285,7 @@ def _df_grouped(df: pd.DataFrame, key: str, key_set: List[str]) -> pd.DataFrame: def aggregated( adata: AnnData, by: str, - func: Literal['count', 'mean', 'sum', 'count_mean_var'] = 'count_mean_var', + func: AggType | List[AggType], *, dim: Literal['obs', 'var'] = 'obs', weight_key: Optional[str] = None, @@ -297,6 +298,8 @@ def aggregated( """\ Aggregate data based on one of the columns of one of the axes (`obs` or `var`). If none of `layer`, `obsm`, or `varm` are passed in, `X` will be used for aggregation data. + If `func` only has length 1 or is just an `AggType`, then aggregation data is written to `X`. + Otherwise, it is written to `layers` or `xxxm` as appropriate for the dimensions of the aggregation data. Parameters ---------- @@ -361,13 +364,13 @@ def aggregated( def aggregated_from_array( data, groupby_df: pd.DataFrame, + func: AggType | List[AggType], dim: str, by: str, write_to_xxxm: bool, no_groupby_df: pd.DataFrame, weight_key: Optional[str] = None, key_set: Optional[Iterable[str]] = None, - func: Literal['count', 'mean', 'sum', 'count_mean_var'] = 'count_mean_var', dof: int = 1, ) -> AnnData: """\ @@ -379,6 +382,8 @@ def aggregated_from_array( Data for aggregation. groupby_df: `~pd.DataFrame` with column to be grouped on. + func: + How to aggregate. dim: Key of AnnData corresponding to the dim on which the grouped by data belongs. by: @@ -391,8 +396,6 @@ def aggregated_from_array( Key of the `dim` containing weights for a weighted sum aggregation. key_set: Subset of dim on which to filter. - func: - How to aggregate. dof: Degrees of freedom for variance. Defaults to 1. 
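[Review note: a minimal usage sketch of the `func` behavior documented in PATCH 29 above, written against the API as it stands at this point in the series (`sc.get.aggregated` with `by`, `func`, and `dim='obs'` by default). The toy AnnData below is invented for illustration and is not part of the patches; the point is that a single `func` writes its result to `X`, while a list of `func`s writes to `layers`, with `count` landing in the grouped-by dataframe.]

import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc

# Six observations in two groups ("a", "b"), two variables.
adata = ad.AnnData(
    X=np.arange(12, dtype=np.float32).reshape(6, 2),
    obs=pd.DataFrame(
        {"key": pd.Categorical(list("aaabbb"))},
        index=[f"cell{i}" for i in range(6)],
    ),
)

# A single aggregation: the result lands in `.X`, one row per group.
summed = sc.get.aggregated(adata, by="key", func="sum")
print(summed.X)

# Several aggregations at once: `mean` and `var` land in `.layers`,
# while `count` (one value per group) lands in the group dataframe.
stats = sc.get.aggregated(adata, by="key", func=["count", "mean", "var"])
print(stats.obs["count"])
print(stats.layers["mean"], stats.layers["var"])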
@@ -409,22 +412,44 @@ def aggregated_from_array( ) # groupby df is put in `obs`, nongroupby in `var` to be transposed later as appropriate obs_var_dict = {'obs': _df_grouped(groupby_df, by, key_set), 'var': no_groupby_df} - data_dict = {} - if func == 'count': - obs_var_dict['obs']['count'] = groupby.count() # count goes in df - elif func == 'mean': - agg = groupby.mean() - data_dict = {'obsm': {'mean': agg}} if write_to_xxxm else {'X': agg} - elif func == 'sum': + data_dict = { + 'layers': {}, + 'X': None, + 'obsm': {}, + } + func_set = func + write_key = 'obsm' if write_to_xxxm else 'layers' + if not isinstance(func, list): + func_set = [func] + func_set = set(func_set) + if 'sum' in func_set: # sum is calculated separately from the rest agg = groupby.sum() - data_dict = {'obsm': {'sum': agg}} if write_to_xxxm else {'X': agg} - else: + if ( + len(func_set) == 1 and not write_to_xxxm + ): # put aggregation in X if it is the only one and the aggregation data is not coming from `xxxm` + data_dict['X'] = agg + else: + data_dict[write_key]['sum'] = agg + if ( + 'mean' in func_set and 'var' not in func_set + ): # here and below for count, if var is present, these can be calculated alongside var + agg = groupby.mean() + if len(func_set) == 1 and not write_to_xxxm: + data_dict['X'] = agg + else: + data_dict[write_key]['mean'] = agg + if 'count' in func_set and 'var' not in func_set: + obs_var_dict['obs']['count'] = groupby.count() # count goes in dim df + if 'var' in func_set: agg = groupby.count_mean_var(dof) - write_key = 'obsm' if write_to_xxxm else 'layers' - obs_var_dict['obs']['count'] = agg['count'] # count in df - data_dict = { - write_key: {'mean': agg['mean'], 'var': agg['var']} - } # others in layers/obsm + if len(func_set) == 1 and not write_to_xxxm: + data_dict['X'] = agg['var'] + else: + data_dict[write_key]['var'] = agg['var'] + if 'mean' in func_set: + data_dict[write_key]['mean'] = agg['mean'] + if 'count' in func_set: + obs_var_dict['obs']['count'] = agg['count'] adata_agg = AnnData(**{**data_dict, **obs_var_dict}) if dim == 'var': return adata_agg.T diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index d5098be64..cf44214fd 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -92,14 +92,14 @@ def test_groupby_different_data_locations(data_key, dim): adata_sparse, by="key", dim=dim, - func='count_mean_var', + func=['count', 'mean', 'var'], **data_dict, ) stats_dense = sc.get.aggregated( adata_dense, by="key", dim=dim, - func='count_mean_var', + func=['count', 'mean', 'var'], **data_dict, ) @@ -124,7 +124,7 @@ def test_groupby_different_data_locations(data_key, dim): adata_dense, by="key", dim=dim, - func='count_mean_var', + func=['count', 'mean', 'var'], weight_key="weight", **data_dict, ) @@ -263,13 +263,13 @@ def test_groupby_X(dim): adata_sparse, by="key", dim=dim, - func='count_mean_var', + func=['count', 'mean', 'var'], ) stats_dense = sc.get.aggregated( adata_dense, by="key", dim=dim, - func='count_mean_var', + func=['count', 'mean', 'var'], ) @@ -289,7 +289,7 @@ def test_groupby_X(dim): adata_dense, by="key", dim=dim, - func='count_mean_var', + func=['count', 'mean', 'var'], weight_key="weight", ) From e9eede5d6f66593b20a342ef8324ca891588f9bc Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 8 Aug 2023 11:08:48 +0200 Subject: [PATCH 30/89] (chore): `pre-commit` --- scanpy/get/_aggregated.py |
4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 768c4f2f2..f1e7da98e 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -307,7 +307,7 @@ def aggregated( :class:`~anndata.AnnData` to be aggregated. by: Key of the column to be grouped-by. - func: + func: How to aggregate. dim: Axis on which to find group by column. @@ -396,7 +396,7 @@ def aggregated_from_array( Key of the `dim` containing weights for a weighted sum aggregation. key_set: Subset of dim on which to filter. - dof: + dof: Degrees of freedom for variance. Defaults to 1. Returns From 19a3da1a3e0fbe35b15178d6898b2a3ab4fb9451 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 8 Aug 2023 11:14:04 +0200 Subject: [PATCH 31/89] (fix): `|` -> `Union` for 3.8 --- scanpy/get/_aggregated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index f1e7da98e..f31803360 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -285,7 +285,7 @@ def _df_grouped(df: pd.DataFrame, key: str, key_set: List[str]) -> pd.DataFrame: def aggregated( adata: AnnData, by: str, - func: AggType | List[AggType], + func: Union[AggType, List[AggType]], *, dim: Literal['obs', 'var'] = 'obs', weight_key: Optional[str] = None, @@ -364,7 +364,7 @@ def aggregated( def aggregated_from_array( data, groupby_df: pd.DataFrame, - func: AggType | List[AggType], + func: Union[AggType, List[AggType]], dim: str, by: str, write_to_xxxm: bool, From 231612c2f769e2e1b68ced3a5773a5c77c13b3b6 Mon Sep 17 00:00:00 2001 From: Philipp A Date: Mon, 14 Aug 2023 12:52:15 +0200 Subject: [PATCH 32/89] Fix doc formatting --- scanpy/get/_aggregated.py | 145 +++++++++++++++++--------------------- 1 file changed, 63 insertions(+), 82 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index f31803360..411bced94 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -21,7 +21,7 @@ class Aggregate: - """ + """\ Functionality for generic grouping and aggregating. There is currently support for count, sum, mean, and variance. @@ -44,13 +44,13 @@ class Aggregate: Params ------ - _groupby + groupby `Series` containing values for grouping by. - _data + data Data matrix for aggregation. - _weight + weight Weights to be used for aggregation. - _key_set + key_set Subset of keys to which to filter. """ @@ -97,18 +97,18 @@ def sum(self) -> Array: return utils.asarray(A * self._data) def mean(self) -> Array: - """ + """\ Compute the mean per feature per group of observations. Returns ------- - Array of mean. + Array of mean. """ A, _ = self._sparse_aggregator(normalize=True) return utils.asarray(A * self._data) def count_mean_var(self, dof: int = 1) -> dict: - """ + """\ Compute the count, as well as mean and variance per feature, per group of observations. The formula `Var(X) = E(X^2) - E(X)^2` suffers loss of precision when the variance is a @@ -124,7 +124,7 @@ def count_mean_var(self, dof: int = 1) -> dict: Returns ------- - dict with mean, count, and var keys. + dict with mean, count, and var keys. """ assert dof >= 0 A, _ = self._sparse_aggregator(normalize=True) @@ -238,14 +238,19 @@ def _ndarray_from_seq(lst: Sequence): def _superset_columns(df: pd.DataFrame, groupby_key: str) -> List[str]: - """Find all columns which are a superset of the key column.
- Args: - df (pd.DataFrame): DataFrame which contains candidate columns. - groupby_key (str): Key for column of which to find superset of columns. + Params + ------ + df + DataFrame which contains candidate columns. + groupby_key + Key for column of which to find superset of columns. - Returns: - List[str]: Superset columns. + Returns + ------- + Superset columns. """ columns = [] groupy_key_codes = df[groupby_key].astype('category') @@ -263,15 +268,22 @@ def _superset_columns(df: pd.DataFrame, groupby_key: str) -> List[str]: def _df_grouped(df: pd.DataFrame, key: str, key_set: List[str]) -> pd.DataFrame: - """Generate a grouped-by dataframe (no aggregation) by a key with columns that are supersets of the key column + """\ + Generate a grouped-by dataframe (no aggregation) by + a key with columns that are supersets of the key column. - Args: - df (pd.DataFrame): DataFrame to be grouped. - key (str): Column to be grouped on. - key_set (List[str]): values in the `key` column to keep before groupby. + Params + ------ + df + DataFrame to be grouped. + key + Column to be grouped on. + key_set + Values in the `key` column to keep before groupby. - Returns: - pd.DataFrame: Grouped-by Dataframe. + Returns + ------- + pd.DataFrame: Grouped-by Dataframe. """ df = df.copy() if key_set is not None: @@ -296,38 +308,37 @@ def aggregated( varm: Optional[str] = None, ) -> AnnData: """\ - Aggregate data based on one of the columns of one of the axes (`obs` or `var`). - If none of `layer`, `obsm`, or `varm` are passed in, `X` will be used for aggregation data. - If `func` only has length 1 or is just an `AggType`, then aggregation data is written to `X`. - Otherwise, it is written to `layers` or `xxxm` as appropriate for the dimensions of the aggregation data. - - Parameters - ---------- - adata: - :class:`~anndata.AnnData` to be aggregated. - by: - Key of the column to be grouped-by. - func: - How to aggregate. - dim: - Axis on which to find group by column. - weight_key: - Key of the `dim` containing weights for a weighted sum aggregation. - key_set: - Subset of dim on which to filter. - dof: - Degrees of freedom for variance. Defaults to 1. - layer: - If not None, key for aggregation data. - obsm: - If not None, key for aggregation data. - varm: - If not None, key for aggregation data. + Aggregate data based on one of the columns of one of the axes (`obs` or `var`). + If none of `layer`, `obsm`, or `varm` are passed in, `X` will be used for aggregation data. + If `func` only has length 1 or is just an `AggType`, then aggregation data is written to `X`. + Otherwise, it is written to `layers` or `xxxm` as appropriate for the dimensions of the aggregation data. + + Params + ------ + adata + :class:`~anndata.AnnData` to be aggregated. + by + Key of the column to be grouped-by. + func + How to aggregate. + dim + Axis on which to find group by column. + weight_key + Key of the `dim` containing weights for a weighted sum aggregation. + key_set + Subset of dim on which to filter. + dof + Degrees of freedom for variance. Defaults to 1. + layer + If not None, key for aggregation data. + obsm + If not None, key for aggregation data. + varm + If not None, key for aggregation data. Returns ------- - AnnData: - Aggregated :class:`~anndata.AnnData`. + Aggregated :class:`~anndata.AnnData`. 
""" data = adata.X write_to_xxxm = None @@ -373,37 +384,7 @@ def aggregated_from_array( key_set: Optional[Iterable[str]] = None, dof: int = 1, ) -> AnnData: - """\ - Aggregate data based on one of the columns of one of a `~pd.DataFrame`. - - Parameters - ---------- - data: - Data for aggregation. - groupby_df: - `~pd.DataFrame` with column to be grouped on. - func: - How to aggregate. - dim: - Key of AnnData corresponding to the dim on which the grouped by data belongs. - by: - Key of the groupby `~pd.DataFrame` for grouping. - write_to_xxxm: - Whether or not to write aggregation data to `varm` or `obsm` (based on `dim`). - no_groupby_df: - `~pd.DataFrame` on the opposite dim of dim. - weight_key: - Key of the `dim` containing weights for a weighted sum aggregation. - key_set: - Subset of dim on which to filter. - dof: - Degrees of freedom for variance. Defaults to 1. - - Returns - ------- - AnnData: - Aggregated :class:`~anndata.AnnData`. - """ + """Aggregate data based on one of the columns of one of a `~pd.DataFrame`.""" groupby = Aggregate( groupby=groupby_df[by], data=data, From 99553addaf31819d6258f1f7f69e3b3a7463bda4 Mon Sep 17 00:00:00 2001 From: Philipp A Date: Mon, 14 Aug 2023 13:41:29 +0200 Subject: [PATCH 33/89] more doc format fix --- scanpy/get/_aggregated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 411bced94..1903c73f5 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -79,7 +79,7 @@ def count(self) -> np.ndarray: Returns ------- - Array of counts. + Array of counts. """ _, key_index, _, _ = self._extract_indices() count_ = np.bincount(key_index) @@ -91,7 +91,7 @@ def sum(self) -> Array: Returns ------- - Array of sum. + Array of sum. 
""" A, _ = self._sparse_aggregator(normalize=False) return utils.asarray(A * self._data) From cdbc2288098830a040f9c4712a617db7933b9dfb Mon Sep 17 00:00:00 2001 From: Philipp A Date: Mon, 14 Aug 2023 13:51:51 +0200 Subject: [PATCH 34/89] clearer test data --- scanpy/tests/test_aggregated.py | 73 ++++++++++----------------------- 1 file changed, 21 insertions(+), 52 deletions(-) diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index cf44214fd..dfdbc2c38 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -22,18 +22,11 @@ def test_groupby_different_data_locations(data_key, dim): pytest.skip("invalid parameter combination") ax_base = ["A", "B"] ax_groupby = [ - "v0", - "v1", - "v2", - "w0", - "w1", - "a1", - "a2", - "a3", - "b1", - "b2", - "c1", - "c2", + *["v0", "v1", "v2"], + *["w0", "w1"], + *["a1", "a2", "a3"], + *["b1", "b2"], + *["c1", "c2"], "d0", ] @@ -49,18 +42,11 @@ def test_groupby_different_data_locations(data_key, dim): X = np.array( [ - [0, -2], - [1, 13], - [2, 1], # v - [3, 12], - [4, 2], # w - [5, 11], - [6, 3], - [7, 10], # a - [8, 4], - [9, 9], # b - [10, 5], - [11, 8], # c + *[[0, -2], [1, 13], [2, 1]], # v + *[[3, 12], [4, 2]], # w + *[[5, 11], [6, 3], [7, 10]], # a + *[[8, 4], [9, 9]], # b + *[[10, 5], [11, 8]], # c [12, 6], # d ], dtype=np.float32, @@ -200,26 +186,16 @@ def get_single_agg(adata, key, agg): @pytest.mark.parametrize( 'dim', - [ - 'obs', - 'var', - ], + ['obs', 'var'], ) def test_groupby_X(dim): ax_base = ["A", "B"] ax_groupby = [ - "v0", - "v1", - "v2", - "w0", - "w1", - "a1", - "a2", - "a3", - "b1", - "b2", - "c1", - "c2", + *["v0", "v1", "v2"], + *["w0", "w1"], + *["a1", "a2", "a3"], + *["b1", "b2"], + *["c1", "c2"], "d0", ] @@ -235,18 +211,11 @@ def test_groupby_X(dim): X = np.array( [ - [0, -2], - [1, 13], - [2, 1], # v - [3, 12], - [4, 2], # w - [5, 11], - [6, 3], - [7, 10], # a - [8, 4], - [9, 9], # b - [10, 5], - [11, 8], # c + *[[0, -2], [1, 13], [2, 1]], # v + *[[3, 12], [4, 2]], # w + *[[5, 11], [6, 3], [7, 10]], # a + *[[8, 4], [9, 9]], # b + *[[10, 5], [11, 8]], # c [12, 6], # d ], dtype=np.float32, From 4b71da555800e4be1c7a568586fb1bbee97fe506 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Mon, 14 Aug 2023 16:50:53 +0200 Subject: [PATCH 35/89] Update scanpy/get/_aggregated.py Co-authored-by: Philipp A. 
--- scanpy/get/_aggregated.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 1903c73f5..45ba19db2 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -352,9 +352,8 @@ def aggregated( data = adata.layers[layer] if dim == 'var': data = data.T - elif ( - dim == 'var' - ): # i.e., all of `varm`, `obsm`, `layers` are None so we use `X` which must be transposed + elif dim == 'var': + # i.e., all of `varm`, `obsm`, `layers` are None so we use `X` which must be transposed data = data.T return aggregated( data, From 4aa5e5873150f5b7d51e67ba1a107da702e2b7ee Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 14 Aug 2023 17:39:33 +0200 Subject: [PATCH 36/89] (chore): comment on power for sparse matrices --- scanpy/get/_aggregated.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 45ba19db2..2b2b37f86 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -130,6 +130,7 @@ def count_mean_var(self, dof: int = 1) -> dict: A, _ = self._sparse_aggregator(normalize=True) count_ = np.bincount(self._key_index) mean_ = utils.asarray(A @ self._data) + # sparse matrices do not support ** for elementwise power. mean_sq = utils.asarray(A @ _power(self._data, 2)) if self._weight is None: sq_mean = mean_**2 @@ -222,7 +223,20 @@ def _filter_indices(key_set, keys, key_index, df_index, weight_value=None): return keys, key_index, df_index, weight_value -def _power(X, power): +def _power(X: Array, power: Union[float, int]) -> Array: + """Compute the elementwise power of a matrix. Needed because sparse matrices do not support `**`, so their `.power` method is used instead. + + Parameters + ---------- + X + Matrix to raise to a power. + power + Power to which to raise the matrix. + + Returns + ------- + Matrix raised elementwise to `power`. + """ return X ** power if isinstance(X, np.ndarray) else X.power(power) From 8d8eb1e16dc695815465eeb5af0a635a1be88d84 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 14 Aug 2023 17:40:07 +0200 Subject: [PATCH 37/89] (chore): add `TODO` for old code --- scanpy/get/_aggregated.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 2b2b37f86..69172a822 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -144,6 +144,7 @@ def count_mean_var(self, dof: int = 1) -> dict: mean_unweighted = utils.asarray(A_unweighted * self._data) sq_mean = 2 * mean_ * mean_unweighted + mean_unweighted**2 var_ = mean_sq - sq_mean + # TODO: Why these values exaclty? Because they are high relative to the datatype? (unchanged from original code: https://github.com/scverse/anndata/pull/564) precision = 2 << (42 if self._data.dtype == np.float64 else 20) # detects loss of precision in mean_sq - sq_mean, which suggests variance is 0 var_[precision * var_ < sq_mean] = 0 @@ -178,6 +179,7 @@ def _sparse_aggregator( df_index = np.arange(len(key_index)) if self._weight is None: weight_value = np.ones(len(key_index)) + # TODO: why a coo matrix here and a dia matrix below?
(unchanged from original code: https://github.com/scverse/anndata/pull/564) A = coo_matrix( (weight_value, (key_index, df_index)), shape=(len(keys), self._data.shape[0]), From 34b3f21ce787cc5354976e0f7c53143907b01e63 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 14 Aug 2023 17:40:55 +0200 Subject: [PATCH 38/89] (style): refactor `_extract_indices` + `_filter_indices` with docstrings added --- scanpy/get/_aggregated.py | 75 ++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 20 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 69172a822..685be2383 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -191,25 +191,60 @@ def _sparse_aggregator( A = D * A return A, keys - def _extract_indices(self): - def _filter_indices(key_set, keys, key_index, df_index, weight_value=None): - keep = [i for i, k in enumerate(keys) if k in set(key_set)] - if len(keep) == 0: - raise ValueError("No keys in key_set found in adata.obs[key].") - elif len(keep) < len(keys): - mask = np.in1d(key_index, keep) - remap = np.zeros(len(keys), dtype=np.int64) - for i, j in enumerate(keep): - remap[j] = i - keys = [keys[j] for j in keep] - key_index = np.array( - [remap[i] for i in key_index[mask]], dtype=np.int64 - ) - df_index = df_index[mask] - if weight_value is not None: - weight_value = weight_value[mask] - return keys, key_index, df_index, weight_value + def _filter_indices( + self, + keys: np.ndarray, + key_index: np.ndarray, + df_index: np.ndarray, + weight_value: Optional[Union[pd.Series, Array]] = None, + ) -> Tuple[np.ndarray, np.ndarray, Union[pd.Series, Array, None]]: + """Filter the values of keys, key_index, df_index, and optionally weight_value based on self._key_set. + + Parameters + ---------- + keys + Unique key values to be filtered. + key_index + Non-unique integer indices mapping each entry of df_index to its key, to be filtered. + df_index + Integer positions into the data described by keys and key_index, to be filtered. + weight_value, optional + Weight values to be filtered; None by default. + + Returns + ------- + Filtered versions of all arguments. + + Raises + ------ + ValueError + If none of the keys in self._key_set are found in keys. + """ + keep = [i for i, k in enumerate(keys) if k in set(self._key_set)] + if len(keep) == 0: + raise ValueError("No keys in key_set found in keys.") + elif len(keep) < len(keys): + mask = np.in1d(key_index, keep) + remap = np.zeros(len(keys), dtype=np.int64) + for i, j in enumerate(keep): + remap[j] = i + keys = [keys[j] for j in keep] + key_index = np.array([remap[i] for i in key_index[mask]], dtype=np.int64) + df_index = df_index[mask] + if weight_value is not None: + weight_value = weight_value[mask] + return keys, key_index, df_index, weight_value + + def _extract_indices( + self, + ) -> Tuple[np.ndarray, np.ndarray, Union[pd.Series, Array, None]]: + """Extract indices from self._groupby with the goal of building a matrix that can be multiplied with the data to produce an aggregation statistic, e.g., mean or variance. These are filtered if self._key_set is present. Returns ------- The unique keys, an integer array mapping each observation to its key, the observation index itself, and the weights if present.
+ """ key_value = self._groupby keys, key_index = np.unique(_ndarray_from_seq(key_value), return_inverse=True) df_index = np.arange(len(key_index)) @@ -218,8 +253,8 @@ def _filter_indices(key_set, keys, key_index, df_index, weight_value=None): else: weight_value = self._weight.values[df_index] if self._key_set is not None: - keys, key_index, df_index, weight_value = _filter_indices( - self._key_set, keys, key_index, df_index, weight_value + keys, key_index, df_index, weight_value = self._filter_indices( + keys, key_index, df_index, weight_value ) self._key_index = key_index # passed to count and count_mean_var to avoid re-extracting in the latter return keys, key_index, df_index, weight_value From b6dc02654fdb484904b2e5ca489de501428732f2 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 14 Aug 2023 17:41:29 +0200 Subject: [PATCH 39/89] (style): use generator for `all` argument --- scanpy/get/_aggregated.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 685be2383..8d5895b9e 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -309,10 +309,8 @@ def _superset_columns(df: pd.DataFrame, groupby_key: str) -> List[str]: if key != groupby_key: key_codes = df[key].astype('category') if all( - [ - key_codes[groupy_key_codes == group_key_code].nunique() == 1 - for group_key_code in groupy_key_codes - ] + key_codes[groupy_key_codes == group_key_code].nunique() == 1 + for group_key_code in groupy_key_codes ): columns += [key] return columns From d1cc61bb0e325d46fa2fbb28f2a0145bf238f62e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 14 Aug 2023 17:43:45 +0200 Subject: [PATCH 40/89] (fix): use `Iterable` for `func` argument instead of `List` --- scanpy/get/_aggregated.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 8d5895b9e..aab64b6f9 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -8,6 +8,7 @@ Union, Literal, List, + get_args, ) from anndata import AnnData, utils @@ -346,7 +347,7 @@ def _df_grouped(df: pd.DataFrame, key: str, key_set: List[str]) -> pd.DataFrame: def aggregated( adata: AnnData, by: str, - func: Union[AggType, List[AggType]], + func: Union[AggType, Iterable[AggType]], *, dim: Literal['obs', 'var'] = 'obs', weight_key: Optional[str] = None, @@ -423,7 +424,7 @@ def aggregated( def aggregated_from_array( data, groupby_df: pd.DataFrame, - func: Union[AggType, List[AggType]], + func: Union[AggType, Iterable[AggType]], dim: str, by: str, write_to_xxxm: bool, @@ -446,38 +447,37 @@ def aggregated_from_array( 'X': None, 'obsm': {}, } - func_set = func write_key = 'obsm' if write_to_xxxm else 'layers' - if not isinstance(func, list): - func_set = [func] - func_set = set(func_set) - if 'sum' in func_set: # sum is calculated separately from the rest + funcs = set([func] if isinstance(func, str) else func) + if unknown := funcs - set(get_args(AggType)): + raise ValueError(f'… {unknown} …') + if 'sum' in funcs: # sum is calculated separately from the rest agg = groupby.sum() if ( - len(func_set) == 1 and not write_to_xxxm + len(funcs) == 1 and not write_to_xxxm ): # put aggregation in X if it is the only one and the aggregation data is not coming from `xxxm` data_dict['X'] = agg else: data_dict[write_key]['sum'] = agg if ( - 'mean' in func_set and 'var' not in func_set + 'mean' in funcs and 'var' not in funcs ): # here and below for count, if var is 
present, these can be calculated alongside var agg = groupby.mean() - if len(func_set) == 1 and not write_to_xxxm: + if len(funcs) == 1 and not write_to_xxxm: data_dict['X'] = agg else: data_dict[write_key]['mean'] = agg - if 'count' in func_set and 'var' not in func_set: + if 'count' in funcs and 'var' not in funcs: obs_var_dict['obs']['count'] = groupby.count() # count goes in dim df - if 'var' in func_set: + if 'var' in funcs: agg = groupby.count_mean_var(dof) - if len(func_set) == 1 and not write_to_xxxm: + if len(funcs) == 1 and not write_to_xxxm: data_dict['X'] = agg['var'] else: data_dict[write_key]['var'] = agg['var'] - if 'mean' in func_set: + if 'mean' in funcs: data_dict[write_key]['mean'] = agg['mean'] - if 'count' in func_set: + if 'count' in funcs: obs_var_dict['obs']['count'] = agg['count'] adata_agg = AnnData(**{**data_dict, **obs_var_dict}) if dim == 'var': From 57e17ee319291f146b456d84961362aed6e906ba Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 14 Aug 2023 17:44:21 +0200 Subject: [PATCH 41/89] (feat): check `layer` `obsm` `varm` combination --- scanpy/get/_aggregated.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index aab64b6f9..00ddb481b 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -392,13 +392,19 @@ def aggregated( """ data = adata.X write_to_xxxm = None - if varm is not None: + is_varm_none = varm is None + is_obsm_none = obsm is None + is_layer_none = layer is None + assert ( + sum([is_varm_none, is_obsm_none, is_layer_none]) > 1 + ), "Please only provide one (or none) of varm, obsm, or layer" + if not is_varm_none: data = adata.varm[varm] write_to_xxxm = True # the data will have to be transposed so this is accurate - elif obsm is not None: + elif not is_obsm_none: data = adata.obsm[obsm] write_to_xxxm = True - elif layer is not None: + elif not is_layer_none: data = adata.layers[layer] if dim == 'var': data = data.T From a2c08ab1878b961eba8010748e7d5bc552fbf6bc Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 14 Aug 2023 17:51:17 +0200 Subject: [PATCH 42/89] (style): refactor to use fixtures for backing data --- scanpy/tests/test_aggregated.py | 90 +++++++++++++-------------------- 1 file changed, 34 insertions(+), 56 deletions(-) diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index dfdbc2c38..ba156b233 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -6,21 +6,14 @@ import pytest +@pytest.fixture +def df_base(): ax_base = ["A", "B"] + return pd.DataFrame(index=ax_base) + + +@pytest.fixture +def df_groupby(): ax_groupby = [ *["v0", "v1", "v2"], *["w0", "w1"], *["a1", "a2", "a3"], *["b1", "b2"], *["c1", "c2"], "d0", ] df_groupby = pd.DataFrame(index=pd.Index(ax_groupby, name="cell")) df_groupby["key"] = pd.Categorical([c[0] for c in ax_groupby]) df_groupby["key_superset"] = pd.Categorical([c[0] for c in ax_groupby]).map( {'v': 'v', 'w': 'v', 'a': 'a', 'b': 'a', 'c': 'a', 'd': 'a'} ) df_groupby["key_subset"] = pd.Categorical([c[1] for c in ax_groupby]) df_groupby["weight"] = 2.0 + return df_groupby + + +@pytest.fixture +def X(): + return np.array( [ *[[0, -2], [1, 13], [2, 1]], # v *[[3, 12], [4, 2]], # w *[[5, 11], [6, 3], [7, 10]], # a *[[8, 4], [9, 9]], # b *[[10, 5], [11, 8]], # c [12, 6], # d ], dtype=np.float32, )
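[Review note: the fixtures introduced in PATCH 42 above encode the group in the first character of each observation name (v, w, a, b, c, d), and the tests check the aggregated AnnData against a plain pandas groupby. A self-contained sketch of that baseline, reusing the same toy data — illustrative only, not part of the patch series:]

import numpy as np
import pandas as pd

ax_groupby = [
    *["v0", "v1", "v2"], *["w0", "w1"],
    *["a1", "a2", "a3"], *["b1", "b2"],
    *["c1", "c2"], "d0",
]
X = np.array(
    [
        *[[0, -2], [1, 13], [2, 1]],  # v
        *[[3, 12], [4, 2]],           # w
        *[[5, 11], [6, 3], [7, 10]],  # a
        *[[8, 4], [9, 9]],            # b
        *[[10, 5], [11, 8]],          # c
        [12, 6],                      # d
    ],
    dtype=np.float32,
)
df = pd.DataFrame(X, index=ax_groupby, columns=["A", "B"])
df["key"] = [name[0] for name in ax_groupby]  # first character is the group key
# The reference statistics the aggregated AnnData is compared against.
print(df.groupby("key").agg(["count", "mean", "var"]))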
+@pytest.mark.parametrize( + 'data_key', + ['layers', 'obsm', 'varm'], +) +@pytest.mark.parametrize( + 'dim', + [ + 'obs', + 'var', + ], +) +def test_groupby_different_data_locations(data_key, dim, df_base, df_groupby, X): + if (data_key == 'varm' and dim == 'obs') or (data_key == 'obsm' and dim == 'var'): + pytest.skip("invalid parameter combination") if dim == 'obs': data_sparse_mat_dict = {data_key: {'test': csr_matrix(X)}} adata_sparse = ad.AnnData( @@ -63,12 +73,12 @@ def test_groupby_different_data_locations(data_key, dim): ) else: if data_key != 'varm': - data_dense = X.T - data_sparse_mat_dict = {data_key: {'test': csr_matrix(data_dense)}} + X = X.T + data_sparse_mat_dict = {data_key: {'test': csr_matrix(X)}} adata_sparse = ad.AnnData( **{'obs': df_base, 'var': df_groupby, **data_sparse_mat_dict} ) - data_dense_mat_dict = {data_key: {'test': data_dense}} + data_dense_mat_dict = {data_key: {'test': X}} adata_dense = ad.AnnData( **{'obs': df_base, 'var': df_groupby, **data_dense_mat_dict} ) @@ -162,7 +172,7 @@ def get_single_agg(adata, key, agg): df = pd.DataFrame( index=getattr(adata_dense, dim)["key"], columns=getattr(adata_dense, f"{'var' if dim == 'obs' else 'obs'}_names"), - data=data_dense.T if dim == 'var' and data_key != 'varm' else data_dense, + data=X.T if dim == 'var' and data_key != 'varm' else X, ) grouped_agg_df = ( df.groupby('key') @@ -188,44 +198,12 @@ def get_single_agg(adata, key, agg): 'dim', ['obs', 'var'], ) -def test_groupby_X(dim): - ax_base = ["A", "B"] - ax_groupby = [ - *["v0", "v1", "v2"], - *["w0", "w1"], - *["a1", "a2", "a3"], - *["b1", "b2"], - *["c1", "c2"], - "d0", - ] - - df_groupby = pd.DataFrame(index=pd.Index(ax_groupby, name="cell")) - df_groupby["key"] = pd.Categorical([c[0] for c in ax_groupby]) - df_groupby["key_superset"] = pd.Categorical([c[0] for c in ax_groupby]).map( - {'v': 'v', 'w': 'v', 'a': 'a', 'b': 'a', 'c': 'a', 'd': 'a'} - ) - df_groupby["key_subset"] = pd.Categorical([c[1] for c in ax_groupby]) - df_groupby["weight"] = 2.0 - - df_base = pd.DataFrame(index=ax_base) - - X = np.array( - [ - *[[0, -2], [1, 13], [2, 1]], # v - *[[3, 12], [4, 2]], # w - *[[5, 11], [6, 3], [7, 10]], # a - *[[8, 4], [9, 9]], # b - *[[10, 5], [11, 8]], # c - [12, 6], # d - ], - dtype=np.float32, - ) - data_dense = X +def test_groupby_X(dim, df_base, df_groupby, X): if dim == 'obs': adata_sparse = ad.AnnData(obs=df_groupby, var=df_base, X=csr_matrix(X)) adata_dense = ad.AnnData(obs=df_groupby, var=df_base, X=X) else: - adata_sparse = ad.AnnData(obs=df_base, var=df_groupby, X=data_dense.T) + adata_sparse = ad.AnnData(obs=df_base, var=df_groupby, X=X.T) adata_dense = ad.AnnData(obs=df_base, var=df_groupby, X=csr_matrix(X).T) stats_sparse = sc.get.aggregated( @@ -295,7 +273,7 @@ def test_groupby_X(dim): df = pd.DataFrame( index=getattr(adata_dense, dim)["key"], columns=getattr(adata_dense, f"{'var' if dim == 'obs' else 'obs'}_names"), - data=data_dense, + data=X, ) grouped_agg_df = ( df.groupby('key') From c5e5b9413ce5305f32ed09bf1c3b87442d134302 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 14 Aug 2023 17:53:30 +0200 Subject: [PATCH 43/89] (style): don't use spreading in `AnnData` constructor --- scanpy/tests/test_aggregated.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index ba156b233..ab3b7018b 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -64,24 +64,16 @@ def 
test_groupby_different_data_locations(data_key, dim, df_base, df_groupby, X) pytest.skip("invalid parameter combination") if dim == 'obs': data_sparse_mat_dict = {data_key: {'test': csr_matrix(X)}} - adata_sparse = ad.AnnData( - **{'obs': df_groupby, 'var': df_base, **data_sparse_mat_dict} - ) + adata_sparse = ad.AnnData(obs=df_groupby, var=df_base, **data_sparse_mat_dict) data_dense_mat_dict = {data_key: {'test': X}} - adata_dense = ad.AnnData( - **{'obs': df_groupby, 'var': df_base, **data_dense_mat_dict} - ) + adata_dense = ad.AnnData(obs=df_groupby, var=df_base, **data_dense_mat_dict) else: if data_key != 'varm': X = X.T data_sparse_mat_dict = {data_key: {'test': csr_matrix(X)}} - adata_sparse = ad.AnnData( - **{'obs': df_base, 'var': df_groupby, **data_sparse_mat_dict} - ) + adata_sparse = ad.AnnData(obs=df_base, var=df_groupby, **data_sparse_mat_dict) data_dense_mat_dict = {data_key: {'test': X}} - adata_dense = ad.AnnData( - **{'obs': df_base, 'var': df_groupby, **data_dense_mat_dict} - ) + adata_dense = ad.AnnData(obs=df_base, var=df_groupby, **data_dense_mat_dict) data_dict = {(data_key if data_key != 'layers' else 'layer'): 'test'} stats_sparse = sc.get.aggregated( From 8d78e36d5ec54ff8e4d8c2a11cc424f19a8d742f Mon Sep 17 00:00:00 2001 From: Philipp A Date: Fri, 18 Aug 2023 08:47:45 +0200 Subject: [PATCH 44/89] typo --- scanpy/get/_aggregated.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 00ddb481b..13c63148e 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -145,7 +145,8 @@ def count_mean_var(self, dof: int = 1) -> dict: mean_unweighted = utils.asarray(A_unweighted * self._data) sq_mean = 2 * mean_ * mean_unweighted + mean_unweighted**2 var_ = mean_sq - sq_mean - # TODO: Why these values exaclty? Because they are high relative to the datatype? (unchanged from original code: https://github.com/scverse/anndata/pull/564) + # TODO: Why these values exactly? Because they are high relative to the datatype? 
+ # (unchanged from original code: https://github.com/scverse/anndata/pull/564) precision = 2 << (42 if self._data.dtype == np.float64 else 20) # detects loss of precision in mean_sq - sq_mean, which suggests variance is 0 var_[precision * var_ < sq_mean] = 0 From d70d0864204aab2f872dbb66ebb5221b50eec3e1 Mon Sep 17 00:00:00 2001 From: Philipp A Date: Fri, 18 Aug 2023 08:52:47 +0200 Subject: [PATCH 45/89] simplify --- scanpy/get/_aggregated.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 13c63148e..ec6dadf33 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -393,19 +393,15 @@ def aggregated( """ data = adata.X write_to_xxxm = None - is_varm_none = varm is None - is_obsm_none = obsm is None - is_layer_none = layer is None - assert ( - sum([is_varm_none, is_obsm_none, is_layer_none]) > 1 - ), "Please only provide one (or none) of varm, obsm, or layer" - if not is_varm_none: + if sum([varm is None, obsm is None, layer is None]) > 1: + raise TypeError("Please only provide one (or none) of varm, obsm, or layer") + if not varm is None: data = adata.varm[varm] write_to_xxxm = True # the data will have to be transposed so this is accurate - elif not is_obsm_none: + elif not obsm is None: data = adata.obsm[obsm] write_to_xxxm = True - elif not is_layer_none: + elif not layer is None: data = adata.layers[layer] if dim == 'var': data = data.T From 6bac717d22f9534dab432099e59af72ef44f4004 Mon Sep 17 00:00:00 2001 From: Philipp A Date: Fri, 18 Aug 2023 09:04:02 +0200 Subject: [PATCH 46/89] condense --- scanpy/tests/test_aggregated.py | 38 +++++++++++---------------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index ab3b7018b..c5a623035 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -35,30 +35,19 @@ def df_groupby(): @pytest.fixture def X(): - return np.array( - [ - *[[0, -2], [1, 13], [2, 1]], # v - *[[3, 12], [4, 2]], # w - *[[5, 11], [6, 3], [7, 10]], # a - *[[8, 4], [9, 9]], # b - *[[10, 5], [11, 8]], # c - [12, 6], # d - ], - dtype=np.float32, - ) + data = [ + *[[0, -2], [1, 13], [2, 1]], # v + *[[3, 12], [4, 2]], # w + *[[5, 11], [6, 3], [7, 10]], # a + *[[8, 4], [9, 9]], # b + *[[10, 5], [11, 8]], # c + [12, 6], # d + ] + return np.array(data, dtype=np.float32) -@pytest.mark.parametrize( - 'data_key', - ['layers', 'obsm', 'varm'], -) -@pytest.mark.parametrize( - 'dim', - [ - 'obs', - 'var', - ], -) +@pytest.mark.parametrize('data_key', ['layers', 'obsm', 'varm']) +@pytest.mark.parametrize('dim', ['obs', 'var']) def test_groupby_different_data_locations(data_key, dim, df_base, df_groupby, X): if (data_key == 'varm' and dim == 'obs') or (data_key == 'obsm' and dim == 'var'): pytest.skip("invalid parameter combination") @@ -186,10 +175,7 @@ def get_single_agg(adata, key, agg): ) # returns for both columns but counts only needs one because it is the same -@pytest.mark.parametrize( - 'dim', - ['obs', 'var'], -) +@pytest.mark.parametrize('dim', ['obs', 'var']) def test_groupby_X(dim, df_base, df_groupby, X): if dim == 'obs': adata_sparse = ad.AnnData(obs=df_groupby, var=df_base, X=csr_matrix(X)) From 99a3f2eb77c0f3fe444d17821cd61b8a8ac54733 Mon Sep 17 00:00:00 2001 From: Philipp A Date: Fri, 18 Aug 2023 09:05:45 +0200 Subject: [PATCH 47/89] style --- scanpy/get/_aggregated.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git 
a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index ec6dadf33..d80e66ce2 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -456,15 +456,13 @@ def aggregated_from_array( raise ValueError(f'… {unknown} …') if 'sum' in funcs: # sum is calculated separately from the rest agg = groupby.sum() - if ( - len(funcs) == 1 and not write_to_xxxm - ): # put aggregation in X if it is the only one and the aggregation data is not coming from `xxxm` + # put aggregation in X if it is the only one and the aggregation data is not coming from `xxxm` + if len(funcs) == 1 and not write_to_xxxm: data_dict['X'] = agg else: data_dict[write_key]['sum'] = agg - if ( - 'mean' in funcs and 'var' not in funcs - ): # here and below for count, if var is present, these can be calculated alongside var + # here and below for count, if var is present, these can be calculated alongside var + if 'mean' in funcs and 'var' not in funcs: agg = groupby.mean() if len(funcs) == 1 and not write_to_xxxm: data_dict['X'] = agg From f89c4d3b772fbf6456fbe8d8347f9794f77a5d9c Mon Sep 17 00:00:00 2001 From: Philipp A Date: Fri, 18 Aug 2023 09:37:46 +0200 Subject: [PATCH 48/89] minor deduplication --- scanpy/tests/test_aggregated.py | 76 +++++++++++++-------------- 1 file changed, 30 insertions(+), 46 deletions(-) diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index c5a623035..1f3d6a493 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -2,7 +2,7 @@ import scanpy as sc import numpy as np import pandas as pd -from scipy.sparse import csr_matrix +from scipy.sparse import csr_matrix, csc_matrix import pytest @@ -51,33 +51,22 @@ def X(): def test_groupby_different_data_locations(data_key, dim, df_base, df_groupby, X): if (data_key == 'varm' and dim == 'obs') or (data_key == 'obsm' and dim == 'var'): pytest.skip("invalid parameter combination") - if dim == 'obs': - data_sparse_mat_dict = {data_key: {'test': csr_matrix(X)}} - adata_sparse = ad.AnnData(obs=df_groupby, var=df_base, **data_sparse_mat_dict) - data_dense_mat_dict = {data_key: {'test': X}} - adata_dense = ad.AnnData(obs=df_groupby, var=df_base, **data_dense_mat_dict) - else: - if data_key != 'varm': - X = X.T - data_sparse_mat_dict = {data_key: {'test': csr_matrix(X)}} - adata_sparse = ad.AnnData(obs=df_base, var=df_groupby, **data_sparse_mat_dict) - data_dense_mat_dict = {data_key: {'test': X}} - adata_dense = ad.AnnData(obs=df_base, var=df_groupby, **data_dense_mat_dict) + + obs_df, var_df = (df_groupby, df_base) if dim == 'obs' else (df_base, df_groupby) + data = X.T if dim == 'var' and data_key != 'varm' else X + adata_sparse = ad.AnnData(obs=obs_df, var=var_df, **{data_key: {'test': csr_matrix(data)}}) + adata_dense = ad.AnnData(obs=obs_df, var=var_df, **{data_key: {'test': data}}) data_dict = {(data_key if data_key != 'layers' else 'layer'): 'test'} - stats_sparse = sc.get.aggregated( - adata_sparse, - by="key", - dim=dim, - func=['count', 'mean', 'var'], - **data_dict, - ) - stats_dense = sc.get.aggregated( - adata_dense, - by="key", - dim=dim, - func=['count', 'mean', 'var'], - **data_dict, + stats_sparse, stats_dense = ( + sc.get.aggregated( + adata, + by="key", + dim=dim, + func=['count', 'mean', 'var'], + **data_dict, + ) + for adata in [adata_sparse, adata_dense] ) # superset columns can be kept but not subsets @@ -86,7 +75,7 @@ def test_groupby_different_data_locations(data_key, dim, df_base, df_groupby, X) assert np.allclose( getattr(stats_sparse, dim)['count'], -
getattr(stats_sparse, dim)['count'], + getattr(stats_dense, dim)['count'], ) assert np.allclose( getattr(stats_sparse, data_key)['mean'], getattr(stats_dense, data_key)['mean'] @@ -142,7 +142,7 @@ def get_single_agg(adata, key, agg): df = pd.DataFrame( index=getattr(adata_dense, dim)["key"], columns=getattr(adata_dense, f"{'var' if dim == 'obs' else 'obs'}_names"), - data=X.T if dim == 'var' and data_key != 'varm' else X, + data=X, ) grouped_agg_df = ( df.groupby('key') @@ -177,24 +166,19 @@ def get_single_agg(adata, key, agg): @pytest.mark.parametrize('dim', ['obs', 'var']) def test_groupby_X(dim, df_base, df_groupby, X): - if dim == 'obs': - adata_sparse = ad.AnnData(obs=df_groupby, var=df_base, X=csr_matrix(X)) - adata_dense = ad.AnnData(obs=df_groupby, var=df_base, X=X) - else: - adata_sparse = ad.AnnData(obs=df_base, var=df_groupby, X=X.T) - adata_dense = ad.AnnData(obs=df_base, var=df_groupby, X=csr_matrix(X).T) + obs_df, var_df = (df_groupby, df_base) if dim == 'obs' else (df_base, df_groupby) + data = X if dim == 'obs' else X.T + adata_sparse = ad.AnnData(obs=obs_df, var=var_df, X=csc_matrix(data)) + adata_dense = ad.AnnData(obs=obs_df, var=var_df, X=data) - stats_sparse = sc.get.aggregated( - adata_sparse, - by="key", - dim=dim, - func=['count', 'mean', 'var'], - ) - stats_dense = sc.get.aggregated( - adata_dense, - by="key", - dim=dim, - func=['count', 'mean', 'var'], + stats_sparse, stats_dense = ( + sc.get.aggregated( + adata, + by="key", + dim=dim, + func=['count', 'mean', 'var'], + ) + for adata in [adata_sparse, adata_dense] ) # superset columns can be kept but not subsets @@ -203,7 +187,7 @@ def test_groupby_X(dim, df_base, df_groupby, X): assert np.allclose( getattr(stats_sparse, dim)['count'], - getattr(stats_sparse, dim)['count'], + getattr(stats_dense, dim)['count'], ) assert np.allclose(stats_sparse.layers['mean'], stats_dense.layers['mean']) assert np.allclose( stats_sparse.layers['var'], stats_dense.layers['var'], equal_nan=True ) From 73aa963a6017d8de371feaf6622bf3b53e4e64fb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 18 Aug 2023 07:42:19 +0000 Subject: [PATCH 49/89] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scanpy/get/_aggregated.py | 10 +++++----- scanpy/tests/test_aggregated.py | 4 +++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index d80e66ce2..f4028126b 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -395,13 +395,13 @@ def aggregated( write_to_xxxm = None if sum([varm is None, obsm is None, layer is None]) > 1: raise TypeError("Please only provide one (or none) of varm, obsm, or layer") - if not varm is None: + if varm is not None: data = adata.varm[varm] write_to_xxxm = True # the data will have to be transposed so this is accurate - elif not obsm is None: + elif obsm is not None: data = adata.obsm[obsm] write_to_xxxm = True - elif not layer is None: + elif layer is not None: data = adata.layers[layer] if dim == 'var': data = data.T @@ -457,12 +457,12 @@ def aggregated_from_array( if 'sum' in funcs: # sum is calculated separately from the rest agg = groupby.sum() # put aggregation in X if it is the only one and the aggregation data is not coming from `xxxm` - if len(funcs) == 1 and not write_to_xxxm: + if len(funcs) == 1 and not write_to_xxxm: data_dict['X'] = agg else: data_dict[write_key]['sum'] = agg # here and below for count, if var is present, these can be calculated alongside var - if 'mean'
in funcs and 'var' not in funcs: + if 'mean' in funcs and 'var' not in funcs: agg = groupby.mean() if len(funcs) == 1 and not write_to_xxxm: data_dict['X'] = agg diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index 1f3d6a493..cc30a020d 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -54,7 +54,9 @@ def test_groupby_different_data_locations(data_key, dim, df_base, df_groupby, X) obs_df, var_df = (df_groupby, df_base) if dim == 'obs' else (df_base, df_groupby) data = X.T if dim == 'var' and data_key != 'varm' else X - adata_sparse = ad.AnnData(obs=obs_df, var=var_df, **{data_key: {'test': csr_matrix(data)}}) + adata_sparse = ad.AnnData( + obs=obs_df, var=var_df, **{data_key: {'test': csr_matrix(data)}} + ) adata_dense = ad.AnnData(obs=obs_df, var=var_df, **{data_key: {'test': data}}) data_dict = {(data_key if data_key != 'layers' else 'layer'): 'test'} From 88521f4ac78a37f430d3655487cd0a63948d1bd4 Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Mon, 21 Aug 2023 13:20:41 +0200 Subject: [PATCH 50/89] fix check --- scanpy/get/_aggregated.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index f4028126b..5b9ebd4f5 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -393,7 +393,7 @@ def aggregated( """ data = adata.X write_to_xxxm = None - if sum([varm is None, obsm is None, layer is None]) > 1: + if sum([varm is None, obsm is None, layer is None]) < 2: raise TypeError("Please only provide one (or none) of varm, obsm, or layer") if varm is not None: data = adata.varm[varm] From 06ecfff2fd9d8be995a44f8981405f8327fa406e Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Mon, 21 Aug 2023 13:27:51 +0200 Subject: [PATCH 51/89] simpler --- scanpy/get/_aggregated.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 5b9ebd4f5..1c4c85bab 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -393,7 +393,7 @@ def aggregated( """ data = adata.X write_to_xxxm = None - if sum([varm is None, obsm is None, layer is None]) < 2: + if sum(p is not None for p in [varm, obsm, layer]) > 1: raise TypeError("Please only provide one (or none) of varm, obsm, or layer") if varm is not None: data = adata.varm[varm] From 4ae27a6e1042933464006730f862e3c704d5c18b Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 22 Aug 2023 15:41:57 +0200 Subject: [PATCH 52/89] (chore): dedup tests --- scanpy/tests/test_aggregated.py | 166 +++++++++----------------------- 1 file changed, 48 insertions(+), 118 deletions(-) diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index cc30a020d..b4e634a37 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -46,27 +46,44 @@ def X(): return np.array(data, dtype=np.float32) -@pytest.mark.parametrize('data_key', ['layers', 'obsm', 'varm']) -@pytest.mark.parametrize('dim', ['obs', 'var']) -def test_groupby_different_data_locations(data_key, dim, df_base, df_groupby, X): +def gen_adata(data_key, dim, df_base, df_groupby, X): if (data_key == 'varm' and dim == 'obs') or (data_key == 'obsm' and dim == 'var'): pytest.skip("invalid parameter combination") obs_df, var_df = (df_groupby, df_base) if dim == 'obs' else (df_base, df_groupby) data = X.T if dim == 'var' and data_key != 'varm' else X - adata_sparse = ad.AnnData( - obs=obs_df, var=var_df, **{data_key: {'test': csr_matrix(data)}} + if 
data_key != 'X': + data_dict_sparse = {data_key: {'test': csr_matrix(data)}} + data_dict_dense = {data_key: {'test': data}} + else: + data_dict_sparse = {data_key: csr_matrix(data)} + data_dict_dense = {data_key: data} + + adata_sparse = ad.AnnData(obs=obs_df, var=var_df, **data_dict_sparse) + adata_dense = ad.AnnData(obs=obs_df, var=var_df, **data_dict_dense) + return adata_sparse, adata_dense + + +@pytest.mark.parametrize('data_key', ['layers', 'obsm', 'varm', 'X']) +@pytest.mark.parametrize('dim', ['obs', 'var']) +def test_groupby(data_key, dim, df_base, df_groupby, X): + adata_sparse, adata_dense = gen_adata(data_key, dim, df_base, df_groupby, X) + + data_loc_dict = ( + {(data_key if data_key != 'layers' else 'layer'): 'test'} + if data_key != 'X' + else {} ) - adata_dense = ad.AnnData(obs=obs_df, var=var_df, **{data_key: {'test': data}}) + # When `X` is not the `data_key`, the multi-aggregation data is colocated with the `data_key`. Otherwise it is in `layers`. + multi_agg_data_loc_key = data_key if data_key != 'X' else 'layers' - data_dict = {(data_key if data_key != 'layers' else 'layer'): 'test'} stats_sparse, stats_dense = ( sc.get.aggregated( adata, by="key", dim=dim, func=['count', 'mean', 'var'], - **data_dict, + **data_loc_dict, ) for adata in [adata_sparse, adata_dense] ) @@ -80,11 +97,12 @@ def test_groupby_different_data_locations(data_key, dim, df_base, df_groupby, X) getattr(stats_dense, dim)['count'], ) assert np.allclose( - getattr(stats_sparse, data_key)['mean'], getattr(stats_dense, data_key)['mean'] + getattr(stats_sparse, multi_agg_data_loc_key)['mean'], + getattr(stats_dense, multi_agg_data_loc_key)['mean'], ) assert np.allclose( - getattr(stats_sparse, data_key)['var'], - getattr(stats_dense, data_key)['var'], + getattr(stats_sparse, multi_agg_data_loc_key)['var'], + getattr(stats_dense, multi_agg_data_loc_key)['var'], equal_nan=True, ) @@ -94,33 +112,37 @@ def test_groupby_different_data_locations(data_key, dim, df_base, df_groupby, X) dim=dim, func=['count', 'mean', 'var'], weight_key="weight", - **data_dict, + **data_loc_dict, + ) + sum_ = sc.get.aggregated( + adata_sparse, by="key", dim=dim, func='sum', **data_loc_dict ) - sum_ = sc.get.aggregated(adata_sparse, by="key", dim=dim, func='sum', **data_dict) sum_weight = sc.get.aggregated( adata_dense, by="key", dim=dim, func='sum', weight_key="weight", - **data_dict, + **data_loc_dict, ) def get_single_agg(adata, key, agg): - if key == 'obsm' or key == 'varm': - return getattr(adata, key)[agg] - return adata.X + # Get the data of the aggregation from the correct location when only one `func` is passed in to `aggregated` + if (key != 'obsm' and key != 'varm') or data_key == 'X': + return adata.X + return getattr(adata, key)[agg] assert np.allclose( 2 * get_single_agg(sum_, data_key, 'sum'), get_single_agg(sum_weight, data_key, 'sum'), ) assert np.allclose( - getattr(stats_sparse, data_key)['mean'], getattr(stats_weight, data_key)['mean'] + getattr(stats_sparse, multi_agg_data_loc_key)['mean'], + getattr(stats_weight, multi_agg_data_loc_key)['mean'], ) assert np.allclose( - getattr(stats_sparse, data_key)['var'], - getattr(stats_dense, data_key)['var'], + getattr(stats_sparse, multi_agg_data_loc_key)['var'], + getattr(stats_dense, multi_agg_data_loc_key)['var'], equal_nan=True, ) @@ -131,107 +153,15 @@ def get_single_agg(adata, key, agg): dim=dim, func='mean', key_set=key_set, - **data_dict, + **data_loc_dict, ) subset_idx = getattr(stats_sparse, dim).index.isin(key_set) subset_adata = ( stats_sparse[subset_idx, :] 
if dim == 'obs' else stats_sparse[:, subset_idx] ) - subset_mean = getattr(subset_adata, data_key)['mean'] + subset_mean = getattr(subset_adata, multi_agg_data_loc_key)['mean'] key_set_mean = get_single_agg(mean_key_set_adata, data_key, 'mean') - assert np.allclose(subset_mean, key_set_mean) - - df = pd.DataFrame( - index=getattr(adata_dense, dim)["key"], - columns=getattr(adata_dense, f"{'var' if dim == 'obs' else 'obs'}_names"), - data=X, - ) - grouped_agg_df = ( - df.groupby('key') - .agg(["count", "mean", "var"]) - .swaplevel(axis=1) - .sort_index(axis=1) - ) - mean = getattr(stats_dense, data_key)['mean'] - if dim == 'var' and data_key != 'varm': - mean = mean.T - assert np.allclose(mean, grouped_agg_df['mean'].values) - var = getattr(stats_dense, data_key)['var'] - if dim == 'var' and data_key != 'varm': - var = var.T - assert np.allclose(var, grouped_agg_df['var'].values, equal_nan=True) - assert np.allclose( - getattr(stats_dense, dim)['count'], - grouped_agg_df['count']['A'].values, - ) # returns for both columns but counts only needs one because it is the same - - -@pytest.mark.parametrize('dim', ['obs', 'var']) -def test_groupby_X(dim, df_base, df_groupby, X): - obs_df, var_df = (df_groupby, df_base) if dim == 'obs' else (df_base, df_groupby) - data = X if dim == 'obs' else X.T - adata_sparse = ad.AnnData(obs=obs_df, var=var_df, X=csc_matrix(data)) - adata_dense = ad.AnnData(obs=obs_df, var=var_df, X=data) - - stats_sparse, stats_dense = ( - sc.get.aggregated( - adata, - by="key", - dim=dim, - func=['count', 'mean', 'var'], - ) - for adata in [adata_sparse, adata_dense] - ) - - # superset columns can be kept but not subsets - assert 'key_superset' in getattr(stats_sparse, dim) - assert 'key_subset' not in getattr(stats_sparse, dim) - - assert np.allclose( - getattr(stats_sparse, dim)['count'], - getattr(stats_dense, dim)['count'], - ) - assert np.allclose(stats_sparse.layers['mean'], stats_dense.layers['mean']) - assert np.allclose( - stats_sparse.layers['var'], stats_dense.layers['var'], equal_nan=True - ) - stats_weight = sc.get.aggregated( - adata_dense, - by="key", - dim=dim, - func=['count', 'mean', 'var'], - weight_key="weight", - ) - sum_ = sc.get.aggregated(adata_sparse, by="key", dim=dim, func='sum') - sum_weight = sc.get.aggregated( - adata_dense, - by="key", - dim=dim, - func='sum', - weight_key="weight", - ) - - assert np.allclose(2 * sum_.X, sum_weight.X) - assert np.allclose(stats_sparse.layers['mean'], stats_weight.layers['mean']) - assert np.allclose( - stats_sparse.layers['var'], stats_dense.layers['var'], equal_nan=True - ) - - key_set = ["v", "w"] - mean_key_set_adata = sc.get.aggregated( - adata_dense, - by="key", - dim=dim, - func='mean', - key_set=key_set, - ) - subset_idx = getattr(stats_sparse, dim).index.isin(key_set) - subset_adata = ( - stats_sparse[subset_idx, :] if dim == 'obs' else stats_sparse[:, subset_idx] - ) - subset_mean = subset_adata.layers['mean'] - key_set_mean = mean_key_set_adata.X assert np.allclose(subset_mean, key_set_mean) df = pd.DataFrame( @@ -245,12 +175,12 @@ def test_groupby_X(dim, df_base, df_groupby, X): .swaplevel(axis=1) .sort_index(axis=1) ) - mean = stats_dense.layers['mean'] - if dim == 'var': + mean = getattr(stats_dense, multi_agg_data_loc_key)['mean'] + if dim == 'var' and data_key != 'varm': mean = mean.T assert np.allclose(mean, grouped_agg_df['mean'].values) - var = stats_dense.layers['var'] - if dim == 'var': + var = getattr(stats_dense, multi_agg_data_loc_key)['var'] + if dim == 'var' and multi_agg_data_loc_key != 
'varm': var = var.T assert np.allclose(var, grouped_agg_df['var'].values, equal_nan=True) assert np.allclose( From 6edf202ecd2a8381e43321556542f933bbce091c Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 22 Aug 2023 16:04:36 +0200 Subject: [PATCH 53/89] (chore): update comments/errors --- scanpy/get/_aggregated.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 1c4c85bab..39053cb25 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -263,7 +263,7 @@ def _extract_indices( def _power(X: Array, power: Union[float, int]) -> Array: - """Generate elementwise power of a matrix. Needed for sparse matrices because they do not support ** so the `.power` function is used. + """Generate elementwise power of a matrix. Needed for non-square sparse matrices because they do not support ** so the `.power` function is used. Parameters ---------- @@ -453,7 +453,7 @@ def aggregated_from_array( write_key = 'obsm' if write_to_xxxm else 'layers' funcs = set([func] if isinstance(func, str) else func) if unknown := funcs - set(get_args(AggType)): - raise ValueError(f'… {unknown} …') + raise ValueError(f'func {unknown} is not one of {get_args(AggType)}') if 'sum' in funcs: # sum is calculated separately from the rest agg = groupby.sum() # put aggregation in X if it is the only one and the aggregation data is not coming from `xxxm` From ca2ba76535579e430c5c1878fd9b2755c6c85d68 Mon Sep 17 00:00:00 2001 From: Philipp A Date: Tue, 22 Aug 2023 17:50:43 +0200 Subject: [PATCH 54/89] improve typing --- scanpy/get/_aggregated.py | 65 +++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 39053cb25..b11a5bca9 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -1,26 +1,25 @@ +from __future__ import annotations + from functools import singledispatch -from typing import ( - Optional, - Iterable, - AbstractSet, - Sequence, - Tuple, - Union, - Literal, - List, - get_args, -) +from typing import NamedTuple, Optional, Literal, Union as _U, get_args +from collections.abc import Iterable, Set, Sequence from anndata import AnnData, utils import numpy as np import pandas as pd -import collections.abc as cabc +from numpy.typing import NDArray from scipy.sparse import coo_matrix, dia_matrix, spmatrix -Array = Union[np.ndarray, spmatrix] +Array = _U[np.ndarray, spmatrix] AggType = Literal['count', 'mean', 'sum', 'var'] +class CMV(NamedTuple): + count: NDArray[np.integer] + mean: NDArray[np.floating] + var: NDArray[np.floating] + + class Aggregate: """\ Functionality for generic grouping and aggregating. @@ -57,15 +56,15 @@ class Aggregate: _groupby: pd.Series _data: Array - _weight: Union[pd.Series, Array] - _key_set: AbstractSet[str] + _weight: pd.Series | Array + _key_set: Set[str] _key_index: Optional[np.ndarray] # caution, may be stale if attributes are updated def __init__( self, groupby: pd.Series, data: Array, - weight: Union[pd.Series, Array] = None, + weight: pd.Series | Array = None, key_set: Optional[Iterable[str]] = None, ): self._groupby = groupby @@ -108,7 +107,7 @@ def mean(self) -> Array: A, _ = self._sparse_aggregator(normalize=True) return utils.asarray(A * self._data) - def count_mean_var(self, dof: int = 1) -> dict: + def count_mean_var(self, dof: int = 1) -> CMV: """\ Compute the count, as well as mean and variance per feature, per group of observations. 
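The `E[X^2] - E[X]^2` form used by `count_mean_var` cancels catastrophically when the true variance is tiny relative to the squared mean, which is what the reset touched in the next hunk guards against. A minimal float64 sketch (the numbers are made up, only the mechanism matters):

    import numpy as np

    x = np.full(1000, 1e8 + 0.1)      # constant column: the true variance is 0
    mean_sq = np.mean(x**2)           # E[X^2]
    sq_mean = np.mean(x) ** 2         # E[X]^2
    var = mean_sq - sq_mean           # mathematically 0; numerically a residue
                                      # on the order of ulp(1e16)
    precision = 2 << 42               # the float64 threshold used in the code
    assert precision * var < sq_mean  # the guard fires, so var is reset to 0
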
@@ -152,11 +151,11 @@ def count_mean_var(self, dof: int = 1) -> dict: var_[precision * var_ < sq_mean] = 0 if dof != 0: var_ *= (count_ / (count_ - dof))[:, np.newaxis] - return {'mean': mean_, 'var': var_, 'count': count_} + return CMV(count=count_, mean=mean_, var=var_) def _sparse_aggregator( self, normalize: bool = False - ) -> Tuple[coo_matrix, np.ndarray]: + ) -> tuple[coo_matrix, NDArray[np.floating]]: """ Form a coordinate-sparse matrix A such that rows of A * X are weighted sums of groups of rows of X. @@ -198,8 +197,8 @@ def _filter_indices( keys: np.ndarray, key_index: np.ndarray, df_index: np.ndarray, - weight_value: Optional[Union[pd.Series, Array]] = None, - ) -> Tuple[np.ndarray, np.ndarray, Union[pd.Series, Array, None]]: + weight_value: pd.Series | Array | None = None, + ) -> tuple[np.ndarray, np.ndarray, pd.Series | Array | None]: """Filter the values of keys, key_index, df_index, and optionally weight_value based on self._key_set. Parameters @@ -239,7 +238,7 @@ def _filter_indices( def _extract_indices( self, - ) -> Tuple[np.ndarray, np.ndarray, Union[pd.Series, Array, None]]: + ) -> tuple[np.ndarray, np.ndarray, pd.Series | Array | None]: """Extract indices from self._groupby with the goal of building a matrix that can be multiplied with the data to produce an aggregation statistics e.g., mean or variance. These are filtered if a self._key_set is present. @@ -262,7 +261,7 @@ def _extract_indices( return keys, key_index, df_index, weight_value -def _power(X: Array, power: Union[float, int]) -> Array: +def _power(X: Array, power: float | int) -> Array: """Generate elementwise power of a matrix. Needed for non-square sparse matrices because they do not support ** so the `.power` function is used. Parameters @@ -282,7 +281,7 @@ def _power(X: Array, power: Union[float, int]) -> Array: def _ndarray_from_seq(lst: Sequence): # prevents expansion of iterables as axis n = len(lst) - if n > 0 and isinstance(lst[0], cabc.Iterable): + if n > 0 and isinstance(lst[0], Iterable): arr = np.empty(n, dtype=object) arr[:] = lst else: @@ -290,7 +289,7 @@ def _ndarray_from_seq(lst: Sequence): return arr -def _superset_columns(df: pd.DataFrame, groupby_key: str) -> List[str]: +def _superset_columns(df: pd.DataFrame, groupby_key: str) -> list[str]: """\ Find all columns which are a superset of the key column. @@ -318,7 +317,7 @@ def _superset_columns(df: pd.DataFrame, groupby_key: str) -> List[str]: return columns -def _df_grouped(df: pd.DataFrame, key: str, key_set: List[str]) -> pd.DataFrame: +def _df_grouped(df: pd.DataFrame, key: str, key_set: list[str]) -> pd.DataFrame: """\ Generate a grouped-by dataframe (no aggregation) by a key with columns that are supersets of the key column. 
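What `_superset_columns` (and hence `_df_grouped`) keeps is easiest to see on a toy frame; a sketch where the column names mirror the test fixtures but the values are made up:

    import pandas as pd

    df = pd.DataFrame(
        {
            "key": ["a", "a", "b", "b"],
            "key_superset": ["x", "x", "y", "y"],  # constant within each key group
            "key_subset": ["p", "q", "p", "q"],  # varies within a key group
        }
    )
    # _superset_columns(df, "key") keeps only "key_superset": every group of
    # "key" maps to exactly one of its values, so it survives the groupby,
    # while "key_subset" takes two values inside group "a" and is dropped.
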
@@ -348,7 +347,7 @@ def _df_grouped(df: pd.DataFrame, key: str, key_set: List[str]) -> pd.DataFrame: def aggregated( adata: AnnData, by: str, - func: Union[AggType, Iterable[AggType]], + func: AggType | Iterable[AggType], *, dim: Literal['obs', 'var'] = 'obs', weight_key: Optional[str] = None, @@ -427,7 +426,7 @@ def aggregated( def aggregated_from_array( data, groupby_df: pd.DataFrame, - func: Union[AggType, Iterable[AggType]], + func: AggType | Iterable[AggType], dim: str, by: str, write_to_xxxm: bool, @@ -471,15 +470,15 @@ def aggregated_from_array( if 'count' in funcs and 'var' not in funcs: obs_var_dict['obs']['count'] = groupby.count() # count goes in dim df if 'var' in funcs: - agg = groupby.count_mean_var(dof) + aggs = groupby.count_mean_var(dof) if len(funcs) == 1 and not write_to_xxxm: - data_dict['X'] = agg['var'] + data_dict['X'] = aggs.var else: - data_dict[write_key]['var'] = agg['var'] + data_dict[write_key]['var'] = aggs.var if 'mean' in funcs: - data_dict[write_key]['mean'] = agg['mean'] + data_dict[write_key]['mean'] = aggs.mean if 'count' in funcs: - obs_var_dict['obs']['count'] = agg['count'] + obs_var_dict['obs']['count'] = aggs.count adata_agg = AnnData(**{**data_dict, **obs_var_dict}) if dim == 'var': return adata_agg.T From d114600803a079796106984bbb69dd9f8ac7c378 Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Thu, 24 Aug 2023 10:10:27 +0200 Subject: [PATCH 55/89] doc consistency --- scanpy/get/_aggregated.py | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index b11a5bca9..232bee6e3 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -74,7 +74,7 @@ def __init__( self._key_index = None def count(self) -> np.ndarray: - """ + """\ Count the number of observations in each group. Returns @@ -86,7 +86,7 @@ def count(self) -> np.ndarray: return count_ def sum(self) -> Array: - """ + """\ Compute the sum per feature per group of observations. Returns @@ -124,7 +124,7 @@ def count_mean_var(self, dof: int = 1) -> CMV: Returns ------- - dict with mean, count, and var keys. + Object with `count`, `mean`, and `var` attributes. """ assert dof >= 0 A, _ = self._sparse_aggregator(normalize=True) @@ -156,7 +156,7 @@ def count_mean_var(self, dof: int = 1) -> CMV: def _sparse_aggregator( self, normalize: bool = False ) -> tuple[coo_matrix, NDArray[np.floating]]: - """ + """\ Form a coordinate-sparse matrix A such that rows of A * X are weighted sums of groups of rows of X. @@ -199,10 +199,11 @@ def _filter_indices( df_index: np.ndarray, weight_value: pd.Series | Array | None = None, ) -> tuple[np.ndarray, np.ndarray, pd.Series | Array | None]: - """Filter the values of keys, key_index, df_index, and optionally weight_value based on self._key_set. + """\ + Filter the values of keys, key_index, df_index, and optionally weight_value based on self._key_set. - Parameters - ---------- + Params + ------ keys Unique key values to be filtered. key_index @@ -214,12 +215,12 @@ def _filter_indices( Returns ------- - Filtered versions of all arguments. + Filtered versions of all arguments. Raises ------ ValueError - If no keys in key_set found in keys. + If no keys in key_set found in keys. 
""" keep = [i for i, k in enumerate(keys) if k in set(self._key_set)] if len(keep) == 0: @@ -239,12 +240,14 @@ def _filter_indices( def _extract_indices( self, ) -> tuple[np.ndarray, np.ndarray, pd.Series | Array | None]: - """Extract indices from self._groupby with the goal of building a matrix that can be multiplied with the data to produce an aggregation statistics e.g., mean or variance. + """\ + Extract indices from self._groupby with the goal of building a matrix + that can be multiplied with the data to produce an aggregation statistics e.g., mean or variance. These are filtered if a self._key_set is present. Returns ------- - Unique keys, an array mapping those unique keys to an index, said index, and a weight if present. + Unique keys, an array mapping those unique keys to an index, said index, and a weight if present. """ key_value = self._groupby keys, key_index = np.unique(_ndarray_from_seq(key_value), return_inverse=True) @@ -262,10 +265,13 @@ def _extract_indices( def _power(X: Array, power: float | int) -> Array: - """Generate elementwise power of a matrix. Needed for non-square sparse matrices because they do not support ** so the `.power` function is used. + """\ + Generate elementwise power of a matrix. - Parameters - ---------- + Needed for non-square sparse matrices because they do not support ** so the `.power` function is used. + + Params + ------ X Matrix whose power is to be raised. power @@ -273,7 +279,7 @@ def _power(X: Array, power: float | int) -> Array: Returns ------- - Matrix whose power has been raised. + Matrix whose power has been raised. """ return X ** power if isinstance(X, np.ndarray) else X.power(power) From d30cd616c0cabd8574ca9f0d3b0ee203fe569f2a Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Thu, 24 Aug 2023 10:20:54 +0200 Subject: [PATCH 56/89] improve typing --- scanpy/get/_aggregated.py | 90 ++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 48 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 232bee6e3..2efde8ca5 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -1,7 +1,8 @@ from __future__ import annotations +from dataclasses import dataclass from functools import singledispatch -from typing import NamedTuple, Optional, Literal, Union as _U, get_args +from typing import NamedTuple, Literal, Union as _U, get_args from collections.abc import Iterable, Set, Sequence from anndata import AnnData, utils @@ -20,6 +21,7 @@ class CMV(NamedTuple): var: NDArray[np.floating] +@dataclass class Aggregate: """\ Functionality for generic grouping and aggregating. @@ -54,24 +56,16 @@ class Aggregate: Subset of keys to which to filter. 
""" - _groupby: pd.Series - _data: Array - _weight: pd.Series | Array - _key_set: Set[str] - _key_index: Optional[np.ndarray] # caution, may be stale if attributes are updated + groupby: pd.Series + data: Array + weight: pd.Series | Array + key_set: Set[str] | None + # caution, may be stale if attributes are updated + key_index: Iterable[str] | None = None - def __init__( - self, - groupby: pd.Series, - data: Array, - weight: pd.Series | Array = None, - key_set: Optional[Iterable[str]] = None, - ): - self._groupby = groupby - self._data = data - self._weight = weight - self._key_set = None if key_set is None else dict.fromkeys(key_set).keys() - self._key_index = None + def __post_init__(self): + if self.key_set is not None and not isinstance(self.key_set, Set): + self.key_set = dict.fromkeys(self.key_set).keys() def count(self) -> np.ndarray: """\ @@ -94,7 +88,7 @@ def sum(self) -> Array: Array of sum. """ A, _ = self._sparse_aggregator(normalize=False) - return utils.asarray(A * self._data) + return utils.asarray(A * self.data) def mean(self) -> Array: """\ @@ -105,7 +99,7 @@ def mean(self) -> Array: Array of mean. """ A, _ = self._sparse_aggregator(normalize=True) - return utils.asarray(A * self._data) + return utils.asarray(A * self.data) def count_mean_var(self, dof: int = 1) -> CMV: """\ @@ -128,25 +122,25 @@ def count_mean_var(self, dof: int = 1) -> CMV: """ assert dof >= 0 A, _ = self._sparse_aggregator(normalize=True) - count_ = np.bincount(self._key_index) - mean_ = utils.asarray(A @ self._data) + count_ = np.bincount(self.key_index) + mean_ = utils.asarray(A @ self.data) # sparse matrices do not support ** for elementwise power. - mean_sq = utils.asarray(A @ _power(self._data, 2)) - if self._weight is None: + mean_sq = utils.asarray(A @ _power(self.data, 2)) + if self.weight is None: sq_mean = mean_**2 else: A_unweighted, _ = Aggregate( - groupby=self._groupby, - data=self._data, - weight=self._weight, - key_set=self._key_set, + groupby=self.groupby, + data=self.data, + weight=self.weight, + key_set=self.key_set, )._sparse_aggregator() - mean_unweighted = utils.asarray(A_unweighted * self._data) + mean_unweighted = utils.asarray(A_unweighted * self.data) sq_mean = 2 * mean_ * mean_unweighted + mean_unweighted**2 var_ = mean_sq - sq_mean # TODO: Why these values exactly? Because they are high relative to the datatype? # (unchanged from original code: https://github.com/scverse/anndata/pull/564) - precision = 2 << (42 if self._data.dtype == np.float64 else 20) + precision = 2 << (42 if self.data.dtype == np.float64 else 20) # detects loss of precision in mean_sq - sq_mean, which suggests variance is 0 var_[precision * var_ < sq_mean] = 0 if dof != 0: @@ -178,12 +172,12 @@ def _sparse_aggregator( keys, key_index, df_index, weight_value = self._extract_indices() if df_index is None: df_index = np.arange(len(key_index)) - if self._weight is None: + if self.weight is None: weight_value = np.ones(len(key_index)) # TODO: why a coo matrix here and a dia matrix below? (unchanged from original code: https://github.com/scverse/anndata/pull/564) A = coo_matrix( (weight_value, (key_index, df_index)), - shape=(len(keys), self._data.shape[0]), + shape=(len(keys), self.data.shape[0]), ) if normalize: n_row = A.shape[0] @@ -200,7 +194,7 @@ def _filter_indices( weight_value: pd.Series | Array | None = None, ) -> tuple[np.ndarray, np.ndarray, pd.Series | Array | None]: """\ - Filter the values of keys, key_index, df_index, and optionally weight_value based on self._key_set. 
+ Filter the values of keys, key_index, df_index, and optionally weight_value based on :attr:`key_set`. Params ------ @@ -222,7 +216,7 @@ def _filter_indices( ValueError If no keys in key_set found in keys. """ - keep = [i for i, k in enumerate(keys) if k in set(self._key_set)] + keep = [i for i, k in enumerate(keys) if k in set(self.key_set)] if len(keep) == 0: raise ValueError("No keys in key_set found in keys.") elif len(keep) < len(keys): @@ -241,26 +235,26 @@ def _extract_indices( self, ) -> tuple[np.ndarray, np.ndarray, pd.Series | Array | None]: """\ - Extract indices from self._groupby with the goal of building a matrix + Extract indices from attr:`groupby` with the goal of building a matrix that can be multiplied with the data to produce an aggregation statistics e.g., mean or variance. - These are filtered if a self._key_set is present. + These are filtered if a :attr:`key_set` is present. Returns ------- Unique keys, an array mapping those unique keys to an index, said index, and a weight if present. """ - key_value = self._groupby + key_value = self.groupby keys, key_index = np.unique(_ndarray_from_seq(key_value), return_inverse=True) df_index = np.arange(len(key_index)) - if self._weight is None: + if self.weight is None: weight_value = None else: - weight_value = self._weight.values[df_index] - if self._key_set is not None: + weight_value = self.weight.values[df_index] + if self.key_set is not None: keys, key_index, df_index, weight_value = self._filter_indices( keys, key_index, df_index, weight_value ) - self._key_index = key_index # passed to count and count_mean_var to avoid re-extracting in the latter + self.key_index = key_index # passed to count and count_mean_var to avoid re-extracting in the latter return keys, key_index, df_index, weight_value @@ -356,12 +350,12 @@ def aggregated( func: AggType | Iterable[AggType], *, dim: Literal['obs', 'var'] = 'obs', - weight_key: Optional[str] = None, - key_set: Optional[Iterable[str]] = None, + weight_key: str | None = None, + key_set: Iterable[str] | None = None, dof: int = 1, - layer: Optional[str] = None, - obsm: Optional[str] = None, - varm: Optional[str] = None, + layer: str | None = None, + obsm: str | None = None, + varm: str | None = None, ) -> AnnData: """\ Aggregate data based on one of the columns of one of the axes (`obs` or `var`). 
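A minimal usage sketch of the signature above, assuming an AnnData `adata` whose `adata.obs` carries a categorical column named "louvain" (the column name is illustrative):

    import scanpy as sc

    stats = sc.get.aggregated(adata, by="louvain", func=["count", "mean", "var"])
    mean_per_group = stats.layers["mean"]  # multiple statistics land in layers
    counts = stats.obs["count"]  # count is stored on the groupby axis
    # A single func aggregating X is written to stats.X instead, and with
    # dim="var" the same call groups on a var column and transposes the result.
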
@@ -437,8 +431,8 @@ def aggregated_from_array( by: str, write_to_xxxm: bool, no_groupby_df: pd.DataFrame, - weight_key: Optional[str] = None, - key_set: Optional[Iterable[str]] = None, + weight_key: str | None = None, + key_set: Iterable[str] | None = None, dof: int = 1, ) -> AnnData: """Aggregate data based on one of the columns of one of a `~pd.DataFrame`.""" From 834b9f9693b66317afea4b30bedccdb289ee363a Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Thu, 24 Aug 2023 11:38:14 +0200 Subject: [PATCH 57/89] correct caching --- scanpy/get/_aggregated.py | 67 ++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 2efde8ca5..b1cbe6f1f 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -21,6 +21,13 @@ class CMV(NamedTuple): var: NDArray[np.floating] +class Indices(NamedTuple): + keys: np.ndarray + key_index: np.ndarray + df_index: np.ndarray + weight_value: pd.Series | Array | None + + @dataclass class Aggregate: """\ @@ -60,14 +67,12 @@ class Aggregate: data: Array weight: pd.Series | Array key_set: Set[str] | None - # caution, may be stale if attributes are updated - key_index: Iterable[str] | None = None def __post_init__(self): if self.key_set is not None and not isinstance(self.key_set, Set): self.key_set = dict.fromkeys(self.key_set).keys() - def count(self) -> np.ndarray: + def count(self, *, _indices: Indices | None = None) -> np.ndarray: """\ Count the number of observations in each group. @@ -75,9 +80,9 @@ def count(self) -> np.ndarray: ------- Array of counts. """ - _, key_index, _, _ = self._extract_indices() - count_ = np.bincount(key_index) - return count_ + if _indices is None: + _indices = self._extract_indices() + return np.bincount(_indices.key_index) def sum(self) -> Array: """\ @@ -90,7 +95,7 @@ def sum(self) -> Array: A, _ = self._sparse_aggregator(normalize=False) return utils.asarray(A * self.data) - def mean(self) -> Array: + def mean(self, *, _A: spmatrix | None = None) -> Array: """\ Compute the mean per feature per group of observations. @@ -98,10 +103,11 @@ def mean(self) -> Array: ------- Array of mean. """ - A, _ = self._sparse_aggregator(normalize=True) - return utils.asarray(A * self.data) + if _A is None: + _A, _ = self._sparse_aggregator(normalize=True) + return utils.asarray(_A @ self.data) - def count_mean_var(self, dof: int = 1) -> CMV: + def count_mean_var(self, dof: int = 1, *, _indices: Indices | None = None) -> CMV: """\ Compute the count, as well as mean and variance per feature, per group of observations. @@ -121,9 +127,11 @@ def count_mean_var(self, dof: int = 1) -> CMV: Object with `count`, `mean`, and `var` attributes. """ assert dof >= 0 - A, _ = self._sparse_aggregator(normalize=True) - count_ = np.bincount(self.key_index) - mean_ = utils.asarray(A @ self.data) + if _indices is None: + _indices = self._extract_indices() + A, _ = self._sparse_aggregator(normalize=True, _indices=_indices) + count_ = self.count(_indices=_indices) + mean_ = self.mean(_A=A) # sparse matrices do not support ** for elementwise power. mean_sq = utils.asarray(A @ _power(self.data, 2)) if self.weight is None: @@ -132,7 +140,7 @@ def count_mean_var(self, dof: int = 1) -> CMV: A_unweighted, _ = Aggregate( groupby=self.groupby, data=self.data, - weight=self.weight, + weight=self.weight, # TODO: why pass weights when creating unweighted A? 
key_set=self.key_set, )._sparse_aggregator() mean_unweighted = utils.asarray(A_unweighted * self.data) @@ -148,7 +156,7 @@ def count_mean_var(self, dof: int = 1) -> CMV: return CMV(count=count_, mean=mean_, var=var_) def _sparse_aggregator( - self, normalize: bool = False + self, normalize: bool = False, *, _indices: Indices | None = None ) -> tuple[coo_matrix, NDArray[np.floating]]: """\ Form a coordinate-sparse matrix A such that rows of A * X @@ -169,7 +177,9 @@ def _sparse_aggregator( keys An ndarray with keys[i] the group key corresponding to row i of A. """ - keys, key_index, df_index, weight_value = self._extract_indices() + if _indices is None: + _indices = self._extract_indices() + keys, key_index, df_index, weight_value = _indices if df_index is None: df_index = np.arange(len(key_index)) if self.weight is None: @@ -186,13 +196,7 @@ def _sparse_aggregator( A = D * A return A, keys - def _filter_indices( - self, - keys: np.ndarray, - key_index: np.ndarray, - df_index: np.ndarray, - weight_value: pd.Series | Array | None = None, - ) -> tuple[np.ndarray, np.ndarray, pd.Series | Array | None]: + def _filter_indices(self, indices: Indices) -> Indices: """\ Filter the values of keys, key_index, df_index, and optionally weight_value based on :attr:`key_set`. @@ -216,6 +220,7 @@ def _filter_indices( ValueError If no keys in key_set found in keys. """ + keys, key_index, df_index, weight_value = indices keep = [i for i, k in enumerate(keys) if k in set(self.key_set)] if len(keep) == 0: raise ValueError("No keys in key_set found in keys.") @@ -229,11 +234,9 @@ def _filter_indices( df_index = df_index[mask] if weight_value is not None: weight_value = weight_value[mask] - return keys, key_index, df_index, weight_value + return Indices(keys, key_index, df_index, weight_value) - def _extract_indices( - self, - ) -> tuple[np.ndarray, np.ndarray, pd.Series | Array | None]: + def _extract_indices(self) -> Indices: """\ Extract indices from attr:`groupby` with the goal of building a matrix that can be multiplied with the data to produce an aggregation statistics e.g., mean or variance. 
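The keys / key_index pair that `_extract_indices` derives is essentially `numpy.unique` with `return_inverse=True`; a toy illustration with made-up labels:

    import numpy as np

    groupby = np.array(["b", "a", "b", "a", "c"])
    keys, key_index = np.unique(groupby, return_inverse=True)
    # keys      -> array(['a', 'b', 'c'])
    # key_index -> array([1, 0, 1, 0, 2]); row i of the data belongs to group
    # keys[key_index[i]], which is what the sparse aggregator encodes as the
    # row index of its indicator matrix.
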
@@ -250,12 +253,10 @@ def _extract_indices( weight_value = None else: weight_value = self.weight.values[df_index] - if self.key_set is not None: - keys, key_index, df_index, weight_value = self._filter_indices( - keys, key_index, df_index, weight_value - ) - self.key_index = key_index # passed to count and count_mean_var to avoid re-extracting in the latter - return keys, key_index, df_index, weight_value + indices = Indices(keys, key_index, df_index, weight_value) + if self.key_set is None: + return indices + return self._filter_indices(indices) def _power(X: Array, power: float | int) -> Array: From 14b67f3b21130cebf772f2423ea970ebd4844808 Mon Sep 17 00:00:00 2001 From: Philipp A Date: Tue, 29 Aug 2023 11:53:18 +0200 Subject: [PATCH 58/89] simplify --- scanpy/get/_aggregated.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index b1cbe6f1f..636028139 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -444,12 +444,13 @@ def aggregated_from_array( key_set=key_set, ) # groupby df is put in `obs`, nongroupby in `var` to be transposed later as appropriate - obs_var_dict = {'obs': _df_grouped(groupby_df, by, key_set), 'var': no_groupby_df} - data_dict = { - 'layers': {}, - 'X': None, - 'obsm': {}, - } + adata_kw = dict( + X=None, + layers={}, + obs=_df_grouped(groupby_df, by, key_set), + var=no_groupby_df, + obsm={}, + ) write_key = 'obsm' if write_to_xxxm else 'layers' funcs = set([func] if isinstance(func, str) else func) if unknown := funcs - set(get_args(AggType)): @@ -458,29 +459,29 @@ def aggregated_from_array( agg = groupby.sum() # put aggregation in X if it is the only one and the aggregation data is not coming from `xxxm` if len(funcs) == 1 and not write_to_xxxm: - data_dict['X'] = agg + adata_kw['X'] = agg else: - data_dict[write_key]['sum'] = agg + adata_kw[write_key]['sum'] = agg # here and below for count, if var is present, these can be calculate alongside var if 'mean' in funcs and 'var' not in funcs: agg = groupby.mean() if len(funcs) == 1 and not write_to_xxxm: - data_dict['X'] = agg + adata_kw['X'] = agg else: - data_dict[write_key]['mean'] = agg + adata_kw[write_key]['mean'] = agg if 'count' in funcs and 'var' not in funcs: - obs_var_dict['obs']['count'] = groupby.count() # count goes in dim df + adata_kw['obs']['count'] = groupby.count() # count goes in dim df if 'var' in funcs: aggs = groupby.count_mean_var(dof) if len(funcs) == 1 and not write_to_xxxm: - data_dict['X'] = aggs.var + adata_kw['X'] = aggs.var else: - data_dict[write_key]['var'] = aggs.var + adata_kw[write_key]['var'] = aggs.var if 'mean' in funcs: - data_dict[write_key]['mean'] = aggs.mean + adata_kw[write_key]['mean'] = aggs.mean if 'count' in funcs: - obs_var_dict['obs']['count'] = aggs.count - adata_agg = AnnData(**{**data_dict, **obs_var_dict}) + adata_kw['obs']['count'] = aggs.count + adata_agg = AnnData(**adata_kw) if dim == 'var': return adata_agg.T return adata_agg From d2354871455da0064c9ef3fa0a9aecd84bb10176 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 27 Nov 2023 13:51:58 +0000 Subject: [PATCH 59/89] * Don't allocate for indicator matrix * Fight pre-commit/ ruff * Add helper for computing indicator categorical --- .pre-commit-config.yaml | 2 +- scanpy/get/_aggregated.py | 92 ++++++++++++------- scanpy/tests/test_aggregated.py | 158 ++++++++++++++++++++++---------- 3 files changed, 173 insertions(+), 79 deletions(-) diff --git a/.pre-commit-config.yaml 
b/.pre-commit-config.yaml index 443906bd8..44b4522dd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ repos: rev: v0.1.5 hooks: - id: ruff - args: ["--fix"] + args: ["--fix", "--unsafe-fixes"] - id: ruff-format - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 636028139..4d792cbc2 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -1,18 +1,21 @@ from __future__ import annotations +from collections.abc import Iterable, Sequence, Set from dataclasses import dataclass from functools import singledispatch -from typing import NamedTuple, Literal, Union as _U, get_args -from collections.abc import Iterable, Set, Sequence +from typing import TYPE_CHECKING, Literal, NamedTuple, get_args +from typing import Union as _U -from anndata import AnnData, utils import numpy as np import pandas as pd -from numpy.typing import NDArray +from anndata import AnnData, utils from scipy.sparse import coo_matrix, dia_matrix, spmatrix +if TYPE_CHECKING: + from numpy.typing import NDArray + Array = _U[np.ndarray, spmatrix] -AggType = Literal['count', 'mean', 'sum', 'var'] +AggType = Literal["count", "mean", "sum", "var"] class CMV(NamedTuple): @@ -183,7 +186,7 @@ def _sparse_aggregator( if df_index is None: df_index = np.arange(len(key_index)) if self.weight is None: - weight_value = np.ones(len(key_index)) + weight_value = np.broadcast_to(1.0, len(key_index)) # TODO: why a coo matrix here and a dia matrix below? (unchanged from original code: https://github.com/scverse/anndata/pull/564) A = coo_matrix( (weight_value, (key_index, df_index)), @@ -276,7 +279,7 @@ def _power(X: Array, power: float | int) -> Array: ------- Matrix whose power has been raised. """ - return X ** power if isinstance(X, np.ndarray) else X.power(power) + return X**power if isinstance(X, np.ndarray) else X.power(power) def _ndarray_from_seq(lst: Sequence): @@ -306,10 +309,10 @@ def _superset_columns(df: pd.DataFrame, groupby_key: str) -> list[str]: Superset columns. 
""" columns = [] - groupy_key_codes = df[groupby_key].astype('category') + groupy_key_codes = df[groupby_key].astype("category") for key in df: if key != groupby_key: - key_codes = df[key].astype('category') + key_codes = df[key].astype("category") if all( key_codes[groupy_key_codes == group_key_code].nunique() == 1 for group_key_code in groupy_key_codes @@ -339,7 +342,7 @@ def _df_grouped(df: pd.DataFrame, key: str, key_set: list[str]) -> pd.DataFrame: df = df.copy() if key_set is not None: df = df[df[key].isin(key_set)] - if pd.api.types.is_categorical_dtype(df[key]): + if isinstance(df[key].dtype, pd.CategoricalDtype): df[key] = df[key].cat.remove_unused_categories() return df.groupby(key).first()[_superset_columns(df, key)] @@ -350,7 +353,7 @@ def aggregated( by: str, func: AggType | Iterable[AggType], *, - dim: Literal['obs', 'var'] = 'obs', + dim: Literal["obs", "var"] = "obs", weight_key: str | None = None, key_set: Iterable[str] | None = None, dof: int = 1, @@ -403,9 +406,9 @@ def aggregated( write_to_xxxm = True elif layer is not None: data = adata.layers[layer] - if dim == 'var': + if dim == "var": data = data.T - elif dim == 'var': + elif dim == "var": # i.e., all of `varm`, `obsm`, `layers` are None so we use `X` which must be transposed data = data.T return aggregated( @@ -414,7 +417,7 @@ def aggregated( dim=dim, by=by, write_to_xxxm=write_to_xxxm, - no_groupby_df=getattr(adata, 'var' if dim == 'obs' else 'obs'), + no_groupby_df=getattr(adata, "var" if dim == "obs" else "obs"), weight_key=weight_key, key_set=key_set, func=func, @@ -451,37 +454,62 @@ def aggregated_from_array( var=no_groupby_df, obsm={}, ) - write_key = 'obsm' if write_to_xxxm else 'layers' + write_key = "obsm" if write_to_xxxm else "layers" funcs = set([func] if isinstance(func, str) else func) if unknown := funcs - set(get_args(AggType)): - raise ValueError(f'func {unknown} is not one of {get_args(AggType)}') - if 'sum' in funcs: # sum is calculated separately from the rest + raise ValueError(f"func {unknown} is not one of {get_args(AggType)}") + if "sum" in funcs: # sum is calculated separately from the rest agg = groupby.sum() # put aggregation in X if it is the only one and the aggregation data is not coming from `xxxm` if len(funcs) == 1 and not write_to_xxxm: - adata_kw['X'] = agg + adata_kw["X"] = agg else: - adata_kw[write_key]['sum'] = agg + adata_kw[write_key]["sum"] = agg # here and below for count, if var is present, these can be calculate alongside var - if 'mean' in funcs and 'var' not in funcs: + if "mean" in funcs and "var" not in funcs: agg = groupby.mean() if len(funcs) == 1 and not write_to_xxxm: - adata_kw['X'] = agg + adata_kw["X"] = agg else: - adata_kw[write_key]['mean'] = agg - if 'count' in funcs and 'var' not in funcs: - adata_kw['obs']['count'] = groupby.count() # count goes in dim df - if 'var' in funcs: + adata_kw[write_key]["mean"] = agg + if "count" in funcs and "var" not in funcs: + adata_kw["obs"]["count"] = groupby.count() # count goes in dim df + if "var" in funcs: aggs = groupby.count_mean_var(dof) if len(funcs) == 1 and not write_to_xxxm: - adata_kw['X'] = aggs.var + adata_kw["X"] = aggs.var else: - adata_kw[write_key]['var'] = aggs.var - if 'mean' in funcs: - adata_kw[write_key]['mean'] = aggs.mean - if 'count' in funcs: - adata_kw['obs']['count'] = aggs.count + adata_kw[write_key]["var"] = aggs.var + if "mean" in funcs: + adata_kw[write_key]["mean"] = aggs.mean + if "count" in funcs: + adata_kw["obs"]["count"] = aggs.count adata_agg = AnnData(**adata_kw) - if dim == 
'var': + if dim == "var": return adata_agg.T return adata_agg + + +def _combine_categories(label_df: pd.DataFrame, cols: list[str]) -> pd.Categorical: + from itertools import product + + df = pd.DataFrame( + {c: pd.Categorical(label_df[c]).remove_unused_categories() for c in cols}, + ) + result_categories = [ + "_".join(x) for x in product(*[df[c].cat.categories for c in cols]) + ] + n_categories = [len(df[c].cat.categories) for c in cols] + + factors = np.ones(len(cols) + 1, dtype=np.int32) # First factor needs to be 1 + np.cumsum(n_categories[::-1], out=factors[1:]) + factors = factors[:-1][::-1] + + # TODO: pick a more optimal bit width + final_codes = np.zeros(df.shape[0], dtype=np.int32) + for factor, c in zip(factors, cols): + final_codes += df[c].cat.codes * factor + + return pd.Categorical.from_codes( + final_codes, categories=result_categories + ).remove_unused_categories() diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index b4e634a37..646513c94 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -1,9 +1,12 @@ +from __future__ import annotations + import anndata as ad -import scanpy as sc import numpy as np import pandas as pd -from scipy.sparse import csr_matrix, csc_matrix import pytest +from scipy.sparse import csr_matrix + +import scanpy as sc @pytest.fixture @@ -26,7 +29,7 @@ def df_groupby(): df_groupby = pd.DataFrame(index=pd.Index(ax_groupby, name="cell")) df_groupby["key"] = pd.Categorical([c[0] for c in ax_groupby]) df_groupby["key_superset"] = pd.Categorical([c[0] for c in ax_groupby]).map( - {'v': 'v', 'w': 'v', 'a': 'a', 'b': 'a', 'c': 'a', 'd': 'a'} + {"v": "v", "w": "v", "a": "a", "b": "a", "c": "a", "d": "a"} ) df_groupby["key_subset"] = pd.Categorical([c[1] for c in ax_groupby]) df_groupby["weight"] = 2.0 @@ -47,14 +50,14 @@ def X(): def gen_adata(data_key, dim, df_base, df_groupby, X): - if (data_key == 'varm' and dim == 'obs') or (data_key == 'obsm' and dim == 'var'): + if (data_key == "varm" and dim == "obs") or (data_key == "obsm" and dim == "var"): pytest.skip("invalid parameter combination") - obs_df, var_df = (df_groupby, df_base) if dim == 'obs' else (df_base, df_groupby) - data = X.T if dim == 'var' and data_key != 'varm' else X - if data_key != 'X': - data_dict_sparse = {data_key: {'test': csr_matrix(data)}} - data_dict_dense = {data_key: {'test': data}} + obs_df, var_df = (df_groupby, df_base) if dim == "obs" else (df_base, df_groupby) + data = X.T if dim == "var" and data_key != "varm" else X + if data_key != "X": + data_dict_sparse = {data_key: {"test": csr_matrix(data)}} + data_dict_dense = {data_key: {"test": data}} else: data_dict_sparse = {data_key: csr_matrix(data)} data_dict_dense = {data_key: data} @@ -64,45 +67,45 @@ def gen_adata(data_key, dim, df_base, df_groupby, X): return adata_sparse, adata_dense -@pytest.mark.parametrize('data_key', ['layers', 'obsm', 'varm', 'X']) -@pytest.mark.parametrize('dim', ['obs', 'var']) +@pytest.mark.parametrize("data_key", ["layers", "obsm", "varm", "X"]) +@pytest.mark.parametrize("dim", ["obs", "var"]) def test_groupby(data_key, dim, df_base, df_groupby, X): adata_sparse, adata_dense = gen_adata(data_key, dim, df_base, df_groupby, X) data_loc_dict = ( - {(data_key if data_key != 'layers' else 'layer'): 'test'} - if data_key != 'X' + {(data_key if data_key != "layers" else "layer"): "test"} + if data_key != "X" else {} ) # When `X` is not the `data_key`, the multi-aggregation data is colocated with the `data_key`. Otherwise it is in `layers`. 
- multi_agg_data_loc_key = data_key if data_key != 'X' else 'layers' + multi_agg_data_loc_key = data_key if data_key != "X" else "layers" stats_sparse, stats_dense = ( sc.get.aggregated( adata, by="key", dim=dim, - func=['count', 'mean', 'var'], + func=["count", "mean", "var"], **data_loc_dict, ) for adata in [adata_sparse, adata_dense] ) # superset columns can be kept but not subsets - assert 'key_superset' in getattr(stats_sparse, dim) - assert 'key_subset' not in getattr(stats_sparse, dim) + assert "key_superset" in getattr(stats_sparse, dim) + assert "key_subset" not in getattr(stats_sparse, dim) assert np.allclose( - getattr(stats_sparse, dim)['count'], - getattr(stats_dense, dim)['count'], + getattr(stats_sparse, dim)["count"], + getattr(stats_dense, dim)["count"], ) assert np.allclose( - getattr(stats_sparse, multi_agg_data_loc_key)['mean'], - getattr(stats_dense, multi_agg_data_loc_key)['mean'], + getattr(stats_sparse, multi_agg_data_loc_key)["mean"], + getattr(stats_dense, multi_agg_data_loc_key)["mean"], ) assert np.allclose( - getattr(stats_sparse, multi_agg_data_loc_key)['var'], - getattr(stats_dense, multi_agg_data_loc_key)['var'], + getattr(stats_sparse, multi_agg_data_loc_key)["var"], + getattr(stats_dense, multi_agg_data_loc_key)["var"], equal_nan=True, ) @@ -110,39 +113,39 @@ def test_groupby(data_key, dim, df_base, df_groupby, X): adata_dense, by="key", dim=dim, - func=['count', 'mean', 'var'], + func=["count", "mean", "var"], weight_key="weight", **data_loc_dict, ) sum_ = sc.get.aggregated( - adata_sparse, by="key", dim=dim, func='sum', **data_loc_dict + adata_sparse, by="key", dim=dim, func="sum", **data_loc_dict ) sum_weight = sc.get.aggregated( adata_dense, by="key", dim=dim, - func='sum', + func="sum", weight_key="weight", **data_loc_dict, ) def get_single_agg(adata, key, agg): # Get the data of the aggregation from the correct location when only one `func` is passed in to `aggregated` - if (key != 'obsm' and key != 'varm') or data_key == 'X': + if (key != "obsm" and key != "varm") or data_key == "X": return adata.X return getattr(adata, key)[agg] assert np.allclose( - 2 * get_single_agg(sum_, data_key, 'sum'), - get_single_agg(sum_weight, data_key, 'sum'), + 2 * get_single_agg(sum_, data_key, "sum"), + get_single_agg(sum_weight, data_key, "sum"), ) assert np.allclose( - getattr(stats_sparse, multi_agg_data_loc_key)['mean'], - getattr(stats_weight, multi_agg_data_loc_key)['mean'], + getattr(stats_sparse, multi_agg_data_loc_key)["mean"], + getattr(stats_weight, multi_agg_data_loc_key)["mean"], ) assert np.allclose( - getattr(stats_sparse, multi_agg_data_loc_key)['var'], - getattr(stats_dense, multi_agg_data_loc_key)['var'], + getattr(stats_sparse, multi_agg_data_loc_key)["var"], + getattr(stats_dense, multi_agg_data_loc_key)["var"], equal_nan=True, ) @@ -151,16 +154,16 @@ def get_single_agg(adata, key, agg): adata_dense, by="key", dim=dim, - func='mean', + func="mean", key_set=key_set, **data_loc_dict, ) subset_idx = getattr(stats_sparse, dim).index.isin(key_set) subset_adata = ( - stats_sparse[subset_idx, :] if dim == 'obs' else stats_sparse[:, subset_idx] + stats_sparse[subset_idx, :] if dim == "obs" else stats_sparse[:, subset_idx] ) - subset_mean = getattr(subset_adata, multi_agg_data_loc_key)['mean'] - key_set_mean = get_single_agg(mean_key_set_adata, data_key, 'mean') + subset_mean = getattr(subset_adata, multi_agg_data_loc_key)["mean"] + key_set_mean = get_single_agg(mean_key_set_adata, data_key, "mean") assert np.allclose(subset_mean, key_set_mean) @@ -170,20 
+173,83 @@ def get_single_agg(adata, key, agg): data=X, ) grouped_agg_df = ( - df.groupby('key') + df.groupby("key") .agg(["count", "mean", "var"]) .swaplevel(axis=1) .sort_index(axis=1) ) - mean = getattr(stats_dense, multi_agg_data_loc_key)['mean'] - if dim == 'var' and data_key != 'varm': + mean = getattr(stats_dense, multi_agg_data_loc_key)["mean"] + if dim == "var" and data_key != "varm": mean = mean.T - assert np.allclose(mean, grouped_agg_df['mean'].values) - var = getattr(stats_dense, multi_agg_data_loc_key)['var'] - if dim == 'var' and multi_agg_data_loc_key != 'varm': + assert np.allclose(mean, grouped_agg_df["mean"].values) + var = getattr(stats_dense, multi_agg_data_loc_key)["var"] + if dim == "var" and multi_agg_data_loc_key != "varm": var = var.T - assert np.allclose(var, grouped_agg_df['var'].values, equal_nan=True) + assert np.allclose(var, grouped_agg_df["var"].values, equal_nan=True) assert np.allclose( - getattr(stats_dense, dim)['count'], - grouped_agg_df['count']['A'].values, + getattr(stats_dense, dim)["count"], + grouped_agg_df["count"]["A"].values, ) # returns for both columns but counts only needs one because it is the same + + +@pytest.mark.parametrize( + "label_df,cols,expected", + [ + ( + pd.DataFrame( + { + "a": pd.Categorical(["a", "b", "c"]), + "b": pd.Categorical(["d", "d", "f"]), + } + ), + ["a", "b"], + pd.Categorical(["a_d", "b_d", "c_f"]), + ), + ( + pd.DataFrame( + { + "a": pd.Categorical(["a", "b", "c"]), + "b": pd.Categorical(["d", "d", "f"]), + "c": pd.Categorical(["g", "h", "h"]), + } + ), + ["a", "b", "c"], + pd.Categorical(["a_d_g", "b_d_h", "c_f_h"]), + ), + ( + pd.DataFrame( + { + "a": pd.Categorical(["a", "b", "c"]), + "b": pd.Categorical(["d", "d", "f"]), + "c": pd.Categorical(["g", "h", "h"]), + } + ), + ["a", "c"], + pd.Categorical(["a_g", "b_h", "c_h"]), + ), + ( + pd.DataFrame( + { + "a": pd.Categorical(["a", "b", "c"]), + "b": pd.Categorical(["d", "d", "f"]), + "c": pd.Categorical(["g", "h", "h"]), + } + ), + ["b", "c"], + pd.Categorical(["d_g", "d_h", "f_h"]), + ), + ], +) +def test_combine_categories(label_df, cols, expected): + from scanpy.get._aggregated import _combine_categories + + result = _combine_categories(label_df, cols) + + assert isinstance(result, pd.Categorical) + + # assert set(result.cat.categories) == {"a_d", "b_d", "c_f"} + # assert set(result.codes) == {0, 1, 2} + + # expected = pd.Series(["a_d", "b_d", "c_f"], dtype="category") + # TODO: is there a better function here? 
+    pd.testing.assert_series_equal(pd.Series(result), pd.Series(expected))

From 9ae8f884eb219cdc56919e5e38f2411687329c54 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Tue, 28 Nov 2023 14:07:04 +0000
Subject: [PATCH 60/89] Start clean up

---
 scanpy/get/_aggregated.py       | 295 ++++++++------------------------
 scanpy/tests/test_aggregated.py | 132 +++++++++++++-
 2 files changed, 202 insertions(+), 225 deletions(-)

diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py
index 4d792cbc2..edf929e87 100644
--- a/scanpy/get/_aggregated.py
+++ b/scanpy/get/_aggregated.py
@@ -1,7 +1,6 @@
 from __future__ import annotations

 from collections.abc import Iterable, Sequence, Set
-from dataclasses import dataclass
 from functools import singledispatch
 from typing import TYPE_CHECKING, Literal, NamedTuple, get_args
 from typing import Union as _U
@@ -9,12 +8,12 @@
 import numpy as np
 import pandas as pd
 from anndata import AnnData, utils
-from scipy.sparse import coo_matrix, dia_matrix, spmatrix
+from scipy import sparse

 if TYPE_CHECKING:
     from numpy.typing import NDArray

-Array = _U[np.ndarray, spmatrix]
+Array = _U[np.ndarray, sparse.spmatrix]
 AggType = Literal["count", "mean", "sum", "var"]


@@ -31,7 +30,6 @@ class Indices(NamedTuple):
     weight_value: pd.Series | Array | None


-@dataclass
 class Aggregate:
     """\
     Functionality for generic grouping and aggregating.
@@ -62,20 +60,20 @@ class Aggregate:
         Data matrix for aggregation.
     weight
         Weights to be used for aggregation.
-    key_set
-        Subset of keys to which to filter.
     """

+    def __init__(self, groupby, data, weight=None):
+        self.groupby = groupby
+        self.indicator_matrix = sparse_indicator(groupby)
+        self.data = data
+        self.weight = weight
+
     groupby: pd.Series
     data: Array
     weight: pd.Series | Array
     key_set: Set[str] | None

-    def __post_init__(self):
-        if self.key_set is not None and not isinstance(self.key_set, Set):
-            self.key_set = dict.fromkeys(self.key_set).keys()
-
-    def count(self, *, _indices: Indices | None = None) -> np.ndarray:
+    def count(self) -> np.ndarray:
         """\
         Count the number of observations in each group.

@@ -83,9 +81,9 @@ def count(self, *, _indices: Indices | None = None) -> np.ndarray:
         -------
         Array of counts.
         """
-        if _indices is None:
-            _indices = self._extract_indices()
-        return np.bincount(_indices.key_index)
+        # pattern = self.data._with_data(np.broadcast_to(1, len(self.data.data)))
+        # return self.indicator_matrix @ pattern
+        return self.indicator_matrix @ (self.data != 0)

     def sum(self) -> Array:
         """\
@@ -95,10 +93,9 @@ def sum(self) -> Array:
         -------
         Array of sum.
         """
-        A, _ = self._sparse_aggregator(normalize=False)
-        return utils.asarray(A * self.data)
+        return utils.asarray(self.indicator_matrix @ self.data)

-    def mean(self, *, _A: spmatrix | None = None) -> Array:
+    def mean(self) -> Array:
         """\
         Compute the mean per feature per group of observations.

@@ -106,9 +103,10 @@ def mean(self, *, _A: spmatrix | None = None) -> Array:
         -------
         Array of mean.
         """
-        if _A is None:
-            _A, _ = self._sparse_aggregator(normalize=True)
-        return utils.asarray(_A @ self.data)
+        return (
+            utils.asarray(self.indicator_matrix @ self.data)
+            / np.bincount(self.groupby.codes)[:, None]
+        )

     def count_mean_var(self, dof: int = 1, *, _indices: Indices | None = None) -> CMV:
         """\
         Compute the count, as well as mean and variance per feature, per group of observations.
""" assert dof >= 0 - if _indices is None: - _indices = self._extract_indices() - A, _ = self._sparse_aggregator(normalize=True, _indices=_indices) - count_ = self.count(_indices=_indices) - mean_ = self.mean(_A=A) + count_ = self.count() + group_counts = np.bincount(self.groupby.codes) + mean_ = self.mean() # sparse matrices do not support ** for elementwise power. - mean_sq = utils.asarray(A @ _power(self.data, 2)) + mean_sq = utils.asarray(self.indicator_matrix @ _power(self.data, 2)) if self.weight is None: sq_mean = mean_**2 else: - A_unweighted, _ = Aggregate( - groupby=self.groupby, - data=self.data, - weight=self.weight, # TODO: why pass weights when creating unweighted A? - key_set=self.key_set, - )._sparse_aggregator() - mean_unweighted = utils.asarray(A_unweighted * self.data) + A_unweighted = sparse_indicator(self.groupby) + # , _ = Aggregate( + # groupby=self.groupby, + # data=self.data, + # weight=self.weight, # TODO: why pass weights when creating unweighted A? + # key_set=self.key_set, + # )._sparse_aggregator() + mean_unweighted = utils.asarray(A_unweighted @ self.data) sq_mean = 2 * mean_ * mean_unweighted + mean_unweighted**2 var_ = mean_sq - sq_mean # TODO: Why these values exactly? Because they are high relative to the datatype? @@ -155,111 +152,21 @@ def count_mean_var(self, dof: int = 1, *, _indices: Indices | None = None) -> CM # detects loss of precision in mean_sq - sq_mean, which suggests variance is 0 var_[precision * var_ < sq_mean] = 0 if dof != 0: - var_ *= (count_ / (count_ - dof))[:, np.newaxis] + var_ *= (group_counts / (group_counts - dof))[:, np.newaxis] return CMV(count=count_, mean=mean_, var=var_) - def _sparse_aggregator( - self, normalize: bool = False, *, _indices: Indices | None = None - ) -> tuple[coo_matrix, NDArray[np.floating]]: - """\ - Form a coordinate-sparse matrix A such that rows of A * X - are weighted sums of groups of rows of X. - - A[i, j] = w includes X[j,:] in group i with weight w. - Params - ------ - normalize - If true, weights for each group are normalized to sum to 1.0, - corresponding to (weighted) mean. - - Returns - ------- - A - weighted sums of groups of rows of X. - keys - An ndarray with keys[i] the group key corresponding to row i of A. - """ - if _indices is None: - _indices = self._extract_indices() - keys, key_index, df_index, weight_value = _indices - if df_index is None: - df_index = np.arange(len(key_index)) - if self.weight is None: - weight_value = np.broadcast_to(1.0, len(key_index)) - # TODO: why a coo matrix here and a dia matrix below? (unchanged from original code: https://github.com/scverse/anndata/pull/564) - A = coo_matrix( - (weight_value, (key_index, df_index)), - shape=(len(keys), self.data.shape[0]), - ) - if normalize: - n_row = A.shape[0] - row_sums = np.asarray(A.sum(axis=1)) - D = dia_matrix(((row_sums.T**-1), [0]), shape=(n_row, n_row)) - A = D * A - return A, keys - - def _filter_indices(self, indices: Indices) -> Indices: - """\ - Filter the values of keys, key_index, df_index, and optionally weight_value based on :attr:`key_set`. - - Params - ------ - keys - Unique key values to be filtered. - key_index - Non-unique integer indices mapping keys to the df_index to be filtered. - df_index - An Index that the keys + key_index constitute to be filtered. - weight_value, optional - Weight values to be filtered., by default None - - Returns - ------- - Filtered versions of all arguments. - - Raises - ------ - ValueError - If no keys in key_set found in keys. 
- """ - keys, key_index, df_index, weight_value = indices - keep = [i for i, k in enumerate(keys) if k in set(self.key_set)] - if len(keep) == 0: - raise ValueError("No keys in key_set found in keys.") - elif len(keep) < len(keys): - mask = np.in1d(key_index, keep) - remap = np.zeros(len(keys), dtype=np.int64) - for i, j in enumerate(keep): - remap[j] = i - keys = [keys[j] for j in keep] - key_index = np.array([remap[i] for i in key_index[mask]], dtype=np.int64) - df_index = df_index[mask] - if weight_value is not None: - weight_value = weight_value[mask] - return Indices(keys, key_index, df_index, weight_value) - - def _extract_indices(self) -> Indices: - """\ - Extract indices from attr:`groupby` with the goal of building a matrix - that can be multiplied with the data to produce an aggregation statistics e.g., mean or variance. - These are filtered if a :attr:`key_set` is present. - - Returns - ------- - Unique keys, an array mapping those unique keys to an index, said index, and a weight if present. - """ - key_value = self.groupby - keys, key_index = np.unique(_ndarray_from_seq(key_value), return_inverse=True) - df_index = np.arange(len(key_index)) - if self.weight is None: - weight_value = None - else: - weight_value = self.weight.values[df_index] - indices = Indices(keys, key_index, df_index, weight_value) - if self.key_set is None: - return indices - return self._filter_indices(indices) +# def count_mean_var_spd(by, data): +# sums = np.zeros((by.shape[0],data.shape[1])) +# counts = np.zeros((by.shape[0],data.shape[1])) +# sums = by.toarray() @ data +# counts = by.toarray() @ data._with_data(np.ones(len(data.data),dtype=data.data.dtype)) +# n_cells = np.array(by.sum(axis= 1).astype(data.dtype)) +# means = sums/n_cells +# sq_mean = by.toarray() @ data.multiply(data)/n_cells +# var = sq_mean - np.power(means, 2) +# var *= n_cells / (n_cells - 1) +# return sums, counts, means, var def _power(X: Array, power: float | int) -> Array: @@ -293,60 +200,6 @@ def _ndarray_from_seq(lst: Sequence): return arr -def _superset_columns(df: pd.DataFrame, groupby_key: str) -> list[str]: - """\ - Find all columns which are a superset of the key column. - - Params - ------ - df - DataFrame which contains candidate columns. - groupby_key - Key for column of which to find superset of columns. - - Returns - ------- - Superset columns. - """ - columns = [] - groupy_key_codes = df[groupby_key].astype("category") - for key in df: - if key != groupby_key: - key_codes = df[key].astype("category") - if all( - key_codes[groupy_key_codes == group_key_code].nunique() == 1 - for group_key_code in groupy_key_codes - ): - columns += [key] - return columns - - -def _df_grouped(df: pd.DataFrame, key: str, key_set: list[str]) -> pd.DataFrame: - """\ - Generate a grouped-by dataframe (no aggregation) by - a key with columns that are supersets of the key column. - - Params - ------ - df - DataFrame to be grouped. - key - Column to be grouped on. - key_set - Values in the `key` column to keep before groupby. - - Returns - ------- - pd.DataFrame: Grouped-by Dataframe. 
- """ - df = df.copy() - if key_set is not None: - df = df[df[key].isin(key_set)] - if isinstance(df[key].dtype, pd.CategoricalDtype): - df[key] = df[key].cat.remove_unused_categories() - return df.groupby(key).first()[_superset_columns(df, key)] - - @singledispatch def aggregated( adata: AnnData, @@ -355,7 +208,6 @@ def aggregated( *, dim: Literal["obs", "var"] = "obs", weight_key: str | None = None, - key_set: Iterable[str] | None = None, dof: int = 1, layer: str | None = None, obsm: str | None = None, @@ -395,15 +247,13 @@ def aggregated( Aggregated :class:`~anndata.AnnData`. """ data = adata.X - write_to_xxxm = None + # TODO replace with get helper if sum(p is not None for p in [varm, obsm, layer]) > 1: raise TypeError("Please only provide one (or none) of varm, obsm, or layer") if varm is not None: data = adata.varm[varm] - write_to_xxxm = True # the data will have to be transposed so this is accurate elif obsm is not None: data = adata.obsm[obsm] - write_to_xxxm = True elif layer is not None: data = adata.layers[layer] if dim == "var": @@ -416,74 +266,62 @@ def aggregated( groupby_df=getattr(adata, dim), dim=dim, by=by, - write_to_xxxm=write_to_xxxm, + # write_to_xxxm=write_to_xxxm, no_groupby_df=getattr(adata, "var" if dim == "obs" else "obs"), weight_key=weight_key, - key_set=key_set, + # key_set=key_set, func=func, dof=dof, ) @aggregated.register(np.ndarray) -@aggregated.register(spmatrix) +@aggregated.register(sparse.spmatrix) def aggregated_from_array( data, groupby_df: pd.DataFrame, func: AggType | Iterable[AggType], dim: str, by: str, - write_to_xxxm: bool, no_groupby_df: pd.DataFrame, weight_key: str | None = None, - key_set: Iterable[str] | None = None, dof: int = 1, ) -> AnnData: """Aggregate data based on one of the columns of one of a `~pd.DataFrame`.""" + categorical = _combine_categories(groupby_df, by) groupby = Aggregate( - groupby=groupby_df[by], + groupby=categorical, data=data, weight=groupby_df[weight_key] if weight_key is not None else None, - key_set=key_set, ) # groupby df is put in `obs`, nongroupby in `var` to be transposed later as appropriate adata_kw = dict( X=None, layers={}, - obs=_df_grouped(groupby_df, by, key_set), + obs=pd.DataFrame(index=categorical.categories), var=no_groupby_df, obsm={}, ) - write_key = "obsm" if write_to_xxxm else "layers" funcs = set([func] if isinstance(func, str) else func) if unknown := funcs - set(get_args(AggType)): raise ValueError(f"func {unknown} is not one of {get_args(AggType)}") if "sum" in funcs: # sum is calculated separately from the rest agg = groupby.sum() - # put aggregation in X if it is the only one and the aggregation data is not coming from `xxxm` - if len(funcs) == 1 and not write_to_xxxm: - adata_kw["X"] = agg - else: - adata_kw[write_key]["sum"] = agg + adata_kw["layers"]["sum"] = agg # here and below for count, if var is present, these can be calculate alongside var if "mean" in funcs and "var" not in funcs: agg = groupby.mean() - if len(funcs) == 1 and not write_to_xxxm: - adata_kw["X"] = agg - else: - adata_kw[write_key]["mean"] = agg + adata_kw["layers"]["mean"] = agg if "count" in funcs and "var" not in funcs: - adata_kw["obs"]["count"] = groupby.count() # count goes in dim df + adata_kw["layers"]["count"] = groupby.count() # count goes in dim df if "var" in funcs: aggs = groupby.count_mean_var(dof) - if len(funcs) == 1 and not write_to_xxxm: - adata_kw["X"] = aggs.var - else: - adata_kw[write_key]["var"] = aggs.var - if "mean" in funcs: - adata_kw[write_key]["mean"] = aggs.mean - if "count" in 
funcs: - adata_kw["obs"]["count"] = aggs.count + adata_kw["layers"]["var"] = aggs.var + if "mean" in funcs: + adata_kw["layers"]["mean"] = aggs.mean + if "count" in funcs: + adata_kw["layers"]["count"] = aggs.count + adata_agg = AnnData(**adata_kw) if dim == "var": return adata_agg.T @@ -493,11 +331,14 @@ def aggregated_from_array( def _combine_categories(label_df: pd.DataFrame, cols: list[str]) -> pd.Categorical: from itertools import product + if isinstance(cols, str): + cols = [cols] + df = pd.DataFrame( {c: pd.Categorical(label_df[c]).remove_unused_categories() for c in cols}, ) result_categories = [ - "_".join(x) for x in product(*[df[c].cat.categories for c in cols]) + "_".join(map(str, x)) for x in product(*[df[c].cat.categories for c in cols]) ] n_categories = [len(df[c].cat.categories) for c in cols] @@ -513,3 +354,15 @@ def _combine_categories(label_df: pd.DataFrame, cols: list[str]) -> pd.Categoric return pd.Categorical.from_codes( final_codes, categories=result_categories ).remove_unused_categories() + + +def sparse_indicator( + categorical, weights: None | np.ndarray = None +) -> sparse.coo_matrix: + if weights is None: + weights = np.broadcast_to(1.0, len(categorical)) + A = sparse.coo_matrix( + (weights, (categorical.codes, np.arange(len(categorical)))), + shape=(len(categorical.categories), len(categorical)), + ) + return A diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index 646513c94..8daee21ed 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -7,6 +7,8 @@ from scipy.sparse import csr_matrix import scanpy as sc +from scanpy.testing._helpers import assert_equal +from scanpy.testing._helpers.data import pbmc3k_processed @pytest.fixture @@ -67,6 +69,35 @@ def gen_adata(data_key, dim, df_base, df_groupby, X): return adata_sparse, adata_dense +# TODO: There isn't an exact equivalent for our count operation in pandas I think (i.e. 
count non-zero values) +@pytest.mark.parametrize("metric", ["sum", "mean", "var"]) +def test_aggregated_vs_pandas(metric): + adata = pbmc3k_processed().raw.to_adata() + adata.obs["percent_mito_binned"] = pd.cut(adata.obs["percent_mito"], bins=10) + result = sc.get.aggregated(adata, ["louvain", "percent_mito_binned"], metric) + + expected = ( + adata.to_df() + .astype(np.float64) + .join(adata.obs[["louvain", "percent_mito_binned"]]) + .groupby(["louvain", "percent_mito_binned"], observed=True) + .agg(metric) + ) + # TODO: figure out the axis names + expected.index = expected.index.to_frame().apply( + lambda x: "_".join(map(str, x)), axis=1 + ) + expected.index.name = None + expected.columns.name = None + + result_df = result.to_df(layer=metric) + result_df.index.name = None + result_df.columns.name = None + # expected = adata.to_df().groupby(adata.obs[["louvain", "percent_mito_binned"]]).agg(metric) + + assert_equal(result_df, expected) + + @pytest.mark.parametrize("data_key", ["layers", "obsm", "varm", "X"]) @pytest.mark.parametrize("dim", ["obs", "var"]) def test_groupby(data_key, dim, df_base, df_groupby, X): @@ -192,6 +223,103 @@ def get_single_agg(adata, key, agg): ) # returns for both columns but counts only needs one because it is the same +@pytest.mark.parametrize( + "matrix,df,keys,metrics,expected", + [ + ( + np.block( + [ + [np.ones((2, 2)), np.zeros((2, 2))], + [np.zeros((2, 2)), np.ones((2, 2))], + ] + ), + pd.DataFrame( + { + "a": ["a", "a", "b", "b"], + "b": ["c", "d", "d", "d"], + } + ), + ["a", "b"], + ["count"], # , "sum", "mean"], + ad.AnnData( + obs=pd.DataFrame(index=["a_c", "a_d", "b_d"]), + var=pd.DataFrame(index=[f"gene_{i}" for i in range(4)]), + layers={ + "count": np.array([[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 2, 2]]), + # "sum": np.array([[2, 0], [0, 2]]), + # "mean": np.array([[1, 0], [0, 1]]), + }, + ), + ), + ( + np.block( + [ + [np.ones((2, 2)), np.zeros((2, 2))], + [np.zeros((2, 2)), np.ones((2, 2))], + ] + ), + pd.DataFrame( + { + "a": ["a", "a", "b", "b"], + "b": ["c", "d", "d", "d"], + } + ), + ["a", "b"], + ["sum", "mean", "count"], + ad.AnnData( + obs=pd.DataFrame(index=["a_c", "a_d", "b_d"]), + var=pd.DataFrame(index=[f"gene_{i}" for i in range(4)]), + layers={ + "sum": np.array([[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 2, 2]]), + "mean": np.array([[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 1, 1]]), + "count": np.array([[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 2, 2]]), + }, + ), + ), + ( + np.block( + [ + [np.ones((2, 2)), np.zeros((2, 2))], + [np.zeros((2, 2)), np.ones((2, 2))], + ] + ), + pd.DataFrame( + { + "a": ["a", "a", "b", "b"], + "b": ["c", "d", "d", "d"], + } + ), + ["a", "b"], + ["mean"], # , "sum", "mean"], + ad.AnnData( + obs=pd.DataFrame(index=["a_c", "a_d", "b_d"]), + var=pd.DataFrame(index=[f"gene_{i}" for i in range(4)]), + layers={ + "mean": np.array([[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 1, 1]]), + # "sum": np.array([[2, 0], [0, 2]]), + # "mean": np.array([[1, 0], [0, 1]]), + }, + ), + ), + ], +) +def test_aggregated_parameterized(matrix, df, keys, metrics, expected): + adata = ad.AnnData( + X=matrix, + obs=df, + var=pd.DataFrame(index=[f"gene_{i}" for i in range(matrix.shape[1])]), + ) + result = sc.get.aggregated(adata, by=keys, func=metrics) + + print(result) + print(expected) + + for k in metrics: + assert_equal(result.layers[k], expected.layers[k]) + + assert_equal(expected, result) + + @pytest.mark.parametrize( "label_df,cols,expected", [ @@ -247,9 +375,5 @@ def test_combine_categories(label_df, cols, expected): assert isinstance(result, 
pd.Categorical) - # assert set(result.cat.categories) == {"a_d", "b_d", "c_f"} - # assert set(result.codes) == {0, 1, 2} - - # expected = pd.Series(["a_d", "b_d", "c_f"], dtype="category") # TODO: is there a better function here? pd.testing.assert_series_equal(pd.Series(result), pd.Series(expected)) From 461f049dc12026388ba6a2370ddb4b83e48e6d39 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 28 Nov 2023 14:54:27 +0000 Subject: [PATCH 61/89] Add array types to tests --- scanpy/tests/test_aggregated.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index 8daee21ed..9f866c3e6 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -9,6 +9,7 @@ import scanpy as sc from scanpy.testing._helpers import assert_equal from scanpy.testing._helpers.data import pbmc3k_processed +from scanpy.testing._pytest.params import ARRAY_TYPES_MEM @pytest.fixture @@ -70,9 +71,11 @@ def gen_adata(data_key, dim, df_base, df_groupby, X): # TODO: There isn't an exact equivalent for our count operation in pandas I think (i.e. count non-zero values) +@pytest.mark.parametrize("array_type", ARRAY_TYPES_MEM) @pytest.mark.parametrize("metric", ["sum", "mean", "var"]) -def test_aggregated_vs_pandas(metric): +def test_aggregated_vs_pandas(metric, array_type): adata = pbmc3k_processed().raw.to_adata() + adata.X = array_type(adata.X) adata.obs["percent_mito_binned"] = pd.cut(adata.obs["percent_mito"], bins=10) result = sc.get.aggregated(adata, ["louvain", "percent_mito_binned"], metric) From dc4e9204e3888ec577fa404de2ddab817964d22d Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 28 Nov 2023 14:54:40 +0000 Subject: [PATCH 62/89] Fix typo --- scanpy/get/_aggregated.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index edf929e87..a75f84567 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -59,7 +59,7 @@ class Aggregate: data Data matrix for aggregation. weight - Weights to be used for aggergation. + Weights to be used for aggregation. """ def __init__(self, groupby, data, weight=None): From 9c7db293c32e2fdce259bf5be253a5647b54dbca Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 28 Nov 2023 16:44:32 +0000 Subject: [PATCH 63/89] Fix variance calculation --- scanpy/get/_aggregated.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index a75f84567..e8f73fd37 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -132,7 +132,10 @@ def count_mean_var(self, dof: int = 1, *, _indices: Indices | None = None) -> CM group_counts = np.bincount(self.groupby.codes) mean_ = self.mean() # sparse matrices do not support ** for elementwise power. 
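The change just below divides the aggregated squared values by the per-group sizes, so that the identity Var(X) = E[X^2] - E[X]^2 is evaluated with true group means on both sides. A minimal NumPy sketch of the corrected computation, on toy data, with `indicator` and `group_counts` standing in for the class attributes:

    import numpy as np
    from scipy import sparse

    # two groups of two observations, two features
    X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]])
    codes = np.array([0, 0, 1, 1])
    indicator = sparse.coo_matrix(
        (np.ones(4), (codes, np.arange(4))), shape=(2, 4)
    )
    group_counts = np.bincount(codes)

    mean_ = (indicator @ X) / group_counts[:, None]
    mean_sq = (indicator @ X**2) / group_counts[:, None]  # E[X^2], the term fixed here
    var_ = mean_sq - mean_**2                             # dof=0 variance
    var_ *= (group_counts / (group_counts - 1))[:, None]  # dof=1 correction
    # var_ == [[2., 2.], [2., 2.]], matching np.var(..., ddof=1) per group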
- mean_sq = utils.asarray(self.indicator_matrix @ _power(self.data, 2)) + mean_sq = ( + utils.asarray(self.indicator_matrix @ _power(self.data, 2)) + / group_counts[:, None] + ) if self.weight is None: sq_mean = mean_**2 else: From 7012042c8361823f1be859e45f52d487e8f23722 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 29 Nov 2023 10:55:20 +0000 Subject: [PATCH 64/89] Get tests passing --- scanpy/tests/test_aggregated.py | 254 ++++++++++++++++---------------- 1 file changed, 127 insertions(+), 127 deletions(-) diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index 9f866c3e6..3659e05ac 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -70,6 +70,7 @@ def gen_adata(data_key, dim, df_base, df_groupby, X): return adata_sparse, adata_dense +# TODO: test count_nonzero # TODO: There isn't an exact equivalent for our count operation in pandas I think (i.e. count non-zero values) @pytest.mark.parametrize("array_type", ARRAY_TYPES_MEM) @pytest.mark.parametrize("metric", ["sum", "mean", "var"]) @@ -96,134 +97,133 @@ def test_aggregated_vs_pandas(metric, array_type): result_df = result.to_df(layer=metric) result_df.index.name = None result_df.columns.name = None - # expected = adata.to_df().groupby(adata.obs[["louvain", "percent_mito_binned"]]).agg(metric) - assert_equal(result_df, expected) - - -@pytest.mark.parametrize("data_key", ["layers", "obsm", "varm", "X"]) -@pytest.mark.parametrize("dim", ["obs", "var"]) -def test_groupby(data_key, dim, df_base, df_groupby, X): - adata_sparse, adata_dense = gen_adata(data_key, dim, df_base, df_groupby, X) - - data_loc_dict = ( - {(data_key if data_key != "layers" else "layer"): "test"} - if data_key != "X" - else {} - ) - # When `X` is not the `data_key`, the multi-aggregation data is colocated with the `data_key`. Otherwise it is in `layers`. 
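The pandas cross-check above has to flatten the grouped MultiIndex into the same `<key1>_<key2>` labels that the aggregation writes to the observation names. That flattening step in isolation, on a hypothetical toy frame:

    import pandas as pd

    idx = pd.MultiIndex.from_tuples(
        [("a", "c"), ("a", "d"), ("b", "d")], names=["louvain", "bin"]
    )
    expected = pd.DataFrame({"gene_0": [1.0, 2.0, 3.0]}, index=idx)
    # join the index levels into flat "<louvain>_<bin>" labels
    expected.index = expected.index.to_frame().apply(
        lambda x: "_".join(map(str, x)), axis=1
    )
    # expected.index is now Index(['a_c', 'a_d', 'b_d'], dtype='object')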
- multi_agg_data_loc_key = data_key if data_key != "X" else "layers" - - stats_sparse, stats_dense = ( - sc.get.aggregated( - adata, - by="key", - dim=dim, - func=["count", "mean", "var"], - **data_loc_dict, - ) - for adata in [adata_sparse, adata_dense] - ) - - # superset columns can be kept but not subsets - assert "key_superset" in getattr(stats_sparse, dim) - assert "key_subset" not in getattr(stats_sparse, dim) - - assert np.allclose( - getattr(stats_sparse, dim)["count"], - getattr(stats_dense, dim)["count"], - ) - assert np.allclose( - getattr(stats_sparse, multi_agg_data_loc_key)["mean"], - getattr(stats_dense, multi_agg_data_loc_key)["mean"], - ) - assert np.allclose( - getattr(stats_sparse, multi_agg_data_loc_key)["var"], - getattr(stats_dense, multi_agg_data_loc_key)["var"], - equal_nan=True, - ) - - stats_weight = sc.get.aggregated( - adata_dense, - by="key", - dim=dim, - func=["count", "mean", "var"], - weight_key="weight", - **data_loc_dict, - ) - sum_ = sc.get.aggregated( - adata_sparse, by="key", dim=dim, func="sum", **data_loc_dict - ) - sum_weight = sc.get.aggregated( - adata_dense, - by="key", - dim=dim, - func="sum", - weight_key="weight", - **data_loc_dict, - ) - - def get_single_agg(adata, key, agg): - # Get the data of the aggregation from the correct location when only one `func` is passed in to `aggregated` - if (key != "obsm" and key != "varm") or data_key == "X": - return adata.X - return getattr(adata, key)[agg] - - assert np.allclose( - 2 * get_single_agg(sum_, data_key, "sum"), - get_single_agg(sum_weight, data_key, "sum"), - ) - assert np.allclose( - getattr(stats_sparse, multi_agg_data_loc_key)["mean"], - getattr(stats_weight, multi_agg_data_loc_key)["mean"], - ) - assert np.allclose( - getattr(stats_sparse, multi_agg_data_loc_key)["var"], - getattr(stats_dense, multi_agg_data_loc_key)["var"], - equal_nan=True, - ) - - key_set = ["v", "w"] - mean_key_set_adata = sc.get.aggregated( - adata_dense, - by="key", - dim=dim, - func="mean", - key_set=key_set, - **data_loc_dict, - ) - subset_idx = getattr(stats_sparse, dim).index.isin(key_set) - subset_adata = ( - stats_sparse[subset_idx, :] if dim == "obs" else stats_sparse[:, subset_idx] - ) - subset_mean = getattr(subset_adata, multi_agg_data_loc_key)["mean"] - key_set_mean = get_single_agg(mean_key_set_adata, data_key, "mean") - - assert np.allclose(subset_mean, key_set_mean) - - df = pd.DataFrame( - index=getattr(adata_dense, dim)["key"], - columns=getattr(adata_dense, f"{'var' if dim == 'obs' else 'obs'}_names"), - data=X, - ) - grouped_agg_df = ( - df.groupby("key") - .agg(["count", "mean", "var"]) - .swaplevel(axis=1) - .sort_index(axis=1) - ) - mean = getattr(stats_dense, multi_agg_data_loc_key)["mean"] - if dim == "var" and data_key != "varm": - mean = mean.T - assert np.allclose(mean, grouped_agg_df["mean"].values) - var = getattr(stats_dense, multi_agg_data_loc_key)["var"] - if dim == "var" and multi_agg_data_loc_key != "varm": - var = var.T - assert np.allclose(var, grouped_agg_df["var"].values, equal_nan=True) - assert np.allclose( - getattr(stats_dense, dim)["count"], - grouped_agg_df["count"]["A"].values, - ) # returns for both columns but counts only needs one because it is the same + pd.testing.assert_frame_equal(result_df, expected, check_dtype=False, atol=1e-5) + + +# @pytest.mark.parametrize("data_key", ["layers", "obsm", "varm", "X"]) +# @pytest.mark.parametrize("dim", ["obs", "var"]) +# def test_groupby(data_key, dim, df_base, df_groupby, X): +# adata_sparse, adata_dense = gen_adata(data_key, 
dim, df_base, df_groupby, X) + +# data_loc_dict = ( +# {(data_key if data_key != "layers" else "layer"): "test"} +# if data_key != "X" +# else {} +# ) +# # When `X` is not the `data_key`, the multi-aggregation data is colocated with the `data_key`. Otherwise it is in `layers`. +# multi_agg_data_loc_key = data_key if data_key != "X" else "layers" + +# stats_sparse, stats_dense = ( +# sc.get.aggregated( +# adata, +# by="key", +# dim=dim, +# func=["count", "mean", "var"], +# **data_loc_dict, +# ) +# for adata in [adata_sparse, adata_dense] +# ) + +# # superset columns can be kept but not subsets +# assert "key_superset" in getattr(stats_sparse, dim) +# assert "key_subset" not in getattr(stats_sparse, dim) + +# assert np.allclose( +# getattr(stats_sparse, dim)["count"], +# getattr(stats_dense, dim)["count"], +# ) +# assert np.allclose( +# getattr(stats_sparse, multi_agg_data_loc_key)["mean"], +# getattr(stats_dense, multi_agg_data_loc_key)["mean"], +# ) +# assert np.allclose( +# getattr(stats_sparse, multi_agg_data_loc_key)["var"], +# getattr(stats_dense, multi_agg_data_loc_key)["var"], +# equal_nan=True, +# ) + +# stats_weight = sc.get.aggregated( +# adata_dense, +# by="key", +# dim=dim, +# func=["count", "mean", "var"], +# weight_key="weight", +# **data_loc_dict, +# ) +# sum_ = sc.get.aggregated( +# adata_sparse, by="key", dim=dim, func="sum", **data_loc_dict +# ) +# sum_weight = sc.get.aggregated( +# adata_dense, +# by="key", +# dim=dim, +# func="sum", +# weight_key="weight", +# **data_loc_dict, +# ) + +# def get_single_agg(adata, key, agg): +# # Get the data of the aggregation from the correct location when only one `func` is passed in to `aggregated` +# if (key != "obsm" and key != "varm") or data_key == "X": +# return adata.X +# return getattr(adata, key)[agg] + +# assert np.allclose( +# 2 * get_single_agg(sum_, data_key, "sum"), +# get_single_agg(sum_weight, data_key, "sum"), +# ) +# assert np.allclose( +# getattr(stats_sparse, multi_agg_data_loc_key)["mean"], +# getattr(stats_weight, multi_agg_data_loc_key)["mean"], +# ) +# assert np.allclose( +# getattr(stats_sparse, multi_agg_data_loc_key)["var"], +# getattr(stats_dense, multi_agg_data_loc_key)["var"], +# equal_nan=True, +# ) + +# key_set = ["v", "w"] +# mean_key_set_adata = sc.get.aggregated( +# adata_dense, +# by="key", +# dim=dim, +# func="mean", +# key_set=key_set, +# **data_loc_dict, +# ) +# subset_idx = getattr(stats_sparse, dim).index.isin(key_set) +# subset_adata = ( +# stats_sparse[subset_idx, :] if dim == "obs" else stats_sparse[:, subset_idx] +# ) +# subset_mean = getattr(subset_adata, multi_agg_data_loc_key)["mean"] +# key_set_mean = get_single_agg(mean_key_set_adata, data_key, "mean") + +# assert np.allclose(subset_mean, key_set_mean) + +# df = pd.DataFrame( +# index=getattr(adata_dense, dim)["key"], +# columns=getattr(adata_dense, f"{'var' if dim == 'obs' else 'obs'}_names"), +# data=X, +# ) +# grouped_agg_df = ( +# df.groupby("key") +# .agg(["count", "mean", "var"]) +# .swaplevel(axis=1) +# .sort_index(axis=1) +# ) +# mean = getattr(stats_dense, multi_agg_data_loc_key)["mean"] +# if dim == "var" and data_key != "varm": +# mean = mean.T +# assert np.allclose(mean, grouped_agg_df["mean"].values) +# var = getattr(stats_dense, multi_agg_data_loc_key)["var"] +# if dim == "var" and multi_agg_data_loc_key != "varm": +# var = var.T +# assert np.allclose(var, grouped_agg_df["var"].values, equal_nan=True) +# assert np.allclose( +# getattr(stats_dense, dim)["count"], +# grouped_agg_df["count"]["A"].values, +# ) # returns for both 
columns but counts only needs one because it is the same @pytest.mark.parametrize( From 487e4c5f52a026d7228cf3e11b389f77da2ec846 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 30 Nov 2023 00:25:15 +0100 Subject: [PATCH 65/89] count_nonzero + test speed --- scanpy/get/_aggregated.py | 14 ++++----- scanpy/tests/test_aggregated.py | 51 ++++++++++++++++++++------------- 2 files changed, 38 insertions(+), 27 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index e8f73fd37..6c4db6d0e 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -14,7 +14,7 @@ from numpy.typing import NDArray Array = _U[np.ndarray, sparse.spmatrix] -AggType = Literal["count", "mean", "sum", "var"] +AggType = Literal["count_nonzero", "mean", "sum", "var"] class CMV(NamedTuple): @@ -73,7 +73,7 @@ def __init__(self, groupby, data, weight=None): weight: pd.Series | Array key_set: Set[str] | None - def count(self) -> np.ndarray: + def count_nonzero(self) -> np.ndarray: """\ Count the number of observations in each group. @@ -128,7 +128,7 @@ def count_mean_var(self, dof: int = 1, *, _indices: Indices | None = None) -> CM Object with `count`, `mean`, and `var` attributes. """ assert dof >= 0 - count_ = self.count() + count_ = self.count_nonzero() group_counts = np.bincount(self.groupby.codes) mean_ = self.mean() # sparse matrices do not support ** for elementwise power. @@ -315,15 +315,15 @@ def aggregated_from_array( if "mean" in funcs and "var" not in funcs: agg = groupby.mean() adata_kw["layers"]["mean"] = agg - if "count" in funcs and "var" not in funcs: - adata_kw["layers"]["count"] = groupby.count() # count goes in dim df + if "count_nonzero" in funcs and "var" not in funcs: + adata_kw["layers"]["count_nonzero"] = groupby.count_nonzero() if "var" in funcs: aggs = groupby.count_mean_var(dof) adata_kw["layers"]["var"] = aggs.var if "mean" in funcs: adata_kw["layers"]["mean"] = aggs.mean - if "count" in funcs: - adata_kw["layers"]["count"] = aggs.count + if "count_nonzero" in funcs: + adata_kw["layers"]["count_nonzero"] = aggs.count adata_agg = AnnData(**adata_kw) if dim == "var": diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index 3659e05ac..ca6a6b5b2 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -73,20 +73,32 @@ def gen_adata(data_key, dim, df_base, df_groupby, X): # TODO: test count_nonzero # TODO: There isn't an exact equivalent for our count operation in pandas I think (i.e. 
count non-zero values) @pytest.mark.parametrize("array_type", ARRAY_TYPES_MEM) -@pytest.mark.parametrize("metric", ["sum", "mean", "var"]) +@pytest.mark.parametrize("metric", ["sum", "mean", "var", "count_nonzero"]) def test_aggregated_vs_pandas(metric, array_type): adata = pbmc3k_processed().raw.to_adata() + adata = adata[ + adata.obs["louvain"].isin(adata.obs["louvain"].cat.categories[:5]), :1_000 + ].copy() adata.X = array_type(adata.X) - adata.obs["percent_mito_binned"] = pd.cut(adata.obs["percent_mito"], bins=10) + adata.obs["percent_mito_binned"] = pd.cut(adata.obs["percent_mito"], bins=5) result = sc.get.aggregated(adata, ["louvain", "percent_mito_binned"], metric) - expected = ( - adata.to_df() - .astype(np.float64) - .join(adata.obs[["louvain", "percent_mito_binned"]]) - .groupby(["louvain", "percent_mito_binned"], observed=True) - .agg(metric) - ) + if metric == "count_nonzero": + expected = ( + (adata.to_df() != 0) + .astype(np.float64) + .join(adata.obs[["louvain", "percent_mito_binned"]]) + .groupby(["louvain", "percent_mito_binned"], observed=True) + .agg("sum") + ) + else: + expected = ( + adata.to_df() + .astype(np.float64) + .join(adata.obs[["louvain", "percent_mito_binned"]]) + .groupby(["louvain", "percent_mito_binned"], observed=True) + .agg(metric) + ) # TODO: figure out the axis names expected.index = expected.index.to_frame().apply( lambda x: "_".join(map(str, x)), axis=1 @@ -243,12 +255,14 @@ def test_aggregated_vs_pandas(metric, array_type): } ), ["a", "b"], - ["count"], # , "sum", "mean"], + ["count_nonzero"], # , "sum", "mean"], ad.AnnData( obs=pd.DataFrame(index=["a_c", "a_d", "b_d"]), var=pd.DataFrame(index=[f"gene_{i}" for i in range(4)]), layers={ - "count": np.array([[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 2, 2]]), + "count_nonzero": np.array( + [[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 2, 2]] + ), # "sum": np.array([[2, 0], [0, 2]]), # "mean": np.array([[1, 0], [0, 1]]), }, @@ -268,14 +282,16 @@ def test_aggregated_vs_pandas(metric, array_type): } ), ["a", "b"], - ["sum", "mean", "count"], + ["sum", "mean", "count_nonzero"], ad.AnnData( obs=pd.DataFrame(index=["a_c", "a_d", "b_d"]), var=pd.DataFrame(index=[f"gene_{i}" for i in range(4)]), layers={ "sum": np.array([[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 2, 2]]), "mean": np.array([[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 1, 1]]), - "count": np.array([[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 2, 2]]), + "count_nonzero": np.array( + [[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 2, 2]] + ), }, ), ), @@ -293,20 +309,18 @@ def test_aggregated_vs_pandas(metric, array_type): } ), ["a", "b"], - ["mean"], # , "sum", "mean"], + ["mean"], ad.AnnData( obs=pd.DataFrame(index=["a_c", "a_d", "b_d"]), var=pd.DataFrame(index=[f"gene_{i}" for i in range(4)]), layers={ "mean": np.array([[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 1, 1]]), - # "sum": np.array([[2, 0], [0, 2]]), - # "mean": np.array([[1, 0], [0, 1]]), }, ), ), ], ) -def test_aggregated_parameterized(matrix, df, keys, metrics, expected): +def test_aggregated_examples(matrix, df, keys, metrics, expected): adata = ad.AnnData( X=matrix, obs=df, @@ -317,9 +331,6 @@ def test_aggregated_parameterized(matrix, df, keys, metrics, expected): print(result) print(expected) - for k in metrics: - assert_equal(result.layers[k], expected.layers[k]) - assert_equal(expected, result) From b2284515144b564fa7cfbb9fbf81e8a6bc102f92 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 5 Dec 2023 18:01:16 +0100 Subject: [PATCH 66/89] Fix up mean_var --- scanpy/get/_aggregated.py | 27 ++++++++------------------- 1 file changed, 8 
insertions(+), 19 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 6c4db6d0e..01f25ddb2 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -2,7 +2,7 @@ from collections.abc import Iterable, Sequence, Set from functools import singledispatch -from typing import TYPE_CHECKING, Literal, NamedTuple, get_args +from typing import Literal, NamedTuple, get_args from typing import Union as _U import numpy as np @@ -10,19 +10,10 @@ from anndata import AnnData, utils from scipy import sparse -if TYPE_CHECKING: - from numpy.typing import NDArray - Array = _U[np.ndarray, sparse.spmatrix] AggType = Literal["count_nonzero", "mean", "sum", "var"] -class CMV(NamedTuple): - count: NDArray[np.integer] - mean: NDArray[np.floating] - var: NDArray[np.floating] - - class Indices(NamedTuple): keys: np.ndarray key_index: np.ndarray @@ -108,7 +99,7 @@ def mean(self) -> Array: / np.bincount(self.groupby.codes)[:, None] ) - def count_mean_var(self, dof: int = 1, *, _indices: Indices | None = None) -> CMV: + def mean_var(self, dof: int = 1) -> tuple[np.ndarray, np.ndarray]: """\ Compute the count, as well as mean and variance per feature, per group of observations. @@ -128,7 +119,7 @@ def count_mean_var(self, dof: int = 1, *, _indices: Indices | None = None) -> CM Object with `count`, `mean`, and `var` attributes. """ assert dof >= 0 - count_ = self.count_nonzero() + group_counts = np.bincount(self.groupby.codes) mean_ = self.mean() # sparse matrices do not support ** for elementwise power. @@ -156,7 +147,7 @@ def count_mean_var(self, dof: int = 1, *, _indices: Indices | None = None) -> CM var_[precision * var_ < sq_mean] = 0 if dof != 0: var_ *= (group_counts / (group_counts - dof))[:, np.newaxis] - return CMV(count=count_, mean=mean_, var=var_) + return mean_, var_ # def count_mean_var_spd(by, data): @@ -315,15 +306,13 @@ def aggregated_from_array( if "mean" in funcs and "var" not in funcs: agg = groupby.mean() adata_kw["layers"]["mean"] = agg - if "count_nonzero" in funcs and "var" not in funcs: + if "count_nonzero" in funcs: adata_kw["layers"]["count_nonzero"] = groupby.count_nonzero() if "var" in funcs: - aggs = groupby.count_mean_var(dof) - adata_kw["layers"]["var"] = aggs.var + mean_, var_ = groupby.mean_var(dof) + adata_kw["layers"]["var"] = var_ if "mean" in funcs: - adata_kw["layers"]["mean"] = aggs.mean - if "count_nonzero" in funcs: - adata_kw["layers"]["count_nonzero"] = aggs.count + adata_kw["layers"]["mean"] = mean_ adata_agg = AnnData(**adata_kw) if dim == "var": From 6276f8ca9b1d91f7c80ef1901076783361bb31fd Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 6 Dec 2023 14:31:25 +0100 Subject: [PATCH 67/89] Add test for dim --- scanpy/tests/test_aggregated.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index ca6a6b5b2..a01d8d0c5 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -70,8 +70,6 @@ def gen_adata(data_key, dim, df_base, df_groupby, X): return adata_sparse, adata_dense -# TODO: test count_nonzero -# TODO: There isn't an exact equivalent for our count operation in pandas I think (i.e. 
count non-zero values) @pytest.mark.parametrize("array_type", ARRAY_TYPES_MEM) @pytest.mark.parametrize("metric", ["sum", "mean", "var", "count_nonzero"]) def test_aggregated_vs_pandas(metric, array_type): @@ -113,6 +111,20 @@ def test_aggregated_vs_pandas(metric, array_type): pd.testing.assert_frame_equal(result_df, expected, check_dtype=False, atol=1e-5) +@pytest.mark.parametrize("array_type", ARRAY_TYPES_MEM) +@pytest.mark.parametrize("metric", ["sum", "mean", "var", "count_nonzero"]) +def test_aggregated_axis(array_type, metric): + adata = pbmc3k_processed().raw.to_adata() + adata = adata[ + adata.obs["louvain"].isin(adata.obs["louvain"].cat.categories[:5]), :1_000 + ].copy() + adata.X = array_type(adata.X) + expected = sc.get.aggregated(adata, ["louvain"], metric) + actual = sc.get.aggregated(adata.T, ["louvain"], metric, dim="var").T + + assert_equal(expected, actual) + + # @pytest.mark.parametrize("data_key", ["layers", "obsm", "varm", "X"]) # @pytest.mark.parametrize("dim", ["obs", "var"]) # def test_groupby(data_key, dim, df_base, df_groupby, X): From 4ea5471abd725b94686c3ed8b9d9aa11dc96b1f9 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 6 Dec 2023 14:32:04 +0100 Subject: [PATCH 68/89] Remove old test --- scanpy/tests/test_aggregated.py | 125 -------------------------------- 1 file changed, 125 deletions(-) diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index a01d8d0c5..b7d3fb44f 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -125,131 +125,6 @@ def test_aggregated_axis(array_type, metric): assert_equal(expected, actual) -# @pytest.mark.parametrize("data_key", ["layers", "obsm", "varm", "X"]) -# @pytest.mark.parametrize("dim", ["obs", "var"]) -# def test_groupby(data_key, dim, df_base, df_groupby, X): -# adata_sparse, adata_dense = gen_adata(data_key, dim, df_base, df_groupby, X) - -# data_loc_dict = ( -# {(data_key if data_key != "layers" else "layer"): "test"} -# if data_key != "X" -# else {} -# ) -# # When `X` is not the `data_key`, the multi-aggregation data is colocated with the `data_key`. Otherwise it is in `layers`. 
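`test_aggregated_axis` above leans on the fact that grouping the variables of `adata.T` is the same computation as grouping the observations of `adata`, up to a final transpose. A toy check of that identity in plain NumPy (hypothetical names):

    import numpy as np

    X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    codes = np.array([0, 0, 1])
    groups = (0, 1)

    # group the rows of X, then group the columns of X.T: same result
    row_sums = np.stack([X[codes == g].sum(axis=0) for g in groups])
    col_sums_of_T = np.stack([X.T[:, codes == g].sum(axis=1) for g in groups])
    assert np.allclose(row_sums, col_sums_of_T)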
-# multi_agg_data_loc_key = data_key if data_key != "X" else "layers" - -# stats_sparse, stats_dense = ( -# sc.get.aggregated( -# adata, -# by="key", -# dim=dim, -# func=["count", "mean", "var"], -# **data_loc_dict, -# ) -# for adata in [adata_sparse, adata_dense] -# ) - -# # superset columns can be kept but not subsets -# assert "key_superset" in getattr(stats_sparse, dim) -# assert "key_subset" not in getattr(stats_sparse, dim) - -# assert np.allclose( -# getattr(stats_sparse, dim)["count"], -# getattr(stats_dense, dim)["count"], -# ) -# assert np.allclose( -# getattr(stats_sparse, multi_agg_data_loc_key)["mean"], -# getattr(stats_dense, multi_agg_data_loc_key)["mean"], -# ) -# assert np.allclose( -# getattr(stats_sparse, multi_agg_data_loc_key)["var"], -# getattr(stats_dense, multi_agg_data_loc_key)["var"], -# equal_nan=True, -# ) - -# stats_weight = sc.get.aggregated( -# adata_dense, -# by="key", -# dim=dim, -# func=["count", "mean", "var"], -# weight_key="weight", -# **data_loc_dict, -# ) -# sum_ = sc.get.aggregated( -# adata_sparse, by="key", dim=dim, func="sum", **data_loc_dict -# ) -# sum_weight = sc.get.aggregated( -# adata_dense, -# by="key", -# dim=dim, -# func="sum", -# weight_key="weight", -# **data_loc_dict, -# ) - -# def get_single_agg(adata, key, agg): -# # Get the data of the aggregation from the correct location when only one `func` is passed in to `aggregated` -# if (key != "obsm" and key != "varm") or data_key == "X": -# return adata.X -# return getattr(adata, key)[agg] - -# assert np.allclose( -# 2 * get_single_agg(sum_, data_key, "sum"), -# get_single_agg(sum_weight, data_key, "sum"), -# ) -# assert np.allclose( -# getattr(stats_sparse, multi_agg_data_loc_key)["mean"], -# getattr(stats_weight, multi_agg_data_loc_key)["mean"], -# ) -# assert np.allclose( -# getattr(stats_sparse, multi_agg_data_loc_key)["var"], -# getattr(stats_dense, multi_agg_data_loc_key)["var"], -# equal_nan=True, -# ) - -# key_set = ["v", "w"] -# mean_key_set_adata = sc.get.aggregated( -# adata_dense, -# by="key", -# dim=dim, -# func="mean", -# key_set=key_set, -# **data_loc_dict, -# ) -# subset_idx = getattr(stats_sparse, dim).index.isin(key_set) -# subset_adata = ( -# stats_sparse[subset_idx, :] if dim == "obs" else stats_sparse[:, subset_idx] -# ) -# subset_mean = getattr(subset_adata, multi_agg_data_loc_key)["mean"] -# key_set_mean = get_single_agg(mean_key_set_adata, data_key, "mean") - -# assert np.allclose(subset_mean, key_set_mean) - -# df = pd.DataFrame( -# index=getattr(adata_dense, dim)["key"], -# columns=getattr(adata_dense, f"{'var' if dim == 'obs' else 'obs'}_names"), -# data=X, -# ) -# grouped_agg_df = ( -# df.groupby("key") -# .agg(["count", "mean", "var"]) -# .swaplevel(axis=1) -# .sort_index(axis=1) -# ) -# mean = getattr(stats_dense, multi_agg_data_loc_key)["mean"] -# if dim == "var" and data_key != "varm": -# mean = mean.T -# assert np.allclose(mean, grouped_agg_df["mean"].values) -# var = getattr(stats_dense, multi_agg_data_loc_key)["var"] -# if dim == "var" and multi_agg_data_loc_key != "varm": -# var = var.T -# assert np.allclose(var, grouped_agg_df["var"].values, equal_nan=True) -# assert np.allclose( -# getattr(stats_dense, dim)["count"], -# grouped_agg_df["count"]["A"].values, -# ) # returns for both columns but counts only needs one because it is the same - - @pytest.mark.parametrize( "matrix,df,keys,metrics,expected", [ From b78f6bcd906d06a37867bb13aeac2bcb6244437d Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 6 Dec 2023 14:39:05 +0100 Subject: [PATCH 69/89] 
Better dim handling + test --- scanpy/get/_aggregated.py | 30 ++++++++++-------------------- scanpy/tests/test_aggregated.py | 7 +++++++ 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 01f25ddb2..1de88315e 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -150,19 +150,6 @@ def mean_var(self, dof: int = 1) -> tuple[np.ndarray, np.ndarray]: return mean_, var_ -# def count_mean_var_spd(by, data): -# sums = np.zeros((by.shape[0],data.shape[1])) -# counts = np.zeros((by.shape[0],data.shape[1])) -# sums = by.toarray() @ data -# counts = by.toarray() @ data._with_data(np.ones(len(data.data),dtype=data.data.dtype)) -# n_cells = np.array(by.sum(axis= 1).astype(data.dtype)) -# means = sums/n_cells -# sq_mean = by.toarray() @ data.multiply(data)/n_cells -# var = sq_mean - np.power(means, 2) -# var *= n_cells / (n_cells - 1) -# return sums, counts, means, var - - def _power(X: Array, power: float | int) -> Array: """\ Generate elementwise power of a matrix. @@ -240,8 +227,10 @@ def aggregated( ------- Aggregated :class:`~anndata.AnnData`. """ - data = adata.X + if dim not in ["obs", "var"]: + raise ValueError(f"dim must be one of 'obs' or 'var', was '{dim}'") # TODO replace with get helper + data = adata.X if sum(p is not None for p in [varm, obsm, layer]) > 1: raise TypeError("Please only provide one (or none) of varm, obsm, or layer") if varm is not None: @@ -255,10 +244,9 @@ def aggregated( elif dim == "var": # i.e., all of `varm`, `obsm`, `layers` are None so we use `X` which must be transposed data = data.T - return aggregated( + result = aggregated( data, groupby_df=getattr(adata, dim), - dim=dim, by=by, # write_to_xxxm=write_to_xxxm, no_groupby_df=getattr(adata, "var" if dim == "obs" else "obs"), @@ -268,6 +256,11 @@ def aggregated( dof=dof, ) + if dim == "var": + return result.T + else: + return result + @aggregated.register(np.ndarray) @aggregated.register(sparse.spmatrix) @@ -275,7 +268,6 @@ def aggregated_from_array( data, groupby_df: pd.DataFrame, func: AggType | Iterable[AggType], - dim: str, by: str, no_groupby_df: pd.DataFrame, weight_key: str | None = None, @@ -315,8 +307,6 @@ def aggregated_from_array( adata_kw["layers"]["mean"] = mean_ adata_agg = AnnData(**adata_kw) - if dim == "var": - return adata_agg.T return adata_agg @@ -352,7 +342,7 @@ def sparse_indicator( categorical, weights: None | np.ndarray = None ) -> sparse.coo_matrix: if weights is None: - weights = np.broadcast_to(1.0, len(categorical)) + weights = np.broadcast_to(1, len(categorical)) A = sparse.coo_matrix( (weights, (categorical.codes, np.arange(len(categorical)))), shape=(len(categorical.categories), len(categorical)), diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index b7d3fb44f..bb1a837b3 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -125,6 +125,13 @@ def test_aggregated_axis(array_type, metric): assert_equal(expected, actual) +def test_aggregated_incorrect_dim(): + adata = pbmc3k_processed().raw.to_adata() + + with pytest.raises(ValueError, match="was 'foo'"): + sc.get.aggregated(adata, ["louvain"], "sum", dim="foo") + + @pytest.mark.parametrize( "matrix,df,keys,metrics,expected", [ From 560eae24509116b43f75b5bc788411892a25ff74 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 6 Dec 2023 16:07:44 +0100 Subject: [PATCH 70/89] Retain input categories in result --- scanpy/get/_aggregated.py | 50 ++++++++++++++++++++++++--------- 
scanpy/tests/test_aggregated.py | 26 ++++++++++++++--- 2 files changed, 59 insertions(+), 17 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 1de88315e..3f93b0631 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -212,8 +212,6 @@ def aggregated( Axis on which to find group by column. weight_key Key of the `dim` containing weights for a weighted sum aggregation. - key_set - Subset of dim on which to filter. dof Degrees of freedom for variance. Defaults to 1. layer @@ -244,6 +242,8 @@ def aggregated( elif dim == "var": # i.e., all of `varm`, `obsm`, `layers` are None so we use `X` which must be transposed data = data.T + + # Actual computation result = aggregated( data, groupby_df=getattr(adata, dim), @@ -274,7 +274,7 @@ def aggregated_from_array( dof: int = 1, ) -> AnnData: """Aggregate data based on one of the columns of one of a `~pd.DataFrame`.""" - categorical = _combine_categories(groupby_df, by) + categorical, new_label_df = _combine_categories(groupby_df, by) groupby = Aggregate( groupby=categorical, data=data, @@ -284,7 +284,7 @@ def aggregated_from_array( adata_kw = dict( X=None, layers={}, - obs=pd.DataFrame(index=categorical.categories), + obs=new_label_df, var=no_groupby_df, obsm={}, ) @@ -310,7 +310,12 @@ def aggregated_from_array( return adata_agg -def _combine_categories(label_df: pd.DataFrame, cols: list[str]) -> pd.Categorical: +def _combine_categories( + label_df: pd.DataFrame, cols: list[str] +) -> tuple[pd.Categorical, pd.DataFrame]: + """ + Returns both the result categories and a dataframe labelling each row + """ from itertools import product if isinstance(cols, str): @@ -319,23 +324,42 @@ def _combine_categories(label_df: pd.DataFrame, cols: list[str]) -> pd.Categoric df = pd.DataFrame( {c: pd.Categorical(label_df[c]).remove_unused_categories() for c in cols}, ) + n_categories = [len(df[c].cat.categories) for c in cols] + + # It's like np.concatenate([x for x in product(*[range(n) for n in n_categories])]) + code_combinations = np.indices(n_categories).reshape(len(n_categories), -1) result_categories = [ "_".join(map(str, x)) for x in product(*[df[c].cat.categories for c in cols]) ] - n_categories = [len(df[c].cat.categories) for c in cols] + # Dataframe with unique combination of categories for each row + new_label_df = pd.DataFrame( + { + c: pd.Categorical.from_codes(code_combinations[i], df[c].cat.categories) + for i, c in enumerate(cols) + }, + index=result_categories, + ) + + # Calculating result codes factors = np.ones(len(cols) + 1, dtype=np.int32) # First factor needs to be 1 np.cumsum(n_categories[::-1], out=factors[1:]) factors = factors[:-1][::-1] - # TODO: pick a more optimal bit width - final_codes = np.zeros(df.shape[0], dtype=np.int32) - for factor, c in zip(factors, cols): - final_codes += df[c].cat.codes * factor + code_array = np.zeros((len(cols), df.shape[0]), dtype=np.int32) + for i, c in enumerate(cols): + code_array[i] = df[c].cat.codes + code_array *= factors[:, None] + + result_categorical = pd.Categorical.from_codes( + code_array.sum(axis=0), categories=result_categories + ) + + # Filter unused categories + result_categorical = result_categorical.remove_unused_categories() + new_label_df = new_label_df.loc[result_categorical.categories] - return pd.Categorical.from_codes( - final_codes, categories=result_categories - ).remove_unused_categories() + return result_categorical, new_label_df def sparse_indicator( diff --git a/scanpy/tests/test_aggregated.py 
b/scanpy/tests/test_aggregated.py index bb1a837b3..eed6ff7b6 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -151,7 +151,10 @@ def test_aggregated_incorrect_dim(): ["a", "b"], ["count_nonzero"], # , "sum", "mean"], ad.AnnData( - obs=pd.DataFrame(index=["a_c", "a_d", "b_d"]), + obs=pd.DataFrame( + {"a": ["a", "a", "b"], "b": ["c", "d", "d"]}, + index=["a_c", "a_d", "b_d"], + ).astype("category"), var=pd.DataFrame(index=[f"gene_{i}" for i in range(4)]), layers={ "count_nonzero": np.array( @@ -178,7 +181,10 @@ def test_aggregated_incorrect_dim(): ["a", "b"], ["sum", "mean", "count_nonzero"], ad.AnnData( - obs=pd.DataFrame(index=["a_c", "a_d", "b_d"]), + obs=pd.DataFrame( + {"a": ["a", "a", "b"], "b": ["c", "d", "d"]}, + index=["a_c", "a_d", "b_d"], + ).astype("category"), var=pd.DataFrame(index=[f"gene_{i}" for i in range(4)]), layers={ "sum": np.array([[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 2, 2]]), @@ -205,7 +211,10 @@ def test_aggregated_incorrect_dim(): ["a", "b"], ["mean"], ad.AnnData( - obs=pd.DataFrame(index=["a_c", "a_d", "b_d"]), + obs=pd.DataFrame( + {"a": ["a", "a", "b"], "b": ["c", "d", "d"]}, + index=["a_c", "a_d", "b_d"], + ).astype("category"), var=pd.DataFrame(index=[f"gene_{i}" for i in range(4)]), layers={ "mean": np.array([[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 1, 1]]), @@ -279,9 +288,18 @@ def test_aggregated_examples(matrix, df, keys, metrics, expected): def test_combine_categories(label_df, cols, expected): from scanpy.get._aggregated import _combine_categories - result = _combine_categories(label_df, cols) + result, result_label_df = _combine_categories(label_df, cols) assert isinstance(result, pd.Categorical) # TODO: is there a better function here? pd.testing.assert_series_equal(pd.Series(result), pd.Series(expected)) + + pd.testing.assert_series_equal( + pd.Series(result), pd.Series(result_label_df.index.astype("category")) + ) + + reconstructed_df = pd.DataFrame( + [x.split("_") for x in result], columns=cols, index=result.astype(str) + ).astype("category") + pd.testing.assert_frame_equal(reconstructed_df, result_label_df) From a173140e362d477545975304e73da6cf1f482730 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 6 Dec 2023 17:23:55 +0100 Subject: [PATCH 71/89] Examples in docs + some minor fixes --- scanpy/get/_aggregated.py | 42 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 3f93b0631..6164ae85a 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -184,7 +184,7 @@ def _ndarray_from_seq(lst: Sequence): @singledispatch def aggregated( adata: AnnData, - by: str, + by: str | list[str], func: AggType | Iterable[AggType], *, dim: Literal["obs", "var"] = "obs", @@ -195,7 +195,14 @@ def aggregated( varm: str | None = None, ) -> AnnData: """\ - Aggregate data based on one of the columns of one of the axes (`obs` or `var`). + Aggregate data matrix based on some categorical grouping. + + This function is useful for pseudobulking as well as plotting. + + Aggregation to perform is specified by `func`, which can be a single metric or a + list of metrics. Each metric is computed over the group and results in a new layer + in the output `AnnData` object. + If none of `layer`, `obsm`, or `varm` are passed in, `X` will be used for aggregation data. If `func` only has length 1 or is just an `AggType`, then aggregation data is written to `X`. 
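The `_combine_categories` rewrite above treats each column's category codes as digits of a mixed-radix number, so every observed combination of labels maps to one flat code. A self-contained sketch of that encoding, using `np.cumprod` for the place values (the general form for any number of columns):

    import numpy as np
    import pandas as pd

    a = pd.Categorical(["a", "a", "b", "b"])  # codes 0, 0, 1, 1; two categories
    b = pd.Categorical(["c", "d", "d", "d"])  # codes 0, 1, 1, 1; two categories
    n_categories = np.array([len(a.categories), len(b.categories)])

    # place value of a column = product of the sizes of all later columns
    factors = np.ones(len(n_categories), dtype=np.int64)
    factors[:-1] = np.cumprod(n_categories[::-1])[:-1][::-1]

    flat_codes = a.codes * factors[0] + b.codes * factors[1]
    # flat_codes == [0, 1, 3, 3]; against the full category product
    # ["a_c", "a_d", "b_c", "b_d"] that is "a_c", "a_d", "b_d", "b_d",
    # and the unused "b_c" is dropped by remove_unused_categories()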
Otherwise, it is written to `layers` or `xxxm` as appropriate for the dimensions of the aggregation data. @@ -224,6 +231,33 @@ def aggregated( Returns ------- Aggregated :class:`~anndata.AnnData`. + + Examples + -------- + + Calculating mean expression and number of nonzero entries per cluster: + + >>> import scanpy as sc, pandas as pd + >>> pbmc = sc.datasets.pbmc3k_processed().raw.to_adata() + >>> pbmc.shape + (2638, 13714) + >>> aggregated = sc.get.aggregated(pbmc, by="louvain", func=["mean", "count_nonzero"]) + >>> aggregated + AnnData object with n_obs × n_vars = 8 × 13714 + obs: 'louvain' + var: 'n_cells' + layers: 'mean', 'count_nonzero' + + We can group over multiple columns: + + >>> pbmc.obs["percent_mito_binned"] = pd.cut(pbmc.obs["percent_mito"], bins=5) + >>> sc.get.aggregated(pbmc, by=["louvain", "percent_mito_binned"], func=["mean", "count_nonzero"]) + AnnData object with n_obs × n_vars = 40 × 13714 + obs: 'louvain', 'percent_mito_binned' + var: 'n_cells' + layers: 'mean', 'count_nonzero' + + Note that this filters out any combination of groups that wasn't present in the original data. """ if dim not in ["obs", "var"]: raise ValueError(f"dim must be one of 'obs' or 'var', was '{dim}'") @@ -232,8 +266,12 @@ def aggregated( if sum(p is not None for p in [varm, obsm, layer]) > 1: raise TypeError("Please only provide one (or none) of varm, obsm, or layer") if varm is not None: + if dim != "var": + raise ValueError("varm can only be used when dim is 'var'") data = adata.varm[varm] elif obsm is not None: + if dim != "obs": + raise ValueError("obsm can only be used when dim is 'obs'") data = adata.obsm[obsm] elif layer is not None: data = adata.layers[layer] From d00fea1d0864a4d32f8f9ba49d49ceaec2de161b Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 6 Dec 2023 17:42:52 +0100 Subject: [PATCH 72/89] Rename from aggregated -> aggregate --- docs/api/get.md | 2 +- scanpy/get/__init__.py | 4 ++-- scanpy/get/_aggregated.py | 14 +++++++------- scanpy/tests/test_aggregated.py | 18 +++++++++--------- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/api/get.md b/docs/api/get.md index 74865d184..039769d0a 100644 --- a/docs/api/get.md +++ b/docs/api/get.md @@ -19,6 +19,6 @@ useful formats. 
get.obs_df get.var_df get.rank_genes_groups_df - get.aggregated + get.aggregate ``` diff --git a/scanpy/get/__init__.py b/scanpy/get/__init__.py index 9f094446d..56c0d3c13 100644 --- a/scanpy/get/__init__.py +++ b/scanpy/get/__init__.py @@ -1,6 +1,6 @@ from __future__ import annotations -from ._aggregated import aggregated +from ._aggregated import aggregate from .get import ( _check_mask, _get_obs_rep, @@ -14,7 +14,7 @@ "_check_mask", "_get_obs_rep", "_set_obs_rep", - "aggregated", + "aggregate", "obs_df", "rank_genes_groups_df", "var_df", diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 6164ae85a..d54d130e5 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -182,7 +182,7 @@ def _ndarray_from_seq(lst: Sequence): @singledispatch -def aggregated( +def aggregate( adata: AnnData, by: str | list[str], func: AggType | Iterable[AggType], @@ -241,7 +241,7 @@ def aggregated( >>> pbmc = sc.datasets.pbmc3k_processed().raw.to_adata() >>> pbmc.shape (2638, 13714) - >>> aggregated = sc.get.aggregated(pbmc, by="louvain", func=["mean", "count_nonzero"]) + >>> aggregated = sc.get.aggregate(pbmc, by="louvain", func=["mean", "count_nonzero"]) >>> aggregated AnnData object with n_obs × n_vars = 8 × 13714 obs: 'louvain' @@ -251,7 +251,7 @@ def aggregated( We can group over multiple columns: >>> pbmc.obs["percent_mito_binned"] = pd.cut(pbmc.obs["percent_mito"], bins=5) - >>> sc.get.aggregated(pbmc, by=["louvain", "percent_mito_binned"], func=["mean", "count_nonzero"]) + >>> sc.get.aggregate(pbmc, by=["louvain", "percent_mito_binned"], func=["mean", "count_nonzero"]) AnnData object with n_obs × n_vars = 40 × 13714 obs: 'louvain', 'percent_mito_binned' var: 'n_cells' @@ -282,7 +282,7 @@ def aggregated( data = data.T # Actual computation - result = aggregated( + result = aggregate( data, groupby_df=getattr(adata, dim), by=by, @@ -300,9 +300,9 @@ def aggregated( return result -@aggregated.register(np.ndarray) -@aggregated.register(sparse.spmatrix) -def aggregated_from_array( +@aggregate.register(np.ndarray) +@aggregate.register(sparse.spmatrix) +def aggregate_from_array( data, groupby_df: pd.DataFrame, func: AggType | Iterable[AggType], diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index eed6ff7b6..4e3a76824 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -72,14 +72,14 @@ def gen_adata(data_key, dim, df_base, df_groupby, X): @pytest.mark.parametrize("array_type", ARRAY_TYPES_MEM) @pytest.mark.parametrize("metric", ["sum", "mean", "var", "count_nonzero"]) -def test_aggregated_vs_pandas(metric, array_type): +def test_aggregate_vs_pandas(metric, array_type): adata = pbmc3k_processed().raw.to_adata() adata = adata[ adata.obs["louvain"].isin(adata.obs["louvain"].cat.categories[:5]), :1_000 ].copy() adata.X = array_type(adata.X) adata.obs["percent_mito_binned"] = pd.cut(adata.obs["percent_mito"], bins=5) - result = sc.get.aggregated(adata, ["louvain", "percent_mito_binned"], metric) + result = sc.get.aggregate(adata, ["louvain", "percent_mito_binned"], metric) if metric == "count_nonzero": expected = ( @@ -113,23 +113,23 @@ def test_aggregated_vs_pandas(metric, array_type): @pytest.mark.parametrize("array_type", ARRAY_TYPES_MEM) @pytest.mark.parametrize("metric", ["sum", "mean", "var", "count_nonzero"]) -def test_aggregated_axis(array_type, metric): +def test_aggregate_axis(array_type, metric): adata = pbmc3k_processed().raw.to_adata() adata = adata[ 
adata.obs["louvain"].isin(adata.obs["louvain"].cat.categories[:5]), :1_000 ].copy() adata.X = array_type(adata.X) - expected = sc.get.aggregated(adata, ["louvain"], metric) - actual = sc.get.aggregated(adata.T, ["louvain"], metric, dim="var").T + expected = sc.get.aggregate(adata, ["louvain"], metric) + actual = sc.get.aggregate(adata.T, ["louvain"], metric, dim="var").T assert_equal(expected, actual) -def test_aggregated_incorrect_dim(): +def test_aggregate_incorrect_dim(): adata = pbmc3k_processed().raw.to_adata() with pytest.raises(ValueError, match="was 'foo'"): - sc.get.aggregated(adata, ["louvain"], "sum", dim="foo") + sc.get.aggregate(adata, ["louvain"], "sum", dim="foo") @pytest.mark.parametrize( @@ -223,13 +223,13 @@ def test_aggregated_incorrect_dim(): ), ], ) -def test_aggregated_examples(matrix, df, keys, metrics, expected): +def test_aggregate_examples(matrix, df, keys, metrics, expected): adata = ad.AnnData( X=matrix, obs=df, var=pd.DataFrame(index=[f"gene_{i}" for i in range(matrix.shape[1])]), ) - result = sc.get.aggregated(adata, by=keys, func=metrics) + result = sc.get.aggregate(adata, by=keys, func=metrics) print(result) print(expected) From 1f837137fcaaf8c69731d68a483723da9b4f203f Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 6 Dec 2023 17:57:58 +0100 Subject: [PATCH 73/89] Simplify non-anndata version --- scanpy/get/_aggregated.py | 138 ++++++++++++++++++++++++++------------ 1 file changed, 95 insertions(+), 43 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index d54d130e5..c4079e557 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -55,7 +55,7 @@ class Aggregate: def __init__(self, groupby, data, weight=None): self.groupby = groupby - self.indicator_matrix = sparse_indicator(groupby) + self.indicator_matrix = sparse_indicator(groupby, weight=weight) self.data = data self.weight = weight @@ -188,7 +188,7 @@ def aggregate( func: AggType | Iterable[AggType], *, dim: Literal["obs", "var"] = "obs", - weight_key: str | None = None, + weight: str | None = None, dof: int = 1, layer: str | None = None, obsm: str | None = None, @@ -217,7 +217,7 @@ def aggregate( How to aggregate. dim Axis on which to find group by column. - weight_key + weight Key of the `dim` containing weights for a weighted sum aggregation. dof Degrees of freedom for variance. Defaults to 1. 
@@ -281,19 +281,36 @@ def aggregate( # i.e., all of `varm`, `obsm`, `layers` are None so we use `X` which must be transposed data = data.T + dim_df = getattr(adata, dim) + categorical, new_label_df = _combine_categories(dim_df, by) + if isinstance(weight, str): + weight = dim_df[weight] # Actual computation - result = aggregate( + layers = aggregate( data, - groupby_df=getattr(adata, dim), - by=by, - # write_to_xxxm=write_to_xxxm, - no_groupby_df=getattr(adata, "var" if dim == "obs" else "obs"), - weight_key=weight_key, - # key_set=key_set, + by=categorical, func=func, dof=dof, + weight=weight, + ) + result = AnnData( + layers=layers, + obs=new_label_df, + var=getattr(adata, "var" if dim == "obs" else "obs"), ) + # result = aggregate( + # data, + # groupby_df=getattr(adata, dim), + # by=by, + # # write_to_xxxm=write_to_xxxm, + # no_groupby_df=getattr(adata, "var" if dim == "obs" else "obs"), + # weight_key=weight_key, + # # key_set=key_set, + # func=func, + # dof=dof, + # ) + if dim == "var": return result.T else: @@ -302,50 +319,85 @@ def aggregate( @aggregate.register(np.ndarray) @aggregate.register(sparse.spmatrix) -def aggregate_from_array( +def aggregate_array( data, - groupby_df: pd.DataFrame, + by: pd.Categorical, func: AggType | Iterable[AggType], - by: str, - no_groupby_df: pd.DataFrame, - weight_key: str | None = None, + *, dof: int = 1, -) -> AnnData: - """Aggregate data based on one of the columns of one of a `~pd.DataFrame`.""" - categorical, new_label_df = _combine_categories(groupby_df, by) - groupby = Aggregate( - groupby=categorical, - data=data, - weight=groupby_df[weight_key] if weight_key is not None else None, - ) - # groupby df is put in `obs`, nongroupby in `var` to be transposed later as appropriate - adata_kw = dict( - X=None, - layers={}, - obs=new_label_df, - var=no_groupby_df, - obsm={}, - ) + weight: np.ndarray | None = None, +) -> dict[str, np.ndarray]: + groupby = Aggregate(groupby=by, data=data, weight=weight) + result = {} + funcs = set([func] if isinstance(func, str) else func) if unknown := funcs - set(get_args(AggType)): raise ValueError(f"func {unknown} is not one of {get_args(AggType)}") + if "sum" in funcs: # sum is calculated separately from the rest agg = groupby.sum() - adata_kw["layers"]["sum"] = agg + result["sum"] = agg # here and below for count, if var is present, these can be calculate alongside var if "mean" in funcs and "var" not in funcs: agg = groupby.mean() - adata_kw["layers"]["mean"] = agg + result["mean"] = agg if "count_nonzero" in funcs: - adata_kw["layers"]["count_nonzero"] = groupby.count_nonzero() + result["count_nonzero"] = groupby.count_nonzero() if "var" in funcs: mean_, var_ = groupby.mean_var(dof) - adata_kw["layers"]["var"] = var_ + result["var"] = var_ if "mean" in funcs: - adata_kw["layers"]["mean"] = mean_ - - adata_agg = AnnData(**adata_kw) - return adata_agg + result["mean"] = mean_ + + return result + + +# @aggregate.register(np.ndarray) +# @aggregate.register(sparse.spmatrix) +# def aggregate_from_array( +# data, +# groupby_df: pd.DataFrame, +# func: AggType | Iterable[AggType], +# by: str, +# no_groupby_df: pd.DataFrame, +# weight_key: str | None = None, +# dof: int = 1, +# ) -> AnnData: +# """Aggregate data based on one of the columns of one of a `~pd.DataFrame`.""" +# categorical, new_label_df = _combine_categories(groupby_df, by) +# groupby = Aggregate( +# groupby=categorical, +# data=data, +# weight=groupby_df[weight_key] if weight_key is not None else None, +# ) +# # groupby df is put in `obs`, nongroupby in 
`var` to be transposed later as appropriate +# adata_kw = dict( +# X=None, +# layers={}, +# obs=new_label_df, +# var=no_groupby_df, +# obsm={}, +# ) +# funcs = set([func] if isinstance(func, str) else func) +# if unknown := funcs - set(get_args(AggType)): +# raise ValueError(f"func {unknown} is not one of {get_args(AggType)}") +# if "sum" in funcs: # sum is calculated separately from the rest +# agg = groupby.sum() +# adata_kw["layers"]["sum"] = agg +# # here and below for count, if var is present, these can be calculate alongside var +# if "mean" in funcs and "var" not in funcs: +# agg = groupby.mean() +# adata_kw["layers"]["mean"] = agg +# if "count_nonzero" in funcs: +# adata_kw["layers"]["count_nonzero"] = groupby.count_nonzero() +# if "var" in funcs: +# mean_, var_ = groupby.mean_var(dof) +# adata_kw["layers"]["var"] = var_ +# if "mean" in funcs: +# adata_kw["layers"]["mean"] = mean_ + +# adata_agg = AnnData(**adata_kw) +# return adata_agg def _combine_categories( @@ -401,12 +453,12 @@ def _combine_categories( def sparse_indicator( - categorical, weights: None | np.ndarray = None + categorical, weight: None | np.ndarray = None ) -> sparse.coo_matrix: - if weights is None: - weights = np.broadcast_to(1, len(categorical)) + if weight is None: + weight = np.broadcast_to(1, len(categorical)) A = sparse.coo_matrix( - (weights, (categorical.codes, np.arange(len(categorical)))), + (weight, (categorical.codes, np.arange(len(categorical)))), shape=(len(categorical.categories), len(categorical)), ) return A From bbdbb4c66261065d96e7faafc8a437b9a4f058cb Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 11 Dec 2023 16:20:40 +0100 Subject: [PATCH 74/89] Remove dead code --- scanpy/get/_aggregated.py | 69 +-------------------------------------- 1 file changed, 1 insertion(+), 68 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index c4079e557..1a1274890 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -2,7 +2,7 @@ from collections.abc import Iterable, Sequence, Set from functools import singledispatch -from typing import Literal, NamedTuple, get_args +from typing import Literal, get_args from typing import Union as _U import numpy as np @@ -14,13 +14,6 @@ AggType = Literal["count_nonzero", "mean", "sum", "var"] -class Indices(NamedTuple): - keys: np.ndarray - key_index: np.ndarray - df_index: np.ndarray - weight_value: pd.Series | Array | None - - class Aggregate: """\ Functionality for generic grouping and aggregating. 
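The core primitive kept after this cleanup is the indicator matrix: aggregation is one sparse product A @ X, with a single nonzero per observation, so the work scales with the nonzeros of X rather than with the number of groups. A compact sketch of A and the statistics derived from it, mirroring `sparse_indicator`:

    import numpy as np
    import pandas as pd
    from scipy import sparse

    by = pd.Categorical(["a", "a", "b"])
    X = np.arange(6.0).reshape(3, 2)

    # one row per group, one column per observation
    A = sparse.coo_matrix(
        (np.ones(len(by)), (by.codes, np.arange(len(by)))),
        shape=(len(by.categories), len(by)),
    )
    counts = np.bincount(by.codes)
    sums = A @ X                               # per-group sums
    means = sums / counts[:, None]             # per-group means
    nonzero = A @ (X != 0).astype(np.float64)  # per-group nonzero counts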
@@ -299,18 +292,6 @@ def aggregate( var=getattr(adata, "var" if dim == "obs" else "obs"), ) - # result = aggregate( - # data, - # groupby_df=getattr(adata, dim), - # by=by, - # # write_to_xxxm=write_to_xxxm, - # no_groupby_df=getattr(adata, "var" if dim == "obs" else "obs"), - # weight_key=weight_key, - # # key_set=key_set, - # func=func, - # dof=dof, - # ) - if dim == "var": return result.T else: @@ -352,54 +333,6 @@ def aggregate_array( return result -# @aggregate.register(np.ndarray) -# @aggregate.register(sparse.spmatrix) -# def aggregate_from_array( -# data, -# groupby_df: pd.DataFrame, -# func: AggType | Iterable[AggType], -# by: str, -# no_groupby_df: pd.DataFrame, -# weight_key: str | None = None, -# dof: int = 1, -# ) -> AnnData: -# """Aggregate data based on one of the columns of one of a `~pd.DataFrame`.""" -# categorical, new_label_df = _combine_categories(groupby_df, by) -# groupby = Aggregate( -# groupby=categorical, -# data=data, -# weight=groupby_df[weight_key] if weight_key is not None else None, -# ) -# # groupby df is put in `obs`, nongroupby in `var` to be transposed later as appropriate -# adata_kw = dict( -# X=None, -# layers={}, -# obs=new_label_df, -# var=no_groupby_df, -# obsm={}, -# ) -# funcs = set([func] if isinstance(func, str) else func) -# if unknown := funcs - set(get_args(AggType)): -# raise ValueError(f"func {unknown} is not one of {get_args(AggType)}") -# if "sum" in funcs: # sum is calculated separately from the rest -# agg = groupby.sum() -# adata_kw["layers"]["sum"] = agg -# # here and below for count, if var is present, these can be calculate alongside var -# if "mean" in funcs and "var" not in funcs: -# agg = groupby.mean() -# adata_kw["layers"]["mean"] = agg -# if "count_nonzero" in funcs: -# adata_kw["layers"]["count_nonzero"] = groupby.count_nonzero() -# if "var" in funcs: -# mean_, var_ = groupby.mean_var(dof) -# adata_kw["layers"]["var"] = var_ -# if "mean" in funcs: -# adata_kw["layers"]["mean"] = mean_ - -# adata_agg = AnnData(**adata_kw) -# return adata_agg - - def _combine_categories( label_df: pd.DataFrame, cols: list[str] ) -> tuple[pd.Categorical, pd.DataFrame]: From edfe57d7b5e9ad3c644e8de7f5b72de5ac9c43ed Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 11 Dec 2023 16:30:53 +0100 Subject: [PATCH 75/89] Remove code for handling weighted mean and variance (put off for later) --- scanpy/get/_aggregated.py | 57 ++++++++------------------------------- 1 file changed, 11 insertions(+), 46 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 1a1274890..f76a19134 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -1,8 +1,7 @@ from __future__ import annotations -from collections.abc import Iterable, Sequence, Set from functools import singledispatch -from typing import Literal, get_args +from typing import TYPE_CHECKING, Literal, get_args from typing import Union as _U import numpy as np @@ -10,6 +9,9 @@ from anndata import AnnData, utils from scipy import sparse +if TYPE_CHECKING: + from collections.abc import Iterable, Set + Array = _U[np.ndarray, sparse.spmatrix] AggType = Literal["count_nonzero", "mean", "sum", "var"] @@ -18,20 +20,14 @@ class Aggregate: """\ Functionality for generic grouping and aggregating. - There is currently support for count, sum, mean, and variance. - - Set `weight` for weighted sum, mean, and variance. - - Set `key_set` to a list of keys to most efficiently compute results for a subset of groups. 
+ There is currently support for count_nonzero, sum, mean, and variance. **Implementation** Moments are computed using weighted sum aggregation of data by some feature - via multiplication by a sparse coordinate matrix A, exposed by - `_sparse_aggregator`. The approach works with data in ndarray or scipy sparse formats, with - no view or copy overhead on runtime or memory, even when filtering keys. + via multiplication by a sparse coordinate matrix A. - Runtime is effectively computation of the product A * X, i.e. the count of (non-zero) + Runtime is effectively computation of the product A @ X, i.e. the count of (non-zero) entries in X with multiplicity the number of group memberships for that entry. This is O(data) for partitions (each observation belonging to exactly one group), independent of the number of groups. @@ -46,15 +42,13 @@ class Aggregate: Weights to be used for aggregation. """ - def __init__(self, groupby, data, weight=None): + def __init__(self, groupby, data): self.groupby = groupby - self.indicator_matrix = sparse_indicator(groupby, weight=weight) + self.indicator_matrix = sparse_indicator(groupby) self.data = data - self.weight = weight groupby: pd.Series data: Array - weight: pd.Series | Array key_set: Set[str] | None def count_nonzero(self) -> np.ndarray: @@ -120,18 +114,7 @@ def mean_var(self, dof: int = 1) -> tuple[np.ndarray, np.ndarray]: utils.asarray(self.indicator_matrix @ _power(self.data, 2)) / group_counts[:, None] ) - if self.weight is None: - sq_mean = mean_**2 - else: - A_unweighted = sparse_indicator(self.groupby) - # , _ = Aggregate( - # groupby=self.groupby, - # data=self.data, - # weight=self.weight, # TODO: why pass weights when creating unweighted A? - # key_set=self.key_set, - # )._sparse_aggregator() - mean_unweighted = utils.asarray(A_unweighted @ self.data) - sq_mean = 2 * mean_ * mean_unweighted + mean_unweighted**2 + sq_mean = mean_**2 var_ = mean_sq - sq_mean # TODO: Why these values exactly? Because they are high relative to the datatype? # (unchanged from original code: https://github.com/scverse/anndata/pull/564) @@ -163,17 +146,6 @@ def _power(X: Array, power: float | int) -> Array: return X**power if isinstance(X, np.ndarray) else X.power(power) -def _ndarray_from_seq(lst: Sequence): - # prevents expansion of iterables as axis - n = len(lst) - if n > 0 and isinstance(lst[0], Iterable): - arr = np.empty(n, dtype=object) - arr[:] = lst - else: - arr = np.array(lst) - return arr - - @singledispatch def aggregate( adata: AnnData, @@ -181,7 +153,6 @@ def aggregate( func: AggType | Iterable[AggType], *, dim: Literal["obs", "var"] = "obs", - weight: str | None = None, dof: int = 1, layer: str | None = None, obsm: str | None = None, @@ -210,8 +181,6 @@ def aggregate( How to aggregate. dim Axis on which to find group by column. - weight - Key of the `dim` containing weights for a weighted sum aggregation. dof Degrees of freedom for variance. Defaults to 1. 
    layer
@@ -276,15 +245,12 @@ def aggregate(
     dim_df = getattr(adata, dim)
     categorical, new_label_df = _combine_categories(dim_df, by)
-    if isinstance(weight, str):
-        weight = dim_df[weight]
     # Actual computation
     layers = aggregate(
         data,
         by=categorical,
         func=func,
         dof=dof,
-        weight=weight,
     )
     result = AnnData(
         layers=layers,
         obs=new_label_df,
         var=getattr(adata, "var" if dim == "obs" else "obs"),
     )

From 6c7892f8a128fc3157b84092b8d2343146cab9c8 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Mon, 11 Dec 2023 16:31:58 +0100
Subject: [PATCH 76/89] Remove change to pyproject.toml

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 44b4522dd..443906bd8 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,7 +3,7 @@ repos:
     rev: v0.1.5
     hooks:
       - id: ruff
-        args: ["--fix", "--unsafe-fixes"]
+        args: ["--fix"]
       - id: ruff-format
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.5.0

From 3764a7fede0ae92a30e2be259d2328b8b0927783 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Mon, 11 Dec 2023 16:50:54 +0100
Subject: [PATCH 77/89] support for obsm/varm

---
 scanpy/get/_aggregated.py       | 11 +++++++++--
 scanpy/tests/test_aggregated.py | 35 +++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py
index f76a19134..78888272b 100644
--- a/scanpy/get/_aggregated.py
+++ b/scanpy/get/_aggregated.py
@@ -152,7 +152,7 @@ def aggregate(
     by: str | list[str],
     func: AggType | Iterable[AggType],
     *,
-    dim: Literal["obs", "var"] = "obs",
+    dim: Literal["obs", "var"] | None = None,
     dof: int = 1,
     layer: str | None = None,
     obsm: str | None = None,
@@ -221,12 +221,19 @@ def aggregate(
     Note that this filters out any combination of groups that wasn't present in the original data.
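As a rough usage sketch of the function at this point in the series (mirroring the dataset the tests below use, where `louvain` is a categorical obs column; the call is illustrative, not part of the patch):

    import scanpy as sc

    adata = sc.datasets.pbmc3k_processed().raw.to_adata()
    # One pseudobulk row per louvain cluster; each requested statistic
    # becomes a layer of the returned AnnData.
    pseudobulk = sc.get.aggregate(adata, by="louvain", func=["sum", "mean", "var"])
    print(pseudobulk)
    print(pseudobulk.layers["mean"].shape)  # (n_clusters, n_genes)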
""" - if dim not in ["obs", "var"]: + if dim not in ["obs", "var", None]: raise ValueError(f"dim must be one of 'obs' or 'var', was '{dim}'") # TODO replace with get helper data = adata.X if sum(p is not None for p in [varm, obsm, layer]) > 1: raise TypeError("Please only provide one (or none) of varm, obsm, or layer") + + if dim is None: + if varm: + dim = "var" + else: + dim = "obs" + if varm is not None: if dim != "var": raise ValueError("varm can only be used when dim is 'var'") diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index 4e3a76824..e6e8f0c5e 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -125,6 +125,41 @@ def test_aggregate_axis(array_type, metric): assert_equal(expected, actual) +def test_aggregate_entry(): + args = ("blobs", ["mean", "var", "count_nonzero"]) + + adata = sc.datasets.blobs() + X_result = sc.get.aggregate(adata, *args) + # layer adata + layer_adata = ad.AnnData( + obs=adata.obs, + var=adata.var, + layers={"test": adata.X.copy()}, + ) + layer_result = sc.get.aggregate(layer_adata, *args, layer="test") + obsm_adata = ad.AnnData( + obs=adata.obs, + var=adata.var, + obsm={"test": adata.X.copy()}, + ) + obsm_result = sc.get.aggregate(obsm_adata, *args, obsm="test") + varm_adata = ad.AnnData( + obs=adata.var, + var=adata.obs, + varm={"test": adata.X.copy()}, + ) + varm_result = sc.get.aggregate(varm_adata, *args, varm="test") + + X_result_min = X_result.copy() + del X_result_min.var + X_result_min.var_names = [str(x) for x in np.arange(X_result_min.n_vars)] + + assert_equal(X_result, layer_result) + assert_equal(X_result_min, obsm_result) + assert_equal(X_result.layers, obsm_result.layers) + assert_equal(X_result.layers, varm_result.T.layers) + + def test_aggregate_incorrect_dim(): adata = pbmc3k_processed().raw.to_adata() From 0aef147e7a4bef0244a099f1e19ff51d034a9c01 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 12 Dec 2023 16:19:20 +0100 Subject: [PATCH 78/89] dim -> axis --- scanpy/get/_aggregated.py | 32 ++++++++++++++++---------------- scanpy/tests/test_aggregated.py | 4 ++-- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 78888272b..058fd9921 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -152,7 +152,7 @@ def aggregate( by: str | list[str], func: AggType | Iterable[AggType], *, - dim: Literal["obs", "var"] | None = None, + axis: Literal[0, 1] | None = None, dof: int = 1, layer: str | None = None, obsm: str | None = None, @@ -179,7 +179,7 @@ def aggregate( Key of the column to be grouped-by. func How to aggregate. - dim + axis Axis on which to find group by column. dof Degrees of freedom for variance. Defaults to 1. @@ -221,36 +221,36 @@ def aggregate( Note that this filters out any combination of groups that wasn't present in the original data. 
""" - if dim not in ["obs", "var", None]: - raise ValueError(f"dim must be one of 'obs' or 'var', was '{dim}'") + if axis not in [0, 1, None]: + raise ValueError(f"axis must be one of 0 or 1, was '{axis}'") # TODO replace with get helper data = adata.X if sum(p is not None for p in [varm, obsm, layer]) > 1: raise TypeError("Please only provide one (or none) of varm, obsm, or layer") - if dim is None: + if axis is None: if varm: - dim = "var" + axis = 1 else: - dim = "obs" + axis = 0 if varm is not None: - if dim != "var": - raise ValueError("varm can only be used when dim is 'var'") + if axis != 1: + raise ValueError("varm can only be used when axis is 1") data = adata.varm[varm] elif obsm is not None: - if dim != "obs": - raise ValueError("obsm can only be used when dim is 'obs'") + if axis != 0: + raise ValueError("obsm can only be used when axis is 0") data = adata.obsm[obsm] elif layer is not None: data = adata.layers[layer] - if dim == "var": + if axis == 1: data = data.T - elif dim == "var": + elif axis == 1: # i.e., all of `varm`, `obsm`, `layers` are None so we use `X` which must be transposed data = data.T - dim_df = getattr(adata, dim) + dim_df = getattr(adata, ["obs", "var"][axis]) categorical, new_label_df = _combine_categories(dim_df, by) # Actual computation layers = aggregate( @@ -262,10 +262,10 @@ def aggregate( result = AnnData( layers=layers, obs=new_label_df, - var=getattr(adata, "var" if dim == "obs" else "obs"), + var=getattr(adata, "var" if axis == 0 else "obs"), ) - if dim == "var": + if axis == 1: return result.T else: return result diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index e6e8f0c5e..9d035237c 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -120,7 +120,7 @@ def test_aggregate_axis(array_type, metric): ].copy() adata.X = array_type(adata.X) expected = sc.get.aggregate(adata, ["louvain"], metric) - actual = sc.get.aggregate(adata.T, ["louvain"], metric, dim="var").T + actual = sc.get.aggregate(adata.T, ["louvain"], metric, axis=1).T assert_equal(expected, actual) @@ -164,7 +164,7 @@ def test_aggregate_incorrect_dim(): adata = pbmc3k_processed().raw.to_adata() with pytest.raises(ValueError, match="was 'foo'"): - sc.get.aggregate(adata, ["louvain"], "sum", dim="foo") + sc.get.aggregate(adata, ["louvain"], "sum", axis="foo") @pytest.mark.parametrize( From 062eea9430784eaf3e4d3a2f3bb8b517e7b86846 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 12 Dec 2023 17:21:20 +0100 Subject: [PATCH 79/89] Add mask argument --- scanpy/get/_aggregated.py | 23 ++++++++++++++++++----- scanpy/tests/test_aggregated.py | 15 +++++++++++++++ 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 058fd9921..f3392b45a 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -9,6 +9,8 @@ from anndata import AnnData, utils from scipy import sparse +from scanpy.get.get import _check_mask + if TYPE_CHECKING: from collections.abc import Iterable, Set @@ -42,9 +44,9 @@ class Aggregate: Weights to be used for aggregation. 
""" - def __init__(self, groupby, data): + def __init__(self, groupby, data, *, mask: np.ndarray | None = None): self.groupby = groupby - self.indicator_matrix = sparse_indicator(groupby) + self.indicator_matrix = sparse_indicator(groupby, mask=mask) self.data = data groupby: pd.Series @@ -153,6 +155,7 @@ def aggregate( func: AggType | Iterable[AggType], *, axis: Literal[0, 1] | None = None, + mask: np.ndarray | str | None = None, dof: int = 1, layer: str | None = None, obsm: str | None = None, @@ -181,6 +184,8 @@ def aggregate( How to aggregate. axis Axis on which to find group by column. + mask + Boolean mask (or key to column containing mask) to apply along the axis. dof Degrees of freedom for variance. Defaults to 1. layer @@ -224,6 +229,8 @@ def aggregate( if axis not in [0, 1, None]: raise ValueError(f"axis must be one of 0 or 1, was '{axis}'") # TODO replace with get helper + if mask is not None: + mask = _check_mask(adata, mask, ["obs", "var"][axis]) data = adata.X if sum(p is not None for p in [varm, obsm, layer]) > 1: raise TypeError("Please only provide one (or none) of varm, obsm, or layer") @@ -257,6 +264,7 @@ def aggregate( data, by=categorical, func=func, + mask=mask, dof=dof, ) result = AnnData( @@ -278,9 +286,10 @@ def aggregate_array( by: pd.Categorical, func: AggType | Iterable[AggType], *, + mask: np.ndarray | None = None, dof: int = 1, ) -> dict[str, np.ndarray]: - groupby = Aggregate(groupby=by, data=data) + groupby = Aggregate(groupby=by, data=data, mask=mask) result = {} funcs = set([func] if isinstance(func, str) else func) @@ -358,9 +367,13 @@ def _combine_categories( def sparse_indicator( - categorical, weight: None | np.ndarray = None + categorical, *, mask: np.ndarray | None = None, weight: np.ndarray | None = None ) -> sparse.coo_matrix: - if weight is None: + if mask is not None and weight is None: + weight = mask + elif mask is not None and weight is not None: + weight = mask * weight + elif mask is None and weight is None: weight = np.broadcast_to(1, len(categorical)) A = sparse.coo_matrix( (weight, (categorical.codes, np.arange(len(categorical)))), diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index 9d035237c..2b7eb4508 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -70,6 +70,21 @@ def gen_adata(data_key, dim, df_base, df_groupby, X): return adata_sparse, adata_dense +@pytest.mark.parametrize("axis", [0, 1]) +def test_mask(axis): + blobs = sc.datasets.blobs() + mask = blobs.obs["blobs"] == 0 + blobs.obs["mask_col"] = mask + if axis == 1: + blobs = blobs.T + by_name = sc.get.aggregate(blobs, "blobs", "sum", axis=axis, mask="mask_col") + by_value = sc.get.aggregate(blobs, "blobs", "sum", axis=axis, mask=mask) + + assert_equal(by_name, by_value) + + assert np.all(by_name["0"].layers["sum"] == 0) + + @pytest.mark.parametrize("array_type", ARRAY_TYPES_MEM) @pytest.mark.parametrize("metric", ["sum", "mean", "var", "count_nonzero"]) def test_aggregate_vs_pandas(metric, array_type): From 86532ace961ca8efe677b2aa80a6eb63c1468786 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 12 Dec 2023 17:44:13 +0100 Subject: [PATCH 80/89] release note --- docs/release-notes/1.10.0.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/release-notes/1.10.0.md b/docs/release-notes/1.10.0.md index 856a66b23..fd8b7d7c3 100644 --- a/docs/release-notes/1.10.0.md +++ b/docs/release-notes/1.10.0.md @@ -13,6 +13,7 @@ * Enhanced dask support for some internal utilities, paving the way for more extensive 
dask support {pr}`2696` {smaller}`P Angerer` * {func}`scanpy.pp.pca`, {func}`scanpy.pp.scale`, {func}`scanpy.pl.embedding`, and {func}`scanpy.experimental.pp.normalize_pearson_residuals_pca` now support a `mask` parameter {pr}`2272` {smaller}`C Bright, T Marcella, & P Angerer` +* New function {func}`sc.get.aggregate` which allows grouped aggregations over your data. Useful for pseudobulking! {pr}`2590` {smaller}`Isaac Virshup` ```{rubric} Docs ``` From f89c4d35bc7b99945d57f4add0eef7436ef5f811 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 19 Feb 2024 13:36:12 +0000 Subject: [PATCH 81/89] add support for semantic axis --- scanpy/_utils/__init__.py | 10 ++++++++++ scanpy/get/_aggregated.py | 19 +++++++++---------- scanpy/tests/test_aggregated.py | 21 +++++++++++++++++++++ 3 files changed, 40 insertions(+), 10 deletions(-) diff --git a/scanpy/_utils/__init__.py b/scanpy/_utils/__init__.py index 6d21a11fd..223c74630 100644 --- a/scanpy/_utils/__init__.py +++ b/scanpy/_utils/__init__.py @@ -841,3 +841,13 @@ def _choose_graph(adata, obsp, neighbors_key): "to compute a neighborhood graph." ) return neighbors["connectivities"] + + +def _resolve_axis( + axis: Literal["obs", 0, "var", 1], +) -> tuple[Literal[0], Literal["obs"]] | tuple[Literal[1], Literal["var"]]: + if axis in {0, "obs"}: + return (0, "obs") + if axis in {1, "var"}: + return (1, "var") + raise ValueError(f"`axis` must be either 0, 1, 'obs', or 'var', was {axis}") diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index f3392b45a..78c38caa9 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -9,6 +9,7 @@ from anndata import AnnData, utils from scipy import sparse +from scanpy._utils import _resolve_axis from scanpy.get.get import _check_mask if TYPE_CHECKING: @@ -226,21 +227,19 @@ def aggregate( Note that this filters out any combination of groups that wasn't present in the original data. 
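After this patch the integer and string spellings of `axis` are interchangeable; a short check mirroring the test added below, with the same dataset and column as that test:

    import numpy as np
    import scanpy as sc

    adata = sc.datasets.blobs()
    by_int = sc.get.aggregate(adata, by="blobs", func="mean", axis=0)
    by_str = sc.get.aggregate(adata, by="blobs", func="mean", axis="obs")
    np.testing.assert_equal(by_int.layers["mean"], by_str.layers["mean"])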
""" - if axis not in [0, 1, None]: - raise ValueError(f"axis must be one of 0 or 1, was '{axis}'") - # TODO replace with get helper - if mask is not None: - mask = _check_mask(adata, mask, ["obs", "var"][axis]) - data = adata.X - if sum(p is not None for p in [varm, obsm, layer]) > 1: - raise TypeError("Please only provide one (or none) of varm, obsm, or layer") - if axis is None: if varm: axis = 1 else: axis = 0 + axis, axis_name = _resolve_axis(axis) + if mask is not None: + mask = _check_mask(adata, mask, axis_name) + data = adata.X + if sum(p is not None for p in [varm, obsm, layer]) > 1: + raise TypeError("Please only provide one (or none) of varm, obsm, or layer") + if varm is not None: if axis != 1: raise ValueError("varm can only be used when axis is 1") @@ -257,7 +256,7 @@ def aggregate( # i.e., all of `varm`, `obsm`, `layers` are None so we use `X` which must be transposed data = data.T - dim_df = getattr(adata, ["obs", "var"][axis]) + dim_df = getattr(adata, axis_name) categorical, new_label_df = _combine_categories(dim_df, by) # Actual computation layers = aggregate( diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index 2b7eb4508..114f102ad 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -182,6 +182,27 @@ def test_aggregate_incorrect_dim(): sc.get.aggregate(adata, ["louvain"], "sum", axis="foo") +def test_aggregate_axis_specification(): + adata = sc.datasets.blobs() + adata.var["labels"] = np.tile(["a", "b"], adata.shape[1])[: adata.shape[1]] + + obs_int = sc.get.aggregate(adata, by="blobs", func="mean", axis=0) + obs_str = sc.get.aggregate(adata, by="blobs", func="mean", axis="obs") + obs_unspecified = sc.get.aggregate( + adata, + by="blobs", + func="mean", + ) + + np.testing.assert_equal(obs_int.layers["mean"], obs_str.layers["mean"]) + np.testing.assert_equal(obs_int.layers["mean"], obs_unspecified.layers["mean"]) + + var_int = sc.get.aggregate(adata, by="labels", func="mean", axis=1) + var_str = sc.get.aggregate(adata, by="labels", func="mean", axis="var") + + np.testing.assert_equal(var_int.layers["mean"], var_str.layers["mean"]) + + @pytest.mark.parametrize( "matrix,df,keys,metrics,expected", [ From 30a2f2a64904558e36654a4682e1ebf92efa771b Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 19 Feb 2024 13:39:44 +0000 Subject: [PATCH 82/89] Fixup signature --- scanpy/get/_aggregated.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 78c38caa9..3d944e3cc 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -155,7 +155,7 @@ def aggregate( by: str | list[str], func: AggType | Iterable[AggType], *, - axis: Literal[0, 1] | None = None, + axis: Literal["obs", 0, "var", 1] | None = None, mask: np.ndarray | str | None = None, dof: int = 1, layer: str | None = None, From f6d5ac94356a3e5e874a9d78904048648cfea749 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 19 Feb 2024 14:12:56 +0000 Subject: [PATCH 83/89] fix error message --- scanpy/_utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/_utils/__init__.py b/scanpy/_utils/__init__.py index 223c74630..e7e9e7b4d 100644 --- a/scanpy/_utils/__init__.py +++ b/scanpy/_utils/__init__.py @@ -850,4 +850,4 @@ def _resolve_axis( return (0, "obs") if axis in {1, "var"}: return (1, "var") - raise ValueError(f"`axis` must be either 0, 1, 'obs', or 'var', was {axis}") + raise ValueError(f"`axis` must be either 0, 1, 'obs', or 
'var', was '{axis}'") From 5f2d06339c5ee309ed0218ac781c9d8d1f130bd8 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 19 Feb 2024 15:37:39 +0000 Subject: [PATCH 84/89] fix docs error --- docs/release-notes/1.10.0.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/release-notes/1.10.0.md b/docs/release-notes/1.10.0.md index b66f38bad..a21f841fb 100644 --- a/docs/release-notes/1.10.0.md +++ b/docs/release-notes/1.10.0.md @@ -15,7 +15,7 @@ * Enhanced dask support for some internal utilities, paving the way for more extensive dask support {pr}`2696` {smaller}`P Angerer` * {func}`scanpy.pp.pca`, {func}`scanpy.pp.scale`, {func}`scanpy.pl.embedding`, and {func}`scanpy.experimental.pp.normalize_pearson_residuals_pca` now support a `mask` parameter {pr}`2272` {smaller}`C Bright, T Marcella, & P Angerer` -* New function {func}`sc.get.aggregate` which allows grouped aggregations over your data. Useful for pseudobulking! {pr}`2590` {smaller}`Isaac Virshup` +* New function {func}`scanpy.get.aggregate` which allows grouped aggregations over your data. Useful for pseudobulking! {pr}`2590` {smaller}`Isaac Virshup` {smaller}`Ilan Gold` {smaller}`Jon Bloom` * {func}`scanpy.tl.rank_genes_groups` no longer warns that it's default was changed from t-test_overestim_var to t-test {pr}`2798` {smaller}`L Heumos` * {func}`scanpy.pp.highly_variable_genes` has new flavor `seurat_v3_paper` that is in its implementation consistent with the paper description in Stuart et al 2018. {pr}`2792` {smaller}`E Roellin` * {func}`scanpy.pp.highly_variable_genes` supports dask for the default `seurat` and `cell_ranger` flavors {pr}`2809` {smaller}`P Angerer` From d910e7469b3b9c91ad4767ff28130baad1b9fbef Mon Sep 17 00:00:00 2001 From: Philipp A Date: Tue, 20 Feb 2024 09:00:25 +0100 Subject: [PATCH 85/89] even better formatting --- scanpy/_utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/_utils/__init__.py b/scanpy/_utils/__init__.py index e7e9e7b4d..84e7b6f03 100644 --- a/scanpy/_utils/__init__.py +++ b/scanpy/_utils/__init__.py @@ -850,4 +850,4 @@ def _resolve_axis( return (0, "obs") if axis in {1, "var"}: return (1, "var") - raise ValueError(f"`axis` must be either 0, 1, 'obs', or 'var', was '{axis}'") + raise ValueError(f"`axis` must be either 0, 1, 'obs', or 'var', was {axis!r}") From ab5aa80a6d2bf2870779101341acc1215de21850 Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Tue, 20 Feb 2024 09:40:41 +0100 Subject: [PATCH 86/89] Some type fixes --- scanpy/get/_aggregated.py | 66 +++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 30 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 3d944e3cc..28ef5be05 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -1,21 +1,22 @@ from __future__ import annotations from functools import singledispatch -from typing import TYPE_CHECKING, Literal, get_args -from typing import Union as _U +from typing import TYPE_CHECKING, Literal, Union, get_args import numpy as np import pandas as pd from anndata import AnnData, utils from scipy import sparse -from scanpy._utils import _resolve_axis -from scanpy.get.get import _check_mask +from .._utils import _resolve_axis +from .get import _check_mask if TYPE_CHECKING: - from collections.abc import Iterable, Set + from collections.abc import Collection, Iterable -Array = _U[np.ndarray, sparse.spmatrix] + from numpy.typing import NDArray + +Array = Union[np.ndarray, sparse.csc_matrix, sparse.csr_matrix] AggType = Literal["count_nonzero", "mean", "sum", "var"] @@ -30,31 +31,37 @@ class Aggregate: Moments are computed using weighted sum aggregation of data by some feature via multiplication by a sparse coordinate matrix A. - Runtime is effectively computation of the product A @ X, i.e. the count of (non-zero) - entries in X with multiplicity the number of group memberships for that entry. This is - O(data) for partitions (each observation belonging to exactly one group), independent of - the number of groups. + Runtime is effectively computation of the product `A @ X`, i.e. the count of (non-zero) + entries in X with multiplicity the number of group memberships for that entry. + This is `O(data)` for partitions (each observation belonging to exactly one group), + independent of the number of groups. Params ------ groupby - `Series` containing values for grouping by. + :class:`~pandas.Categorical` containing values for grouping by. data Data matrix for aggregation. - weight - Weights to be used for aggregation. + mask + Mask to be used for aggregation. """ - def __init__(self, groupby, data, *, mask: np.ndarray | None = None): + def __init__( + self, + groupby: pd.Categorical, + data: Array, + *, + mask: NDArray[np.bool_] | None = None, + ) -> None: self.groupby = groupby self.indicator_matrix = sparse_indicator(groupby, mask=mask) self.data = data - groupby: pd.Series + groupby: pd.Categorical + indicator_matrix: sparse.coo_matrix data: Array - key_set: Set[str] | None - def count_nonzero(self) -> np.ndarray: + def count_nonzero(self) -> NDArray[np.integer]: """\ Count the number of observations in each group. @@ -133,7 +140,7 @@ def _power(X: Array, power: float | int) -> Array: """\ Generate elementwise power of a matrix. - Needed for non-square sparse matrices because they do not support ** so the `.power` function is used. + Needed for non-square sparse matrices because they do not support `**` so the `.power` function is used. 
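That dispatch can be exercised on its own; a sketch in which `power` is an illustrative stand-in for the `_power` documented here:

    import numpy as np
    from scipy import sparse

    def power(X, p):
        # ndarray defines **; sparse matrices expose an elementwise .power() instead
        return X**p if isinstance(X, np.ndarray) else X.power(p)

    dense = np.array([[1.0, 2.0], [3.0, 0.0]])
    assert np.allclose(power(dense, 2), power(sparse.csr_matrix(dense), 2).toarray())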
Params ------ @@ -152,11 +159,11 @@ def _power(X: Array, power: float | int) -> Array: @singledispatch def aggregate( adata: AnnData, - by: str | list[str], + by: str | Collection[str], func: AggType | Iterable[AggType], *, axis: Literal["obs", 0, "var", 1] | None = None, - mask: np.ndarray | str | None = None, + mask: NDArray[np.bool_] | str | None = None, dof: int = 1, layer: str | None = None, obsm: str | None = None, @@ -228,11 +235,7 @@ def aggregate( Note that this filters out any combination of groups that wasn't present in the original data. """ if axis is None: - if varm: - axis = 1 - else: - axis = 0 - + axis = 1 if varm else 0 axis, axis_name = _resolve_axis(axis) if mask is not None: mask = _check_mask(adata, mask, axis_name) @@ -285,7 +288,7 @@ def aggregate_array( by: pd.Categorical, func: AggType | Iterable[AggType], *, - mask: np.ndarray | None = None, + mask: NDArray[np.bool_] | None = None, dof: int = 1, ) -> dict[str, np.ndarray]: groupby = Aggregate(groupby=by, data=data, mask=mask) @@ -314,7 +317,7 @@ def aggregate_array( def _combine_categories( - label_df: pd.DataFrame, cols: list[str] + label_df: pd.DataFrame, cols: Collection[str] | str ) -> tuple[pd.Categorical, pd.DataFrame]: """ Returns both the result categories and a dataframe labelling each row @@ -366,14 +369,17 @@ def _combine_categories( def sparse_indicator( - categorical, *, mask: np.ndarray | None = None, weight: np.ndarray | None = None + categorical: pd.Categorical, + *, + mask: NDArray[np.bool_] | None = None, + weight: NDArray[np.floating] | None = None, ) -> sparse.coo_matrix: if mask is not None and weight is None: - weight = mask + weight = mask.astype(np.float32) elif mask is not None and weight is not None: weight = mask * weight elif mask is None and weight is None: - weight = np.broadcast_to(1, len(categorical)) + weight = np.broadcast_to(1.0, len(categorical)) A = sparse.coo_matrix( (weight, (categorical.codes, np.arange(len(categorical)))), shape=(len(categorical.categories), len(categorical)), From cf9c1eb5a8f056f776ec35bd834b57193880f671 Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Tue, 20 Feb 2024 09:59:22 +0100 Subject: [PATCH 87/89] test style --- scanpy/get/_aggregated.py | 8 ++--- scanpy/tests/test_aggregated.py | 52 ++++++++++++++------------------- 2 files changed, 26 insertions(+), 34 deletions(-) diff --git a/scanpy/get/_aggregated.py b/scanpy/get/_aggregated.py index 28ef5be05..159f7572e 100644 --- a/scanpy/get/_aggregated.py +++ b/scanpy/get/_aggregated.py @@ -290,7 +290,7 @@ def aggregate_array( *, mask: NDArray[np.bool_] | None = None, dof: int = 1, -) -> dict[str, np.ndarray]: +) -> dict[AggType, np.ndarray]: groupby = Aggregate(groupby=by, data=data, mask=mask) result = {} @@ -334,9 +334,9 @@ def _combine_categories( # It's like np.concatenate([x for x in product(*[range(n) for n in n_categories])]) code_combinations = np.indices(n_categories).reshape(len(n_categories), -1) - result_categories = [ - "_".join(map(str, x)) for x in product(*[df[c].cat.categories for c in cols]) - ] + result_categories = pd.Index( + ["_".join(map(str, x)) for x in product(*[df[c].cat.categories for c in cols])] + ) # Dataframe with unique combination of categories for each row new_label_df = pd.DataFrame( diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index 114f102ad..600bd95c4 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -204,7 +204,7 @@ def test_aggregate_axis_specification(): @pytest.mark.parametrize( - "matrix,df,keys,metrics,expected", + ("matrix", "df", "keys", "metrics", "expected"), [ ( np.block( @@ -309,65 +309,57 @@ def test_aggregate_examples(matrix, df, keys, metrics, expected): @pytest.mark.parametrize( - "label_df,cols,expected", + ("label_cols", "cols", "expected"), [ ( - pd.DataFrame( - { - "a": pd.Categorical(["a", "b", "c"]), - "b": pd.Categorical(["d", "d", "f"]), - } + dict( + a=pd.Categorical(["a", "b", "c"]), + b=pd.Categorical(["d", "d", "f"]), ), ["a", "b"], pd.Categorical(["a_d", "b_d", "c_f"]), ), ( - pd.DataFrame( - { - "a": pd.Categorical(["a", "b", "c"]), - "b": pd.Categorical(["d", "d", "f"]), - "c": pd.Categorical(["g", "h", "h"]), - } + dict( + a=pd.Categorical(["a", "b", "c"]), + b=pd.Categorical(["d", "d", "f"]), + c=pd.Categorical(["g", "h", "h"]), ), ["a", "b", "c"], pd.Categorical(["a_d_g", "b_d_h", "c_f_h"]), ), ( - pd.DataFrame( - { - "a": pd.Categorical(["a", "b", "c"]), - "b": pd.Categorical(["d", "d", "f"]), - "c": pd.Categorical(["g", "h", "h"]), - } + dict( + a=pd.Categorical(["a", "b", "c"]), + b=pd.Categorical(["d", "d", "f"]), + c=pd.Categorical(["g", "h", "h"]), ), ["a", "c"], pd.Categorical(["a_g", "b_h", "c_h"]), ), ( - pd.DataFrame( - { - "a": pd.Categorical(["a", "b", "c"]), - "b": pd.Categorical(["d", "d", "f"]), - "c": pd.Categorical(["g", "h", "h"]), - } + dict( + a=pd.Categorical(["a", "b", "c"]), + b=pd.Categorical(["d", "d", "f"]), + c=pd.Categorical(["g", "h", "h"]), ), ["b", "c"], pd.Categorical(["d_g", "d_h", "f_h"]), ), ], ) -def test_combine_categories(label_df, cols, expected): +def test_combine_categories(label_cols, cols, expected): from scanpy.get._aggregated import _combine_categories + label_df = pd.DataFrame(label_cols) result, result_label_df = _combine_categories(label_df, cols) assert isinstance(result, pd.Categorical) - # TODO: is there a better function here? 
- pd.testing.assert_series_equal(pd.Series(result), pd.Series(expected)) + pd.testing.assert_extension_array_equal(result, expected) - pd.testing.assert_series_equal( - pd.Series(result), pd.Series(result_label_df.index.astype("category")) + pd.testing.assert_index_equal( + pd.Index(result), result_label_df.index.astype("category") ) reconstructed_df = pd.DataFrame( From 64dd227b247d32f27fbab110a79779bb02f215a3 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Tue, 20 Feb 2024 10:05:58 +0100 Subject: [PATCH 88/89] test names --- scanpy/tests/test_aggregated.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index 600bd95c4..6fafc8a60 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -206,7 +206,7 @@ def test_aggregate_axis_specification(): @pytest.mark.parametrize( ("matrix", "df", "keys", "metrics", "expected"), [ - ( + pytest.param( np.block( [ [np.ones((2, 2)), np.zeros((2, 2))], @@ -235,8 +235,9 @@ def test_aggregate_axis_specification(): # "mean": np.array([[1, 0], [0, 1]]), }, ), + id="count_nonzero", ), - ( + pytest.param( np.block( [ [np.ones((2, 2)), np.zeros((2, 2))], @@ -265,8 +266,9 @@ def test_aggregate_axis_specification(): ), }, ), + id="sum-mean-count_nonzero", ), - ( + pytest.param( np.block( [ [np.ones((2, 2)), np.zeros((2, 2))], @@ -291,6 +293,7 @@ def test_aggregate_axis_specification(): "mean": np.array([[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 1, 1]]), }, ), + id="mean", ), ], ) @@ -311,15 +314,16 @@ def test_aggregate_examples(matrix, df, keys, metrics, expected): @pytest.mark.parametrize( ("label_cols", "cols", "expected"), [ - ( + pytest.param( dict( a=pd.Categorical(["a", "b", "c"]), b=pd.Categorical(["d", "d", "f"]), ), ["a", "b"], pd.Categorical(["a_d", "b_d", "c_f"]), + id="two_of_two", ), - ( + pytest.param( dict( a=pd.Categorical(["a", "b", "c"]), b=pd.Categorical(["d", "d", "f"]), @@ -327,8 +331,9 @@ def test_aggregate_examples(matrix, df, keys, metrics, expected): ), ["a", "b", "c"], pd.Categorical(["a_d_g", "b_d_h", "c_f_h"]), + id="three_of_three", ), - ( + pytest.param( dict( a=pd.Categorical(["a", "b", "c"]), b=pd.Categorical(["d", "d", "f"]), @@ -336,8 +341,9 @@ def test_aggregate_examples(matrix, df, keys, metrics, expected): ), ["a", "c"], pd.Categorical(["a_g", "b_h", "c_h"]), + id="two_of_three-1", ), - ( + pytest.param( dict( a=pd.Categorical(["a", "b", "c"]), b=pd.Categorical(["d", "d", "f"]), @@ -345,6 +351,7 @@ def test_aggregate_examples(matrix, df, keys, metrics, expected): ), ["b", "c"], pd.Categorical(["d_g", "d_h", "f_h"]), + id="two_of_three-2", ), ], ) From 776f420fdf2ef8748b67b69b4b052883f2abac14 Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Tue, 20 Feb 2024 10:15:41 +0100 Subject: [PATCH 89/89] parametrize test_aggregate_axis_specification --- scanpy/tests/test_aggregated.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/scanpy/tests/test_aggregated.py b/scanpy/tests/test_aggregated.py index 6fafc8a60..4439f3e0e 100644 --- a/scanpy/tests/test_aggregated.py +++ b/scanpy/tests/test_aggregated.py @@ -7,6 +7,7 @@ from scipy.sparse import csr_matrix import scanpy as sc +from scanpy._utils import _resolve_axis from scanpy.testing._helpers import assert_equal from scanpy.testing._helpers.data import pbmc3k_processed from scanpy.testing._pytest.params import ARRAY_TYPES_MEM @@ -182,25 +183,22 @@ def test_aggregate_incorrect_dim(): sc.get.aggregate(adata, ["louvain"], "sum", axis="foo") -def test_aggregate_axis_specification(): +@pytest.mark.parametrize("axis_name", ["obs", "var"]) +def test_aggregate_axis_specification(axis_name): + axis, axis_name = _resolve_axis(axis_name) + by = "blobs" if axis == 0 else "labels" + adata = sc.datasets.blobs() adata.var["labels"] = np.tile(["a", "b"], adata.shape[1])[: adata.shape[1]] - obs_int = sc.get.aggregate(adata, by="blobs", func="mean", axis=0) - obs_str = sc.get.aggregate(adata, by="blobs", func="mean", axis="obs") - obs_unspecified = sc.get.aggregate( - adata, - by="blobs", - func="mean", - ) - - np.testing.assert_equal(obs_int.layers["mean"], obs_str.layers["mean"]) - np.testing.assert_equal(obs_int.layers["mean"], obs_unspecified.layers["mean"]) + agg_index = sc.get.aggregate(adata, by=by, func="mean", axis=axis) + agg_name = sc.get.aggregate(adata, by=by, func="mean", axis=axis_name) - var_int = sc.get.aggregate(adata, by="labels", func="mean", axis=1) - var_str = sc.get.aggregate(adata, by="labels", func="mean", axis="var") + np.testing.assert_equal(agg_index.layers["mean"], agg_name.layers["mean"]) - np.testing.assert_equal(var_int.layers["mean"], var_str.layers["mean"]) + if axis_name == "obs": + agg_unspecified = sc.get.aggregate(adata, by=by, func="mean") + np.testing.assert_equal(agg_name.layers["mean"], agg_unspecified.layers["mean"]) @pytest.mark.parametrize(
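Taken together, the series leaves `sc.get.aggregate` supporting grouped statistics, semantic axes, and masking. An end-to-end sketch using the dataset the tests above rely on; the final equivalence assumes every blob keeps at least one observation under the fixed seed, which is overwhelmingly likely here:

    import numpy as np
    import scanpy as sc

    adata = sc.datasets.blobs()

    # One row per blob; each requested statistic is stored as its own layer.
    agg = sc.get.aggregate(
        adata, by="blobs", func=["sum", "mean", "var", "count_nonzero"]
    )
    print(agg)

    # Observations where the mask is False drop out of every statistic,
    # so masking should match aggregating the kept subset directly.
    keep = np.random.default_rng(0).random(adata.n_obs) > 0.5
    masked = sc.get.aggregate(adata, by="blobs", func="sum", mask=keep)
    subset = sc.get.aggregate(adata[keep].copy(), by="blobs", func="sum")
    np.testing.assert_allclose(masked.layers["sum"], subset.layers["sum"])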