2 changes: 0 additions & 2 deletions pyproject.toml
@@ -23,7 +23,6 @@ exclude = '''
 |_combat
 |_simple
 |_recipes
-|_normalization
 |_highly_variable_genes
 |_deprecated/highly_variable_genes
 )
@@ -53,7 +52,6 @@ exclude = '''
 |test_combat
 |test_neighbors
 |test_readwrite
-|test_clustering
 |test_preprocessing
 |test_rank_genes_groups
 |test_marker_gene_overlap
2 changes: 1 addition & 1 deletion scanpy/preprocessing/__init__.py
@@ -7,6 +7,6 @@
 from ._pca import pca
 from ._qc import calculate_qc_metrics
 from ._combat import combat
-from ._normalization import normalize_total
+from ._normalization import normalize_total, normalize_geometric

 from ..neighbors import neighbors
83 changes: 73 additions & 10 deletions scanpy/preprocessing/_normalization.py
@@ -1,10 +1,12 @@
 from typing import Optional, Union, Iterable, Dict

+import numpy as np
 from anndata import AnnData
-import numpy as np
+import pandas as pd
 from scipy.sparse import issparse
 from sklearn.utils import sparsefuncs

+import scanpy as sc
 from .. import logging as logg
 from .._compat import Literal
 from .._utils import view_to_actual
@@ -15,11 +17,11 @@ def _normalize_data(X, counts, after=None, copy=False):
     if issubclass(X.dtype.type, (int, np.integer)):
         X = X.astype(np.float32)  # TODO: Check if float64 should be used
     counts = np.asarray(counts)  # dask doesn't do medians
-    after = np.median(counts[counts>0], axis=0) if after is None else after
-    counts += (counts == 0)
+    after = np.median(counts[counts > 0], axis=0) if after is None else after
+    counts += counts == 0
     counts = counts / after
     if issparse(X):
-        sparsefuncs.inplace_row_scale(X, 1/counts)
+        sparsefuncs.inplace_row_scale(X, 1 / counts)
     else:
         np.divide(X, counts[:, None], out=X)
     return X
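
Aside (not part of the diff): a minimal sketch of what `_normalize_data` computes when `after` is None. Each cell is scaled so that its total becomes the median of the nonzero per-cell totals; empty cells are left at zero.

    import numpy as np

    X = np.array([[1.0, 1.0], [2.0, 2.0], [0.0, 0.0]])
    counts = X.sum(axis=1)                 # per-cell totals: [2., 4., 0.]
    after = np.median(counts[counts > 0])  # median of nonzero totals: 3.0
    counts += counts == 0                  # guard empty cells against division by zero
    np.divide(X, (counts / after)[:, None], out=X)
    print(X.sum(axis=1))                   # [3. 3. 0.]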
@@ -144,8 +146,8 @@ def normalize_total(
         counts_per_cell = np.ravel(counts_per_cell)

         # at least one cell has more than max_fraction of counts per cell
-        gene_subset = (adata.X > counts_per_cell[:, None]*max_fraction).sum(0)
-        gene_subset = (np.ravel(gene_subset) == 0)
+        gene_subset = (adata.X > counts_per_cell[:, None] * max_fraction).sum(0)
+        gene_subset = np.ravel(gene_subset) == 0

         msg += (
             ' The following highly-expressed genes are not considered during '
@@ -184,7 +186,7 @@ def normalize_total(
             norm_factor=counts_per_cell,
         )

-    for layer_name in (layers or ()):
+    for layer_name in layers or ():
         layer = adata.layers[layer_name]
         counts = np.ravel(layer.sum(1))
         if inplace:
@@ -193,10 +195,71 @@ def normalize_total(
             dat[layer_name] = _normalize_data(layer, counts, after, copy=True)

     logg.info(
-        ' finished ({time_passed})',
-        time=start,
+        ' finished ({time_passed})', time=start,
     )
     if key_added is not None:
-        logg.debug(f'and added {key_added!r}, counts per cell before normalization (adata.obs)')
+        logg.debug(
+            f'and added {key_added!r}, counts per cell before normalization (adata.obs)'
+        )

     return dat if not inplace else None
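
Aside: a typical call exercising the code paths touched above. `target_sum` is part of normalize_total's released signature; the other arguments all appear in this diff.

    import numpy as np
    import scanpy as sc
    from anndata import AnnData

    adata = AnnData(np.random.poisson(1.0, size=(100, 50)).astype(np.float32))
    # Scale each cell to 10,000 total counts; store the pre-normalization totals
    # in adata.obs['n_counts']; ignore genes taking more than 5% of any single
    # cell's counts when computing the size factors.
    sc.pp.normalize_total(
        adata,
        target_sum=1e4,
        exclude_highly_expressed=True,
        max_fraction=0.05,
        key_added='n_counts',
    )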


# TODO: Document better
# TODO: Test better (have at least one gold standard comparison)
def normalize_geometric(
adata: AnnData, *, inplace=True, copy=False, use_raw=False, layer=None, obsm=None
):
"""Normalize data by geometric mean.

Params
------
adata
AnnData containing array to normalize.

Usage
-----
>>> sc.pp.normalize_geometric(adata)
>>> sc.pp.log1p(adata)
"""
if copy:
adata = adata.copy()
if adata.is_view:
adata._init_as_actual()

# Sorry about the ugliness of dataframe support
was_df = False
X = sc.get._get_obs_rep(adata, use_raw=use_raw, layer=layer, obsm=obsm)
if isinstance(X, pd.DataFrame):
was_df = True
orig = X
X = X.values

if issparse(X):
X = X.toarray()
inplace = False
if not np.issubdtype(X.dtype, np.floating):
X = X.astype(np.promote_type(X.dtype, np.float32), copy=False)

X = _normalize_geometric(X, inplace)

if was_df:
if X is not orig.values:
X = pd.DataFrame(X, index=orig.index, columns=orig.columns)
else:
X = orig

sc.get._set_obs_rep(adata, X, use_raw=use_raw, layer=layer, obsm=obsm)

if copy:
return adata


def _normalize_geometric(X: np.ndarray, inplace=True):
from scipy.stats.mstats import gmean

if inplace:
out = X
else:
out = None
return np.divide(X, gmean(X + 1, axis=0), out=X)
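
To make _normalize_geometric's semantics concrete: each gene (column) is divided by the geometric mean of its values plus one, i.e. exp(mean(log1p(X))) per column. A quick self-check sketch:

    import numpy as np
    from scipy.stats.mstats import gmean

    rng = np.random.default_rng(0)
    X = rng.poisson(2.0, size=(5, 3)).astype(np.float64)

    manual = X / np.exp(np.log1p(X).mean(axis=0))  # gmean(X + 1) per column
    np.testing.assert_allclose(manual, X / gmean(X + 1, axis=0))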
26 changes: 19 additions & 7 deletions scanpy/tests/test_clustering.py
@@ -1,5 +1,6 @@
 import pytest
 import scanpy as sc
+from scipy import sparse


@pytest.fixture
@@ -10,10 +11,10 @@ def adata_neighbors():
 def test_leiden_basic(adata_neighbors):
     sc.tl.leiden(adata_neighbors)

-@pytest.mark.parametrize('clustering,key', [
-    (sc.tl.louvain, 'louvain'),
-    (sc.tl.leiden, 'leiden'),
-])
+
+@pytest.mark.parametrize(
+    'clustering,key', [(sc.tl.louvain, 'louvain'), (sc.tl.leiden, 'leiden'),]
+)
 def test_clustering_subset(adata_neighbors, clustering, key):
     clustering(adata_neighbors, key_added=key)

@@ -23,9 +24,7 @@ def test_clustering_subset(adata_neighbors, clustering, key):
     ncells_in_c = adata_neighbors.obs[key].value_counts().loc[c]
     key_sub = str(key) + '_sub'
     clustering(
-        adata_neighbors,
-        restrict_to=(key, [c]),
-        key_added=key_sub,
+        adata_neighbors, restrict_to=(key, [c]), key_added=key_sub,
     )
     # Get new clustering labels
     new_partition = adata_neighbors.obs[key_sub]
@@ -50,5 +49,18 @@ def test_louvain_basic(adata_neighbors):

 def test_partition_type(adata_neighbors):
     import louvain
+
     sc.tl.louvain(adata_neighbors, partition_type=louvain.RBERVertexPartition)
     sc.tl.louvain(adata_neighbors, partition_type=louvain.SurpriseVertexPartition)
+
+
+def test_leiden_multiplex():
+    """Currently just tests that leiden_multiplex doesn't fail."""
+    adata = sc.AnnData(
+        X=sparse.random(100, 50, format="csr"),
+        obsp={
+            "rep1": sparse.random(100, 100, density=0.1, format="csr"),
+            "rep2": sparse.random(100, 100, density=0.1, format="csr"),
+        },
+    )
+    sc.tl.leiden_multiplex(adata, ["rep1", "rep2"])
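
For more realistic input than random adjacency matrices, one kNN graph per modality can be built with sc.pp.neighbors and its key_added argument, which stores connectivities under .obsp['<key>_connectivities']. A sketch, assuming the pbmc68k_reduced demo dataset:

    import scanpy as sc

    adata = sc.datasets.pbmc68k_reduced()
    # Two graphs over the same cells, e.g. built from different representations
    sc.pp.neighbors(adata, n_pcs=50, key_added="rep1")
    sc.pp.neighbors(adata, n_pcs=10, key_added="rep2")
    sc.tl.leiden_multiplex(adata, ["rep1_connectivities", "rep2_connectivities"])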
15 changes: 15 additions & 0 deletions scanpy/tests/test_normalization.py
@@ -1,6 +1,7 @@
 import pytest
 import numpy as np
 from anndata import AnnData
+from scipy import sparse
 from scipy.sparse import csr_matrix

 import scanpy as sc
@@ -44,3 +45,17 @@ def test_normalize_total_view(typ, dtype):

     assert not v.is_view
     assert_equal(adata, v)
+
+
+def test_normalize_geometric():
+    mtx = sparse.random(100, 50, density=0.4, format="csr", dtype=np.float32)
+    adata = AnnData(X=mtx.copy(), layers={"dense": mtx.toarray()})
+    adata.obsm["df"] = adata.to_df()
+
+    sc.pp.normalize_geometric(adata)
+    sc.pp.normalize_geometric(adata, obsm="df")
+    sc.pp.normalize_geometric(adata, layer="dense")
+
+    assert np.array_equal(adata.X, adata.layers["dense"])
+    assert np.array_equal(adata.X, adata.obsm["df"].values)
+    assert not np.array_equal(adata.X, mtx.toarray())
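
The TODO in _normalization.py asks for at least one gold standard comparison. A minimal version could pin the result on a tiny hand-computed matrix (a sketch, not part of the diff; the test name is hypothetical):

    def test_normalize_geometric_manual():
        X = np.array([[0.0, 1.0], [3.0, 1.0]], dtype=np.float32)
        adata = AnnData(X=X.copy())
        sc.pp.normalize_geometric(adata)
        # Per-gene geometric means of X + 1: [sqrt(1 * 4), sqrt(2 * 2)] = [2, 2]
        np.testing.assert_allclose(adata.X, X / 2.0, rtol=1e-6)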
1 change: 1 addition & 0 deletions scanpy/tools/__init__.py
@@ -13,6 +13,7 @@
 from ._rank_genes_groups import rank_genes_groups, filter_rank_genes_groups
 from ._dpt import dpt
 from ._leiden import leiden
+from ._leiden_multiplex import leiden_multiplex
 from ._louvain import louvain
 from ._sim import sim
 from ._score_genes import score_genes, score_genes_cell_cycle
106 changes: 106 additions & 0 deletions scanpy/tools/_leiden_multiplex.py
@@ -0,0 +1,106 @@
from numbers import Number
from typing import List, Iterable, Union
from itertools import repeat

import scanpy as sc
import igraph

import pandas as pd
import numpy as np
from scipy.sparse import spmatrix


def _validate_reps(reps, adata):
    reps_out = []
    for i, rep in enumerate(reps):
        if isinstance(rep, str):
            rep = adata.obsp[rep]
        if isinstance(rep, spmatrix):
            reps_out.append(sc._utils.get_igraph_from_adjacency(rep, directed=True))
        elif isinstance(rep, igraph.Graph):
            # igraph.Graph has no .shape; compare the vertex count instead
            if rep.vcount() != adata.n_obs:
                raise ValueError(
                    "Graph of invalid shape passed to leiden_multiplex.\n\n"
                    f"Rep {i} has {rep.vcount()} vertices, but should have {adata.n_obs}."
                )
            elif "weight" not in rep.es.attribute_names():
                raise NotImplementedError(
                    "Graphs passed to leiden_multiplex must have edge weights "
                    "under a 'weight' attribute."
                )
            reps_out.append(rep)
        else:
            raise TypeError(
                f"Rep {i} has unrecognized type {type(rep)}. "
                "Expected str, spmatrix, or igraph.Graph."
            )
    return reps_out


def _validate_floats(vals, n_reps: int, arg_name: str):
    if isinstance(vals, Number):
        return list(repeat(vals, n_reps))
    vals_out = list(vals)
    if len(vals_out) != n_reps:
        raise ValueError(
            f"Incorrect number of {arg_name}. Expected 1 or {n_reps}, got {len(vals_out)}."
        )
    return vals_out


# TODO: Document better
# TODO: Test better
def leiden_multiplex(
    adata: "sc.AnnData",
    reps: Iterable[Union[str, igraph.Graph, spmatrix]],
    layer_weights: Union[float, Iterable[float]] = 1.0,
    resolutions: Union[float, Iterable[float]] = 1.0,
    key_added: str = "leiden_multiplex",
):
    """Perform a multiplexed clustering on multiple graph representations of the same data.

    Overview: https://leidenalg.readthedocs.io/en/latest/multiplex.html.

    Params
    ------
    adata
    reps
        Connectivity graphs to use. Possible values are:

        * string key to .obsp
        * sparse matrix of shape (n_obs, n_obs)
        * igraph.Graph
    layer_weights
        Weights to use for each representation in the joint clustering.
    resolutions
        Resolution parameter to use for each representation.
    key_added
        Key in .obs to add this clustering in.

    Usage
    -----
    >>> sc.tl.leiden_multiplex(adata, ["rna_connectivities", "protein_connectivities"])
    >>> sc.pl.umap(adata, color="leiden_multiplex")
    """
    reps = list(reps)  # reps may be a generator, but its length is needed
    n_reps = len(reps)

    reps = _validate_reps(reps, adata)
    layer_weights = _validate_floats(layer_weights, n_reps, "layer_weights")
    resolutions = _validate_floats(resolutions, n_reps, "resolutions")

    clustering = cluster_joint(reps, layer_weights, resolutions)

    adata.obs[key_added] = pd.Categorical.from_codes(
        clustering, categories=list(map(str, np.unique(clustering))),
    )


def cluster_joint(
    reps: List[igraph.Graph], layer_weights: List[float], resolutions: List[float],
):
    """Actually do the clustering."""
    import leidenalg

    partitions = [
        leidenalg.RBConfigurationVertexPartition(
            rep, weights="weight", resolution_parameter=res
        )
        for rep, res in zip(reps, resolutions)
    ]
    optimiser = leidenalg.Optimiser()
    optimiser.optimise_partition_multiplex(partitions, layer_weights=layer_weights)
    return np.array(partitions[0].membership)
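
A closing note on cluster_joint: per the leidenalg multiplex documentation, optimise_partition_multiplex optimises a single shared membership across all layers, which is why reading partitions[0].membership suffices. A sketch to check this, reusing scanpy's private get_igraph_from_adjacency helper the same way _validate_reps does:

    import leidenalg
    import scanpy as sc
    from scipy import sparse

    graphs = [
        sc._utils.get_igraph_from_adjacency(
            sparse.random(100, 100, density=0.1, format="csr"), directed=True
        )
        for _ in range(2)
    ]
    parts = [
        leidenalg.RBConfigurationVertexPartition(g, weights="weight") for g in graphs
    ]
    leidenalg.Optimiser().optimise_partition_multiplex(parts, layer_weights=[1.0, 1.0])
    assert parts[0].membership == parts[1].membership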