2 changes: 0 additions & 2 deletions pyproject.toml
@@ -23,7 +23,6 @@ exclude = '''
 |_combat
 |_simple
 |_recipes
-|_normalization
 |_highly_variable_genes
 |_deprecated/highly_variable_genes
 )
@@ -53,7 +52,6 @@ exclude = '''
 |test_combat
 |test_neighbors
 |test_readwrite
-|test_clustering
 |test_preprocessing
 |test_rank_genes_groups
 |test_marker_gene_overlap
2 changes: 1 addition & 1 deletion scanpy/preprocessing/__init__.py
@@ -7,6 +7,6 @@
 from ._pca import pca
 from ._qc import calculate_qc_metrics
 from ._combat import combat
-from ._normalization import normalize_total
+from ._normalization import normalize_total, normalize_geometric

 from ..neighbors import neighbors
83 changes: 73 additions & 10 deletions scanpy/preprocessing/_normalization.py
@@ -1,10 +1,12 @@
 from typing import Optional, Union, Iterable, Dict

+import numpy as np
 from anndata import AnnData
-import numpy as np
+import pandas as pd
 from scipy.sparse import issparse
 from sklearn.utils import sparsefuncs

+import scanpy as sc
 from .. import logging as logg
 from .._compat import Literal
 from .._utils import view_to_actual
@@ -15,11 +17,11 @@ def _normalize_data(X, counts, after=None, copy=False):
     if issubclass(X.dtype.type, (int, np.integer)):
         X = X.astype(np.float32)  # TODO: Check if float64 should be used
     counts = np.asarray(counts)  # dask doesn't do medians
-    after = np.median(counts[counts>0], axis=0) if after is None else after
-    counts += (counts == 0)
+    after = np.median(counts[counts > 0], axis=0) if after is None else after
+    counts += counts == 0
     counts = counts / after
     if issparse(X):
-        sparsefuncs.inplace_row_scale(X, 1/counts)
+        sparsefuncs.inplace_row_scale(X, 1 / counts)
     else:
         np.divide(X, counts[:, None], out=X)
     return X
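
Aside (not part of the diff): a minimal sketch of what `_normalize_data` computes when `after` is None. Each cell is scaled so that its total becomes the median of the nonzero per-cell totals; empty cells are left at zero.

    import numpy as np

    X = np.array([[1.0, 1.0], [2.0, 2.0], [0.0, 0.0]])
    counts = X.sum(axis=1)                 # per-cell totals: [2., 4., 0.]
    after = np.median(counts[counts > 0])  # median of nonzero totals: 3.0
    counts += counts == 0                  # guard empty cells against division by zero
    np.divide(X, (counts / after)[:, None], out=X)
    print(X.sum(axis=1))                   # [3. 3. 0.]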
@@ -144,8 +146,8 @@ def normalize_total(
         counts_per_cell = np.ravel(counts_per_cell)

         # at least one cell has more than max_fraction of counts per cell
-        gene_subset = (adata.X > counts_per_cell[:, None]*max_fraction).sum(0)
-        gene_subset = (np.ravel(gene_subset) == 0)
+        gene_subset = (adata.X > counts_per_cell[:, None] * max_fraction).sum(0)
+        gene_subset = np.ravel(gene_subset) == 0

         msg += (
             ' The following highly-expressed genes are not considered during '
@@ -184,7 +186,7 @@ def normalize_total(
             norm_factor=counts_per_cell,
         )

-    for layer_name in (layers or ()):
+    for layer_name in layers or ():
         layer = adata.layers[layer_name]
         counts = np.ravel(layer.sum(1))
         if inplace:
@@ -193,10 +195,71 @@ def normalize_total(
             dat[layer_name] = _normalize_data(layer, counts, after, copy=True)

     logg.info(
-        ' finished ({time_passed})',
-        time=start,
+        ' finished ({time_passed})', time=start,
     )
     if key_added is not None:
-        logg.debug(f'and added {key_added!r}, counts per cell before normalization (adata.obs)')
+        logg.debug(
+            f'and added {key_added!r}, counts per cell before normalization (adata.obs)'
+        )

     return dat if not inplace else None
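
Aside: a typical call exercising the code paths touched above. `target_sum` is part of normalize_total's released signature; the other arguments all appear in this diff.

    import numpy as np
    import scanpy as sc
    from anndata import AnnData

    adata = AnnData(np.random.poisson(1.0, size=(100, 50)).astype(np.float32))
    # Scale each cell to 10,000 total counts; store the pre-normalization totals
    # in adata.obs['n_counts']; ignore genes taking more than 5% of any single
    # cell's counts when computing the size factors.
    sc.pp.normalize_total(
        adata,
        target_sum=1e4,
        exclude_highly_expressed=True,
        max_fraction=0.05,
        key_added='n_counts',
    )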


# TODO: Document better
# TODO: Test better (have at least one gold standard comparison)
def normalize_geometric(
adata: AnnData, *, inplace=True, copy=False, use_raw=False, layer=None, obsm=None
):
"""Normalize data by geometric mean.

Params
------
adata
AnnData containing array to normalize.

Usage
-----
>>> sc.pp.normalize_geometric(adata)
>>> sc.pp.log1p(adata)
"""
if copy:
adata = adata.copy()
if adata.is_view:
adata._init_as_actual()

# Sorry about the ugliness of dataframe support
was_df = False
X = sc.get._get_obs_rep(adata, use_raw=use_raw, layer=layer, obsm=obsm)
if isinstance(X, pd.DataFrame):
was_df = True
orig = X
X = X.values

if issparse(X):
X = X.toarray()
inplace = False
if not np.issubdtype(X.dtype, np.floating):
X = X.astype(np.promote_type(X.dtype, np.float32), copy=False)

X = _normalize_geometric(X, inplace)

if was_df:
if X is not orig.values:
X = pd.DataFrame(X, index=orig.index, columns=orig.columns)
else:
X = orig

sc.get._set_obs_rep(adata, X, use_raw=use_raw, layer=layer, obsm=obsm)

if copy:
return adata


def _normalize_geometric(X: np.ndarray, inplace=True):
from scipy.stats.mstats import gmean

if inplace:
out = X
else:
out = None
return np.divide(X, gmean(X + 1, axis=0), out=X)
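
To make _normalize_geometric's semantics concrete: each gene (column) is divided by the geometric mean of its values plus one, i.e. exp(mean(log1p(X))) per column. A quick self-check sketch:

    import numpy as np
    from scipy.stats.mstats import gmean

    rng = np.random.default_rng(0)
    X = rng.poisson(2.0, size=(5, 3)).astype(np.float64)

    manual = X / np.exp(np.log1p(X).mean(axis=0))  # gmean(X + 1) per column
    np.testing.assert_allclose(manual, X / gmean(X + 1, axis=0))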
26 changes: 19 additions & 7 deletions scanpy/tests/test_clustering.py
@@ -1,5 +1,6 @@
 import pytest
 import scanpy as sc
+from scipy import sparse


@pytest.fixture
@@ -10,10 +11,10 @@ def adata_neighbors():
 def test_leiden_basic(adata_neighbors):
     sc.tl.leiden(adata_neighbors)

-@pytest.mark.parametrize('clustering,key', [
-    (sc.tl.louvain, 'louvain'),
-    (sc.tl.leiden, 'leiden'),
-])
+
+@pytest.mark.parametrize(
+    'clustering,key', [(sc.tl.louvain, 'louvain'), (sc.tl.leiden, 'leiden'),]
+)
 def test_clustering_subset(adata_neighbors, clustering, key):
     clustering(adata_neighbors, key_added=key)

@@ -23,9 +24,7 @@ def test_clustering_subset(adata_neighbors, clustering, key):
     ncells_in_c = adata_neighbors.obs[key].value_counts().loc[c]
     key_sub = str(key) + '_sub'
     clustering(
-        adata_neighbors,
-        restrict_to=(key, [c]),
-        key_added=key_sub,
+        adata_neighbors, restrict_to=(key, [c]), key_added=key_sub,
     )
     # Get new clustering labels
     new_partition = adata_neighbors.obs[key_sub]
@@ -50,5 +49,18 @@ def test_louvain_basic(adata_neighbors):

 def test_partition_type(adata_neighbors):
     import louvain
+
     sc.tl.louvain(adata_neighbors, partition_type=louvain.RBERVertexPartition)
     sc.tl.louvain(adata_neighbors, partition_type=louvain.SurpriseVertexPartition)
+
+
+def test_leiden_multiplex():
+    """Currently just tests that leiden_multiplex doesn't fail."""
+    adata = sc.AnnData(
+        X=sparse.random(100, 50, format="csr"),
+        obsp={
+            "rep1": sparse.random(100, 100, density=0.1, format="csr"),
+            "rep2": sparse.random(100, 100, density=0.1, format="csr"),
+        },
+    )
+    sc.tl.leiden_multiplex(adata, ["rep1", "rep2"])
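
For more realistic input than random adjacency matrices, one kNN graph per modality can be built with sc.pp.neighbors and its key_added argument, which stores connectivities under .obsp['<key>_connectivities']. A sketch, assuming the pbmc68k_reduced demo dataset:

    import scanpy as sc

    adata = sc.datasets.pbmc68k_reduced()
    # Two graphs over the same cells, e.g. built from different representations
    sc.pp.neighbors(adata, n_pcs=50, key_added="rep1")
    sc.pp.neighbors(adata, n_pcs=10, key_added="rep2")
    sc.tl.leiden_multiplex(adata, ["rep1_connectivities", "rep2_connectivities"])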
15 changes: 15 additions & 0 deletions scanpy/tests/test_normalization.py
@@ -1,6 +1,7 @@
 import pytest
 import numpy as np
 from anndata import AnnData
+from scipy import sparse
 from scipy.sparse import csr_matrix

 import scanpy as sc
@@ -44,3 +45,17 @@ def test_normalize_total_view(typ, dtype):

     assert not v.is_view
     assert_equal(adata, v)
+
+
+def test_normalize_geometric():
+    mtx = sparse.random(100, 50, density=0.4, format="csr", dtype=np.float32)
+    adata = AnnData(X=mtx.copy(), layers={"dense": mtx.toarray()})
+    adata.obsm["df"] = adata.to_df()
+
+    sc.pp.normalize_geometric(adata)
+    sc.pp.normalize_geometric(adata, obsm="df")
+    sc.pp.normalize_geometric(adata, layer="dense")
+
+    assert np.array_equal(adata.X, adata.layers["dense"])
+    assert np.array_equal(adata.X, adata.obsm["df"].values)
+    assert not np.array_equal(adata.X, mtx.toarray())
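
The TODO in _normalization.py asks for at least one gold standard comparison. A minimal version could pin the result on a tiny hand-computed matrix (a sketch, not part of the diff; the test name is hypothetical):

    def test_normalize_geometric_manual():
        X = np.array([[0.0, 1.0], [3.0, 1.0]], dtype=np.float32)
        adata = AnnData(X=X.copy())
        sc.pp.normalize_geometric(adata)
        # Per-gene geometric means of X + 1: [sqrt(1 * 4), sqrt(2 * 2)] = [2, 2]
        np.testing.assert_allclose(adata.X, X / 2.0, rtol=1e-6)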
1 change: 1 addition & 0 deletions scanpy/tools/__init__.py
@@ -13,6 +13,7 @@
 from ._rank_genes_groups import rank_genes_groups, filter_rank_genes_groups
 from ._dpt import dpt
 from ._leiden import leiden
+from ._leiden_multiplex import leiden_multiplex
 from ._louvain import louvain
 from ._sim import sim
 from ._score_genes import score_genes, score_genes_cell_cycle
106 changes: 106 additions & 0 deletions scanpy/tools/_leiden_multiplex.py
@@ -0,0 +1,106 @@
from numbers import Number
from typing import List, Iterable, Union
from itertools import repeat

import scanpy as sc
import igraph

import pandas as pd
import numpy as np
from scipy.sparse import spmatrix


def _validate_reps(reps, adata):
    reps_out = []
    for i, rep in enumerate(reps):
        if isinstance(rep, str):
            rep = adata.obsp[rep]
        if isinstance(rep, spmatrix):
            reps_out.append(sc._utils.get_igraph_from_adjacency(rep, directed=True))
        elif isinstance(rep, igraph.Graph):
            # igraph.Graph has no .shape; compare the vertex count instead
            if rep.vcount() != adata.n_obs:
                raise ValueError(
                    "Graph of invalid shape passed to leiden_multiplex.\n\n"
                    f"Rep {i} has {rep.vcount()} vertices, but should have {adata.n_obs}."
                )
            elif "weight" not in rep.es.attribute_names():
                raise NotImplementedError(
                    "Graphs passed to leiden_multiplex must have edge weights "
                    "under a 'weight' attribute."
                )
            reps_out.append(rep)
        else:
            raise TypeError(
                f"Rep {i} has unrecognized type {type(rep)}. "
                "Expected str, spmatrix, or igraph.Graph."
            )
    return reps_out


def _validate_floats(vals, n_reps: int, arg_name: str):
    if isinstance(vals, Number):
        return list(repeat(vals, n_reps))
    vals_out = list(vals)
    if len(vals_out) != n_reps:
        raise ValueError(
            f"Incorrect number of {arg_name}. Expected 1 or {n_reps}, got {len(vals_out)}."
        )
    return vals_out


# TODO: Document better
# TODO: Test better
def leiden_multiplex(
    adata: "sc.AnnData",
    reps: Iterable[Union[str, igraph.Graph, spmatrix]],
    layer_weights: Union[float, Iterable[float]] = 1.0,
    resolutions: Union[float, Iterable[float]] = 1.0,
    key_added: str = "leiden_multiplex",
):
    """Perform a multiplexed clustering on multiple graph representations of the same data.

    Overview: https://leidenalg.readthedocs.io/en/latest/multiplex.html.

    Params
    ------
    adata
    reps
        Connectivity graphs to use. Possible values are:

        * string key to .obsp
        * sparse matrix of shape (n_obs, n_obs)
        * igraph.Graph
    layer_weights
        Weights to use for each representation in the joint clustering.
    resolutions
        Resolution parameter to use for each representation.
    key_added
        Key in .obs to add this clustering in.

    Usage
    -----
    >>> sc.tl.leiden_multiplex(adata, ["rna_connectivities", "protein_connectivities"])
    >>> sc.pl.umap(adata, color="leiden_multiplex")
    """
    reps = list(reps)  # reps may be a generator, but its length is needed
    n_reps = len(reps)

    reps = _validate_reps(reps, adata)
    layer_weights = _validate_floats(layer_weights, n_reps, "layer_weights")
    resolutions = _validate_floats(resolutions, n_reps, "resolutions")

    clustering = cluster_joint(reps, layer_weights, resolutions)

    adata.obs[key_added] = pd.Categorical.from_codes(
        clustering, categories=list(map(str, np.unique(clustering))),
    )


def cluster_joint(
    reps: List[igraph.Graph], layer_weights: List[float], resolutions: List[float],
):
    """Actually do the clustering."""
    import leidenalg

    partitions = [
        leidenalg.RBConfigurationVertexPartition(
            rep, weights="weight", resolution_parameter=res
        )
        for rep, res in zip(reps, resolutions)
    ]
    optimiser = leidenalg.Optimiser()
    optimiser.optimise_partition_multiplex(partitions, layer_weights=layer_weights)
    return np.array(partitions[0].membership)
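
A closing note on cluster_joint: per the leidenalg multiplex documentation, optimise_partition_multiplex optimises a single shared membership across all layers, which is why reading partitions[0].membership suffices. A sketch to check this, reusing scanpy's private get_igraph_from_adjacency helper the same way _validate_reps does:

    import leidenalg
    import scanpy as sc
    from scipy import sparse

    graphs = [
        sc._utils.get_igraph_from_adjacency(
            sparse.random(100, 100, density=0.1, format="csr"), directed=True
        )
        for _ in range(2)
    ]
    parts = [
        leidenalg.RBConfigurationVertexPartition(g, weights="weight") for g in graphs
    ]
    leidenalg.Optimiser().optimise_partition_multiplex(parts, layer_weights=[1.0, 1.0])
    assert parts[0].membership == parts[1].membership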