Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Layers support for PCA and regress_out #2588

Merged
merged 6 commits into from Aug 4, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 6 additions & 1 deletion scanpy/preprocessing/_pca.py
Expand Up @@ -12,11 +12,14 @@
from ._utils import _get_mean_var
from .._utils import AnyRandom
from .. import settings
from ..get import _get_obs_rep


def pca(
data: Union[AnnData, np.ndarray, spmatrix],
n_comps: Optional[int] = None,
*,
layer: Optional[str] = None,
zero_center: Optional[bool] = True,
svd_solver: str = 'arpack',
random_state: AnyRandom = 0,
Expand Down Expand Up @@ -49,6 +52,8 @@ def pca(
n_comps
Number of principal components to compute. Defaults to 50, or 1 - minimum
dimension size of selected representation.
layer
If provided, which element of layers to use for PCA.
zero_center
If `True`, compute standard PCA from covariance matrix.
If `False`, omit zero-centering variables
Expand Down Expand Up @@ -153,7 +158,7 @@ def pca(

random_state = check_random_state(random_state)

X = adata_comp.X
X = _get_obs_rep(adata_comp, layer=layer)

if chunked:
if not zero_center or random_state or svd_solver != 'arpack':
Expand Down
28 changes: 15 additions & 13 deletions scanpy/preprocessing/_simple.py
Expand Up @@ -569,6 +569,7 @@ def normalize_per_cell(
def regress_out(
adata: AnnData,
keys: Union[str, Sequence[str]],
layer: Optional[str] = None,
n_jobs: Optional[int] = None,
copy: bool = False,
) -> Optional[AnnData]:
Expand All @@ -585,6 +586,8 @@ def regress_out(
The annotated data matrix.
keys
Keys for observation annotation on which to regress on.
layer
If provided, which element of layers to regress on.
n_jobs
Number of jobs for parallel computation.
`None` means using :attr:`scanpy._settings.ScanpyConfig.n_jobs`.
Expand All @@ -596,21 +599,20 @@ def regress_out(
Depending on `copy` returns or updates `adata` with the corrected data matrix.
"""
start = logg.info(f'regressing out {keys}')
if issparse(adata.X):
logg.info(' sparse input is densified and may ' 'lead to high memory use')
adata = adata.copy() if copy else adata

sanitize_anndata(adata)

# TODO: This should throw an implicit modification warning
if adata.is_view:
adata._init_as_actual(adata.copy())
view_to_actual(adata)

if isinstance(keys, str):
keys = [keys]

if issparse(adata.X):
adata.X = adata.X.toarray()
X = _get_obs_rep(adata, layer=layer)

if issparse(X):
logg.info(' sparse input is densified and may ' 'lead to high memory use')
X = X.toarray()

n_jobs = sett.n_jobs if n_jobs is None else n_jobs

Expand All @@ -624,10 +626,10 @@ def regress_out(
'we regress on the mean for each category.'
)
logg.debug('... regressing on per-gene means within categories')
regressors = np.zeros(adata.X.shape, dtype='float32')
regressors = np.zeros(X.shape, dtype='float32')
for category in adata.obs[keys[0]].cat.categories:
mask = (category == adata.obs[keys[0]]).values
for ix, x in enumerate(adata.X.T):
for ix, x in enumerate(X.T):
regressors[mask, ix] = x[mask].mean()
variable_is_categorical = True
# regress on one or several ordinal variables
Expand All @@ -641,13 +643,13 @@ def regress_out(
# add column of ones at index 0 (first column)
regressors.insert(0, 'ones', 1.0)

len_chunk = np.ceil(min(1000, adata.X.shape[1]) / n_jobs).astype(int)
n_chunks = np.ceil(adata.X.shape[1] / len_chunk).astype(int)
len_chunk = np.ceil(min(1000, X.shape[1]) / n_jobs).astype(int)
n_chunks = np.ceil(X.shape[1] / len_chunk).astype(int)

tasks = []
# split the adata.X matrix by columns in chunks of size n_chunk
# (the last chunk could be of smaller size than the others)
chunk_list = np.array_split(adata.X, n_chunks, axis=1)
chunk_list = np.array_split(X, n_chunks, axis=1)
if variable_is_categorical:
regressors_chunk = np.array_split(regressors, n_chunks, axis=1)
for idx, data_chunk in enumerate(chunk_list):
Expand All @@ -666,7 +668,7 @@ def regress_out(

# res is a list of vectors (each corresponding to a regressed gene column).
# The transpose is needed to get the matrix in the shape needed
adata.X = np.vstack(res).T.astype(adata.X.dtype)
_set_obs_rep(adata, np.vstack(res).T, layer=layer)
logg.info(' finished', time=start)
return adata if copy else None

Expand Down
19 changes: 19 additions & 0 deletions scanpy/tests/test_pca.py
Expand Up @@ -154,3 +154,22 @@ def test_pca_n_pcs():
assert np.allclose(
original.obsp["distances"].toarray(), renamed.obsp["distances"].toarray()
)


def test_pca_layer():
    """
    Test that PCA on a layer gives the same result as PCA on ``.X``.

    The layer is a byte-identical copy of ``.X`` and ``sc.pp.pca`` uses a
    fixed default ``random_state``, so the two runs are deterministic and
    their outputs must be *exactly* equal — not merely close.
    """
    pbmc = pbmc3k_normalized()

    pbmc.layers["counts"] = pbmc.X.copy()

    X_pca = sc.pp.pca(pbmc, dtype=np.float64, copy=True)
    layer_pca = sc.pp.pca(pbmc, layer="counts", dtype=np.float64, copy=True)

    # Exact equality (not np.allclose): identical input + fixed seed means
    # identical output, so any difference indicates a real layer-handling bug.
    np.testing.assert_array_equal(
        X_pca.uns["pca"]["variance"], layer_pca.uns["pca"]["variance"]
    )
    np.testing.assert_array_equal(
        X_pca.uns["pca"]["variance_ratio"], layer_pca.uns["pca"]["variance_ratio"]
    )
    np.testing.assert_array_equal(X_pca.obsm['X_pca'], layer_pca.obsm['X_pca'])
    np.testing.assert_array_equal(X_pca.varm['PCs'], layer_pca.varm['PCs'])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These should be exactly equal, right?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also maybe checkout scanpy.testing._helpers.check_rep_results

20 changes: 20 additions & 0 deletions scanpy/tests/test_preprocessing.py
Expand Up @@ -183,6 +183,26 @@ def test_regress_out_ordinal():
np.testing.assert_array_equal(single.X, multi.X)


def test_regress_out_layer():
    """
    Test that ``regress_out`` on a layer matches ``regress_out`` on ``.X``.

    Builds a random sparse matrix, copies it into a layer, regresses the same
    keys out of both, and checks the corrected matrices are identical.
    """
    from scipy.sparse import random

    # Seed everything: the original used the unseeded global RNG, so any
    # failure would not be reproducible from run to run.
    seed = 42
    adata = AnnData(random(1000, 100, density=0.6, format='csr', random_state=seed))
    adata.obs['percent_mito'] = np.random.RandomState(seed).rand(adata.X.shape[0])
    adata.obs['n_counts'] = adata.X.sum(axis=1)
    adata.layers["counts"] = adata.X.copy()

    single = sc.pp.regress_out(
        adata, keys=['n_counts', 'percent_mito'], n_jobs=1, copy=True
    )
    assert adata.X.shape == single.X.shape

    layer = sc.pp.regress_out(
        adata, layer="counts", keys=['n_counts', 'percent_mito'], n_jobs=1, copy=True
    )

    # The layer is an exact copy of .X, so the regressed results must match.
    np.testing.assert_array_equal(single.X, layer.layers["counts"])


def test_regress_out_view():
from scipy.sparse import random

Expand Down