Fix pca on sparse data reproducibility #1240

Merged
merged 4 commits on May 21, 2020
Changes from all commits
21 changes: 21 additions & 0 deletions conftest.py
@@ -19,3 +19,24 @@ def pytest_collection_modifyitems(config, items):
# `--run-internet` passed
if not run_internet and ("internet" in item.keywords):
item.add_marker(skip_internet)


# These fixtures provide a fresh copy of pbmc3k, with some preprocessing already run on it,
# for each test, without having to hit the disk or recompute normalization.
# The private fixture creates the object once per session, while the public one returns a deep copy.
@pytest.fixture(scope="session")
def _pbmc3k_normalized():
import scanpy as sc

pbmc = sc.datasets.pbmc3k()
pbmc.X = pbmc.X.astype("float64") # For better accuracy
sc.pp.filter_genes(pbmc, min_counts=1)
sc.pp.log1p(pbmc)
sc.pp.normalize_total(pbmc)
sc.pp.highly_variable_genes(pbmc)
return pbmc


@pytest.fixture
def pbmc3k_normalized(_pbmc3k_normalized):
return _pbmc3k_normalized.copy()
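
The fixture pair above follows a common pytest pattern: build the expensive object once per session in a private fixture, then hand each test its own copy through the public fixture so in-place mutation cannot leak between tests. A minimal sketch of the same pattern with generic, hypothetical names (not part of this PR):

import copy

import pytest


@pytest.fixture(scope="session")
def _expensive_resource():
    # Built once for the whole pytest session; any slow setup goes here.
    return {"values": list(range(1000))}


@pytest.fixture
def expensive_resource(_expensive_resource):
    # Each test receives its own deep copy, so mutating it in one test
    # cannot affect any other test.
    return copy.deepcopy(_expensive_resource)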
8 changes: 8 additions & 0 deletions docs/release-latest.rst
@@ -1,6 +1,14 @@
.. role:: small
.. role:: smaller

1.5.1 :small:`2020-05-21`
~~~~~~~~~~~~~~~~~~~~~~~~~

.. rubric:: Bug fixes

- Fixed a bug in :func:`~scanpy.pp.pca`, where `random_state` did not have an effect for sparse input :pr:`1240` :smaller:`I Virshup`
- Fixed docstring in :func:`~scanpy.pp.pca` which included an unused argument :pr:`1240` :smaller:`I Virshup`

1.5.0 :small:`2020-05-15`
~~~~~~~~~~~~~~~~~~~~~~~~~

1 change: 0 additions & 1 deletion pyproject.toml
@@ -47,7 +47,6 @@ exclude = '''
)
|tests/(
test_get
|test_pca
|test_neighbors
|test_readwrite
|test_clustering
7 changes: 3 additions & 4 deletions scanpy/preprocessing/_pca.py
@@ -94,9 +94,6 @@ def pca(
chunk_size
Number of observations to include in each chunk.
Required if `chunked=True` was passed.
pca_sparse
If `True`, uses the `'arpack'` solver in `scipy.sparse.linalg.svds`
with implicit mean centering on the sparse data.

Returns
-------
@@ -200,7 +197,9 @@ def pca(
'Use "arpack" (the default) or "lobpcg" instead.'
)

output = _pca_with_sparse(X, n_comps, solver=svd_solver)
output = _pca_with_sparse(
X, n_comps, solver=svd_solver, random_state=random_state
)
# this is just a wrapper for the results
X_pca = output['X_pca']
pca_ = PCA(n_components=n_comps, svd_solver=svd_solver)
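
This is the core of the fix: `random_state` is now forwarded into the sparse code path. Reproducibility was lost because `scipy.sparse.linalg.svds` starts its ARPACK iteration from a random vector when `v0` is not supplied, so two runs on the same sparse matrix could differ. A rough sketch of how a seed can be turned into a deterministic `svds` call; this illustrates the idea and is not necessarily scanpy's exact `_pca_with_sparse` implementation:

import numpy as np
from scipy.sparse.linalg import svds
from sklearn.utils import check_random_state


def seeded_sparse_svd(X, n_comps, random_state=0):
    # Derive the ARPACK starting vector from random_state so repeated
    # calls with the same seed return identical singular vectors.
    rng = check_random_state(random_state)
    v0 = rng.uniform(-1, 1, np.min(X.shape))
    u, s, vt = svds(X, k=n_comps, solver="arpack", v0=v0)
    # svds returns singular values in ascending order; sort descending
    # to match the usual PCA component ordering.
    idx = np.argsort(-s)
    return u[:, idx], s[idx], vt[idx]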
23 changes: 23 additions & 0 deletions scanpy/tests/fixtures.py
@@ -0,0 +1,23 @@
"""This file contains some common fixtures for use in tests.

This is kept separate from the helpers file because it relies on pytest.
"""
import pytest
import numpy as np
from scipy import sparse

from anndata.tests.helpers import asarray


@pytest.fixture(
params=[sparse.csr_matrix, sparse.csc_matrix, asarray],
ids=["scipy-csr", "scipy-csc", "np-ndarray"],
)
def array_type(request):
"""Function which converts passed array to one of the common array types."""
return request.param


@pytest.fixture(params=[np.float64, np.float32])
def float_dtype(request):
return request.param
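
Because these are parametrized fixtures, any test that requests `array_type` runs once per array type (ids `scipy-csr`, `scipy-csc`, `np-ndarray`), and requesting `float_dtype` as well expands that to the full cartesian product. A small hypothetical test, just to show the mechanics (not part of this PR):

import numpy as np

from anndata.tests.helpers import asarray
from scanpy.tests.fixtures import array_type, float_dtype  # noqa: F401


def test_array_type_roundtrip(array_type, float_dtype):
    # Runs once for each (array type, dtype) combination.
    X = array_type(np.eye(3)).astype(float_dtype)
    dense = asarray(X)
    assert dense.dtype == float_dtype
    assert dense.shape == (3, 3)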
90 changes: 57 additions & 33 deletions scanpy/tests/test_pca.py
@@ -2,55 +2,67 @@
import numpy as np
from anndata import AnnData
from scipy.sparse import csr_matrix
from scipy import sparse

import scanpy as sc
from scanpy.tests.fixtures import array_type, float_dtype
from anndata.tests.helpers import assert_equal

A_list = [
[0, 0, 7, 0, 0],
[8, 5, 0, 2, 0],
[6, 0, 0, 2, 5],
[0, 0, 0, 1, 0],
[8, 8, 2, 1, 0],
[0, 0, 0, 4, 5]
[0, 0, 0, 4, 5],
]

A_pca = np.array([
[-4.4783009 , 5.55508466, 1.73111572, -0.06029139, 0.17292555],
[ 5.4855141 , -0.42651191, -0.74776055, -0.74532146, 0.74633582],
[ 0.01161428, -4.0156662 , 2.37252748, -1.33122372, -0.29044446],
[-3.61934397, 0.48525412, -2.96861931, -1.16312545, -0.33230607],
[ 7.14050048, 1.86330409, -0.05786325, 1.25045782, -0.50213107],
[-4.53998399, -3.46146476, -0.32940009, 2.04950419, 0.20562023]
])

A_svd = np.array([
[-0.77034038, -2.00750922, 6.64603489, -0.39669256, -0.22212097],
[-9.47135856, -0.6326006 , -1.33787112, -0.24894361, -1.02044665],
[-5.90007339, 4.99658727, 0.70712592, -2.15188849, 0.30430008],
[-0.19132409, 0.42172251, 0.11169531, 0.50977966, -0.71637566],
[-11.1286238, -2.73045559, 0.08040596, 1.06850585, 0.74173764],
[-1.50180389, 5.56886849, 1.64034442, 2.24476032, -0.05109001]
])


@pytest.mark.parametrize('typ', [np.array, csr_matrix])
def test_pca_transform(typ):
A = typ(A_list, dtype='float32')
A_pca = np.array(
[
[-4.4783009, 5.55508466, 1.73111572, -0.06029139, 0.17292555],
[5.4855141, -0.42651191, -0.74776055, -0.74532146, 0.74633582],
[0.01161428, -4.0156662, 2.37252748, -1.33122372, -0.29044446],
[-3.61934397, 0.48525412, -2.96861931, -1.16312545, -0.33230607],
[7.14050048, 1.86330409, -0.05786325, 1.25045782, -0.50213107],
[-4.53998399, -3.46146476, -0.32940009, 2.04950419, 0.20562023],
]
)

A_svd = np.array(
[
[-0.77034038, -2.00750922, 6.64603489, -0.39669256, -0.22212097],
[-9.47135856, -0.6326006, -1.33787112, -0.24894361, -1.02044665],
[-5.90007339, 4.99658727, 0.70712592, -2.15188849, 0.30430008],
[-0.19132409, 0.42172251, 0.11169531, 0.50977966, -0.71637566],
[-11.1286238, -2.73045559, 0.08040596, 1.06850585, 0.74173764],
[-1.50180389, 5.56886849, 1.64034442, 2.24476032, -0.05109001],
]
)


def test_pca_transform(array_type):
A = array_type(A_list).astype('float32')
A_pca_abs = np.abs(A_pca)
A_svd_abs = np.abs(A_svd)

adata = AnnData(A)

sc.pp.pca(adata, n_comps=4, zero_center=True, svd_solver='arpack', dtype='float64')

assert np.linalg.norm(A_pca_abs[:, :4]-np.abs(adata.obsm['X_pca'])) < 2e-05
assert np.linalg.norm(A_pca_abs[:, :4] - np.abs(adata.obsm['X_pca'])) < 2e-05

sc.pp.pca(adata, n_comps=5, zero_center=True, svd_solver='randomized',
dtype='float64', random_state=14)
assert np.linalg.norm(A_pca_abs-np.abs(adata.obsm['X_pca'])) < 2e-05
sc.pp.pca(
adata,
n_comps=5,
zero_center=True,
svd_solver='randomized',
dtype='float64',
random_state=14,
)
assert np.linalg.norm(A_pca_abs - np.abs(adata.obsm['X_pca'])) < 2e-05

sc.pp.pca(adata, n_comps=4, zero_center=False, dtype='float64', random_state=14)
assert np.linalg.norm(A_svd_abs[:, :4]-np.abs(adata.obsm['X_pca'])) < 2e-05
assert np.linalg.norm(A_svd_abs[:, :4] - np.abs(adata.obsm['X_pca'])) < 2e-05


def test_pca_shapes():
@@ -68,15 +80,12 @@ def test_pca_shapes():
sc.pp.pca(adata, n_comps=100)


def test_pca_sparse():
def test_pca_sparse(pbmc3k_normalized):
"""
Tests that implicitly centered PCA on sparse arrays gives results equivalent to
explicitly centered PCA on dense arrays.
"""
pbmc = sc.datasets.pbmc3k()
pbmc.X = pbmc.X.astype(np.float64)
sc.pp.filter_genes(pbmc, min_cells=1)
sc.pp.log1p(pbmc)
pbmc = pbmc3k_normalized

pbmc_dense = pbmc.copy()
pbmc_dense.X = pbmc_dense.X.toarray()
@@ -90,3 +99,18 @@ def test_pca_sparse():
)
assert np.allclose(implicit.obsm['X_pca'], explicit.obsm['X_pca'])
assert np.allclose(implicit.varm['PCs'], explicit.varm['PCs'])


# This will take a while to run, but irreproducibility may
# not show up for float32 unless the matrix is large enough
def test_pca_reproducible(pbmc3k_normalized, array_type, float_dtype):
pbmc = pbmc3k_normalized
pbmc.X = array_type(pbmc.X)

a = sc.pp.pca(pbmc, copy=True, dtype=float_dtype, random_state=42)
b = sc.pp.pca(pbmc, copy=True, dtype=float_dtype, random_state=42)
c = sc.pp.pca(pbmc, copy=True, dtype=float_dtype, random_state=0)

assert_equal(a, b)
# Test that changing random seed changes result
assert not np.array_equal(a.obsm["X_pca"], c.obsm["X_pca"])