Fix pca on sparse data reproducibility #1240

Merged
merged 4 commits on May 21, 2020
Changes from all commits
21 changes: 21 additions & 0 deletions conftest.py
@@ -19,3 +19,24 @@ def pytest_collection_modifyitems(config, items):
# `--run-internet` passed
if not run_internet and ("internet" in item.keywords):
item.add_marker(skip_internet)


# These fixtures provide a fresh copy of pbmc3k, with some preprocessing already run on it,
# for each test, without having to hit the disk or recompute normalization.
# The private fixture creates the object once per session, while the public one returns a deep copy.
@pytest.fixture(scope="session")
def _pbmc3k_normalized():
import scanpy as sc

pbmc = sc.datasets.pbmc3k()
pbmc.X = pbmc.X.astype("float64") # For better accuracy
sc.pp.filter_genes(pbmc, min_counts=1)
sc.pp.log1p(pbmc)
sc.pp.normalize_total(pbmc)
sc.pp.highly_variable_genes(pbmc)
return pbmc


@pytest.fixture
def pbmc3k_normalized(_pbmc3k_normalized):
return _pbmc3k_normalized.copy()
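
The fixture pair above follows a common pytest pattern: build the expensive object once per session in a private fixture, then hand each test its own copy through the public fixture so in-place mutation cannot leak between tests. A minimal sketch of the same pattern with generic, hypothetical names (not part of this PR):

import copy

import pytest


@pytest.fixture(scope="session")
def _expensive_resource():
    # Built once for the whole pytest session; any slow setup goes here.
    return {"values": list(range(1000))}


@pytest.fixture
def expensive_resource(_expensive_resource):
    # Each test receives its own deep copy, so mutating it in one test
    # cannot affect any other test.
    return copy.deepcopy(_expensive_resource)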
8 changes: 8 additions & 0 deletions docs/release-latest.rst
@@ -1,6 +1,14 @@
.. role:: small
.. role:: smaller

1.5.1 :small:`2020-05-21`
~~~~~~~~~~~~~~~~~~~~~~~~~

.. rubric:: Bug fixes

- Fixed a bug in :func:`~scanpy.pp.pca`, where `random_state` did not have an effect for sparse input :pr:`1240` :smaller:`I Virshup`
- Fixed docstring in :func:`~scanpy.pp.pca` which included an unused argument :pr:`1240` :smaller:`I Virshup`

1.5.0 :small:`2020-05-15`
~~~~~~~~~~~~~~~~~~~~~~~~~

1 change: 0 additions & 1 deletion pyproject.toml
@@ -47,7 +47,6 @@ exclude = '''
)
|tests/(
test_get
|test_pca
|test_neighbors
|test_readwrite
|test_clustering
7 changes: 3 additions & 4 deletions scanpy/preprocessing/_pca.py
@@ -94,9 +94,6 @@ def pca(
chunk_size
Number of observations to include in each chunk.
Required if `chunked=True` was passed.
pca_sparse
If `True`, uses the `'arpack'` solver in `scipy.sparse.linalg.svds`
with implicit mean centering on the sparse data.

Returns
-------
@@ -200,7 +197,9 @@ def pca(
'Use "arpack" (the default) or "lobpcg" instead.'
)

output = _pca_with_sparse(X, n_comps, solver=svd_solver)
output = _pca_with_sparse(
X, n_comps, solver=svd_solver, random_state=random_state
)
# this is just a wrapper for the results
X_pca = output['X_pca']
pca_ = PCA(n_components=n_comps, svd_solver=svd_solver)
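
This is the core of the fix: `random_state` is now forwarded into the sparse code path. Reproducibility was lost because `scipy.sparse.linalg.svds` starts its ARPACK iteration from a random vector when `v0` is not supplied, so two runs on the same sparse matrix could differ. A rough sketch of how a seed can be turned into a deterministic `svds` call; this illustrates the idea and is not necessarily scanpy's exact `_pca_with_sparse` implementation:

import numpy as np
from scipy.sparse.linalg import svds
from sklearn.utils import check_random_state


def seeded_sparse_svd(X, n_comps, random_state=0):
    # Derive the ARPACK starting vector from random_state so repeated
    # calls with the same seed return identical singular vectors.
    rng = check_random_state(random_state)
    v0 = rng.uniform(-1, 1, np.min(X.shape))
    u, s, vt = svds(X, k=n_comps, solver="arpack", v0=v0)
    # svds returns singular values in ascending order; sort descending
    # to match the usual PCA component ordering.
    idx = np.argsort(-s)
    return u[:, idx], s[idx], vt[idx]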
23 changes: 23 additions & 0 deletions scanpy/tests/fixtures.py
@@ -0,0 +1,23 @@
"""This file contains some common fixtures for use in tests.

This is kept separate from the helpers file because it relies on pytest.
"""
import pytest
import numpy as np
from scipy import sparse

from anndata.tests.helpers import asarray


@pytest.fixture(
params=[sparse.csr_matrix, sparse.csc_matrix, asarray],
ids=["scipy-csr", "scipy-csc", "np-ndarray"],
)
def array_type(request):
"""Function which converts passed array to one of the common array types."""
return request.param


@pytest.fixture(params=[np.float64, np.float32])
def float_dtype(request):
return request.param
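
Because these are parametrized fixtures, any test that requests `array_type` runs once per array type (ids `scipy-csr`, `scipy-csc`, `np-ndarray`), and requesting `float_dtype` as well expands that to the full cartesian product. A small hypothetical test, just to show the mechanics (not part of this PR):

import numpy as np

from anndata.tests.helpers import asarray
from scanpy.tests.fixtures import array_type, float_dtype  # noqa: F401


def test_array_type_roundtrip(array_type, float_dtype):
    # Runs once for each (array type, dtype) combination.
    X = array_type(np.eye(3)).astype(float_dtype)
    dense = asarray(X)
    assert dense.dtype == float_dtype
    assert dense.shape == (3, 3)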
90 changes: 57 additions & 33 deletions scanpy/tests/test_pca.py
@@ -2,55 +2,67 @@
import numpy as np
from anndata import AnnData
from scipy.sparse import csr_matrix
from scipy import sparse

import scanpy as sc
from scanpy.tests.fixtures import array_type, float_dtype
from anndata.tests.helpers import assert_equal

A_list = [
[0, 0, 7, 0, 0],
[8, 5, 0, 2, 0],
[6, 0, 0, 2, 5],
[0, 0, 0, 1, 0],
[8, 8, 2, 1, 0],
[0, 0, 0, 4, 5]
[0, 0, 0, 4, 5],
]

A_pca = np.array([
[-4.4783009 , 5.55508466, 1.73111572, -0.06029139, 0.17292555],
[ 5.4855141 , -0.42651191, -0.74776055, -0.74532146, 0.74633582],
[ 0.01161428, -4.0156662 , 2.37252748, -1.33122372, -0.29044446],
[-3.61934397, 0.48525412, -2.96861931, -1.16312545, -0.33230607],
[ 7.14050048, 1.86330409, -0.05786325, 1.25045782, -0.50213107],
[-4.53998399, -3.46146476, -0.32940009, 2.04950419, 0.20562023]
])

A_svd = np.array([
[-0.77034038, -2.00750922, 6.64603489, -0.39669256, -0.22212097],
[-9.47135856, -0.6326006 , -1.33787112, -0.24894361, -1.02044665],
[-5.90007339, 4.99658727, 0.70712592, -2.15188849, 0.30430008],
[-0.19132409, 0.42172251, 0.11169531, 0.50977966, -0.71637566],
[-11.1286238, -2.73045559, 0.08040596, 1.06850585, 0.74173764],
[-1.50180389, 5.56886849, 1.64034442, 2.24476032, -0.05109001]
])


@pytest.mark.parametrize('typ', [np.array, csr_matrix])
def test_pca_transform(typ):
A = typ(A_list, dtype='float32')
A_pca = np.array(
[
[-4.4783009, 5.55508466, 1.73111572, -0.06029139, 0.17292555],
[5.4855141, -0.42651191, -0.74776055, -0.74532146, 0.74633582],
[0.01161428, -4.0156662, 2.37252748, -1.33122372, -0.29044446],
[-3.61934397, 0.48525412, -2.96861931, -1.16312545, -0.33230607],
[7.14050048, 1.86330409, -0.05786325, 1.25045782, -0.50213107],
[-4.53998399, -3.46146476, -0.32940009, 2.04950419, 0.20562023],
]
)

A_svd = np.array(
[
[-0.77034038, -2.00750922, 6.64603489, -0.39669256, -0.22212097],
[-9.47135856, -0.6326006, -1.33787112, -0.24894361, -1.02044665],
[-5.90007339, 4.99658727, 0.70712592, -2.15188849, 0.30430008],
[-0.19132409, 0.42172251, 0.11169531, 0.50977966, -0.71637566],
[-11.1286238, -2.73045559, 0.08040596, 1.06850585, 0.74173764],
[-1.50180389, 5.56886849, 1.64034442, 2.24476032, -0.05109001],
]
)


def test_pca_transform(array_type):
A = array_type(A_list).astype('float32')
A_pca_abs = np.abs(A_pca)
A_svd_abs = np.abs(A_svd)

adata = AnnData(A)

sc.pp.pca(adata, n_comps=4, zero_center=True, svd_solver='arpack', dtype='float64')

assert np.linalg.norm(A_pca_abs[:, :4]-np.abs(adata.obsm['X_pca'])) < 2e-05
assert np.linalg.norm(A_pca_abs[:, :4] - np.abs(adata.obsm['X_pca'])) < 2e-05

sc.pp.pca(adata, n_comps=5, zero_center=True, svd_solver='randomized',
dtype='float64', random_state=14)
assert np.linalg.norm(A_pca_abs-np.abs(adata.obsm['X_pca'])) < 2e-05
sc.pp.pca(
adata,
n_comps=5,
zero_center=True,
svd_solver='randomized',
dtype='float64',
random_state=14,
)
assert np.linalg.norm(A_pca_abs - np.abs(adata.obsm['X_pca'])) < 2e-05

sc.pp.pca(adata, n_comps=4, zero_center=False, dtype='float64', random_state=14)
assert np.linalg.norm(A_svd_abs[:, :4]-np.abs(adata.obsm['X_pca'])) < 2e-05
assert np.linalg.norm(A_svd_abs[:, :4] - np.abs(adata.obsm['X_pca'])) < 2e-05


def test_pca_shapes():
@@ -68,15 +80,12 @@ def test_pca_shapes():
sc.pp.pca(adata, n_comps=100)


def test_pca_sparse():
def test_pca_sparse(pbmc3k_normalized):
"""
Tests that implicitly centered PCA on sparse arrays gives results equivalent to
explicitly centered PCA on dense arrays.
"""
pbmc = sc.datasets.pbmc3k()
pbmc.X = pbmc.X.astype(np.float64)
sc.pp.filter_genes(pbmc, min_cells=1)
sc.pp.log1p(pbmc)
pbmc = pbmc3k_normalized

pbmc_dense = pbmc.copy()
pbmc_dense.X = pbmc_dense.X.toarray()
@@ -90,3 +99,18 @@ def test_pca_sparse():
)
assert np.allclose(implicit.obsm['X_pca'], explicit.obsm['X_pca'])
assert np.allclose(implicit.varm['PCs'], explicit.varm['PCs'])


# This will take a while to run, but irreproducibility may
# not show up for float32 unless the matrix is large enough
def test_pca_reproducible(pbmc3k_normalized, array_type, float_dtype):
pbmc = pbmc3k_normalized
pbmc.X = array_type(pbmc.X)

a = sc.pp.pca(pbmc, copy=True, dtype=float_dtype, random_state=42)
b = sc.pp.pca(pbmc, copy=True, dtype=float_dtype, random_state=42)
c = sc.pp.pca(pbmc, copy=True, dtype=float_dtype, random_state=0)

assert_equal(a, b)
# Test that changing random seed changes result
assert not np.array_equal(a.obsm["X_pca"], c.obsm["X_pca"])