Make sure random seed has an effect for pca

scverse · May 21, 2020 · 46a83a9 · 46a83a9
1 parent 78c6a43
commit 46a83a9
Show file tree

Hide file tree

Showing 4 changed files with 67 additions and 9 deletions.
diff --git a/conftest.py b/conftest.py
@@ -19,3 +19,24 @@ def pytest_collection_modifyitems(config, items):
         # `--run-internet` passed
         if not run_internet and ("internet" in item.keywords):
             item.add_marker(skip_internet)
+
+
+# These fixtures provide a fresh per-test copy of pbmc3k with some preprocessing run on it,
+# without having to hit the disk or recompute normalization.
+# The private fixture creates the object while the public one returns a deep copy.
+@pytest.fixture(scope="session")
+def _pbmc3k_normalized():
+    import scanpy as sc
+
+    pbmc = sc.datasets.pbmc3k()
+    pbmc.X = pbmc.X.astype("float64")  # For better accuracy
+    sc.pp.filter_genes(pbmc, min_counts=1)
+    sc.pp.log1p(pbmc)
+    sc.pp.normalize_total(pbmc)
+    sc.pp.highly_variable_genes(pbmc)
+    return pbmc
+
+
+@pytest.fixture
+def pbmc3k_normalized(_pbmc3k_normalized):
+    return _pbmc3k_normalized.copy()
diff --git a/scanpy/preprocessing/_pca.py b/scanpy/preprocessing/_pca.py
@@ -200,7 +200,7 @@ def pca(
                 'Use "arpack" (the default) or "lobpcg" instead.'
             )
 
-        output = _pca_with_sparse(X, n_comps, solver=svd_solver)
+        output = _pca_with_sparse(X, n_comps, solver=svd_solver, random_state=random_state)
         # this is just a wrapper for the results
         X_pca = output['X_pca']
         pca_ = PCA(n_components=n_comps, svd_solver=svd_solver)

diff --git a/scanpy/tests/fixtures.py b/scanpy/tests/fixtures.py
@@ -0,0 +1,23 @@
+"""This file contains some common fixtures for use in tests.
+
+This is kept separate from the helpers file because it relies on pytest.
+"""
+import pytest
+import numpy as np
+from scipy import sparse
+
+from anndata.tests.helpers import asarray
+
+
+@pytest.fixture(
+    params=[sparse.csr_matrix, sparse.csc_matrix, asarray],
+    ids=["scipy-csr", "scipy-csc", "np-ndarray"],
+)
+def array_type(request):
+    """Function which converts passed array to one of the common array types."""
+    return request.param
+
+
+@pytest.fixture(params=[np.float64, np.float32])
+def float_dtype(request):
+    return request.param
diff --git a/scanpy/tests/test_pca.py b/scanpy/tests/test_pca.py
@@ -2,8 +2,11 @@
 import numpy as np
 from anndata import AnnData
 from scipy.sparse import csr_matrix
+from scipy import sparse
 
 import scanpy as sc
+from scanpy.tests.fixtures import array_type, float_dtype
+from anndata.tests.helpers import assert_equal
 
 A_list = [
     [0, 0, 7, 0, 0],
@@ -33,9 +36,8 @@
 ])
 
 
-@pytest.mark.parametrize('typ', [np.array, csr_matrix])
-def test_pca_transform(typ):
-    A = typ(A_list, dtype='float32')
+def test_pca_transform(array_type):
+    A = array_type(A_list).astype('float32')
     A_pca_abs = np.abs(A_pca)
     A_svd_abs = np.abs(A_svd)
 
@@ -68,15 +70,12 @@ def test_pca_shapes():
         sc.pp.pca(adata, n_comps=100)
 
 
-def test_pca_sparse():
+def test_pca_sparse(pbmc3k_normalized):
     """
     Tests that implicitly centered pca on sparse arrays returns equivalent results to
     explicit centering on dense arrays.
     """
-    pbmc = sc.datasets.pbmc3k()
-    pbmc.X = pbmc.X.astype(np.float64)
-    sc.pp.filter_genes(pbmc, min_cells=1)
-    sc.pp.log1p(pbmc)
+    pbmc = pbmc3k_normalized
 
     pbmc_dense = pbmc.copy()
     pbmc_dense.X = pbmc_dense.X.toarray()
@@ -90,3 +89,18 @@ def test_pca_sparse():
     )
     assert np.allclose(implicit.obsm['X_pca'], explicit.obsm['X_pca'])
     assert np.allclose(implicit.varm['PCs'], explicit.varm['PCs'])
+
+
+# This will take a while to run, but irreproducibility may
+# not show up for float32 unless the matrix is large enough
+def test_pca_reproducible(pbmc3k_normalized, array_type, float_dtype):
+    pbmc = pbmc3k_normalized
+    pbmc.X = array_type(pbmc.X)
+
+    a = sc.pp.pca(pbmc, copy=True, dtype=float_dtype, random_state=42)
+    b = sc.pp.pca(pbmc, copy=True, dtype=float_dtype, random_state=42)
+    c = sc.pp.pca(pbmc, copy=True, dtype=float_dtype, random_state=0)
+
+    assert_equal(a, b)
+    # Test that changing random seed changes result
+    assert not np.array_equal(a.obsm["X_pca"], c.obsm["X_pca"])