Skip to content

Commit

Permalink
Make sure random seed has an effect for pca
Browse files Browse the repository at this point in the history
  • Loading branch information
ivirshup committed May 21, 2020
1 parent 78c6a43 commit 46a83a9
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 9 deletions.
21 changes: 21 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,24 @@ def pytest_collection_modifyitems(config, items):
# `--run-internet` passed
if not run_internet and ("internet" in item.keywords):
item.add_marker(skip_internet)


# These fixtures provide each test with a fresh copy of pbmc3k with some preprocessing already run on it,
# without having to hit the disk or recompute normalization.
# The private fixture creates the object while the public one returns a deep copy.
@pytest.fixture(scope="session")
def _pbmc3k_normalized():
    """Build the preprocessed pbmc3k object once per test session.

    The dataset is loaded via ``sc.datasets.pbmc3k()``, its ``X`` matrix is
    cast to float64, and then ``filter_genes(min_counts=1)``, ``log1p``,
    ``normalize_total`` and ``highly_variable_genes`` are applied in that
    order. The same object is returned to every requester, so tests should
    not mutate it directly.
    """
    import scanpy as sc

    adata = sc.datasets.pbmc3k()
    # Work in double precision for better accuracy.
    adata.X = adata.X.astype("float64")

    sc.pp.filter_genes(adata, min_counts=1)
    # NOTE(review): log1p runs before normalize_total here — confirm this
    # ordering is intentional for the tests that consume this fixture.
    sc.pp.log1p(adata)
    sc.pp.normalize_total(adata)
    sc.pp.highly_variable_genes(adata)
    return adata


@pytest.fixture
def pbmc3k_normalized(_pbmc3k_normalized):
    """Return a copy of the session-wide preprocessed pbmc3k object.

    Each test receives its own copy (via ``.copy()``), so modifying it does
    not affect the shared object held by ``_pbmc3k_normalized``.
    """
    fresh_copy = _pbmc3k_normalized.copy()
    return fresh_copy
2 changes: 1 addition & 1 deletion scanpy/preprocessing/_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ def pca(
'Use "arpack" (the default) or "lobpcg" instead.'
)

output = _pca_with_sparse(X, n_comps, solver=svd_solver)
output = _pca_with_sparse(X, n_comps, solver=svd_solver, random_state=random_state)
# this is just a wrapper for the results
X_pca = output['X_pca']
pca_ = PCA(n_components=n_comps, svd_solver=svd_solver)
Expand Down
23 changes: 23 additions & 0 deletions scanpy/tests/fixtures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""This file contains some common fixtures for use in tests.
This is kept separate from the helpers file because it relies on pytest.
"""
import pytest
import numpy as np
from scipy import sparse

from anndata.tests.helpers import asarray


@pytest.fixture(
    params=[sparse.csr_matrix, sparse.csc_matrix, asarray],
    ids=["scipy-csr", "scipy-csc", "np-ndarray"],
)
def array_type(request):
    """Yield one array-converter callable per parametrization.

    The fixture is parametrized over ``sparse.csr_matrix``,
    ``sparse.csc_matrix`` and ``asarray``; the selected callable is handed
    to the test unchanged so it can convert its own input.
    """
    converter = request.param
    return converter


@pytest.fixture(params=[np.float64, np.float32])
def float_dtype(request):
    """Provide one floating-point dtype (float64, then float32) per parametrization."""
    dtype = request.param
    return dtype
30 changes: 22 additions & 8 deletions scanpy/tests/test_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
import numpy as np
from anndata import AnnData
from scipy.sparse import csr_matrix
from scipy import sparse

import scanpy as sc
from scanpy.tests.fixtures import array_type, float_dtype
from anndata.tests.helpers import assert_equal

A_list = [
[0, 0, 7, 0, 0],
Expand Down Expand Up @@ -33,9 +36,8 @@
])


@pytest.mark.parametrize('typ', [np.array, csr_matrix])
def test_pca_transform(typ):
A = typ(A_list, dtype='float32')
def test_pca_transform(array_type):
A = array_type(A_list).astype('float32')
A_pca_abs = np.abs(A_pca)
A_svd_abs = np.abs(A_svd)

Expand Down Expand Up @@ -68,15 +70,12 @@ def test_pca_shapes():
sc.pp.pca(adata, n_comps=100)


def test_pca_sparse():
def test_pca_sparse(pbmc3k_normalized):
"""
Tests that implicitly centered pca on sparse arrays returns equivalent results to
explicit centering on dense arrays.
"""
pbmc = sc.datasets.pbmc3k()
pbmc.X = pbmc.X.astype(np.float64)
sc.pp.filter_genes(pbmc, min_cells=1)
sc.pp.log1p(pbmc)
pbmc = pbmc3k_normalized

pbmc_dense = pbmc.copy()
pbmc_dense.X = pbmc_dense.X.toarray()
Expand All @@ -90,3 +89,18 @@ def test_pca_sparse():
)
assert np.allclose(implicit.obsm['X_pca'], explicit.obsm['X_pca'])
assert np.allclose(implicit.varm['PCs'], explicit.varm['PCs'])


# This will take a while to run, but irreproducibility may
# not show up for float32 unless the matrix is large enough
def test_pca_reproducible(pbmc3k_normalized, array_type, float_dtype):
    """PCA must give equal results for equal seeds and differ across seeds.

    Runs ``sc.pp.pca`` three times on the same data (converted with
    ``array_type`` and computed with ``float_dtype``): twice with
    ``random_state=42`` and once with ``random_state=0``.
    """
    adata = pbmc3k_normalized
    adata.X = array_type(adata.X)

    def run_pca(seed):
        # copy=True leaves `adata` untouched, so every run sees the same input.
        return sc.pp.pca(adata, copy=True, dtype=float_dtype, random_state=seed)

    first_run = run_pca(42)
    repeat_run = run_pca(42)
    other_seed_run = run_pca(0)

    # Same seed: the whole result object must compare equal.
    assert_equal(first_run, repeat_run)
    # Different seed: the embedding must actually change.
    assert not np.array_equal(
        first_run.obsm["X_pca"], other_seed_run.obsm["X_pca"]
    )

0 comments on commit 46a83a9

Please sign in to comment.