Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adds test for zero genes to pca #144

Merged
merged 2 commits into from
Mar 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/release-notes/0.10.0.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
```
* Fixes an issue where `pp.normalize` and `pp.log1p` now use `copy` and `inplace` corretly {pr}`129` {smaller}`S Dicks`
* changes the graph constructor for `tl.leiden` and `tl.louvain` {pr}`143` {smaller}`S Dicks`
* Added a test to handle zero features, that caused issues in the sparse `pp.pca` {pr}`144` {smaller}`S Dicks`

```{rubric} Removals
```
Expand Down
12 changes: 12 additions & 0 deletions src/rapids_singlecell/preprocessing/_kernels/_pca_sparse_kernel.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import cupy as cp
from cuml.common.kernel_utils import cuda_kernel_factory

cov_kernel_str = r"""
Expand Down Expand Up @@ -50,6 +51,17 @@
}
}
"""
check_zero_genes = r"""
extern "C" __global__ void check_zero_genes(const int* indices, int* genes, int nnz) {
int value = blockIdx.x * blockDim.x + threadIdx.x;
if(value >= nnz){
return;
}
atomicAdd(&genes[indices[value]], 1);

}
"""
_zero_genes_kernel = cp.RawKernel(check_zero_genes, "check_zero_genes")


def _cov_kernel(dtype):
Expand Down
24 changes: 24 additions & 0 deletions src/rapids_singlecell/preprocessing/_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ def fit(self, x):

if not isspmatrix_csr(x):
x = x.tocsr()
_check_matrix_for_zero_genes(x)
self.n_samples_ = x.shape[0]
self.n_features_in_ = x.shape[1] if x.ndim == 2 else 1
self.dtype = x.data.dtype
Expand Down Expand Up @@ -354,3 +355,26 @@ def _cov_sparse(x, return_gram=False, return_mean=False):
return cov_result, mean_x, mean_x
elif return_gram and return_mean:
return cov_result, gram_matrix, mean_x, mean_x


def _check_matrix_for_zero_genes(X):
gene_ex = cp.zeros(X.shape[1], dtype=cp.int32)

from ._kernels._pca_sparse_kernel import _zero_genes_kernel

block = (32,)
grid = (int(math.ceil(X.nnz / block[0])),)
_zero_genes_kernel(
grid,
block,
(
X.indices,
gene_ex,
X.nnz,
),
)
if cp.any(gene_ex == 0):
raise ValueError(
"There are genes with zero expression. "
"Please remove them before running PCA."
)
3 changes: 2 additions & 1 deletion src/rapids_singlecell/preprocessing/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ def _check_gpu_X(X):
else:
raise TypeError(
"The input is not a CuPy ndarray or CuPy sparse matrix. "
"Rapids-singlecell only supports GPU matrices, "
"Rapids-singlecell only supports GPU matrices in this function, "
"so your input must be either a CuPy ndarray or a CuPy sparse matrix. "
"Please checkout `rapids_singlecell.get.anndata_to_GPU` to convert your data to GPU. "
"If you're working with CPU-based matrices, please consider using Scanpy instead."
)
19 changes: 7 additions & 12 deletions tests/test_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,18 +137,6 @@ def test_pca_sparse():
atol=1e-6,
)

def test_mask_highly_var_error():
"""Check if use_highly_variable=True throws an error if the annotation is missing."""
adata = AnnData(np.array(A_list).astype("float32"))
with pytest.warns(
FutureWarning,
match=r"Argument `use_highly_variable` is deprecated, consider using the mask argument\.",
), pytest.raises(
ValueError,
match=r"Did not find `adata\.var\['highly_variable'\]`\.",
):
sc.pp.pca(adata, use_highly_variable=True)

def test_mask_length_error():
"""Check error for n_obs / mask length mismatch."""
adata = AnnData(np.array(A_list).astype("float32"))
Expand Down Expand Up @@ -244,3 +232,10 @@ def test_pca_layer():
)
np.testing.assert_almost_equal(X_adata.obsm["X_pca"], layer_adata.obsm["X_pca"])
np.testing.assert_almost_equal(X_adata.varm["PCs"], layer_adata.varm["PCs"])

def test_pca_layer_mask():
adata = sc.datasets.pbmc3k()[:,1000].copy()
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
with pytest.raises(ValueError,match="There are genes with zero expression. Please remove them before running PCA."):
rsc.pp.pca(adata)