Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

The method for annotating genes with cell types #812

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Empty file added example.py
Empty file.
8 changes: 8 additions & 0 deletions scanpy/external/__init__.py
Expand Up @@ -74,6 +74,14 @@

tl.sandbag
tl.cyclone

Cell annotation
~~~~~~~~~~~~~~~

.. autosummary::
:toctree: .

tl.annotator


Plotting: PL
Expand Down
1 change: 1 addition & 0 deletions scanpy/external/tl/__init__.py
Expand Up @@ -2,3 +2,4 @@
from ._phate import phate
from ._phenograph import phenograph
from ._palantir import palantir
from ._annotator import annotator
146 changes: 146 additions & 0 deletions scanpy/external/tl/_annotator.py
@@ -0,0 +1,146 @@
"""\
Annotates gene expression (cell data) with cell types.
"""
import warnings
from typing import Optional

from anndata import AnnData
import pandas as pd


def annotator(
    adata: AnnData,
    markers: pd.DataFrame,
    num_genes: Optional[int] = None,
    return_nonzero_annotations: bool = True,
    p_threshold: float = 0.05,
    p_value_fun: str = "binom",
    z_threshold: float = 1.0,
    scoring: str = "exp_ratio",
    normalize: bool = False,
) -> AnnData:
    """\
    Annotator marks the data with cell type annotations based on marker genes.

    Over-expressed genes are selected with the Mann-Whitney U test and cell
    types are assigned with the hypergeometric test. This function first
    selects genes from gene expression data with the Mann-Whitney U test, then
    annotates them with the hypergeometric test, and finally filters out cell
    types that have zero scores for all cells. The results are scores that
    tell how probable each cell type is for each cell.

    The computation is delegated to the `point-annotator` package, which must
    be installed separately.

    Parameters
    ----------
    adata
        Tabular data with gene expressions. Gene names are read from the
        values stored in `adata.var`. If `adata.X` is a SciPy sparse matrix
        it is converted to a dense array before annotation.
    markers
        The data-frame with marker genes and cell types. The data-frame has
        two columns, **Gene** and **Cell Type**: the first holds gene names
        or IDs and the second the cell type for this gene. Gene names must be
        written in the same format as the genes in `adata`.
    num_genes
        The number of genes that the organism has. If `None`, the number of
        genes in `adata` is used and a warning is issued.
    return_nonzero_annotations
        If `True`, cell types whose scores are zero for all cells are left
        out of the result.
    p_threshold
        A threshold for accepting the annotations. Annotations that have an
        FDR value below this threshold are used.
    p_value_fun
        A function that calculates a p-value. It can be either
        `binom` that uses binom.sf or
        `hypergeom` that uses hypergeom.sf.
    z_threshold
        The threshold for selecting the gene from gene expression data.
        For each cell the attributes with z-value above this value are
        selected.
    scoring
        Scoring method for cell type scores. Available scores are:

        exp_ratio
            Proportion of genes typical for a cell type expressed in the cell
        sum_of_expressed_markers
            Sum of expressions of genes typical for a cell type
        log_fdr
            Negative of the logarithm of a false discovery rate (FDR) value
        log_p_value
            Negative of the logarithm of a p-value
    normalize
        If `True`, the data is normalized with a log CPM normalization
        during the process. The method needs normalized data to work
        correctly, so set `normalize` to `True` if your data is not
        normalized already.

    Returns
    -------
    AnnData
        The score matrix that tells how probable each cell type is for each
        cell. Rows are cells and columns are cell types; the cell type names
        are passed to the result as `var`.

    Raises
    ------
    ImportError
        If the `point-annotator` package is not installed.

    Example
    -------

    Here is the example of annotation of dendritic cells based on their gene
    expressions. For annotation, we use data by Villani et al. (2017)[1] and
    marker genes by Franzén et al. (2019)[2].

    [1] Villani, A. C., Satija, ... Jardine, L. (2017). Single-cell
    RNA-seq reveals new types of human blood dendritic cells, monocytes,
    and progenitors. Science, 356(6335).

    [2] Oscar Franzén, Li-Ming Gan, Johan L M Björkegren, PanglaoDB:
    a web server for exploration of mouse and human single-cell RNA
    sequencing data, Database, Volume 2019, 2019.

    >>> import pandas as pd
    >>> from scanpy import AnnData
    >>> from scanpy.external.tl import annotator
    >>> import urllib.request
    >>>
    >>> # download data in a temporary directory
    >>> file_name_data, _ = urllib.request.urlretrieve(
    ...     "https://github.com/biolab/cell-annotation/releases/download/"
    ...     "0.1.0/DC_expMatrix_DCnMono.csv.gz")
    >>> file_name_markers, _ = urllib.request.urlretrieve(
    ...     "https://github.com/biolab/cell-annotation/releases/download/"
    ...     "0.1.0/panglao_gene_markers_human.csv.gz")
    >>>
    >>> # read data with pandas
    >>> df = pd.read_csv(file_name_data, compression="gzip").iloc[:, :-2]
    >>> df_markers = pd.read_csv(file_name_markers, compression="gzip")
    >>>
    >>> # transform data to AnnData
    >>> anndata = AnnData(df.values, var=df.columns.values)
    >>>
    >>> # run annotation
    >>> scores = annotator(anndata, df_markers, normalize=True)
    """

    # Optional dependency: imported lazily so scanpy works without it.
    try:
        from pointannotator.annotate_samples import AnnotateSamples
    except ImportError as err:
        # Chain explicitly so the underlying import failure stays visible.
        raise ImportError(
            'Please install point-annotator: \n\t' 'pip install point-annotator'
        ) from err

    # Local import: only needed here to detect sparse expression matrices.
    from scipy.sparse import issparse

    expression = adata.X
    if issparse(expression):
        # The plain DataFrame constructor does not accept SciPy sparse
        # matrices, so densify before building the frame.
        expression = expression.toarray()

    data_df = pd.DataFrame(expression, columns=adata.var.values.flatten())
    if num_genes is None:
        # Fall back to the number of genes present in the data set.
        num_genes = data_df.shape[1]
        warnings.warn(
            "The number of\norganisms genes is not provided. It is "
            "currently\nset to the number of genes of the dataset.\n"
            "If you want to change it please set `num_genes` "
            "parameter."
        )

    annotations = AnnotateSamples.annotate_samples(
        data_df,
        markers,
        num_genes=num_genes,
        return_nonzero_annotations=return_nonzero_annotations,
        p_threshold=p_threshold,
        p_value_fun=p_value_fun,
        z_threshold=z_threshold,
        # the scoring name is forwarded with a "scoring_" prefix
        scoring="scoring_" + scoring,
        normalize=normalize,
    )

    return AnnData(annotations.values, var=annotations.columns.values)
158 changes: 158 additions & 0 deletions scanpy/tests/test_annotator.py
@@ -0,0 +1,158 @@
import unittest
from importlib.util import find_spec

import pandas as pd
import numpy as np
from scanpy import AnnData
from scanpy.external.tl import annotator

import pytest


@pytest.mark.skipif(
    find_spec('pointannotator') is None, reason="point-annotator not installed"
)
class AnnotatorTests(unittest.TestCase):
    """Tests for ``scanpy.external.tl.annotator`` on a small toy data set.

    The data set has eight cells and eight genes: the first four cells
    express the four "Type 1" marker genes and the last four cells express
    the four "Type 2" marker genes. The whole class is skipped when the
    optional ``pointannotator`` package cannot be found.
    """

    def setUp(self):
        """Build the marker table, the expression frame and its AnnData."""
        self.markers = pd.DataFrame(
            [
                ["Type 1", "111"],
                ["Type 1", "112"],
                ["Type 1", "113"],
                ["Type 1", "114"],
                ["Type 2", "211"],
                ["Type 2", "212"],
                ["Type 2", "213"],
                ["Type 2", "214"],
            ],
            columns=["Cell Type", "Gene"],
        )

        genes = ["111", "112", "113", "114", "211", "212", "213", "214"]
        self.data = pd.DataFrame(
            np.array(
                [
                    [1, 1, 1, 1.1, 0, 0, 0, 0],
                    [1, 0.8, 0.9, 1, 0, 0, 0, 0],
                    [0.7, 1.1, 1, 1.2, 0, 0, 0, 0],
                    [0.8, 0.7, 1.1, 1, 0, 0.1, 0, 0],
                    [0, 0, 0, 0, 1.05, 1.05, 1.1, 1],
                    [0, 0, 0, 0, 1.1, 1.0, 1.05, 1.1],
                    [0, 0, 0, 0, 1.05, 0.9, 1.1, 1.1],
                    [0, 0, 0, 0, 0.9, 0.9, 1.2, 1],
                ]
            ),
            columns=genes,
        )

        # transform data to AnnData
        self.anndata = AnnData(self.data.values, var=self.data.columns.values)

    def _check_container(self, annotations, n_types=2):
        """Assert the result is an AnnData with one row per input cell and
        ``n_types`` cell-type columns."""
        self.assertIsInstance(annotations, AnnData)
        self.assertEqual(len(annotations), len(self.anndata))
        self.assertTupleEqual(annotations.shape, (8, n_types))

    def _check_score_range(self, annotations):
        """Assert the scores are not all zero and lie within [0, 1]."""
        self.assertGreater(np.nansum(annotations.X), 0)
        self.assertLessEqual(np.nanmax(annotations.X), 1)
        self.assertGreaterEqual(np.nanmin(annotations.X), 0)

    def basic_check(self, annotations):
        """Check type, shape (two cell types in the data) and score range."""
        self._check_container(annotations, n_types=2)
        self._check_score_range(annotations)

    def test_annotator(self):
        annotations = annotator(
            self.anndata, self.markers, normalize=False, num_genes=15
        )

        self.basic_check(annotations)

    def test_remove_empty_column(self):
        """
        Type 3 column must be removed here, since this cell type does not
        belong to any cell.
        """
        additional_markers = pd.DataFrame(
            [["Type 3", "311"], ["Type 3", "312"], ["Type 3", "313"]],
            columns=["Cell Type", "Gene"],
        )
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent (the original index labels are kept, as before).
        markers = pd.concat([self.markers, additional_markers])

        annotations = annotator(self.anndata, markers, num_genes=20)

        self.basic_check(annotations)

        # With filtering disabled the all-zero "Type 3" column is kept,
        # so three cell types are expected.
        annotations = annotator(
            self.anndata,
            markers,
            num_genes=20,
            return_nonzero_annotations=False,
        )
        self._check_container(annotations, n_types=3)
        self._check_score_range(annotations)

    def test_sf(self):
        """
        Test annotations with hypergeom.sf
        """
        annotations = annotator(
            self.anndata, self.markers, num_genes=15, p_value_fun="hypergeom"
        )

        self.basic_check(annotations)

    def test_scoring(self):
        """Run every documented scoring method on the toy data."""
        # scoring SCORING_EXP_RATIO
        annotations = annotator(
            self.anndata, self.markers, num_genes=15, scoring="exp_ratio"
        )

        self.basic_check(annotations)

        # scoring SCORING_MARKERS_SUM
        annotations = annotator(
            self.anndata,
            self.markers,
            num_genes=15,
            scoring="sum_of_expressed_markers",
        )

        self._check_container(annotations, n_types=2)

        # based on provided data it should match
        # the third row is skipped, since it is special
        self.assertAlmostEqual(
            annotations.X[0, 0], self.data.iloc[0].sum(), places=6
        )
        self.assertAlmostEqual(
            annotations.X[5, 1], self.data.iloc[5].sum(), places=6
        )

        # scoring SCORING_LOG_FDR
        annotations = annotator(
            self.anndata, self.markers, num_genes=15, scoring="log_fdr"
        )

        self._check_container(annotations, n_types=2)

        # scoring SCORING_LOG_PVALUE
        annotations = annotator(
            self.anndata, self.markers, num_genes=15, scoring="log_p_value"
        )

        self._check_container(annotations, n_types=2)
10 changes: 9 additions & 1 deletion setup.py
Expand Up @@ -38,7 +38,15 @@
'sphinx_autodoc_typehints',
'scanpydoc',
],
test=['pytest>=4.4', 'dask[array]', 'fsspec', 'zappy', 'zarr', 'black'],
test=[
'pytest>=4.4',
'dask[array]',
'fsspec',
'zappy',
'zarr',
'black',
'point-annotator',
],
),
packages=find_packages(),
entry_points=dict(console_scripts=['scanpy=scanpy.cli:console_main']),
Expand Down