scverse · PrimozGodec · Oct 3, 2019 · Aug 20, 2019 · Oct 7, 2019 · Oct 3, 2019
diff --git a/example.py b/example.py
diff --git a/scanpy/external/__init__.py b/scanpy/external/__init__.py
@@ -74,6 +74,14 @@
 
    tl.sandbag
    tl.cyclone
+
+Cell annotation
+~~~~~~~~~~~~~~~
+
+.. autosummary::
+   :toctree: .
+
+   tl.annotator
 
 
 Plotting: PL

diff --git a/scanpy/external/tl/__init__.py b/scanpy/external/tl/__init__.py
@@ -2,3 +2,4 @@
 from ._phate import phate
 from ._phenograph import phenograph
 from ._palantir import palantir
+from ._annotator import annotator
diff --git a/scanpy/external/tl/_annotator.py b/scanpy/external/tl/_annotator.py
@@ -0,0 +1,146 @@
+"""\
+Annotates gene expression (cell data) with cell types.
+"""
+import warnings
+
+from anndata import AnnData
+import pandas as pd
+
+
+def annotator(
+    adata: AnnData,
+    markers: pd.DataFrame,
+    num_genes: int = None,
+    return_nonzero_annotations: bool = True,
+    p_threshold: float = 0.05,
+    p_value_fun: str = "binom",
+    z_threshold: float = 1.0,
+    scoring: str = "exp_ratio",
+    normalize: bool = False,
+):
+    """\
+    Annotator marks the data with cell type annotations based on marker genes.
+
+    Over-expressed genes are selected with the Mann-Whitney U tests and cell
+    types are assigned with the hypergeometric test. This function first selects
+    genes from gene expression data with the Mann-Whitney U test, then annotate
+    them with the hypergeometric test, and finally filter out cell types that
+    have zero scores for all cells. The results are scores that tell how
+    probable is each cell type for each cell.
+
+    Parameters
+    ----------
+    adata
+        Tabular data with gene expressions.
+    markers
+        The data-frame with marker genes and cell types. Data-frame has two
+        columns **Gene** and **Cell Type** first holds gene names or ID and
+        second cell type for this gene. Gene names must be written in the same
+        format than genes in `adata`.
+    num_genes
+        The number of genes that the organism has.
+    return_nonzero_annotations
+        If true return scores only for cell types that have no zero scores.
+    p_threshold
+        A threshold for accepting the annotations. Annotations that have FDR
+        value bellow this threshold are used.
+    p_value_fun
+        A function that calculates a p-value. It can be either
+        `binom` that uses binom.sf or
+        `hypergeom` that uses hypergeom.sf.
+    z_threshold
+        The threshold for selecting the gene from gene expression data.
+        For each cell the attributes with z-value above this value are selected.
+    scoring
+        Scoring method for cell type scores. Available scores are:
+
+        exp_ratio
+            Proportion of genes typical for a cell type expressed in the cell
+        sum_of_expressed_markers
+            Sum of expressions of genes typical for a cell type
+        log_fdr
+            Negative of the logarithm of an false discovery rate (FDR) value
+        log_p_value
+            Negative of the logarithm of a p-value
+    normalize : bool, optional (default = False)
+        If this parameter is True data will be normalized during the
+        a process with a log CPM normalization.
+        That method works correctly data needs to be normalized.
+        Set this `normalize` on True if your data are not normalized already.
+
+    Returns
+    -------
+    pd.DataFrame
+        Cell type for each cell for each cell. The result is a sore matrix that
+        tells how probable is each cell type for each cell. Columns are cell
+        types and rows are cells.
+
+    Example
+    -------
+
+    Here is the example of annotation of dendritic cells based on their gene
+    expressions. For annotation, we use data by Villani et al. (2017)[1] and
+    marker genes by Franzén et al. (2019)[2].
+
+    [1]  Villani, A. C., Satija, ... Jardine, L. (2017). Single-cell
+         RNA-seq reveals new types of human blood dendritic cells, monocytes,
+         and progenitors. Science, 356(6335).
+
+    [2]  Oscar Franzén, Li-Ming Gan, Johan L M Björkegren, PanglaoDB:
+         a web server for exploration of mouse and human single-cell RNA
+         sequencing data, Database, Volume 2019, 2019.
+
+    >>> import pandas as pd
+    >>> from scanpy import AnnData
+    >>> from scanpy.external.tl import annotator
+    >>> import urllib.request
+    >>>
+    >>> # download data in a temporary directory
+    >>> file_name_data, _ = urllib.request.urlretrieve(
+    ...     "https://github.com/biolab/cell-annotation/releases/download/"
+    ...     "0.1.0/DC_expMatrix_DCnMono.csv.gz")
+    >>> file_name_markers, _ = urllib.request.urlretrieve(
+    ...     "https://github.com/biolab/cell-annotation/releases/download/"
+    ...     "0.1.0/panglao_gene_markers_human.csv.gz")
+    >>>
+    >>> # read data with pandas
+    >>> df = pd.read_csv(file_name_data, compression="gzip").iloc[:, :-2]
+    >>> df_markers = pd.read_csv(file_name_markers, compression="gzip")
+    >>>
+    >>> # transform data to AnnData
+    >>> anndata = AnnData(df.values, var=df.columns.values)
+    >>>
+    >>> # run annotation
+    >>> scores = annotator(anndata, df_markers, normalize=True)
+    """
+
+    try:
+        from pointannotator.annotate_samples import AnnotateSamples
+    except ImportError:
+        raise ImportError(
+            'Please install point-annotator: \n\t' 'pip install point-annotator'
+        )
+
+    data_df = pd.DataFrame(adata.X, columns=adata.var.values.flatten())
+    if num_genes is None:
+        num_genes = data_df.shape[1]
+        warnings.warn(
+            "The number of\norganisms genes is not provided. It is "
+            "currently\nset to the number of genes of the dataset.\n"
+            "If you want to change it please set `num_genes` "
+            "parameter."
+        )
+
+    annotations = AnnotateSamples.annotate_samples(
+        data_df,
+        markers,
+        num_genes=num_genes,
+        return_nonzero_annotations=return_nonzero_annotations,
+        p_threshold=p_threshold,
+        p_value_fun=p_value_fun,
+        z_threshold=z_threshold,
+        scoring="scoring_" + scoring,
+        normalize=normalize,
+    )
+
+    return AnnData(annotations.values, var=annotations.columns.values)
diff --git a/scanpy/tests/test_annotator.py b/scanpy/tests/test_annotator.py
@@ -0,0 +1,158 @@
+import unittest
+from importlib.util import find_spec
+
+import pandas as pd
+import numpy as np
+from scanpy import AnnData
+from scanpy.external.tl import annotator
+
+import pytest
+
+
+@pytest.mark.skipif(
+    find_spec('pointannotator') is None, reason="point-annotator not installed"
+)
+class AnnotatorTests(unittest.TestCase):
+    def setUp(self):
+        self.markers = pd.DataFrame(
+            [
+                ["Type 1", "111"],
+                ["Type 1", "112"],
+                ["Type 1", "113"],
+                ["Type 1", "114"],
+                ["Type 2", "211"],
+                ["Type 2", "212"],
+                ["Type 2", "213"],
+                ["Type 2", "214"],
+            ],
+            columns=["Cell Type", "Gene"],
+        )
+
+        genes = ["111", "112", "113", "114", "211", "212", "213", "214"]
+        self.data = pd.DataFrame(
+            np.array(
+                [
+                    [1, 1, 1, 1.1, 0, 0, 0, 0],
+                    [1, 0.8, 0.9, 1, 0, 0, 0, 0],
+                    [0.7, 1.1, 1, 1.2, 0, 0, 0, 0],
+                    [0.8, 0.7, 1.1, 1, 0, 0.1, 0, 0],
+                    [0, 0, 0, 0, 1.05, 1.05, 1.1, 1],
+                    [0, 0, 0, 0, 1.1, 1.0, 1.05, 1.1],
+                    [0, 0, 0, 0, 1.05, 0.9, 1.1, 1.1],
+                    [0, 0, 0, 0, 0.9, 0.9, 1.2, 1],
+                ]
+            ),
+            columns=genes,
+        )
+
+        # transform data to AnnData
+        self.anndata = AnnData(self.data.values, var=self.data.columns.values)
+
+    def basic_check(self, annotations):
+        self.assertEqual(type(annotations), AnnData)
+        self.assertEqual(len(annotations), len(self.anndata))
+        self.assertTupleEqual(
+            annotations.shape, (8, 2)
+        )  # two types in the data
+        self.assertGreater(np.nansum(annotations.X), 0)
+        self.assertLessEqual(np.nanmax(annotations.X), 1)
+        self.assertGreaterEqual(np.nanmin(annotations.X), 0)
+
+    def test_annotator(self):
+        annotations = annotator(
+            self.anndata, self.markers, normalize=False, num_genes=15
+        )
+
+        self.basic_check(annotations)
+
+    def test_remove_empty_column(self):
+        """
+        Type 3 column must be removed here, since this cell type does not
+        belong to any cell.
+        """
+        additinal_markers = pd.DataFrame(
+            [["Type 3", "311"], ["Type 3", "312"], ["Type 3", "313"]],
+            columns=["Cell Type", "Gene"],
+        )
+        markers = self.markers.append(additinal_markers)
+
+        annotations = annotator(self.anndata, markers, num_genes=20)
+
+        self.basic_check(annotations)
+
+        annotations = annotator(
+            self.anndata,
+            markers,
+            num_genes=20,
+            return_nonzero_annotations=False,
+        )
+        self.assertEqual(len(annotations), len(self.anndata))
+        self.assertTupleEqual(
+            annotations.shape, (8, 3)
+        )  # two types in the data
+        self.assertGreater(np.nansum(annotations.X), 0)
+        self.assertLessEqual(np.nanmax(annotations.X), 1)
+        self.assertGreaterEqual(np.nanmin(annotations.X), 0)
+
+    def test_sf(self):
+        """
+        Test annotations with hypergeom.sf
+        """
+        annotations = annotator(
+            self.anndata, self.markers, num_genes=15, p_value_fun="hypergeom"
+        )
+
+        self.basic_check(annotations)
+
+    def test_scoring(self):
+        # scoring SCORING_EXP_RATIO
+        annotations = annotator(
+            self.anndata, self.markers, num_genes=15, scoring="exp_ratio"
+        )
+
+        self.basic_check(annotations)
+
+        # scoring SCORING_MARKERS_SUM
+        annotations = annotator(
+            self.anndata,
+            self.markers,
+            num_genes=15,
+            scoring="sum_of_expressed_markers",
+        )
+
+        self.assertEqual(type(annotations), AnnData)
+        self.assertEqual(len(annotations), len(self.anndata))
+        self.assertTupleEqual(
+            annotations.shape, (8, 2)
+        )  # two types in the data
+
+        # based on provided data it should match
+        # the third row is skipped, since it is special
+        self.assertAlmostEqual(
+            annotations.X[0, 0], self.data.iloc[0].sum(), places=6
+        )
+        self.assertAlmostEqual(
+            annotations.X[5, 1], self.data.iloc[5].sum(), places=6
+        )
+
+        # scoring SCORING_LOG_FDR
+        annotations = annotator(
+            self.anndata, self.markers, num_genes=15, scoring="log_fdr"
+        )
+
+        self.assertEqual(type(annotations), AnnData)
+        self.assertEqual(len(annotations), len(self.anndata))
+        self.assertTupleEqual(
+            annotations.shape, (8, 2)
+        )  # two types in the data
+
+        # scoring SCORING_LOG_PVALUE
+        annotations = annotator(
+            self.anndata, self.markers, num_genes=15, scoring="log_p_value"
+        )
+
+        self.assertEqual(type(annotations), AnnData)
+        self.assertEqual(len(annotations), len(self.anndata))
+        self.assertTupleEqual(
+            annotations.shape, (8, 2)
+        )  # two types in the data
diff --git a/setup.py b/setup.py
@@ -38,7 +38,15 @@
             'sphinx_autodoc_typehints',
             'scanpydoc',
         ],
-        test=['pytest>=4.4', 'dask[array]', 'fsspec', 'zappy', 'zarr', 'black'],
+        test=[
+            'pytest>=4.4',
+            'dask[array]',
+            'fsspec',
+            'zappy',
+            'zarr',
+            'black',
+            'point-annotator',
+        ],
     ),
     packages=find_packages(),
     entry_points=dict(console_scripts=['scanpy=scanpy.cli:console_main']),