Fix #20.

scverse · Mar 30, 2020 · 8ce1265 · 8ce1265
1 parent 22babe4
commit 8ce1265
Show file tree

Hide file tree

Showing 7 changed files with 195 additions and 72 deletions.
diff --git a/docs/api.rst b/docs/api.rst
@@ -59,6 +59,7 @@ Tools: `tl`
    clonotype_network
    chain_pairing
    clip_and_count
+   clonal_expansion
    alpha_diversity
    group_abundance
    spectratype

diff --git a/docs/tutorials/tutorial_3k_tcr.md b/docs/tutorials/tutorial_3k_tcr.md
@@ -147,7 +147,7 @@ sc.pl.umap(adata, color=["sample", "patient", "cluster", "CD8A", "CD4", "FOXP3"]
 ## TCR Quality Control
 
 <!-- #raw raw_mimetype="text/restructuredtext" -->
-WWhile most of T cell receptors have exactly one pair of α and β chains, up to one third of 
+While most of T cell receptors have exactly one pair of α and β chains, up to one third of 
 T cells can have *dual TCRs*, i.e. two pairs of receptors originating from different alleles (:cite:`Schuldt2019`).
 
 Using the :func:`scirpy.tl.chain_pairing` function, we can add a summary
@@ -261,15 +261,19 @@ ir.tl.clonotype_network(adata, min_size=2)
 ir.pl.clonotype_network(adata, color="clonotype", legend_loc="none")
 ```
 
-Let's re-compute the network with a `cutoff` of `20`.
-That's the equivalent of 4 `R`s mutating into `N` (using the BLOSUM62 distance matrix).
+Let's re-compute the network with a `cutoff` of `15`.
+That's the equivalent of 3 `R`s mutating into `N` (using the BLOSUM62 distance matrix).
 
 Additionally, we set `chains` to `all`. This results in the distances not being only
 computed between the most abundant pair of T-cell receptors, but instead, will
 take the minimal distance between any pair of T-cell receptors.
 
 ```python
-ir.pp.tcr_neighbors(adata, cutoff=20, chains="all")
+sc.settings.verbosity = 4
+```
+
+```python
+ir.pp.tcr_neighbors(adata, cutoff=15, chains="all")
 ir.tl.define_clonotypes(adata)
 ```
 
@@ -294,18 +298,37 @@ it is shared across tissues and/or patients.
 ir.pl.clonotype_network(adata, color="sample")
 ```
 
-Next, visualize the clonal expansion by cell-type cluster
+## Clonotype analysis
+
+Let's visualize the number of expanded clonotypes (i.e. clonotypes consisting
+of more than one cell) by cell-type: 
+
+```python
+ir.tl.clip_and_count(adata, groupby="cluster", target_col="clonotype")
+```
+
+```python
+sc.pl.umap(adata, color=["clonotype_clipped_count", "cluster"])
+```
+
+```python
+ir.pl.clip_and_count(adata, groupby="cluster", target_col="clonotype", fraction=True)
+```
 
 ```python
 ir.pl.clonal_expansion(adata, groupby="cluster", clip_at=4, fraction=False)
 ```
 
-Normalized to the cluster size
+Normalized to the cluster size:
 
 ```python
 ir.pl.clonal_expansion(adata, "cluster")
 ```
 
+Expectedly, the CD8+ effector T cells have the largest fraction of expanded clonotypes. 
+
+Consistent with this observation, they have the lowest alpha diversity: 
+
 ```python
 ir.pl.alpha_diversity(adata, groupby="cluster")
 ```
@@ -314,7 +337,13 @@ ir.pl.alpha_diversity(adata, groupby="cluster")
 
 ```python
 ir.pl.group_abundance(
-    adata, groupby="clonotype", target_col="cluster", max_cols=10, fraction=False
+    adata, groupby="clonotype", target_col="cluster", max_cols=10, fraction="patient"
+)
+```
+
+```python
+ir.pl.group_abundance(
+    adata, groupby="clonotype", target_col="patient", max_cols=10, fraction="patient"
 )
 ```
 

diff --git a/scirpy/_plotting/_clip_and_count.py b/scirpy/_plotting/_clip_and_count.py
@@ -1,6 +1,35 @@
 from .. import tl
 from anndata import AnnData
 from . import base
+from .._util import _normalize_counts, _is_na
+
+
+def _prepare_df(adata, groupby, target_col, clip_at, fraction):
+    """Turn the result of the `clip_and_count` tool into a plottable
+    dataframe"""
+    tmp_col = target_col + "clipped_count"
+    tmp_col_weight = target_col + "weight"
+
+    obs = adata.obs.loc[:, [groupby, target_col]]
+    obs[tmp_col] = tl.clip_and_count(
+        adata, target_col, groupby=groupby, clip_at=clip_at, inplace=False
+    )
+    # filter NA values
+    obs = obs.loc[~_is_na(obs[target_col]), :]
+
+    # add normalization vector
+    size_vector = _normalize_counts(obs, fraction, groupby)
+    obs[tmp_col_weight] = size_vector
+
+    obs = (
+        obs.groupby([groupby, tmp_col], observed=True)[tmp_col_weight]
+        .sum()
+        .reset_index()
+        .pivot(index=groupby, columns=tmp_col, values=tmp_col_weight)
+        .fillna(0)
+    )
+
+    return obs
 
 
 def clip_and_count(
@@ -12,8 +41,10 @@ def clip_and_count(
     fraction: bool = True,
     **kwargs,
 ):
-    """Plots the the number of identical entries in `target_col` 
-    for each group in `group_by`. 
+    """Plots the the *number of cells* in `target_col` that fall into 
+    a certain count bin for each group in `group_by`. 
+
+    Removes all entries with `NaN` in `target_col` prior to plotting. 
 
     Parameters
     ----------
@@ -32,11 +63,9 @@ def clip_and_count(
     **kwargs
         Additional arguments passed to :meth:`base.bar`
     """
-    res = tl.clip_and_count(
-        adata, groupby, target_col, clip_at=clip_at, fraction=fraction
-    )
+    plot_df = _prepare_df(adata, groupby, target_col, clip_at, fraction)
 
-    return base.bar(res, **kwargs)
+    return base.bar(plot_df, **kwargs)
 
 
 def clonal_expansion(

diff --git a/scirpy/_tools/__init__.py b/scirpy/_tools/__init__.py
@@ -1,5 +1,5 @@
 from ._chain_pairing import chain_pairing
-from ._clip_and_count import clip_and_count
+from ._clip_and_count import clip_and_count, clonal_expansion
 from ._diversity import alpha_diversity
 from ._group_abundance import group_abundance
 from ._spectratype import spectratype

diff --git a/scirpy/_tools/_clip_and_count.py b/scirpy/_tools/_clip_and_count.py
@@ -3,59 +3,109 @@
 from .._util import _is_na
 import numpy as np
 import pandas as pd
+from typing import Union
 
 
 def clip_and_count(
     adata: AnnData,
-    groupby: str,
     target_col: str,
     *,
+    groupby: Union[str, None] = None,
     clip_at: int = 3,
+    inplace: bool = True,
+    key_added: Union[str, None] = None,
     fraction: bool = True,
-) -> Dict:
+) -> Union[None, np.ndarray]:
     """Counts the number of identical entries in `target_col` 
     for each group in `group_by`. 
 
-    Ignores NaN values. 
-    
+    Counts NaN values like any other value in `target_col`. 
+
     Parameters
     ----------
     adata
         AnnData object to work on
-    groupby
-        Group by this column from `obs`
     target_col
         Column to count on.
+    groupby
+        Calculate counts within groups (e.g. sample or patient) 
     clip_at
         All entries in `target_col` with more copies than `clip_at`
         will be summarized into a single group.         
-    fraction
-        If True, compute fractions rather than reporting
-        abosolute numbers.
+    inplace
+        If True, adds a column to `obs`. Otherwise returns an array 
+        with the clipped counts. 
+    key_added
+        Key under which the results will be stored in `obs`. Defaults
+        to `{target_col}_clipped_count`
 
     Returns
     -------
-    Dictionary with counts/fractions per group 
+    Depending on the value of inplace, adds a column to adata or returns
+    an array with the clipped count per cell. 
     """
     if target_col not in adata.obs.columns:
         raise ValueError("`target_col` not found in obs.")
-    # count abundance of each clonotype
-    tcr_obs = adata.obs.loc[~_is_na(adata.obs[target_col]), :]
+
+    groupby = [groupby] if isinstance(groupby, str) else groupby
+    groupby_cols = [target_col] if groupby is None else groupby + [target_col]
     clonotype_counts = (
-        tcr_obs.groupby([groupby, target_col]).size().reset_index(name="count")
+        adata.obs.groupby(groupby_cols)
+        .size()
+        .reset_index(name="tmp_count")
+        .assign(
+            tmp_count=lambda X: [
+                ">= {}".format(min(n, clip_at)) if n >= clip_at else str(n)
+                for n in X["tmp_count"].values
+            ]
+        )
+    )
+    clipped_count = adata.obs.merge(clonotype_counts, how="left", on=groupby_cols)[
+        "tmp_count"
+    ].values
+
+    if inplace:
+        key_added = (
+            "{}_clipped_count".format(target_col) if key_added is None else key_added
+        )
+        adata.obs[key_added] = clipped_count
+    else:
+        return clipped_count
+
+
+def clonal_expansion(
+    adata: AnnData,
+    target_col: str = "clonotype",
+    *,
+    groupby: Union[str, None] = None,
+    clip_at: int = 3,
+    key_added: str = "clonal_expansion",
+    **kwargs,
+) -> Union[None, np.ndarray]:
+    """Adds a column to obs which clonotypes are expanded. 
+
+    Parameters
+    ----------
+    adata
+        Annoated data matrix
+    target_col
+        Column containing the clontype annoataion
+    groupby
+        Calculate clonal expansion within groups. Usually makes sense to set
+        this to the column containing sample annotation. 
+    clip_at:
+        All clonotypes with more than `clip_at` clones will be summarized into 
+        a single category
+    key_added
+        Key under which the results will be added to `obs`. 
+    kwargs
+        Additional arguments passed to :func:`scirpy.tl.clip_and_count`. 
+    """
+    return clip_and_count(
+        adata,
+        target_col,
+        groupby=groupby,
+        clip_at=clip_at,
+        key_added=key_added,
+        **kwargs,
     )
-    clonotype_counts.loc[clonotype_counts["count"] >= clip_at, "count"] = clip_at
-
-    result_dict = dict()
-    for group in clonotype_counts[groupby].unique():
-        result_dict[group] = dict()
-        for n in range(1, clip_at + 1):
-            label = ">= {}".format(n) if n == clip_at else str(n)
-            mask_group = clonotype_counts[groupby] == group
-            mask_count = clonotype_counts["count"] == n
-            tmp_count = np.sum(mask_group & mask_count)
-            if fraction:
-                tmp_count /= np.sum(mask_group)
-            result_dict[group][label] = tmp_count
-
-    return pd.DataFrame.from_dict(result_dict, orient="index")
diff --git a/tests/test_plotting.py b/tests/test_plotting.py
@@ -10,12 +10,36 @@
     adata_vdj,
     adata_clonotype_network,
 )
+import scirpy._plotting as irplt
 import matplotlib.pyplot as plt
 import numpy.testing as npt
+import pandas.testing as pdt
 import numpy as np
 
 
-def test_clip_and_count(adata_tra):
+def test_clip_and_count(adata_tra, adata_clonotype):
+    res_fraction = irplt._clip_and_count._prepare_df(
+        adata_clonotype, "group", "clonotype", 2, True
+    )
+    res_counts = irplt._clip_and_count._prepare_df(
+        adata_clonotype, "group", "clonotype", 2, False
+    )
+    pdt.assert_frame_equal(
+        res_fraction,
+        pd.DataFrame.from_dict(
+            {"group": ["A", "B"], "1": [0, 3 / 5], ">= 2": [1.0, 2 / 5]}
+        ).set_index("group"),
+        check_names=False,
+    )
+    pdt.assert_frame_equal(
+        res_counts,
+        pd.DataFrame.from_dict(
+            {"group": ["A", "B"], "1": [0, 3], ">= 2": [3, 2]}
+        ).set_index("group"),
+        check_names=False,
+        check_dtype=False,
+    )
+
     p = pl.clip_and_count(
         adata_tra, target_col="TRA_1_cdr3", groupby="sample", fraction=False
     )
@@ -41,10 +65,12 @@ def test_spectratype(adata_tra):
     p = pl.spectratype(adata_tra, target_col="sample")
     assert isinstance(p, plt.Axes)
 
+
 def test_vdj_usage(adata_vdj):
     p = pl.vdj_usage(adata_vdj, fraction="sample")
     assert isinstance(p, plt.Axes)
 
+
 def test_clonotype_network(adata_clonotype_network):
     p = pl.clonotype_network(adata_clonotype_network)
     assert isinstance(p[0], plt.Axes)