diff --git a/docs/api.rst b/docs/api.rst index 11071517a..2112bf4ba 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -62,6 +62,7 @@ Tools: `tl` alpha_diversity group_abundance spectratype + vdj_usage Plotting: `pl` @@ -90,6 +91,7 @@ Base plotting functions: `pl.base` clonal_expansion group_abundance spectratype + vdj_usage clonotype_network diff --git a/docs/tutorials/tutorial_3k_tcr.md b/docs/tutorials/tutorial_3k_tcr.md index 08be8fe73..9a117eb0f 100644 --- a/docs/tutorials/tutorial_3k_tcr.md +++ b/docs/tutorials/tutorial_3k_tcr.md @@ -6,18 +6,19 @@ jupyter: text_representation: extension: .md format_name: markdown - format_version: '1.2' + format_version: "1.2" jupytext_version: 1.4.1 --- # Analysis of 3k T cells from cancer - + In this tutorial, we re-analize single-cell TCR/RNA-seq data from Wu et al (:cite:`Wu2020`) generated on the 10x Genomics VDJ platform. The original dataset consists of >140k T cells -from 14 treatment-naive patients across four different types of cancer. +from 14 treatment-naive patients across four different types of cancer. For this tutorial, to speed up computations, we use a downsampled version of 3k cells. + ```python @@ -34,7 +35,9 @@ from matplotlib import pyplot as plt ``` + The dataset ships with the `scirpy` package. We can conveniently load it from the `dataset` module: + ```python @@ -42,7 +45,9 @@ adata = ir.datasets.wu2020_3k() ``` + `adata` is a regular :class:`~anndata.AnnData` object: + ```python @@ -56,17 +61,20 @@ adata.obs ``` + .. note:: **Importing data** - `scirpy` supports importing TCR data from Cellranger or `TraCeR `_. - See :ref:`api-io` for more details. +`scirpy` supports importing TCR data from Cellranger or `TraCeR `\_. +See :ref:`api-io` for more details. + +This particular dataset has been imported using :func:`scirpy.read_10x_vdj_csv` and merged +with transcriptomics data using :func:`scirpy.pp.merge_with_tcr`. The exact procedure +is described in :func:`scirpy.datasets.wu2020`. - This particular dataset has been imported using :func:`scirpy.read_10x_vdj_csv` and merged - with transcriptomics data using :func:`scirpy.pp.merge_with_tcr`. The exact procedure - is described in :func:`scirpy.datasets.wu2020`. ## Preprocess Transcriptomics data + Transcriptomics data needs to be filtered and preprocessed as with any other single-cell dataset. We recommend following the [scanpy tutorial](https://scanpy-tutorials.readthedocs.io/en/latest/pbmc3k.html) and the best practice paper by [Luecken et al.](https://www.embopress.org/doi/10.15252/msb.20188746) @@ -81,9 +89,9 @@ sc.pp.normalize_per_cell(adata, counts_per_cell_after=1000) sc.pp.log1p(adata) ``` -For the *Wu2020* dataset, the authors already provide clusters and UMAP coordinates. -Instead of performing clustering and cluster annotation ourselves, we will just use -provided data. +For the _Wu2020_ dataset, the authors already provide clusters and UMAP coordinates. +Instead of performing clustering and cluster annotation ourselves, we will just use +provided data. ```python adata.obsm["X_umap"] = adata.obsm["X_umap_orig"] @@ -111,11 +119,11 @@ mapping = { adata.obs["cluster"] = [mapping[x] for x in adata.obs["cluster_orig"]] ``` -Let's inspect the UMAP plots. The first three panels show the UMAP plot colored by sample, patient and cluster. -We don't observe any clustering of samples or patients that could hint at batch effects. +Let's inspect the UMAP plots. The first three panels show the UMAP plot colored by sample, patient and cluster. +We don't observe any clustering of samples or patients that could hint at batch effects. -The lower three panels show the UMAP colored by the T cell markers *CD8*, *CD4*, and *FOXP3*. -We can confirm that the markers correspond to their respective cluster labels. +The lower three panels show the UMAP colored by the T cell markers _CD8_, _CD4_, and _FOXP3_. +We can confirm that the markers correspond to their respective cluster labels. ```python sc.pl.umap(adata, color=["sample", "patient", "cluster", "CD8A", "CD4", "FOXP3"], ncols=2, wspace=.5) @@ -124,18 +132,19 @@ sc.pl.umap(adata, color=["sample", "patient", "cluster", "CD8A", "CD4", "FOXP3"] ## TCR Quality Control -While most T cell receptors have exactely one pair of α and β chains, it has been reported that up to one third of -the receptors can be *Dual TCRs* posessing a second pair of receptors, one originating from each Allele (:cite:`Schuldt2019`). + +While most T cell receptors have exactely one pair of α and β chains, it has been reported that up to one third of +the receptors can be _Dual TCRs_ posessing a second pair of receptors, one originating from each Allele (:cite:`Schuldt2019`). Using the :func:`scirpy.tl.chain_pairing` function, we can add a summary -about the T cell receptor compositions to `adata.obs`. We can visualize it using :func:`scirpy.pl.group_abundance`. +about the T cell receptor compositions to `adata.obs`. We can visualize it using :func:`scirpy.pl.group_abundance`. .. note:: **chain pairing** - * *Orphan chain* refers to cells that have a single alpha or beta receptor chain, respectively. - * *Extra chain* refers to cells that have a full alpha/beta receptor pair, and an additional chain. - * *Multichain* refers to cells with more than two receptor pairs detected. These cells are likely doublets. - +- _Orphan chain_ refers to cells that have a single alpha or beta receptor chain, respectively. +- _Extra chain_ refers to cells that have a full alpha/beta receptor pair, and an additional chain. +- _Multichain_ refers to cells with more than two receptor pairs detected. These cells are likely doublets. + ```python ir.tl.chain_pairing(adata) @@ -155,7 +164,7 @@ print("Fraction of cells with more than one pair of TCRs: {:.2f}".format( )) ``` -Next, we visualize the *Multichain* cells on the UMAP plot and exclude them from downstream analysis: +Next, we visualize the _Multichain_ cells on the UMAP plot and exclude them from downstream analysis: ```python sc.pl.umap(adata, color="multi_chain") @@ -167,61 +176,61 @@ adata = adata[adata.obs["multi_chain"] != "True", :].copy() ## Define clonotypes - -In this section, we will define and visualize clonotypes. -*Scirpy* implements a network-based approach for clonotype definition. The steps to -create and visualize the clonotype-network are anologous to the construction of a +In this section, we will define and visualize clonotypes. + +_Scirpy_ implements a network-based approach for clonotype definition. The steps to +create and visualize the clonotype-network are anologous to the construction of a neighborhood graph we already know from transcriptomics data: .. list-table:: Analysis steps on transcriptomics data - :widths: 40 60 - :header-rows: 1 - - * - scanpy function - - objective - * - :func:`scanpy.pp.neighbors` - - Compute a nearest-neighbor graph based on gene expression. - * - :func:`scanpy.tl.leiden` - - Cluster cells by the similarity of their transcriptional profiles. - * - :func:`scanpy.tl.umap` - - Compute positions of cells in UMAP embedding. - * - :func:`scanpy.pl.umap` - - Plot UMAP colored by different parameters. - - -.. list-table:: Analysis steps on TCR data - :widths: 40 60 - :header-rows: 1 - - * - scirpy function - - objective - * - :func:`scirpy.pp.tcr_neighbors` - - Compute a neighborhood graph of CDR3-sequences. - * - :func:`scirpy.tl.define_clonotypes` - - Cluster cells by the similarity of their CDR3-sequences. - * - :func:`scirpy.tl.clonotype_network` - - Compute positions of cells in clonotype network. - * - :func:`scirpy.pl.clonotype_network` - - Plot clonotype network colored by different parameters. +:widths: 40 60 +:header-rows: 1 + +- - scanpy function + - objective +- - :func:`scanpy.pp.neighbors` + - Compute a nearest-neighbor graph based on gene expression. +- - :func:`scanpy.tl.leiden` + - Cluster cells by the similarity of their transcriptional profiles. +- - :func:`scanpy.tl.umap` + - Compute positions of cells in UMAP embedding. +- - :func:`scanpy.pl.umap` + - Plot UMAP colored by different parameters. +.. list-table:: Analysis steps on TCR data +:widths: 40 60 +:header-rows: 1 + +- - scirpy function + - objective +- - :func:`scirpy.pp.tcr_neighbors` + - Compute a neighborhood graph of CDR3-sequences. +- - :func:`scirpy.tl.define_clonotypes` + - Cluster cells by the similarity of their CDR3-sequences. +- - :func:`scirpy.tl.clonotype_network` + - Compute positions of cells in clonotype network. +- - :func:`scirpy.pl.clonotype_network` + - Plot clonotype network colored by different parameters. ### Compute CDR3 neighborhood graph -:func:`scirpy.pp.tcr_neighbors` computes pairwise sequence alignments of all CDR3 sequences and -derives a distance from the alignment score. This approach was originally proposed as *TCRdist* by Dash et al. (:cite:`TCRdist`). + +:func:`scirpy.pp.tcr_neighbors` computes pairwise sequence alignments of all CDR3 sequences and +derives a distance from the alignment score. This approach was originally proposed as _TCRdist_ by Dash et al. (:cite:`TCRdist`). The function requires to specify a `cutoff` parameter. All cells with a distance between their -CDR3 sequences lower than `cutoff` will be connected in the network. In the first example, -we set the cutoff to `0`, to define clontypes as cells with **identical** CDR3 sequences. +CDR3 sequences lower than `cutoff` will be connected in the network. In the first example, +we set the cutoff to `0`, to define clontypes as cells with **identical** CDR3 sequences. Then, the function :func:`scirpy.tl.define_clonotypes` will detect connected modules in the graph and annotate them as clonotypes. This will add a `clonotype` and `clonotype_size` column to `adata.obs`. + ```python @@ -230,9 +239,11 @@ ir.tl.define_clonotypes(adata) ``` -To visualize the network we first call :func:`scirpy.tl.clonotype_network` to compute the layout. + +To visualize the network we first call :func:`scirpy.tl.clonotype_network` to compute the layout. We can then visualize it using :func:`scirpy.pl.clonotype_network`. We recommend setting the -`min_size` parameter to `>=2`, to prevent the singleton clonotypes from cluttering the network. +`min_size` parameter to `>=2`, to prevent the singleton clonotypes from cluttering the network. + ```python @@ -240,12 +251,12 @@ ir.tl.clonotype_network(adata, min_size=2) ir.pl.clonotype_network(adata, color="clonotype", legend_loc="none") ``` -Let's re-compute the network with a `cutoff` of `20`. -That's the equivalent of 4 `R`s mutating into `N` (using the BLOSUM62 distance matrix). +Let's re-compute the network with a `cutoff` of `20`. +That's the equivalent of 4 `R`s mutating into `N` (using the BLOSUM62 distance matrix). -Additionally, we set `chains` to `all`. This results in the distances not being only +Additionally, we set `chains` to `all`. This results in the distances not being only computed between the most abundant pair of T-cell receptors, but instead, will -take the minimal distance between any pair of T-cell receptors. +take the minimal distance between any pair of T-cell receptors. ```python ir.pp.tcr_neighbors(adata, cutoff=20, chains="all") @@ -257,17 +268,17 @@ ir.tl.clonotype_network(adata, min_size=3) ``` When coloring by clonotype, we can see that the large, connected Hairball has been sub-divided in multiple clonotypes by -Graph-based clustering using the *Leiden-algorithm*. Also, the edges are now colored according to the distance -between nodes. The darker an edge, the lower the alignment-distance. +Graph-based clustering using the _Leiden-algorithm_. Also, the edges are now colored according to the distance +between nodes. The darker an edge, the lower the alignment-distance. ```python ir.pl.clonotype_network(adata, color="clonotype", legend_fontoutline=2) ``` -Now we show the same graph, colored by sample. -We observe that for instance clonotypes 292 and 279 are *private*, i.e. they contain cells from -a single sample only. On the other hand, for instance clonotype 16 is *public*, i.e. -it is shared across tissues and/or patients. +Now we show the same graph, colored by sample. +We observe that for instance clonotypes 292 and 279 are _private_, i.e. they contain cells from +a single sample only. On the other hand, for instance clonotype 16 is _public_, i.e. +it is shared across tissues and/or patients. ```python ir.pl.clonotype_network(adata, color="sample") @@ -363,13 +374,9 @@ ir.pl.spectratype( ``` ```python - -``` - -```python - +ir.pl.vdj_usage(adata) ``` ```python - +ir.pl.vdj_usage(adata, full_combination=False) ``` diff --git a/scirpy/_plotting/__init__.py b/scirpy/_plotting/__init__.py index f7d6544c7..27bdb9f8d 100644 --- a/scirpy/_plotting/__init__.py +++ b/scirpy/_plotting/__init__.py @@ -7,5 +7,6 @@ # from ._cdr_convergence import cdr_convergence from ._group_abundance import group_abundance from ._spectratype import spectratype +from ._vdj_usage import vdj_usage from ._clonotypes import clonotype_network, COLORMAP_EDGES, clonotype_network_igraph diff --git a/scirpy/_plotting/_base.py b/scirpy/_plotting/_base.py index 9b229d3f5..690fb01bb 100644 --- a/scirpy/_plotting/_base.py +++ b/scirpy/_plotting/_base.py @@ -54,6 +54,7 @@ def bar( ------- Axes object """ + if ax is None: ax = _init_ax(fig_kws) ax = data.plot.bar(ax=ax, stacked=stacked) diff --git a/scirpy/_plotting/_vdj_usage.py b/scirpy/_plotting/_vdj_usage.py new file mode 100644 index 000000000..e164e72da --- /dev/null +++ b/scirpy/_plotting/_vdj_usage.py @@ -0,0 +1,267 @@ +from anndata import AnnData +import matplotlib.pyplot as plt +from typing import Callable, Union, Tuple, Sequence +import numpy as np +from .._util import _normalize_counts +from ._styling import _init_ax + + +def vdj_usage( + adata: AnnData, + *, + target_cols: list = [ + "TRA_1_j_gene", + "TRA_1_v_gene", + "TRB_1_v_gene", + "TRB_1_d_gene", + "TRB_1_j_gene", + ], + fraction: Union[None, str, Sequence[float]] = None, + ax: Union[plt.axes, None] = None, + bar_clip: int = 5, + top_n: Union[None, int] = 10, + barwidth: float = 0.4, + draw_bars: bool = True, + full_combination: bool = True, + fig_kws: Union[dict, None] = None, +) -> plt.Axes: + """Creates a ribbon plot of the most abundant VDJ combinations in a + given subset of cells. + + Currently works with primary alpha and beta chains only. + Does not search for precomputed results in `adata`. + + Parameters + ---------- + adata + AnnData object to work on. + target_cols + Columns containing gene segment information. + Overwrite default only if you know what you are doing! + fraction + Either the name of a categorical column that should be used as the base + for computing fractions, or an iterable specifying a size factor for each cell. + By default, each cell count as 1, but due to normalization to different + sample sizes for example, it is possible that one cell + in a small sample is weighted more than a cell in a large sample. + ax + Custom axis if needed. + bar_clip + The maximum number of stocks for bars (number of different V, D or J segments + that should be shown separately). + top_n + The maximum number of ribbons (individual VDJ combinations). If set to `None`, + all ribbons are drawn. + barwidth + Width of bars. + draw_bars + If `False`, only ribbons are drawn and no bars. + full_combination + If set to `False`, the bands represent the frequency of a binary gene segment combination + of the two connectec loci (e.g. combination of TRBD and TRBJ genes). By default each + band represents an individual combination of all five loci. + fig_kws + Dictionary of keyword args that will be passed to the matplotlib + figure (e.g. `figsize`) + + Returns + ------- + Axes object. + """ + + df = adata.obs.assign( + cell_weights=_normalize_counts(adata.obs, fraction) + if isinstance(fraction, (bool, str)) or fraction is None + else fraction + ) + + # init figure + default_figargs = {"figsize": (7, 4)} + if fig_kws is not None: + default_figargs.update(fig_kws) + if ax is None: + ax = _init_ax(default_figargs) + + if top_n is None: + top_n = df.shape[0] + + # Draw a stacked bar for every gene loci and save positions on the bar + gene_tops = dict() + for i in range(len(target_cols)): + td = ( + df.groupby(target_cols[i]) + .agg({"cell_weights": "sum"}) + .sort_values(by="cell_weights", ascending=False) + .reset_index() + ) + genes = td[target_cols[i]].tolist() + td = td["cell_weights"] + sector = target_cols[i][2:7].replace("_", "") + # sector = sector.replace('_', '') + unct = td[bar_clip + 1 :,].sum() + if td.size > bar_clip: + if draw_bars: + ax.bar(i + 1, unct, width=barwidth, color="grey", edgecolor="black") + gene_tops["other_" + sector] = unct + bottom = unct + else: + gene_tops["other_" + sector] = 0 + bottom = 0 + for j in range(bar_clip + 1): + try: + y = td[bar_clip - j] + gene = genes[bar_clip - j] + if gene == "None": + gene = "No_" + sector + gene_tops[gene] = bottom + y + if draw_bars: + ax.bar( + i + 1, + y, + width=barwidth, + bottom=bottom, + color="lightgrey", + edgecolor="black", + ) + ax.text( + 1 + i - barwidth / 2 + 0.05, + bottom + 0.05, + gene.replace("TRA", "").replace("TRB", ""), + ) + bottom += y + except: + pass + + # Create a case for full combinations or just the neighbours + if full_combination: + draw_mat = [target_cols] + else: + draw_mat = [] + for lc in range(1, len(target_cols)): + draw_mat.append(target_cols[lc - 1 : lc]) + + init_n = 0 + for target_pair in draw_mat: + # Count occurance of individual VDJ combinations + td = df.loc[:, target_pair + ["cell_weights"]] + td["genecombination"] = td.apply( + lambda x, y: "|".join([x[e] for e in y]), y=target_pair, axis=1 + ) + td = ( + td.groupby("genecombination") + .agg({"cell_weights": "sum"}) + .sort_values(by="cell_weights", ascending=False) + .reset_index() + ) + td["genecombination"] = td.apply( + lambda x: [x["cell_weights"]] + x["genecombination"].split("|"), axis=1 + ) + + # Draw ribbons + for r in td["genecombination"][1 : top_n + 1]: + d = [] + ht = r[0] + for i in range(len(r) - 1): + g = r[i + 1] + sector = target_pair[i][2:7].replace("_", "") + if g == "None": + g = "No_" + sector + if g not in gene_tops: + g = "other_" + sector + t = gene_tops[g] + d.append([t - ht, t]) + t = t - ht + gene_tops[g] = t + if draw_bars: + gapped_ribbons(d, ax=ax, gapwidth=barwidth) + else: + gapped_ribbons(d, ax=ax, gapwidth=0.1) + + # Make tick labels nicer + ax.set_xticks(range(1, len(target_cols) + 1)) + if target_cols == [ + "TRA_1_j_gene", + "TRA_1_v_gene", + "TRB_1_v_gene", + "TRB_1_d_gene", + "TRB_1_j_gene", + ]: + ax.set_xticklabels(["TRAJ", "TRAV", "TRBV", "TRBD", "TRBJ"]) + else: + ax.set_xticklabels(target_cols) + + return ax + + +def gapped_ribbons( + data: list, + *, + ax: Union[plt.axes, list, None] = None, + xstart: float = 1.2, + gapfreq: float = 1.0, + gapwidth: float = 0.4, + fun: Callable = lambda x: x[3] + + (x[4] / (1 + np.exp(-((x[5] / x[2]) * (x[0] - x[1]))))), + figsize: Tuple[float, float] = (3.44, 2.58), + figresolution: int = 300, +) -> plt.Axes: + """Draws ribbons using `fill_between` + Called by VDJ usage plot to connect bars. + + Parameters + ---------- + data + Breakpoints defining the ribbon as a 2D matrix. Each row is an x position, columns are the lower and upper extent of the ribbon at that position. + ax + Custom axis, almost always called with an axis supplied. + xstart + The midpoint of the first bar. + gapfreq + Frequency of bars. Normally a bar would be drawn at every integer x position, hence default is 1. + gapwidth + At every bar position, there will be a gap. The width of this gap is identical to bar widths, but could also be set to 0 if we need continous ribbons. + fun + A function defining the curve of each ribbon segment from breakpoint to breakpoint. By default, it is a sigmoid with 6 parameters: + range between x position of bars, + curve start on the x axis, + slope of curve, + curve start y position, + curve end y position, + compression factor of the sigmoid curve + figsize + Size of the resulting figure in inches. + figresolution + Resolution of the figure in dpi. + + Returns + ------- + Axes object. + """ + + if ax is None: + fig, ax = plt.subplots(figsize=figsize, dpi=figresolution) + else: + if isinstance(ax, list): + ax = ax[0] + + spread = 10 + xw = gapfreq - gapwidth + slope = xw * 0.8 + x, y1, y2 = [], [], [] + for i in range(1, len(data)): + xmin = xstart + (i - 1) * gapfreq + tx = np.linspace(xmin, xmin + xw, 100) + xshift = xmin + xw / 2 + p1, p2 = data[i - 1] + p3, p4 = data[i] + ty1 = fun((tx, xshift, slope, p1, p3 - p1, spread)) + ty2 = fun((tx, xshift, slope, p2, p4 - p2, spread)) + x += tx.tolist() + y1 += ty1.tolist() + y2 += ty2.tolist() + x += np.linspace(xmin + xw, xstart + i * gapfreq, 10).tolist() + y1 += np.zeros(10).tolist() + y2 += np.zeros(10).tolist() + ax.fill_between(x, y1, y2, alpha=0.6) + + return ax diff --git a/scirpy/_tools/_group_abundance.py b/scirpy/_tools/_group_abundance.py index 718169289..057818cee 100644 --- a/scirpy/_tools/_group_abundance.py +++ b/scirpy/_tools/_group_abundance.py @@ -18,7 +18,7 @@ def _group_abundance( # normalize to fractions scale_vector = _normalize_counts(tcr_obs, normalize=fraction, default_col=groupby) - tcr_obs = tcr_obs.assign(count=1, weight=1 / scale_vector) + tcr_obs = tcr_obs.assign(count=1, weight=scale_vector) # Calculate distribution of lengths in each group. Use sum instead of count # to reflect weights diff --git a/scirpy/_util/__init__.py b/scirpy/_util/__init__.py index be8a3eb41..a3f140278 100644 --- a/scirpy/_util/__init__.py +++ b/scirpy/_util/__init__.py @@ -160,7 +160,7 @@ def _normalize_counts( raise ValueError("No colname specified in either `normalize` or `default_col") # https://stackoverflow.com/questions/29791785/python-pandas-add-a-column-to-my-dataframe-that-counts-a-variable - return obs.groupby(normalize_col)[normalize_col].transform("count").values + return 1 / obs.groupby(normalize_col)[normalize_col].transform("count").values def _get_from_uns(adata: AnnData, tool: str, *, parameters: dict = None) -> Any: diff --git a/tests/fixtures.py b/tests/fixtures.py index 643e87e9a..f9639a29c 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -167,6 +167,149 @@ def adata_tra(): adata = AnnData(obs=obs) return adata +@pytest.fixture +def adata_vdj(): + obs = { + 'LT1_ACGGCCATCCGAGCCA-2-24': + { + 'TRA_1_j_gene': 'TRAJ42', + 'TRA_1_v_gene': 'TRAV26-2', + 'TRB_1_v_gene': 'TRBV7-2', + 'TRB_1_d_gene': 'TRBD1', + 'TRB_1_j_gene': 'TRBJ2-5', + 'sample': 'LT1' + }, + 'LT1_CGCTTCACAAGGTGTG-2-24': + { + 'TRA_1_j_gene': 'TRAJ45', + 'TRA_1_v_gene': 'None', + 'TRB_1_v_gene': 'None', + 'TRB_1_d_gene': 'None', + 'TRB_1_j_gene': 'TRBJ2-3', + 'sample': 'LT1' + }, + 'LT1_AGGGAGTTCCCAAGAT-2-24': + { + 'TRA_1_j_gene': 'TRAJ29', + 'TRA_1_v_gene': 'TRAV12-1', + 'TRB_1_v_gene': 'TRBV20-1', + 'TRB_1_d_gene': 'TRBD2', + 'TRB_1_j_gene': 'TRBJ1-1', + 'sample': 'LT1' + }, + 'LT1_ATTACTCGTTGGACCC-2-24': + { + 'TRA_1_j_gene': 'TRAJ4', + 'TRA_1_v_gene': 'TRAV12-1', + 'TRB_1_v_gene': 'TRBV7-2', + 'TRB_1_d_gene': 'None', + 'TRB_1_j_gene': 'TRBJ2-6', + 'sample': 'LT1' + }, + 'LT1_GCAATCACAATGAATG-1-24': + { + 'TRA_1_j_gene': 'TRAJ52', + 'TRA_1_v_gene': 'TRAV8-6', + 'TRB_1_v_gene': 'TRBV30', + 'TRB_1_d_gene': 'TRBD1', + 'TRB_1_j_gene': 'TRBJ2-2', + 'sample': 'LT1' + }, + 'LT1_TCTCTAATCCACTGGG-2-24': + { + 'TRA_1_j_gene': 'TRAJ43', + 'TRA_1_v_gene': 'TRAV8-3', + 'TRB_1_v_gene': 'TRBV30', + 'TRB_1_d_gene': 'TRBD1', + 'TRB_1_j_gene': 'TRBJ1-2', + 'sample': 'LT1' + }, + 'LT1_TATTACCTCAACGGCC-2-24': + { + 'TRA_1_j_gene': 'TRAJ45', + 'TRA_1_v_gene': 'TRAV20', + 'TRB_1_v_gene': 'TRBV4-1', + 'TRB_1_d_gene': 'None', + 'TRB_1_j_gene': 'TRBJ1-3', + 'sample': 'LT1' + }, + 'LT1_CGTCAGGTCGAACTGT-1-24': + { + 'TRA_1_j_gene': 'TRAJ15', + 'TRA_1_v_gene': 'TRAV17', + 'TRB_1_v_gene': 'TRBV5-1', + 'TRB_1_d_gene': 'TRBD1', + 'TRB_1_j_gene': 'TRBJ1-1', + 'sample': 'LT1' + }, + 'LT1_GGGAATGGTTGCGTTA-2-24': + { + 'TRA_1_j_gene': 'None', + 'TRA_1_v_gene': 'None', + 'TRB_1_v_gene': 'TRBV30', + 'TRB_1_d_gene': 'TRBD1', + 'TRB_1_j_gene': 'TRBJ2-2', + 'sample': 'LT1' + }, + 'LT1_AGCTCCTGTAATCGTC-2-24': + { + 'TRA_1_j_gene': 'TRAJ13', + 'TRA_1_v_gene': 'TRAV13-1', + 'TRB_1_v_gene': 'TRBV18', + 'TRB_1_d_gene': 'TRBD2', + 'TRB_1_j_gene': 'TRBJ2-2', + 'sample': 'LT1' + }, + 'LT1_CAGCTGGTCCGCGGTA-1-24': + { + 'TRA_1_j_gene': 'TRAJ30', + 'TRA_1_v_gene': 'TRAV21', + 'TRB_1_v_gene': 'TRBV30', + 'TRB_1_d_gene': 'TRBD2', + 'TRB_1_j_gene': 'TRBJ2-1', + 'sample': 'LT1' + }, + 'LT1_CCTTTCTCAGCAGTTT-1-24': + { + 'TRA_1_j_gene': 'TRAJ23', + 'TRA_1_v_gene': 'TRAV9-2', + 'TRB_1_v_gene': 'TRBV3-1', + 'TRB_1_d_gene': 'None', + 'TRB_1_j_gene': 'TRBJ1-2', + 'sample': 'LT1' + }, + 'LT1_GTATCTTGTATATGAG-1-24': + { + 'TRA_1_j_gene': 'TRAJ40', + 'TRA_1_v_gene': 'TRAV36DV7', + 'TRB_1_v_gene': 'TRBV6-3', + 'TRB_1_d_gene': 'TRBD1', + 'TRB_1_j_gene': 'TRBJ2-5', + 'sample': 'LT1' + }, + 'LT1_TGCGCAGAGGGCATGT-1-24': + { + 'TRA_1_j_gene': 'TRAJ39', + 'TRA_1_v_gene': 'TRAV12-3', + 'TRB_1_v_gene': 'TRBV11-2', + 'TRB_1_d_gene': 'None', + 'TRB_1_j_gene': 'TRBJ2-7', + 'sample': 'LT1' + }, + 'LT1_CAGCAGCAGCGCTCCA-2-24': + { + 'TRA_1_j_gene': 'TRAJ32', + 'TRA_1_v_gene': 'TRAV38-2DV8', + 'TRB_1_v_gene': 'None', + 'TRB_1_d_gene': 'None', + 'TRB_1_j_gene': 'TRBJ2-3', + 'sample': 'LT1' + } + } + obs = pd.DataFrame.from_dict(obs, orient="index") + adata = AnnData(obs=obs) + return adata + @pytest.fixture def adata_clonotype(): diff --git a/tests/test_plotting.py b/tests/test_plotting.py index 3d614abdc..17e73154d 100644 --- a/tests/test_plotting.py +++ b/tests/test_plotting.py @@ -7,6 +7,7 @@ adata_tra, adata_clonotype, adata_diversity, + adata_vdj, adata_clonotype_network, ) import matplotlib.pyplot as plt @@ -40,6 +41,9 @@ def test_spectratype(adata_tra): p = pl.spectratype(adata_tra, target_col="sample") assert isinstance(p, plt.Axes) +def test_vdj_usage(adata_vdj): + p = pl.vdj_usage(adata_vdj, fraction="sample") + assert isinstance(p, plt.Axes) def test_clonotype_network(adata_clonotype_network): p = pl.clonotype_network(adata_clonotype_network) diff --git a/tests/test_tools.py b/tests/test_tools.py index 171d066b1..c2e323512 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -7,7 +7,7 @@ import pandas.testing as pdt import numpy as np from scirpy._util import _get_from_uns -from .fixtures import adata_clonotype, adata_tra, adata_diversity +from .fixtures import adata_clonotype, adata_tra, adata_vdj, adata_diversity def test_chain_pairing(): diff --git a/tests/test_util.py b/tests/test_util.py index 6b8b36058..f5f460462 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -149,8 +149,8 @@ def test_normalize_counts(group_df): _normalize_counts(group_df, True, None) npt.assert_equal(_normalize_counts(group_df, False), [1] * 6) - npt.assert_equal(_normalize_counts(group_df, "sample"), [4, 2, 4, 4, 4, 2]) - npt.assert_equal(_normalize_counts(group_df, True, "sample"), [4, 2, 4, 4, 4, 2]) + npt.assert_equal(_normalize_counts(group_df, "sample"), [.25, .5, .25, .25, .25, .5]) + npt.assert_equal(_normalize_counts(group_df, True, "sample"), [.25, .5, .25, .25, .25, .5]) def test_layout_components():