In [None]:
import scanpy as sc
import pandas as pd
from pathlib import Path
import anndata as ad
import numpy as np
import os
import scvi

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Resolution (dots per inch) used for saved scanpy figures (dpi_save) and for the
# resolution-sweep figure created further down.
DPI = 300
# Base font size for custom matplotlib plots.
# NOTE(review): the trailing "42" looks like a previously used value — confirm.
FONTSIZE = 20  # 42

# Global scanpy figure defaults: 100 dpi on screen, DPI when saving,
# transparent background, vector-friendly output.
sc.settings.set_figure_params(
    scanpy=True, dpi=100, transparent=True, vector_friendly=True, dpi_save=DPI
)
from matplotlib import rcParams

# pdf.fonttype 42 = embed TrueType (Type 42) fonts in PDF output
rcParams["pdf.fonttype"] = 42

In [None]:
# Working directory of the NK analysis: the input .h5ad is read from here and
# the figures, the DE table and the annotated .h5ad are written here.
# NOTE(review): hard-coded absolute path — consider making it configurable.
DIR2SAVE = Path(
    "/data/BCI-CRC/nasrine/data/CRC/Metastatic_CRC_LM_dataset/subpopulations/TNKILC/NK"
)
# create the directory (including parents) when it does not exist yet
DIR2SAVE.mkdir(parents=True, exist_ok=True)

In [None]:
# Figures live in a "figures" sub-folder of the NK working directory.
FIG2SAVE = DIR2SAVE / "figures/"
FIG2SAVE.mkdir(parents=True, exist_ok=True)
# Point scanpy's global figure directory at it so every `save=` argument
# used below writes into this folder.
sc.settings.figdir = FIG2SAVE

In [None]:
# Load the NK subset from the working directory.
# NOTE(review): the file name suggests scVI-integrated, HVG-filtered data — confirm upstream.
adata = sc.read_h5ad(
    DIR2SAVE.joinpath("Multiome_Che_Wu_CRC_LM_integrated_scvi_hvg_NK.h5ad")
)

In [None]:
# dimensions of the loaded AnnData object (observations x variables)
adata.shape

In [None]:
# number of cells per value of the "cell_source" column
adata.obs["cell_source"].value_counts()

In [None]:
# number of cells per value of the "Cell_subtype" column
adata.obs["Cell_subtype"].value_counts()

In [None]:
# NK marker panel on the UMAP; expression values are taken from adata.raw
# (use_raw=True). The trailing comments are the author's grouping notes and
# refer to the genes listed up to that line.
sc.pl.umap(
    adata,
    color=[
        "NCAM1",
        "FCER1G",
        "NCR1",  # NK
        "LILRB1",
        "KLRC1",
        "KLRC2",
        "CSF1",
        "ENTPD1",
        "GZMA",
        "GZMB",
        "HAVCR2",
        "IL2RB",
        "GNLY",  # our NK2, dNK1
        "CD160",
        "RGS2",
        "CXCR4",
        "CCL5",
        "TIGIT",  # dNK3
        "CD2",
        "ITGB2",
        "CD27",
        "GZMH",
        "ANXA1",  # Our NK1 which is dNK2
    ],
    color_map="plasma_r",
    use_raw=True,
)  # paper dNK1, our NK2

In [None]:
# ILC marker panel on the UMAP; expression values are taken from adata.raw.
# "IL4I1" appears in two of the groups below: dict.fromkeys() drops the repeat
# (keeping first-seen order) so the same panel is not drawn twice.
sc.pl.umap(
    adata,
    color=list(
        dict.fromkeys(
            [
                "AREG",
                "TLE1",
                "IL4I1",  # ILC
                "CCR6",
                "AHR",
                "IL3RA",
                "PTGDR2",
                "KIT",
                "RORC",
                "IL7R",
                "GATA3",
                "TNFRSF25",
                "IL4I1",  # ILC3
                # NOTE(review): the next two groups were labelled the other way
                # round (ILC2 / ILC1); the labels below follow the markers_ILC
                # dictionary defined later in this notebook — confirm.
                "TBX21",
                "CD3D",
                "CXCR3",  # ILC1
                "KRT1",
                "HPGDS",
                "SLAMF1",  # ILC2
            ]
        )
    ),
    color_map="plasma_r",
    use_raw=True,
)

In [None]:
# KLRB1 / CD3G / FGFBP2 expression on the UMAP (values from adata.raw)
sc.pl.umap(
    adata,
    color=["KLRB1", "CD3G", "FGFBP2"],
    color_map="plasma_r",
    use_raw=True,
)

### Leiden clustering

In [None]:
# inspect the neighbourhood-graph entry stored with the object; the Leiden runs
# below rely on this precomputed graph
adata.uns["neighbors"]

In [None]:
from collections import Counter


def clustering_leiden_resolution(adata2test, res_range):
    """
    Sweep the Leiden ``resolution`` parameter and report the resulting cluster counts.

    For every value in ``res_range`` the function runs ``sc.tl.leiden``
    (``random_state=7``) on a private copy of ``adata2test``, stores the number
    of distinct cluster labels, then draws a scatter plot of cluster count
    versus resolution and prints how often each cluster count occurred.

    :param adata2test: AnnData frame; it is copied, so the caller's object is
        left untouched
    :param res_range: iterable of resolution values to evaluate:
        i.e. np.arange(0.1, 1.5, 0.05)
    :return: None (the results are plotted and printed)
    """
    # materialise once so that one-shot iterables (e.g. generators) can be used
    resolutions = list(res_range)

    # One working copy is enough: each run overwrites obs["leiden"], so copying
    # the whole AnnData for every resolution only costs time and memory.
    adata = adata2test.copy()

    # resolution -> number of clusters obtained at that resolution
    resolution_dict = {}
    for r in resolutions:
        sc.tl.leiden(adata, resolution=r, random_state=7)
        resolution_dict[r] = adata.obs["leiden"].nunique()

    # plot figure: nb clusters in fct of resolution param
    fig, ax = plt.subplots(
        nrows=1, ncols=1, sharey=False, sharex=False, dpi=DPI, figsize=(5, 4.5)
    )
    ax.scatter(list(resolution_dict.keys()), list(resolution_dict.values()))
    ax.set_xlabel("Resolution")
    ax.set_ylabel("Number of clusters")
    ax.tick_params(axis="both", which="major", labelsize=FONTSIZE - 10)
    plt.show()

    # display nb of times each number of clusters occurs
    print("Frequency of NB clusters")
    print(Counter(resolution_dict.values()))

In [None]:
# sweep resolutions from 0.1 in steps of 0.1, up to but excluding 2
clustering_leiden_resolution(adata, res_range=np.arange(0.1, 2, 0.1))

In [None]:
# Neighbours were computed beforehand, so clustering runs on the graph already
# stored in adata; labels go to obs["leiden_scVI_NK"].
# NOTE(review): this comment used to say the graph came from Harmony-corrected
# PCs, but the input file and key names refer to scVI — confirm which
# representation the graph was built on.
sc.tl.leiden(adata, key_added="leiden_scVI_NK", resolution=0.5, random_state=7)

In [None]:
# UMAP coloured by the resolution-0.5 clusters, labels drawn on the embedding;
# the figure is also saved via scanpy's `save=` mechanism.
sc.pl.umap(
    adata,
    color="leiden_scVI_NK",
    legend_loc="on data",
    save="NK_general_clustering.pdf",
    show=True,
)

In [None]:
# Second clustering at resolution 0.7 on the same precomputed neighbour graph;
# labels go to obs["leiden_scVI_NK_r0.7"].
# NOTE(review): this comment used to say the graph came from Harmony-corrected
# PCs, but the input file and key names refer to scVI — confirm which
# representation the graph was built on.
sc.tl.leiden(adata, key_added="leiden_scVI_NK_r0.7", resolution=0.7, random_state=7)

In [None]:
# UMAP coloured by the resolution-0.7 clusters, labels drawn on the embedding;
# the figure is also saved via scanpy's `save=` mechanism.
sc.pl.umap(
    adata,
    color="leiden_scVI_NK_r0.7",
    legend_loc="on data",
    save="NK_general_clustering_r0.7.pdf",
    show=True,
)

### Plot some markers 

In [None]:
# General NK markers on the UMAP (values from adata.raw); saved as a PDF.
sc.pl.umap(
    adata,
    color=["NCAM1", "FCER1G", "KLRC1"],
    color_map="plasma_r",
    use_raw=True,
    show=True,
    save="NKmarkers.pdf",
)  # NK markers

In [None]:
# NCAM1 / FCER1G next to CD8A, CD3G and FGFBP2 on the UMAP (values from
# adata.raw); displayed only, not saved.
sc.pl.umap(
    adata,
    color=["NCAM1", "FCER1G", "CD8A", "CD3G", "FGFBP2"],
    color_map="plasma_r",
    use_raw=True,
    show=True,
)

In [None]:
# ILC markers, grouped by ILC subtype. The dictionary itself is reused later
# as `var_names` for a dot plot, so its content is left unchanged.
markers_ILC = {
    "ILC": [
        "AREG",
        "TLE1",
        "IL4I1",
    ],  # alternative set: ['NCR2', 'ITGAE', 'KIT', 'IL7R', 'KLRB1', 'AHR'],
    "ILC1": ["TBX21", "CD3D", "CXCR3", "PLCD4"],
    "ILC2": ["KRT1", "HPGDS", "SLAMF1"],  # alternative set: ['HPGDS', 'GATA3', 'PTGDR2', 'IL1RL1'],
    "ILC3": ["IL4I1", "RORC", "TNFRSF25", "SPINK2", "KLRB1", "IL7R"],
}

import itertools

# Flatten all groups into a single list for plotting. "IL4I1" is listed under
# both "ILC" and "ILC3"; dict.fromkeys() removes the repeat while keeping the
# first-seen order, so the same UMAP panel is not drawn twice.
markers2plot = list(dict.fromkeys(itertools.chain.from_iterable(markers_ILC.values())))

# use log1p data stored in .raw; colour scale runs from 0 to the 99th percentile
sc.pl.umap(
    adata,
    color=markers2plot,
    use_raw=True,
    vmin=0.0,
    vmax="p99",
    color_map="plasma_r",  #'RdPu',
    save="markers_ILC.pdf",
    show=True,
)

In [None]:
# Markers the author associates with dNK1 in the reference paper ("our NK2"),
# shown on the UMAP with values from adata.raw.
sc.pl.umap(
    adata,
    color=[
        "ENTPD1",
        "CSF1",
        "KLRC1",
        "KLRC2",
        "TIGIT",
        "CYP26A1",
        "B4GALNT1",
        "CD9",
        "KIR2DL1",
        "KIR2DL3",
        "LILRB1",
        "KLRC3",
    ],
    color_map="plasma_r",
    use_raw=True,
)  # paper dNK1, our NK2

In [None]:
# Markers of our NK1 population, which the author matches to dNK2 in the paper
sc.pl.umap(adata, color=["ANXA1", "ITGB2"], color_map="plasma_r", use_raw=True)

In [None]:
# dNK3 in the paper.
# Author's note: dNK3 express CD160, KLRB1 and CD103 (also known as ITGAE), but
# not the innate lymphocyte cell marker CD127 (also known as IL7R).
sc.pl.umap(
    adata, color=["CD160", "KLRB1", "ITGAE", "IL7R"], color_map="plasma_r", use_raw=True
)

In [None]:
# NKT-related markers on the UMAP (values from adata.raw); saved as a PDF.
sc.pl.umap(
    adata,
    color=["KLRB1", "CD3G", "FGFBP2", "GZMA", "CCL5", "NKG7"],
    color_map="plasma_r",
    use_raw=True,
    show=True,
    save="NK_NKT_markers.pdf",
)  # NKT markers

In [None]:
sc.pl.umap(adata, color=["GNLY", "NKG7", "CD3D"], color_map="plasma_r", use_raw=True)
# Author's note: markers checked for immature NK cells, i.e. cells that originate
# from natural killer cell precursors and are committed to becoming mature NK cells.

In [None]:
# Markers for stress: heat-shock proteins (HSP) and immediate early genes.
# Only the heat-shock-protein list is plotted below; the "immediate early genes"
# entry is defined but not used in this cell.
dissocation_markers_dict = {
    "shock protein": [
        "HSPE1",
        "HSPA1A",
        "HSPA1B",
        "HSP90AA1",
        "HSP90AB1",
        "HSPA8",
        "HSPB1",
    ],
    "immediate early genes": ["FOS", "JUN"],
}

# values from adata.raw; colour scale runs from 0 to the 99th percentile
sc.pl.umap(
    adata,
    color=dissocation_markers_dict["shock protein"],
    vmax="p99",
    use_raw=True,
    vmin=0,
    color_map="plasma_r",
    save="NK_HSPmarkers.pdf",
    show=True,
)

In [None]:
# Curated marker genes per candidate population. Used below for per-population
# UMAP panels and as `var_names` for the dot plots. The trailing comments give
# the dNK population of the reference paper that the author matches each of
# our populations to. Some genes are listed under more than one population.
markers = {
    "NK": ["NCAM1", "FCER1G", "KLRC1"],
    # disabled group — 'immature NK': ['GNLY', 'NKG7', 'CD3D'],
    "NK2": [
        "ENTPD1",
        "CSF1",
        "KLRC1",
        "KLRC2",
        "CD9",
        "KIR2DL1",
        "KIR2DL3",
        "LILRB1",
        "KLRC3",
        "GZMA",
        "GZMB",
        "GNLY",
        "PRF1",
        "HAVCR2",
        "EOMES",
        "IL2RB",
        "ID3",
    ],  # dNK1
    "NK1": ["ANXA1", "ITGB2", "CD9", "CD7", "EOMES", "XCL1", "SELL", "IFITM3"],  # dNK2
    "NK3": [
        "CCL5",
        "CD160",
        "KLRB1",
        "ITGAE",
        "AREG",
        "XCL1",
        "TIGIT",
        "FCGR3A",
        "KLRF1",
        "ITGB2",
        "KLRD1",
    ],  # dNK3
    "NKT": ["KLRB1", "CD3G", "FGFBP2"],
    "HSP": ["HSPE1", "HSPA1A", "HSPA1B", "HSP90AA1", "HSP90AB1", "HSPA8", "HSPB1"],
}

In [None]:
# UMAP panels for the genes listed under "NK1" in the markers dictionary
sc.pl.umap(
    adata,
    color=markers["NK1"],
    color_map="plasma_r",
    use_raw=True,
    show=True,
    save="NK1_markers.pdf",
)  # NK1 markers

In [None]:
# UMAP panels for the genes listed under "NK2" in the markers dictionary
sc.pl.umap(
    adata,
    color=markers["NK2"],
    color_map="plasma_r",
    use_raw=True,
    show=True,
    save="NK2_markers.pdf",
)  # NK2 markers

In [None]:
# UMAP panels for the genes listed under "NK3" in the markers dictionary
sc.pl.umap(
    adata,
    color=markers["NK3"],
    color_map="plasma_r",
    use_raw=True,
    show=True,
    save="NK3_markers.pdf",
)  # NK3 markers

In [None]:
# Additional NK marker panel (values from adata.raw).
# NOTE(review): the output file name suggests these genes were taken from the
# Human Protein Atlas — confirm the source.
sc.pl.umap(
    adata,
    color=["CD247", "IL2RB", "KLRF1", "NCR1", "TRDC", "TXK", "CLIC3", "SH2D1B"],
    color_map="plasma_r",
    use_raw=True,
    show=True,
    save="protein_atlas_markers.pdf",
)  # NK markers (protein atlas panel)

### Differential expression to get DE genes upregulated per cluster 

In [None]:
# Workaround applied before sc.tl.rank_genes_groups, after the object has been
# re-read from disk. Background: https://github.com/theislab/single-cell-tutorial/issues/97
# The issue describes this as a scanpy bug and suggests either adding the line
# below after reading the file again, or downgrading to AnnData<0.8.
# setdefault() creates the "log1p" entry when it is missing altogether, so the
# cell does not fail with a KeyError on objects that lack it.
adata.uns.setdefault("log1p", {})["base"] = None

In [None]:
# Differential expression: Wilcoxon test of each resolution-0.5 cluster against
# all remaining cells ("rest"), computed on adata.raw, with Benjamini-Hochberg
# correction. Results are stored under adata.uns["rank_genes_wilcoxon"];
# pts=True additionally requests the fraction of cells expressing each gene.
sc.tl.rank_genes_groups(
    adata,
    groupby="leiden_scVI_NK",
    reference="rest",
    method="wilcoxon",
    use_raw=True,
    layer=None,
    pts=True,
    corr_method="benjamini-hochberg",
    key_added="rank_genes_wilcoxon",
)

In [None]:
# Dot plot of log fold changes (cluster vs rest) for the curated NK markers,
# colour scale clipped to [-4, 4].
# NOTE(review): min_logfoldchange appears to matter only when scanpy picks the
# top genes itself (var_names not given) — confirm it has any effect here.
sc.pl.rank_genes_groups_dotplot(
    adata,
    groupby="leiden_scVI_NK",
    key="rank_genes_wilcoxon",
    var_names=markers,
    values_to_plot="logfoldchanges",
    cmap="bwr",
    vmin=-4,
    vmax=4,
    min_logfoldchange=1,
    colorbar_title="log fold change",
    save="NKgeneral_dotplot.pdf",
    show=True,
)

In [None]:
# Same log-fold-change dot plot, this time for the ILC marker groups.
# NOTE(review): min_logfoldchange appears to matter only when scanpy picks the
# top genes itself (var_names not given) — confirm it has any effect here.
sc.pl.rank_genes_groups_dotplot(
    adata,
    groupby="leiden_scVI_NK",
    key="rank_genes_wilcoxon",
    var_names=markers_ILC,
    values_to_plot="logfoldchanges",
    cmap="bwr",
    vmin=-4,
    vmax=4,
    min_logfoldchange=1,
    colorbar_title="log fold change",
    save="NK_ILCmarkers_dotplot.pdf",
    show=True,
)

In [None]:
# Mean-expression dot plot of the curated markers per resolution-0.5 cluster
# (values from adata.raw). standard_scale="var" rescales every gene to the
# 0-1 range across clusters, which is why the colour scale is fixed to [0, 1].
sc.pl.dotplot(
    adata,
    groupby="leiden_scVI_NK",
    use_raw=True,
    var_names=markers,
    cmap="plasma_r",
    standard_scale="var",
    vmin=0,
    vmax=1,
    colorbar_title="Mean expression",
    dendrogram=False,
    save="NK_dotplot_mean_general.pdf",
    show=True,
)

In [None]:
# Thresholds for calling a gene up-regulated in a cluster.
pval_thresh = 0.05
log2fc_thresh = 1

# Up-regulated genes per cluster, taken from the Wilcoxon results and sorted by
# decreasing log fold change. Clusters are visited in the order of the
# categorical's categories rather than via sorted(set(...)), which would sort
# the string labels lexicographically ("10" before "2") once there are ten or
# more clusters.
cluster_de_genes = {
    cluster: sc.get.rank_genes_groups_df(
        adata,
        group=cluster,
        key="rank_genes_wilcoxon",
        pval_cutoff=pval_thresh,
        log2fc_min=log2fc_thresh,
        log2fc_max=None,
    ).sort_values("logfoldchanges", ascending=False)
    for cluster in adata.obs["leiden_scVI_NK"].cat.categories
}

# Write the DE genes to an Excel workbook, one sheet per cluster. The workbook
# name records the thresholds; pandas chooses the writer engine for .xlsx.
path2save = DIR2SAVE.joinpath(f"NK_pval{pval_thresh}_log2fc{log2fc_thresh}.xlsx")

with pd.ExcelWriter(path2save) as writer:
    for cluster, de_genes in cluster_de_genes.items():
        de_genes.to_excel(writer, sheet_name=f"cluster{cluster}")

### Annotations

**With r=0.7**
* Cluster 0: NK2 with GNLY, GZMB, GZMH, KIR2DL1, PRF1, KIR3DL2, KLRC2, HAVCR2, LAG3
* Cluster 1: NK1 but could be ILC with AREG, KLRC1, KLRB1, but has also KRT81, KRT86, IL2RB, CD7, XCL1
* Cluster 2: NK1 with FCER1G, XCL1, CD160, XCL2, CD7, KLRC1
* Cluster 3: NKT with CD3G and FGFBP2

**but with r=0.5**
* Cluster 0: NK1,
* Cluster 1: NK2,
* Cluster 2: NKT

We find that dNK1 cells express higher levels of KIRs that can bind to HLA-C molecules: inhibitory KIR2DL1, KIR2DL2 and KIR2DL3 and activating KIR2DS1 and KIR2DS4. dNK1 contains more cytoplasmic granules than dNK2 and dNK3, which is consistent with our scRNA-seq data that show higher levels of expression of PRF1, GNLY, GZMA and GZMB RNA in this subset.


In [None]:
# Map each resolution-0.5 Leiden cluster label to its annotated population.
# The trailing comments list the genes the author used to support each call.
old_to_new = {
    "0": "NK1",  # CD7, KRT81, KRT86, XCL1, XCL2, GZMK, CD160, KLRB1, CD27
    "1": "NK2",  # GNLY, GZMB, GZMH, KIR2DL1, PRF1, KIR3DL2, HAVCR2
    "2": "NKT",
}

In [None]:
# Translate the resolution-0.5 cluster labels into population names.
# Series.map() silently yields NaN for any label missing from old_to_new, which
# would leave cells without annotation (e.g. if re-clustering produced an extra
# cluster), so that situation is reported explicitly instead.
unmapped_clusters = sorted(set(adata.obs["leiden_scVI_NK"].unique()) - set(old_to_new))
if unmapped_clusters:
    raise ValueError(
        f"Clusters without an entry in old_to_new: {unmapped_clusters}"
    )

adata.obs["Annotation_scVI_detailed_NK"] = (
    adata.obs["leiden_scVI_NK"].map(old_to_new).astype("category")
)

In [None]:
# list the annotation categories (their order matters for the colour list set below)
adata.obs["Annotation_scVI_detailed_NK"].cat.categories

### Colours

In [None]:
# One colour per annotation category, stored under the "<obs key>_colors" entry
# that scanpy plots read. The list is matched to the categories by position, so
# it must contain exactly one colour per category —
# TODO confirm the order against the categories displayed above.
adata.uns["Annotation_scVI_detailed_NK_colors"] = ["#1d91c0", "#dd3497", "#a6cee3"]

In [None]:
# UMAP coloured by the final annotation, legend in the right margin, no title;
# the figure is also saved via scanpy's `save=` mechanism.
sc.pl.umap(
    adata,
    color="Annotation_scVI_detailed_NK",
    show=True,
    legend_loc="right margin",
    save="NKannotations.pdf",
    title="",
)

In [None]:
# Write the object, now carrying the clusterings, DE results and annotation,
# to a new file next to the input (the input file itself is not overwritten).
adata.write(
    DIR2SAVE.joinpath("Multiome_Che_Wu_CRC_LM_integrated_scvi_hvg_NK_clustering.h5ad")
)

In [None]:
# Polished marker plot: mean-expression dot plot of the curated markers grouped
# by the final annotation (values from adata.raw, each gene rescaled to 0-1
# via standard_scale="var").
sc.pl.dotplot(
    adata,
    groupby="Annotation_scVI_detailed_NK",
    use_raw=True,
    var_names=markers,
    cmap="plasma_r",
    standard_scale="var",
    vmin=0,
    vmax=1,
    colorbar_title="Mean expression",
    dendrogram=False,
    save="NK_annot_dotplot_mean_general.pdf",
    show=True,
)