In [None]:
import scanpy as sc
import pandas as pd
from pathlib import Path
import anndata as ad
import numpy as np
import os

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Resolution used when figures are written to disk (and by the stacked bar plot below).
DPI = 300
FONTSIZE = 20  # 42

# Inline figures render at 100 dpi; files saved through scanpy use DPI.
sc.settings.set_figure_params(
    scanpy=True,
    dpi=100,
    dpi_save=DPI,
    transparent=True,
    vector_friendly=True,
)

from matplotlib import rcParams

# Font type 42 (TrueType) keeps text in exported PDFs editable.
rcParams["pdf.fonttype"] = 42

In [None]:
# Directory the input .h5ad is read from and all outputs of this notebook are written to.
DIR2SAVE = Path("/data/BCI-CRC/nasrine/data/CRC/Primary_CRC_dataset/subpopulations/TNKILC")

In [None]:
# Figures go into a "figures" sub-folder of the analysis directory; registering it as
# scanpy's figdir makes every `save=` argument in the plotting calls write there.
FIG2SAVE = DIR2SAVE / "figures"
sc.settings.figdir = FIG2SAVE

In [None]:
# Load the T/NK/ILC AnnData object from the analysis directory.
adata = sc.read_h5ad(
    DIR2SAVE / "SMC_KUL_Pelka_Che_Wu_CRC_integrated_scvi_hvg_TNKILC.h5ad"
)

### Leiden clustering

In [None]:
# Display the neighbour-graph entry stored in the loaded object.
adata.uns["neighbors"]

In [None]:
# Cluster on the neighbour graph already stored in `adata`; no graph is recomputed here.
# NOTE(review): the earlier comment said the graph was built on Harmony-corrected PCs,
# while the key and file names refer to scVI — confirm which embedding was used.
sc.tl.leiden(
    adata,
    resolution=1,
    random_state=7,
    key_added="leiden_scVI",
)

In [None]:
# UMAP coloured by Leiden cluster, with cluster labels drawn on the embedding.
sc.pl.umap(
    adata,
    color="leiden_scVI",
    legend_loc="on data",
    show=True,
    save="general_clustering.pdf",
)

In [None]:
# Cell-cycle scores written out during integration: tab-separated, first column is the index.
cell_cycle_scores = pd.read_csv(
    "/data/BCI-CRC/nasrine/data/CRC/Primary_CRC_dataset/data_integration/SMC_KUL_Pelka_Che_Wu_CRC_integrated_scvi_hvg_cc_scores.txt",
    index_col=0,
    sep="\t",
)

In [None]:
# Attach the cell-cycle scores to the per-cell metadata (left join on the index, so cells
# without a score get NaN).
# Only columns not yet present in `adata.obs` are merged: re-running this cell, or loading
# an object that already carries the scores, would otherwise create `_x`/`_y` suffixed
# duplicates and the plots below could no longer find e.g. "S_score".
adata.obs = adata.obs.merge(
    cell_cycle_scores[
        [col for col in cell_cycle_scores.columns if col not in adata.obs.columns]
    ],
    how="left",
    left_index=True,
    right_index=True,
)

In [None]:
# UMAP panels for the merged cell-cycle columns.
sc.pl.umap(
    adata,
    color=["S_score", "G2M_score", "phase", "cell_cycle_diff"],
    save="cell_cycle.pdf",
    color_map="viridis",
)

In [None]:
# UMAP panels for QC covariates (doublet score, detected genes, mito/ribo percentages).
sc.pl.umap(
    adata,
    color=["doublet_score", "n_genes_by_counts", "pct_counts_mt", "pct_counts_ribo"],
    save="QC_covariates.pdf",
    color_map="viridis",
)

### Look at distribution of cell source across clusters 

In [None]:
from matplotlib.patches import Rectangle


def proportion_cells_patient(
    adata, groupby_labels, xlabel: str, ylabel: str, colors: dict
):
    """Draw a stacked bar plot of the percentage of cells per sub-group within each group.

    Parameters
    ----------
    adata
        Per-cell metadata table. Despite the name, the function calls
        ``reset_index`` and ``groupby`` on it, so it expects a DataFrame such as
        ``adata.obs`` rather than an AnnData object.
    groupby_labels
        Two column names ``[group, sub_group]``: one bar per ``group``, stacked by
        ``sub_group``.
    xlabel, ylabel
        Axis labels.
    colors
        Mapping ``sub_group value -> colour``.

    Returns
    -------
    None. The percentage table is printed and the figure is shown.
    """
    # Count cells per (group, sub_group) and pivot the sub-groups into columns.
    counts = (
        adata.reset_index()
        .groupby(groupby_labels, observed=False)
        .size()
        .unstack(fill_value=0)
    )
    # Row-normalise to percentages. The earlier `groupby(level=0).apply(...)` form
    # prepends the group key on pandas >= 2, which breaks the subsequent unstack.
    table2plot = counts.div(counts.sum(axis=1), axis=0) * 100

    # Colour each stacked segment by its own column label, so a `colors` mapping whose
    # order differs from the column order cannot mislabel the legend. If some column has
    # no entry in `colors`, fall back to the previous positional behaviour.
    plotted_columns = list(table2plot.columns)
    if all(column in colors for column in plotted_columns):
        segment_colors = [colors[column] for column in plotted_columns]
        legend_labels = plotted_columns
    else:
        segment_colors = list(colors.values())
        legend_labels = list(colors.keys())

    fig, ax = plt.subplots(
        nrows=1, ncols=1, sharey=False, sharex=False, dpi=DPI, figsize=(4, 4)
    )

    print(table2plot)
    table2plot.plot.bar(stacked=True, ax=ax, color=segment_colors)
    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)
    ax.grid(False, which="major", axis="both")  # no gridlines on either axis

    # Zero-size rectangles serve only as coloured legend handles.
    legend_handles = [
        Rectangle((0, 0), 0, 0, color=segment_color) for segment_color in segment_colors
    ]
    ax.legend(
        legend_handles,
        legend_labels,
        loc="upper left",
        bbox_to_anchor=(1, 0.8),
        facecolor="white",
        edgecolor="white",
        ncol=1,
        borderaxespad=0.0,
        framealpha=0,
        frameon=False,
    )

    plt.show()

In [None]:
from collections import OrderedDict

# Pair every `cell_source` category with the colour stored for it in `adata.uns`.
# NOTE(review): assumes "cell_source_colors" is present in the loaded object — confirm.
color_dict = OrderedDict(
    zip(
        adata.obs["cell_source"].cat.categories.tolist(),
        adata.uns["cell_source_colors"],
    )
)

# Percentage of each cell source within every Leiden cluster.
proportion_cells_patient(
    adata.obs,
    ["leiden_scVI", "cell_source"],
    colors=color_dict,
    xlabel="leiden_scVI",
    ylabel="Percent cell source",
)

### Plot some markers

In [None]:
# Marker gene panels, keyed by cell type. Each dictionary is flattened for the UMAP
# panels and passed as-is (grouped) to the dot plots further down. Some genes appear
# under more than one cell type.

# Broad T / NK / ILC lineage markers.
markers_T_general = {
    "T": ["TRAC"],
    "CD4": ["CD4"],
    "CD8": ["CD8A", "CD8B"],
    "Cycling": ["MKI67"],
    "Treg": ["FOXP3", "CTLA4", "CCR4", "IL2RA"],  # Tregs are CD127 (IL7R) low
    "Exhausted T": [
        "PDCD1",
        "LAG3",
        "HAVCR2",
        "CTLA4",
        "TIGIT",
        "ENTPD1",
    ],  # PD1: PDCD1, TIM3: HAVCR2, CD39: ENTPD1; see https://www.nature.com/articles/s41467-021-23324-4
    "NK": [
        "EOMES",
        "CMC1",
        "GZMK",
        "XCL1",
        "NKG7",
        "PRF1",
        "NCR1",
        "NCAM1",
        "FCER1G",
        "ITGA1",
        "GZMB",
        "FCGR3A",
    ],
    "ILC": ["AREG", "TLE1", "IL4I1"],
}

# CD4 T-cell subset markers.
markers_T_cd4 = {
    "Th": ["CD4", "TRAC", "CD3D", "TRBC1", "TRBC2"],
    "Tfh": ["ICOS", "CXCR5", "TCF7", "PDCD1", "CCR7"],
    "Naïve CD4 T": [
        "CCR7",
        "SELL",
        "TCF7",
        "LEF1",
    ],  # https://www.nature.com/articles/s41467-019-12464-3
    "Th1/Th17/Th2": [
        "CXCR3",
        "TBX21",
        "CCL5",
        "CCR6",
        "IL22",
        "RORA",
        "IL7R",
        "IL4",
        "IL13",
        "GATA3",
        "CCR4",
    ],
    "Th17": [
        "IL17A",
        "ODF2L",
        "IL7R",
        "PDE4D",
    ],  # commented-out candidates: CCR4, CCR6, IL1R1, IL6R, IL21R, IL23R, TGFBR1, RORA, RORC, BATF, IRF4; original note: "IL17 not in data"
    "Th1": ["CCL5", "PHLDA1", "LYAR"],
}

# CD8 T-cell subset markers.
markers_T_cd8 = {
    "Effector CD8": [
        "CCL4",
        "CCL5",
        "GZMK",
        "GZMB",
        "PFN1",
        "GZMA",
        "GZMH",
        "NKG7",
    ],  # cytotoxic is treated the same as effector: a single cluster without TCF7/CCR7 would be annotated simply as effector
    "Tmem": [
        "CCR7",
        "PTPRC",
        "ENPP1",
    ],  # https://panglaodb.se/markers.html?cell_type=%27T%20memory%20cells%27
    "Naive cytotoxic": ["CD8A", "CCR7", "SELL"],
}

# Unconventional T-cell markers.
markers_T_other = {
    "gdT": ["KLRC2", "TRGC1", "TRGC2", "TRDC"],  # not in data: 'TCRD','TCRG'
    "NKT": ["GZMA", "CCL5", "NKG7", "KLRB1", "CD3G", "FGFBP2"],
    "MAIT": ["SLC4A10", "NCR3", "KLRB1"],
}

# Innate lymphoid cell markers.
markers_ILC = {
    "ILC": [
        "AREG",
        "TLE1",
        "IL4I1",
    ],  # commented-out alternative list: NCR2, ITGAE, KIT, IL7R, KLRB1, AHR
    "ILC1": ["TBX21", "CD3D", "CXCR3", "PLCD4"],
    "ILC2": ["KRT1", "HPGDS", "SLAMF1"],  # commented-out alternative list: HPGDS, GATA3, PTGDR2, IL1RL1
    "ILC3": ["IL4I1", "RORC", "TNFRSF25", "SPINK2", "KLRB1", "IL7R"],
}

In [None]:
import itertools

# Flatten the general marker dictionary into one gene list for plotting.
# Genes listed under several cell types (e.g. CTLA4 under both "Treg" and "Exhausted T")
# are kept once, in first-seen order, so no gene gets two identical UMAP panels.
# Expression is read from .raw (log1p data, per the original note).
markers2plot = list(
    dict.fromkeys(itertools.chain.from_iterable(markers_T_general.values()))
)
sc.pl.umap(
    adata,
    color=markers2plot,
    use_raw=True,
    vmin=0.0,
    vmax="p99",
    color_map="plasma_r",  #'RdPu',
    save="general_markers.pdf",
    show=True,
)

In [None]:
import itertools

# Flatten the CD4 marker dictionary into one gene list for plotting.
# Genes listed under several subsets (e.g. CCR7, TCF7, IL7R, CCL5) are kept once, in
# first-seen order, so no gene gets two identical UMAP panels.
# Expression is read from .raw (log1p data, per the original note).
markers2plot = list(
    dict.fromkeys(itertools.chain.from_iterable(markers_T_cd4.values()))
)
sc.pl.umap(
    adata,
    color=markers2plot,
    use_raw=True,
    vmin=0.0,
    vmax="p99",
    color_map="plasma_r",  #'RdPu',
    save="general_markers_cd4.pdf",
    show=True,
)

In [None]:
import itertools

# Flatten the CD8 marker dictionary into one gene list for plotting.
# Genes listed under several subsets (e.g. CCR7 under "Tmem" and "Naive cytotoxic") are
# kept once, in first-seen order, so no gene gets two identical UMAP panels.
# Expression is read from .raw (log1p data, per the original note).
markers2plot = list(
    dict.fromkeys(itertools.chain.from_iterable(markers_T_cd8.values()))
)
sc.pl.umap(
    adata,
    color=markers2plot,
    use_raw=True,
    vmin=0.0,
    vmax="p99",
    color_map="plasma_r",  #'RdPu',
    save="general_markers_cd8.pdf",
    show=True,
)

In [None]:
import itertools

# Flatten the unconventional T-cell marker dictionary into one gene list for plotting.
# Genes listed under several cell types (e.g. KLRB1 under "NKT" and "MAIT") are kept
# once, in first-seen order, so no gene gets two identical UMAP panels.
# Expression is read from .raw (log1p data, per the original note).
markers2plot = list(
    dict.fromkeys(itertools.chain.from_iterable(markers_T_other.values()))
)
sc.pl.umap(
    adata,
    color=markers2plot,
    use_raw=True,
    vmin=0.0,
    vmax="p99",
    color_map="plasma_r",  #'RdPu',
    save="general_markers_other.pdf",
    show=True,
)

In [None]:
import itertools

# Flatten the ILC marker dictionary into one gene list for plotting.
# Genes listed under several cell types (e.g. IL4I1 under "ILC" and "ILC3") are kept
# once, in first-seen order, so no gene gets two identical UMAP panels.
# Expression is read from .raw (log1p data, per the original note).
markers2plot = list(
    dict.fromkeys(itertools.chain.from_iterable(markers_ILC.values()))
)
sc.pl.umap(
    adata,
    color=markers2plot,
    use_raw=True,
    vmin=0.0,
    vmax="p99",
    color_map="plasma_r",  #'RdPu',
    save="general_markers_ILC.pdf",
    show=True,
)

In [None]:
# Stress markers: heat-shock proteins (HSP) and immediate early genes.
dissocation_markers_dict = {
    "shock protein": ["HSPE1", "HSPA1A", "HSPA1B", "HSP90AA1", "HSP90AB1", "HSPA8", "HSPB1"],
    "immediate early genes": ["FOS", "JUN"],
}

# UMAP panels for the heat-shock genes only, read from .raw.
sc.pl.umap(
    adata,
    color=dissocation_markers_dict["shock protein"],
    use_raw=True,
    vmin=0,
    vmax="p99",
    color_map="plasma_r",
    show=True,
    save="general_HSPmarkers.pdf",
)

### Differential expression to get DE genes upregulated per cluster 

In [None]:
# Workaround for a scanpy/AnnData issue, see
# https://github.com/theislab/single-cell-tutorial/issues/97:
# after re-reading an .h5ad file, set adata.uns["log1p"]["base"] = None before running
# rank_genes_groups (the alternative suggested there is downgrading to AnnData<0.8).
# `setdefault` keeps this cell from raising KeyError when the loaded object has no
# "log1p" entry in .uns at all.
adata.uns.setdefault("log1p", {})["base"] = None

In [None]:
# Each Leiden cluster against all remaining cells: Wilcoxon test on .raw with
# Benjamini-Hochberg correction; `pts=True` also stores the fraction of expressing cells.
sc.tl.rank_genes_groups(
    adata,
    groupby="leiden_scVI",
    reference="rest",
    method="wilcoxon",
    corr_method="benjamini-hochberg",
    pts=True,
    use_raw=True,
    layer=None,
    key_added="rank_genes_wilcoxon",
)

In [None]:
# Log fold changes from the cluster-vs-rest test for the general marker panel.
sc.pl.rank_genes_groups_dotplot(
    adata,
    var_names=markers_T_general,
    groupby="leiden_scVI",
    key="rank_genes_wilcoxon",
    values_to_plot="logfoldchanges",
    min_logfoldchange=1,
    vmin=-4,
    vmax=4,
    cmap="bwr",
    colorbar_title="log fold change",
    show=True,
    save="general_dotplot.pdf",
)

In [None]:
# Log fold changes from the cluster-vs-rest test for the CD8 marker panel.
sc.pl.rank_genes_groups_dotplot(
    adata,
    var_names=markers_T_cd8,
    groupby="leiden_scVI",
    key="rank_genes_wilcoxon",
    values_to_plot="logfoldchanges",
    min_logfoldchange=1,
    vmin=-4,
    vmax=4,
    cmap="bwr",
    colorbar_title="log fold change",
    show=True,
    save="general_dotplot_cd8.pdf",
)

In [None]:
# Log fold changes from the cluster-vs-rest test for the gdT / NKT / MAIT panel.
sc.pl.rank_genes_groups_dotplot(
    adata,
    var_names=markers_T_other,
    groupby="leiden_scVI",
    key="rank_genes_wilcoxon",
    values_to_plot="logfoldchanges",
    min_logfoldchange=1,
    vmin=-4,
    vmax=4,
    cmap="bwr",
    colorbar_title="log fold change",
    show=True,
    save="general_dotplot_other.pdf",
)

In [None]:
# Mean expression per cluster for the gdT / NKT / MAIT panel, read from .raw and
# rescaled per gene to [0, 1] (standard_scale="var").
sc.pl.dotplot(
    adata,
    var_names=markers_T_other,
    groupby="leiden_scVI",
    use_raw=True,
    standard_scale="var",
    vmin=0,
    vmax=1,
    cmap="plasma_r",
    colorbar_title="Mean expression",
    dendrogram=False,
    show=True,
    save="TNKILC_dotplot_mean_Tother.pdf",
)

In [None]:
# Log fold changes from the cluster-vs-rest test for the stress marker panel.
sc.pl.rank_genes_groups_dotplot(
    adata,
    var_names=dissocation_markers_dict,
    groupby="leiden_scVI",
    key="rank_genes_wilcoxon",
    values_to_plot="logfoldchanges",
    min_logfoldchange=1,
    vmin=-4,
    vmax=4,
    cmap="bwr",
    colorbar_title="log fold change",
    show=True,
    save="general_dotplot_HSP.pdf",
)

In [None]:
# Log fold changes from the cluster-vs-rest test for the CD4 marker panel.
sc.pl.rank_genes_groups_dotplot(
    adata,
    var_names=markers_T_cd4,
    groupby="leiden_scVI",
    key="rank_genes_wilcoxon",
    values_to_plot="logfoldchanges",
    min_logfoldchange=1,
    vmin=-4,
    vmax=4,
    cmap="bwr",
    colorbar_title="log fold change",
    show=True,
    save="general_dotplot_cd4.pdf",
)

In [None]:
# Mean expression per cluster for the general marker panel, read from .raw and
# rescaled per gene to [0, 1] (standard_scale="var").
sc.pl.dotplot(
    adata,
    var_names=markers_T_general,
    groupby="leiden_scVI",
    use_raw=True,
    standard_scale="var",
    vmin=0,
    vmax=1,
    cmap="plasma_r",
    colorbar_title="Mean expression",
    dendrogram=False,
    show=True,
    save="TNKILC_dotplot_mean.pdf",
)

In [None]:
# Mean expression per cluster for the CD4 marker panel, read from .raw and
# rescaled per gene to [0, 1] (standard_scale="var").
sc.pl.dotplot(
    adata,
    var_names=markers_T_cd4,
    groupby="leiden_scVI",
    use_raw=True,
    standard_scale="var",
    vmin=0,
    vmax=1,
    cmap="plasma_r",
    colorbar_title="Mean expression",
    dendrogram=False,
    show=True,
    save="TNKILC_CD4_dotplot_mean.pdf",
)

In [None]:
# Checkpoint: persist the object with the clustering and cluster-vs-rest DE results.
adata.write(
    DIR2SAVE / "SMC_KUL_Pelka_Che_Wu_CRC_integrated_scvi_hvg_TNKILC_clustering.h5ad"
)

In [None]:
# Export, for every cluster, the genes from the cluster-vs-rest Wilcoxon test that pass
# the thresholds below; one Excel sheet per cluster, sorted by decreasing log fold change.
pval_thresh = 0.05
log2fc_thresh = 1

# Walk the clusters in the categorical's own order so the sheets come out as
# cluster0, cluster1, cluster2, ... A plain string sort would give 0, 1, 10, 11, ..., 2.
# Falls back to the string sort when the column is not categorical.
cluster_labels = adata.obs["leiden_scVI"]
observed_clusters = set(cluster_labels)
if isinstance(cluster_labels.dtype, pd.CategoricalDtype):
    ordered_clusters = [
        cluster
        for cluster in cluster_labels.cat.categories
        if cluster in observed_clusters
    ]
else:
    ordered_clusters = sorted(observed_clusters)

cluster_de_genes = dict()
for cluster in ordered_clusters:
    cluster_de_genes[cluster] = sc.get.rank_genes_groups_df(
        adata,
        group=cluster,
        key="rank_genes_wilcoxon",
        pval_cutoff=pval_thresh,
        log2fc_min=log2fc_thresh,
        log2fc_max=None,
    ).sort_values("logfoldchanges", ascending=False)

# The thresholds are part of the file name so different cut-offs do not overwrite each other.
path2save = DIR2SAVE.joinpath(
    "TNKILC_pval{}_log2fc{}.xlsx".format(pval_thresh, log2fc_thresh)
)

with pd.ExcelWriter(path2save) as writer:
    for cluster, de_table in cluster_de_genes.items():
        de_table.to_excel(writer, sheet_name="cluster{}".format(cluster))

### Look at scores for general signatures 

In [None]:
# T-cell phenotype signatures from the Zhang paper: sheet "Tcell_markers", column names
# taken from the second row (header=1), every cell read as a string.
tcell_phen_zhang = pd.read_excel(
    "/data/BCI-CRC/nasrine/data/zhang_signatures.xlsx",
    sheet_name="Tcell_markers",
    header=1,
    skiprows=None,
    dtype=str,
)

In [None]:
# List the distinct cluster names available in the signature table.
print(set(tcell_phen_zhang["Cluster"]))

In [None]:
# Score every cell for the "c30_NKT-FCGR3A" signature and show the score on the UMAP.
# `na=False` treats rows with a missing cluster name as non-matching instead of failing
# on the boolean mask.
# NOTE(review): this id is written with a lower-case "c" while other cells use "C09_..."
# — `str.contains` is case-sensitive, so confirm against the printed cluster names.
NKT_signature = tcell_phen_zhang.loc[
    tcell_phen_zhang["Cluster"].str.contains("c30_NKT-FCGR3A", na=False), "Gene"
].values.tolist()
print(len(NKT_signature))
# Keep only genes present in .raw. `adata.raw.var_names` is used directly: calling
# `adata.raw.to_adata()` inside the comprehension rebuilt a full AnnData copy per gene.
NKT_signature_var = [gene for gene in NKT_signature if gene in adata.raw.var_names]
sc.tl.score_genes(
    adata, gene_list=NKT_signature_var, score_name="NKT_score", use_raw=True
)
sc.pl.umap(
    adata,
    color="NKT_score",
    vmax="p99",
    use_raw=True,
    vmin=0,
    save="TNKILC_NKT_score.pdf",
    color_map="YlOrRd",
    show=True,
)
print("length of gene list in adata.var:", len(NKT_signature_var))

In [None]:
# Score every cell for the "c12_CD8-HSPA1A" signature and show the score on the UMAP.
# `na=False` treats rows with a missing cluster name as non-matching instead of failing
# on the boolean mask.
# NOTE(review): this id is written with a lower-case "c" while other cells use "C09_..."
# — `str.contains` is case-sensitive, so confirm against the printed cluster names.
CD8_HSP_signature = tcell_phen_zhang.loc[
    tcell_phen_zhang["Cluster"].str.contains("c12_CD8-HSPA1A", na=False), "Gene"
].values.tolist()
print(len(CD8_HSP_signature))
# Keep only genes present in .raw. `adata.raw.var_names` is used directly: calling
# `adata.raw.to_adata()` inside the comprehension rebuilt a full AnnData copy per gene.
CD8_HSP_signature_var = [
    gene for gene in CD8_HSP_signature if gene in adata.raw.var_names
]
sc.tl.score_genes(
    adata, gene_list=CD8_HSP_signature_var, score_name="CD8_HSP_score", use_raw=True
)
sc.pl.umap(
    adata,
    color="CD8_HSP_score",
    vmax="p99",
    use_raw=True,
    vmin=0,
    save="TNKILC_CD8_HSP_score.pdf",
    color_map="YlOrRd",
    show=True,
)
print("length of gene list in adata.var:", len(CD8_HSP_signature_var))

In [None]:
# Score every cell for the "C09_CD8_Trm-XCL1" signature and show the score on the UMAP.
# `na=False` treats rows with a missing cluster name as non-matching instead of failing
# on the boolean mask.
Trm_XCL1_signature = tcell_phen_zhang.loc[
    tcell_phen_zhang["Cluster"].str.contains("C09_CD8_Trm-XCL1", na=False), "Gene"
].values.tolist()
print(len(Trm_XCL1_signature))
# Keep only genes present in .raw. `adata.raw.var_names` is used directly: calling
# `adata.raw.to_adata()` inside the comprehension rebuilt a full AnnData copy per gene.
Trm_XCL1_signature_var = [
    gene for gene in Trm_XCL1_signature if gene in adata.raw.var_names
]
sc.tl.score_genes(
    adata, gene_list=Trm_XCL1_signature_var, score_name="Trm_XCL1_score", use_raw=True
)
sc.pl.umap(
    adata,
    color="Trm_XCL1_score",
    vmax="p99",
    use_raw=True,
    vmin=0,
    save="TNKILC_Trm_XCL1_score.pdf",
    color_map="YlOrRd",
    show=True,
)
print("length of gene list in adata.var:", len(Trm_XCL1_signature_var))

In [None]:
# Score every cell for the "C05_CD8_Tem-GZMK" signature and show the score on the UMAP.
# `na=False` treats rows with a missing cluster name as non-matching instead of failing
# on the boolean mask.
CD8_Te_GZMK_signature = tcell_phen_zhang.loc[
    tcell_phen_zhang["Cluster"].str.contains("C05_CD8_Tem-GZMK", na=False), "Gene"
].values.tolist()
print(len(CD8_Te_GZMK_signature))
# Keep only genes present in .raw. `adata.raw.var_names` is used directly: calling
# `adata.raw.to_adata()` inside the comprehension rebuilt a full AnnData copy per gene.
CD8_Te_GZMK_signature_var = [
    gene for gene in CD8_Te_GZMK_signature if gene in adata.raw.var_names
]
sc.tl.score_genes(
    adata,
    gene_list=CD8_Te_GZMK_signature_var,
    score_name="CD8_Te_GZMK_score",
    use_raw=True,
)
sc.pl.umap(
    adata,
    color="CD8_Te_GZMK_score",
    vmax="p99",
    use_raw=True,
    vmin=0,
    save="TNKILC_CD8_Te_GZMK_score.pdf",
    color_map="YlOrRd",
    show=True,
)
print("length of gene list in adata.var:", len(CD8_Te_GZMK_signature_var))

In [None]:
# Score every cell for the signature(s) whose cluster name contains "CD4_Tn" and show
# the score on the UMAP.
# `na=False` treats rows with a missing cluster name as non-matching instead of failing
# on the boolean mask.
CD4_Tn_signature = tcell_phen_zhang.loc[
    tcell_phen_zhang["Cluster"].str.contains("CD4_Tn", na=False), "Gene"
].values.tolist()
print(len(CD4_Tn_signature))
# Keep only genes present in .raw. `adata.raw.var_names` is used directly: calling
# `adata.raw.to_adata()` inside the comprehension rebuilt a full AnnData copy per gene.
CD4_Tn_signature_var = [
    gene for gene in CD4_Tn_signature if gene in adata.raw.var_names
]
sc.tl.score_genes(
    adata, gene_list=CD4_Tn_signature_var, score_name="CD4_Tn_score", use_raw=True
)
sc.pl.umap(
    adata,
    color="CD4_Tn_score",
    vmax="p99",
    use_raw=True,
    vmin=0,
    save="TNKILC_CD4_Tn_score.pdf",
    color_map="YlOrRd",
    show=True,
)
print("length of gene list in adata.var:", len(CD4_Tn_signature_var))

In [None]:
# Score every cell for the signature(s) whose cluster name contains "CD4_Treg" and show
# the score on the UMAP.
# `na=False` treats rows with a missing cluster name as non-matching instead of failing
# on the boolean mask.
CD4_Treg_signature = tcell_phen_zhang.loc[
    tcell_phen_zhang["Cluster"].str.contains("CD4_Treg", na=False), "Gene"
].values.tolist()
print(len(CD4_Treg_signature))
# Keep only genes present in .raw. `adata.raw.var_names` is used directly: calling
# `adata.raw.to_adata()` inside the comprehension rebuilt a full AnnData copy per gene.
CD4_Treg_signature_var = [
    gene for gene in CD4_Treg_signature if gene in adata.raw.var_names
]
sc.tl.score_genes(
    adata, gene_list=CD4_Treg_signature_var, score_name="CD4_Treg_score", use_raw=True
)
sc.pl.umap(
    adata,
    color="CD4_Treg_score",
    vmax="p99",
    use_raw=True,
    vmin=0,
    save="TNKILC_CD4_Treg_score.pdf",
    color_map="YlOrRd",
    show=True,
)
print("length of gene list in adata.var:", len(CD4_Treg_signature_var))

In [None]:
# Pairwise DE: Leiden cluster 3 against cluster 5 (annotated further down as "Treg" and
# "Treg HSP"). Wilcoxon test with BH correction on the "log1p" layer, not on .raw.
sc.tl.rank_genes_groups(
    adata,
    groupby="leiden_scVI",
    groups=["3"],
    reference="5",
    method="wilcoxon",
    corr_method="benjamini-hochberg",
    pts=True,
    use_raw=False,
    layer="log1p",
    key_added="rank_genes_wilcoxon_3",
)

# Top 30 ranked genes for cluster 3.
sc.pl.rank_genes_groups(adata, key="rank_genes_wilcoxon_3", groups=["3"], n_genes=30)

In [None]:
# Pairwise DE: Leiden cluster 5 against cluster 3 (annotated further down as "Treg HSP"
# and "Treg"). Wilcoxon test with BH correction on the "log1p" layer, not on .raw.
sc.tl.rank_genes_groups(
    adata,
    groupby="leiden_scVI",
    groups=["5"],
    reference="3",
    method="wilcoxon",
    corr_method="benjamini-hochberg",
    pts=True,
    use_raw=False,
    layer="log1p",
    key_added="rank_genes_wilcoxon_5",
)

# Top 30 ranked genes for cluster 5.
sc.pl.rank_genes_groups(adata, key="rank_genes_wilcoxon_5", groups=["5"], n_genes=30)

In [None]:
# Pairwise DE: Leiden cluster 2 against cluster 10 (annotated further down as "CD8 Tem"
# and "Myeloid-T doublet"). Wilcoxon test with BH correction on the "log1p" layer.
sc.tl.rank_genes_groups(
    adata,
    groupby="leiden_scVI",
    groups=["2"],
    reference="10",
    method="wilcoxon",
    corr_method="benjamini-hochberg",
    pts=True,
    use_raw=False,
    layer="log1p",
    key_added="rank_genes_wilcoxon_2",
)

# Top 30 ranked genes for cluster 2.
sc.pl.rank_genes_groups(adata, key="rank_genes_wilcoxon_2", groups=["2"], n_genes=30)

In [None]:
# Pairwise DE: Leiden cluster 10 against cluster 2 (annotated further down as
# "Myeloid-T doublet" and "CD8 Tem"). Wilcoxon test with BH correction on the "log1p" layer.
sc.tl.rank_genes_groups(
    adata,
    groupby="leiden_scVI",
    groups=["10"],
    reference="2",
    method="wilcoxon",
    corr_method="benjamini-hochberg",
    pts=True,
    use_raw=False,
    layer="log1p",
    key_added="rank_genes_wilcoxon_10",
)

# Top 30 ranked genes for cluster 10.
sc.pl.rank_genes_groups(adata, key="rank_genes_wilcoxon_10", groups=["10"], n_genes=30)

In [None]:
# Pairwise DE: Leiden cluster 10 against cluster 13 (annotated further down as
# "Myeloid-T doublet" and "CD8 Tem HSP"). Wilcoxon test with BH correction on the "log1p" layer.
sc.tl.rank_genes_groups(
    adata,
    groupby="leiden_scVI",
    groups=["10"],
    reference="13",
    method="wilcoxon",
    corr_method="benjamini-hochberg",
    pts=True,
    use_raw=False,
    layer="log1p",
    key_added="rank_genes_wilcoxon_10_13",
)

# Top 30 ranked genes for cluster 10.
sc.pl.rank_genes_groups(
    adata, key="rank_genes_wilcoxon_10_13", groups=["10"], n_genes=30
)

In [None]:
# Pairwise DE: Leiden cluster 13 against cluster 10 (annotated further down as
# "CD8 Tem HSP" and "Myeloid-T doublet"). Wilcoxon test with BH correction on the "log1p" layer.
sc.tl.rank_genes_groups(
    adata,
    groupby="leiden_scVI",
    groups=["13"],
    reference="10",
    method="wilcoxon",
    corr_method="benjamini-hochberg",
    pts=True,
    use_raw=False,
    layer="log1p",
    key_added="rank_genes_wilcoxon_13_10",
)

# Top 30 ranked genes for cluster 13.
sc.pl.rank_genes_groups(
    adata, key="rank_genes_wilcoxon_13_10", groups=["13"], n_genes=30
)

In [None]:
# Pairwise DE: Leiden cluster 13 against cluster 2 (annotated further down as
# "CD8 Tem HSP" and "CD8 Tem"). Wilcoxon test with BH correction on the "log1p" layer.
sc.tl.rank_genes_groups(
    adata,
    groupby="leiden_scVI",
    groups=["13"],
    reference="2",
    method="wilcoxon",
    corr_method="benjamini-hochberg",
    pts=True,
    use_raw=False,
    layer="log1p",
    key_added="rank_genes_wilcoxon_13_2",
)

# Top 30 ranked genes for cluster 13.
sc.pl.rank_genes_groups(
    adata, key="rank_genes_wilcoxon_13_2", groups=["13"], n_genes=30
)

In [None]:
# Pairwise DE: Leiden cluster 2 against cluster 13 (annotated further down as "CD8 Tem"
# and "CD8 Tem HSP"). Wilcoxon test with BH correction on the "log1p" layer.
sc.tl.rank_genes_groups(
    adata,
    groupby="leiden_scVI",
    groups=["2"],
    reference="13",
    method="wilcoxon",
    corr_method="benjamini-hochberg",
    pts=True,
    use_raw=False,
    layer="log1p",
    key_added="rank_genes_wilcoxon_2_13",
)

# Top 30 ranked genes for cluster 2.
sc.pl.rank_genes_groups(adata, key="rank_genes_wilcoxon_2_13", groups=["2"], n_genes=30)

In [None]:
# Reload the checkpoint written after clustering. Anything added to `adata` in memory
# after that save (signature scores, pairwise DE results) is not in this object.
adata = sc.read_h5ad(
    DIR2SAVE / "SMC_KUL_Pelka_Che_Wu_CRC_integrated_scvi_hvg_TNKILC_clustering.h5ad"
)

### Annotate cell types

In [None]:
# Manual annotation: Leiden cluster id (as string) -> cell-type label.
# The trailing comments list the genes that motivated each label.
old_to_new = {
    "0": "CD8 Tex",  # exhausted: GZMB, GZMA, CCL5, HAVCR2, GNLY, NKG7, CCL4, CD8A, IFNG, GZMH, LAG3, PRF1, TIGIT, PDCD1, TOX; resembles TPEX from Mirjana's T cell paper "Two subsets of stem-like CD8 memory T cell progenitors with distinct fate commitments in humans"
    "1": "CD4 Tn",  # naive: CCR7, LEF1, SELL, TCF7
    "2": "CD8 Tem",  # effector memory: GZMK, CCL4, CD8A, GZMM, HLA-DRB1, HLA-DRA, IFNG, CCL5, NKG7, EOMES, GZMA
    "3": "Treg",  # FOXP3, IL2RA, CTLA4, TIGIT
    "4": "CD4 Th",  # ANXA1, CD40LG, IL7R
    "5": "Treg HSP",  # FOXP3, IL2RA, CTLA4, TIGIT, CD79B?
    "6": "NK",  # NCAM1, FCER1G, KLRC1
    "7": "CD4 Th17",  # IL22, IL17A, RORA, PDE4D
    "8": "T cycling",
    "9": "gdT",  # TRDC, TRGC1, ITGA1, but possibly with some NK
    "10": "Myeloid-T doublet",  # APOC1, SPP1, APOE, HSP, IGHG, JCHAIN
    "11": "CD4 Th HSP",  # CD40LG, IL7R, and the UMAP shows CD4
    "12": "CD4 Tfh",  # CD4, ITM2A, LPAR6, PDCD1
    "13": "CD8 Tem HSP",  # effector memory with HSP: GZMK, CCL5, CCL4, IFNG, EOMES, NKG7, GZMH, CD8B, CD8A, GZMA
    "14": "B-T doublet",  # CD22, CD79A, MS4A1, CD79B
    "15": "ILC3",  # IL4I1, RORC, TNFRSF25
}

In [None]:
# Translate cluster ids into cell-type labels.
# `Series.map` silently yields NaN for any cluster id missing from `old_to_new` (e.g. if
# the clustering was re-run and produced more clusters), and those NaNs would then be
# written to the annotation files. Fail loudly instead.
unmapped_clusters = sorted(set(adata.obs["leiden_scVI"]) - set(old_to_new))
if unmapped_clusters:
    raise ValueError(
        "No annotation defined in old_to_new for clusters: {}".format(unmapped_clusters)
    )

adata.obs["Annotation_scVI_detailed"] = (
    adata.obs["leiden_scVI"].map(old_to_new).astype("category")
)

In [None]:
# UMAP coloured by the detailed annotation, legend placed in the right margin.
sc.pl.umap(
    adata,
    color="Annotation_scVI_detailed",
    legend_loc="right margin",
    show=True,
)

In [None]:
# TODO: dot plot with all annotated cell types.
# Combined marker panel for the summary dot plot; entries are copied from the
# general / CD4 / CD8 / other / ILC marker dictionaries defined earlier.
markers_T_dotplot = {
    "T": ["TRAC"],
    "CD4": ["CD4"],
    "CD8": ["CD8A", "CD8B"],
    "Cycling": ["MKI67"],
    "Treg": ["FOXP3", "CTLA4", "CCR4", "IL2RA"],  # Tregs are CD127 (IL7R) low
    "Exhausted T": [
        "PDCD1",
        "LAG3",
        "HAVCR2",
        "CTLA4",
        "TIGIT",
        "ENTPD1",
    ],  # PD1: PDCD1, TIM3: HAVCR2, CD39: ENTPD1; see https://www.nature.com/articles/s41467-021-23324-4
    "NK": [
        "EOMES",
        "CMC1",
        "GZMK",
        "XCL1",
        "NKG7",
        "PRF1",
        "NCR1",
        "NCAM1",
        "FCER1G",
        "ITGA1",
        "GZMB",
        "FCGR3A",
    ],
    "ILC": ["AREG", "TLE1", "IL4I1"],
    "Th": ["CD4", "TRAC", "CD3D", "TRBC1", "TRBC2"],
    "Tfh": ["ICOS", "CXCR5", "TCF7", "PDCD1", "CCR7"],
    "Naïve CD4 T": ["CCR7", "SELL", "TCF7", "LEF1"],
    "Th17": ["IL17A", "ODF2L", "IL7R", "PDE4D"],
    "Effector CD8": ["CCL4", "CCL5", "GZMK", "GZMB", "PFN1", "GZMA", "GZMH", "NKG7"],
    "gdT": ["KLRC2", "TRGC1", "TRGC2", "TRDC"],
    "ILC3": ["IL4I1", "RORC", "TNFRSF25", "SPINK2", "KLRB1", "IL7R"],
}

In [None]:
# Mean expression per cluster for the combined marker panel, read from .raw and
# rescaled per gene to [0, 1] (standard_scale="var").
# NOTE(review): the output file is named "annotations" but rows are still grouped by
# "leiden_scVI", not "Annotation_scVI_detailed" — confirm which grouping is intended.
sc.pl.dotplot(
    adata,
    var_names=markers_T_dotplot,
    groupby="leiden_scVI",
    use_raw=True,
    standard_scale="var",
    vmin=0,
    vmax=1,
    cmap="plasma_r",
    colorbar_title="Mean expression",
    dendrogram=False,
    show=True,
    save="TNKILC_annotations_dotplot_mean.pdf",
)

### Write to file

In [None]:
# Persist the annotated object; this overwrites the clustering checkpoint written earlier
# (same file name).
adata.write(
    DIR2SAVE / "SMC_KUL_Pelka_Che_Wu_CRC_integrated_scvi_hvg_TNKILC_clustering.h5ad"
)

In [None]:
# Export the per-cell annotation as a tab-separated table (cell index plus one column).
adata.obs["Annotation_scVI_detailed"].to_csv(
    DIR2SAVE / "SMC_KUL_Pelka_Che_Wu_CRC_integrated_scvi_hvg_annotations_TNKILC.txt",
    sep="\t",
    header=True,
    index=True,
)

In [None]:
# Gene panel labelled "paper dNK1, our NK2" in the original note, read from .raw.
sc.pl.umap(
    adata,
    color=[
        "ENTPD1", "CSF1", "KLRC1", "KLRC2", "TIGIT", "CYP26A1",
        "B4GALNT1", "CD9", "KIR2DL1", "KIR2DL3", "LILRB1", "KLRC3",
    ],
    use_raw=True,
    color_map="plasma_r",
)

In [None]:
# Markers used to inspect the suspected doublet clusters. Original gene notes:
# CD22, CD79A, MS4A1, CD79B and APOC1, SPP1, APOE, HSP, IGHG, JCHAIN.
# The colour scale starts at 0.4 here rather than 0.
sc.pl.umap(
    adata,
    color=["APOC1", "APOE", "JCHAIN", "IGHA2", "CD79A"],
    use_raw=True,
    vmin=0.4,
    vmax="p99",
    color_map="plasma_r",  #'RdPu',
    show=True,
    save="general_markers_doublet.pdf",
)


In [None]:
# Lineage and stress markers used to check the suspected doublet clusters.
doublet_markers_dict = {
    "Myeloid": ["APOE", "APOC1"],
    "B": ["CD79A", "MS4A1", "CD22"],
    "T": ["TRAC", "CD3E", "CD3D", "CD4", "CD8A"],
    "NK": ["NCAM1", "FCER1G"],
    "Cycling": ["MKI67"],
    "Treg": ["FOXP3", "IL2RA"],
    "immediate early genes": ["JUN", "FOS"],
    "HSP": ["HSPA1A", "HSPA1B", "HSP90AA1", "HSP90AB1"],
}

# Log fold changes from the cluster-vs-rest test for this panel.
sc.pl.rank_genes_groups_dotplot(
    adata,
    var_names=doublet_markers_dict,
    groupby="leiden_scVI",
    key="rank_genes_wilcoxon",
    values_to_plot="logfoldchanges",
    min_logfoldchange=1,
    vmin=-4,
    vmax=4,
    cmap="bwr",
    colorbar_title="log fold change",
    show=True,
    save="general_dotplot_doublet.pdf",
)

In [None]:
# Build an AnnData from .raw and add a z-scored copy of its matrix as layer "scaled".
# `raw.to_adata()` already returns an independent object, so the extra `.copy()` that
# duplicated the full matrix once more has been dropped.
adata_zscore = adata.raw.to_adata()
# Scale into a separate object and keep only its matrix, so .X stays unscaled.
adata_zscore.layers["scaled"] = sc.pp.scale(adata_zscore, copy=True).X
adata_zscore.obs = adata.obs

# Mean z-score per Leiden cluster for the doublet-check markers.
# NOTE(review): dot size is derived from the plotted layer with the default expression
# cutoff, so with z-scores it may reflect cells above the gene's mean rather than cells
# expressing the gene — confirm this is the intended reading of "Fraction of cells (%)".
sc.pl.dotplot(
    adata=adata_zscore,
    var_names=doublet_markers_dict,
    groupby="leiden_scVI",
    cmap="seismic",
    layer="scaled",
    vmin=-2,
    vmax=2,
    dot_max=0.5,
    colorbar_title="scaled expression",
    size_title="Fraction of cells (%)",
    save="zscore_general_doublet.pdf",
)

In [None]:
# CD3 expression (CD3E, CD3G) on the UMAP, read from .raw.
sc.pl.umap(
    adata,
    color=["CD3E", "CD3G"],
    use_raw=True,
    vmin=0,
    vmax="p99",
    color_map="plasma_r",  #'RdPu',
    show=True,
    save="general_markers_cd3.pdf",
)
