# Keap tumors WT vs KO
Python analysis using scanpy 1.9.3
Docker image: TODO: add a link to the image on Docker Hub

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import pathlib
import anndata as anndata
import matplotlib as mpl
import decoupler as dc

In [None]:
# Matplotlib overrides: no LaTeX text rendering, and 'svg.fonttype' = 'none'
# so that text in exported SVGs is kept as text instead of drawn paths.
new_rc_params = {
    'text.usetex': False,
    'svg.fonttype': 'none',
}
mpl.rcParams.update(new_rc_params)

In [None]:
import sys
sys.path.insert(0, "../../shared/src")
import decoupler_helpers

In [None]:
# Scanpy runtime configuration.
sc.settings.verbosity = 3  # 0 = errors, 1 = warnings, 2 = info, 3 = hints
sc.logging.print_header()
sc.settings.set_figure_params(
    dpi=150,
    facecolor='white',
    dpi_save=300,
    frameon=False,
)

# Folder used by scanpy when a plotting call is given `save=...`.
results_path = '../results/'
sc.settings.figdir = results_path

# Global switch read by the plotting cells to decide whether to write files.
save_figure = True

# Shared plot styling: categorical UMAPs use the smaller, more transparent
# points; continuous (gene / score) UMAPs use the larger, more opaque ones.
aspect_ratio = 1
umap_point_size, umap_transparency = 15, 0.3
umap_continuous_point_size, umap_continuous_transparency = 30, 0.7

In [None]:
# Output folder for tables and figures; created up front, and it is not an
# error if it already exists.
path_to_results = pathlib.Path('../results')
path_to_results.mkdir(exist_ok=True, parents=True)

# Folder holding the raw 10x input matrices.
data_path = pathlib.Path('../data/raw')

# Data import

In [None]:
# Sample identifiers, named '<genotype>_tumor_<replicate>'. The order of this
# list is the order in which samples are loaded and later concatenated.
samples = [
    'KeapWT_tumor_2',
    'KeapWT_tumor_1',
    'KeapKO_tumor_2',
    'KeapKO_tumor_1',
]

In [None]:
# Load the filtered 10x HDF5 matrix of every sample and tag each cell with its
# sample name and genotype (the part of the sample name before the first '_').
adatas = []
for sam in samples:
    # TODO: add backup URL (GEO link)
    h5_file = data_path / '{}_sample_filtered_feature_bc_matrix.h5'.format(sam)
    adata = sc.read_10x_h5(h5_file)

    genotype = sam.split('_')[0]
    adata.obs['sample'] = sam
    adata.obs['genotype'] = genotype

    print(adata.shape)
    # Duplicate gene symbols are made unique so var_names can serve as an index.
    adata.var_names_make_unique()
    adatas.append(adata)


# QC

In [None]:
 # Flag mitochondrial ('mt-' prefix) and 'Rp'-prefixed genes, then compute the
 # per-cell QC metrics (n_genes_by_counts, total_counts, pct_counts_mt,
 # pct_counts_Rp) in a single pass instead of two full passes over the matrix.
 for adata in adatas:
    adata.var['mt'] = adata.var_names.str.startswith('mt-')
    # NOTE(review): the bare 'Rp' prefix presumably targets ribosomal protein
    # genes (Rps*/Rpl*), but it also matches any other symbol starting with
    # 'Rp' — confirm this is intended.
    adata.var['Rp'] = adata.var_names.str.startswith('Rp')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt', 'Rp'], percent_top=None, log1p=False, inplace=True)

In [None]:
 # One violin panel per QC covariate for every sample, before any filtering.
 for adata, name in zip(adatas, samples):
    print(name)
    qc_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_Rp']
    violin_file = f"{name}.png" if save_figure else None
    sc.pl.violin(adata, qc_keys, jitter=0.4, multi_panel=True, save=violin_file)

In [None]:
# Pairwise QC scatter plots per sample, listed as (x, y, colour-by).
qc_scatter_panels = (
    ('total_counts', 'pct_counts_mt', 'n_genes_by_counts'),
    ('total_counts', 'n_genes_by_counts', 'pct_counts_mt'),
    ('pct_counts_mt', 'pct_counts_Rp', 'n_genes_by_counts'),
)
for adata in adatas:
    for x_key, y_key, color_key in qc_scatter_panels:
        sc.pl.scatter(adata, x=x_key, y=y_key, color=color_key)

In [None]:
# Bounds on the number of detected genes per cell; they are drawn here as red
# lines and applied by the filtering cell further down.
min_numof_genes = 200
max_numof_genes = 8000  # previously 7000

# Four histograms per sample: total counts (all cells and < 40000 only) and
# genes per cell (all cells and a zoom just above the lower bound).
for adata, name in zip(adatas, samples):
    fig, axs = plt.subplots(1, 4, figsize=(12, 3))
    fig.suptitle(f"Covariates for filtering: {name}")

    total_counts = adata.obs["total_counts"]
    n_genes = adata.obs["n_genes_by_counts"]

    sns.histplot(total_counts, kde=False, ax=axs[0])
    sns.histplot(total_counts[total_counts < 40000], kde=False, bins=40, ax=axs[1])

    sns.histplot(n_genes, kde=False, bins=60, ax=axs[2])
    axs[2].axvline(x=min_numof_genes, color='red')
    axs[2].axvline(x=max_numof_genes, color='red')

    sns.histplot(n_genes[n_genes < min_numof_genes + 1000], kde=False, bins=60, ax=axs[3])
    # Draw on the zoomed panel explicitly instead of relying on pyplot's
    # implicit "current axes".
    axs[3].axvline(x=min_numof_genes, color='red')

# Filtering

In [None]:
# Keep only cells whose mitochondrial count percentage is below the cutoff.
max_pct_mt = 5
for idx, adata in enumerate(adatas):
    # Subsetting an AnnData returns a view of its parent. The objects are
    # modified in place by the filtering that follows, so store a real copy.
    adatas[idx] = adata[adata.obs['pct_counts_mt'] < max_pct_mt, :].copy()

In [None]:
 # Drop cells outside [min_numof_genes, max_numof_genes] detected genes, then
 # drop genes that are not detected in at least one remaining cell.
 for adata in adatas:
    for gene_bound in ({'min_genes': min_numof_genes}, {'max_genes': max_numof_genes}):
        sc.pp.filter_cells(adata, **gene_bound)
    sc.pp.filter_genes(adata, min_cells=1)
    print(adata.shape)

# QC after filtering

In [None]:
# Same QC violin panels as before filtering, now on the filtered cells.
for adata, name in zip(adatas, samples):
    qc_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_Rp']
    violin_file = f"{name}_filtered.png" if save_figure else None
    sc.pl.violin(adata, qc_keys, jitter=0.4, multi_panel=True, save=violin_file)

In [None]:
# Keep the unnormalised matrix of every sample in a 'counts' layer.
for adata in adatas:
    # Store a copy: assigning .X itself would make the layer share the same
    # matrix object, so any in-place change to .X would also alter 'counts'.
    adata.layers['counts'] = adata.X.copy()

In [None]:
# Display the sample order that the concatenation below will follow.
samples

# Concatenate samples

In [None]:
# Merge all samples into one AnnData. join="outer" keeps the union of genes;
# entries for genes absent from a sample are filled with 0.
adata = anndata.concat(adatas, join="outer", fill_value=0)
# Barcodes can repeat across samples; make the cell index unique.
adata.obs_names_make_unique()
# 'sample' and 'genotype' were assigned as plain strings. Later cells access
# them through `.cat.categories`, so convert string annotations to
# categoricals here rather than relying on a plotting call to do it.
adata.strings_to_categoricals()

# Normalization, transformation and PCA

sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

scaled_counts = sc.pp.normalize_total(adata, target_sum=None, inplace=False)
adata.layers["concat_log1p_norm"] = sc.pp.log1p(scaled_counts["X"], copy=True)

In [None]:
# Normalise the total counts of every cell, then log1p-transform; both calls
# modify adata.X in place. With target_sum=None scanpy scales each cell to the
# median of the pre-normalisation total counts (per the scanpy documentation).
sc.pp.normalize_total(adata, target_sum = None)
sc.pp.log1p(adata)

In [None]:
# Flag highly variable genes with scanpy's default settings; the PCA cell
# below restricts itself to them (use_highly_variable=True).
sc.pp.highly_variable_genes(adata)

adata.X = adata.layers['concat_log1p_norm']
sc.pp.pca(adata, n_comps = 20, use_highly_variable=True)

In [None]:
# PCA with 20 components, computed on the highly variable genes only.
sc.pp.pca(adata, n_comps = 20, use_highly_variable=True)

In [None]:
# Build the cell neighbourhood graph with scanpy's default parameters.
sc.pp.neighbors(adata)

In [None]:
# Compute the UMAP embedding used by all UMAP plots below.
sc.tl.umap(adata)

In [None]:
# Leiden clustering at resolution 0.8; the labels are written to
# adata.obs['leiden'], which the annotation cell further down maps to cell types.
sc.tl.leiden(adata, resolution=0.8)

In [None]:
# UMAP coloured by sample, Leiden cluster and genotype (one figure each).
# NOTE(review): `layer` is presumably irrelevant when colouring by an .obs
# column — confirm, and drop the argument if so.
for obs_key in ['sample', 'leiden', 'genotype']:
    fig, ax1 = plt.subplots(1, 1)
    ax1.set_aspect('equal')
    sc.pl.umap(
        adata,
        color=obs_key,
        layer='concat_log1p_norm',
        size=umap_point_size,
        alpha=umap_transparency,
        ax=ax1,
        # Honour the global switch, like every other plotting cell does.
        save='_{}.png'.format(obs_key) if save_figure else None,
    )

In [None]:
# Estimate the cell density on the embedding separately for each sample;
# the result is plotted per sample in the next code cell.
sc.tl.embedding_density(adata, groupby='sample')

# Fig 1B

In [None]:
# One density figure per sample, written with a transparent background.
for sam in adata.obs['sample'].cat.categories:
    fig = sc.pl.embedding_density(adata, groupby='sample', group=sam, color_map='Greys', return_fig=True)
    # Only write files when saving is enabled, consistent with the other cells.
    if save_figure:
        fig.savefig(path_to_results / 'umap_density_sample_{}.png'.format(sam), transparent=True)

# Cell type annotation

In [None]:
# Marker genes plotted on the UMAP to guide the manual cell type annotation.
# Considered but currently not plotted: 'Ptprc' (leukocytes) and 'Fcgr2b'
# (macrophages).
# NOTE(review): 'Epcam' is listed under 'Endothelial cells'; Epcam is usually
# regarded as an epithelial marker — confirm the intended assignment.
cell_type_marker_genes = {
    'Neutrophils': ['S100a8', 'S100a9', 'Csf3r'],
    'Macrophages': ['Cd68', 'Csf1r', 'Adgre1'],
    'DC': ['Batf3', 'Zbtb46', 'Ccr7'],
    'T-cells': ['Cd8a', 'Cd4', 'Cd3d'],
    'NK-cells': ['Gzmb', 'Gzma', 'Klrk1'],
    'B-cells': ['Cd19', 'Ms4a1'],
    'Epithelial cells': ['Krt19'],
    'Endothelial cells': ['Epcam'],
    'Tumor cells': ['Twist1', 'Tead1'],
    'CAF': ['Col1a1', 'Cxcl5', 'Mmp2', 'Fap'],
}

In [None]:
# One UMAP per marker gene, grouped (in the printed output) by cell type.
for cell_type, genes in cell_type_marker_genes.items():
    print(cell_type)
    for gene in genes:
        fig, ax1 = plt.subplots(1, 1)
        ax1.set_aspect('equal')
        umap_file = '_{}.png'.format(gene) if save_figure else None
        sc.pl.umap(
            adata,
            color=gene,
            size=umap_continuous_point_size,
            alpha=umap_continuous_transparency,
            ax=ax1,
            save=umap_file,
        )

In [None]:
# Wilcoxon-based marker gene ranking for every Leiden cluster, followed by a
# plot of the 25 top-ranked genes per cluster (independent y axes).
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

In [None]:
# UMAP coloured by Leiden cluster, as a visual reference for the manual
# cluster-to-cell-type mapping in the next cell.
sc.pl.umap(adata, color = 'leiden')

In [None]:
# create a dictionary to map cluster to annotation label
cluster2celltype = {
     '0': 'Macrophages',
     '1': 'Macrophages',
     '2': 'Macrophages',
     '3': 'Macrophages',
     '4': 'Macrophages',
     '5': 'Macrophages',
     '6': 'Macrophages',
     '7': 'Macrophages',
     '8': 'Macrophages',
     '9': 'DC',
     '10': 'Tumor cells',
     '11': 'CAF',
     '12': 'Macrophages',
    '13': 'Neutrophils',
    '14': 'NK cells'
}

# add a new `.obs` column called `cell type` by mapping clusters to annotation using pandas `map` function
adata.obs['cell_type'] = adata.obs['leiden'].map(cluster2celltype).astype('category')

In [None]:
# UMAP coloured by the annotated cell type, drawn with equal axis scaling.
# NOTE(review): presumably this first plot is also what creates
# adata.uns['cell_type_colors'], which the next cells read — confirm.
fig, ax1 = plt.subplots(1, 1)
ax1.set_aspect('equal')
cell_type_umap_file = '_cell_type.png' if save_figure else None
sc.pl.umap(
    adata,
    color='cell_type',
    size=umap_point_size,
    alpha=umap_transparency,
    ax=ax1,
    save=cell_type_umap_file,
)

In [None]:
# Fetch the colour palette stored for 'cell_type'. This binds the same object
# that lives in adata.uns; no copy is made.
cell_type_colors = adata.uns['cell_type_colors']

In [None]:
# Display the palette before the swap.
cell_type_colors

In [None]:
# Swap the palette entries at positions 2 and 4.
# NOTE(review): the positions presumably follow the order of the 'cell_type'
# categories, so which two cell types trade colours depends on that order —
# confirm if the annotation labels ever change.
cell_type_colors[4], cell_type_colors[2] = cell_type_colors[2], cell_type_colors[4]

In [None]:
# Display the palette after the swap.
cell_type_colors

In [None]:
# Write the swapped palette back to adata.uns['cell_type_colors'].
adata.uns['cell_type_colors'] = cell_type_colors

In [None]:
# Display the palette as stored on the AnnData object.
adata.uns['cell_type_colors']

# Fig 1B (middle)

In [None]:
# Cell type UMAP redrawn after the palette swap. It uses the same file name as
# the earlier cell type plot, so a saved figure replaces the earlier one.
fig, ax1 = plt.subplots(1, 1)
ax1.set_aspect('equal')
cell_type_umap_file = '_cell_type.png' if save_figure else None
sc.pl.umap(
    adata,
    color='cell_type',
    size=umap_point_size,
    alpha=umap_transparency,
    ax=ax1,
    save=cell_type_umap_file,
)

In [None]:
# Macrophage-only subset used for all remaining analyses. Subsetting returns a
# view of `adata`; later cells write scores, .obs columns and DE results to
# this object, so take an explicit copy instead of modifying a view.
adata_mac = adata[adata.obs.cell_type == 'Macrophages'].copy()

# Fig 4B (right)

In [None]:
# Macrophage UMAP coloured by genotype, drawn with equal axis scaling.
fig, ax1 = plt.subplots(1, 1)
ax1.set_aspect('equal')
mac_genotype_file = '_mac_genotype.png' if save_figure else None
sc.pl.umap(
    adata_mac,
    color='genotype',
    size=umap_point_size,
    alpha=umap_transparency,
    ax=ax1,
    save=mac_genotype_file,
)

# Cell function

In [None]:
# Genes plotted on the macrophage UMAP in the next cell (overall and split by
# genotype).
marker_genes = [
    'Arg1', 'Spp1', 'Gsr', 'Gclm', 'Slc7a11', 'Mrc1', 'Ccl8', 'C1qa',
    'Stat1', 'H2-Ab1', 'H2-Eb1', 'Cd74', 'Chil3',
    'Cd163', 'Marco', 'Cxcl9', 'Cxcl10', 'Cd5l', 'Retnla', 'Hmox1',
    'Cd274', 'Cd86', 'Gclc',
]

In [None]:
# For every marker gene: one UMAP over all macrophages, then one UMAP per
# genotype in which only that genotype's cells are coloured on top of an
# uncoloured background of all macrophages.
for gene in marker_genes:
    fig, ax1 = plt.subplots(1, 1)
    ax1.set_aspect('equal')
    sc.pl.umap(
        adata_mac,
        color=gene,
        size=umap_continuous_point_size,
        alpha=umap_continuous_transparency,
        ax=ax1,
        save='_{}.png'.format(gene) if save_figure else None,
    )

    # The colour scale is shared by all genotype panels of a gene, so its
    # limits are computed once per gene rather than once per genotype.
    gene_expr = adata_mac[:, gene].X
    expr_min, expr_max = gene_expr.min(), gene_expr.max()

    # Take the genotypes from the macrophage subset itself, as the DE and GSEA
    # cells further down do.
    for genotype in adata_mac.obs.genotype.cat.categories:
        print(genotype)
        fig, ax1 = plt.subplots(1, 1)
        ax1.set_aspect('equal')
        # Background layer: all macrophages, no colouring, figure kept open.
        sc.pl.umap(adata_mac, ax=ax1, size=umap_continuous_point_size, show=False)
        sc.pl.umap(
            adata_mac[adata_mac.obs.genotype.isin([genotype])],
            color=gene,
            frameon=False,
            ax=ax1,
            size=umap_continuous_point_size,
            alpha=umap_continuous_transparency,
            vmin=expr_min,
            vmax=expr_max,
            legend_loc='right margin',
            save='_{}_{}_score.png'.format(genotype, gene).replace(" ", "_") if save_figure else None,
        )

In [None]:
# Load the macrophage-function gene sets from a GMT file via the project helper.
# NOTE(review): presumably returns a long-format table with 'geneset' and
# 'genesymbol' columns (both are used below) — confirm in decoupler_helpers.
mac_function = decoupler_helpers.gmt_to_decoupler('../../shared/databases/macrophage_function.gmt')

In [None]:
# Gene-set names (as spelled in the macrophage-function database) that are
# scored per cell, plotted and exported below.
important_terms = [
    'Complement & Phagocytosis',
    'Oxidative Stress',
    'Cycling',
    'Antigen Processing And Presentation',
]

In [None]:
# Keep only the rows belonging to the selected gene sets.
mac_function = mac_function[mac_function.geneset.isin(important_terms)]

In [None]:
# Project helper that prepares the gene-set table for enrichment scoring.
# NOTE(review): min_geneset_size=1 presumably keeps every gene set with at
# least one gene — confirm in decoupler_helpers.
mac_function = decoupler_helpers.prepare_db_for_gsea(mac_function, min_geneset_size=1)

In [None]:
# Score every macrophage for the selected gene sets with decoupler's AUCell,
# using adata_mac.X (use_raw=False). The next cell reads the scores from
# adata_mac.obsm['aucell_estimate'].
dc.run_aucell(adata_mac, mac_function, source='geneset', target='genesymbol', use_raw=False)

In [None]:
# Copy the per-cell AUCell scores into .obs, one column per term, so the terms
# can be used as `color=` keys in the UMAP plots and exported with obs_df.
adata_mac.obs[important_terms] = adata_mac.obsm["aucell_estimate"][important_terms]

# Fig 4D

In [None]:
# For every gene-set score: one UMAP over all macrophages, then one UMAP per
# genotype in which only that genotype's cells are coloured on top of an
# uncoloured background of all macrophages.
for term in important_terms:
    fig, ax1 = plt.subplots(1, 1)
    ax1.set_aspect('equal')
    sc.pl.umap(
        adata_mac,
        color=term,
        size=umap_continuous_point_size,
        alpha=umap_continuous_transparency,
        color_map='magma',
        ax=ax1,
        save='_{}.png'.format(term).replace(" ", "_") if save_figure else None,
    )

    # The colour scale is shared by all genotype panels of a term, so its
    # limits are computed once per term rather than once per genotype.
    score_min, score_max = adata_mac.obs[term].min(), adata_mac.obs[term].max()

    # Take the genotypes from the macrophage subset itself, as the DE and GSEA
    # cells further down do.
    for genotype in adata_mac.obs.genotype.cat.categories:
        print(genotype)
        fig, ax1 = plt.subplots(1, 1)
        ax1.set_aspect('equal')
        # Background layer: all macrophages, no colouring, figure kept open.
        sc.pl.umap(adata_mac, ax=ax1, size=umap_continuous_point_size, show=False)
        sc.pl.umap(
            adata_mac[adata_mac.obs.genotype.isin([genotype])],
            color=term,
            frameon=False,
            ax=ax1,
            size=umap_continuous_point_size,
            alpha=umap_continuous_transparency,
            color_map='magma',
            vmin=score_min,
            vmax=score_max,
            legend_loc='right margin',
            save='_{}_{}_score.png'.format(genotype, term).replace(" ", "_") if save_figure else None,
        )

# DGE macrophages KO vs WT

In [None]:
# Wilcoxon-based gene ranking between genotypes within macrophages, followed
# by a plot of the 25 top-ranked genes per genotype (independent y axes).
sc.tl.rank_genes_groups(adata_mac, 'genotype', method='wilcoxon')
sc.pl.rank_genes_groups(adata_mac, n_genes=25, sharey=False)

In [None]:
# Ranked gene table for the 'KeapKO' group as a DataFrame.
# NOTE(review): `rank_genes_res` is not referenced again in the cells shown in
# this file — confirm whether it is still needed.
rank_genes_res = sc.get.rank_genes_groups_df(adata_mac, group = 'KeapKO')

# Fig 4C: generated from csv created below

In [None]:
# Export the ranked gene table of every genotype to CSV and keep a copy,
# indexed by gene name, as input for the GSEA below.
rnk_dict = {}
for gt in adata_mac.obs.genotype.cat.categories:
    dedf = sc.get.rank_genes_groups_df(adata_mac, group=gt)
    csv_path = path_to_results / "rank_genes_groups_{}.csv".format(gt)
    dedf.to_csv(csv_path)
    rnk_dict[gt] = dedf.set_index('names')

In [None]:
# Fetch the MSigDB gene-set resource through decoupler and display it.
msigdb = dc.get_resource('MSigDB')
msigdb

In [None]:
# List the collections available in the resource.
msigdb['collection'].unique()

In [None]:
# Filter by hallmark
msigdb = msigdb[msigdb['collection']=='hallmark']

# Remove duplicated entries
msigdb = msigdb[~msigdb.duplicated(['geneset', 'genesymbol'])]
msigdb

In [None]:
# Reformat the table to match the gene names in the data and shorten the term
# names:
#  - gene symbols: first character upper-case, the rest lower-case
#    (`str.capitalize` already lower-cases the remainder, so no separate
#    `.lower()` is needed);
#  - gene sets: drop the literal 'HALLMARK_' text.
# `assign` returns a new frame, which avoids attribute assignment on a frame
# that was produced by filtering (pandas SettingWithCopy warning).
# NOTE(review): capitalisation only approximates mouse symbols, e.g. 'H2-AB1'
# becomes 'H2-ab1' rather than 'H2-Ab1' — confirm this is acceptable.
msigdb = msigdb.assign(
    genesymbol=msigdb['genesymbol'].str.capitalize(),
    geneset=msigdb['geneset'].str.replace('HALLMARK_', '', regex=False),
)

In [None]:
# Display the reformatted hallmark table.
msigdb

# Fig 4C

In [None]:
# GSEA of the hallmark gene sets on the ranked gene scores of every genotype.
# Per genotype: write the full result table to CSV, draw a dot plot of the
# terms with FDR p-value <= 0.1, and optionally save it as SVG.
res = {}
for gt in adata_mac.obs.genotype.cat.categories:
    df = rnk_dict[gt]
    print(len(df))

    gsea_df = dc.get_gsea_df(
        df=df,
        stat='scores',
        net=msigdb,
        source="geneset",
        target="genesymbol",
        times=1000,
        min_n=5,
        seed=42,
        verbose=True,
    )
    res[gt] = gsea_df
    gsea_df.to_csv(path_to_results / 'GSEA_Hallmark_{}.csv'.format(gt))

    significant_terms = gsea_df[gsea_df['FDR p-value'] <= 0.1]
    dc.plot_dotplot(
        significant_terms,
        x='NES',
        y='Term',
        s='NES',
        c='FDR p-value',
        scale=1,
        figsize=(3, 12),
        title='{} vs. rest'.format(gt),
    )
    if save_figure:
        plt.savefig(path_to_results / 'GSEA_Hallmark_{}.svg'.format(gt), bbox_inches="tight")

# TF analysis

In [None]:
# Load the TRRUST transcription factor gene sets from a GMT file via the
# project helper.
# NOTE(review): presumably a long-format table with 'geneset' and 'genesymbol'
# columns (both are used below) — confirm in decoupler_helpers.
trust_db = decoupler_helpers.gmt_to_decoupler('../../shared/databases/TRRUST_Transcription_Factors_2019.gmt')

In [None]:
# Keep only the first occurrence of every (geneset, genesymbol) pair.
trust_db = trust_db[~trust_db.duplicated(['geneset', 'genesymbol'])]

In [None]:
# Reformat gene symbols to match the data: first character upper-case, the
# rest lower-case (`str.capitalize` already lower-cases the remainder).
# `assign` returns a new frame, which avoids attribute assignment on a frame
# that was produced by filtering (pandas SettingWithCopy warning).
trust_db = trust_db.assign(genesymbol=trust_db['genesymbol'].str.capitalize())

In [None]:
# Display the prepared TRRUST table.
trust_db

In [None]:
# Genes ranked for the 'KeapKO' group, restricted by a p-value cutoff of 1e-3
# and a minimum log2 fold change of 1.5.
# NOTE(review): scanpy presumably applies `pval_cutoff` to the adjusted
# p-values — confirm against the scanpy documentation.
de_KO = sc.get.rank_genes_groups_df(adata_mac, group='KeapKO', pval_cutoff=1e-3, log2fc_min=1.5)

In [None]:
# Use the gene names as the index; the 'names' column itself is dropped.
de_KO = de_KO.set_index('names', drop=True)

In [None]:
# Over-representation analysis of the selected KeapKO genes against the
# TRRUST transcription factor gene sets.
res_ko = dc.get_ora_df(df = de_KO, net = trust_db, source="geneset",
        target="genesymbol")

In [None]:
# Add -log10(FDR) as a plotting column (an FDR of exactly 0 would give inf).
res_ko['-log10(FDR p-value)'] = -np.log10(res_ko['FDR p-value'])

In [None]:
# Write the KO ORA table to the results folder.
res_ko.to_csv(path_to_results / 'ORA_TRRUST_KOvsWT.csv')

In [None]:
# Display the KO ORA table.
res_ko

# Fig 4E

In [None]:
# Dot plot of the TRRUST terms with FDR p-value <= 0.1 for the KO genes;
# both position and dot size encode -log10(FDR).
significant_ko = res_ko[res_ko['FDR p-value'] <= 0.1]
dc.plot_dotplot(
    significant_ko,
    x='-log10(FDR p-value)',
    y='Term',
    s='-log10(FDR p-value)',
    c='FDR p-value',
    scale=0.5,
    figsize=(3, 8),
    title='KO vs. WT',
)
if save_figure:
    plt.savefig(path_to_results / 'ORA_TRRUST_KOvsWT.svg', bbox_inches="tight")

In [None]:
# Genes ranked for the 'KeapWT' group, restricted by a p-value cutoff of 1e-3
# and a minimum log2 fold change of 1.5.
# NOTE(review): scanpy presumably applies `pval_cutoff` to the adjusted
# p-values — confirm against the scanpy documentation.
de_WT = sc.get.rank_genes_groups_df(adata_mac, group='KeapWT', pval_cutoff=1e-3, log2fc_min=1.5)

In [None]:
# Use the gene names as the index; the 'names' column itself is dropped.
de_WT = de_WT.set_index('names', drop=True)

In [None]:
# Over-representation analysis of the selected KeapWT genes against the
# TRRUST transcription factor gene sets.
res_wt = dc.get_ora_df(df = de_WT, net = trust_db, source="geneset",
        target="genesymbol")

In [None]:
# Add -log10(FDR) as a plotting column (an FDR of exactly 0 would give inf).
res_wt['-log10(FDR p-value)'] = -np.log10(res_wt['FDR p-value'])

In [None]:
# Write the WT ORA table to the results folder.
res_wt.to_csv(path_to_results / 'ORA_TRRUST_WTvsKO.csv')

In [None]:
# Display the WT ORA table.
res_wt

In [None]:
# Dot plot of the TRRUST terms with FDR p-value <= 0.1 for the WT genes;
# both position and dot size encode -log10(FDR).
significant_wt = res_wt[res_wt['FDR p-value'] <= 0.1]
dc.plot_dotplot(
    significant_wt,
    x='-log10(FDR p-value)',
    y='Term',
    s='-log10(FDR p-value)',
    c='FDR p-value',
    scale=0.5,
    figsize=(3, 12),
    title='WT vs. KO',
)
if save_figure:
    plt.savefig(path_to_results / 'ORA_TRRUST_WTvsKO.svg', bbox_inches="tight")

In [None]:
# Columns exported for external plotting: the gene-set scores plus the sample
# label. A new list is built so that `important_terms` itself is not modified.
obs_keys_to_export = [*important_terms, 'sample']

In [None]:
# Per-cell table for the macrophages holding the selected .obs columns and the
# two UMAP coordinates (columns 0 and 1 of .obsm['X_umap']).
data_export_for_plotting = sc.get.obs_df(adata_mac, keys= obs_keys_to_export, obsm_keys =[("X_umap", 0), ("X_umap", 1)])

In [None]:
# Write the plotting table to the results folder.
data_export_for_plotting.to_csv(path_to_results / 'data_for_plotting.csv')