# Analysis of macrophages

In [None]:
import scanpy as sc
import gseapy as gp
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import scipy as scipy
import pathlib
import anndata
import decoupler as dc
import itertools
import seaborn.objects as so

In [None]:
# Display the installed decoupler version so the requirement below can be checked by eye.
dc.__version__ # need version >= 1.6.0 for zero division error ("when many 0s were present") see changelog dc

In [None]:
import sys
# Put the shared source folder first on the module search path so the two helper
# modules below can be imported. The path is relative, so it is resolved against
# the current working directory (presumably the notebook's folder - confirm).
sys.path.insert(0, "../../shared/src")
import decoupler_helpers
import bicolor_embedding_plot

In [None]:
import matplotlib as mpl

# Matplotlib export settings: no LaTeX rendering, and fonts embedded so that
# text stays editable text when the SVG/PDF is opened in a vector editor.
new_rc_params = {
    'text.usetex': False,
    "svg.fonttype": 'none',
    "pdf.fonttype": 42,  # 42 is the code for TrueType
}
mpl.rcParams.update(new_rc_params)

In [None]:
# Scanpy session setup: logging level, version banner and figure defaults.
# Verbosity scale: errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 3
sc.logging.print_header()
sc.settings.set_figure_params(
    dpi=150,
    dpi_save=300,
    frameon=False,
    transparent=True,
)

In [None]:
# Input folder holding the preprocessed per-sample files, and output folder for
# everything this notebook writes.
path_to_intermediate_data = pathlib.Path('../data/intermediate/')
path_to_results = pathlib.Path('../results/analysis_of_macrophages')

# exist_ok=True makes this idempotent and removes the check-then-create race
# of a separate exists() test.
path_to_results.mkdir(parents=True, exist_ok=True)

# Point scanpy's figure directory at the (now existing) results folder.
sc.settings.figdir = path_to_results

In [None]:
# Switch consulted by plotting cells to decide whether figures are written to disk.
save_figure = True
aspect_ratio = 1

# Marker size and opacity for embeddings coloured by a categorical variable.
umap_point_size, umap_transparency = 15, 0.3

# Marker size and opacity for embeddings coloured by a continuous variable.
umap_continuous_point_size, umap_continuous_transparency = 30, 0.7

# Import preprocessed data

In [None]:
# All per-sample macrophage files in the intermediate data folder.
# Path.glob yields entries in arbitrary order, so sort for a reproducible list.
sample_files = sorted(path_to_intermediate_data.glob('macrophages_*.h5ad'))

In [None]:
# Sample name = second underscore-separated field of the file stem
# (e.g. 'macrophages_ctrl' -> 'ctrl').
samples = [h5ad_path.stem.split('_')[1] for h5ad_path in sample_files]

In [None]:
# Fix the sample set and its order explicitly instead of relying on whatever
# files were found on disk. The AnnData objects themselves are loaded in the
# following cell, which rebuilds `adatas` from scratch for exactly these
# samples; loading them here as well would only read every file twice.
samples = ['ctrl', '1xCD40', '3xCD40']

In [None]:
# Read one preprocessed AnnData per sample, keyed by sample name and kept in
# the order given by `samples`.
adatas = {
    sam: sc.read(path_to_intermediate_data / 'macrophages_{}.h5ad'.format(sam))
    for sam in samples
}

In [None]:
# Downsample every sample to the size of the smallest one so that all
# conditions contribute the same number of cells.
min_number_of_cells = min(ad.n_obs for ad in adatas.values())
for ad in adatas.values():
    # Modifies `ad` in place (no copy is requested).
    sc.pp.subsample(ad, n_obs=min_number_of_cells)

In [None]:
# Stack the per-sample objects into one AnnData. The outer join is requested
# together with fill_value=0 for entries a sample does not provide.
adata = anndata.concat(
    adatas,
    join="outer",
    fill_value=0,
)
# Cell barcodes may repeat across samples; make the observation names unique.
adata.obs_names_make_unique()

In [None]:
# Restart from the 'counts' layer so the normalisation below is computed on the
# concatenated object (presumably raw counts stored during preprocessing - confirm).
adata.X = adata.layers['counts'].copy()

In [None]:
# Normalise total counts per cell (scanpy defaults) and log1p-transform adata.X.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
# Keep a copy of the transformed matrix under its own layer name.
adata.layers['log1p_norm_concat'] = adata.X.copy()

In [None]:
# Flag highly variable genes with scanpy's default settings; the PCA cell
# below restricts itself to these genes.
sc.pp.highly_variable_genes(adata)

In [None]:
# 20-component PCA on the highly variable genes flagged above.
# NOTE(review): newer scanpy releases appear to deprecate `use_highly_variable`
# in favour of `mask_var` - confirm against the pinned scanpy version.
sc.pp.pca(adata, n_comps=20, use_highly_variable = True)

In [None]:
# PCA overview (components 1/2 and 2/3) coloured by QC metrics, sample of
# origin and a panel of marker genes.
pca_overview_colors = [
    'total_counts',
    'n_genes_by_counts',
    'sample',
    'Spp1',
    'Cxcl9',
    'Cxcl10',
    'Arg1',
    'Ccl5',
    'H2-Eb1',
    'Cd74',
]
sc.pl.pca_overview(
    adata,
    components=['1, 2', '2, 3'],
    color=pca_overview_colors,
    frameon=True,
)

In [None]:
# Gene loadings of the first principal component (column 0 of varm['PCs']),
# indexed by gene name.
pc1_loadings = pd.DataFrame(adata.varm['PCs'][:, 0], index=adata.var_names)

In [None]:
# Give the single loading column a descriptive name.
pc1_loadings.columns = ['PC1']

In [None]:
# Order genes from most positive to most negative PC1 loading; later cells
# rely on this order when they take head()/tail().
pc1_loadings = pc1_loadings.sort_values(by='PC1', ascending=False)

In [None]:
# Export the sorted PC1 loadings to the results folder.
pc1_loadings.to_csv(path_to_results / 'pc1_loadings.csv')

In [None]:
# Display the sorted loadings table.
pc1_loadings

In [None]:
# Genes at both extremes of PC1: the 50 largest loadings followed by the 50
# smallest, each part ordered from high to low.
# NOTE(review): despite the "top20" in its name this table holds 50 + 50 genes.
pc1_top20 = pd.concat(
    [
        pc1_loadings.nlargest(50, 'PC1'),
        pc1_loadings.nsmallest(50, 'PC1').sort_values(by='PC1', ascending=False),
    ]
)

In [None]:
# Display the table of extreme PC1 genes.
pc1_top20

# Fig 1D

In [None]:
# One dot per gene: PC1 loading on the x axis, gene name on the y axis.
(
    so.Plot(pc1_top20, x='PC1', y=pc1_top20.index)
    .add(so.Dot())
    .layout(size=(4, 15))
)

# Fig 1G

In [None]:
# Marker genes paired against each other in the two-colour embedding plots.
inflammatory_genes = [
    'Cxcl9',
    'Cxcl10',
    'H2-Eb1',
]
anti_inflammatory_genes = [
    'Spp1',
]

In [None]:
# One two-colour embedding per (anti-inflammatory, inflammatory) gene pair.
# Figures are written to the results folder only when `save_figure` is set.
bicolor_save_path = path_to_results if save_figure else None
for anti_gene, inflammatory_gene in itertools.product(anti_inflammatory_genes, inflammatory_genes):
    print(anti_gene, inflammatory_gene)
    bicolor_embedding_plot.bicolor_embedding_plot(
        adata,
        anti_gene,
        inflammatory_gene,
        save_path=bicolor_save_path,
    )

# Fig 1C

In [None]:
# Per-cell table with the sample label and the first two PCA coordinates;
# the coordinate columns are addressed as 'X_pca-0' / 'X_pca-1' further down.
df = sc.get.obs_df(adata, keys=['sample'], obsm_keys=[('X_pca', 0), ('X_pca', 1)])

In [None]:
# PCA scatter coloured by sample. The figure is written (into scanpy's figdir)
# only when `save_figure` is set, matching the other gated plots in this notebook.
sc.pl.pca(
    adata,
    color='sample',
    size=umap_point_size,
    alpha=umap_transparency,
    save='_sample.png' if save_figure else None,
)

In [None]:
# Filled kernel density of the PC1 coordinate, one curve per sample.
sns.kdeplot(data=df, x='X_pca-0', hue='sample', fill=True)
# Respect the notebook-wide switch instead of always writing the file.
if save_figure:
    plt.savefig(path_to_results / 'sample_density_on_PC1_with_filling.pdf')

# GSEA pc1 with Hallmark

In [None]:
# Fetch the MSigDB gene-set resource through decoupler
# (presumably downloaded over the network - confirm) and display it.
msigdb = dc.get_resource('MSigDB')
msigdb

In [None]:
# Restrict the resource to the hallmark collection.
is_hallmark = msigdb['collection'] == 'hallmark'
msigdb = msigdb.loc[is_hallmark]

# Keep the first occurrence of every (gene set, gene) pair.
msigdb = msigdb.drop_duplicates(subset=['geneset', 'genesymbol'])
msigdb

In [None]:
# Display the filtered hallmark table (repeats the output of the previous cell).
msigdb

In [None]:
# Display the filtered hallmark table once more (redundant inspection cell).
msigdb

In [None]:
# Run the project helper that prepares a gene-set table for GSEA
# (its exact transformations live in decoupler_helpers, not shown here).
msigdb = decoupler_helpers.prepare_db_for_gsea(msigdb)

In [None]:
# Rewrite gene symbols as "first letter upper-case, rest lower-case" so they
# can be matched against the gene names used in `adata`.
# NOTE(review): symbols with internal capitals (e.g. 'H2-Eb1', plotted above)
# cannot be produced by this rule - confirm the loss is acceptable.
msigdb['genesymbol'] = msigdb['genesymbol'].str.lower().str.capitalize()

# Drop the collection prefix from the gene-set names for shorter plot labels.
msigdb['geneset'] = msigdb['geneset'].str.replace('HALLMARK_', '')

In [None]:
# GSEA of the hallmark gene sets, ranking genes by their PC1 loading.
gsea_pc1_res = dc.get_gsea_df(
    pc1_loadings,
    stat='PC1',
    net=msigdb,
    source="geneset",
    target="genesymbol",
    verbose=True,
)

In [None]:
# Display the GSEA result table.
gsea_pc1_res

In [None]:
# Export the GSEA result table to the results folder.
gsea_pc1_res.to_csv(path_to_results / 'gsea_pc1.csv')

In [None]:
# Display the GSEA result table again (redundant inspection cell).
gsea_pc1_res

# Fig 1E

In [None]:
# Ten hallmark sets with the highest NES, with -log10(FDR) added as the colour variable.
gsea_pc1_hallmark_up = (
    gsea_pc1_res.sort_values('NES', ascending=False)
    .head(10)
    .assign(**{"-log10(FDR)": lambda x: -np.log10(x["FDR p-value"])})
)
# Write the PDF only when `save_figure` is set, like the other gated plots.
dc.plot_dotplot(
    gsea_pc1_hallmark_up,
    x="NES",
    y="Term",
    c='-log10(FDR)',
    s="NES",
    scale=2,
    cmap='viridis',
    save=(path_to_results / 'gsea_pc1_hallmark_up.pdf') if save_figure else None,
)

In [None]:
# Ten hallmark sets at the bottom of the NES ranking, with -log10(FDR) added
# as the colour variable.
gsea_pc1_hallmark_down = (
    gsea_pc1_res.sort_values('NES', ascending=False)
    .tail(10)
    .assign(**{"-log10(FDR)": lambda x: -np.log10(x["FDR p-value"])})
)
# Write the PDF only when `save_figure` is set, like the other gated plots.
dc.plot_dotplot(
    gsea_pc1_hallmark_down,
    x="NES",
    y="Term",
    c='-log10(FDR)',
    s="NES",
    scale=2,
    cmap='viridis',
    save=(path_to_results / 'gsea_pc1_hallmark_down.pdf') if save_figure else None,
)

# Overrepresentation analysis

In [None]:
# Load the TRRUST transcription-factor GMT file through the project helper
# (presumably into a decoupler-style table - confirm in decoupler_helpers).
trrust_db = decoupler_helpers.gmt_to_decoupler('../../shared/databases/TRRUST_Transcription_Factors_2019.gmt')

In [None]:
# Quick size check of the loaded table.
trrust_db.shape

In [None]:
# Same project-helper preparation step that was applied to the MSigDB table.
trrust_db = decoupler_helpers.prepare_db_for_gsea(trrust_db)

In [None]:
# Display the prepared TRRUST table.
trrust_db

In [None]:
# Rewrite gene symbols as "first letter upper-case, rest lower-case", the same
# rule applied to the MSigDB table, so they can be matched against PC1 gene names.
trrust_db['genesymbol'] = trrust_db['genesymbol'].str.lower().str.capitalize()

In [None]:
# Over-representation of TRRUST gene sets among the 20 genes with the most
# positive PC1 loading (pc1_loadings is sorted in descending order).
ora_pc1_top20up_tf_res = dc.get_ora_df(
    pc1_loadings.head(20).index,
    net=trrust_db,
    source="geneset",
    target="genesymbol",
)

In [None]:
# Over-representation of TRRUST gene sets among the 20 genes with the most
# negative PC1 loading (pc1_loadings is sorted in descending order).
ora_pc1_top20down_tf_res = dc.get_ora_df(
    pc1_loadings.tail(20).index,
    net=trrust_db,
    source="geneset",
    target="genesymbol",
)

In [None]:
# Display the ORA results for the negative-loading genes.
ora_pc1_top20down_tf_res

In [None]:
FDR_cutoff = 0.2
ora_pc1_top20down_tf_res_fdr_cutoff = ora_pc1_top20up_tf_res[ora_pc1_top20up_tf_res['FDR p-value'] <= FDR_cutoff]

# Fig 1F

In [None]:
dc.plot_dotplot(ora_pc1_top20down_tf_res_fdr_cutoff.sort_values("Combined score", ascending= False).head(10)
            .assign(**{"-log10(FDR)": lambda x: -np.log10(x["FDR p-value"])}), x="Combined score", y="Term", c = '-log10(FDR)', s = 'Overlap ratio', scale = 25, cmap = 'viridis', save = path_to_results / 'ora_pc1_top20up_TRRUST.pdf')