# KeapKO, Nrf2KO, WT
Python analysis using the scanpy 1.9 Docker image: cr.gitlab.uzh.ch/elena.duerst/docker-images/scanpy1p9_bioc:0.10
TODO: insert the link to the Docker image on Docker Hub

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import pathlib
import anndata as anndata
import matplotlib as mpl
import pathlib
import decoupler as dc

In [None]:
import sys
# Put the shared source folder first on the module search path so the two helper
# modules below can be imported (presumably they live in ../../shared/src - TODO confirm).
# The path is relative, so it resolves against the current working directory.
sys.path.insert(0, "../../shared/src")
import decoupler_helpers
import bicolor_embedding_plot

In [None]:
# Matplotlib text settings applied globally for all figures of this notebook.
new_rc_params = {}
new_rc_params['text.usetex'] = False
new_rc_params["svg.fonttype"] = 'none'
mpl.rcParams.update(new_rc_params)

In [None]:
# --- scanpy logging and figure defaults ---
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(
    dpi=100,
    dpi_save=300,
    facecolor='white',
    figsize=(4, 4),
    frameon=False,
)

# --- output folder; also used by scanpy for figures saved via `save=` ---
results_path = pathlib.Path('../results') / 'analysis'
sc.settings.figdir = results_path

# --- plotting constants shared by the cells below ---
umap_point_size, umap_transparency = 30, 0.3
umap_continuous_point_size, umap_continuous_transparency = 30, 0.7
aspect_ratio = 1
save_figure = False  # when False, cells that check this flag only display their figures

In [None]:
# Folder layout: ../data/raw holds the input matrices, ../data/intermediate holds
# derived files and is created on demand.
data_root = pathlib.Path('..') / 'data'
data_path = data_root / 'raw'
intermediate_data_path = data_root / 'intermediate'
intermediate_data_path.mkdir(parents=True, exist_ok=True)
results_file = intermediate_data_path / 'preprocessed.h5ad'

# Data import

In [None]:
# Sample types, listed in the order the original analysis relied on.
# Presumably each entry is the part of a sample name before the first underscore - TODO confirm.
samples = ['KeapKO', 'WT', 'Nrf2KO'] # order to get exact same results

In [None]:
# TODO: use GEO data: GSE290431

In [None]:
# Load every 10x filtered matrix found in data_path into a dict keyed by sample name.
# The sample name is the file name without the suffix below and is parsed as
# '<sample_type>_<day>[...]'.
h5_suffix = '_filtered_feature_bc_matrix.h5'


def _sample_order(path):
    """Sort key: position of the file's sample type in `samples`, then the file name."""
    sample_type = path.name.split('_')[0]
    # Unknown sample types are not dropped; they sort after the known ones.
    rank = samples.index(sample_type) if sample_type in samples else len(samples)
    return (rank, path.name)


# Path.glob yields files in an arbitrary, filesystem-dependent order. Sorting makes the
# dict - and therefore the later concatenation and everything derived from the cell
# order - reproducible across machines.
sample_files = sorted(data_path.glob('*' + h5_suffix), key=_sample_order)
adatas = {}
for sam in sample_files:
    adata = sc.read_10x_h5(sam)
    # The glob pattern guarantees the suffix is present, so it can be cut off by length.
    sample_name = sam.name[:-len(h5_suffix)]
    print(sample_name)
    name_parts = sample_name.split('_')
    sample_type = name_parts[0]
    day = name_parts[1]  # raises IndexError if the name has no '_<day>' part
    adata.obs['sample'] = sample_name
    adata.obs['day'] = day
    adata.obs['sample_type'] = sample_type
    print(adata.shape)
    # Gene symbols can repeat in 10x output; make them unique before any name-based access.
    adata.var_names_make_unique()
    adatas[sample_name] = adata

# QC

In [None]:
 # Flag 'mt-'-prefixed and 'Rp'-prefixed genes, then compute the QC metrics for both
 # gene sets in one call, so the matrix is only traversed once per sample.
 # This adds total_counts_<set> / pct_counts_<set> to obs for each flagged set.
 # NOTE(review): the bare 'Rp' prefix also matches genes such as Rpa1 or Rpe - confirm
 # that this is the intended ribosomal gene set.
 for adata in adatas.values():
     adata.var['mt'] = adata.var_names.str.startswith('mt-')
     adata.var['Rp'] = adata.var_names.str.startswith('Rp')
     sc.pp.calculate_qc_metrics(adata, qc_vars=['mt', 'Rp'], percent_top=None, log1p=False, inplace=True)

In [None]:
 # Per-sample violin plots of the QC covariates, before any filtering is applied.
 qc_violin_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_Rp']
 for name, ad in adatas.items():
     print(name)
     print(ad.shape)
     # Only hand a file name to scanpy when figures should be written to disk.
     violin_file = f"{name}.png" if save_figure else None
     sc.pl.violin(ad, qc_violin_keys, jitter=0.4, multi_panel=True, save=violin_file)

In [None]:
# Three pairwise QC scatter plots per sample, given as (x, y, color) obs columns.
qc_scatter_axes = (
    ('total_counts', 'pct_counts_mt', 'n_genes_by_counts'),
    ('total_counts', 'n_genes_by_counts', "pct_counts_mt"),
    ('pct_counts_mt', 'pct_counts_Rp', 'n_genes_by_counts'),
)
for name, adata in adatas.items():
    print(name)
    for x_key, y_key, color_key in qc_scatter_axes:
        sc.pl.scatter(adata, x=x_key, y=y_key, color=color_key)

In [None]:
# Thresholds on the number of detected genes per cell; they are drawn as red lines here
# and reused by the filtering cell further down.
min_numof_genes = 200
max_numof_genes = 10000  # 7000
for name, adata in adatas.items():
    total_counts = adata.obs["total_counts"]
    n_genes = adata.obs["n_genes_by_counts"]

    # Four panels: total counts (all / below 40000), genes per cell (all / low range).
    fig, axs = plt.subplots(1, 4, figsize=(12, 3))
    fig.suptitle(f"Covariates for filtering: {name}")
    ax_counts, ax_counts_zoom, ax_genes, ax_genes_zoom = axs

    sns.histplot(total_counts, kde=False, ax=ax_counts)
    sns.histplot(total_counts[total_counts < 40000], kde=False, bins=40, ax=ax_counts_zoom)

    plot = sns.histplot(n_genes, kde=False, bins=60, ax=ax_genes)
    for threshold in (min_numof_genes, max_numof_genes):
        plot.axvline(x=threshold, color='red')

    low_range = n_genes[n_genes < min_numof_genes + 1000]
    sns.histplot(low_range, kde=False, bins=60, ax=ax_genes_zoom)
    # plt.axvline draws on pyplot's current axes (presumably the last panel - TODO confirm).
    plt.axvline(x=min_numof_genes, color='red')

# Filtering

In [None]:
# Keep only cells whose mitochondrial count percentage is below max_pct_mt.
max_pct_mt = 5
for name, adata in adatas.items():
    # Boolean indexing returns a view of the parent AnnData. Store an independent copy
    # so that later in-place operations on the dict entries never act on a view.
    # Re-assigning existing keys while iterating is safe because the dict size is unchanged.
    adatas[name] = adata[adata.obs['pct_counts_mt'] < max_pct_mt, :].copy()

In [None]:
 # Apply the lower and upper gene-count thresholds to each sample (the objects stored in
 # the dict are modified directly), then drop genes that are detected in no cell.
 for adata in adatas.values():
     for cell_filter in ({'min_genes': min_numof_genes}, {'max_genes': max_numof_genes}):
         sc.pp.filter_cells(adata, **cell_filter)
     sc.pp.filter_genes(adata, min_cells=1)
     print(adata.shape)

# QC after filtering

In [None]:
# Same QC violin plots as before filtering, now on the filtered per-sample objects.
filtered_violin_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_Rp']
for name, adata in adatas.items():
    filtered_violin_file = f"{name}_filtered.png" if save_figure else None
    sc.pl.violin(adata, filtered_violin_keys, jitter=0.4, multi_panel=True, save=filtered_violin_file)

# Concatenate samples

In [None]:
# Combine all per-sample objects into one AnnData. join="outer" with fill_value=0 is
# passed so that genes missing from a sample are kept and filled with zeros.
adata = anndata.concat(adatas, join = "outer", fill_value=0)
# Cell names may repeat across samples (presumably shared barcodes - TODO confirm).
adata.obs_names_make_unique()

In [None]:
# The per-sample objects are not used again before `adatas` is rebuilt; drop the reference.
del adatas

In [None]:
# Normalise total counts per cell without touching adata.X: with inplace=False the
# result is returned and its "X" entry holds the normalised matrix.
scaled_counts = sc.pp.normalize_total(adata, target_sum=None, inplace=False)
# Store the log1p-transformed normalised values as a separate layer.
adata.layers["concat_log1p_norm"] = sc.pp.log1p(scaled_counts["X"], copy=True)

# Inspect unintegrated samples

In [None]:
# Select highly variable genes from the log-normalised layer, using scanpy's default settings.
sc.pp.highly_variable_genes(adata, layer = 'concat_log1p_norm')

In [None]:
# Make the log-normalised layer the main matrix so PCA runs on it.
# NOTE(review): this replaces the previous content of adata.X; the visible code keeps
# no separate layer with the untransformed counts.
adata.X = adata.layers['concat_log1p_norm']
# 20 principal components, computed on the highly variable genes only.
sc.pp.pca(adata, n_comps = 20, use_highly_variable=True)

In [None]:
# Compute the neighbourhood graph with scanpy's default parameters.
sc.pp.neighbors(adata)

In [None]:
# Compute the UMAP embedding with scanpy's default parameters.
sc.tl.umap(adata)

In [None]:
# Leiden clustering at resolution 0.8; the labels are read from adata.obs.leiden further down.
sc.tl.leiden(adata, resolution=0.8)

In [None]:
# One square UMAP panel each for 'sample', 'leiden' and 'Gata2'.
# NOTE(review): unlike the other plotting cells, these figures are always written to
# disk, regardless of save_figure - confirm that this is intended.
for obs_key in ['sample', 'leiden', 'Gata2']:
    fig, ax1 = plt.subplots(1, 1)
    ax1.set_aspect('equal')
    sc.pl.umap(
        adata,
        color=obs_key,
        layer='concat_log1p_norm',
        size=umap_point_size,
        alpha=umap_transparency,
        ax=ax1,
        save=f'_{obs_key}.png',
    )

# Remove Gata2 cluster

In [None]:
# Drop all cells assigned to Leiden cluster '18'.
# NOTE(review): the label is hard-coded and depends on the clustering result above;
# re-check it whenever anything upstream of the clustering changes.
adata = adata[adata.obs.leiden != '18']

In [None]:
# Split by sample, down-sample every sample to the size of the smallest one and merge again.
# NOTE(review): `.cat.categories` requires obs['sample'] to be categorical at this point.
adatas = [adata[adata.obs['sample'] == sam] for sam in adata.obs['sample'].cat.categories]

min_n_obs = min(ad.n_obs for ad in adatas)

for ad in adatas:
    if ad.n_obs <= min_n_obs:
        # Already at the target size; nothing to do.
        continue
    sc.pp.subsample(ad, n_obs=min_n_obs)

first_sample, *other_samples = adatas
adata = first_sample.concatenate(*other_samples)

In [None]:
# Re-select highly variable genes on the subsampled data, from the log-normalised layer.
sc.pp.highly_variable_genes(adata, layer = 'concat_log1p_norm')

In [None]:
# Use the log-normalised layer as the main matrix and recompute the PCA
# (20 components, highly variable genes only) on the subsampled data.
adata.X = adata.layers['concat_log1p_norm']
sc.pp.pca(adata, n_comps = 20, use_highly_variable=True)

In [None]:
# PCA overview plots for components 1 and 2, coloured by 'sample', 'Spp1' and 'H2-Eb1'.
sc.pl.pca_overview(adata, components = ['1, 2'] ,color = ['sample', 'Spp1', 'H2-Eb1'])

In [None]:
# Per-cell table with the 'sample' column and the first two columns of obsm['X_pca'];
# the jointplot below reads the PCA columns as 'X_pca-0' and 'X_pca-1'.
df = sc.get.obs_df(adata, keys=['sample'], obsm_keys=[('X_pca', 0), ('X_pca', 1)])

# Fig 6A

In [None]:
# Joint plot of the first two PCA columns, coloured by sample (Fig 6A).
sns.jointplot(data=df, x='X_pca-0', y="X_pca-1", hue="sample", edgecolor='none', s=2)
# savefig does not create missing folders, and nothing else in this notebook creates
# results_path explicitly, so make sure it exists before writing the PDF.
results_path.mkdir(parents=True, exist_ok=True)
plt.savefig(results_path / 'jointplot_pca_sample_density.pdf')

# Fig 6B (middle)

In [None]:
# Two-colour embedding plot of 'Spp1' (x_key) against 'H2-Eb1' (y_key) using the shared helper;
# presumably the figure is written below results_path - TODO confirm in the helper.
bicolor_embedding_plot.bicolor_embedding_plot(adata, x_key='Spp1', y_key='H2-Eb1', save_path=results_path)

# Mac function scoring

In [None]:
# Read the macrophage-function gene sets from the shared GMT file. Presumably the helper
# returns a table with 'geneset' and 'genesymbol' columns, the names passed to
# dc.run_aucell further down - TODO confirm.
mac_function = decoupler_helpers.gmt_to_decoupler('../../shared/databases/macrophage_function.gmt')

In [None]:
# Prepare the gene-set table with the shared helper, passing a minimum gene-set size of 1
# (presumably sets smaller than this are removed - TODO confirm in the helper).
mac_function = decoupler_helpers.prepare_db_for_gsea(mac_function, min_geneset_size=1)

In [None]:
# First column of adata.varm['PCs'] as a one-column table indexed by gene name.
pc1_loadings = pd.DataFrame(adata.varm['PCs'][:, 0], index=adata.var_names)

In [None]:
# Give the single column a readable name.
pc1_loadings.columns = ['PC1']

In [None]:
# Order genes from the largest to the smallest PC1 value.
pc1_loadings = pc1_loadings.sort_values(by='PC1', ascending=False)

In [None]:
# Display the sorted table as the cell output.
pc1_loadings

In [None]:
# Score every gene set per cell with decoupler's AUCell, reading set names from the
# 'geneset' column and gene names from the 'genesymbol' column; use_raw=False is passed
# so adata.raw is not used. The scores are read from adata.obsm["aucell_estimate"] below.
dc.run_aucell(adata, mac_function, source='geneset', target='genesymbol', use_raw=False)

In [None]:
# Column labels of the AUCell result; used below as obs column names and plot colour keys.
terms = adata.obsm["aucell_estimate"].columns

In [None]:
# Copy the AUCell scores into obs so they can be used as colour keys, then draw one
# square UMAP panel per gene set.
adata.obs[terms] = adata.obsm["aucell_estimate"][terms]

for term in terms:
    fig, ax1 = plt.subplots(1, 1)
    ax1.set_aspect('equal')
    # Spaces in the gene-set name are replaced so the file name contains none.
    umap_file = f'_{term}.png'.replace(" ", "_") if save_figure else None
    sc.pl.umap(
        adata,
        color=term,
        size=umap_continuous_point_size,
        alpha=umap_continuous_transparency,
        color_map='magma',
        ax=ax1,
        save=umap_file,
    )

In [None]:
# One square PCA panel (components 1 and 2) per gene set, coloured by its AUCell score.
for term in terms:
    fig, ax1 = plt.subplots(1, 1)
    ax1.set_aspect('equal')
    # Spaces in the gene-set name are replaced so the file name contains none.
    pca_file = f'_{term}.png'.replace(" ", "_") if save_figure else None
    sc.pl.pca(
        adata,
        color=term,
        components=['1,2'],
        size=umap_continuous_point_size,
        alpha=umap_continuous_transparency,
        color_map='magma',
        ax=ax1,
        save=pca_file,
    )

# Fig 6B (left)

In [None]:
# Two-colour embedding plot of the 'Oxidative Stress' (x_key) and 'Antigen Presentation'
# (y_key) keys using the shared helper; presumably these are gene-set scores that an
# earlier cell copied into obs, and the figure is written below results_path - TODO confirm.
bicolor_embedding_plot.bicolor_embedding_plot(adata, x_key='Oxidative Stress', y_key='Antigen Presentation', save_path=results_path)