<font size="+3.8">Scanpy single-cell pre-processing</font>  
<font size="+1.5"></font>  

Aim: Preprocess mouse brain single-cell data from the Tabula Muris Senis Dataset (2020 Nature)  
Publication: http://www.nature.com/articles/s41586-020-2496-1

In [None]:
from datetime import date
date.today().strftime('%d/%m/%Y')

In [None]:
import os
import getpass

# Record who ran the notebook. os.getlogin() depends on a controlling terminal and can raise
# OSError for kernels started without one; getpass.getuser() is the variant the Python docs recommend.
getpass.getuser()

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import matplotlib.pyplot as plt
import platform
from matplotlib.pyplot import rc_context

In [None]:
import utils

In [None]:
# Name of the active conda environment; fall back to a placeholder instead of raising KeyError outside conda
os.environ.get('CONDA_DEFAULT_ENV', 'not set') # conda env

In [None]:
platform.platform()

In [None]:
sc.settings.verbosity = 3

In [None]:
main_dir='/run/user/1000/gvfs/smb-share:server=138.245.4.35,share=bd-dichgans/SF' # Linux

In [None]:
# Windows UNC path. Only applied when running on Windows, so that "Run All" on Linux keeps the
# Linux path assigned in the previous cell. Raw string avoids invalid escape sequences (\i, \B, \S).
if platform.system() == "Windows":
    main_dir = r'\\isdsynnas.srv.med.uni-muenchen.de\BD-Dichgans\SF' # Win

In [None]:
dataset_name = "TabulaMurisSenis2020"
organism = "Mouse"

# Load data

Downloaded from: https://figshare.com/articles/dataset/Tabula_Muris_Senis_Data_Objects/12654728 (downloaded on 02/03/2023)

## Myeloid dataset

In [None]:
# Load the pre-annotated myeloid FACS object from the project share and display its summary
myeloid_h5ad = os.path.join(
    main_dir,
    "P6_vasc_scRNAseq",
    "TabulaMurisSenis2020",
    "tabula-muris-senis-facs-processed-official-annotations-Brain_Myeloid.h5ad",
)
ad = sc.read_h5ad(myeloid_h5ad)
ad

In [None]:
ad.obs.head()

In [None]:
# Cells per annotated cell type (Series.value_counts replaces the deprecated top-level pd.value_counts)
ad.obs["cell_ontology_class"].value_counts()

In [None]:
# Coarse label -> set of original cell_ontology_class values that it groups together
new_names = {
    'Microglia/Macrophages': {'microglial cell','macrophage'},
}
# Invert to {original label: coarse label}, then order by original label for a stable printout
result = {
    original_label: coarse_label
    for coarse_label, original_labels in new_names.items()
    for original_label in original_labels
}
new_cluster_names = dict(sorted(result.items()))
print(new_cluster_names)

In [None]:
# Translate each cell's ontology class into its coarse label and store it as a categorical column
coarse_labels = ad.obs['cell_ontology_class'].map(new_cluster_names)
ad.obs['clusters'] = coarse_labels.astype('category')

In [None]:
# Cells per coarse cluster (Series.value_counts replaces the deprecated top-level pd.value_counts)
ad.obs["clusters"].value_counts()

In [None]:
with rc_context({'figure.figsize': (7, 6)}):
    sc.pl.umap(ad, color=['clusters','age'], wspace=0.3)

Check if data is normalized:

In [None]:
ad.layers

In [None]:
# Show the expression distribution of up to 100 randomly chosen genes (across all cells)
import random
import seaborn as sns

# Guard: random.sample raises if asked for more genes than the object contains
n_random_genes = min(100, ad.X.shape[1])
# Dedicated, seeded generator so the same genes are drawn on every "Restart & Run All"
random_genes = random.Random(0).sample(range(0, ad.X.shape[1]), n_random_genes)
adata_sub = ad[:, random_genes]
exp = pd.DataFrame(adata_sub.X.todense())
# One KDE curve per gene (hue='variable'), all drawn in the same colour.
# warn_singular=False silences the warning for genes that yield no curve.
pl1 = sns.displot(data=pd.melt(exp), x='value', height=4, hue='variable', kind="kde", warn_singular=False, legend=False, palette=list(np.repeat('#086da6', n_random_genes)), lw=0.3)
pl1.set(xlim=(-0.5, 7), ylim=(0, 0.007));
plt.show()

In [None]:
# Histogram of the pooled expression values of the sampled genes.
# The style is set before drawing so that it applies to this figure.
sns.set(style="ticks", rc={'figure.figsize':(4,4)})
# No hue is assigned, so a single color is passed (a palette would be ignored without hue)
pl = sns.histplot(data=pd.melt(exp), x='value', binwidth=0.5, color='#086da6')
pl.set(xlim=(0, 10));
plt.show()

In [None]:
adata_sub.X.todense()

Data seems to be normalized

In [None]:
ad.layers["normalized"] = ad.X.copy() # save normalized + log-transformed (but unscaled) counts - retrieve via adata.X = adata.layers["normalized"]

In [None]:
ad

In [None]:
ad.layers

In [None]:
adm = ad

In [None]:
del ad

## Non-myeloid dataset

In [None]:
# non-myeloid
# Build the path with os.path.join (as for the myeloid file) so it resolves on both Linux and Windows
# and no longer contains the invalid "\T" escape sequence.
ad = sc.read_h5ad(os.path.join(main_dir, "P6_vasc_scRNAseq", "TabulaMurisSenis2020", "tabula-muris-senis-facs-processed-official-annotations-Brain_Non-Myeloid.h5ad"))
ad

In [None]:
ad.obs.head()

In [None]:
# Cells per annotated cell type (Series.value_counts replaces the deprecated top-level pd.value_counts)
ad.obs["cell_ontology_class"].value_counts()

In [None]:
# Coarse label -> set of original cell_ontology_class values that it groups together
new_names = {
    'Endothelial cells': {'endothelial cell'},
    'Oligos': {'oligodendrocyte'},
    'Astrocytes': {'astrocyte','Bergmann glial cell'},
    'Pericytes': {'brain pericyte'},
    'Neurons': {'neuron','interneuron', 'medium spiny neuron'},
    'OPCs': {'oligodendrocyte precursor cell'},
    'Neuroepithelial cells': {'neuroepithelial cell'},
    'Microglia/Macrophages': {'microglial cell','macrophage'},
    'Neuronal stem cells': {'neuronal stem cell'},
    'Ependymal cells': {'ependymal cell'},
    'NK/T cells': {'mature NK T cell','T cell', 'CD8-positive, alpha-beta T cell'},
}
# Flatten into (original label, coarse label) pairs and build the lookup used by Series.map
label_pairs = [
    (original_label, coarse_label)
    for coarse_label, original_labels in new_names.items()
    for original_label in original_labels
]
result = dict(label_pairs)
# Order by original label so the printed dictionary is stable
new_cluster_names = dict(sorted(result.items()))
print(new_cluster_names)

In [None]:
# Translate each cell's ontology class into its coarse label and store it as a categorical column
coarse_labels = ad.obs['cell_ontology_class'].map(new_cluster_names)
ad.obs['clusters'] = coarse_labels.astype('category')

In [None]:
# Cells per coarse cluster (Series.value_counts replaces the deprecated top-level pd.value_counts)
ad.obs["clusters"].value_counts()

In [None]:
with rc_context({'figure.figsize': (7, 6)}):
    sc.pl.umap(ad, color=['clusters','age'], wspace=0.3)

No SMCs?

In [None]:
with rc_context({'figure.figsize': (7, 6)}):
    sc.pl.umap(ad, color=['Pdgfrb','Vtn','Kcnj8','Rgs5','Tagln','Acta2','Myh11','Cnn1'], wspace=0.2, size=35)

In [None]:
sc.pl.dotplot(ad, var_names=['Cldn5',"Aqp4","Mbp",'Pdgfrb','Vtn','Kcnj8','Rgs5','Atp13a5','Tagln','Acta2','Myh11','Cnn1'], groupby="clusters", swap_axes=False);

The last four columns (Tagln, Acta2, Myh11, Cnn1) are SMC markers.

The mural cell signature is ambiguous.

Re-cluster to separate PCs and SMCs:

In [None]:
# Re-cluster at a high resolution; the labels are read back from ad.obs['leiden'] in the plots below.
# NOTE(review): presumably relies on a neighbors graph already stored in the loaded object — confirm.
sc.tl.leiden(ad, resolution=2)

In [None]:
with rc_context({'figure.figsize': (7, 5)}):
    sc.pl.umap(ad, color=['leiden'],legend_loc="on data", legend_fontoutline=3,legend_fontsize=15)

In [None]:
sc.pl.dotplot(ad, var_names=['Cldn5',"Aqp4","Mbp",'Pdgfrb','Vtn','Kcnj8','Rgs5','Atp13a5','Tagln','Acta2','Myh11','Cnn1'], groupby="leiden", swap_axes=False);

Note: Still no PC signature. Likely few PCs in dataset.  
Rename "Pericytes" to "SMCs/Pericytes"

In [None]:
# Relabel the coarse clusters: only "Pericytes" changes (to "SMCs/Pericytes"); every other label maps to itself.
# Every label present in ad.obs['clusters'] must be listed, because Series.map() turns unmapped values into NaN.
new_names = {
    'Endothelial cells': {'Endothelial cells'},
    'Oligos': {'Oligos'},
    'Astrocytes': {'Astrocytes'},
    # the already-renamed label is listed as well so that re-running the relabelling does not produce NaN
    'SMCs/Pericytes': {'Pericytes', 'SMCs/Pericytes'},
    'Neurons': {'Neurons'},
    'OPCs': {'OPCs'},
    'Neuroepithelial cells': {'Neuroepithelial cells'},
    # the combined label is the one assigned earlier; separate 'Microglia' / 'Macrophages' labels were never created
    'Microglia/Macrophages': {'Microglia/Macrophages'},
    'Neuronal stem cells': {'Neuronal stem cells'},
    'Ependymal cells': {'Ependymal cells'},
    'NK/T cells': {'NK/T cells'},
}
# Invert {new label: {old labels}} into {old label: new label}
result = {old_label: new_label for new_label, old_labels in new_names.items() for old_label in old_labels}
new_cluster_names = dict(sorted(result.items()))
print(new_cluster_names)

In [None]:
# Apply the relabelling to the existing coarse clusters and store the result as a categorical column
relabelled_clusters = ad.obs['clusters'].map(new_cluster_names)
ad.obs['clusters'] = relabelled_clusters.astype('category')

Check if data is normalized:

In [None]:
ad.layers

In [None]:
# Show the expression distribution of up to 100 randomly chosen genes (across all cells)
import random
import seaborn as sns

# Guard: random.sample raises if asked for more genes than the object contains
n_random_genes = min(100, ad.X.shape[1])
# Dedicated, seeded generator so the same genes are drawn on every "Restart & Run All"
random_genes = random.Random(0).sample(range(0, ad.X.shape[1]), n_random_genes)
adata_sub = ad[:, random_genes]
exp = pd.DataFrame(adata_sub.X.todense())
# One KDE curve per gene (hue='variable'), all drawn in the same colour.
# warn_singular=False silences the warning for genes that yield no curve.
pl1 = sns.displot(data=pd.melt(exp), x='value', height=4, hue='variable', kind="kde", warn_singular=False, legend=False, palette=list(np.repeat('#086da6', n_random_genes)), lw=0.3)
pl1.set(xlim=(-0.5, 7), ylim=(0, 0.007));
plt.show()

In [None]:
# Histogram of the pooled expression values of the sampled genes.
# The style is set before drawing so that it applies to this figure.
sns.set(style="ticks", rc={'figure.figsize':(4,4)})
# No hue is assigned, so a single color is passed (a palette would be ignored without hue)
pl = sns.histplot(data=pd.melt(exp), x='value', binwidth=0.5, color='#086da6')
pl.set(xlim=(0, 10));
plt.show()

In [None]:
adata_sub.X.todense()

Data seems to be normalized

In [None]:
ad.layers["normalized"] = ad.X.copy() # save normalized + log-transformed (but unscaled) counts - retrieve via adata.X = adata.layers["normalized"]

In [None]:
ad

In [None]:
ad.layers

In [None]:
adnm = ad

In [None]:
del ad

## Merge

In [None]:
del adata_sub
del random_genes

In [None]:
# Merge the non-myeloid and myeloid objects along the cell axis; join="inner" keeps only what both objects share.
# Batch effects are deliberately ignored because the dataset is pre-annotated.
# NOTE(review): obs names are not made unique here — confirm that no cell occurs in both objects.
adata = anndata.concat([adnm, adm], join="inner") # ignore batch effects etc. because dataset is pre-annotated
adata

In [None]:
# Cells per age group (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata.obs["age"].value_counts()

In [None]:
# Cells per coarse cluster (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata.obs["clusters"].value_counts()

In [None]:
# Cells per sex (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata.obs["sex"].value_counts()

In [None]:
# Cells per animal (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata.obs["mouse.id"].value_counts()

# Focus on: Foxf2

In [None]:
adata_backup = adata

In [None]:
# drop aged samples
# adata = adata[adata.obs['age'].isin(['3m'])]

In [None]:
gene="Foxf2"

## Plot

In [None]:
sc.pl.matrixplot(adata, [gene], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [gene], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
utils.summarize_gene_expression(adata = adata, gene = gene, groupby = "clusters", 
                                study_name = dataset_name, organism = organism,
                                export = True, output_dir = os.path.join(main_dir, "P06_Foxf2_per_celltype", "Foxf2_summarized")
                               )

# Focus on: Other genes

In [None]:
target_genes = ["Foxo1", "Tek", "Nos3", "Htra1", "Egfl8", "Flt1", "Kdr", "Ptprb", "Nrp1", "Nrp2", "Efnb2", "Itgb1", "Itga6", "Angpt2", "Cdh5", "Cldn5", "Ocln", "Ctnnb1"]

In [None]:
other_genes_results = {
    gene: utils.summarize_gene_expression(adata, gene, study_name = dataset_name, organism = organism, groupby = "clusters",
                                          output_dir=os.path.join(main_dir, "P06_Foxf2_per_celltype", "Other_genes_summarized"), export=True
                                         ) for gene in target_genes
}

In [None]:
# some plots

In [None]:
sc.pl.matrixplot(adata, [target_genes[0]], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [target_genes[0]], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.matrixplot(adata, [target_genes[1]], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [target_genes[1]], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

# Focus on: Aging

In [None]:
adata = adata_backup

In [None]:
# Cells per age group (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata.obs["age"].value_counts()

## Foxf2 by age group

In [None]:
gene=["Foxf2","Foxo1","Tek","Nos3"]

### All cell types

In [None]:
sc.pl.dotplot(adata, gene, groupby='age', swap_axes=True, figsize=(4,2), standard_scale="var", layer="normalized", title="All cell types")

### ECs only

In [None]:
# ECs only
adata_ecs = adata[adata.obs['clusters'].isin(['Endothelial cells'])]
adata_ecs

In [None]:
# Endothelial cells per age group (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata_ecs.obs["age"].value_counts()

In [None]:
sc.pl.dotplot(adata_ecs, gene, groupby='age', swap_axes=True, figsize=(4,2), standard_scale="var", layer="normalized", title="ECs")

### SMCs/PCs only

Likely a mix of PCs and SMCs (see above)

In [None]:
# Subset to the "SMCs/Pericytes" cluster and count its cells per age group
# (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata_pcs = adata[adata.obs['clusters'].isin(['SMCs/Pericytes'])]
adata_pcs.obs["age"].value_counts()

In [None]:
sc.pl.dotplot(adata_pcs, gene, groupby='age', swap_axes=True, figsize=(4,2), standard_scale="var", layer="normalized", title="PCs/SMCs")

### Neurons only

In [None]:
# Subset to the "Neurons" cluster and count its cells per age group
# (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata_neur = adata[adata.obs['clusters'].isin(['Neurons'])]
adata_neur.obs["age"].value_counts()

In [None]:
sc.pl.dotplot(adata_neur, gene, groupby='age', swap_axes=True, figsize=(4,2), standard_scale="var", layer="normalized", title="Neurons")

### Astrocytes only

In [None]:
# Subset to the "Astrocytes" cluster and count its cells per age group
# (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata_astro = adata[adata.obs['clusters'].isin(['Astrocytes'])]
adata_astro.obs["age"].value_counts()

In [None]:
sc.pl.dotplot(adata_astro, gene, groupby='age', swap_axes=True, figsize=(4,2), standard_scale="var", layer="normalized", title="Astrocytes")

### Oligos only

In [None]:
# Subset to the "Oligos" cluster and count its cells per age group
# (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata_olig = adata[adata.obs['clusters'].isin(['Oligos'])]
adata_olig.obs["age"].value_counts()

In [None]:
sc.pl.dotplot(adata_olig, gene, groupby='age', swap_axes=True, figsize=(4,2), standard_scale="var", layer="normalized", title="Oligos")

### Microglia only

In [None]:
# Subset to the "Microglia/Macrophages" cluster and count its cells per age group
# (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata_mg = adata[adata.obs['clusters'].isin(['Microglia/Macrophages'])]
adata_mg.obs["age"].value_counts()

In [None]:
sc.pl.dotplot(adata_mg, gene, groupby='age', swap_axes=True, figsize=(4,2), standard_scale="var", layer="normalized", title="Microglia/Macrophages")

## Correlate gene expression (Foxf2 and Foxo1)

### ECs

In [None]:
gg = ["Foxf2","Foxo1"]

In [None]:
adata

In [None]:
adata_ecs

In [None]:
# Per-cell expression of the two genes in `gg` for endothelial cells, with each cell's age attached
adat = adata_ecs
age = pd.DataFrame(adat.obs["age"])
subs = pd.DataFrame(adat[:, gg].X.todense(), columns=gg, index=adat.obs.index).join(age)
subs.head()

In [None]:
g = sns.lmplot(x="Foxf2", y="Foxo1", data=subs, hue="age", fit_reg=False, scatter_kws={'alpha':0.6})
sns.regplot(x="Foxf2", y="Foxo1", data=subs, scatter=False, ax=g.axes[0, 0], color="grey")
g.figure.set_size_inches(7,6)
plt.show()

In [None]:
import scipy as sp
import scipy.stats  # load the submodule explicitly; a bare `import scipy` does not guarantee `sp.stats` on every SciPy release

# Pearson correlation between Foxf2 and Foxo1 across the cells in `subs`
r, p = sp.stats.pearsonr(subs['Foxf2'], subs['Foxo1'])
'Pearson R = {:.2f}, p = {:.2g}'.format(r, p)

In [None]:
graph = sns.jointplot(data=subs, x="Foxf2", y="Foxo1", hue="age")
plt.show()

In [None]:
# Box plot of Foxf2 expression per age group (outliers hidden), with the individual cells overlaid as small points
sns.set(style="whitegrid")
# NOTE(review): palette is passed without hue; recent seaborn releases may warn about this — confirm against the installed version
ax = sns.boxplot(x="age", y="Foxf2", data=subs, showfliers = False, palette="Blues")
ax = sns.stripplot(x="age", y="Foxf2", data=subs, color=".25",size=1)
plt.show()

In [None]:
sns.set(style="whitegrid")
ax = sns.boxplot(x="age", y="Foxo1", data=subs, showfliers = False, palette="Blues")
ax = sns.stripplot(x="age", y="Foxo1", data=subs, color=".25",size=1)
plt.show()

Exclude dropouts (cells with 0 expression of both genes)

In [None]:
# Boolean mask over endothelial cells: True where at least one of the two genes has expression > 0
foxf2_detected = adata_ecs[:, "Foxf2"].X.todense() > 0
foxo1_detected = adata_ecs[:, "Foxo1"].X.todense() > 0
coex = foxf2_detected | foxo1_detected

In [None]:
adata_ecs_above0 = adata_ecs[coex]
adata_ecs_above0

In [None]:
# Remaining endothelial cells per age group (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata_ecs_above0.obs["age"].value_counts()

In [None]:
sc.pl.dotplot(adata_ecs_above0, gene, groupby='age', swap_axes=True, figsize=(4,2), standard_scale="var", layer="normalized", title="ECs")

In [None]:
adat = adata_ecs_above0
subs = adat[:,gg].X.todense()
subs = pd.DataFrame(subs, columns = gg, index=adat.obs.index)
age = pd.DataFrame(adat.obs["age"])
subs = subs.join(age)

In [None]:
g = sns.lmplot(x="Foxf2", y="Foxo1", data=subs, hue="age", fit_reg=False, scatter_kws={'alpha':0.6})
sns.regplot(x="Foxf2", y="Foxo1", data=subs, scatter=False, ax=g.axes[0, 0], color="grey")
g.figure.set_size_inches(7,6)
plt.show()

In [None]:
r, p = sp.stats.pearsonr(subs['Foxf2'], subs['Foxo1'])
'Pearson R = {:.2f}, p = {:.2g}'.format(r, p)

In [None]:
graph = sns.jointplot(data=subs, x="Foxf2", y="Foxo1", hue="age")
plt.show()

In [None]:
sns.set(style="whitegrid")
ax = sns.boxplot(x="age", y="Foxf2", data=subs, showfliers = False, palette="Blues")
ax = sns.stripplot(x="age", y="Foxf2", data=subs, color=".25",size=1)
plt.show()

In [None]:
sns.set(style="whitegrid")
ax = sns.boxplot(x="age", y="Foxo1", data=subs, showfliers = False, palette="Blues")
ax = sns.stripplot(x="age", y="Foxo1", data=subs, color=".25",size=1)
plt.show()

### All cell types

In [None]:
gg = ["Foxf2","Foxo1"]

In [None]:
adata

In [None]:
adat = adata
subs = adat[:,gg].X.todense()
subs = pd.DataFrame(subs, columns = gg, index=adat.obs.index)
age = pd.DataFrame(adat.obs["age"])
subs = subs.join(age)
subs.head()

In [None]:
g = sns.lmplot(x="Foxf2", y="Foxo1", data=subs, hue="age", fit_reg=False, scatter_kws={'alpha':0.6})
sns.regplot(x="Foxf2", y="Foxo1", data=subs, scatter=False, ax=g.axes[0, 0], color="grey")
g.figure.set_size_inches(7,6)
plt.show()

In [None]:
graph = sns.jointplot(data=subs, x="Foxf2", y="Foxo1", hue="age")
plt.show()

Exclude dropouts (cells with 0 expression of both genes)

In [None]:
# Boolean mask over the cells in `adat`: True where at least one of the two genes has expression > 0
foxf2_detected = adat[:, "Foxf2"].X.todense() > 0
foxo1_detected = adat[:, "Foxo1"].X.todense() > 0
coex = foxf2_detected | foxo1_detected

In [None]:
adata_above0 = adat[coex]
adata_above0

In [None]:
# Remaining cells per age group (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata_above0.obs["age"].value_counts()

In [None]:
sc.pl.dotplot(adata_above0, gene, groupby='age', swap_axes=True, figsize=(4,2), standard_scale="var", layer="normalized")

In [None]:
adat = adata_above0
subs = adat[:,gg].X.todense()
subs = pd.DataFrame(subs, columns = gg, index=adat.obs.index)
age = pd.DataFrame(adat.obs["age"])
subs = subs.join(age)

In [None]:
g = sns.lmplot(x="Foxf2", y="Foxo1", data=subs, hue="age", fit_reg=False, scatter_kws={'alpha':0.6})
sns.regplot(x="Foxf2", y="Foxo1", data=subs, scatter=False, ax=g.axes[0, 0], color="grey")
g.figure.set_size_inches(7,6)
plt.show()

In [None]:
graph = sns.jointplot(data=subs, x="Foxf2", y="Foxo1", hue="age")
plt.show()

In [None]:
sns.set(style="whitegrid")
ax = sns.boxplot(x="age", y="Foxf2", data=subs, showfliers = False, palette="Blues")
ax = sns.stripplot(x="age", y="Foxf2", data=subs, color=".25",size=1)
plt.show()

In [None]:
sns.set(style="whitegrid")
ax = sns.boxplot(x="age", y="Foxo1", data=subs, showfliers = False, palette="Blues")
ax = sns.stripplot(x="age", y="Foxo1", data=subs, color=".25",size=1)
plt.show()

## Pseudobulk analysis

Rationale: correlation at the single-cell level is biased by the large number of dropouts, so expression is aggregated per sample and cell type first.

In [None]:
import decoupler as dc

In [None]:
adata

In [None]:
# Aggregate cells into pseudobulk profiles per mouse ('mouse.id') and cell type ('clusters'), reading values from the "normalized" layer.
# NOTE(review): that layer was stored above as normalized + log-transformed expression, not raw counts — confirm that
# aggregating it is intended and that the installed decoupler version accepts min_prop / min_smpls in this call.
padata = dc.get_pseudobulk(adata, sample_col='mouse.id', groups_col='clusters', layer='normalized', min_prop=0.2, min_smpls=3)

In [None]:
padata

### Correlate gene expression (Foxf2 and Foxo1)

#### All cell types

In [None]:
gg = ["Foxf2","Foxo1"]

In [None]:
padata

In [None]:
# Pseudobulk expression of the two genes in `gg`, one row per pseudobulk profile, with its age attached
adat = padata
age = pd.DataFrame(adat.obs["age"])
subs = pd.DataFrame(adat[:, gg].X, columns=gg, index=adat.obs.index).join(age)
subs.head()

In [None]:
g = sns.lmplot(x="Foxf2", y="Foxo1", data=subs, hue="age", fit_reg=False, scatter_kws={'alpha':0.6})
sns.regplot(x="Foxf2", y="Foxo1", data=subs, scatter=False, ax=g.axes[0, 0], color="grey")
g.figure.set_size_inches(7,6)
plt.show()

In [None]:
import scipy as sp
import scipy.stats  # load the submodule explicitly; a bare `import scipy` does not guarantee `sp.stats` on every SciPy release

# Pearson correlation between Foxf2 and Foxo1 across the pseudobulk profiles in `subs`
r, p = sp.stats.pearsonr(subs['Foxf2'], subs['Foxo1'])
'Pearson R = {:.2f}, p = {:.2g}'.format(r, p)

In [None]:
graph = sns.jointplot(data=subs, x="Foxf2", y="Foxo1", hue="age")
plt.show()

#### ECs

In [None]:
# ECs only
padata_ecs = padata[padata.obs['clusters'].isin(['Endothelial cells'])]
padata_ecs

In [None]:
gg = ["Foxf2","Foxo1"]

In [None]:
adat = padata_ecs
subs = adat[:,gg].X
subs = pd.DataFrame(subs, columns = gg, index=adat.obs.index)
age = pd.DataFrame(adat.obs["age"])
subs = subs.join(age)
subs.head()

In [None]:
g = sns.lmplot(x="Foxf2", y="Foxo1", data=subs, hue="age", fit_reg=False, scatter_kws={'alpha':0.6})
sns.regplot(x="Foxf2", y="Foxo1", data=subs, scatter=False, ax=g.axes[0, 0], color="grey")
g.figure.set_size_inches(7,6)
plt.show()

In [None]:
import scipy as sp
import scipy.stats  # load the submodule explicitly; a bare `import scipy` does not guarantee `sp.stats` on every SciPy release

# Pearson correlation between Foxf2 and Foxo1 across the endothelial pseudobulk profiles in `subs`
r, p = sp.stats.pearsonr(subs['Foxf2'], subs['Foxo1'])
'Pearson R = {:.2f}, p = {:.2g}'.format(r, p)

In [None]:
graph = sns.jointplot(data=subs, x="Foxf2", y="Foxo1", hue="age")
plt.show()

# Session Info

In [None]:
sc.logging.print_versions()