<font size="+3.8">Scanpy single-cell pre-processing</font>  
<font size="+1.5"></font>  

sfrerich

Aim: Preprocess mouse brain single-cell data from Ximerakis 2019 Nat Neuroscience

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import matplotlib.pyplot as plt
import os
from datetime import date
from matplotlib.pyplot import rc_context

In [None]:
# Scanpy logging level for the whole notebook (3 = errors, warnings, info and hints)
sc.settings.verbosity = 3

In [None]:
# Project root when the SMB share is mounted through GVFS on Linux.
# NOTE(review): the following cell reassigns main_dir to the Windows UNC path, so run only the cell that matches your OS.
main_dir='/run/user/1000/gvfs/smb-share:server=138.245.4.35,share=bd-dichgans/SF' # Linux

In [None]:
# Project root as a Windows UNC path. Written as a raw string so every backslash is
# taken literally; the former non-raw literal only worked through invalid escape
# sequences ("\i", "\B", "\S"), which newer Python versions warn about.
# NOTE(review): run only the path cell that matches your OS - this assignment replaces any earlier main_dir.
main_dir = r'\\isdsynnas.srv.med.uni-muenchen.de\BD-Dichgans\SF' # Win

# Load data

Downloaded from: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE133283 06/03/2023

Note: The data are provided unprocessed and are not annotated per animal.

In [None]:
# Read one sample as a long-format table and look at the distribution of the "count" column.
# os.path.join replaces the hand-written "\\" separators (one of which, "\J", was an invalid
# escape sequence) and also works when main_dir is the Linux mount.
t = pd.read_csv(
    os.path.join(main_dir, "P6_vasc_scRNAseq", "Jeong2022", "count_matrix", "GSM3904816_Adult-1_gene_counts.tsv.gz"),
    sep="\t",
    index_col=None,
)
t["count"].value_counts()

In [None]:
# Display the long-format table read in the previous cell
t

In [None]:
# Read a second sample, reshape it from long format (cell, gene, count) into a
# cell x gene matrix (missing combinations become 0) and wrap it in an AnnData object.
# os.path.join replaces the hand-written "\\" separators (one of which, "\J", was an invalid
# escape sequence) and also works when main_dir is the Linux mount.
t = pd.read_csv(
    os.path.join(main_dir, "P6_vasc_scRNAseq", "Jeong2022", "count_matrix", "GSM3904817_Adult-2_gene_counts.tsv.gz"),
    sep="\t",
)
t = t.pivot(index='cell', columns='gene', values='count').fillna(0)
adata = sc.AnnData(t)
adata

In [None]:
# Display the pivoted cell x gene table
t

In [None]:
import glob
# Collect every gzipped count table of the data set.
# sorted(): glob returns matches in arbitrary, filesystem-dependent order, which would make
# all_files[0] (used in the following cells) and the concatenation order irreproducible.
path = main_dir+'\\P6_vasc_scRNAseq\\Jeong2022\\count_matrix' 
all_files = sorted(glob.glob(os.path.join(path , "*.tsv.gz")))
all_files

In [None]:
# First matched file path (used below to prototype the file-name parsing)
all_files[0]

In [None]:
# Position of the first "GSM" occurrence in the path (str.find returns -1 if absent)
index = all_files[0].find("GSM")
index

In [None]:
# Numeric part of the GEO sample accession, taken from the file name.
# The base name is parsed instead of the full path: the directory part itself contains
# underscores (e.g. "P6_vasc_scRNAseq"), so splitting the whole path on '_' stops before
# the "GSM" prefix and the subsequent split('GSM')[1] raises an IndexError.
gsm_number = (os.path.basename(all_files[0]).split('_')[0]
              .split('GSM')[1])
print(gsm_number)


In [None]:
# Read every sample, make the cell IDs unique across samples and reshape each
# table into a cell x gene matrix; the matrices are collected in `li`.
li = []

for filename in all_files:
    df = pd.read_csv(filename, sep="\t",index_col=None)
    # File names look like "<GSM accession>_<sample>_gene_counts.tsv.gz"; parse the base
    # name once (the directory part of the path contains underscores as well).
    gsm_id, sample_id = os.path.basename(filename).split('_')[:2]
    df["cell"] = gsm_id+"_"+sample_id+"_"+df["cell"] # add gsm and sample to cell id
    df = df.pivot(index='cell', columns='gene', values='count').fillna(0)
    # The per-file AnnData object built here before was never used and has been dropped.
    li.append(df)

In [None]:
# Display the list of per-sample cell x gene tables
li

In [None]:
# Combine the per-sample tables column-wise.
# NOTE(review): the name `all` shadows Python's built-in all() for the rest of the session.
# NOTE(review): axis=1 places the samples side by side (cells of other samples become NaN);
# stacking cells with axis=0 may have been intended - confirm.
all = pd.concat(li, axis=1)

In [None]:
# Display the concatenated table
all

In [None]:
# NOTE(review): the tables in `li` were already pivoted to cell x gene, so the concatenated
# frame presumably no longer has 'cell', 'gene' and 'count' columns and this call is expected to fail - confirm and remove.
all.pivot(index='cell', columns='gene', values='count')


In [None]:
# Read the tab-separated expression table into an AnnData object.
# os.path.join replaces the hand-written "\\" separators (one of which, "\J", was an invalid
# escape sequence) and also works when main_dir is the Linux mount.
# NOTE(review): this points to a Jeong2022 file (outside the count_matrix folder used above),
# whereas the metadata attached below comes from Ximerakis2019 - confirm the intended input file.
ad = sc.read_text(os.path.join(main_dir, "P6_vasc_scRNAseq", "Jeong2022", "GSM3904817_Adult-2_gene_counts.tsv.gz"))
ad

In [None]:
# First rows of the per-cell annotation table
ad.obs.head()

In [None]:
# Gene identifiers of the loaded matrix
ad.var_names

In [None]:
# Load the per-cell metadata table. skiprows=[1] drops the second line of the file
# (presumably a column-type row below the header - TODO confirm).
# os.path.join replaces the hand-written "\\" separators (one of which, "\X", was an invalid
# escape sequence) and also works when main_dir is the Linux mount.
meta = pd.read_csv(
    os.path.join(main_dir, "P6_vasc_scRNAseq", "Ximerakis2019", "meta_Aging_mouse_brain_portal_data.txt"),
    sep="\t",
    skiprows=[1],
)
meta.head()

In [None]:
# The metadata rows must describe the same cells, in the same order, as the expression matrix.
# The comparison is done with NumPy instead of the built-in all(): an earlier cell rebinds
# the name `all` to a DataFrame, which made `all(...)` fail with a TypeError.
assert np.array_equal(ad.obs_names.to_numpy(), meta["NAME"].to_numpy()), \
    "Cell order differs between expression matrix and metadata"
# Index the metadata by the cell names before attaching it, so that obs_names remain the
# cell barcodes instead of being replaced by the metadata's 0..n-1 row numbers.
# The NAME column itself is kept because later cells read it.
ad.obs = meta.set_index(ad.obs_names)
ad

In [None]:
# Number of cells per cell class (Series.value_counts replaces the deprecated top-level pd.value_counts)
ad.obs["cell_class"].value_counts()

In [None]:
# Number of cells per annotated cell type (Series.value_counts replaces the deprecated top-level pd.value_counts)
ad.obs["cell_type"].value_counts()

In [None]:
# Expose the age annotation under the shorter column name "age" and count cells per age group
# (Series.value_counts replaces the deprecated top-level pd.value_counts)
ad.obs["age"] = ad.obs["all_cells_by_age"]
ad.obs["age"].value_counts()

In [None]:
# Contingency table: rows = cell types, columns = cell classes
pd.crosstab(index=ad.obs["cell_type"], columns=ad.obs["cell_class"])

In [None]:
# also see website

In [None]:
# Harmonised cluster label -> set of cell-type abbreviations that are merged into it
new_names = {
    'Endothelial cells': {'EC'},
    'Oligos': {'OLG'},
    'Olfactory ensheathing glia': {'OEG'},
    'Astrocytes': {'ARP','ASC'},
    'Pericytes': {'PC'},
    'SMCs': {'VSMC'},
    'Neurons': {'NRP','NEUR_immature','NEUR_mature','NendC'},
    'OPCs': {'OPC'},
    'VLMCs': {'VLMC','ABC'},
    'Microglia/Macrophages': {'MG','MAC'},
    'Neural stem cells': {'NSC'},
    'Ependymal cells': {'EPC'},
    'Hypendymal cells': {'HypEPC','TNC'},
    'Choroid plexus epithelial cells':{'CPC'},
    'Immune_Other': {'MNC','DC','NEUT'},
    'Hemoglobin-expressing vascular cells':{'Hb_VC'},
}
# Invert the mapping: abbreviation -> harmonised label
result = {
    abbreviation: label
    for label, abbreviations in new_names.items()
    for abbreviation in abbreviations
}
# Same mapping with the abbreviations in alphabetical order (stable, readable printout)
new_cluster_names = {abbreviation: result[abbreviation] for abbreviation in sorted(result)}
print(new_cluster_names)

In [None]:
# Translate every cell-type abbreviation into its harmonised cluster label and store it as a categorical column
harmonised_labels = ad.obs['cell_type'].map(new_cluster_names)
ad.obs['clusters'] = harmonised_labels.astype('category')

In [None]:
# Number of cells per harmonised cluster (Series.value_counts replaces the deprecated top-level pd.value_counts)
ad.obs["clusters"].value_counts()

In [None]:
# Sanity check of the cluster labels with a panel of marker genes
marker_genes = ['Cldn5',"Aqp4","Mbp",'Foxj1','Pdgfrb','Vtn','Kcnj8','Rgs5','Atp13a5','Tagln','Acta2','Myh11']
sc.pl.dotplot(
    ad,
    var_names=marker_genes,
    groupby="clusters",
    swap_axes=False,
);

The last three columns (Tagln, Acta2, Myh11) are SMC markers.

Check if data is normalized:

In [None]:
# List the layers currently stored in the object
ad.layers

In [None]:
# Show the expression distribution of 100 randomly chosen genes (across all cells)
import random
import seaborn as sns
# Fixed seed: the same genes are drawn on every run, so the figure is reproducible
random.seed(0)
random_genes=random.sample(range(0, ad.X.shape[1]), 100)
adata_sub = ad[:,random_genes]
# np.asarray instead of the deprecated np.matrix; the resulting table holds the same values
exp=pd.DataFrame(np.asarray(adata_sub.X))
pl1=sns.displot(data=pd.melt(exp),x='value',height=4,hue='variable',kind="kde",warn_singular=False,legend=False,palette=['#086da6']*100, lw=0.3) # genes with 0 expression are excluded
pl1.set(xlim=(-0.5, 7),ylim=(0,0.007));
plt.show()

In [None]:
# Histogram of the pooled expression values of the sampled genes.
# The plotting style is set *before* the figure is drawn (set afterwards it only affects later figures),
# and the bar colour is passed via `color`: `palette` has no effect without a `hue` variable.
sns.set(rc={'figure.figsize':(4,4)})
sns.set_style("ticks")
pl=sns.histplot(data=pd.melt(exp),x='value',binwidth=0.5,color='#086da6')
pl.set(xlim=(0, 10));
plt.show()

In [None]:
# Inspect the raw values of the sampled sub-matrix
np.matrix(adata_sub.X)

The data appear to be normalized already.

In [None]:
# Keep a copy of the current expression matrix in the "normalized" layer so it can be restored
# later via adata.X = adata.layers["normalized"]. The values are treated as normalized +
# log-transformed (but unscaled), based on the distribution check above.
ad.layers["normalized"] = ad.X.copy()

In [None]:
# Display the AnnData summary
ad

In [None]:
# Confirm that the "normalized" layer was added
ad.layers

In [None]:
# Embedding pipeline with scanpy defaults:
# PCA -> neighbourhood graph -> UMAP embedding -> Leiden clustering
sc.tl.pca(ad)
sc.pp.neighbors(ad)
sc.tl.umap(ad)
sc.tl.leiden(ad)

In [None]:
# UMAP coloured by the harmonised cluster labels
sc.pl.umap(ad, color=['clusters'])

In [None]:
# UMAP coloured by Foxf2 expression
sc.pl.umap(ad, color=['Foxf2'])

# Focus on: Foxf2

In [None]:
# Continue under the name `adata`; this overwrites the `adata` created while exploring the raw files.
# `ad` is removed so that only one name refers to the object.
adata=ad
del ad

In [None]:
# Keep an independent copy of the annotated object. A plain assignment would only create a
# second name for the same object, so later in-place changes to `adata` (e.g. new obs columns)
# would silently alter the "backup" as well.
adata_backup = adata.copy()

In [None]:
# drop aged samples
# adata = adata[adata.obs['age'].isin(['2-3mo'])]

In [None]:
# Gene summarised in the cells of this section
gene="Foxf2"

## Plot

In [None]:
# Heatmap of the mean expression of `gene` per cluster ("normalized" layer, scaled per gene across clusters)
sc.pl.matrixplot(
    adata,
    var_names=[gene],
    groupby='clusters',
    layer="normalized",
    standard_scale="var",
    swap_axes=False,
    figsize=(2,5),
)

In [None]:
# Dot plot of `gene` per cluster ("normalized" layer, scaled per gene across clusters)
sc.pl.dotplot(
    adata,
    var_names=[gene],
    groupby='clusters',
    layer="normalized",
    standard_scale="var",
    swap_axes=False,
    figsize=(2,5),
)

In [None]:
# Mean expression of `gene` per cluster (normalized and scaled), read from the matrix plot object
matrix_plot = sc.pl.matrixplot(
    adata,
    var_names=[gene],
    groupby='clusters',
    layer="normalized",
    standard_scale="var",
    swap_axes=False,
    figsize=(2,5),
    return_fig=True,
)
ex = matrix_plot.values_df
ex.columns = ['mean_expression']
ex

In [None]:
# Dot size of the dot plot = fraction of cells per cluster expressing `gene`
dot_plot = sc.pl.dotplot(
    adata,
    var_names=[gene],
    groupby='clusters',
    layer="normalized",
    standard_scale="var",
    swap_axes=False,
    figsize=(2,7),
    return_fig=True,
)
fc = dot_plot.dot_size_df
fc.columns = ['fraction_of_cells']
fc

In [None]:
# Per-cluster summary: mean expression, expressing fraction, provenance columns and cluster size
foxf2 = pd.concat([ex, fc], axis=1).assign(
    gene=gene,
    source="Ximerakis2019",
    organism="Mouse",
)
cluster_sizes = adata.obs["clusters"].value_counts()
foxf2['cell_number'] = pd.DataFrame(cluster_sizes).reindex(foxf2.index)
foxf2

In [None]:
# Data-set label that becomes part of the exported file names
name='Ximerakis2019'

In [None]:
# Export the per-cluster summary as a semicolon-separated CSV with a date-stamped file name.
# os.path.join replaces the hard-coded "\\" separators: the resulting path is unchanged on
# Windows and the export also works when main_dir is the Linux mount.
out_file = date.today().strftime("%Y%m%d")+'_'+name+'_normalized.csv'
foxf2.to_csv(os.path.join(main_dir, 'P6_Foxf2_per_celltype', 'Foxf2_summarized', out_file),sep=';')

# Focus on: Other genes

## Foxo1

In [None]:
# Gene summarised in the cells of this section
gene="Foxo1"

### Plot

In [None]:
# Heatmap of the mean expression of `gene` per cluster ("normalized" layer, scaled per gene across clusters)
sc.pl.matrixplot(
    adata,
    var_names=[gene],
    groupby='clusters',
    layer="normalized",
    standard_scale="var",
    swap_axes=False,
    figsize=(2,5),
)

In [None]:
# Dot plot of `gene` per cluster ("normalized" layer, scaled per gene across clusters)
sc.pl.dotplot(
    adata,
    var_names=[gene],
    groupby='clusters',
    layer="normalized",
    standard_scale="var",
    swap_axes=False,
    figsize=(2,5),
)

In [None]:
# Mean expression of `gene` per cluster (normalized and scaled), read from the matrix plot object
matrix_plot = sc.pl.matrixplot(
    adata,
    var_names=[gene],
    groupby='clusters',
    layer="normalized",
    standard_scale="var",
    swap_axes=False,
    figsize=(2,5),
    return_fig=True,
)
ex = matrix_plot.values_df
ex.columns = ['mean_expression']
ex

In [None]:
# Dot size of the dot plot = fraction of cells per cluster expressing `gene`
dot_plot = sc.pl.dotplot(
    adata,
    var_names=[gene],
    groupby='clusters',
    layer="normalized",
    standard_scale="var",
    swap_axes=False,
    figsize=(2,7),
    return_fig=True,
)
fc = dot_plot.dot_size_df
fc.columns = ['fraction_of_cells']
fc

In [None]:
# Per-cluster summary for the current `gene` (the variable name `foxf2` is kept from the first section):
# mean expression, expressing fraction, provenance columns and cluster size
foxf2 = pd.concat([ex, fc], axis=1).assign(
    gene=gene,
    source="Ximerakis2019",
    organism="Mouse",
)
cluster_sizes = adata.obs["clusters"].value_counts()
foxf2['cell_number'] = pd.DataFrame(cluster_sizes).reindex(foxf2.index)
foxf2

In [None]:
# Export the per-cluster summary of the current `gene` as a semicolon-separated CSV.
# os.path.join replaces the hard-coded "\\" separators: the resulting path is unchanged on
# Windows and the export also works when main_dir is the Linux mount.
out_file = date.today().strftime("%Y%m%d")+'_'+gene+'_'+name+'_normalized.csv'
foxf2.to_csv(os.path.join(main_dir, 'P6_Foxf2_per_celltype', 'Other_genes_summarized', out_file),sep=';')

## Tek = Tie2

In [None]:
# Gene summarised in the cells of this section
gene="Tek"

### Plot

In [None]:
# Heatmap of the mean expression of `gene` per cluster ("normalized" layer, scaled per gene across clusters)
sc.pl.matrixplot(
    adata,
    var_names=[gene],
    groupby='clusters',
    layer="normalized",
    standard_scale="var",
    swap_axes=False,
    figsize=(2,5),
)

In [None]:
# Dot plot of `gene` per cluster ("normalized" layer, scaled per gene across clusters)
sc.pl.dotplot(
    adata,
    var_names=[gene],
    groupby='clusters',
    layer="normalized",
    standard_scale="var",
    swap_axes=False,
    figsize=(2,5),
)

In [None]:
# Mean expression of `gene` per cluster (normalized and scaled), read from the matrix plot object
matrix_plot = sc.pl.matrixplot(
    adata,
    var_names=[gene],
    groupby='clusters',
    layer="normalized",
    standard_scale="var",
    swap_axes=False,
    figsize=(2,5),
    return_fig=True,
)
ex = matrix_plot.values_df
ex.columns = ['mean_expression']
ex

In [None]:
# Dot size of the dot plot = fraction of cells per cluster expressing `gene`
dot_plot = sc.pl.dotplot(
    adata,
    var_names=[gene],
    groupby='clusters',
    layer="normalized",
    standard_scale="var",
    swap_axes=False,
    figsize=(2,7),
    return_fig=True,
)
fc = dot_plot.dot_size_df
fc.columns = ['fraction_of_cells']
fc

In [None]:
# Per-cluster summary for the current `gene` (the variable name `foxf2` is kept from the first section):
# mean expression, expressing fraction, provenance columns and cluster size
foxf2 = pd.concat([ex, fc], axis=1).assign(
    gene=gene,
    source="Ximerakis2019",
    organism="Mouse",
)
cluster_sizes = adata.obs["clusters"].value_counts()
foxf2['cell_number'] = pd.DataFrame(cluster_sizes).reindex(foxf2.index)
foxf2

In [None]:
# Data-set label that becomes part of the exported file names (same value as assigned in the Foxf2 section)
name='Ximerakis2019'

In [None]:
# Export the per-cluster summary of the current `gene` as a semicolon-separated CSV.
# os.path.join replaces the hard-coded "\\" separators: the resulting path is unchanged on
# Windows and the export also works when main_dir is the Linux mount.
out_file = date.today().strftime("%Y%m%d")+'_'+gene+'_'+name+'_normalized.csv'
foxf2.to_csv(os.path.join(main_dir, 'P6_Foxf2_per_celltype', 'Other_genes_summarized', out_file),sep=';')

## Nos3 = eNOS

In [None]:
# Gene summarised in the cells of this section
gene="Nos3"

### Plot

In [None]:
# Heatmap of the mean expression of `gene` per cluster ("normalized" layer, scaled per gene across clusters)
sc.pl.matrixplot(
    adata,
    var_names=[gene],
    groupby='clusters',
    layer="normalized",
    standard_scale="var",
    swap_axes=False,
    figsize=(2,5),
)

In [None]:
# Dot plot of `gene` per cluster ("normalized" layer, scaled per gene across clusters)
sc.pl.dotplot(
    adata,
    var_names=[gene],
    groupby='clusters',
    layer="normalized",
    standard_scale="var",
    swap_axes=False,
    figsize=(2,5),
)

In [None]:
# Mean expression of `gene` per cluster (normalized and scaled), read from the matrix plot object
matrix_plot = sc.pl.matrixplot(
    adata,
    var_names=[gene],
    groupby='clusters',
    layer="normalized",
    standard_scale="var",
    swap_axes=False,
    figsize=(2,5),
    return_fig=True,
)
ex = matrix_plot.values_df
ex.columns = ['mean_expression']
ex

In [None]:
# Dot size of the dot plot = fraction of cells per cluster expressing `gene`
dot_plot = sc.pl.dotplot(
    adata,
    var_names=[gene],
    groupby='clusters',
    layer="normalized",
    standard_scale="var",
    swap_axes=False,
    figsize=(2,7),
    return_fig=True,
)
fc = dot_plot.dot_size_df
fc.columns = ['fraction_of_cells']
fc

In [None]:
# Per-cluster summary for the current `gene` (the variable name `foxf2` is kept from the first section):
# mean expression, expressing fraction, provenance columns and cluster size
foxf2 = pd.concat([ex, fc], axis=1).assign(
    gene=gene,
    source="Ximerakis2019",
    organism="Mouse",
)
cluster_sizes = adata.obs["clusters"].value_counts()
foxf2['cell_number'] = pd.DataFrame(cluster_sizes).reindex(foxf2.index)
foxf2

In [None]:
# Export the per-cluster summary of the current `gene` as a semicolon-separated CSV.
# os.path.join replaces the hard-coded "\\" separators: the resulting path is unchanged on
# Windows and the export also works when main_dir is the Linux mount.
out_file = date.today().strftime("%Y%m%d")+'_'+gene+'_'+name+'_normalized.csv'
foxf2.to_csv(os.path.join(main_dir, 'P6_Foxf2_per_celltype', 'Other_genes_summarized', out_file),sep=';')

# Focus on: Aging

In [None]:
# Return to the full data set kept under `adata_backup` before the age analysis
adata = adata_backup

In [None]:
# Number of cells per age group (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata.obs["age"].value_counts()

## Foxf2 (and Foxo1, Tek, Nos3) by age group

In [None]:
# From here on `gene` is a list of genes (it held a single gene name in the sections above)
gene=["Foxf2","Foxo1","Tek","Nos3"]

### All cell types

In [None]:
# Expression of the selected genes per age group, all cell types pooled
sc.pl.dotplot(
    adata,
    var_names=gene,
    groupby='age',
    layer="normalized",
    standard_scale="var",
    swap_axes=True,
    figsize=(4,2),
    title="All cell types",
)

### ECs only

In [None]:
# Restrict the data set to the endothelial-cell cluster
is_endothelial = adata.obs['clusters'].isin(['Endothelial cells'])
adata_ecs = adata[is_endothelial]
adata_ecs

In [None]:
# Endothelial cells per age group (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata_ecs.obs["age"].value_counts()

In [None]:
# Expression of the selected genes per age group, endothelial cells only
sc.pl.dotplot(
    adata_ecs,
    var_names=gene,
    groupby='age',
    layer="normalized",
    standard_scale="var",
    swap_axes=True,
    figsize=(4,2),
    title="ECs",
)

### PCs only

In [None]:
# Restrict to pericytes and count cells per age group
# (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata_pcs = adata[adata.obs['clusters'].isin(['Pericytes'])]
adata_pcs.obs["age"].value_counts()

In [None]:
# Expression of the selected genes per age group, pericytes only
sc.pl.dotplot(
    adata_pcs,
    var_names=gene,
    groupby='age',
    layer="normalized",
    standard_scale="var",
    swap_axes=True,
    figsize=(4,2),
    title="PCs",
)

### SMCs only

In [None]:
# Restrict to smooth-muscle cells and count cells per age group
# (Series.value_counts replaces the deprecated top-level pd.value_counts).
# NOTE(review): the variable is still called `adata_pcs` although it now holds SMCs; the name is
# kept because the following plot cell reads it - consider renaming both cells to `adata_smcs`.
adata_pcs = adata[adata.obs['clusters'].isin(['SMCs'])]
adata_pcs.obs["age"].value_counts()

In [None]:
# Expression of the selected genes per age group for the subset currently held in `adata_pcs`, titled "SMCs"
sc.pl.dotplot(
    adata_pcs,
    var_names=gene,
    groupby='age',
    layer="normalized",
    standard_scale="var",
    swap_axes=True,
    figsize=(4,2),
    title="SMCs",
)

### Neurons only

In [None]:
# Restrict to neurons and count cells per age group
# (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata_neur = adata[adata.obs['clusters'].isin(['Neurons'])]
adata_neur.obs["age"].value_counts()

In [None]:
# Expression of the selected genes per age group, neurons only
sc.pl.dotplot(
    adata_neur,
    var_names=gene,
    groupby='age',
    layer="normalized",
    standard_scale="var",
    swap_axes=True,
    figsize=(4,2),
    title="Neurons",
)

### Astrocytes only

In [None]:
# Restrict to astrocytes and count cells per age group
# (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata_astro = adata[adata.obs['clusters'].isin(['Astrocytes'])]
adata_astro.obs["age"].value_counts()

In [None]:
# Expression of the selected genes per age group, astrocytes only
sc.pl.dotplot(
    adata_astro,
    var_names=gene,
    groupby='age',
    layer="normalized",
    standard_scale="var",
    swap_axes=True,
    figsize=(4,2),
    title="Astrocytes",
)

### Oligos only

In [None]:
# Restrict to oligodendrocytes and count cells per age group
# (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata_olig = adata[adata.obs['clusters'].isin(['Oligos'])]
adata_olig.obs["age"].value_counts()

In [None]:
# Expression of the selected genes per age group, oligodendrocytes only
sc.pl.dotplot(
    adata_olig,
    var_names=gene,
    groupby='age',
    layer="normalized",
    standard_scale="var",
    swap_axes=True,
    figsize=(4,2),
    title="Oligos",
)

### Microglia only

In [None]:
# Restrict to the microglia/macrophage cluster and count cells per age group
# (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata_mg = adata[adata.obs['clusters'].isin(['Microglia/Macrophages'])]
adata_mg.obs["age"].value_counts()

In [None]:
# Expression of the selected genes per age group, microglia/macrophages only
sc.pl.dotplot(
    adata_mg,
    var_names=gene,
    groupby='age',
    layer="normalized",
    standard_scale="var",
    swap_axes=True,
    figsize=(4,2),
    title="Microglia/Macrophages",
)

## Correlate gene expression (Foxf2 and Foxo1)

### ECs

In [None]:
# Gene pair whose expression is correlated in this section
gg = ["Foxf2","Foxo1"]

In [None]:
# Display the summary of the full data set
adata

In [None]:
# Display the summary of the endothelial-cell subset
adata_ecs

In [None]:
# Per-cell expression of the genes in `gg` for the endothelial cells, with the age group attached
adat = adata_ecs
age = pd.DataFrame(adat.obs["age"])
expression = pd.DataFrame(adat[:,gg].X, index=adat.obs.index, columns=gg)
subs = expression.join(age)
subs.head()

In [None]:
# Scatter plot coloured by age group, plus a single regression line fitted across all points
g = sns.lmplot(data=subs, x="Foxf2", y="Foxo1", hue="age", fit_reg=False, scatter_kws={'alpha':0.6})
scatter_ax = g.axes[0, 0]
sns.regplot(data=subs, x="Foxf2", y="Foxo1", scatter=False, color="grey", ax=scatter_ax)
g.figure.set_size_inches(7,6)
plt.show()

In [None]:
# Pearson correlation between the two genes.
# scipy.stats is imported explicitly: `import scipy` alone does not guarantee that the
# `stats` submodule is loaded, so `sp.stats` could fail depending on the SciPy version
# and on what other packages happened to import before.
import scipy as sp
import scipy.stats
r, p = sp.stats.pearsonr(subs['Foxf2'], subs['Foxo1'])
'Pearson R = {:.2f}, p = {:.2g}'.format(r, p)

In [None]:
# Joint scatter plot with marginal distributions, coloured by age group
graph = sns.jointplot(data=subs, x="Foxf2", y="Foxo1", hue="age")
plt.show()

Exclude dropouts (cells with zero expression of both genes), i.e. keep only cells in which at least one of the two genes is detected.

In [None]:
# Boolean mask over the endothelial cells: True where Foxf2 or Foxo1 is detected (value > 0)
foxf2_detected = adata_ecs[:, "Foxf2"].X > 0
foxo1_detected = adata_ecs[:, "Foxo1"].X > 0
coex = foxf2_detected | foxo1_detected

In [None]:
# Keep only the endothelial cells selected by the `coex` mask
adata_ecs_above0 = adata_ecs[coex]
adata_ecs_above0

In [None]:
# Remaining endothelial cells per age group (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata_ecs_above0.obs["age"].value_counts()

In [None]:
# Expression of the selected genes per age group in the filtered endothelial cells
sc.pl.dotplot(
    adata_ecs_above0,
    var_names=gene,
    groupby='age',
    layer="normalized",
    standard_scale="var",
    swap_axes=True,
    figsize=(4,2),
    title="ECs",
)

In [None]:
# Per-cell expression of the genes in `gg` for the filtered endothelial cells, with the age group attached
adat = adata_ecs_above0
age = pd.DataFrame(adat.obs["age"])
expression = pd.DataFrame(adat[:,gg].X, index=adat.obs.index, columns=gg)
subs = expression.join(age)

In [None]:
# Scatter plot coloured by age group, plus a single regression line fitted across all points
g = sns.lmplot(data=subs, x="Foxf2", y="Foxo1", hue="age", fit_reg=False, scatter_kws={'alpha':0.6})
scatter_ax = g.axes[0, 0]
sns.regplot(data=subs, x="Foxf2", y="Foxo1", scatter=False, color="grey", ax=scatter_ax)
g.figure.set_size_inches(7,6)
plt.show()

In [None]:
# Pearson correlation between the two genes in the filtered cells, formatted as a short report string
pearson_result = sp.stats.pearsonr(subs['Foxf2'], subs['Foxo1'])
r, p = pearson_result
report_template = 'Pearson R = {:.2f}, p = {:.2g}'
report_template.format(r, p)

In [None]:
# Joint scatter plot with marginal distributions, coloured by age group
graph = sns.jointplot(data=subs, x="Foxf2", y="Foxo1", hue="age")
plt.show()

### All cell types

In [None]:
# Gene pair whose expression is correlated in this section
gg = ["Foxf2","Foxo1"]

In [None]:
# Display the summary of the full data set
adata

In [None]:
# Per-cell expression of the genes in `gg` for all cells, with the age group attached
adat = adata
age = pd.DataFrame(adat.obs["age"])
expression = pd.DataFrame(adat[:,gg].X, index=adat.obs.index, columns=gg)
subs = expression.join(age)
subs.head()

In [None]:
# Scatter plot coloured by age group, plus a single regression line fitted across all points
g = sns.lmplot(data=subs, x="Foxf2", y="Foxo1", hue="age", fit_reg=False, scatter_kws={'alpha':0.6})
scatter_ax = g.axes[0, 0]
sns.regplot(data=subs, x="Foxf2", y="Foxo1", scatter=False, color="grey", ax=scatter_ax)
g.figure.set_size_inches(7,6)
plt.show()

In [None]:
# Joint scatter plot with marginal distributions, coloured by age group
graph = sns.jointplot(data=subs, x="Foxf2", y="Foxo1", hue="age")
plt.show()

Exclude dropouts (cells with zero expression of both genes), i.e. keep only cells in which at least one of the two genes is detected.

In [None]:
# Boolean mask over the cells of `adat`: True where Foxf2 or Foxo1 is detected (value > 0)
foxf2_detected = adat[:, "Foxf2"].X > 0
foxo1_detected = adat[:, "Foxo1"].X > 0
coex = foxf2_detected | foxo1_detected

In [None]:
# Keep only the cells selected by the `coex` mask
adata_above0 = adat[coex]
adata_above0

In [None]:
# Remaining cells per age group (Series.value_counts replaces the deprecated top-level pd.value_counts)
adata_above0.obs["age"].value_counts()

In [None]:
# Expression of the selected genes per age group in the filtered cells (all cell types)
sc.pl.dotplot(
    adata_above0,
    var_names=gene,
    groupby='age',
    layer="normalized",
    standard_scale="var",
    swap_axes=True,
    figsize=(4,2),
)

In [None]:
# Per-cell expression of the genes in `gg` for the filtered cells, with the age group attached
adat = adata_above0
age = pd.DataFrame(adat.obs["age"])
expression = pd.DataFrame(adat[:,gg].X, index=adat.obs.index, columns=gg)
subs = expression.join(age)

In [None]:
# Scatter plot coloured by age group, plus a single regression line fitted across all points
g = sns.lmplot(data=subs, x="Foxf2", y="Foxo1", hue="age", fit_reg=False, scatter_kws={'alpha':0.6})
scatter_ax = g.axes[0, 0]
sns.regplot(data=subs, x="Foxf2", y="Foxo1", scatter=False, color="grey", ax=scatter_ax)
g.figure.set_size_inches(7,6)
plt.show()

In [None]:
# Joint scatter plot with marginal distributions, coloured by age group
graph = sns.jointplot(data=subs, x="Foxf2", y="Foxo1", hue="age")
plt.show()

## Pseudobulk analysis

Motivation: correlation at the single-cell level is biased by the large number of dropouts, so expression is aggregated per sample first.

In [None]:
import decoupler as dc

In [None]:
# Display the summary of the data set that is aggregated below
adata

In [None]:
# Derive a sample ID per cell: the first run of digits that follows an underscore in the cell name.
# expand=False makes str.extract return a Series (one value per cell) instead of a one-column
# DataFrame; Series.value_counts replaces the deprecated top-level pd.value_counts.
adata.obs["sample"] = adata.obs['NAME'].str.extract('_([0-9]+)_?', expand=False)
adata.obs["sample"].value_counts()

In [None]:
# Cells per sample and age group (transposed: rows = age groups, columns = samples)
pd.crosstab(adata.obs["sample"], adata.obs["age"]).T

In [None]:
# Aggregate the cells of each sample into one pseudobulk profile (groups_col=None: no split by cell type),
# taking the values from the "normalized" layer.
# NOTE(review): no aggregation `mode` is passed, so decoupler's default is applied to normalized
# (non-count) values - confirm that this is intended and supported by the installed decoupler version.
padata = dc.get_pseudobulk(adata, sample_col='sample', groups_col=None, layer='normalized', min_prop=0.01, min_smpls=0)

In [None]:
# Display the summary of the pseudobulk object
padata

### Correlate gene expression (Foxf2 and Foxo1)

#### All cell types

In [None]:
# Gene pair whose pseudobulk expression is correlated in this section
gg = ["Foxf2","Foxo1"]

In [None]:
# Display the summary of the pseudobulk object
padata

In [None]:
# Pseudobulk expression of the genes in `gg` per profile, with the age group attached
adat = padata
age = pd.DataFrame(adat.obs["age"])
expression = pd.DataFrame(adat[:,gg].X, index=adat.obs.index, columns=gg)
subs = expression.join(age)
subs.head()

In [None]:
# Scatter plot coloured by age group, plus a single regression line fitted across all points
g = sns.lmplot(data=subs, x="Foxf2", y="Foxo1", hue="age", fit_reg=False, scatter_kws={'alpha':0.6})
scatter_ax = g.axes[0, 0]
sns.regplot(data=subs, x="Foxf2", y="Foxo1", scatter=False, color="grey", ax=scatter_ax)
g.figure.set_size_inches(7,6)
plt.show()

In [None]:
# Pearson correlation between the two genes across the pseudobulk profiles.
# scipy.stats is imported explicitly: `import scipy` alone does not guarantee that the
# `stats` submodule is loaded, so `sp.stats` could fail depending on the SciPy version
# and on what other packages happened to import before.
import scipy as sp
import scipy.stats
r, p = sp.stats.pearsonr(subs['Foxf2'], subs['Foxo1'])
'Pearson R = {:.2f}, p = {:.2g}'.format(r, p)

In [None]:
# Joint scatter plot with marginal distributions, coloured by age group
graph = sns.jointplot(data=subs, x="Foxf2", y="Foxo1", hue="age")
plt.show()

#### ECs

In [None]:
# Aggregate again, this time split by cluster (groups_col='clusters'); this replaces the
# per-sample `padata` computed above.
# NOTE(review): as above, no aggregation `mode` is passed while the "normalized" layer is used - confirm.
padata = dc.get_pseudobulk(adata, sample_col='sample', groups_col='clusters', layer='normalized', min_prop=0.2, min_smpls=3)

In [None]:
# Display the summary of the per-cluster pseudobulk object
padata

In [None]:
# Restrict the pseudobulk profiles to the endothelial-cell cluster
is_endothelial = padata.obs['clusters'].isin(['Endothelial cells'])
padata_ecs = padata[is_endothelial]
padata_ecs

In [None]:
# Gene pair whose pseudobulk expression is correlated in this section
gg = ["Foxf2","Foxo1"]

In [None]:
# Pseudobulk expression of the genes in `gg` for the endothelial profiles, with the age group attached
adat = padata_ecs
age = pd.DataFrame(adat.obs["age"])
expression = pd.DataFrame(adat[:,gg].X, index=adat.obs.index, columns=gg)
subs = expression.join(age)
subs.head()

In [None]:
# Scatter plot coloured by age group, plus a single regression line fitted across all points
g = sns.lmplot(data=subs, x="Foxf2", y="Foxo1", hue="age", fit_reg=False, scatter_kws={'alpha':0.6})
scatter_ax = g.axes[0, 0]
sns.regplot(data=subs, x="Foxf2", y="Foxo1", scatter=False, color="grey", ax=scatter_ax)
g.figure.set_size_inches(7,6)
plt.show()

In [None]:
# Pearson correlation between the two genes across the endothelial pseudobulk profiles.
# scipy.stats is imported explicitly: `import scipy` alone does not guarantee that the
# `stats` submodule is loaded, so `sp.stats` could fail depending on the SciPy version
# and on what other packages happened to import before.
import scipy as sp
import scipy.stats
r, p = sp.stats.pearsonr(subs['Foxf2'], subs['Foxo1'])
'Pearson R = {:.2f}, p = {:.2g}'.format(r, p)

In [None]:
# Joint scatter plot with marginal distributions, coloured by age group
graph = sns.jointplot(data=subs, x="Foxf2", y="Foxo1", hue="age")
plt.show()

# Session Info

In [None]:
# Record the package versions used for this run (reproducibility)
sc.logging.print_versions()