<font size="+3.8">Scanpy single-cell pre-processing</font>  
<font size="+1.5"></font>  

Aim: Preprocess annotated human brain single-cell data from Siletti 2022 bioRxiv

In [None]:
from datetime import date
# Stamp the notebook with the day it was executed, formatted as day/month/year.
format(date.today(), '%d/%m/%Y')

In [None]:
import os
import random
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sns

In [None]:
import anndata as ad
import scanpy as sc
import scipy as sci
sc.settings.verbosity = 3

In [None]:
#import loompy

In [None]:
import utils

In [None]:
# Project root directory. Defaults to the HPC location; set the P06_MAIN_DIR environment
# variable to run the notebook on another machine without editing this cell.
main_dir = os.environ.get('P06_MAIN_DIR', '/cluster2/sfrerich/P6/P06_vasc_scRNAseq') # HPC

In [None]:
# Study identifier (Siletti 2022, bioRxiv) and organism label; both are passed on to the
# gene-expression summaries further down.
dataset_name, organism = "Siletti2022", "Human"

In [None]:
# Genes of interest besides FOXF2. The symbols are written in title case and upper-cased
# here so that they are all-caps when used for look-ups.
target_genes = [
    symbol.upper()
    for symbol in (
        "Foxo1", "Tek", "Nos3", "Htra1", "Egfl8", "Flt1",
        "Kdr", "Ptprb", "Nrp1", "Nrp2", "Efnb2", "Itgb1",
        "Itga6", "Angpt2", "Cdh5", "Cldn5", "Ocln", "Ctnnb1",
    )
]

In [None]:
# Every gene that is kept when the merged collection is subset: FOXF2 first, then the target genes.
genes_subset = ["FOXF2", *target_genes]

# Load + format data

Annotated by authors

### All cells

All superclusters were downloaded from cellxgene https://cellxgene.cziscience.com/collections/283d65eb-dd53-496d-adb7-7570c7caa443 
(23/02/2023), except for Miscellaneous

In [None]:
# run once

In [None]:
# Files in the raw cellxgene download folder.
# os.listdir returns entries in arbitrary order; sorting makes the processing order (and any
# positional indexing into `f`) reproducible between runs and machines.
f=sorted(os.listdir(os.path.join(main_dir,'Siletti2022','all_cellxgene')))
f

In [None]:
%%time
# load, format, normalize, export each h5ad object
for p in f[16:]:
    print("processing "+p)
    adata = ad.read_h5ad(os.path.join(main_dir,'Siletti2022','all_cellxgene',p))
    adatas = adata[adata.obs["BadCells"] == 0]
    adata = adata[adata.obs["tissue"] != "Spinal cord"]
    adata.obs = adata.obs[["supercluster_term", "cell_type", "assay", "organism", "disease", "tissue", "BadCells"]]
    adata.var = adata.var[["Gene","feature_name"]]
    del adata.uns
    adata.var_names = adata.var["Gene"].tolist()
    adata.obs_names_make_unique()
    adata.var_names_make_unique()
    # log-normalize
    #adata.layers["counts"] = adata.X.copy()
    sc.pp.normalize_total(adata,inplace=True)
    sc.pp.log1p(adata)
    #adata.layers["normalized"] = adata.X.copy()
    print("saving "+p)
    adata.write_h5ad(os.path.join(main_dir,'Siletti2022','all_reduced_normalized',p.split(".")[0]+'_reduced.h5ad.gz'), compression="gzip")
del adata

In [None]:
# then load from here

In [None]:
# Files in the folder of reduced, normalized objects (note: `f` is re-bound to a new folder here).
# os.listdir returns entries in arbitrary order; sorting makes everything that is built
# from `f` come out in a reproducible order.
f=sorted(os.listdir(os.path.join(main_dir,'Siletti2022','all_reduced_normalized')))
f

In [None]:
# Read every file listed in `f`, keyed by its file name up to the first dot.
reduced_dir = os.path.join(main_dir,'Siletti2022','all_reduced_normalized')
adatas = dict(
    (fname.split(".")[0], ad.read_h5ad(os.path.join(reduced_dir, fname)))
    for fname in f
)

In [None]:
adatas.keys()

In [None]:
adatas.values()

In [None]:
# too large for standard merge, use AnnCollection instead
# adata2 = ad.concat(adatas, axis=0, join="inner", merge="first")

In [None]:
# AnnCollection is used instead of ad.concat because the objects are too large for a standard merge.
from anndata.experimental.multi_files import AnnCollection

# Combine the per-file objects held in the `adatas` dict into one collection.
# NOTE(review): join_vars='inner' presumably keeps only genes shared by all objects, and
# label='dataset' presumably records the dict key of each cell's source object in
# obs['dataset'] — confirm against the AnnCollection documentation.
adata2 = AnnCollection(adatas, join_vars='inner', label='dataset')
adata2

In [None]:
# Sanity check: list every gene name in the collection that contains "NOS3".
list(filter(lambda symbol: "NOS3" in symbol, adata2.var_names))

In [None]:
# Number of cells per author-provided cell type.
# The top-level pd.value_counts function is deprecated in recent pandas; the Series method
# returns the same counts.
adata2.obs["cell_type"].value_counts()

In [None]:
# Display names for the cell_type labels.
# Both "Bergmann glial cell" and "astrocyte" are pooled under "Astrocytes".
new_cluster_names = {
    "Bergmann glial cell": "Astrocytes",
    "astrocyte": "Astrocytes",
    "central nervous system macrophage": "Macrophages",
    "choroid plexus epithelial cell": "Choroid plexus epithelial cells",
    "endothelial cell": "Endothelial cells",
    "ependymal cell": "Ependymal cells",
    "fibroblast": "Fibroblasts",
    "neuron": "Neurons",
    "oligodendrocyte": "Oligos",
    "oligodendrocyte precursor cell": "OPCs",
    "pericyte": "Pericytes",
    "vascular associated smooth muscle cell": "SMCs",
}

In [None]:
# Translate the cell_type labels into the display names from new_cluster_names and store
# them as a categorical 'clusters' column.
cell_types = adata2.obs['cell_type']
# Series.map yields NaN for every label missing from the dict, and NaN groups are dropped
# by later counts and groupby-based plots. Report such labels instead of losing the cells
# silently; the resulting column itself is unchanged.
unmapped = sorted(set(cell_types.dropna().unique()) - set(new_cluster_names))
if unmapped:
    print(f"WARNING: cell types without an entry in new_cluster_names (set to NaN): {unmapped}")
adata2.obs['clusters'] = (
    cell_types
    .map(new_cluster_names)
    .astype('category')
)

In [None]:
# Number of cells per 'clusters' label.
# The top-level pd.value_counts function is deprecated in recent pandas; the Series method
# returns the same counts.
adata2.obs["clusters"].value_counts()

In [None]:
#adata3 = adata2.to_adata() # does not contain .X - must subset first into AnnCollectionView object

In [None]:
# subset genes
adata3 = adata2[:,adata2.var_names.isin(genes_subset)]

In [None]:
adata3

In [None]:
adata3 = adata3.to_adata()

In [None]:
adata3

In [None]:
# Make the embedding stored under "X_UMAP" also available under the key "X_umap".
# NOTE(review): presumably needed because sc.pl.umap (used below) reads obsm["X_umap"] — confirm.
adata3.obsm["X_umap"] = adata3.obsm["X_UMAP"]

In [None]:
adata3.var_names

In [None]:
del adata2

In [None]:
# check if data is normalized
adata3.X.max()

In [None]:
# Show the distribution of expression values for a random sample of genes (across all cells).
# adata3 only holds the genes from genes_subset that survived the merge, which can be fewer
# than 15, so cap the sample size instead of letting random.sample raise a ValueError.
n_genes_shown = min(15, adata3.n_vars)
# A dedicated, seeded generator keeps the selection reproducible without touching the
# global random state.
random_genes = random.Random(0).sample(range(adata3.n_vars), n_genes_shown)
adata_sub = adata3[:,random_genes]
# Works for both sparse and dense X (a dense ndarray has no .todense()).
X_sub = adata_sub.X
exp = pd.DataFrame(X_sub.toarray() if hasattr(X_sub, "toarray") else np.asarray(X_sub))
# Set the theme before plotting so the figure looks the same on the first run and on re-runs.
sns.set_theme(style='white')
# plot: one KDE line per sampled gene, all in the same colour; the palette has exactly one
# entry per gene so seaborn does not have to cycle a too-short list.
pl1 = sns.displot(data=pd.melt(exp), x='value', height=4, hue='variable', kind="kde",
                  warn_singular=False, legend=False,
                  palette=['#086da6'] * n_genes_shown, lw=0.3) # genes with 0 expression are excluded
pl1.set(xlim=(-0.5, 7), ylim=(0, 0.007))
plt.show()

In [None]:
# Histogram of all values in `exp` (pooled over its columns), drawn with the white theme
# and a figure size of (4, 4).
sns.set_theme(style='white', rc={'figure.figsize':(4,4)})
long_exp = pd.melt(exp)
pl = sns.histplot(
    data=long_exp,
    x='value',
    binwidth=0.5,
    legend=True,
    palette=list(np.repeat('#086da6',4)),
)
pl.set(xlim=(0, 10));
plt.show()

In [None]:
# Keep a snapshot of the current X as the "normalized" layer.
# .copy() so that the layer does not share memory with adata3.X; without it, any later
# in-place change to X would silently change the layer as well.
adata3.layers["normalized"] = adata3.X.copy()

---

# Focus on: Foxf2

In [None]:
gene="FOXF2"

In [None]:
sc.pl.umap(adata3, color='clusters', size=0.05)

In [None]:
# NOTE(review): this call is identical to the one in the preceding cell — presumably a
# leftover duplicate; confirm, then remove it or change `color`.
sc.pl.umap(adata3, color='clusters', size=0.05)

In [None]:
sc.pl.umap(adata3, color=gene, size=0.05)

In [None]:
sc.pl.matrixplot(adata3, [gene], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var")

In [None]:
sc.pl.dotplot(adata3, [gene], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var")

In [None]:
utils.summarize_gene_expression(adata = adata3, gene = gene, groupby = "clusters", 
                                study_name = dataset_name, organism = organism,
                                export = True, output_dir = os.path.join(main_dir, "Foxf2_summarized")
                               )

# Focus on: Other genes

In [None]:
# Call utils.summarize_gene_expression once per entry of target_genes (grouped by
# "clusters", export=True) and collect the return values in a dict keyed by gene symbol.
summary_dir = os.path.join(main_dir, "Other_genes_summarized")
other_genes_results = {}
for symbol in target_genes:
    other_genes_results[symbol] = utils.summarize_gene_expression(
        adata3, symbol,
        study_name = dataset_name, organism = organism, groupby = "clusters",
        output_dir = summary_dir, export = True,
    )

In [None]:
# some plots

In [None]:
sc.pl.matrixplot(adata3, [target_genes[0]], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata3, [target_genes[0]], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.matrixplot(adata3, [target_genes[1]], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata3, [target_genes[1]], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

# Session Info

In [None]:
sc.logging.print_versions()