<font size="+3.8">Scanpy single-cell pre-processing</font>  


Saunders/Macosko (2018, Cell) mouse brain single-cell data

Note: Same script/preprocessing used for p6-visium, see SF/Git/p6/visium-foxf2/p6_visium_foxf2_sc_pp.ipynb

In [None]:
from datetime import date
date.today().strftime('%d/%m/%Y')

In [None]:
import os
import getpass

# Record who ran the notebook. getpass.getuser() checks the LOGNAME/USER/LNAME/USERNAME
# environment variables before falling back to the password database, so unlike
# os.getlogin() it does not require a controlling terminal (os.getlogin() raises
# OSError when the kernel is started by a service such as JupyterHub).
getpass.getuser()

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import platform
from datetime import date
from matplotlib.pyplot import rc_context

In [None]:
import utils

In [None]:
# Name of the active conda environment. .get() avoids a KeyError when the kernel
# was not launched from an activated conda environment.
os.environ.get('CONDA_DEFAULT_ENV', 'not a conda env') # conda env

In [None]:
platform.platform()

In [None]:
sc.settings.verbosity = 3

In [None]:
# Root of the shared project drive. The mount point differs per operating system, so it
# is selected from the running platform; previously all three paths were assigned in
# sequence and the last one (macOS) always won unless the cell was edited by hand.
main_dir_by_os = {
    'Linux': '/run/user/1000/gvfs/smb-share:server=138.245.4.35,share=bd-dichgans/SF',
    # raw string: UNC path, avoids the invalid "\B" / "\S" escape sequences
    'Windows': r'\\isdsynnas.srv.med.uni-muenchen.de\BD-Dichgans\SF',
    'Darwin': '/Volumes/BD-Dichgans/SF',
}
# Unknown platforms keep the former effective default (the macOS mount point).
main_dir = main_dir_by_os.get(platform.system(), main_dir_by_os['Darwin'])

In [None]:
dataset_name = "Saunders2018"
organism = "Mouse"

# Load data

## Raw

Data downloaded from http://dropviz.org/ -> Data -> DGE By Region

Code partly based on https://github.com/theislab/scib-reproducibility/blob/main/notebooks/data_preprocessing/mouse_brain/01_collect_mouse_brain_studies.ipynb

In [None]:
# Read the merged all-regions AnnData file from the shared drive.
raw_h5ad_path = os.path.join(
    main_dir, "P06_vasc_scRNAseq", "DropViz", "all_by_region", "input", "processed",
    "20220816_all_regions_merged_mbuettner_pipeline.h5ad",
)
adata = sc.read_h5ad(raw_h5ad_path)

In [None]:
adata.var_names_make_unique()
adata.obs_names_make_unique()

In [None]:
adata.layers

In [None]:
adata

## Previous work

In [None]:
# load QCed + normalized file (continue below)
# Running this cell replaces the `adata` read in the "Raw" section with a previously
# saved, already processed object.
date_set='20220823'  # date prefix of the saved file to reload
adata = sc.read_h5ad(os.path.join(main_dir,"P06_vasc_scRNAseq","DropViz","all_by_region", date_set+'_saunders_normalised_logarithmised_annotated_no_rank_genes_groups.h5ad'))
# NOTE(review): presumably resets the stored log1p base so that scanpy functions reading
# adata.uns['log1p']['base'] work after reloading from h5ad — confirm this is still needed.
adata.uns['log1p']['base'] = None

In [None]:
adata

# QC

In [None]:
# genes with highest fraction of counts per cell
sc.pl.highest_expr_genes(adata, n_top=20, )

In [None]:
# Basic QC
# Flag genes whose symbol starts with 'mt-' as mitochondrial ('mt'), so that
# calculate_qc_metrics can add the per-cell pct_counts_mt column plotted below.
is_mito = adata.var_names.str.startswith('mt-')
adata.var['mt'] = is_mito
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
qc_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata, qc_keys, jitter=0.4, multi_panel=True, size=0.2)

In [None]:
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')

In [None]:
# Filtering
# Combine both thresholds into one boolean mask and materialise the subset with .copy():
# chained slicing left `adata` as a view of a view, which the later in-place steps
# (normalize_total, log1p, new layers) would have to copy implicitly with a warning.
keep_cells = (adata.obs['n_genes_by_counts'] < 6000) & (adata.obs['pct_counts_mt'] < 20)
# Optional lower bound on detected genes (currently disabled):
#keep_cells &= adata.obs['n_genes_by_counts'] > 500
adata = adata[keep_cells, :].copy()

In [None]:
# do not filter genes for Tangram!

In [None]:
adata

In [None]:
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')

# Normalisation, logarithmization

For use in Tangram, preprocessing should be as similar as possible between spatial and single-cell data. Also see https://github.com/broadinstitute/Tangram/issues/3.  
Hence, normalize and logarithmise with default settings, but don't scale.

Check if data is normalized:

In [None]:
adata.layers

In [None]:
np.max(adata.X)

In [None]:
# show expression of 100 random genes (across all cells)
import random
import seaborn as sns
# Draw the genes from a seeded generator so the same genes are shown on every run;
# `random_genes` is reused by the plots after normalisation and log-transformation.
n_random_genes = 100
random_genes = random.Random(0).sample(range(adata.X.shape[1]), n_random_genes)
adata_sub = adata[:,random_genes]
exp=pd.DataFrame(adata_sub.X.todense())
# plot
pl1=sns.displot(data=pd.melt(exp),x='value',height=4,hue='variable',kind="kde",warn_singular=False,legend=False,palette=list(np.repeat('#086da6',n_random_genes)), lw=0.3) # genes with 0 expression are excluded
pl1.set(xlim=(-0.5, 7),ylim=(0,0.007));

In [None]:
sns.set(rc={'figure.figsize':(4,4)})
sns.set_theme(style='white')
pl=sns.histplot(data=pd.melt(exp),x='value',binwidth=0.5,legend=True,palette=list(np.repeat('#086da6',100)))
pl.set(xlim=(0, 20),ylim=(0,1e6));

In [None]:
exp

In [None]:
#adata.layers["counts"] = adata.X.copy()

In [None]:
sc.pp.normalize_total(adata, inplace=True) # Normalize each cell by total counts over all genes, so that every cell has the same total count after normalization.

In [None]:
# show expression of 100 random genes (across all cells)
adata_sub = adata[:,random_genes]
exp=pd.DataFrame(adata_sub.X.todense())
# plot
pl=sns.displot(data=pd.melt(exp),x='value',height=4,hue='variable',kind="kde",warn_singular=False,legend=False,palette=list(np.repeat('#086da6',100)), lw=0.3) # genes with 0 expression are excluded
pl.set(xlim=(-0.25, 3.5),ylim=(0,0.005))

In [None]:
pl=sns.histplot(data=pd.melt(exp),x='value',binwidth=0.5,legend=True,palette=list(np.repeat('#086da6',100)))
pl.set(xlim=(0, 20),ylim=(0,1e6));

In [None]:
sc.pp.log1p(adata)

In [None]:
# show expression of 100 random genes (across all cells)
adata_sub = adata[:,random_genes]
exp=pd.DataFrame(adata_sub.X.todense())
# plot
pl=sns.displot(data=pd.melt(exp),x='value',height=4,hue='variable',kind="kde",warn_singular=False,legend=False,palette=list(np.repeat('#086da6',100)), lw=0.5) # genes with 0 expression are excluded
pl.set(xlim=(-0.25, 3.5),ylim=(0,0.005));

In [None]:
pl=sns.histplot(data=pd.melt(exp),x='value',binwidth=0.5,legend=True,palette=list(np.repeat('#086da6',100)));
pl.set(xlim=(0, 20),ylim=(0,1e6));

In [None]:
del exp
del adata_sub
del random_genes

In [None]:
adata.layers["normalized"] = adata.X.copy()

In [None]:
# Identify highly-variable genes
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata)

In [None]:
adata.layers

In [None]:
adata

# Dim Reduction

In [None]:
# Run PCA
plt.rcParams['figure.figsize'] = [7, 7]
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca(adata, color='Foxf2')

In [None]:
sc.pl.pca_variance_ratio(adata, log=True)

In [None]:
sc.pp.neighbors(adata, n_pcs=20)

In [None]:
# Run UMAP
sc.tl.umap(adata)

Available cell annotations:

In [None]:
plt.rcParams['figure.figsize'] = [9, 9]
sc.pl.umap(adata, color=['class','region'], wspace=0.15, size=0.4, legend_loc='on data', legend_fontweight='normal', legend_fontoutline=3, legend_fontsize=7)

In [None]:
# exclude cerebellum (mostly neurons)
# .copy() makes the subset a standalone AnnData: the Leiden clustering below writes to
# adata.obs, which on a view would force an implicit copy with a warning.
adata = adata[adata.obs.region != 'CB'].copy()

# Clustering

In [None]:
%%time
# Run Leiden clustering
sc.tl.leiden(adata, resolution = 1.5)

In [None]:
adata.obs.head(2)

In [None]:
pd.crosstab(adata.obs['class'],adata.obs['region'])

In [None]:
# Saunders: remove cluster 34, as it contains only a single cell
# .copy() so that the obs columns added further down are written to a real object,
# not to a view of the previous AnnData.
adata = adata[adata.obs.leiden != '34'].copy()

In [None]:
plt.rcParams['figure.figsize'] = [9, 9]
sc.pl.umap(adata, color=['leiden', 'class', 'region'], wspace=0.3, size=0.4)

In [None]:
sc.pl.umap(adata, color=['leiden', 'batch'], wspace=0.3, size=0.4)

# Cell annotation

Verify and refine cell annotation from authors

In [None]:
# Series.value_counts() replaces the deprecated top-level pd.value_counts()
adata.obs['class'].value_counts()

In [None]:
# Series.value_counts() replaces the deprecated top-level pd.value_counts()
adata.obs['refined_class'].value_counts()

In [None]:
# Series.value_counts() replaces the deprecated top-level pd.value_counts()
adata.obs['cell_type'].value_counts()

Re-annotate to better distinguish vascular cells:

Manual marker gene selection

|Vascular     |EC         |Pericytes|SMCs   |Fibroblasts|Oligos|OPCs         |Ependymal|Neurons    |Astrocytes|Microglia|Immune (broad/hematopoietic)|Macrophages     |Macrophages/Microglia|Monocytes|Monocytes/B-cells|Granulocytes|B cells|T/NK cells|
|---          |---        |---      |---    |---        |---   |---          |---      |---        |---       |---      |---                        |---             |---|---|---|---|---|---|
|PDGFRA=CD140A|CLDN5      |VTN      |ACTA2  |DCN        |MBP   |CSPG4=NG2    |PIFO     |RBFOX3=NEUN|AQP4      |AIF1     |PTPRC=CD45                 |CD14            |TREM2|CCR2|CD74|CD16/32|CD19|CD4|
|MCAM=CD146   |PECAM1=CD31|PDGFRB   |MYOCD  |COL6A1     |ENPP2 |PDGFRA=CD140A|FOXJ1    |TUBB3      |          |         |                           |ITGB2=CD18=CD11B||||ITGB2=CD18=CD11B||CD8A|
|FOXF2        |           |         |       |COL3A1     |      |             |DYNLRB2  |           |          |         |                           |CD86            ||||CD15||CD8B|
|             |           |         |       |           |      |             |MEIG1    |           |          |         |                           |ADGRE1=F4/80    ||||||IL2RB|
|||||||||||||||||||IFNG|

In [None]:
# plot marker genes
plt.rcParams['figure.figsize'] = [7, 5]
marker_genes = ["Pdgfra", "Mcam", "Pecam1", "Cldn5","Vtn", "Pdgfrb","Acta2", "Dcn", "Col6a1", "Mbp","Enpp2","Cspg4","Dcx","Pifo","Foxj1","Rbfox3","Tubb3", "Aqp4", "Aif1", "Ptprc", "Ccr2","Adgre1","Itgb2","Cd14","Cd86","Trem2","Vcan","Cd4","Il2rb","Cd244", "Cd74","Cd68","Ifng","Ptgdr2","Ccr3"]
# Keep only markers present in the data. Testing membership on the var_names index
# directly avoids rebuilding a list of all gene names for every single marker.
marker_genes = [g for g in marker_genes if g in adata.var_names]
sc.pl.umap(adata, ncols=3, color=marker_genes)

# "Cd19", "Cd8a" not in Saunders

In [None]:
# Run DE test for annotation (Wilcoxon)
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon', key_added='wilcoxon')
sc.pl.rank_genes_groups(adata, n_genes=20, sharey=False, ncols=3, fontsize=13, key='wilcoxon')

In [None]:
plt.rcParams['figure.figsize'] = [11, 8]
sc.pl.umap(adata, color=['n_genes_by_counts', 'total_counts', 'pct_counts_mt'], size=0.5)

In [None]:
# Dotplot
sc.tl.dendrogram(adata, groupby="leiden")
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, key="wilcoxon", groupby="leiden")

In [None]:
# plot cell markers used by Christina
import ast
# Build the path with os.path.join: the former hard-coded '\P6_...\Tangram\...' suffix
# only resolved on Windows (main_dir may be a POSIX mount point) and relied on
# invalid string escape sequences.
markers_path = os.path.join(main_dir, 'P6_Visium_Christina', 'Tangram', 'cell_markers_from_Christina.txt')
with open(markers_path) as f:
    data = f.read()
d = ast.literal_eval(data) # parse the Python dict literal stored in the text file
# One column per dict key; wrapping each value in a Series pads shorter columns with NaN
markers_christina = pd.DataFrame({k: pd.Series(v) for k, v in d.items()}) # turn to pd.df
markers_christina.head(8)

In [None]:
# Flatten the marker dict `d` (presumably cell type -> list of gene symbols; verify
# against the text file) into one list, keeping only genes present in adata.var_names.
# Building it from `d` instead of the NaN-padded DataFrame means there are no fillers
# to strip, no quadratic sum(..., []) and the cell can be re-run without failing.
markers_christina = [
    marker
    for markers in d.values()
    for marker in markers
    if marker in adata.var_names
]

In [None]:
sc.pl.dotplot(adata, var_names=markers_christina, groupby="leiden")

In [None]:
plt.rcParams['figure.figsize'] = [8, 6]
# Plot only the markers that were matched against adata.var_names. The former
# `marker_genes = sum(list(d.values()), [])` assignment was unused in this cell and
# overwrote the manually curated `marker_genes` list that the later
# "Dotplot of marker genes" cell passes to sc.pl.dotplot (with unfiltered gene names).
sc.pl.umap(adata, ncols=4, color=markers_christina)

In [None]:
utils.cluster_small_multiples(adata, clust_key="leiden")

In [None]:
# Rename clusters
# Leiden cluster numbers grouped by the cell type label assigned to them.
leiden_ids_by_label = {
    'Neurons': [0, 1, 3, 5, 7, 8, 9, 11, 12, 14, 15, 16, 19, 21, 28, 32, 33],
    'Oligos': [2, 10, 13, 20, 25, 31],
    'Astrocytes': [4, 18],
    'Endothelial cells': [6, 29],
    'OPCs': [17],
    'SMCs': [22],
    'Microglia/Macrophages': [23],
    'Pericytes': [24],
    'Neurogenesis': [26],
    'Fibroblasts': [27],
    'Ependymal cells': [30],
}
# Invert to {leiden id as string -> label}, ordered by cluster number.
old_to_new = {
    str(cluster_id): label
    for cluster_id, label in sorted(
        (cid, name) for name, ids in leiden_ids_by_label.items() for cid in ids
    )
}

In [None]:
# Translate each Leiden cluster id into its manually assigned cell type label and store
# the result as a categorical obs column. Ids that have no entry in old_to_new are not
# caught here (they would end up as missing values).
adata.obs['clusters'] = (
    adata.obs['leiden'].map(old_to_new).astype('category')
)
adata.obs['clusters'].head(2)

In [None]:
plt.rcParams['figure.figsize'] = [11, 8]
sc.pl.umap(adata, color=['clusters','class','cell_type'], legend_loc='on data', title='', legend_fontweight='normal', legend_fontoutline=3, legend_fontsize=12, size=0.5)

Refined own labels on left. Center and right labels from authors.

Left: novel annotation (`clusters`); center and right: annotations from Saunders et al. (`class`, `cell_type`).

In [None]:
# Dotplot of marker genes
sc.pl.dotplot(adata, marker_genes, groupby='clusters')

# DE analysis

In [None]:
adata.obs.clusters.value_counts()

In [None]:
sc.tl.rank_genes_groups(adata, 'clusters', method='wilcoxon', key_added = "dea")

In [None]:
sc.pl.rank_genes_groups(adata, n_genes=15, sharey=False, key = "dea", fontsize = 14)

In [None]:
sc.pl.rank_genes_groups_dotplot(adata, n_genes=6, key="dea", groupby="clusters")

In [None]:
sc.get.rank_genes_groups_df(adata, key = "dea", group = "Oligos")[0:15]

In [None]:
sc.get.rank_genes_groups_df(adata, key = "dea", group = "Neurons")[0:15]["names"]

In [None]:
sc.pl.dotplot(adata, var_names=["Fcrls","Hexb","P2ry12","Ptprc","Mertk","Mrc1","Tmem119","Fos","Junb"], groupby="cell_type")

In [None]:
sc.pl.dotplot(adata, var_names=["Fcrls","Hexb","P2ry12","Ptprc","Mertk","Mrc1","Tmem119","Fos","Junb"], groupby="refined_class")

In [None]:
# filter

In [None]:
sc.tl.filter_rank_genes_groups(adata, key = "dea", groupby="clusters", key_added = "dea_filtered", min_in_group_fraction=0.5, max_out_group_fraction=0.25)

In [None]:
sc.pl.rank_genes_groups(adata, n_genes=15, sharey=False, key = "dea_filtered", fontsize = 14)

In [None]:
sc.pl.rank_genes_groups_dotplot(adata, n_genes=6, key="dea_filtered", groupby="clusters")

# Focus on Foxf2

In [None]:
gene="Foxf2"

In [None]:
dataset_name = "Saunders2018"
organism = "Mouse"

In [None]:
with rc_context({'figure.figsize': (7,7)}):
    sc.pl.umap(adata, color=['clusters',gene], legend_loc='on data', title='', legend_fontweight='normal', legend_fontoutline=2, legend_fontsize=8, size=0.4)

In [None]:
sc.pl.matrixplot(adata, [gene], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var")

In [None]:
sc.pl.dotplot(adata, [gene], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var")

In [None]:
adata.X.max()

In [None]:
utils.summarize_gene_expression(adata = adata, gene = gene, groupby = "clusters", 
                          study_name = dataset_name, organism = organism,
                          export = True, output_dir = os.path.join(main_dir, "P06_Foxf2_per_celltype", "Foxf2_summarized")
                         )

# Focus on: Other genes

In [None]:
target_genes = ["Foxo1", "Tek", "Nos3", "Htra1", "Egfl8", "Flt1", "Kdr", "Nrp1", "Nrp2", "Efnb2", "Itgb1", "Itga6", "Angpt2", "Cdh5", "Cldn5", "Ocln", "Ctnnb1"]

In [None]:
other_genes_results = {
    gene: utils.summarize_gene_expression(adata, gene, study_name = dataset_name, organism = organism, groupby = "clusters",
                                    output_dir=os.path.join(main_dir, "P06_Foxf2_per_celltype", "Other_genes_summarized"), export=True
                                   ) for gene in target_genes
}

In [None]:
# some plots

In [None]:
sc.pl.matrixplot(adata, [target_genes[0]], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [target_genes[0]], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.matrixplot(adata, [target_genes[1]], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [target_genes[1]], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

# Focus on: Tspo (for Herms lab)

In [None]:
# for Herms lab 02/2025

In [None]:
gene="Tspo"

In [None]:
sc.pl.matrixplot(adata, [gene], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var")

In [None]:
sc.pl.dotplot(adata, [gene], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var")

In [None]:
utils.summarize_gene_expression(adata = adata, layer=None, gene = gene,
                          study_name = dataset_name, organism = organism,
                          export = True, output_dir = os.path.join(main_dir, 'P06_Foxf2_per_celltype', "202502-Tspo-HermsLab", "Genes_summarized")
                          )

In [None]:
# mt genes
# Use the same 'mt-' prefix as the QC annotation (adata.var['mt']) so that only
# mitochondrial gene symbols are selected, not every symbol that begins with "mt".
mt_genes = [gene for gene in adata.var_names if gene.startswith("mt-")]
mt_genes

In [None]:
mt_results = {
    gene: utils.summarize_gene_expression(adata = adata, layer=None, gene = gene, study_name = dataset_name, organism = organism, 
                                    output_dir=os.path.join(main_dir, "P06_Foxf2_per_celltype", "202502-Tspo-HermsLab", "Genes_summarized"), export=True
                                   ) for gene in mt_genes
}

# Correlate gene expression (Foxf2 and Foxo1)

Using MAGIC denoising

In [None]:
import magic
import scprep

In [None]:
gg = ["Foxf2","Foxo1"]

## ECs

In [None]:
adata_EC = adata[adata.obs.clusters == "Endothelial cells"]

In [None]:
adata_EC

In [None]:
#matrix = pd.DataFrame(adata_EC.X) # not compatible with sparse
matrix = adata_EC.X
# NOTE(review): X is not a DataFrame here, so this only attaches an ad-hoc `columns`
# attribute to the matrix object — presumably a leftover from the DataFrame variant
# above; confirm whether anything downstream reads it.
matrix.columns = adata_EC.var.index.tolist()

In [None]:
cutoff_var = 500

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata_EC.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
magic_op = magic.MAGIC()

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_EC, genes=['Foxf2', 'Foxo1', 'Nos3'])

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))

scprep.plot.scatter(x=adata_EC[:,'Foxf2'].X.todense(), y=adata_EC[:,'Foxo1'].X.todense(), c=adata_EC[:,'Nos3'].X.todense(), ax = ax1,
                    xlabel='Foxf2', ylabel='Foxo1', legend_title="Nos3", title='Before MAGIC')

scprep.plot.scatter(x=emt_magic[:,'Foxf2'].X, y=emt_magic[:,'Foxo1'].X, c=emt_magic[:,'Nos3'].X, ax=ax2,
                    xlabel='Foxf2', ylabel='Foxo1', legend_title="Nos3", title='After MAGIC')
plt.axline((0.1,0.1), slope=1, color="black", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

## PCs

In [None]:
# NOTE: the variable keeps the name adata_EC from the EC section, but from here on it
# holds the pericyte subset.
adata_EC = adata[adata.obs.clusters == "Pericytes"]

In [None]:
adata_EC

In [None]:
#matrix = pd.DataFrame(adata_EC.X) # not compatible with sparse 
matrix = adata_EC.X
matrix.columns = adata_EC.var.index.tolist()

In [None]:
cutoff_var = 500

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata_EC.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
magic_op = magic.MAGIC()

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_EC, genes=['Foxf2', 'Foxo1', 'Nos3'])

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))

scprep.plot.scatter(x=adata_EC[:,'Foxf2'].X.todense(), y=adata_EC[:,'Foxo1'].X.todense(), c=adata_EC[:,'Nos3'].X.todense(), ax = ax1,
                    xlabel='Foxf2', ylabel='Foxo1', legend_title="Nos3", title='Before MAGIC')

scprep.plot.scatter(x=emt_magic[:,'Foxf2'].X, y=emt_magic[:,'Foxo1'].X, c=emt_magic[:,'Nos3'].X, ax=ax2,
                    xlabel='Foxf2', ylabel='Foxo1', legend_title="Nos3", title='After MAGIC')
plt.axline((0.1,0.1), slope=1, color="black", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

## SMCs

In [None]:
# NOTE: the variable keeps the name adata_EC from the EC section, but from here on it
# holds the SMC subset.
adata_EC = adata[adata.obs.clusters == "SMCs"]

In [None]:
adata_EC

In [None]:
#matrix = pd.DataFrame(adata_EC.X) # not compatible with sparse 
matrix = adata_EC.X
matrix.columns = adata_EC.var.index.tolist()

In [None]:
cutoff_var = 500

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata_EC.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
magic_op = magic.MAGIC()

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_EC, genes=['Foxf2', 'Foxo1', 'Nos3'])

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))

scprep.plot.scatter(x=adata_EC[:,'Foxf2'].X.todense(), y=adata_EC[:,'Foxo1'].X.todense(), c=adata_EC[:,'Nos3'].X.todense(), ax = ax1,
                    xlabel='Foxf2', ylabel='Foxo1', legend_title="Nos3", title='Before MAGIC')

scprep.plot.scatter(x=emt_magic[:,'Foxf2'].X, y=emt_magic[:,'Foxo1'].X, c=emt_magic[:,'Nos3'].X, ax=ax2,
                    xlabel='Foxf2', ylabel='Foxo1', legend_title="Nos3", title='After MAGIC')
plt.axline((0.1,0.1), slope=1, color="black", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

## All cell types

In [None]:
adata

In [None]:
#matrix = pd.DataFrame(adata.X) # not compatible with sparse 
matrix = adata.X
matrix.columns = adata.var.index.tolist()

In [None]:
cutoff_var = 500

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
magic_op = magic.MAGIC()

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata, genes=['Foxf2', 'Foxo1', 'Nos3'])

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
np.corrcoef(emt_magic[:,['Foxf2','Foxo1']].X, rowvar = False)[0][1]

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))

scprep.plot.scatter(x=adata[:,'Foxf2'].X.todense(), y=adata[:,'Foxo1'].X.todense(), c=adata[:,'Nos3'].X.todense(), ax = ax1,
                    xlabel='Foxf2', ylabel='Foxo1', legend_title="Nos3", title='Before MAGIC')

scprep.plot.scatter(x=emt_magic[:,'Foxf2'].X, y=emt_magic[:,'Foxo1'].X, c=emt_magic[:,'Nos3'].X, ax=ax2,
                    xlabel='Foxf2', ylabel='Foxo1', legend_title="Nos3", title='After MAGIC')

plt.axline((0,0), slope=1, color="black", alpha=0.3, linestyle="--")

plt.tight_layout()
plt.show()

# Correlate gene expression (TSPO and mt genes)  

For Herms lab 02/2025

In [None]:
# Use the same 'mt-' prefix as the QC annotation (adata.var['mt']) so that only
# mitochondrial gene symbols are selected, not every symbol that begins with "mt".
mt_genes = [gene for gene in adata.var_names if gene.startswith("mt-")]
mt_genes

In [None]:
gg = ["Tspo"] + mt_genes

## All Cell Types

Too slow..

In [None]:
#matrix = pd.DataFrame(adata.X) # not compatible with sparse 
matrix = adata.X
matrix.columns = adata.var.index.tolist()

In [None]:
cutoff_var = 500

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
plt.show()

In [None]:
adata.layers

In [None]:
magic_op = magic.MAGIC()

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata, genes=gg)

In [None]:
# Correlate Tspo with every other gene in `gg` across all cells, using the
# MAGIC-imputed values in `emt_magic`, and draw a before/after scatter plot per pair.
gene_x = 'Tspo'
correlations = []  # collects (gene_x, gene_y, Pearson r) tuples for the summary table
for gene_y in gg[1:]:  # Start from index 1 to skip 'Tspo'
    # Off-diagonal entry of the 2x2 correlation matrix of the two imputed gene columns
    corr = np.corrcoef(emt_magic[:, [gene_x, gene_y]].X, rowvar=False)[0, 1]
    correlations.append((gene_x, gene_y, corr))
    print(f"Correlation ({gene_x} vs {gene_y}): {corr:.3f}")

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 6))

    # Left panel: values from adata.X (before imputation), coloured by gene_x
    scprep.plot.scatter(x=adata[:, gene_x].X.todense(), y=adata[:, gene_y].X.todense(),
                        c=adata[:, gene_x].X.todense(), ax=ax1,
                        xlabel=gene_x, ylabel=gene_y, legend_title=gene_x, title='Before MAGIC')
    # Right panel: MAGIC output for the same gene pair
    scprep.plot.scatter(x=emt_magic[:, gene_x].X, y=emt_magic[:, gene_y].X,
                        c=emt_magic[:, gene_x].X, ax=ax2,
                        xlabel=gene_x, ylabel=gene_y, legend_title=gene_x, title='After MAGIC')
    # Dashed slope-1 reference line through (0.5, 0.5), drawn on the current axes
    plt.axline((0.5, 0.5), slope=1, color="black", alpha=0.3, linestyle="--")
    plt.tight_layout()
    plt.show()

In [None]:
correlation_df = pd.DataFrame(correlations, columns=["GeneA", "GeneB", "PearsonR"])
correlation_df["CellType"] = "across_all"
correlation_df = correlation_df.sort_values("PearsonR")
correlation_df

In [None]:
correlation_df.to_excel(os.path.join(main_dir, "P06_Foxf2_per_celltype", "202502-Tspo-HermsLab", date.today().strftime("%Y%m%d") + "_Saunders2018_MAGIC_correlation_results.xlsx"), index=False)

## ECs

In [None]:
celltype = "Endothelial cells"

In [None]:
adata_EC = adata[adata.obs.clusters == celltype]

In [None]:
adata_EC

In [None]:
#matrix = pd.DataFrame(adata_EC.X) # not compatible with sparse 
matrix = adata_EC.X
matrix.columns = adata_EC.var.index.tolist()

In [None]:
cutoff_var = 500

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
plt.show()

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata_EC.layers

In [None]:
magic_op = magic.MAGIC()

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_EC, genes=gg)

In [None]:
# Correlate Tspo with every other gene in `gg` within the selected cell type, using the
# MAGIC-imputed values in `emt_magic`; one before/after scatter plot is saved per pair.
gene_x = 'Tspo'
correlations = []  # collects (gene_x, gene_y, Pearson r) tuples for the summary table

for gene_y in gg[1:]:  # Start from index 1 to skip 'Tspo'
    # Off-diagonal entry of the 2x2 correlation matrix of the two imputed gene columns
    corr = np.corrcoef(emt_magic[:, [gene_x, gene_y]].X, rowvar=False)[0, 1]
    correlations.append((gene_x, gene_y, corr))

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 6))

    # Scatter plot before MAGIC. Uses the cell-type subset that MAGIC was fitted on
    # (adata_EC); the full `adata` was plotted here before, so the two panels showed
    # different sets of cells.
    scprep.plot.scatter(x=adata_EC[:, gene_x].X.todense(), y=adata_EC[:, gene_y].X.todense(),
                        c=adata_EC[:, gene_x].X.todense(), ax=ax1,
                        xlabel=gene_x, ylabel=gene_y, legend_title=gene_x, title='Before MAGIC')

    # Scatter plot after MAGIC
    scprep.plot.scatter(x=emt_magic[:, gene_x].X, y=emt_magic[:, gene_y].X,
                        c=emt_magic[:, gene_x].X, ax=ax2,
                        xlabel=gene_x, ylabel=gene_y, legend_title=gene_x, title='After MAGIC')

    # Add correlation text to the second plot
    ax2.text(0.75, 0.97, f"r = {corr:.2f}", transform=ax2.transAxes,
             fontsize=15, verticalalignment='top', bbox=dict(facecolor='white', alpha=0.5))

    #plt.axline((0.1, 0.01), slope=1, color="black", alpha=0.3, linestyle="--")
    plt.tight_layout()

    # Save plot
    plt.savefig(os.path.join(main_dir, 'P06_Foxf2_per_celltype', '202502-Tspo-HermsLab', 
                            'plots', 'scatter_plots', date.today().strftime("%Y%m%d")+f'_{dataset_name}{organism}_{celltype}_{gene_x}_{gene_y}.png'), 
               dpi=500)

    plt.show()

In [None]:
correlation_df = pd.DataFrame(correlations, columns=["GeneA", "GeneB", "PearsonR"])
correlation_df["CellType"] = celltype
correlation_df = correlation_df.sort_values("PearsonR")
correlation_df

In [None]:
correlation_df.to_excel(os.path.join(main_dir, "P06_Foxf2_per_celltype", "202502-Tspo-HermsLab", date.today().strftime("%Y%m%d") + f"_{dataset_name}{organism}_MAGIC_correlation_results_{celltype}.xlsx"), index=False)

## Microglia/Macrophages

In [None]:
celltype = "Microglia/Macrophages"

In [None]:
adata_sub = adata[adata.obs.clusters == celltype]

In [None]:
adata_sub

In [None]:
#matrix = pd.DataFrame(adata_sub.X) # not compatible with sparse
matrix = adata_sub.X
# NOTE(review): X is not a DataFrame here, so this only attaches an ad-hoc `columns`
# attribute to the matrix object — presumably a leftover from the DataFrame variant
# above; confirm whether anything downstream reads it.
matrix.columns = adata_sub.var.index.tolist()

In [None]:
cutoff_var = 500

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
plt.show()

In [None]:
adata_sub.layers

In [None]:
magic_op = magic.MAGIC()

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_sub, genes=gg)

In [None]:
# Correlate Tspo with every other gene in `gg` within the selected cell type, using the
# MAGIC-imputed values in `emt_magic`; one before/after scatter plot is saved per pair.
gene_x = 'Tspo'
correlations = []  # collects (gene_x, gene_y, Pearson r) tuples for the summary table

for gene_y in gg[1:]:  # Start from index 1 to skip 'Tspo'
    # Off-diagonal entry of the 2x2 correlation matrix of the two imputed gene columns
    corr = np.corrcoef(emt_magic[:, [gene_x, gene_y]].X, rowvar=False)[0, 1]
    correlations.append((gene_x, gene_y, corr))

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 6))

    # Scatter plot before MAGIC. Uses the cell-type subset that MAGIC was fitted on
    # (adata_sub); the full `adata` was plotted here before, so the two panels showed
    # different sets of cells.
    scprep.plot.scatter(x=adata_sub[:, gene_x].X.todense(), y=adata_sub[:, gene_y].X.todense(),
                        c=adata_sub[:, gene_x].X.todense(), ax=ax1,
                        xlabel=gene_x, ylabel=gene_y, legend_title=gene_x, title='Before MAGIC')

    # Scatter plot after MAGIC
    scprep.plot.scatter(x=emt_magic[:, gene_x].X, y=emt_magic[:, gene_y].X,
                        c=emt_magic[:, gene_x].X, ax=ax2,
                        xlabel=gene_x, ylabel=gene_y, legend_title=gene_x, title='After MAGIC')

    # Add correlation text to the second plot
    ax2.text(0.75, 0.97, f"r = {corr:.2f}", transform=ax2.transAxes,
             fontsize=15, verticalalignment='top', bbox=dict(facecolor='white', alpha=0.5))

    #plt.axline((0.1, 0.01), slope=1, color="black", alpha=0.3, linestyle="--")
    plt.tight_layout()

    # Save plot ("/" is stripped from the cell type so it cannot act as a path separator)
    plt.savefig(os.path.join(main_dir, 'P06_Foxf2_per_celltype', '202502-Tspo-HermsLab', 
                            'plots', 'scatter_plots', date.today().strftime("%Y%m%d")+f'_{dataset_name}{organism}_{celltype.replace("/", "")}_{gene_x}_{gene_y}.png'), 
               dpi=500)

    plt.show()

In [None]:
correlation_df = pd.DataFrame(correlations, columns=["GeneA", "GeneB", "PearsonR"])
correlation_df["CellType"] = celltype.replace("/", "")
correlation_df = correlation_df.sort_values("PearsonR")
correlation_df

In [None]:
ct = celltype.replace("/", "")
correlation_df.to_excel(os.path.join(main_dir, "P06_Foxf2_per_celltype", "202502-Tspo-HermsLab", date.today().strftime("%Y%m%d") + f"_{dataset_name}{organism}_MAGIC_correlation_results_{ct}.xlsx"), index=False)

# Save

In [None]:
# Inspect the object that is about to be saved. `ad_sc` is never defined in this
# notebook (NameError on a fresh kernel); the processed object is `adata`.
adata

In [None]:
name='saunders_normalised_logarithmised_annotated'

In [None]:
main_dir

In [None]:
# Saunders
# Linux: writing to mounted drive causes error. save locally then move.
# The object is written to the current user's Downloads folder via an explicit path:
# no os.chdir() (which silently changed the working directory for all later cells) and
# no hard-coded home directory. `ad_sc` was never defined in this notebook, so the
# processed `adata` is saved instead.
local_dir = os.path.join(os.path.expanduser('~'), 'Downloads')
sc.write(adata=adata, filename=os.path.join(local_dir, date.today().strftime("%Y%m%d")+'_'+name+'.h5ad'))

# Session Info

In [None]:
sc.logging.print_versions()