<font size="+3.8">Scanpy single-cell pre-processing</font>  
<font size="+1.5"></font>  

Aim: Preprocess human brain single-cell data from Yang 2022 Nature  
Publication: https://www.nature.com/articles/s41586-021-04369-3

In [None]:
from datetime import date
date.today().strftime('%d/%m/%Y')

In [None]:
import getpass
import os  # still imported here because later cells use `os`

# Record who ran the notebook. getpass.getuser() is used instead of
# os.getlogin(), which needs a controlling terminal and raises OSError in
# many Jupyter/server sessions.
getpass.getuser()

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import matplotlib.pyplot as plt
import random
import seaborn as sns
import platform
from matplotlib.pyplot import rc_context

In [None]:
import utils

In [None]:
# Name of the active conda environment; .get() avoids a KeyError when the
# kernel was not started from inside a conda environment.
conda_env = os.environ.get('CONDA_DEFAULT_ENV', 'not a conda environment')
conda_env

In [None]:
platform.platform()

In [None]:
sc.settings.verbosity = 3

In [None]:
# Root of the shared project folder, chosen per operating system.
# platform.system() replaces os.uname(): os.uname() does not exist on Windows,
# and the former dict literal evaluated it eagerly even when os.name was 'nt'.
if os.name == 'nt':  # Win
    main_dir = r'\\srvisds1.srv.med.uni-muenchen.de\BD-Dichgans\SF'
elif platform.system() == 'Darwin':  # Mac
    main_dir = '/Volumes/BD-Dichgans/SF'
else:  # Linux
    main_dir = '/run/user/1000/gvfs/smb-share:server=138.245.4.35,share=bd-dichgans/SF'
main_dir

In [None]:
dataset_name = "Yang2022"
organism = "Human"

# Load data

## Annotated 
Downloaded from: https://cells.ucsc.edu/?ds=brain-vasc-atlas

In [None]:
ad = sc.read_mtx(os.path.join(main_dir, "P06_vasc_scRNAseq", "Yang2021", "ucsc_cellbrowser_data", "matrix.mtx.gz")).transpose()

In [None]:
ad

In [None]:
# Add var

In [None]:
var = pd.read_csv(os.path.join(main_dir, "P06_vasc_scRNAseq", "Yang2021", "ucsc_cellbrowser_data", "features.tsv.gz"), sep="\t", header=None)
all(var[0] == var[1])

In [None]:
var.index = var[0]
var.index.name = None

In [None]:
ad.var = var[[0]]
ad.var

In [None]:
# Original cellnames

In [None]:
cellnames = pd.read_csv(os.path.join(main_dir, "P06_vasc_scRNAseq", "Yang2021", "ucsc_cellbrowser_data", "barcodes.tsv.gz"), sep="\t", header=None)
# used below to assert order

In [None]:
# Add metadata

In [None]:
meta = pd.read_csv(os.path.join(main_dir, "P06_vasc_scRNAseq", "Yang2021", "ucsc_cellbrowser_data", "meta.tsv"), sep="\t")

In [None]:
ad.obs = meta
ad

In [None]:
ad.obs.columns

In [None]:
ad.obs.head(2)

In [None]:
# Check that the metadata rows follow the barcode order of the matrix.
# Values are compared positionally: the two Series may carry different index
# labels (AnnData stores obs names as strings), and pandas refuses to compare
# differently-labeled Series with `==`.
assert len(ad.obs) == len(cellnames), "metadata and barcode files differ in length"
assert (ad.obs["Cell"].to_numpy() == cellnames[0].to_numpy()).all(), "metadata rows are not in barcode order"

In [None]:
# add umap coords
umap = pd.read_csv(os.path.join(main_dir, "P06_vasc_scRNAseq", "Yang2021", "ucsc_cellbrowser_data", "UMAP.coords.tsv.gz"), sep="\t", header=None)
# NOTE(review): assumes column 0 of the coords file holds the cell id that is
# also stored in ad.obs["Cell"] - confirm against the downloaded file.
umap_cells = umap[0].to_numpy()
obs_cells = ad.obs["Cell"].to_numpy()
if len(umap_cells) != len(obs_cells) or (umap_cells != obs_cells).any():
    # Rows are not in matrix order: align them by cell id instead of attaching
    # coordinates to the wrong cells (raises KeyError if a cell has no coords).
    umap = umap.set_index(0).loc[obs_cells].reset_index()
ad.obsm['X_umap'] = umap[[1,2]].to_numpy()

In [None]:
# Exclude AD cases - keep healthy controls only

In [None]:
ad.obs["Sample"].unique()

In [None]:
# Series.value_counts() replaces the deprecated top-level pd.value_counts()
ad.obs["Sample"].value_counts()

In [None]:
controls = ['C1', 'C2', 'C3', 'C4', 'C5', 'C7', 'C8', 'C9']

In [None]:
# Keep control samples only. .copy() turns the subset into a standalone
# AnnData object; without it `ad` is a view, and the obs columns added in the
# following cells would be written to a view.
ad = ad[ad.obs["Sample"].isin(controls)].copy()

In [None]:
# Series.value_counts() replaces the deprecated top-level pd.value_counts()
ad.obs["Cell_Type"].value_counts()

In [None]:
# Series.value_counts() replaces the deprecated top-level pd.value_counts()
ad.obs["Region"].value_counts()

In [None]:
# Series.value_counts() replaces the deprecated top-level pd.value_counts()
ad.obs["Sample_Region"].value_counts()

In [None]:
with rc_context({'figure.figsize': (9, 7)}):
    sc.pl.umap(ad, color=['Cell_Type'])

In [None]:
# Harmonise the published cell-type labels. Endothelial zonation is kept
# (arterial / capillary / venous) and both fibroblast subtypes are merged.
old_to_new = {
    'Arterial': 'ECs_Arterial',
    'Astrocyte': 'Astrocytes',
    'Capillary': 'ECs_Capillary',
    'Ependymal': 'Ependymal cells',
    'M. Fibro': 'Fibroblasts',
    'Microglia/Mφ': 'Microglia/Macrophages',
    'Neuron': 'Neurons',
    'OPC': 'OPCs',
    'P. Fibro': 'Fibroblasts',
    'Pericyte': 'Pericytes',
    'SMC': 'SMCs',
    'T cell': 'T cells',
    'Veinous': 'ECs_Venous',
    'Oligo': 'Oligos',
}
renamed_cell_types = ad.obs['Cell_Type'].map(old_to_new)
ad.obs['clusters2'] = renamed_cell_types.astype('category')

In [None]:
# Renaming: collapse the three endothelial zonation labels of `clusters2`
# into a single 'ECs' group; all other labels are kept as they are.
# (The duplicated 'Fibroblasts' key of the earlier literal was removed.)
old_to_new = {'ECs_Arterial':'ECs',
              'Astrocytes':'Astrocytes',
              'ECs_Capillary':'ECs',
              'Ependymal cells':'Ependymal cells',
              'Fibroblasts':'Fibroblasts',
              'Microglia/Macrophages':'Microglia/Macrophages',
              'Neurons':'Neurons',
              'OPCs':'OPCs',
              'Pericytes':'Pericytes',
              'SMCs':'SMCs',
              'T cells':'T cells',
              'ECs_Venous':'ECs',
              'Oligos':'Oligos'}
ad.obs['clusters'] = (ad.obs['clusters2'].map(old_to_new).astype('category'))

In [None]:
# Renaming for Judit's differential expression analysis (DEA), 01/2025:
# the three endothelial subtypes are pooled into 'EC'; every other published
# Cell_Type label is kept unchanged.

old_to_new = {'Arterial':'EC',
              'Astrocyte':'Astrocyte',
              'Capillary':'EC',
              'Ependymal':'Ependymal',
              'M. Fibro':'M. Fibro',
              'Microglia/Mφ':'Microglia/Mφ',
              'Neuron':'Neuron',
              'OPC':'OPC',
              'P. Fibro':'P. Fibro',
              'Pericyte':'Pericyte',
              'SMC':'SMC',
              'T cell':'T cell',
              'Veinous':'EC',
              'Oligo':'Oligo'}
ad.obs['clusters_judit'] = (ad.obs['Cell_Type'].map(old_to_new).astype('category'))

In [None]:
with rc_context({'figure.figsize': (9, 7)}):
    sc.pl.umap(ad, color=['clusters'])

In [None]:
with rc_context({'figure.figsize': (9, 7)}):
    sc.pl.umap(ad, color=['clusters2'])

In [None]:
with rc_context({'figure.figsize': (7, 7)}):
    sc.pl.umap(ad, color=['clusters_judit'], legend_loc='on data', legend_fontoutline=2, legend_fontsize=12, legend_fontweight = "normal")

In [None]:
with rc_context({'figure.figsize': (7, 7)}):
    sc.pl.umap(ad, color=['Region'])

In [None]:
pd.crosstab(ad.obs["clusters"], ad.obs["Region"])

In [None]:
adata=ad

In [None]:
del ad

# Standard pipeline

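Clustering and annotation are skipped because the published data is already analyzed (cell-type labels and UMAP coordinates were loaded above).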

Check if data is normalized:

In [None]:
adata.layers

In [None]:
adata.X

In [None]:
sns.histplot(adata.X.sum(1), kde=False)
plt.show()

In [None]:
# show expression of 100 random genes (across all cells)
# A dedicated, seeded generator makes the gene selection reproducible across
# kernel restarts without touching the global `random` state.
random_genes=random.Random(0).sample(range(0, adata.X.shape[1]), 100)
adata_sub = adata[:,random_genes]
exp=pd.DataFrame(adata_sub.X.todense())
# plot
pl1=sns.displot(data=pd.melt(exp),x='value',height=4,hue='variable',kind="kde",warn_singular=False,legend=False,palette=list(np.repeat('#086da6',100)), lw=0.3) # genes with 0 expression are excluded
pl1.set(xlim=(-0.5, 7),ylim=(0,0.007));
plt.show()

In [None]:
sns.set(rc={'figure.figsize':(4,4)})
pl=sns.histplot(data=pd.melt(exp),x='value',binwidth=0.5,legend=True,palette=list(np.repeat('#086da6',100))) # genes with 0 expression are excluded
pl.set(xlim=(0, 10));
sns.set_style("ticks")
plt.show()

Data is not normalized, therefore normalize and log-transform

In [None]:
adata.layers["counts"] = adata.X.copy() # save unnormalized raw RNA counts - retrieve via adata.X = adata.layers["counts"]

In [None]:
sc.pp.normalize_total(adata, inplace=True) # Normalize each cell by total counts over all genes, so that every cell has the same total count after normalization.

In [None]:
# show expression of 100 random genes (across all spots)
adata_sub = adata[:,random_genes]
exp=pd.DataFrame(adata_sub.X.todense())
# plot
pl=sns.displot(data=pd.melt(exp),x='value',height=4,hue='variable',kind="kde",warn_singular=False,legend=False,palette=list(np.repeat('#086da6',100)), lw=0.3) # genes with 0 expression are excluded
pl.set(xlim=(-0.25, 3.5),ylim=(0,0.005))
plt.show()

In [None]:
pl=sns.histplot(data=pd.melt(exp),x='value',binwidth=0.5,legend=True,palette=list(np.repeat('#086da6',100))) # genes with 0 expression are excluded
pl.set(xlim=(0, 10))
plt.show()

In [None]:
sc.pp.log1p(adata) # X = log(X + 1)

In [None]:
# show expression of 100 random genes (across all spots)
adata_sub = adata[:,random_genes]
exp=pd.DataFrame(adata_sub.X.todense())
# plot
pl=sns.displot(data=pd.melt(exp),x='value',height=4,hue='variable',kind="kde",warn_singular=False,legend=False,palette=list(np.repeat('#086da6',100)), lw=0.5) # genes with 0 expression are excluded
pl.set(xlim=(-0.25, 3.5),ylim=(0,0.005));
plt.show()

In [None]:
pl=sns.histplot(data=pd.melt(exp),x='value',binwidth=0.5,legend=True,palette=list(np.repeat('#086da6',100))); # genes with 0 expression are excluded
pl.set(xlim=(0, 10));
plt.show()

In [None]:
sns.histplot(adata.X.sum(1), kde=False)
plt.show()

In [None]:
adata.layers["normalized"] = adata.X.copy() # save normalized + log-transformed (but unscaled) counts - retrieve via adata.X = adata.layers["normalized"]

In [None]:
# Identify highly-variable genes
sc.pp.highly_variable_genes(adata)
sc.pl.highly_variable_genes(adata)

In [None]:
adata

In [None]:
adata.layers

In [None]:
# Basic QC metrics
# adata.X was normalized and log-transformed above, so the metrics are
# computed from the raw counts saved in the "counts" layer; otherwise
# total_counts / pct_counts_mt would be sums of log-normalized values.
adata.var['mt'] = adata.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], layer="counts", percent_top=None, log1p=False, inplace=True)
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],jitter=0.4, multi_panel=True)

# General marker genes

For Judit 01/2025

In [None]:
sc.tl.rank_genes_groups(adata, 'clusters_judit', method='wilcoxon', key_added='clusters_judit')

In [None]:
sc.pl.rank_genes_groups(adata, n_genes=20, sharey=False, key='clusters_judit')

Export DEG table

In [None]:
sc.tl.rank_genes_groups(adata, 'clusters_judit', method='wilcoxon')

In [None]:
adata.uns['rank_genes_groups'].keys()

In [None]:
# Preview the ranking for one cluster. `group` was undefined at this point on
# a fresh kernel (it is first assigned by the export loop below), so the first
# category of `clusters_judit` is used explicitly.
sc.get.rank_genes_groups_df(adata=adata, group=adata.obs['clusters_judit'].cat.categories[0]).head()

In [None]:
# export DEG table
# Collect one ranking table per cell type and concatenate once at the end;
# concatenating onto an empty DataFrame inside the loop is quadratic and
# triggers a pandas FutureWarning.
celltypes = adata.obs.clusters_judit.unique().tolist()
rank_dfs = []
for group in celltypes:
    rank_df = sc.get.rank_genes_groups_df(adata=adata, group=group)
    rank_df["cell_type"] = group
    rank_dfs.append(rank_df)
merged_df = pd.concat(rank_dfs)
merged_df['reference'] = 'all_other_clusters'
merged_df = merged_df.rename(columns={'names': 'gene', 'logfoldchanges':'logFC', 'scores':'score', 'pvals':'pval', 'pvals_adj':'FDR'})
merged_df = merged_df[['gene', 'cell_type', 'logFC', 'score', 'pval', 'FDR', 'reference']]
merged_df.head(3)

In [None]:
# export
with pd.ExcelWriter(os.path.join(main_dir, "P06_Foxf2_per_celltype", "DEA", f"{date.today().strftime('%Y%m%d')}_cell_type_markers_Yang2022.xlsx")) as writer:
    merged_df.to_excel(writer, index=False)

# Vascular marker genes

In [None]:
sc.tl.rank_genes_groups(adata, 'clusters', method='wilcoxon', key_added='clusters')

In [None]:
sc.pl.rank_genes_groups(adata, n_genes=20, sharey=False, key='clusters')

Export DEG table

In [None]:
sc.tl.rank_genes_groups(adata, 'clusters', method='wilcoxon')

In [None]:
# export DEG table
celltypes = adata.obs.clusters.unique().tolist()
rank_dfs = []
for group in celltypes:
    # NOTE(review): the bare name `rank_genes_groups_df` is never imported in
    # this notebook; it is presumably the project helper from `utils` (its
    # keyword names differ from scanpy's sc.get.rank_genes_groups_df) - confirm.
    rank_df = utils.rank_genes_groups_df(adata=adata, group=group, adj_pval_cutoff=None, log2fc_cutoff=None, sortby='zscore')
    rank_dfs.append(rank_df)
# Concatenate once and set the constant column once, outside the loop.
merged_df = pd.concat(rank_dfs)
merged_df['reference'] = 'all other clusters'
merged_df.head(3)

In [None]:
merged_df.group.unique().tolist()

In [None]:
# export as csv
# os.path.join keeps the path valid on every OS (the former '\\' concatenation
# only worked on Windows). The folder is spelled "P06_..." as in the other
# export cells of this notebook.
merged_df.to_csv(os.path.join(main_dir, "P06_Foxf2_per_celltype", "DEA", date.today().strftime("%Y%m%d") + '_DE_genes_Yang2022.csv'), sep=';')

In [None]:
sc.tl.rank_genes_groups(adata, 'clusters2', method='wilcoxon', key_added='clusters2')

In [None]:
sc.pl.rank_genes_groups(adata, n_genes=20, sharey=False, key='clusters2')

Export DEG table

In [None]:
sc.tl.rank_genes_groups(adata, 'clusters2', method='wilcoxon')

In [None]:
celltypes = adata.obs.clusters2.unique().tolist()
rank_dfs = []
for group in celltypes:
    # NOTE(review): the bare name `rank_genes_groups_df` is never imported in
    # this notebook; it is presumably the project helper from `utils` (its
    # keyword names differ from scanpy's sc.get.rank_genes_groups_df) - confirm.
    rank_df = utils.rank_genes_groups_df(adata=adata, group=group, adj_pval_cutoff=None, log2fc_cutoff=None, sortby='zscore')
    rank_dfs.append(rank_df)
# Concatenate once and set the constant column once, outside the loop.
merged_df = pd.concat(rank_dfs)
merged_df['reference'] = 'all other clusters'
merged_df.head(3)

In [None]:
merged_df.group.unique().tolist()

In [None]:
# export as csv
# os.path.join keeps the path valid on every OS (the former '\\' concatenation
# only worked on Windows). The folder is spelled "P06_..." as in the other
# export cells of this notebook.
merged_df.to_csv(os.path.join(main_dir, "P06_Foxf2_per_celltype", "DEA", date.today().strftime("%Y%m%d") + '_DE_genes_Yang2022_incl_zonation.csv'), sep=';')

# Focus on: Foxf2

In [None]:
gene="FOXF2"

## Plot

In [None]:
sc.pl.matrixplot(adata, [gene], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.matrixplot(adata, [gene], groupby='clusters2', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [gene], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [gene], groupby='clusters2', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

## Excl zonation

In [None]:
utils.summarize_gene_expression(adata = adata, groupby = "clusters", layer = "normalized", gene = gene,
                                study_name = dataset_name, organism = organism,
                                export = True, output_dir = os.path.join(main_dir, 'P06_Foxf2_per_celltype', 'Foxf2_summarized')
                               )

## Incl zonation

In [None]:
utils.summarize_gene_expression(adata = adata, groupby = "clusters2", layer = "normalized", gene = gene,
                                study_name = dataset_name, organism = organism,
                                export = True, output_dir = os.path.join(main_dir, 'P06_Foxf2_per_celltype', 'Foxf2_summarized', 'incl_zonation')
                               )

# Focus on: Other genes

In [None]:
target_genes = ["Foxo1", "Tek", "Nos3", "Htra1", "Egfl8", "Flt1", "Kdr", "Nrp1", "Nrp2", "Efnb2", "Itgb1", "Itga6", "Angpt2", "Cdh5", "Cldn5", "Ocln", "Ctnnb1"]
target_genes = [gene.upper() for gene in target_genes]
target_genes

### Excl zonation

In [None]:
groupby = "clusters"

In [None]:
other_genes_results = {
    gene: utils.summarize_gene_expression(adata, gene, study_name = dataset_name, organism = organism, groupby = groupby, 
                                          output_dir=os.path.join(main_dir, "P06_Foxf2_per_celltype", "Other_genes_summarized"), export=True
                                         ) for gene in target_genes
}

In [None]:
# some plots

In [None]:
sc.pl.matrixplot(adata, [target_genes[0]], groupby=groupby, swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [target_genes[0]], groupby=groupby, swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.matrixplot(adata, [target_genes[1]], groupby=groupby, swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [target_genes[1]], groupby=groupby, swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

### Incl zonation

In [None]:
groupby = "clusters2"

In [None]:
other_genes_results = {
    gene: utils.summarize_gene_expression(adata, gene, study_name = dataset_name, organism = organism, groupby = groupby, 
                                          output_dir=os.path.join(main_dir, "P06_Foxf2_per_celltype", "Other_genes_summarized", "incl_zonation"), export=True
                                         ) for gene in target_genes
}

In [None]:
# some plots

In [None]:
sc.pl.matrixplot(adata, [target_genes[0]], groupby=groupby, swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [target_genes[0]], groupby=groupby, swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.matrixplot(adata, [target_genes[1]], groupby=groupby, swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [target_genes[1]], groupby=groupby, swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

# Focus on: Tspo (for Herms lab)

In [None]:
# for Herms lab 02/2025

In [None]:
gene="TSPO"

In [None]:
sc.pl.matrixplot(adata, [gene], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var")

In [None]:
sc.pl.dotplot(adata, [gene], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var")

In [None]:
# The helper lives in `utils` (only `import utils` is executed above), so it
# is called through the module like in the Foxf2 sections.
utils.summarize_gene_expression(adata = adata, layer="normalized", gene = gene,
                                study_name = dataset_name, organism = organism,
                                export = True, output_dir = os.path.join(main_dir, 'P06_Foxf2_per_celltype', "202502-Tspo-HermsLab", "Genes_summarized")
                                )

In [None]:
# mt genes
mt_genes = [gene for gene in adata.var_names if gene.startswith("MT-")]
mt_genes

In [None]:
# One summary per mitochondrial gene. The helper lives in `utils` (only
# `import utils` is executed above), so it is called through the module.
mt_results = {
    gene: utils.summarize_gene_expression(adata = adata, layer="normalized", gene = gene, study_name = dataset_name, organism = organism,
                                          output_dir=os.path.join(main_dir, "P06_Foxf2_per_celltype", "202502-Tspo-HermsLab", "Genes_summarized"), export=True
                                         ) for gene in mt_genes
}

# Correlate gene expression (Foxf2 and Foxo1)

Using MAGIC denoising

In [None]:
import magic
import scprep

In [None]:
#sc.pp.scale(adata)

## ECs

In [None]:
gg = ["FOXF2","FOXO1","NOS3"]

In [None]:
adata_EC = adata[adata.obs.clusters == "ECs"]

In [None]:
adata_EC

In [None]:
adata_EC.layers

In [None]:
#matrix = pd.DataFrame(adata_EC.X) # not compatible with sparse
# Expression matrix of the subset, passed to the library-size plot below.
matrix = adata_EC.X
# NOTE(review): `matrix` is not a DataFrame here, so this only attaches a plain
# attribute to the matrix object; it presumably does not label the columns for
# scprep - confirm whether this line is still needed.
matrix.columns = adata_EC.var.index.tolist()

In [None]:
cutoff_var = None

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata_EC.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
magic_op = magic.MAGIC()

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_EC, genes=gg)

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
np.corrcoef(emt_magic[:,['FOXF2','FOXO1']].X, rowvar = False)[0][1]

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))
scprep.plot.scatter(x=adata_EC[:,'FOXF2'].X.todense(), y=adata_EC[:,'FOXO1'].X.todense(), c=adata_EC[:,'NOS3'].X.todense(), ax = ax1,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='Before MAGIC')
scprep.plot.scatter(x=emt_magic[:,'FOXF2'].X, y=emt_magic[:,'FOXO1'].X, c=emt_magic[:,'NOS3'].X, ax=ax2,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='After MAGIC')
plt.axline((0,0), slope=1, color="black", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

## PCs (pericytes)

In [None]:
gg = ["FOXF2","FOXO1","NOS3"]

In [None]:
adata_EC = adata[adata.obs.clusters == "Pericytes"]

In [None]:
adata_EC

In [None]:
adata_EC.layers

In [None]:
#matrix = pd.DataFrame(adata_EC.X) # not compatible with sparse 
matrix = adata_EC.X
matrix.columns = adata_EC.var.index.tolist()

In [None]:
cutoff_var = None

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata_EC.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
magic_op = magic.MAGIC()

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_EC, genes=gg)

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
np.corrcoef(emt_magic[:,['FOXF2','FOXO1']].X, rowvar = False)[0][1]

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))
scprep.plot.scatter(x=adata_EC[:,'FOXF2'].X.todense(), y=adata_EC[:,'FOXO1'].X.todense(), c=adata_EC[:,'NOS3'].X.todense(), ax = ax1,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='Before MAGIC')
scprep.plot.scatter(x=emt_magic[:,'FOXF2'].X, y=emt_magic[:,'FOXO1'].X, c=emt_magic[:,'NOS3'].X, ax=ax2,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='After MAGIC')
plt.axline((0.4,0.4), slope=1, color="black", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

## All cell types

In [None]:
gg = ["FOXF2","FOXO1","NOS3"]

In [None]:
adata

In [None]:
adata.layers

In [None]:
#matrix = pd.DataFrame(adata_EC.X) # not compatible with sparse 
matrix = adata.X
matrix.columns = adata.var.index.tolist()

In [None]:
cutoff_var = None

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
magic_op = magic.MAGIC()

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata, genes=gg)

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
np.corrcoef(emt_magic[:,['FOXF2','FOXO1']].X, rowvar = False)[0][1]

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))
scprep.plot.scatter(x=adata[:,'FOXF2'].X.todense(), y=adata[:,'FOXO1'].X.todense(), c=adata[:,'NOS3'].X.todense(), ax = ax1,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='Before MAGIC')
scprep.plot.scatter(x=emt_magic[:,'FOXF2'].X, y=emt_magic[:,'FOXO1'].X, c=emt_magic[:,'NOS3'].X, ax=ax2,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='After MAGIC')
plt.axline((0,0), slope=1, color="black", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

## capECs

In [None]:
gg = ["FOXF2","FOXO1","NOS3"]

In [None]:
adata_EC = adata[adata.obs.clusters2 == "ECs_Capillary"]

In [None]:
adata_EC

In [None]:
adata_EC.layers

In [None]:
#matrix = pd.DataFrame(adata_EC.X) # not compatible with sparse 
matrix = adata_EC.X
matrix.columns = adata_EC.var.index.tolist()

In [None]:
cutoff_var = None

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata_EC.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
magic_op = magic.MAGIC()

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_EC, genes=gg)

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
np.corrcoef(emt_magic[:,['FOXF2','FOXO1']].X, rowvar = False)[0][1]

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))
scprep.plot.scatter(x=adata_EC[:,'FOXF2'].X.todense(), y=adata_EC[:,'FOXO1'].X.todense(), c=adata_EC[:,'NOS3'].X.todense(), ax = ax1,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='Before MAGIC')
scprep.plot.scatter(x=emt_magic[:,'FOXF2'].X, y=emt_magic[:,'FOXO1'].X, c=emt_magic[:,'NOS3'].X, ax=ax2,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='After MAGIC')
plt.axline((0,0), slope=1, color="black", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

## aECs

In [None]:
gg = ["FOXF2","FOXO1","NOS3"]

In [None]:
adata_EC = adata[adata.obs.clusters2 == "ECs_Arterial"]

In [None]:
adata_EC

In [None]:
adata_EC.layers

In [None]:
#matrix = pd.DataFrame(adata_EC.X) # not compatible with sparse 
matrix = adata_EC.X
matrix.columns = adata_EC.var.index.tolist()

In [None]:
cutoff_var = None

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata_EC.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
magic_op = magic.MAGIC()

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_EC, genes=gg)

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
np.corrcoef(emt_magic[:,['FOXF2','FOXO1']].X, rowvar = False)[0][1]

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))
scprep.plot.scatter(x=adata_EC[:,'FOXF2'].X.todense(), y=adata_EC[:,'FOXO1'].X.todense(), c=adata_EC[:,'NOS3'].X.todense(), ax = ax1,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='Before MAGIC')
scprep.plot.scatter(x=emt_magic[:,'FOXF2'].X, y=emt_magic[:,'FOXO1'].X, c=emt_magic[:,'NOS3'].X, ax=ax2,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='After MAGIC')
plt.axline((0,0), slope=1, color="black", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

## vECs

In [None]:
gg = ["FOXF2","FOXO1","NOS3"]

In [None]:
adata_EC = adata[adata.obs.clusters2 == "ECs_Venous"]

In [None]:
adata_EC

In [None]:
adata_EC.layers

In [None]:
#matrix = pd.DataFrame(adata_EC.X) # not compatible with sparse 
matrix = adata_EC.X
matrix.columns = adata_EC.var.index.tolist()

In [None]:
cutoff_var = None

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata_EC.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
magic_op = magic.MAGIC()

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_EC, genes=gg)

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
np.corrcoef(emt_magic[:,['FOXF2','FOXO1']].X, rowvar = False)[0][1]

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))
scprep.plot.scatter(x=adata_EC[:,'FOXF2'].X.todense(), y=adata_EC[:,'FOXO1'].X.todense(), c=adata_EC[:,'NOS3'].X.todense(), ax = ax1,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='Before MAGIC')
scprep.plot.scatter(x=emt_magic[:,'FOXF2'].X, y=emt_magic[:,'FOXO1'].X, c=emt_magic[:,'NOS3'].X, ax=ax2,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='After MAGIC')
plt.axline((0,0), slope=1, color="black", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

# Correlate gene expression (TSPO and mt genes)  


For Herms lab 02/2025

In [None]:
mt_genes = [gene for gene in adata.var_names if gene.startswith("MT-")]
mt_genes

In [None]:
gg = ["TSPO"] + mt_genes

## All cell types

In [None]:
celltype = "AllCellTypes"

In [None]:
#matrix = pd.DataFrame(adata.X) # not compatible with sparse 
matrix = adata.X
matrix.columns = adata.var.index.tolist()

In [None]:
cutoff_var = None

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
plt.show()

In [None]:
adata.layers

In [None]:
magic_op = magic.MAGIC()

In [None]:
# remove unexpressed genes
adata.shape

In [None]:
# Keep genes with a non-zero total. The per-gene sums are flattened to a 1-D
# boolean mask, and the subset is copied so `adata` stays a standalone object
# instead of becoming a view of itself.
expressed_genes = np.asarray(adata.X.sum(axis=0)).ravel() > 0
adata = adata[:, expressed_genes].copy()
adata.shape

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata, genes=gg)

In [None]:
# Correlate MAGIC-denoised TSPO expression with every other gene in `gg` and
# save one before/after scatter figure per gene pair.
gene_x = 'TSPO'
correlations = []  # (gene_x, gene_y, Pearson r) tuples, one per gene pair

for gene_y in gg[1:]:  # Start from index 1 to skip 'TSPO' itself
    # Pearson r between the two denoised expression columns
    corr = np.corrcoef(emt_magic[:, [gene_x, gene_y]].X, rowvar=False)[0, 1]
    correlations.append((gene_x, gene_y, corr))

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 6))

    # Scatter plot before MAGIC (values taken from adata.X)
    scprep.plot.scatter(x=adata[:, gene_x].X.todense(), y=adata[:, gene_y].X.todense(),
                        c=adata[:, gene_x].X.todense(), ax=ax1,
                        xlabel=gene_x, ylabel=gene_y, legend_title=gene_x, title='Before MAGIC')

    # Scatter plot after MAGIC
    scprep.plot.scatter(x=emt_magic[:, gene_x].X, y=emt_magic[:, gene_y].X,
                        c=emt_magic[:, gene_x].X, ax=ax2,
                        xlabel=gene_x, ylabel=gene_y, legend_title=gene_x, title='After MAGIC')

    # Add correlation text to the second plot
    ax2.text(0.75, 0.97, f"r = {corr:.2f}", transform=ax2.transAxes,
             fontsize=15, verticalalignment='top', bbox=dict(facecolor='white', alpha=0.5))

    #plt.axline((0.1, 0.1), slope=1, color="black", alpha=0.3, linestyle="--")
    plt.tight_layout()

    # Save plot (file name: date, dataset + organism, cell type, gene pair)
    plt.savefig(os.path.join(main_dir, 'P06_Foxf2_per_celltype', '202502-Tspo-HermsLab', 
                            'plots', 'scatter_plots', date.today().strftime("%Y%m%d")+f'_{dataset_name}{organism}_{celltype}_{gene_x}_{gene_y}.png'), 
               dpi=500)

    plt.show()

In [None]:
correlation_df = pd.DataFrame(correlations, columns=["GeneA", "GeneB", "PearsonR"])
correlation_df["CellType"] = celltype
correlation_df = correlation_df.sort_values("PearsonR")
correlation_df

In [None]:
correlation_df.to_excel(os.path.join(main_dir, "P06_Foxf2_per_celltype", "202502-Tspo-HermsLab", date.today().strftime("%Y%m%d") + f"_{dataset_name}{organism}_MAGIC_correlation_results_{celltype}.xlsx"), index=False)

## ECs

In [None]:
celltype = "ECs"

In [None]:
adata_EC = adata[adata.obs.clusters == celltype]

In [None]:
adata_EC

In [None]:
adata_EC.layers

In [None]:
# Expression matrix of the EC subset, passed to scprep's library-size plot below.
# It is kept in its native form: wrapping it in a pd.DataFrame was found not to be
# compatible with sparse data, and a scipy/numpy matrix has no column labels, so
# no `.columns` attribute is assigned (it would be unused on a sparse matrix and
# raise AttributeError on a dense ndarray).
matrix = adata_EC.X

In [None]:
# Value forwarded as `cutoff` to scprep.plot.plot_library_size in the next cell; None here.
cutoff_var = None

In [None]:
# Call scprep's library-size plot on the EC matrix, forwarding the cutoff chosen above.
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# Render the current matplotlib figure.
plt.show()

In [None]:
# Display the layers of the EC subset again before running MAGIC.
adata_EC.layers

In [None]:
# Fresh MAGIC operator with default parameters for the EC subset.
# NOTE(review): no random_state is passed - consider fixing one if run-to-run reproducibility matters.
magic_op = magic.MAGIC()

In [None]:
# Shape (cells x genes) of the EC subset before unexpressed genes are removed in the next cell.
adata_EC.shape

In [None]:
# Keep only genes whose summed expression over the EC cells is positive,
# then display the resulting shape.
gene_totals = np.ravel(adata_EC.X.sum(axis=0))  # flatten the per-gene column sums to 1-D
adata_EC = adata_EC[:, gene_totals > 0]
adata_EC.shape

In [None]:
%%time
# Fit MAGIC on the EC subset and request the result for the genes listed in `gg`;
# the output is indexed by gene name and read through `.X` in the cells below.
emt_magic = magic_op.fit_transform(adata_EC, genes=gg)

In [None]:
# Correlate MAGIC-imputed TSPO with each partner gene in the EC subset and
# save one before/after scatter figure per gene pair.
gene_x = 'TSPO'
correlations = []  # one (gene_x, gene_y, Pearson r) tuple per partner gene

for gene_y in gg[1:]:  # skips gg[0]; assumes gg[0] is gene_x ('TSPO') - verify where gg is defined
    # rowvar=False treats the two gene columns as the variables; [0, 1] is their correlation
    corr = np.corrcoef(emt_magic[:, [gene_x, gene_y]].X, rowvar=False)[0, 1]
    correlations.append((gene_x, gene_y, corr))

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 6))

    # Scatter plot before MAGIC.
    # Read from the EC subset (not the full `adata`) so both panels show the
    # same cells that MAGIC was fitted on and that `corr` refers to.
    scprep.plot.scatter(x=adata_EC[:, gene_x].X.todense(), y=adata_EC[:, gene_y].X.todense(),
                        c=adata_EC[:, gene_x].X.todense(), ax=ax1,
                        xlabel=gene_x, ylabel=gene_y, legend_title=gene_x, title='Before MAGIC')

    # Scatter plot after MAGIC
    scprep.plot.scatter(x=emt_magic[:, gene_x].X, y=emt_magic[:, gene_y].X,
                        c=emt_magic[:, gene_x].X, ax=ax2,
                        xlabel=gene_x, ylabel=gene_y, legend_title=gene_x, title='After MAGIC')

    # Add correlation text to the second plot (axes-relative coordinates, upper right)
    ax2.text(0.75, 0.97, f"r = {corr:.2f}", transform=ax2.transAxes,
             fontsize=15, verticalalignment='top', bbox=dict(facecolor='white', alpha=0.5))

    #plt.axline((0.1, 0.1), slope=1, color="black", alpha=0.3, linestyle="--")
    plt.tight_layout()

    # Save plot: file name is <YYYYMMDD>_<dataset><organism>_<celltype>_<gene_x>_<gene_y>.png
    plt.savefig(os.path.join(main_dir, 'P06_Foxf2_per_celltype', '202502-Tspo-HermsLab',
                            'plots', 'scatter_plots', date.today().strftime("%Y%m%d")+f'_{dataset_name}{organism}_{celltype}_{gene_x}_{gene_y}.png'),
               dpi=500)

    plt.show()
    # Release the figure so that many dpi=500 figures do not accumulate in memory
    plt.close(fig)

In [None]:
# Turn the collected (gene_x, gene_y, r) tuples into a table, tag every row
# with the current cell type, and order rows from lowest to highest Pearson r.
correlation_df = (
    pd.DataFrame(correlations, columns=["GeneA", "GeneB", "PearsonR"])
    .assign(CellType=celltype)
    .sort_values("PearsonR")
)
correlation_df

In [None]:
# Export the EC correlation table; the file name carries today's date, dataset, organism and cell type.
xlsx_name = date.today().strftime("%Y%m%d") + f"_{dataset_name}{organism}_MAGIC_correlation_results_{celltype}.xlsx"
correlation_df.to_excel(
    os.path.join(main_dir, "P06_Foxf2_per_celltype", "202502-Tspo-HermsLab", xlsx_name),
    index=False,
)

## Microglia/Macrophages

In [None]:
# Cluster label for this section. It contains a "/", which the cells below
# strip with .replace("/", "") before using the label in file names.
celltype = "Microglia/Macrophages"

In [None]:
# Restrict the dataset to the cells whose `clusters` annotation equals `celltype`.
in_celltype = adata.obs["clusters"] == celltype
adata_MM = adata[in_celltype]

In [None]:
# Display the summary of the microglia/macrophage subset.
adata_MM

In [None]:
# Show which layers the microglia/macrophage subset carries.
adata_MM.layers

In [None]:
# Expression matrix of the microglia/macrophage subset, passed to scprep's
# library-size plot below. It is kept in its native form: wrapping it in a
# pd.DataFrame was found not to be compatible with sparse data, and a
# scipy/numpy matrix has no column labels, so no `.columns` attribute is
# assigned (it would be unused on a sparse matrix and raise AttributeError on
# a dense ndarray).
matrix = adata_MM.X

In [None]:
# Value forwarded as `cutoff` to scprep.plot.plot_library_size in the next cell; None here.
cutoff_var = None

In [None]:
# Call scprep's library-size plot on the microglia/macrophage matrix, forwarding the cutoff chosen above.
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# Render the current matplotlib figure.
plt.show()

In [None]:
# Display the layers of the microglia/macrophage subset again before running MAGIC.
adata_MM.layers

In [None]:
# Fresh MAGIC operator with default parameters for the microglia/macrophage subset
# (replaces the operator used in the previous section).
# NOTE(review): no random_state is passed - consider fixing one if run-to-run reproducibility matters.
magic_op = magic.MAGIC()

In [None]:
# Shape (cells x genes) of the microglia/macrophage subset before unexpressed genes are removed in the next cell.
adata_MM.shape

In [None]:
# Keep only genes whose summed expression over the microglia/macrophage cells
# is positive, then display the resulting shape.
gene_totals = np.ravel(adata_MM.X.sum(axis=0))  # flatten the per-gene column sums to 1-D
adata_MM = adata_MM[:, gene_totals > 0]
adata_MM.shape

In [None]:
%%time
# Fit MAGIC on the microglia/macrophage subset and request the result for the genes in `gg`;
# this overwrites `emt_magic` from the previous section.
emt_magic = magic_op.fit_transform(adata_MM, genes=gg)

In [None]:
# Correlate MAGIC-imputed TSPO with each partner gene in the
# microglia/macrophage subset and save one before/after scatter figure per pair.
gene_x = 'TSPO'
correlations = []  # one (gene_x, gene_y, Pearson r) tuple per partner gene

for gene_y in gg[1:]:  # skips gg[0]; assumes gg[0] is gene_x ('TSPO') - verify where gg is defined
    # rowvar=False treats the two gene columns as the variables; [0, 1] is their correlation
    corr = np.corrcoef(emt_magic[:, [gene_x, gene_y]].X, rowvar=False)[0, 1]
    correlations.append((gene_x, gene_y, corr))

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 6))

    # Scatter plot before MAGIC.
    # Read from the microglia/macrophage subset (not the full `adata`) so both
    # panels show the same cells that MAGIC was fitted on and that `corr` refers to.
    scprep.plot.scatter(x=adata_MM[:, gene_x].X.todense(), y=adata_MM[:, gene_y].X.todense(),
                        c=adata_MM[:, gene_x].X.todense(), ax=ax1,
                        xlabel=gene_x, ylabel=gene_y, legend_title=gene_x, title='Before MAGIC')

    # Scatter plot after MAGIC
    scprep.plot.scatter(x=emt_magic[:, gene_x].X, y=emt_magic[:, gene_y].X,
                        c=emt_magic[:, gene_x].X, ax=ax2,
                        xlabel=gene_x, ylabel=gene_y, legend_title=gene_x, title='After MAGIC')

    # Add correlation text to the second plot (axes-relative coordinates, upper right)
    ax2.text(0.75, 0.97, f"r = {corr:.2f}", transform=ax2.transAxes,
             fontsize=15, verticalalignment='top', bbox=dict(facecolor='white', alpha=0.5))

    #plt.axline((0.1, 0.1), slope=1, color="black", alpha=0.3, linestyle="--")
    plt.tight_layout()

    # Save plot; the "/" in the cell-type label is stripped so it is not read as a path separator
    plt.savefig(os.path.join(main_dir, 'P06_Foxf2_per_celltype', '202502-Tspo-HermsLab',
                            'plots', 'scatter_plots', date.today().strftime("%Y%m%d")+f'_{dataset_name}{organism}_{celltype.replace("/", "")}_{gene_x}_{gene_y}.png'),
               dpi=500)

    plt.show()
    # Release the figure so that many dpi=500 figures do not accumulate in memory
    plt.close(fig)

In [None]:
# Turn the collected (gene_x, gene_y, r) tuples into a table, tag every row with
# the cell-type label (with its "/" removed), and order rows from lowest to highest Pearson r.
correlation_df = (
    pd.DataFrame(correlations, columns=["GeneA", "GeneB", "PearsonR"])
    .assign(CellType=celltype.replace("/", ""))
    .sort_values("PearsonR")
)
correlation_df

In [None]:
# Export the microglia/macrophage correlation table. The "/" is dropped from the
# label so that it cannot act as a path separator in the file name.
ct = celltype.replace("/", "")
xlsx_name = date.today().strftime("%Y%m%d") + f"_{dataset_name}{organism}_MAGIC_correlation_results_{ct}.xlsx"
correlation_df.to_excel(
    os.path.join(main_dir, "P06_Foxf2_per_celltype", "202502-Tspo-HermsLab", xlsx_name),
    index=False,
)

# Session Info

In [None]:
# Print package versions via scanpy's logging helper to record the software environment of this run.
sc.logging.print_versions()