<font size="+3.8">Scanpy single-cell pre-processing</font>  
<font size="+1.5"></font>  

Aim: Preprocess own mouse brain single-cell dataset

Mice: male, C57BL/6J, 8 weeks old

In [None]:
from datetime import date
date.today().strftime('%d/%m/%Y')

In [None]:
import getpass
import os

# Record who ran the notebook. getpass.getuser() checks the LOGNAME/USER/LNAME/USERNAME
# environment variables before falling back to the password database, so it also works
# in kernels without a controlling terminal, where os.getlogin() raises OSError.
getpass.getuser()

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import platform
from matplotlib.pyplot import rc_context

In [None]:
import utils

In [None]:
# conda env; .get() keeps the notebook runnable when the kernel was not started from conda
os.environ.get('CONDA_DEFAULT_ENV', 'CONDA_DEFAULT_ENV not set')

In [None]:
platform.platform()

In [None]:
import warnings
# Silence every FutureWarning and UserWarning for the rest of the session.
# Note: the filters are global, so warnings of these categories from any library are hidden too.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [None]:
# Project root on the lab share. Pick the mount point that matches the current OS
# instead of assigning both and letting the last assignment win.
if platform.system() == 'Windows':
    # raw string: backslashes of the UNC path are taken literally (no invalid escape sequences)
    main_dir = r'\\srvisds1.srv.med.uni-muenchen.de\BD-Dichgans\SF' # Win
else:
    main_dir = '/Volumes/BD-Dichgans/SF' # Mac

In [None]:
dataset_name = "OwnData"
organism = "Mouse"

# Load data

## Raw

In [None]:
# Read the filtered 10x feature-barcode matrix.
# The path is assembled with os.path.join so it resolves with either main_dir (Windows share or macOS mount).
adata = sc.read_10x_h5(os.path.join(main_dir, "P06_Foxf2_per_celltype", "scRNAseq", "Steffi_mouse_brain_scRNAseq", "Sample_22L000027_Naive", "outs", "filtered_feature_bc_matrix.h5"))

In [None]:
adata.var_names_make_unique()
adata.obs_names_make_unique()

In [None]:
adata

## Previous work

In [None]:
# load final file (the h5ad saved on `date_set`; per its file name it is already normalised, log-transformed and annotated)
date_set='20230310'
adata=sc.read_h5ad(os.path.join(main_dir,'P06_Foxf2_per_celltype', 'scRNAseq', 'Steffi_mouse_brain_scRNAseq', 'h5ad', date_set+'_heindl_normalised_logarithmised_annotated.h5ad'))
# NOTE(review): presumably a workaround for scanpy code that looks up adata.uns['log1p']['base'] after an
# .h5ad round-trip — confirm. This line raises KeyError if 'log1p' is missing from adata.uns.
adata.uns['log1p']['base'] = None

In [None]:
# Store a copy rather than a reference: without .copy() the layer and adata.X share one matrix,
# so any later in-place change to X would silently change the "normalized" layer as well.
adata.layers["normalized"] = adata.X.copy()
adata.X.max()

In [None]:
adata

# QC

In [None]:
# genes with highest fraction of counts per cell
sc.pl.highest_expr_genes(adata, n_top=20, )

In [None]:
# Basic filtering
#sc.pp.filter_cells(adata, min_genes=200)
#sc.pp.filter_genes(adata, min_cells=3)

# do not filter genes for Tangram!

In [None]:
# Basic QC
adata.var['mt'] = adata.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

In [None]:
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')

In [None]:
# # Filtering
# adata = adata[adata.obs.n_genes_by_counts < 7000, :]
# adata = adata[adata.obs.n_genes_by_counts > 250, :]
# adata = adata[adata.obs.pct_counts_mt < 15, :]

In [None]:
# Filtering: keep cells with 200 < detected genes < 7000 and < 20 % mitochondrial counts.
# One combined mask plus .copy() yields a standalone AnnData instead of a view of a view of a view,
# so the later in-place annotations (adata.var[...], adata.layers[...]) do not act on a view.
keep_cells = (
    (adata.obs.n_genes_by_counts < 7000)
    & (adata.obs.n_genes_by_counts > 200)
    & (adata.obs.pct_counts_mt < 20)
)
adata = adata[keep_cells.values, :].copy()

In [None]:
adata

In [None]:
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')

In [None]:
# Basic QC
adata.var['mt'] = adata.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

# Normalisation, logarithmization

In [None]:
adata.layers

In [None]:
# show expression of 100 random genes (across all cells)
import random
import seaborn as sns
# Draw from a dedicated, seeded generator so the same 100 genes are selected on every run;
# `random_genes` is reused for the plots after normalisation and after log-transformation.
random_genes=random.Random(0).sample(range(0, adata.X.shape[1]), 100)
adata_sub = adata[:,random_genes]
exp=pd.DataFrame(adata_sub.X.todense())
# plot: one KDE line per gene (hue='variable'), all drawn in the same colour
pl1=sns.displot(data=pd.melt(exp),x='value',height=4,hue='variable',kind="kde",warn_singular=False,legend=False,palette=list(np.repeat('#086da6',100)), lw=0.3) # genes with 0 expression are excluded
pl1.set(xlim=(-0.5, 7),ylim=(0,0.007));

In [None]:
sns.set(rc={'figure.figsize':(4,4)})
sns.set_theme(style='white')
pl=sns.histplot(data=pd.melt(exp),x='value',binwidth=0.5,legend=True,palette=list(np.repeat('#086da6',100))) # genes with 0 expression are excluded
pl.set(xlim=(0, 20),ylim=(0,1e5));

In [None]:
adata.layers["counts"] = adata.X.copy() # save unnormalized raw RNA counts - retrieve via adata.X = adata.layers["counts"]

In [None]:
sc.pp.normalize_total(adata, inplace=True) # Normalize each cell by total counts over all genes, so that every cell has the same total count after normalization.

In [None]:
# show expression of 100 random genes (across all spots)
adata_sub = adata[:,random_genes]
exp=pd.DataFrame(adata_sub.X.todense())
# plot
pl=sns.displot(data=pd.melt(exp),x='value',height=4,hue='variable',kind="kde",warn_singular=False,legend=False,palette=list(np.repeat('#086da6',100)), lw=0.3) # genes with 0 expression are excluded
pl.set(xlim=(-0.25, 3.5),ylim=(0,0.005))

In [None]:
pl=sns.histplot(data=pd.melt(exp),x='value',binwidth=0.5,legend=True,palette=list(np.repeat('#086da6',100))) # genes with 0 expression are excluded
pl.set(xlim=(0, 20),ylim=(0,1e5));

In [None]:
sc.pp.log1p(adata) # X = log(X + 1)

In [None]:
# show expression of 100 random genes (across all spots)
adata_sub = adata[:,random_genes]
exp=pd.DataFrame(adata_sub.X.todense())
# plot
pl=sns.displot(data=pd.melt(exp),x='value',height=4,hue='variable',kind="kde",warn_singular=False,legend=False,palette=list(np.repeat('#086da6',100)), lw=0.5) # genes with 0 expression are excluded
pl.set(xlim=(-0.25, 3.5),ylim=(0,0.005));

In [None]:
pl=sns.histplot(data=pd.melt(exp),x='value',binwidth=0.5,legend=True,palette=list(np.repeat('#086da6',100))); # genes with 0 expression are excluded
pl.set(xlim=(0, 20),ylim=(0,1e5));

In [None]:
adata.layers["normalized"] = adata.X.copy() # save normalized + log-transformed (but unscaled) counts - retrieve via adata.X = adata.layers["normalized"]

In [None]:
# Identify highly-variable genes
sc.pp.highly_variable_genes(adata)
sc.pl.highly_variable_genes(adata)

In [None]:
adata

In [None]:
adata.layers

# Dim Reduction

In [None]:
# Run PCA
with rc_context({'figure.figsize': (8, 8)}):
    sc.tl.pca(adata, svd_solver='arpack')
    sc.pl.pca(adata, color='Foxf2')

In [None]:
sc.pl.pca_variance_ratio(adata, log=True)

In [None]:
sc.pp.neighbors(adata)

In [None]:
# Run UMAP
sc.tl.umap(adata)

In [None]:
list(adata.obs.columns)

In [None]:
with rc_context({'figure.figsize': (9, 9)}):
    sc.pl.umap(adata, color=['n_genes_by_counts', 'pct_counts_mt'], wspace=0.2)

# Clustering

In [None]:
# Run Leiden clustering
sc.tl.leiden(adata, resolution=1.1)

In [None]:
with rc_context({'figure.figsize': (10, 10)}):
    sc.pl.umap(adata, color=['leiden'])

# Cell annotation

In [None]:
adatatemp = adata.copy()

## Cell Cycle

In [None]:
# Marker genes per cell-cycle phase (phase name -> list of gene symbols).
# Note: the variable name is spelled `cell_cyle_genes` (sic) and is referenced under that spelling when scoring.
cell_cyle_genes = {
"G1":["Ccne1","Pttg1"],
"G1/S":["Slbp","Cdca7","Ung","Cdc6","Pcna","Mcm5","Orc1","Dtl"],
"S":["Rrm2","E2f8","Atad2"],
"G2":["Ccnf","Cdca8","Hjurp","Cdk1","Top2a","Kif23"],
"G2/M":["Kpna2","Bub1b","Tacc3"],
"M":["Plk1","Mapk13","Aurka","Tpx2"]
}

Score groups:

In [None]:
def sort_and_score(adatatemp, marker_genes):
    """Score each marker-gene group that has at least one gene present in the data.

    Parameters
    ----------
    adatatemp
        Annotated data object. Gene names are looked up in ``adatatemp.var.index``
        and the object is handed to ``sc.tl.score_genes`` once per retained group.
    marker_genes
        Dict mapping a group name to a list of marker gene names.

    Returns
    -------
    tuple
        ``(adatatemp, marker_genes_in_data)``: the same object that was passed in,
        and a dict holding, per group, only the markers found in the data.
        Groups without any detected marker are left out.
    """
    # restrict every group to the markers that actually occur in the data
    detected = {
        group: [name for name in candidates if name in adatatemp.var.index]
        for group, candidates in marker_genes.items()
    }
    marker_genes_in_data = {group: found for group, found in detected.items() if found}

    # one score per retained group; the group name is used as score_name
    for group, found in marker_genes_in_data.items():
        sc.tl.score_genes(adatatemp, found, score_name=group)

    return adatatemp, marker_genes_in_data

In [None]:
adatatemp, marker_genes_in_data = sort_and_score(adatatemp, cell_cyle_genes)

In [None]:
with rc_context({'figure.figsize': (6,6)}):
    sc.pl.umap(
    adatatemp,
    color=marker_genes_in_data.keys(), 
    frameon= False, ncols=3,
    cmap='RdYlBu_r', vmin=0, vmax=0.3, 
    size=14, sort_order=True
    )

Individual marker genes:

In [None]:
# One UMAP panel set per marker group, coloured by the expression of each marker gene in that group.
for ct in marker_genes_in_data:
    print(f"{ct.upper()}:")  # print group name (at this point: the cell-cycle phase)
    sc.pl.umap(
        adata,
        color=marker_genes_in_data[ct],
        vmin=0,
        vmax=0.3,
        #vmax="p99",  # set vmax to the 99th percentile of the gene count instead of the maximum, to prevent outliers from making expression in other cells invisible. Note that this can cause problems for extremely lowly expressed genes.
        sort_order=True,  # plot cells with the highest expression on top; note that this can give a biased impression of the mean expression among cells (set to False to avoid it)
        frameon=False, cmap="Reds",
    )
    print("\n\n\n")  # print white space for legibility

## Cell types

|Vascular     |EC         |Pericytes|SMCs   |Fibroblasts|Oligos|OPCs         |Ependymal|Neurons    |immature/migrating Neurons|Astrocytes|Microglia|Immune (broad/hematopoietic)|Macrophages     |Macrophages/Microglia|Monocytes|Monocytes/B-cells|Granulocytes|B-cells|T/NK cells|
|---          |---        |---      |---    |---        |---   |---          |---      |---        |---   |---       |---      |---                        |---             |---|---|---|---|---|---|
|PDGFRA =CD140A|CLDN5      |VTN      |ACTA2  |DCN        |MBP   |CSPG4 =NG2    |PIFO     |RBFOX3 =NEUN|DCX   |AQP4      |AIF1     |PTPRC =CD45                 |CD14            |TREM2|CCR2|CD74|CD16/32|CD19|CD4|
|MCAM =CD146   |PECAM1 =CD31|PDGFRB   |MYOCD  |COL6A1     |ENPP2 |PDGFRA =CD140A|FOXJ1    |TUBB3      ||          |         |                           |ITGB2 =CD18 =CD11B||||ITGB2 =CD18 =CD11B||CD8A|
|FOXF2        |           |         |       |COL3A1     |      |             |DYNLRB2  |           ||          |         |                           |CD86            ||||CD15||CD8B|
|             |           |         |       |           |      |             |MEIG1    |           ||          |         |                           |ADGRE1 =F4/80    ||||||IL2RB|
||||||||||||||||||||IFNG|

In [None]:
# Marker genes per cell type (group name -> list of gene symbols); each group is scored with sort_and_score below.
marker_genes = {
'Vascular': ['Pdgfra','Mcam'],
'ECs': ['Cldn5', 'Pecam1'], 
'Pericytes': ['Vtn','Pdgfrb'],
'SMCs': ['Acta2', 'Myocd'], 
'Fibroblasts': ['Dcn', 'Col6a1', 'Col3a1'], 
'Oligodendrocytes': ['Mbp', 'Enpp2'], 
'OPCs': ['Cspg4', 'Pdgfra'], 
'Ependymal cells': ['Pifo','Foxj1','Dynlrb2','Meig1'],
'Neurons': ['Rbfox3', 'Tubb3'], 
'immature/migrating Neurons': ['Dcx'], 
'Astrocytes': ['Aqp4', 'Aldoc'], 
'Microglia': ['Aif1', 'Tmem119'], 
'Immune': ['Ptprc','Trem2'],
'Monocytes/Macrophages': ['Cd14', 'Itgb2', 'Cd86', 'Adgre1','Ccr2'], 
# NOTE(review): key is spelled 'Mononcytes' (sic); it becomes the score name, so renaming it changes downstream column names
'Mononcytes/B-cells': ['Cd74'],
'B cells': ['Cd19'], 
'T/NK cells': ['Cd4','Cd8a','Cd8b','Cd3e', 'Il2rb', 'Lat','Ifng'],
# NOTE(review): 'Cd16' and 'Cd15' look like CD antigen names rather than mouse gene symbols (presumably Fcgr3/Fcgr4 and Fut4 — confirm);
# sort_and_score silently drops markers that are not in adata.var.index
'Granulocytes': ['Cd16','Itgb2','Cd15'],
'Neutrophils': ['S100a9']
}

Score groups:

In [None]:
# load above
adatatemp, marker_genes_in_data = sort_and_score(adatatemp, marker_genes)

In [None]:
with rc_context({'figure.figsize': (6,6)}):
    sc.pl.umap(
    adatatemp,
    color=marker_genes_in_data.keys(), 
    frameon= False, ncols=3,
    cmap='RdYlBu_r', vmin=0, vmax=0.3, 
    size=10, sort_order=True
    )

Individual marker genes:

In [None]:
# One UMAP panel set per marker group, coloured by the expression of each marker gene in that group.
for ct in marker_genes_in_data:
    print(f"{ct.upper()}:")  # print cell subtype name
    sc.pl.umap(
        adata,
        size=8,
        color=marker_genes_in_data[ct],
        vmin=0,
        vmax=0.9,
        #vmax="p99",  # set vmax to the 99th percentile of the gene count instead of the maximum, to prevent outliers from making expression in other cells invisible. Note that this can cause problems for extremely lowly expressed genes.
        sort_order=True,  # plot cells with the highest expression on top; note that this can give a biased impression of the mean expression among cells (set to False to avoid it)
        frameon=False,
        cmap="Reds",  # or choose another color map e.g. from here: https://matplotlib.org/stable/tutorials/colors/colormaps.html
    )
    print("\n\n\n")  # print white space for legibility

In [None]:
# plot marker genes
#plt.rcParams['figure.figsize'] = [8, 6] # set plot sizes
#marker_genes = ["Pdgfra", "Mcam","Foxf2", "Pecam1", "Cldn5","Vtn", "Pdgfrb","Acta2", "Myocd","Dcn", "Col6a1", "Mbp","Enpp2","Cspg4","Pifo","Foxj1","Dynlrb2","Meig1","Rbfox3","Tubb3","Dcx","Aqp4", "Aif1", "Ptprc", "Ccr2","Adgre1","Itgb2","Cd14","Cd86","Trem2","Vcan","Cd4","Cd19", "Cd8a","Il2rb","Cd244", "Cd74","Cd68","Ifng","Ptgdr2","Ccr3"]
#marker_genes=[x for x in marker_genes if x in list(adata.var_names)] # remove those not in adata.var_names
#sc.pl.umap(adata, ncols=3, color=marker_genes, size=35)

In [None]:
# plot cell markers used by Christina
import ast
# assemble the path with os.path.join so it resolves with either main_dir (Windows share or macOS mount)
markers_path = os.path.join(main_dir, 'P6_Visium_Christina', 'Tangram', 'cell_markers_from_Christina.txt')
with open(markers_path) as f:
    data = f.read()
d = ast.literal_eval(data) # parse the file content as a Python literal (a dict; no arbitrary code is evaluated)
markers_christina = pd.DataFrame({k: pd.Series(v) for k, v in d.items()}) # turn to pd.df; shorter columns are padded with NaN
markers_christina.head(8)

In [None]:
# Flatten Christina's marker dict into a single gene list, keeping only genes present in adata.var_names.
# The list is built from `d` rather than from the `markers_christina` table, so re-running this cell
# (after `markers_christina` has already become a list) gives the same result instead of failing.
var_names_present = set(adata.var_names) # set lookup instead of rebuilding a list for every gene
markers_christina = [gene_name for gene_list in d.values() for gene_name in gene_list if gene_name in var_names_present]

In [None]:
sc.pl.dotplot(adata, var_names=markers_christina, groupby="leiden")

In [None]:
plt.rcParams['figure.figsize'] = [8, 6] # set plot sizes
# All of Christina's marker genes (unfiltered), kept under its own name: assigning it to `marker_genes`
# would replace the cell-type dict that sort_and_score expects and break a re-run of the scoring cell.
markers_christina_all = sum(list(d.values()), [])
sc.pl.umap(adata, ncols=4, color=markers_christina)

## DE analysis

In [None]:
# Run DE test for annotation (Wilcoxon)
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon', key_added='wilcoxon')
sc.pl.rank_genes_groups(adata, n_genes=20, sharey=False, ncols=3, fontsize=13, key='wilcoxon')

In [None]:
# Dotplot
sc.tl.dendrogram(adata, groupby="leiden")
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, key="wilcoxon", groupby="leiden");

## Annotation

In [None]:
plt.rcParams['figure.figsize'] = [7,7] # set plot sizes
sc.pl.umap(adata, color=['n_genes_by_counts', 'total_counts', 'pct_counts_mt'], size=25)

In [None]:
utils.cluster_small_multiples(adata, clust_key="leiden")

In [None]:
# Rename clusters: map each Leiden cluster id (as string) to a cell-type label.
# The mapping is only valid for the exact clustering it was derived from:
# filtering  cells:
    #adata = adata[adata.obs.n_genes_by_counts < 7000, :]
    #adata = adata[adata.obs.n_genes_by_counts > 200, :]
    #adata = adata[adata.obs.pct_counts_mt < 20, :]
# not regressing out any covariates
# clustering resolution 0.15
# NOTE(review): the sc.tl.leiden call earlier in this notebook uses resolution=1.1, not 0.15 —
# confirm which setting produced the 25 cluster ids (0-24) mapped below.
old_to_new = {
    '0':'Microglia/Macrophages','1':'Astrocytes',
    '2':'Microglia/Macrophages','3':'Endothelial cells',
    '4':'Microglia/Macrophages','5':'Neurons',
    '6':'Neurons','7':'Neurons',
    '8':'Astrocytes','9':'Neurons',
    '10':'Astrocytes','11':'SMCs',
    '12':'Pericytes','13':'Microglia/Macrophages',
    '14':'Immune_Other','15':'Endothelial cells',
    '16':'Neurons','17':'Microglia/Macrophages',
    '18':'Oligos','19':'Unknown',
    '20':'OPCs','21':'Neurons',
    '22':'Unknown','23':'Immune_Other',
    '24':'Microglia/Macrophages'
}

In [None]:
adata.obs['clusters'] = (
    adata.obs['leiden'].map(old_to_new).astype('category')
)

In [None]:
plt.rcParams['figure.figsize'] = [8, 8] # set plot sizes
sc.pl.umap(adata, color=['clusters'], legend_loc='on data', title='', legend_fontweight='normal', legend_fontoutline=3, legend_fontsize=14)

In [None]:
# Dotplot
# re-run DE analysis based on cluster names
sc.tl.rank_genes_groups(adata, 'clusters', method='wilcoxon', key_added = "wilcoxon")
# Plot
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, key="wilcoxon", groupby="clusters")

In [None]:
# exclude clusters with <50 cells
cluster_counts = adata.obs['clusters'].value_counts()
cluster_counts

In [None]:
# keep only clusters with at least 50 cells; .copy() turns the resulting view into a standalone AnnData
adata = adata[adata.obs['clusters'].isin(cluster_counts[cluster_counts>49].index)].copy()

In [None]:
plt.rcParams['figure.figsize'] = [8, 8] # set plot sizes
sc.pl.umap(adata, color=['clusters'], legend_loc='on data', title='', legend_fontweight='normal', legend_fontoutline=3, legend_fontsize=14)

# Focus on: Foxf2

In [None]:
gene="Foxf2"

In [None]:
with rc_context({'figure.figsize': (7,7)}):
    sc.pl.umap(adata, color=['clusters',gene], legend_loc='on data', title='', legend_fontweight='normal', legend_fontoutline=2, legend_fontsize=10, size=35, layer="normalized")

In [None]:
sc.pl.matrixplot(adata, [gene], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [gene], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
utils.summarize_gene_expression(adata = adata, gene = gene, groupby = "clusters", 
                                study_name = dataset_name, organism = organism,
                                export = True, output_dir = os.path.join(main_dir, "P06_Foxf2_per_celltype", "Foxf2_summarized")
                               )

# Focus on: Other genes

In [None]:
target_genes = ["Foxo1", "Tek", "Nos3", "Htra1", "Egfl8", "Flt1", "Kdr", "Ptprb", "Nrp1", "Nrp2", "Efnb2", "Itgb1", "Itga6", "Angpt2", "Cdh5", "Cldn5", "Ocln", "Ctnnb1"]

In [None]:
other_genes_results = {
    gene: utils.summarize_gene_expression(adata, gene, study_name = dataset_name, organism = organism, groupby = "clusters",
                                          output_dir=os.path.join(main_dir, "P06_Foxf2_per_celltype", "Other_genes_summarized"), export=True
                                         ) for gene in target_genes
}

In [None]:
# some plots

In [None]:
sc.pl.matrixplot(adata, [target_genes[0]], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [target_genes[0]], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.matrixplot(adata, [target_genes[1]], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [target_genes[1]], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

# Focus on: Tspo (for Herms lab)

In [None]:
# for Herms lab 02/2025

In [None]:
gene="Tspo"

In [None]:
sc.pl.matrixplot(adata, [gene], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [gene], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
# Summarise expression of `gene` (Tspo) per annotated cluster via the project helper and export it to the Herms-lab folder.
# (A comma was missing after the groupby argument, which made this cell a SyntaxError.)
utils.summarize_gene_expression(adata = adata, gene = gene, groupby = "clusters",
                          study_name = dataset_name, organism = organism,
                          export = True, output_dir = os.path.join(main_dir, "P06_Foxf2_per_celltype", "202502-Tspo-HermsLab", "Genes_summarized")
                         )

In [None]:
# mt genes: same 'mt-' prefix as used for the QC annotation (adata.var['mt']),
# so only mitochondrially encoded genes are selected and not any other gene name starting with "mt"
mt_genes = [gene for gene in adata.var_names if gene.startswith("mt-")]
mt_genes

In [None]:
mt_results = {
    gene: utils.summarize_gene_expression(adata, gene, groupby = "clusters", study_name = dataset_name, organism = organism, 
                                    output_dir=os.path.join(main_dir, "P06_Foxf2_per_celltype", "202502-Tspo-HermsLab", "Genes_summarized"), export=True
                                   ) for gene in mt_genes
}

# Correlate gene expression (Foxf2 and Foxo1)

Using MAGIC denoising

In [None]:
import magic
import scprep

## ECs

In [None]:
gg = ["Foxf2","Foxo1"]

In [None]:
adata_EC = adata[adata.obs.clusters == "Endothelial cells"]

In [None]:
adata_EC

In [None]:
#matrix = pd.DataFrame(adata_EC.X) # not compatible with sparse 
# Expression matrix of the endothelial subset, used only for the library-size plot below.
matrix = adata_EC.X
# NOTE(review): X appears to be a scipy sparse matrix (it is densified with .todense() elsewhere in this notebook),
# in which case this only attaches a plain Python attribute and does not label the columns — confirm intent.
matrix.columns = adata_EC.var.index.tolist()

In [None]:
cutoff_var = 1500

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata_EC.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
magic_op = magic.MAGIC()

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_EC, genes=['Foxf2', 'Foxo1', 'Nos3'])

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
np.corrcoef(emt_magic[:,['Foxf2','Foxo1']].X, rowvar = False)[0][1]

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))
scprep.plot.scatter(x=adata_EC[:,'Foxf2'].X.todense(), y=adata_EC[:,'Foxo1'].X.todense(), c=adata_EC[:,'Nos3'].X.todense(), ax = ax1,
                    xlabel='Foxf2', ylabel='Foxo1', legend_title="Nos3", title='Before MAGIC')
scprep.plot.scatter(x=emt_magic[:,'Foxf2'].X, y=emt_magic[:,'Foxo1'].X, c=emt_magic[:,'Nos3'].X, ax=ax2,
                    xlabel='Foxf2', ylabel='Foxo1', legend_title="Nos3", title='After MAGIC')
plt.axline((0.3,0.3), slope=1, color="black", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

## Pericytes (PCs)

In [None]:
gg = ["Foxf2","Foxo1"]

In [None]:
# NOTE: the variable keeps the name `adata_EC` used for the endothelial subset above, but from here on it holds the pericyte subset
adata_EC = adata[adata.obs.clusters == "Pericytes"]

In [None]:
adata_EC

In [None]:
#matrix = pd.DataFrame(adata_EC.X) # not compatible with sparse 
# Expression matrix of the current subset (pericytes in this section), used only for the library-size plot below.
matrix = adata_EC.X
# NOTE(review): X appears to be a scipy sparse matrix (it is densified with .todense() elsewhere in this notebook),
# in which case this only attaches a plain Python attribute and does not label the columns — confirm intent.
matrix.columns = adata_EC.var.index.tolist()

In [None]:
cutoff_var = 1500

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata_EC.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
magic_op = magic.MAGIC()

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_EC, genes=['Foxf2', 'Foxo1', 'Nos3'])

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
np.corrcoef(emt_magic[:,['Foxf2','Foxo1']].X, rowvar = False)[0][1]

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))
scprep.plot.scatter(x=adata_EC[:,'Foxf2'].X.todense(), y=adata_EC[:,'Foxo1'].X.todense(), c=adata_EC[:,'Nos3'].X.todense(), ax = ax1,
                    xlabel='Foxf2', ylabel='Foxo1', legend_title="Nos3", title='Before MAGIC')
scprep.plot.scatter(x=emt_magic[:,'Foxf2'].X, y=emt_magic[:,'Foxo1'].X, c=emt_magic[:,'Nos3'].X, ax=ax2,
                    xlabel='Foxf2', ylabel='Foxo1', legend_title="Nos3", title='After MAGIC')
plt.axline((0.05,0.05), slope=1, color="black", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

## All cell types

In [None]:
gg = ["Foxf2","Foxo1"]

In [None]:
adata

In [None]:
#matrix = pd.DataFrame(adata.X) # not compatible with sparse 
# Expression matrix of all cells, used only for the library-size plot below.
matrix = adata.X
# NOTE(review): X appears to be a scipy sparse matrix (it is densified with .todense() elsewhere in this notebook),
# in which case this only attaches a plain Python attribute and does not label the columns — confirm intent.
matrix.columns = adata.var.index.tolist()

In [None]:
cutoff_var = 700

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata.layers

# Correlate gene expression (TSPO and mt genes)  

For Herms lab 02/2025

In [None]:
# same 'mt-' prefix as used for the QC annotation (adata.var['mt']),
# so only mitochondrially encoded genes are selected and not any other gene name starting with "mt"
mt_genes = [gene for gene in adata.var_names if gene.startswith("mt-")]
mt_genes

In [None]:
gg = ["Tspo"] + mt_genes

## All cell types

In [None]:
celltype = "AllCellTypes"

In [None]:
#matrix = pd.DataFrame(adata.X) # not compatible with sparse 
# Expression matrix of all cells, used only for the library-size plot below.
matrix = adata.X
# NOTE(review): X appears to be a scipy sparse matrix (it is densified with .todense() elsewhere in this notebook),
# in which case this only attaches a plain Python attribute and does not label the columns — confirm intent.
matrix.columns = adata.var.index.tolist()

In [None]:
cutoff_var = 700

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
adata.layers

In [None]:
magic_op = magic.MAGIC()

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata, genes=gg)

In [None]:
# Correlate Tspo with every other gene in `gg` on the MAGIC output (all cell types) and
# save one before/after scatter figure per gene pair.
gene_x = 'Tspo'
correlations = []
scatter_dir = os.path.join(main_dir, 'P06_Foxf2_per_celltype', '202502-Tspo-HermsLab', 'plots', 'scatter_plots')

for gene_y in gg[1:]:  # gg[0] is 'Tspo' itself
    # Pearson r between the two MAGIC-imputed expression columns
    imputed_pair = emt_magic[:, [gene_x, gene_y]].X
    corr = np.corrcoef(imputed_pair, rowvar=False)[0, 1]
    correlations.append((gene_x, gene_y, corr))

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 6))

    # Left panel: values from adata.X (before MAGIC), coloured by gene_x
    before_x = adata[:, gene_x].X.todense()
    before_y = adata[:, gene_y].X.todense()
    scprep.plot.scatter(x=before_x, y=before_y, c=before_x, ax=ax1,
                        xlabel=gene_x, ylabel=gene_y, legend_title=gene_x, title='Before MAGIC')

    # Right panel: MAGIC output, coloured by gene_x
    after_x = emt_magic[:, gene_x].X
    after_y = emt_magic[:, gene_y].X
    scprep.plot.scatter(x=after_x, y=after_y, c=after_x, ax=ax2,
                        xlabel=gene_x, ylabel=gene_y, legend_title=gene_x, title='After MAGIC')

    # Annotate the right panel with the correlation coefficient
    ax2.text(0.75, 0.97, f"r = {corr:.2f}", transform=ax2.transAxes,
             fontsize=15, verticalalignment='top', bbox=dict(facecolor='white', alpha=0.5))

    #aplt.axline((0.1, 0.1), slope=1, color="black", alpha=0.3, linestyle="--")
    plt.tight_layout()

    # Save plot (file name: date, dataset, organism, cell type and the gene pair)
    png_name = date.today().strftime("%Y%m%d")+f'_{dataset_name}{organism}_{celltype}_{gene_x}_{gene_y}.png'
    plt.savefig(os.path.join(scatter_dir, png_name), dpi=500)

    plt.show()

In [None]:
correlation_df = pd.DataFrame(correlations, columns=["GeneA", "GeneB", "PearsonR"])
correlation_df["CellType"] = celltype
correlation_df = correlation_df.sort_values("PearsonR")
correlation_df

In [None]:
correlation_df.to_excel(os.path.join(main_dir, "P06_Foxf2_per_celltype", "202502-Tspo-HermsLab", date.today().strftime("%Y%m%d") + f"_{dataset_name}{organism}_MAGIC_correlation_results_{celltype}.xlsx"), index=False)

## ECs

In [None]:
celltype = "Endothelial cells"

In [None]:
adata_EC = adata[adata.obs.clusters == celltype]

In [None]:
adata_EC

In [None]:
#matrix = pd.DataFrame(adata_EC.X) # not compatible with sparse 
# Expression matrix of the endothelial subset, used only for the library-size plot below.
matrix = adata_EC.X
# NOTE(review): X appears to be a scipy sparse matrix (it is densified with .todense() elsewhere in this notebook),
# in which case this only attaches a plain Python attribute and does not label the columns — confirm intent.
matrix.columns = adata_EC.var.index.tolist()

In [None]:
cutoff_var = 1500

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
adata_EC.layers

In [None]:
magic_op = magic.MAGIC()

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_EC, genes=gg)

In [None]:
# Correlate Tspo with every other gene in `gg` on the MAGIC output for the endothelial subset and
# save one before/after scatter figure per gene pair.
gene_x = 'Tspo'
correlations = []

for gene_y in gg[1:]:  # Start from index 1 to skip 'Tspo'
    # Pearson r between the two MAGIC-imputed expression columns
    corr = np.corrcoef(emt_magic[:, [gene_x, gene_y]].X, rowvar=False)[0, 1]
    correlations.append((gene_x, gene_y, corr))

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 6))

    # Scatter plot before MAGIC: taken from adata_EC, the subset MAGIC was fitted on,
    # so both panels show the same cells (previously this panel plotted all cells from `adata`)
    scprep.plot.scatter(x=adata_EC[:, gene_x].X.todense(), y=adata_EC[:, gene_y].X.todense(),
                        c=adata_EC[:, gene_x].X.todense(), ax=ax1,
                        xlabel=gene_x, ylabel=gene_y, legend_title=gene_x, title='Before MAGIC')

    # Scatter plot after MAGIC
    scprep.plot.scatter(x=emt_magic[:, gene_x].X, y=emt_magic[:, gene_y].X,
                        c=emt_magic[:, gene_x].X, ax=ax2,
                        xlabel=gene_x, ylabel=gene_y, legend_title=gene_x, title='After MAGIC')

    # Add correlation text to the second plot
    ax2.text(0.75, 0.97, f"r = {corr:.2f}", transform=ax2.transAxes,
             fontsize=15, verticalalignment='top', bbox=dict(facecolor='white', alpha=0.5))

    #plt.axline((0.5, 0.5), slope=1, color="black", alpha=0.3, linestyle="--")
    plt.tight_layout()

    # Save plot
    plt.savefig(os.path.join(main_dir, 'P06_Foxf2_per_celltype', '202502-Tspo-HermsLab', 
                            'plots', 'scatter_plots', date.today().strftime("%Y%m%d")+f'_{dataset_name}{organism}_{celltype}_{gene_x}_{gene_y}.png'), 
               dpi=500)

    plt.show()
    plt.close(fig)  # release the figure; one is created per gene pair

In [None]:
correlation_df = pd.DataFrame(correlations, columns=["GeneA", "GeneB", "PearsonR"])
correlation_df["CellType"] = celltype
correlation_df = correlation_df.sort_values("PearsonR")
correlation_df

In [None]:
correlation_df.to_excel(os.path.join(main_dir, "P06_Foxf2_per_celltype", "202502-Tspo-HermsLab", date.today().strftime("%Y%m%d") + f"_{dataset_name}{organism}_MAGIC_correlation_results_{celltype}.xlsx"), index=False)

## Microglia/Macrophages

In [None]:
celltype="Microglia/Macrophages"

In [None]:
adata_MM = adata[adata.obs.clusters == celltype]

In [None]:
adata_MM

In [None]:
#matrix = pd.DataFrame(adata_MM.X) # not compatible with sparse 
# Expression matrix of the microglia/macrophage subset, used only for the library-size plot below.
matrix = adata_MM.X
# NOTE(review): X appears to be a scipy sparse matrix (it is densified with .todense() elsewhere in this notebook),
# in which case this only attaches a plain Python attribute and does not label the columns — confirm intent.
matrix.columns = adata_MM.var.index.tolist()

In [None]:
cutoff_var = 800

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
adata_MM.layers

In [None]:
magic_op = magic.MAGIC()

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_MM, genes=gg)

In [None]:
# Correlate Tspo with every other gene in `gg` on the MAGIC output for the microglia/macrophage subset and
# save one before/after scatter figure per gene pair.
gene_x = 'Tspo'
correlations = []

for gene_y in gg[1:]:  # Start from index 1 to skip 'Tspo'
    # Pearson r between the two MAGIC-imputed expression columns
    corr = np.corrcoef(emt_magic[:, [gene_x, gene_y]].X, rowvar=False)[0, 1]
    correlations.append((gene_x, gene_y, corr))

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 6))

    # Scatter plot before MAGIC: taken from adata_MM, the subset MAGIC was fitted on,
    # so both panels show the same cells (previously this panel plotted all cells from `adata`)
    scprep.plot.scatter(x=adata_MM[:, gene_x].X.todense(), y=adata_MM[:, gene_y].X.todense(),
                        c=adata_MM[:, gene_x].X.todense(), ax=ax1,
                        xlabel=gene_x, ylabel=gene_y, legend_title=gene_x, title='Before MAGIC')

    # Scatter plot after MAGIC
    scprep.plot.scatter(x=emt_magic[:, gene_x].X, y=emt_magic[:, gene_y].X,
                        c=emt_magic[:, gene_x].X, ax=ax2,
                        xlabel=gene_x, ylabel=gene_y, legend_title=gene_x, title='After MAGIC')

    # Add correlation text to the second plot
    ax2.text(0.75, 0.97, f"r = {corr:.2f}", transform=ax2.transAxes,
             fontsize=15, verticalalignment='top', bbox=dict(facecolor='white', alpha=0.5))

    #plt.axline((0.5, 1), slope=1, color="black", alpha=0.3, linestyle="--")
    plt.tight_layout()

    # Save plot ("/" is stripped from the cell-type label so it cannot act as a path separator)
    plt.savefig(os.path.join(main_dir, 'P06_Foxf2_per_celltype', '202502-Tspo-HermsLab', 
                            'plots', 'scatter_plots', date.today().strftime("%Y%m%d")+f'_{dataset_name}{organism}_{celltype.replace("/", "")}_{gene_x}_{gene_y}.png'), 
               dpi=500)

    plt.show()
    plt.close(fig)  # release the figure; one is created per gene pair

In [None]:
correlation_df = pd.DataFrame(correlations, columns=["GeneA", "GeneB", "PearsonR"])
correlation_df["CellType"] = celltype.replace("/", "")
correlation_df = correlation_df.sort_values("PearsonR")
correlation_df

In [None]:
ct=celltype.replace("/", "")
correlation_df.to_excel(os.path.join(main_dir, "P06_Foxf2_per_celltype", "202502-Tspo-HermsLab", date.today().strftime("%Y%m%d") + f"_{dataset_name}{organism}_MAGIC_correlation_results_{ct}.xlsx"), index=False)

# Save

In [None]:
name='heindl_normalised_logarithmised_annotated'

In [None]:
# Write the annotated object as <YYYYMMDD>_<name>.h5ad into the h5ad folder.
# The path is assembled with os.path.join (same components as used when loading the file),
# so it resolves with either main_dir (Windows share or macOS mount).
adata.write(os.path.join(main_dir, 'P06_Foxf2_per_celltype', 'scRNAseq', 'Steffi_mouse_brain_scRNAseq', 'h5ad', date.today().strftime("%Y%m%d")+'_'+name+'.h5ad'))

# Session Info

In [None]:
sc.logging.print_versions()