<font size="+3.8">Scanpy single-cell pre-processing</font>  
<font size="+1.5"></font>  

Aim: Preprocess human brain single-cell data from Winkler 2022 Science (human adult brain - healthy controls only)  
Publication: https://pubmed.ncbi.nlm.nih.gov/35084939/

In [None]:
from datetime import date
date.today().strftime('%d/%m/%Y')

In [None]:
import os
import getpass

# Record who ran the notebook. getpass.getuser() is used instead of
# os.getlogin() because the latter needs a controlling terminal and raises
# OSError when the kernel is started without one.
getpass.getuser()

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import matplotlib.pyplot as plt
import seaborn as sns
import platform
from datetime import date
from matplotlib.pyplot import rc_context

In [None]:
import utils

In [None]:
os.environ['CONDA_DEFAULT_ENV'] # conda env

In [None]:
platform.platform()

In [None]:
sc.settings.verbosity = 3

In [None]:
main_dir='/run/user/1000/gvfs/smb-share:server=138.245.4.35,share=bd-dichgans/SF' # Linux

In [None]:
# Raw string: the UNC path contains backslashes that must not be read as escape sequences.
main_dir=r'\\isdsynnas.srv.med.uni-muenchen.de\BD-Dichgans\SF' # Win

In [None]:
main_dir='/Volumes/BD-Dichgans/SF' # Mac

In [None]:
dataset_name = "Winkler2022"
organism = "Human"

# Load data

## Annotated 
Downloaded from: https://cells.ucsc.edu/?bp=brain&dis=Healthy~Healthy+Control&org=Human+(H.+sapiens)&ds=adult-brain-vasc

### EC dataset

In [None]:
ad = sc.read_text(os.path.join(main_dir, "P06_vasc_scRNAseq", "Winkler2022", "ucsc_cellbrowser_data", "EC", "exprMatrix.tsv.gz")).transpose()
original_cellnames = ad.obs

In [None]:
meta = pd.read_csv(os.path.join(main_dir, "P06_vasc_scRNAseq", "Winkler2022", "ucsc_cellbrowser_data", "EC", "meta.tsv"), sep="\t")
# read_csv gives the table a 0..n-1 index; assigning it as-is would replace the
# cell barcodes in ad.obs_names with "0", "1", ... Label the rows with the
# barcodes of the expression matrix so obs_names are preserved.
meta.index = ad.obs_names
ad.obs = meta
ad

In [None]:
ad.obs.columns

In [None]:
ad.obs.head(2)

In [None]:
assert(all(ad.obs["Cell"] == original_cellnames.index))

In [None]:
ad.obs["Sample"].unique()

In [None]:
# add umap coords
# header=None: the file is read without a header row, so its columns are numbered 0, 1, 2, ...
# NOTE(review): columns 1 and 2 are attached positionally; column 0 (presumably the cell id)
# is never compared with ad.obs["Cell"] - confirm the row order matches the expression matrix.
umap = pd.read_csv(os.path.join(main_dir, "P06_vasc_scRNAseq", "Winkler2022", "ucsc_cellbrowser_data", "EC", "Seurat_umap.coords.tsv.gz"), sep="\t", header=None)
ad.obsm['X_umap'] = umap[[1,2]].to_numpy()

In [None]:
# remove doublets; .copy() turns the filtered view into an independent AnnData,
# because the following cells delete and add columns in ad.obs
ad = ad[(ad.obs.doublet2 != 'Doublet') & (ad.obs.doublet3 != 'Doublet')].copy()

In [None]:
del ad.obs['clusters']

In [None]:
ad.obs['Cluster']

In [None]:
ad.obs["clusters2"] = 'ECs_' + ad.obs['Cluster'].astype(str)

In [None]:
# NOTE(review): identical to the previous cell; re-running is harmless because the value is rebuilt from 'Cluster'.
ad.obs["clusters2"] = 'ECs_' + ad.obs['Cluster'].astype(str)

In [None]:
ad.obs['clusters2'].unique()

In [None]:
# merge subclusters
# merged label -> set of fine-grained labels that are folded into it
new_names = {
    'ECs_Arterial': {'ECs_Art1','ECs_Art2','ECs_Art3'},
    'ECs_Capillary': {'ECs_Cap'},
    'ECs_Venous': {'ECs_Venous','ECs_Venule'},
    'Pericytes': {'PC'},
    'SMCs': {'aSMC','aaSMC','vSMC'},
    'Oligos': {'OL'},
    'Fibroblasts': {'FB1', 'FB2'},
    'Microglia': {'MG'},
    'Astrocytes': {'AC'}
}
# re-format: invert to fine label -> merged label, ordered alphabetically by fine label
new_cluster_names = {
    fine_label: merged_label
    for fine_label, merged_label in sorted(
        (member, merged) for merged, members in new_names.items() for member in members
    )
}
print(new_cluster_names)

In [None]:
# fine annotation
# Build the merged label from the untouched 'Cluster' column instead of mapping
# 'clusters2' onto itself: on a second run the in-place version would map
# already-merged labels such as 'ECs_Arterial' (not a key of the mapping) to NaN.
ad.obs['clusters2'] = (
    ('ECs_' + ad.obs['Cluster'].astype(str))
    .map(new_cluster_names)
    .astype('category')
)

In [None]:
ad.obs['clusters2'].unique()

In [None]:
ad.obs['clusters'] = "ECs"

In [None]:
with rc_context({'figure.figsize': (7, 6)}):
    sc.pl.umap(ad, color=['clusters2'])

Check if data is normalized:

In [None]:
ad.layers

In [None]:
sns.histplot(ad.X.sum(1))
plt.show()

In [None]:
# show expression of 100 random genes (across all cells)
import random
random.seed(42)  # fixed seed so the same genes are drawn on every run
random_genes=random.sample(range(ad.X.shape[1]), 100)
adata_sub = ad[:,random_genes]
# np.asarray instead of np.matrix: the matrix class is no longer recommended by NumPy
exp=pd.DataFrame(np.asarray(adata_sub.X))
# plot: one KDE curve per gene (hue='variable'), all drawn in the same colour
pl1=sns.displot(data=pd.melt(exp),x='value',height=4,hue='variable',kind="kde",warn_singular=False,legend=False,palette=list(np.repeat('#086da6',100)), lw=0.3) # genes with 0 expression are excluded
pl1.set(xlim=(-0.5, 7),ylim=(0,0.007));
plt.show()

In [None]:
sns.set(rc={'figure.figsize':(4,4)})
# No `hue` is mapped in this plot, so a `palette` would be ignored; set the bar colour directly.
pl=sns.histplot(data=pd.melt(exp),x='value',binwidth=0.5,color='#086da6')
pl.set(xlim=(0, 10));
sns.set_style("ticks")
plt.show()

Data is not normalized, therefore normalize and log-transform

In [None]:
ad.layers["counts"] = ad.X.copy() # save unnormalized raw RNA counts - retrieve via ad.X = ad.layers["counts"]

In [None]:
sc.pp.normalize_total(ad, inplace=True) # Normalize each cell by total counts over all genes, so that every cell has the same total count after normalization.

In [None]:
sc.pp.log1p(ad) # X = log(X + 1)

In [None]:
sns.histplot(ad.X.sum(1))
plt.show()

In [None]:
# show expression of the same 100 random genes after normalization (across all cells)
adata_sub = ad[:,random_genes]
# np.asarray instead of np.matrix: the matrix class is no longer recommended by NumPy
exp=pd.DataFrame(np.asarray(adata_sub.X))
# plot: one KDE curve per gene (hue='variable'), all drawn in the same colour
pl=sns.displot(data=pd.melt(exp),x='value',height=4,hue='variable',kind="kde",warn_singular=False,legend=False,palette=list(np.repeat('#086da6',100)), lw=0.5) # genes with 0 expression are excluded
pl.set(xlim=(-0.25, 3.5),ylim=(0,0.005));
plt.show()

In [None]:
# No `hue` is mapped in this plot, so a `palette` would be ignored; set the bar colour directly.
pl=sns.histplot(data=pd.melt(exp),x='value',binwidth=0.5,color='#086da6');
pl.set(xlim=(0, 10));
plt.show()

In [None]:
ad.layers["normalized"] = ad.X.copy() # save normalized + log-transformed (but unscaled) counts - retrieve via adata.X = adata.layers["normalized"]

In [None]:
# Identify highly-variable genes
sc.pp.highly_variable_genes(ad)
sc.pl.highly_variable_genes(ad)

In [None]:
ad

In [None]:
ad.layers

In [None]:
winkler_ec = ad

In [None]:
del ad

### Perivascular dataset

In [None]:
ad = sc.read_text(os.path.join(main_dir, "P06_vasc_scRNAseq", "Winkler2022", "ucsc_cellbrowser_data", "Perivascular", "exprMatrix.tsv.gz")).transpose()
original_cellnames = ad.obs

In [None]:
meta = pd.read_csv(os.path.join(main_dir, "P06_vasc_scRNAseq", "Winkler2022", "ucsc_cellbrowser_data", "Perivascular", "meta.tsv"), sep="\t")
# read_csv gives the table a 0..n-1 index; assigning it as-is would replace the
# cell barcodes in ad.obs_names with "0", "1", ... Label the rows with the
# barcodes of the expression matrix so obs_names are preserved.
meta.index = ad.obs_names
ad.obs = meta
ad

In [None]:
ad.obs.columns

In [None]:
ad.obs.head(2)

In [None]:
assert(all(ad.obs["cellId"] == original_cellnames.index))

In [None]:
ad.obs["Sample"].unique()

In [None]:
# add umap coords
# header=None: the file is read without a header row, so its columns are numbered 0, 1, 2, ...
# NOTE(review): columns 1 and 2 are attached positionally; column 0 (presumably the cell id)
# is never compared with ad.obs["cellId"] - confirm the row order matches the expression matrix.
umap = pd.read_csv(os.path.join(main_dir, "P06_vasc_scRNAseq", "Winkler2022", "ucsc_cellbrowser_data", "Perivascular", "UMAP.coords.tsv.gz"), sep="\t", header=None)
ad.obsm['X_umap'] = umap[[1,2]].to_numpy()

In [None]:
# no doublets defined in obs
# ad = ad[(ad.obs.doublet2 != 'Doublet') & (ad.obs.doublet3 != 'Doublet')] # remove doublets

In [None]:
# Renaming
old_to_new = {'FB':'Fibroblasts','FBMC':'Fibromyocytes','PC':'Pericytes','SMC':'SMCs'}
ad.obs['clusters2'] = (
    ad.obs['clusters'].map(old_to_new).astype('category')
)

In [None]:
with rc_context({'figure.figsize': (7, 6)}):
    sc.pl.umap(ad, color=['clusters2'])

Check if data is normalized:

In [None]:
ad.layers

In [None]:
sns.histplot(ad.X.sum(1))
plt.show()

In [None]:
# show expression of 100 random genes (across all cells)
random.seed(42)  # fixed seed so the same genes are drawn on every run
random_genes=random.sample(range(ad.X.shape[1]), 100)
adata_sub = ad[:,random_genes]
# np.asarray instead of np.matrix: the matrix class is no longer recommended by NumPy
exp=pd.DataFrame(np.asarray(adata_sub.X))
# plot: one KDE curve per gene (hue='variable'), all drawn in the same colour
pl1=sns.displot(data=pd.melt(exp),x='value',height=4,hue='variable',kind="kde",warn_singular=False,legend=False,palette=list(np.repeat('#086da6',100)), lw=0.3) # genes with 0 expression are excluded
pl1.set(xlim=(-0.5, 7),ylim=(0,0.007));
sns.set_style("ticks")
plt.show()

In [None]:
exp.iloc[0:5,0:30]

Note: Matrix is apparently log-transformed. Also see https://github.com/cnk113/vascular-analysis/issues/1.

In [None]:
# reverse log-transformation ln(x+1): expm1(x) = e**x - 1, computed without the
# precision loss of the explicit subtraction for values close to zero
# NOTE(review): if the published matrix was depth-normalised before the log step,
# this layer holds normalised values rather than raw counts - confirm.
ad.layers["counts"] = np.expm1(ad.X)

In [None]:
ad.layers["log?"] = ad.X

In [None]:
ad.X = ad.layers["counts"]

In [None]:
# show expression of the same 100 random genes after reversing the log (across all cells)
# random_genes is reused from the previous sampling so the distributions before and
# after the back-transformation are shown for the same genes
adata_sub = ad[:,random_genes]
# np.asarray instead of np.matrix: the matrix class is no longer recommended by NumPy
exp=pd.DataFrame(np.asarray(adata_sub.X))
# plot: one KDE curve per gene (hue='variable'), all drawn in the same colour
pl1=sns.displot(data=pd.melt(exp),x='value',height=4,hue='variable',kind="kde",warn_singular=False,legend=False,palette=list(np.repeat('#086da6',100)), lw=0.3) # genes with 0 expression are excluded
pl1.set(xlim=(-0.5, 7),ylim=(0,0.007));
plt.show()

In [None]:
exp.iloc[0:5,0:20]

In [None]:
# No `hue` is mapped in this plot, so a `palette` would be ignored; set the bar colour directly.
pl=sns.histplot(data=pd.melt(exp),x='value',binwidth=0.5,color='#086da6');
pl.set(xlim=(0, 10));
plt.show()

Now normalize and log-transform

In [None]:
ad.layers["counts"] = ad.X.copy() # save unnormalized raw RNA counts - retrieve via ad.X = ad.layers["counts"]

In [None]:
sc.pp.normalize_total(ad, inplace=True) # Normalize each cell by total counts over all genes, so that every cell has the same total count after normalization.

In [None]:
sc.pp.log1p(ad) # X = log(X + 1)

In [None]:
sns.histplot(ad.X.sum(1))
plt.show()

In [None]:
ad.layers["normalized"] = ad.X.copy() # save normalized + log-transformed (but unscaled) counts - retrieve via adata.X = adata.layers["normalized"]

In [None]:
ad

In [None]:
ad.layers

In [None]:
ad.obs["clusters"] = ad.obs["clusters2"]

In [None]:
winkler_pv = ad

### Merge

In [None]:
# free memory
del winkler_pv.layers["log?"]
del winkler_pv.layers["counts"]
del winkler_ec.layers["counts"]

In [None]:
del winkler_pv.uns
del winkler_ec.uns

In [None]:
del ad
del exp
del adata_sub
del random_genes

In [None]:
# merge winkler datasets
# ignore potential batch effects because dataset is pre-annotated
adata = anndata.concat([winkler_ec, winkler_pv], join="inner") 

In [None]:
sns.histplot(adata.X.sum(1), bins = 100)
plt.show()

# Standard pipeline

Skip, because data is pre-analyzed.

# Vascular marker genes

In [None]:
# Basic QC metrics
# Flags genes whose name starts with 'MT-' and computes per-cell QC metrics from adata.X.
# NOTE(review): adata.X was normalised and log1p-transformed in both sub-datasets and the
# "counts" layers were deleted before the merge, so total_counts / pct_counts_mt are
# derived from log-scale values rather than raw counts - confirm this is intended.
adata.var['mt'] = adata.var_names.str.startswith('MT-') 
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],jitter=0.4, multi_panel=True)

In [None]:
sc.tl.rank_genes_groups(adata, 'clusters', method='wilcoxon')

In [None]:
sc.pl.rank_genes_groups(adata, n_genes=20, sharey=False)

Export DEG table

In [None]:
import sys
# make the project helper module importable (closing parenthesis of append() was missing)
sys.path.append(os.path.join(main_dir, "Git", "p06", "p06-visium", "visium-foxf2"))
from p6_helper_functions import rank_genes_groups_df

In [None]:
# export DEG table
celltypes = adata.obs.clusters.unique().tolist()
# Collect one ranked table per cluster and concatenate once, instead of growing
# the frame inside the loop (which re-copies all previous rows on every pass).
merged_df = pd.concat(
    [
        rank_genes_groups_df(adata=adata, group=group, adj_pval_cutoff=None, log2fc_cutoff=None, sortby='zscore')
        for group in celltypes
    ]
)
# constant column, assigned once for the whole table
merged_df['reference'] = 'all other clusters'
merged_df.head(3)

In [None]:
merged_df.group.unique().tolist()

In [None]:
# export as csv (semicolon-separated, file name prefixed with today's date as YYYYMMDD)
# NOTE(review): the other export cells in this notebook write below "P06_Foxf2_per_celltype";
# confirm that "P6_Foxf2_per_celltype" is the intended folder here.
merged_df.to_csv(os.path.join(main_dir, 'P6_Foxf2_per_celltype', 'DEA', date.today().strftime("%Y%m%d")+'_DE_genes_Winkler2022.csv'),sep=';')

# Focus on: Foxf2

In [None]:
gene="FOXF2"

## Plot

In [None]:
sc.pl.matrixplot(adata, [gene], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.matrixplot(adata, [gene], groupby='clusters2', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [gene], groupby='clusters', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [gene], groupby='clusters2', swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

## Excl zonation

In [None]:
utils.summarize_gene_expression(adata = adata, gene = gene, groupby = "clusters", 
                                study_name = dataset_name, organism = organism,
                                export = True, output_dir = os.path.join(main_dir, "P06_Foxf2_per_celltype", "Foxf2_summarized")
                               )

## Incl zonation

In [None]:
utils.summarize_gene_expression(adata = adata, gene = gene, groupby = "clusters2", 
                                study_name = dataset_name, organism = organism,
                                export = True, output_dir = os.path.join(main_dir, "P06_Foxf2_per_celltype", "Foxf2_summarized", "incl_zonation")
                               )

# Focus on: Other genes

In [None]:
target_genes = ["Foxo1", "Tek", "Nos3", "Htra1", "Egfl8", "Flt1", "Kdr", "Nrp1", "Nrp2", "Efnb2", "Itgb1", "Itga6", "Angpt2", "Cdh5", "Cldn5", "Ocln", "Ctnnb1"]

In [None]:
target_genes = [gene.upper() for gene in target_genes]
target_genes

### Excl zonation

In [None]:
groupby = "clusters"

In [None]:
other_genes_results = {
    gene: utils.summarize_gene_expression(adata, gene, study_name = dataset_name, organism = organism, groupby = groupby, 
                                          output_dir=os.path.join(main_dir, "P06_Foxf2_per_celltype", "Other_genes_summarized"), export=True
                                         ) for gene in target_genes
}

In [None]:
# some plots

In [None]:
sc.pl.matrixplot(adata, [target_genes[0]], groupby=groupby, swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [target_genes[0]], groupby=groupby, swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.matrixplot(adata, [target_genes[1]], groupby=groupby, swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [target_genes[1]], groupby=groupby, swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

### Incl zonation

In [None]:
groupby = "clusters2"

In [None]:
other_genes_results = {
    gene: utils.summarize_gene_expression(adata, gene, study_name = dataset_name, organism = organism, groupby = groupby, 
                                          output_dir=os.path.join(main_dir, "P06_Foxf2_per_celltype", "Other_genes_summarized", "incl_zonation"), export=True
                                         ) for gene in target_genes
}

In [None]:
# some plots

In [None]:
sc.pl.matrixplot(adata, [target_genes[0]], groupby=groupby, swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [target_genes[0]], groupby=groupby, swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.matrixplot(adata, [target_genes[1]], groupby=groupby, swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

In [None]:
sc.pl.dotplot(adata, [target_genes[1]], groupby=groupby, swap_axes=False, figsize=(2,5), standard_scale="var", layer="normalized")

# Correlate gene expression (Foxf2 and Foxo1)

Using MAGIC denoising

In [None]:
import magic
import scprep

In [None]:
#sc.pp.scale(adata)

## ECs

In [None]:
gg = ["FOXF2","FOXO1","NOS3"]

In [None]:
adata_EC = adata[adata.obs.clusters == "ECs"]

In [None]:
adata_EC

In [None]:
adata_EC.layers

In [None]:
# adata_EC.X is a bare matrix without gene labels, and assigning `.columns` to it
# does not create a labelled table. AnnData.to_df() returns X as a DataFrame
# indexed by obs_names with var_names as columns.
matrix = adata_EC.to_df()

In [None]:
cutoff_var = None

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata_EC.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
magic_op = magic.MAGIC()

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_EC, genes=gg)

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
np.corrcoef(emt_magic[:,['FOXF2','FOXO1']].X, rowvar = False)[0][1]

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))
scprep.plot.scatter(x=adata_EC[:,'FOXF2'].X, y=adata_EC[:,'FOXO1'].X, c=adata_EC[:,'NOS3'].X, ax = ax1,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='Before MAGIC')
scprep.plot.scatter(x=emt_magic[:,'FOXF2'].X, y=emt_magic[:,'FOXO1'].X, c=emt_magic[:,'NOS3'].X, ax=ax2,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='After MAGIC')
plt.axline((0,0), slope=1, color="black", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

## PCs

In [None]:
gg = ["FOXF2","FOXO1","NOS3"]

In [None]:
adata_EC = adata[adata.obs.clusters == "Pericytes"]

In [None]:
adata_EC

In [None]:
adata_EC.layers

In [None]:
# adata_EC.X is a bare matrix without gene labels, and assigning `.columns` to it
# does not create a labelled table. AnnData.to_df() returns X as a DataFrame
# indexed by obs_names with var_names as columns.
matrix = adata_EC.to_df()

In [None]:
cutoff_var = None

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata_EC.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
magic_op = magic.MAGIC()

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_EC, genes=gg)

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
np.corrcoef(emt_magic[:,['FOXF2','FOXO1']].X, rowvar = False)[0][1]

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))
scprep.plot.scatter(x=adata_EC[:,'FOXF2'].X, y=adata_EC[:,'FOXO1'].X, c=adata_EC[:,'NOS3'].X, ax = ax1,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='Before MAGIC')
scprep.plot.scatter(x=emt_magic[:,'FOXF2'].X, y=emt_magic[:,'FOXO1'].X, c=emt_magic[:,'NOS3'].X, ax=ax2,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='After MAGIC')
plt.axline((0.4,0.4), slope=1, color="black", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

## All cell types

In [None]:
gg = ["FOXF2","FOXO1","NOS3"]

In [None]:
adata_EC = adata

In [None]:
adata_EC

In [None]:
adata_EC.layers

In [None]:
# adata_EC.X is a bare matrix without gene labels, and assigning `.columns` to it
# does not create a labelled table (a plain NumPy array rejects the assignment
# with AttributeError). AnnData.to_df() returns X as a DataFrame indexed by
# obs_names with var_names as columns.
matrix = adata_EC.to_df()

In [None]:
cutoff_var = None

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata_EC.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
magic_op = magic.MAGIC()

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_EC, genes=gg)

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
np.corrcoef(emt_magic[:,['FOXF2','FOXO1']].X, rowvar = False)[0][1]

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))
scprep.plot.scatter(x=adata_EC[:,'FOXF2'].X, y=adata_EC[:,'FOXO1'].X, c=adata_EC[:,'NOS3'].X, ax = ax1,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='Before MAGIC')
scprep.plot.scatter(x=emt_magic[:,'FOXF2'].X, y=emt_magic[:,'FOXO1'].X, c=emt_magic[:,'NOS3'].X, ax=ax2,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='After MAGIC')
plt.axline((0,0), slope=1, color="black", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

## capECs

In [None]:
gg = ["FOXF2","FOXO1","NOS3"]

In [None]:
adata_EC = adata[adata.obs.clusters2 == "ECs_Capillary"]

In [None]:
adata_EC

In [None]:
adata_EC.layers

In [None]:
# adata_EC.X is a bare matrix without gene labels, and assigning `.columns` to it
# does not create a labelled table. AnnData.to_df() returns X as a DataFrame
# indexed by obs_names with var_names as columns.
matrix = adata_EC.to_df()

In [None]:
cutoff_var = None

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata_EC.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
magic_op = magic.MAGIC()

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_EC, genes=gg)

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
np.corrcoef(emt_magic[:,['FOXF2','FOXO1']].X, rowvar = False)[0][1]

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))
scprep.plot.scatter(x=adata_EC[:,'FOXF2'].X, y=adata_EC[:,'FOXO1'].X, c=adata_EC[:,'NOS3'].X, ax = ax1,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='Before MAGIC')
scprep.plot.scatter(x=emt_magic[:,'FOXF2'].X, y=emt_magic[:,'FOXO1'].X, c=emt_magic[:,'NOS3'].X, ax=ax2,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='After MAGIC')
plt.axline((0,0), slope=1, color="black", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

## aECs

In [None]:
gg = ["FOXF2","FOXO1","NOS3"]

In [None]:
adata_EC = adata[adata.obs.clusters2 == "ECs_Arterial"]

In [None]:
adata_EC

In [None]:
adata_EC.layers

In [None]:
# adata_EC.X is a bare matrix without gene labels, and assigning `.columns` to it
# does not create a labelled table. AnnData.to_df() returns X as a DataFrame
# indexed by obs_names with var_names as columns.
matrix = adata_EC.to_df()

In [None]:
cutoff_var = None

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata_EC.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
magic_op = magic.MAGIC()

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_EC, genes=gg)

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
np.corrcoef(emt_magic[:,['FOXF2','FOXO1']].X, rowvar = False)[0][1]

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))
scprep.plot.scatter(x=adata_EC[:,'FOXF2'].X, y=adata_EC[:,'FOXO1'].X, c=adata_EC[:,'NOS3'].X, ax = ax1,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='Before MAGIC')
scprep.plot.scatter(x=emt_magic[:,'FOXF2'].X, y=emt_magic[:,'FOXO1'].X, c=emt_magic[:,'NOS3'].X, ax=ax2,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='After MAGIC')
plt.axline((0,0), slope=1, color="black", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

## vECs

In [None]:
gg = ["FOXF2","FOXO1","NOS3"]

In [None]:
adata_EC = adata[adata.obs.clusters2 == "ECs_Venous"]

In [None]:
adata_EC

In [None]:
adata_EC.layers

In [None]:
# adata_EC.X is a bare matrix without gene labels, and assigning `.columns` to it
# does not create a labelled table. AnnData.to_df() returns X as a DataFrame
# indexed by obs_names with var_names as columns.
matrix = adata_EC.to_df()

In [None]:
cutoff_var = None

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata_EC.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
magic_op = magic.MAGIC()

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_EC, genes=gg)

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
np.corrcoef(emt_magic[:,['FOXF2','FOXO1']].X, rowvar = False)[0][1]

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))
scprep.plot.scatter(x=adata_EC[:,'FOXF2'].X, y=adata_EC[:,'FOXO1'].X, c=adata_EC[:,'NOS3'].X, ax = ax1,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='Before MAGIC')
scprep.plot.scatter(x=emt_magic[:,'FOXF2'].X, y=emt_magic[:,'FOXO1'].X, c=emt_magic[:,'NOS3'].X, ax=ax2,
                    xlabel='FOXF2', ylabel='FOXO1', legend_title="NOS3", title='After MAGIC')
plt.axline((0,0), slope=1, color="black", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

# Session Info

In [None]:
sc.logging.print_versions()