<font size="+3.8">Scanpy single-cell pre-processing</font>  
<font size="+1.5"></font>  

Aim: Preprocess annotated mouse brain single-cell data from Tabula Muris (Schaum 2018 Nature)

In [None]:
from datetime import date
date.today().strftime('%d/%m/%Y')

In [None]:
import getpass
import os

# Record who ran the notebook. getpass.getuser() resolves the user from the
# environment / password database, whereas os.getlogin() needs a controlling
# terminal and raises OSError in many Jupyter server or batch setups.
getpass.getuser()

In [None]:
import random
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sns
import platform

In [None]:
import anndata
import scanpy as sc
import scipy as sci
sc.settings.verbosity = 3

In [None]:
import utils

In [None]:
# Name of the active conda environment; .get() avoids a KeyError when the
# kernel was not started from a conda environment.
os.environ.get('CONDA_DEFAULT_ENV', 'not set') # conda env

In [None]:
platform.platform()

In [None]:
os.getcwd()

In [None]:
# Root of the shared project drive. The UNC path is a raw string so that its
# backslashes are not parsed as (invalid) escape sequences, and the mount point
# is chosen from the running OS instead of by overwriting the variable.
if platform.system() == 'Windows':
    main_dir = r'\\isdsynnas.srv.med.uni-muenchen.de\BD-Dichgans\SF' # Win
else:
    main_dir = '/Volumes/BD-Dichgans/SF' # Mac

In [None]:
# Study identifiers passed to the gene-expression summary helper further down.
dataset_name, organism = "TabulaMuris2018", "Mouse"

# Load + format data

## Annotated by authors

Partly adapted from https://github.com/theislab/scib-reproducibility/blob/main/notebooks/data_preprocessing/mouse_brain/01_collect_mouse_brain_studies.ipynb

Using FACS-sorted cells, from:

- `Brain_Myeloid-counts.csv`
- `Brain_Non-Myeloid-counts.csv`

Publication: https://www.nature.com/articles/s41586-018-0590-4  
Downloaded from: https://figshare.com/articles/dataset/Single-cell_RNA-seq_data_from_Smart-seq2_sequencing_of_FACS_sorted_cells/5715040

In [None]:
# adata_tamu_myel = sc.read_csv('./../../DL_annotation/mouse_brain_data/Tabula_muris_2018/FACS/Brain_Myeloid-counts.csv').T

In [None]:
# adata_tamu_nonmyel = sc.read_csv('./../../DL_annotation/mouse_brain_data/Tabula_muris_2018/FACS/Brain_Non-Myeloid-counts.csv').T

In [None]:
# The download source differs from the one used by Maren Buettner. I assume Brain_Myeloid-counts.csv corresponds to Brain_Microglia-counts.csv.

In [None]:
# Read the microglia count table and transpose it after loading
# (presumably the CSV stores genes as rows — confirm).
myel_counts_csv = os.path.join(main_dir, 'P06_vasc_scRNAseq', 'TabulaMuris2018', 'FACS', 'Brain_Microglia-counts.csv')
adata_tamu_myel = sc.read_csv(myel_counts_csv).transpose()

In [None]:
# Read the neuron (non-myeloid) count table and transpose it after loading
# (presumably the CSV stores genes as rows — confirm).
nonmyel_counts_csv = os.path.join(main_dir, 'P06_vasc_scRNAseq', 'TabulaMuris2018', 'FACS', 'Brain_Neurons-counts.csv')
adata_tamu_nonmyel = sc.read_csv(nonmyel_counts_csv).transpose()

In [None]:
adata_tamu_myel

In [None]:
adata_tamu_nonmyel

Merge brain dataset objects from Tabula Muris. 

In [None]:
# Stack the myeloid and non-myeloid cells into a single AnnData object.
# NOTE(review): AnnData.concatenate is deprecated in recent anndata releases in
# favour of anndata.concat. The later cells that strip a '-' suffix from the obs
# names and colour the UMAP by 'batch' presumably rely on its defaults — confirm
# before migrating.
adata_tamu = adata_tamu_myel.concatenate(adata_tamu_nonmyel)

In [None]:
adata_tamu

Convert Tabula Muris dataset into a sparse matrix.

In [None]:
# Store the expression matrix in compressed sparse row (CSR) format.
dense_counts = adata_tamu.X
adata_tamu.X = sci.sparse.csr_matrix(dense_counts)

In [None]:
# Split every cell name at '-' into its parts (one list of parts per cell).
test = list(adata_tamu.obs_names.str.split('-'))

In [None]:
# Drop the batch suffix that was appended to the cell names when the two objects
# were merged, so the names match the `cell` column of the annotation file.
# Splitting once from the right removes only the trailing '-<suffix>' and keeps
# cell IDs intact even if they contain hyphens themselves.
adata_tamu.obs_names = adata_tamu.obs_names.str.rsplit('-', n=1).str[0]

Load annotation file.

In [None]:
# Per-cell annotations and plate metadata shipped with the FACS data.
tamu_dir = os.path.join(main_dir, 'P06_vasc_scRNAseq', 'TabulaMuris2018')
anno_tamu = pd.read_csv(os.path.join(tamu_dir, 'annotations_FACS.csv'), low_memory=False)
meta_tamu = pd.read_csv(os.path.join(tamu_dir, 'metadata_FACS.csv'))

In [None]:
anno_tamu.shape

In [None]:
anno_tamu.columns

In [None]:
# Boolean mask over cells: True where the cell ID occurs in the annotation table.
red_idx2 = adata_tamu.obs.index.isin(anno_tamu.cell)

In [None]:
red_idx2.sum()

In [None]:
# Keep only the annotated cells. Slicing an AnnData returns a view; .copy()
# materialises it because new obs columns are written to this object below,
# which would otherwise trigger an implicit copy with a warning.
adata_tamu = adata_tamu[red_idx2, :].copy()

In [None]:
# Boolean mask over annotation rows: True where the annotated cell is present in the data.
red_idx = anno_tamu.cell.isin(adata_tamu.obs.index).to_numpy()

In [None]:
# Restrict the annotation table to the rows selected by the mask.
anno_tamu_red = anno_tamu[red_idx]

In [None]:
# Index the annotation table by cell ID so that the obs assignments below
# align annotation rows to cells by label.
anno_tamu_red.index = anno_tamu_red.cell

In [None]:
# Number of cells per author-assigned ontology class. The Series method replaces
# the deprecated top-level pd.value_counts().
anno_tamu_red.cell_ontology_class.value_counts()

In [None]:
# Copy the authors' ontology class and ID onto the cells (aligned by cell ID)
# and store both as categoricals.
for anno_col in ('cell_ontology_class', 'cell_ontology_id'):
    adata_tamu.obs[anno_col] = anno_tamu_red[anno_col]
    adata_tamu.obs[anno_col] = adata_tamu.obs[anno_col].astype('category')

In [None]:
# adata_tamu.obs['region'] = anno_tamu_red.subtissue
# adata_tamu.obs['region'].loc[pd.isnull(adata_tamu.obs['region'])] ='Unknown'
# adata_tamu.obs['region'] = adata_tamu.obs['region'].astype('category')

In [None]:
# adata_tamu.obs['region'].cat.categories

Rename regions to fit with the other data sets.

In [None]:
# tabula_dict = dict({'Cerebellum' : 'CB', 
#                     'Cortex' : 'CTX', 
#                     'Hippocampus' : 'HC', 
#                     'Striatum' : 'STR'})

In [None]:
# adata_tamu.obs['region'] = adata_tamu.obs['region'].cat.rename_categories(tabula_dict)

In [None]:
# adata_tamu.obs['region'].value_counts()

In [None]:
# Tag every cell with its study of origin.
adata_tamu.obs['study'] = 'Tabula_Muris'

In [None]:
adata_tamu.obs['cell_ontology_class'].cat.categories

Apply own re-ordering of annotation according to cell types of interest:

In [None]:
# # previous from scib:
# adata_tamu.obs['cell_type'] = adata_tamu.obs['cell_ontology_class'].cat.add_categories(['astrocyte'])

# adata_tamu.obs['cell_type'][np.in1d(adata_tamu.obs['cell_ontology_class'], 
#                                      ['Bergmann glial cell', 'astrocyte of the cerebral cortex'])] =  'astrocyte'

# adata_tamu.obs['cell_type'] = adata_tamu.obs['cell_type'].cat.remove_unused_categories()

In [None]:
# Novel:

In [None]:
# Start `cell_type` as a copy of the ontology classes and register the coarse
# labels as additional categories so that they can be assigned afterwards.
new_cell_types = [
    'Oligos', 'Endothelial cells', 'OPCs', 'Neurons', 'Pericytes',
    'Microglia/Macrophages', 'Unknown', 'Neuronal stem cells', 'SMCs', 'Astrocytes',
]
adata_tamu.obs['cell_type'] = adata_tamu.obs['cell_ontology_class'].cat.add_categories(new_cell_types)

In [None]:
# Collapse the authors' ontology classes into the coarse cell types of interest.
# Classes that are not listed keep their original ontology label.
ontology_to_cell_type = {
    'Astrocytes': ['Bergmann glial cell', 'astrocyte of the cerebral cortex'],
    'Oligos': ['oligodendrocyte'],
    'Endothelial cells': ['endothelial cell'],
    'OPCs': ['oligodendrocyte precursor cell'],
    'Neurons': ['neuron'],
    'Pericytes': ['brain pericyte'],
    'Microglia/Macrophages': ['macrophage', 'microglial cell'],
    'Unknown': ['unknown'],
    'Neuronal stem cells': ['neuronal stem cell'],
    'SMCs': ['smooth muscle cell'],
}

for new_label, ontology_classes in ontology_to_cell_type.items():
    is_member = adata_tamu.obs['cell_ontology_class'].isin(ontology_classes)
    # One .loc assignment on the DataFrame itself: the chained form
    # obs['cell_type'][mask] = ... writes to a temporary object under pandas
    # copy-on-write, and np.in1d is deprecated in NumPy 2.
    adata_tamu.obs.loc[is_member, 'cell_type'] = new_label

In [None]:
adata_tamu.obs['cell_type'] = adata_tamu.obs['cell_type'].cat.remove_unused_categories()

In [None]:
adata_tamu.obs['cell_type'].value_counts()

Remove ERCC tags from genes.

In [None]:
# Boolean mask over genes: True for ERCC entries (names starting with 'ERCC').
ercc_idx = np.asarray(adata_tamu.var_names.str.startswith('ERCC'), dtype=bool)

In [None]:
# Keep every gene that is not an ERCC entry; copy so the result is not a view.
adata_tamu = adata_tamu[:, ~ercc_idx].copy()

In [None]:
# From here on the object is referred to as `adata` (same object, no copy is made).
adata=adata_tamu

In [None]:
del adata_tamu

# QC

Not required because already annotated

In [None]:
# Plot the 20 genes that account for the highest fraction of counts per cell.
sc.pl.highest_expr_genes(adata, n_top=20, )

# Normalisation, logarithmization

Check if data is normalized:

In [None]:
adata.layers

In [None]:
# Distribution of the per-cell sum of X, used to check whether the data are
# already normalized. Summing a sparse matrix along axis 1 returns an
# (n_cells, 1) matrix, so it is flattened to a 1-D array before plotting.
# Fix: the original called `sb.histplot`, but seaborn is imported as `sns`.
counts_per_cell = np.asarray(adata.X.sum(axis=1)).ravel()
sns.histplot(counts_per_cell, kde=False)
plt.show()

In [None]:
# Show the expression distribution of 100 random genes (across all cells).
# The seed is fixed so that the same genes are drawn on every Restart & Run All;
# the sample size is capped so that objects with fewer than 100 genes still work.
random.seed(0)
n_random_genes = min(100, adata.n_vars)
random_genes = random.sample(range(adata.n_vars), n_random_genes)
adata_sub = adata[:, random_genes]
exp = pd.DataFrame(adata_sub.X.toarray())
# Set the theme before plotting so that it applies to this figure as well.
sns.set_theme(style='white')
# One KDE line per gene, all drawn in the same colour.
pl1 = sns.displot(data=pd.melt(exp), x='value', height=4, hue='variable', kind="kde",
                  warn_singular=False, legend=False,
                  palette=['#086da6'] * n_random_genes, lw=0.3) # genes with 0 expression are excluded
pl1.set(xlim=(-0.5, 7), ylim=(0, 0.007));

In [None]:
# Pooled histogram of the expression values of the sampled genes.
# `palette` has no effect without a `hue` variable (seaborn ignores it with a
# warning), so the intended colour is passed via `color` instead.
sns.set_theme(style='white', rc={'figure.figsize': (4, 4)})
pl = sns.histplot(data=pd.melt(exp), x='value', binwidth=0.5, color='#086da6')
pl.set(xlim=(0, 20), ylim=(0, 1e5));

In [None]:
exp

In [None]:
# Keep the un-normalized matrix in a layer before X is changed by the normalization below.
adata.layers["counts"] = adata.X.copy()

In [None]:
# Normalize the total counts per cell, modifying adata in place.
# NOTE(review): no target_sum is passed, so scanpy's default target applies
# (per the scanpy docs the median of total counts per cell) — confirm this is intended.
sc.pp.normalize_total(adata, inplace=True)

In [None]:
# Expression distribution of the same 100 random genes (across all cells),
# now after total-count normalization.
sns.set_theme(style='white')
adata_sub = adata[:, random_genes]
exp = pd.DataFrame(adata_sub.X.todense())
kde_palette = ['#086da6'] * 100  # one identical colour per gene
pl = sns.displot(
    data=pd.melt(exp), x='value', hue='variable', kind="kde",
    height=4, lw=0.3, warn_singular=False, legend=False, palette=kde_palette,
)  # genes with 0 expression are excluded
pl.set(xlim=(-0.25, 3.5), ylim=(0, 0.005))

In [None]:
# Pooled histogram of the sampled genes after normalization.
# `palette` has no effect without a `hue` variable (seaborn ignores it with a
# warning), so the intended colour is passed via `color` instead.
sns.set_theme(style='white')
pl = sns.histplot(data=pd.melt(exp), x='value', binwidth=0.5, color='#086da6')
pl.set(xlim=(0, 20), ylim=(0, 1e5));

In [None]:
# Log-transform the normalized values.
sc.pp.log1p(adata) # X = log(X + 1)

In [None]:
# Distribution of the per-cell sum of X after normalization and log-transform.
# Summing a sparse matrix along axis 1 returns an (n_cells, 1) matrix, so it is
# flattened to a 1-D array before plotting.
# Fix: the original called `sb.histplot`, but seaborn is imported as `sns`.
log_sum_per_cell = np.asarray(adata.X.sum(axis=1)).ravel()
sns.histplot(log_sum_per_cell, kde=False)
plt.show()

In [None]:
# Expression distribution of the same 100 random genes (across all cells),
# now after normalization and log-transform.
sns.set_theme(style='white')
adata_sub = adata[:, random_genes]
exp = pd.DataFrame(adata_sub.X.todense())
kde_palette = ['#086da6'] * 100  # one identical colour per gene
pl = sns.displot(
    data=pd.melt(exp), x='value', hue='variable', kind="kde",
    height=4, lw=0.5, warn_singular=False, legend=False, palette=kde_palette,
)  # genes with 0 expression are excluded
pl.set(xlim=(-0.25, 3.5), ylim=(0, 0.005));

In [None]:
# Pooled histogram of the sampled genes after normalization and log-transform.
# `palette` has no effect without a `hue` variable (seaborn ignores it with a
# warning), so the intended colour is passed via `color` instead.
sns.set_theme(style='white')
pl = sns.histplot(data=pd.melt(exp), x='value', binwidth=0.5, color='#086da6');
pl.set(xlim=(0, 20), ylim=(0, 1e5));

In [None]:
# Save the normalized + log-transformed (but unscaled) values in a layer;
# retrieve them later via adata.X = adata.layers["normalized"].
adata.layers["normalized"] = adata.X.copy()

In [None]:
# Identify highly variable genes with scanpy's default settings and plot the result.
sc.pp.highly_variable_genes(adata)
sc.pl.highly_variable_genes(adata)

In [None]:
adata

In [None]:
adata.layers

# Dim Reduction

In [None]:
# Compute the PCA first; only the plot needs the temporary figure size.
sc.tl.pca(adata, svd_solver='arpack')
with rc_context({'figure.figsize': (8, 8)}):
    sc.pl.pca(adata, color='Foxf2')

In [None]:
sc.pl.pca_variance_ratio(adata, log=True)

In [None]:
# Build the neighbourhood graph of cells with scanpy's default settings.
sc.pp.neighbors(adata)

In [None]:
# Compute the UMAP embedding with scanpy's default settings.
sc.tl.umap(adata)

In [None]:
list(adata.obs.columns)

In [None]:
# UMAP coloured by cell type and by the `batch` obs column, side by side.
umap_groupings = ['cell_type', 'batch']
with rc_context({'figure.figsize': (9, 9)}):
    sc.pl.umap(adata, color=umap_groupings, wspace=0.3, size=30)

In [None]:
# UMAP coloured by cell type, with the labels drawn on the clusters.
rcParams['figure.figsize'] = [12, 8] # set plot sizes
sc.pl.umap(
    adata,
    color=['cell_type'],
    legend_loc='on data',
    title='',
    legend_fontweight='normal',
    legend_fontoutline=4,
    legend_fontsize=11,
    size=30,
)

In [None]:
# UMAP coloured by cell type, with the default legend placement.
rcParams['figure.figsize'] = [12, 8] # set plot sizes
sc.pl.umap(
    adata,
    color=['cell_type'],
    title='',
    legend_fontweight='normal',
    legend_fontoutline=4,
    legend_fontsize=11,
    size=30,
)

Note: SMC cluster is very small

In [None]:
adata.obs['cell_type'].value_counts()

# Cell annotation

Verify annotation from authors

Manual marker gene selection

|Vascular     |EC         |Pericytes|SMCs   |Fibroblasts|Oligos|OPCs         |Ependymal|Neurons    |immature/migrating Neurons|Astrocytes|Microglia|Immune (broad/hematopoietic)|Macrophages     |Macrophages/Microglia|Monocytes|Monocytes/B-cells|Granulocytes|B-cells|T/NK cells|
|---          |---        |---      |---    |---        |---   |---          |---      |---        |---   |---       |---      |---                        |---             |---|---|---|---|---|---|
|PDGFRA =CD140A|CLDN5      |VTN      |ACTA2  |DCN        |MBP   |CSPG4 =NG2    |PIFO     |RBFOX3 =NEUN|DCX   |AQP4      |AIF1     |PTPRC =CD45                 |CD14            |TREM2|CCR2|CD74|CD16/32|CD19|CD4|
|MCAM =CD146   |PECAM1 =CD31|PDGFRB   |MYOCD  |COL6A1     |ENPP2 |PDGFRA =CD140A|FOXJ1    |TUBB3      ||          |         |                           |ITGB2 =CD18 =CD11B||||ITGB2 =CD18 =CD11B||CD8A|
|FOXF2        |           |         |       |COL3A1     |      |             |DYNLRB2  |           ||          |         |                           |CD86            ||||CD15||CD8B|
|             |           |         |       |           |      |             |MEIG1    |           ||          |         |                           |ADGRE1 =F4/80    ||||||IL2RB|
||||||||||||||||||||IFNG|

In [None]:
# Plot the manually selected marker genes on the UMAP.
plt.rcParams['figure.figsize'] = [8, 6] # set plot sizes
marker_genes = ["Pdgfra", "Mcam","Foxf2", "Pecam1", "Cldn5","Vtn","Kcnj8","Atp13a5","Pdgfrb","Acta2","Tagln","Myocd","Dcn", "Col6a1", "Mbp","Enpp2","Cspg4","Pifo","Foxj1","Dynlrb2","Meig1","Rbfox3","Tubb3","Dcx","Aqp4", "Aif1", "Ptprc", "Ccr2","Adgre1","Itgb2","Cd14","Cd86","Trem2","Vcan","Cd4","Cd19", "Cd8a","Il2rb","Cd244", "Cd74","Cd68","Ifng","Ptgdr2","Ccr3"]
# Build the gene lookup once; the original rebuilt list(adata.var_names) for
# every marker and scanned it linearly.
available_genes = set(adata.var_names)
marker_genes = [x for x in marker_genes if x in available_genes] # remove those not in adata.var_names
sc.pl.umap(adata, ncols=3, color=marker_genes, size=50)

In [None]:
# Run a DE test per cell type (Wilcoxon) to check the annotation; results are
# stored under the 'wilcoxon' key and the top 20 genes per group are plotted.
sc.tl.rank_genes_groups(adata, 'cell_type', method='wilcoxon', key_added='wilcoxon')
sc.pl.rank_genes_groups(adata, n_genes=20, sharey=False, ncols=3, fontsize=13, key='wilcoxon')

In [None]:
# Dotplot of the top 6 Wilcoxon hits per cell type; the dendrogram over cell
# types is computed first.
sc.tl.dendrogram(adata, groupby="cell_type")
sc.pl.rank_genes_groups_dotplot(adata, n_genes=6, key="wilcoxon", groupby="cell_type");

Own note: The SMC and pericyte annotations appear to be swapped. The pericyte markers Vtn, Atp13a5 and Pdgfrb are among the top hits of the SMC cluster, whereas SMC marker genes make up the top hits of the pericyte cluster. Therefore, switch the two annotations:


In [None]:
# Swap the 'Pericytes' and 'SMCs' labels: cells the authors annotated as
# 'brain pericyte' become 'SMCs' and 'smooth muscle cell' cells become 'Pericytes'.
# All other cell types already carry their final label from the first mapping,
# so only these two are re-assigned (directly from the unchanged ontology class).

# Make sure both target labels are valid categories; one of them may have been
# dropped as unused, and assigning a missing category raises an error.
for required_label in ('SMCs', 'Pericytes'):
    if required_label not in adata.obs['cell_type'].cat.categories:
        adata.obs['cell_type'] = adata.obs['cell_type'].cat.add_categories([required_label])

swapped_labels = {
    'brain pericyte': 'SMCs',
    'smooth muscle cell': 'Pericytes',
}
for ontology_class, new_label in swapped_labels.items():
    is_member = adata.obs['cell_ontology_class'].isin([ontology_class])
    # One .loc assignment instead of chained indexing, which writes to a
    # temporary object under pandas copy-on-write.
    adata.obs.loc[is_member, 'cell_type'] = new_label

In [None]:
adata.obs['cell_type'] = adata.obs['cell_type'].cat.remove_unused_categories()

In [None]:
adata.obs['cell_type'].value_counts()

In [None]:
# UMAP coloured by the corrected cell types, with the labels drawn on the clusters.
rcParams['figure.figsize'] = [12, 8] # set plot sizes
sc.pl.umap(
    adata,
    color=['cell_type'],
    legend_loc='on data',
    title='',
    legend_fontweight='normal',
    legend_fontoutline=2,
    legend_fontsize=8,
    size=30,
)

In [None]:
# Count the cells per cell type; the next cell uses these counts to exclude
# clusters with <50 cells.
cluster_counts = adata.obs['cell_type'].value_counts()
cluster_counts

In [None]:
# Keep only the cell types with at least MIN_CELLS_PER_CLUSTER cells.
# .copy() materialises the subset: a new obs column is written to it right
# afterwards, which on a view would trigger an implicit copy with a warning.
MIN_CELLS_PER_CLUSTER = 50
large_clusters = cluster_counts[cluster_counts >= MIN_CELLS_PER_CLUSTER].index
adata = adata[adata.obs['cell_type'].isin(large_clusters)].copy()

In [None]:
# Expose the cell types under the column name `clusters`, which the plots and
# summaries below group by.
adata.obs['clusters'] = adata.obs['cell_type']

# Focus on: Foxf2

In [None]:
gene="Foxf2"

In [None]:
# UMAP coloured by cluster and by expression of the gene of interest, taken
# from the "normalized" layer.
with rc_context({'figure.figsize': (7,7)}):
    sc.pl.umap(adata, color=['clusters',gene], legend_loc='on data', title='', legend_fontweight='normal', legend_fontoutline=2, legend_fontsize=10, size=35, layer="normalized")

In [None]:
# Matrix plot of the gene of interest per cluster, scaled per gene.
sc.pl.matrixplot(
    adata,
    [gene],
    groupby='clusters',
    swap_axes=False,
    figsize=(2, 5),
    standard_scale="var",
    layer="normalized",
)

In [None]:
# Dot plot of the gene of interest per cluster, scaled per gene.
sc.pl.dotplot(
    adata,
    [gene],
    groupby='clusters',
    swap_axes=False,
    figsize=(2, 5),
    standard_scale="var",
    layer="normalized",
)

In [None]:
# Summarize expression of the gene of interest per cluster via the project
# helper and export the result to the shared drive.
foxf2_output_dir = os.path.join(main_dir, "P06_Foxf2_per_celltype", "Foxf2_summarized")
utils.summarize_gene_expression(
    adata=adata,
    gene=gene,
    groupby="clusters",
    study_name=dataset_name,
    organism=organism,
    export=True,
    output_dir=foxf2_output_dir,
)

# Focus on: Other genes

In [None]:
# Additional genes to summarize per cluster.
target_genes = [
    "Foxo1", "Tek", "Nos3", "Htra1", "Egfl8", "Flt1",
    "Kdr", "Ptprb", "Nrp1", "Nrp2", "Efnb2", "Itgb1",
    "Itga6", "Angpt2", "Cdh5", "Cldn5", "Ocln", "Ctnnb1",
]

In [None]:
# Summarize and export every target gene; results are keyed by gene name.
# The loop variable is deliberately not called `gene`, so the notebook-level
# `gene` variable keeps its value.
other_genes_dir = os.path.join(main_dir, "P06_Foxf2_per_celltype", "Other_genes_summarized")
other_genes_results = {}
for target_gene in target_genes:
    other_genes_results[target_gene] = utils.summarize_gene_expression(
        adata, target_gene, groupby="clusters", study_name=dataset_name, organism=organism,
        output_dir=other_genes_dir, export=True,
    )

In [None]:
# some plots

In [None]:
# Matrix plot of the first target gene per cluster, scaled per gene.
sc.pl.matrixplot(
    adata,
    [target_genes[0]],
    groupby='clusters',
    swap_axes=False,
    figsize=(2, 5),
    standard_scale="var",
    layer="normalized",
)

In [None]:
# Dot plot of the first target gene per cluster, scaled per gene.
sc.pl.dotplot(
    adata,
    [target_genes[0]],
    groupby='clusters',
    swap_axes=False,
    figsize=(2, 5),
    standard_scale="var",
    layer="normalized",
)

In [None]:
# Matrix plot of the second target gene per cluster, scaled per gene.
sc.pl.matrixplot(
    adata,
    [target_genes[1]],
    groupby='clusters',
    swap_axes=False,
    figsize=(2, 5),
    standard_scale="var",
    layer="normalized",
)

In [None]:
# Dot plot of the second target gene per cluster, scaled per gene.
sc.pl.dotplot(
    adata,
    [target_genes[1]],
    groupby='clusters',
    swap_axes=False,
    figsize=(2, 5),
    standard_scale="var",
    layer="normalized",
)

# Correlate gene expression (Foxf2 and Foxo1)

Using MAGIC denoising

In [None]:
import magic
import scprep

In [None]:
sns.set_theme(style='white')

## ECs

In [None]:
# Genes of interest for the correlation analysis: Foxf2 and Foxo1 are
# correlated below, Nos3 is used to colour the scatter plots.
gg = ["Foxf2","Foxo1","Nos3"]

In [None]:
# Subset to the endothelial cells.
is_endothelial = adata.obs['cell_type'] == "Endothelial cells"
adata_EC = adata[is_endothelial]

In [None]:
adata_EC

In [None]:
adata_EC.layers

In [None]:
# Expression matrix of the subset as a labelled *sparse* DataFrame (cells x genes).
# pd.DataFrame(adata_EC.X) does not work on a sparse matrix, and assigning
# `.columns` on a SciPy sparse matrix only sets an unused attribute, so the gene
# names were never attached. from_spmatrix keeps the data sparse and labels both axes.
matrix = pd.DataFrame.sparse.from_spmatrix(
    adata_EC.X,
    index=adata_EC.obs_names,
    columns=adata_EC.var_names,
)

In [None]:
cutoff_var = None

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata_EC.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
# MAGIC operator with default parameters. The seed is fixed so that the
# denoised values are reproducible across runs.
magic_op = magic.MAGIC(random_state=0)

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_EC, genes=gg)  # gg holds Foxf2, Foxo1 and Nos3

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
# Pearson correlation between the MAGIC-denoised Foxf2 and Foxo1 values.
magic_pair = emt_magic[:, ['Foxf2', 'Foxo1']].X
np.corrcoef(magic_pair, rowvar=False)[0, 1]

In [None]:
# Foxf2 vs Foxo1 coloured by Nos3: log-normalized values before MAGIC (left
# panel) and MAGIC output (right panel).
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))
scprep.plot.scatter(x=adata_EC[:,'Foxf2'].X.todense(), y=adata_EC[:,'Foxo1'].X.todense(), c=adata_EC[:,'Nos3'].X.todense(), ax = ax1,
                    xlabel='Foxf2', ylabel='Foxo1', legend_title="Nos3", title='Before MAGIC')
scprep.plot.scatter(x=emt_magic[:,'Foxf2'].X, y=emt_magic[:,'Foxo1'].X, c=emt_magic[:,'Nos3'].X, ax=ax2,
                    xlabel='Foxf2', ylabel='Foxo1', legend_title="Nos3", title='After MAGIC')
# Dashed identity line (slope 1 through (1, 1)), drawn on the current axes.
# NOTE(review): presumably this lands on the right-hand panel — confirm, or call ax2.axline explicitly.
plt.axline((1,1), slope=1, color="black", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

## PCs

In [None]:
# no pericytes in dataset

## SMCs

In [None]:
gg = ["Foxf2","Foxo1","Nos3"]

In [None]:
# Subset to the SMCs. NOTE: the variable keeps the name `adata_EC` from the
# section above although it now holds smooth muscle cells.
is_smc = adata.obs['cell_type'] == "SMCs"
adata_EC = adata[is_smc]

In [None]:
adata_EC

In [None]:
adata_EC.layers

In [None]:
# Expression matrix of the subset as a labelled *sparse* DataFrame (cells x genes).
# pd.DataFrame(adata_EC.X) does not work on a sparse matrix, and assigning
# `.columns` on a SciPy sparse matrix only sets an unused attribute, so the gene
# names were never attached. from_spmatrix keeps the data sparse and labels both axes.
matrix = pd.DataFrame.sparse.from_spmatrix(
    adata_EC.X,
    index=adata_EC.obs_names,
    columns=adata_EC.var_names,
)

In [None]:
cutoff_var = None

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata_EC.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
# MAGIC operator with default parameters. The seed is fixed so that the
# denoised values are reproducible across runs.
magic_op = magic.MAGIC(random_state=0)

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata_EC, genes=gg)  # gg holds Foxf2, Foxo1 and Nos3

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
# Pearson correlation between the MAGIC-denoised Foxf2 and Foxo1 values.
magic_pair = emt_magic[:, ['Foxf2', 'Foxo1']].X
np.corrcoef(magic_pair, rowvar=False)[0, 1]

In [None]:
# Foxf2 vs Foxo1 coloured by Nos3 for the SMC subset (held in `adata_EC`):
# log-normalized values before MAGIC (left panel) and MAGIC output (right panel).
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))
scprep.plot.scatter(x=adata_EC[:,'Foxf2'].X.todense(), y=adata_EC[:,'Foxo1'].X.todense(), c=adata_EC[:,'Nos3'].X.todense(), ax = ax1,
                    xlabel='Foxf2', ylabel='Foxo1', legend_title="Nos3", title='Before MAGIC')
scprep.plot.scatter(x=emt_magic[:,'Foxf2'].X, y=emt_magic[:,'Foxo1'].X, c=emt_magic[:,'Nos3'].X, ax=ax2,
                    xlabel='Foxf2', ylabel='Foxo1', legend_title="Nos3", title='After MAGIC')
# Dashed identity line (slope 1 through (1, 1)), drawn on the current axes.
# NOTE(review): presumably this lands on the right-hand panel — confirm, or call ax2.axline explicitly.
plt.axline((1,1), slope=1, color="black", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

## All cell types

In [None]:
adata

In [None]:
# Expression matrix of all cells, used for the library-size plot below.
#matrix = pd.DataFrame(adata.X) # not compatible with sparse 
matrix = adata.X
# NOTE(review): `ad_merged` is not defined in the visible cells; the line stays disabled.
#matrix.columns = ad_merged.var.index.tolist()

In [None]:
cutoff_var = 700

In [None]:
scprep.plot.plot_library_size(matrix, cutoff=cutoff_var)

In [None]:
# filter lowly expressed genes and cells with a small library size
#matrix = scprep.filter.filter_library_size(matrix, cutoff=cutoff_var)
#matrix.head()

Note: Skipped normalization as data is already log-normalized

In [None]:
adata.layers

### Creating the MAGIC operator
If you don't specify parameters, MAGIC creates an operator with the following default values: knn=5, knn_max = 3 * knn, decay=1, t=3.

In [None]:
# MAGIC operator with default parameters. The seed is fixed so that the
# denoised values are reproducible across runs.
magic_op = magic.MAGIC(random_state=0)

### Running MAGIC with gene selection
The magic_op.fit_transform function takes the normalized data and an array of selected genes as its arguments. If no genes are provided, MAGIC will return a matrix of all genes. The same can be achieved by substituting the array of gene names with genes='all_genes'.

In [None]:
%%time
emt_magic = magic_op.fit_transform(adata, genes=gg)  # gg holds Foxf2, Foxo1 and Nos3

### Visualizing gene-gene relationships

We can see gene-gene relationships much more clearly after applying MAGIC. Note that the change in absolute values of gene expression is not meaningful - the relative difference is all that matters.

In [None]:
# Pearson correlation between the MAGIC-denoised Foxf2 and Foxo1 values.
magic_pair = emt_magic[:, ['Foxf2', 'Foxo1']].X
np.corrcoef(magic_pair, rowvar=False)[0, 1]

In [None]:
# Foxf2 vs Foxo1 coloured by Nos3 across all cells: log-normalized values
# before MAGIC (left panel) and MAGIC output (right panel).
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 6))
scprep.plot.scatter(x=adata[:,'Foxf2'].X.todense(), y=adata[:,'Foxo1'].X.todense(), c=adata[:,'Nos3'].X.todense(), ax = ax1,
                    xlabel='Foxf2', ylabel='Foxo1', legend_title="Nos3", title='Before MAGIC')
scprep.plot.scatter(x=emt_magic[:,'Foxf2'].X, y=emt_magic[:,'Foxo1'].X, c=emt_magic[:,'Nos3'].X, ax=ax2,
                    xlabel='Foxf2', ylabel='Foxo1', legend_title="Nos3", title='After MAGIC')
# Dashed identity line (slope 1 through (1, 1)), drawn on the current axes.
# NOTE(review): presumably this lands on the right-hand panel — confirm, or call ax2.axline explicitly.
plt.axline((1,1), slope=1, color="black", alpha=0.3, linestyle="--")
plt.tight_layout()
plt.show()

# Session Info

In [None]:
# Print the versions of the packages in use, for reproducibility.
sc.logging.print_versions()