In [None]:
import pandas as pd
import numpy as np
import scipy
import scanpy as sc
from numba import njit
import matplotlib
import matplotlib.pyplot as plt
import re
import statsmodels.stats.multitest
import scvelo as scv
import os
import csv
import bbknn
import tacco as tc

In [None]:
import sys

# Make the helper functions available: the notebook expects to be executed either
# in the sub-workflow directory or in the notebooks directory, so both candidate
# locations are put on the import path temporarily. Because both are inserted at
# position 1, '../workflow/' ends up ahead of '../' in the search order.
for helper_dir in ('../', '../workflow/'):
    sys.path.insert(1, helper_dir)
import helper
# Take the two temporary entries off the import path again.
sys.path.pop(1)
sys.path.pop(1)

# Path lookup for the 'mouse_sc' sub-workflow; used below as get_path(<key>).
get_path = helper.get_paths('mouse_sc')

# Settings

## Visualization settings

In [None]:
# Color lookups from the helper module, one per annotation type.
compartment_colors, labels_colors, cluster_colors, program_colors = (
    helper.get_colors(palette_name)
    for palette_name in ('compartment', 'labels', 'cluster', 'program')
)

In [None]:
# Output folder for figures (the same location as get_path("plots") used when saving below).
figures_folder = get_path("plots")
# Marker size setting; not referenced in the cells shown here.
puck_point_size = 3

In [None]:
# Render all figures at a higher resolution than matplotlib's default.
matplotlib.rcParams.update({'figure.dpi': 216})

# Load mouse data

In [None]:
# Load the scRNA-seq reference data set.
reference = sc.read(f'{get_path("resources")}/scRNAseq.h5ad')

# Build a combined batch label '<Chemistry>_<hashing|regular>' per cell.
chemistry_label = reference.obs['Chemistry'].astype(str)
hashing_label = reference.obs['Hashed'].map({True:'hashing',False:'regular'})
reference.obs['batch'] = chemistry_label + '_' + hashing_label

In [None]:
# Load the splicing data set that accompanies the scRNA-seq reference.
splicing_file = f'{get_path("resources")}/scRNAseq_splicing.h5ad'
splicing = scv.read(splicing_file)

In [None]:
# Cell indices per (State, label) combination, restricted to the cells that are
# also present in the splicing data. Keys are '<normal|notnormal>_<label>'.
is_normal = reference.obs['State'].isin(['normal'])
subsets = {}
for label in ['TNK', 'Epi']:
    has_label = reference.obs['labels'].isin([label])
    for state_name, state_mask in [('normal', is_normal), ('notnormal', ~is_normal)]:
        selected_cells = reference.obs.index[has_label & state_mask]
        subsets[f'{state_name}_{label}'] = selected_cells.intersection(splicing.obs.index)

In [None]:
# Transfer per-cell annotations from the reference to the splicing data;
# the assignment aligns the values on the cell index.
for obs_key in ('batch', 'cluster', 'State'):
    splicing.obs[obs_key] = reference.obs[obs_key]

In [None]:
# Transfer the multi-column annotations as well, reindexed to the cells of the splicing data.
for obsm_key in ('Epithelial_programs',):
    reference_annotation = reference.obsm[obsm_key]
    splicing.obsm[obsm_key] = reference_annotation.reindex(index=splicing.obs.index)

In [None]:
# Run the scvelo velocity workflow separately on every subset.
subset_results = {}
for subset_label in subsets:
    # work on a copy so that the full splicing data stays untouched
    sdata = splicing[subsets[subset_label]].copy()

    # preprocessing
    scv.pp.filter_and_normalize(sdata)
    scv.pp.moments(sdata)

    # velocities and the velocity graph
    scv.tl.velocity(sdata, mode='stochastic')
    scv.tl.velocity_graph(sdata)

    # embedding and cluster-level PAGA
    sc.tl.umap(sdata)
    scv.tl.paga(sdata,groups='cluster')

    subset_results[subset_label] = sdata

In [None]:
# One row per subset; the subset name '<normality>_<case>' is split into its two parts.
subset_meta = pd.DataFrame(index=subsets.keys())
name_parts = subset_meta.index.str.split('_')
subset_meta['normality'] = name_parts.str[0]
subset_meta['case'] = name_parts.str[1]

In [None]:
# Batch-corrected variant of every subset: recompute PCA, build the BBKNN graph
# across the 'batch' annotation and derive a new UMAP from it.
subset_results_bbknn = {}
for subset_label, velocity_result in subset_results.items():
    sdata = velocity_result.copy() # keep the uncorrected result as it is
    sc.tl.pca(sdata)
    sc.external.pp.bbknn(sdata, batch_key='batch')
    # UMAP is not 100% reproducible across different systems
    # (see e.g., https://github.com/scverse/scanpy/issues/2014), so the actual plots
    # can be different, while the general structure should be the same.
    sc.tl.umap(sdata)
    subset_results_bbknn[subset_label] = sdata

In [None]:
# Expose every program of 'Epithelial_programs' as its own obs column, so that it
# can be addressed by name (e.g. as a plotting color).
for normality in subset_meta['normality'].unique():
    for case in subset_meta['case'].unique():
        subset_label = f'{normality}_{case}'
        if subset_label not in subset_meta.index:
            continue # this combination does not exist
        sdata = subset_results_bbknn[subset_label]
        program_table = sdata.obsm['Epithelial_programs']
        for k in program_table.columns:
            sdata.obs[k] = program_table[k]

In [None]:
# Velocity streams on the batch-corrected UMAPs of the epithelial subsets:
# one column per normality value, one row per epithelial program.
case = 'Epi'
normalities = subset_meta['normality'].unique()
# Take the program names from the full splicing data instead of from whatever
# `sdata` happens to be left over from a previously executed cell; all subsets are
# slices of `splicing` and therefore carry the same program columns.
programs = list(splicing.obsm['Epithelial_programs'].columns)
# Artist types which are rasterized to make the plot smaller (and to avoid a bug in the pdf backend)
rasterized_artist_types = (
    matplotlib.collections.LineCollection, # streamlines
    matplotlib.patches.FancyArrowPatch, # streamline arrow tips
    matplotlib.collections.PathCollection, # dots
)

fig,axs = tc.pl.subplots(len(normalities), len(programs), x_padding=3)
for ax_i,normality in enumerate(normalities):
    subset_label = f'{normality}_{case}'
    for ax_j,prog in enumerate(programs):
        ax = axs[ax_j,ax_i]
        if subset_label not in subset_meta.index:
            # no such subset: leave the panel empty
            ax.set_axis_off()
            continue

        sdata = subset_results_bbknn[subset_label]
        # `prog` has to be available as a column of sdata.obs to be used as color
        scv.pl.velocity_embedding_stream(sdata, basis='umap',color=prog,legend_loc='right margin',palette=tc.pl.get_default_colors(4), ax=ax, show=False)
        ax.set_title(f'{subset_label}_{prog}')

        # rasterize streamlines, arrow tips and dots; keep everything else as vector graphics
        for artist in ax.get_children():
            artist.set_rasterized(isinstance(artist, rasterized_artist_types))
# UMAP is not 100% reproducible across different systems (see e.g., https://github.com/scverse/scanpy/issues/2014), so the actual plots can be different, while the general structure should be the same.
fig.savefig(f'{get_path("plots")}/velocity_per_program_and_State_after_batch_correction.pdf',bbox_inches='tight')

In [None]:
# Velocity streams on the batch-corrected UMAPs colored by cluster: one column
# per normality value, one row per case; proliferating cells get an outline.
normalities = subset_meta['normality'].unique()
cases = subset_meta['case'].unique()
# Artist types which are rasterized to make the plot smaller (and to avoid a bug in the pdf backend)
rasterized_artist_types = (
    matplotlib.collections.LineCollection, # streamlines
    matplotlib.patches.FancyArrowPatch, # streamline arrow tips
    matplotlib.collections.PathCollection, # dots
)

fig,axs = tc.pl.subplots(len(normalities), len(cases), x_padding=4)
for ax_i,normality in enumerate(normalities):
    for ax_j,case in enumerate(cases):
        ax = axs[ax_j,ax_i]
        subset_label = f'{normality}_{case}'
        if subset_label not in subset_meta.index:
            # no such subset: leave the panel empty
            ax.set_axis_off()
            continue

        sdata = subset_results_bbknn[subset_label]
        cluster_palette = [cluster_colors[k] for k in sdata.obs['cluster'].cat.categories]
        stream_options = dict(basis='umap', color='cluster', legend_loc='right margin', ax=ax, show=False)
        scv.pl.velocity_embedding_stream(sdata, palette=list(cluster_palette), **stream_options)

        if 'Epithelial_programs' in sdata.obsm:
            program_table = sdata.obsm['Epithelial_programs']
            proliferation_columns = [c for c in program_table.columns if 'Proliferation' in c]
            proliferation_programs = program_table[proliferation_columns].sum(axis=1)
            # a cell counts as proliferating if the proliferation programs make up more than half of its program total
            sdata.obs['Proliferating'] = proliferation_programs > 0.5 * program_table.sum(axis=1)
            add_outline = sdata.obs['Proliferating'].to_numpy()
            # plot again with outline - but keep the previous plot, as the legend would be missing otherwise...
            scv.pl.velocity_embedding_stream(sdata, palette=list(cluster_palette), add_outline=add_outline, **stream_options)

        ax.set_title(subset_label)

        # rasterize streamlines, arrow tips and dots; keep everything else as vector graphics
        for artist in ax.get_children():
            artist.set_rasterized(isinstance(artist, rasterized_artist_types))
# UMAP is not 100% reproducible across different systems (see e.g., https://github.com/scverse/scanpy/issues/2014), so the actual plots can be different, while the general structure should be the same.
fig.savefig(f'{get_path("plots")}/velocity_per_cluster_and_State_after_batch_correction_highlight_proliferation.pdf',bbox_inches='tight')