In [1]:
import velvet as vt

# general packages
import numpy as np
import pandas as pd
import torch
from scipy.sparse import issparse

# velocity packages
import scanpy as sc
import scvelo as scv
import anndata as ann

# plotting packages
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm, trange
from IPython.display import clear_output

# color palette object
from colors import colorpalette as colpal

Global seed set to 0
  new_rank_zero_deprecation(
  return new_rank_zero_deprecation(*args, **kwargs)


In [2]:
# we implement unitvelo's evaluation 
# originally from https://github.com/StatBiomed/UniTVelo/blob/main/unitvelo/eval_utils.py
# paper: https://www.nature.com/articles/s41467-022-34188-7
# authors: Mingze Gao, Chen Qiao & Yuanhua Huang 

from eval_functions import unitvelo_cross_boundary_correctness as cross_boundary_correctness
from  eval_functions import unitvelo_inner_cluster_coh as inner_cluster_coh

In [3]:
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
def _fit_velvet_sde(adata, x_layer, n_layer):
    """Train Velvet, then VelvetSDE, on `adata` and return gene-space velocities.

    Shared implementation behind `velvetSDE_pipeline` and
    `velvetSDE_pipeline_with_smoothing`; the two differ only in which layers
    they feed to the model.

    Parameters
    ----------
    adata
        AnnData object that will be modified in place (callers pass a copy).
        Must contain the layers named by `x_layer` and `n_layer`.
    x_layer
        Name of the layer registered as `x_layer` for both the Velvet and the
        VelvetSDE setup.
    n_layer
        Name of the layer registered as `n_layer` for the Velvet setup.

    Returns
    -------
    numpy.ndarray
        The `'vel'` output of the model's generative pass, with NaN and
        +/-inf entries replaced by 0.
    """
    vt.pp.neighborhood(adata, n_neighbors=100)

    vt.ut.set_seed(0)

    vt.md.Velvet.setup_anndata(adata, x_layer=x_layer, n_layer=n_layer, knn_layer='knn_index')

    model = vt.md.Velvet(
        adata,
        n_latent = 50,
        linear_decoder = True,
        neighborhood_space="latent_space",
        biophysical_model = "full",
        gamma_mode = "learned",
        labelling_time = 2.0,
    )

    model.setup_model()

    model.train(
        batch_size = adata.shape[0],  # full-batch training
        max_epochs = 1000,
        freeze_vae_after_epochs = 200,
        constrain_vf_after_epochs = 200,
        lr=0.001,
    )

    # Choose the device once and use it everywhere below. The original code
    # hard-coded 'cuda' here and therefore crashed on CPU-only hosts even
    # though the final inference step already had a CPU fallback.
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.module = model.module.to(torch_device)

    model.get_latent_dynamics(return_data=False)

    mp = vt.sb.MarkovProcess(
        model,
        n_neighbors=10,
        use_space='latent_space',
        use_spline=True,
        use_similarity=False,
    )

    sde = vt.sb.SDE(
        model.module.n_latent,
        prior_vectorfield=model.module.vf,
        noise_scalar=0.15,
        device=model.device
    )

    # VelvetSDE is set up with an explicit per-cell integer index column.
    model.adata.obs['index'] = np.arange(model.adata.shape[0])
    vt.sm.VelvetSDE.setup_anndata(
        model,
        x_layer=x_layer,
        t_key='t',
        index_key='index'
    )

    sde_model = vt.sm.VelvetSDE(
        model,
        sde,
        mp,
    )

    sde_model.train(
        max_epochs = 250,
        n_trajectories = 200,
        n_simulations = 50,
        n_steps = 30,
        n_markov_steps=15,
        t_max=25,
        dt = 1.0,
        lr = 0.001,
    )

    X = model.adata_manager.get_from_registry("X")
    # `.toarray()` instead of `.A`: the `.A` shortcut is deprecated and no
    # longer available in recent SciPy releases.
    X = X.toarray() if issparse(X) else X

    x = torch.tensor(X, device=torch_device)
    b = torch.zeros(X.shape[0], device=torch_device)
    # Re-assert the device in case SDE training moved the module.
    model.module.to(torch_device)
    with torch.no_grad():
        inf = model.module.inference(x, b) # model and sde_model have same VAE
        z = inf['z']
        # Velocity in latent space comes from the trained SDE drift.
        vz = sde_model.module.sde.drift(z)
        gen = model.module.generative(
                z,
                vz,
                inf['library'],
                b
        )
    # `.numpy()` always yields a dense ndarray, so no sparse handling is needed.
    V = gen['vel'].detach().cpu().numpy()
    V = np.nan_to_num(V, nan=0, neginf=0, posinf=0)
    return V


def velvetSDE_pipeline(adata0, name):
    """Run the velvetSDE pipeline on the raw 'total' / 'new' layers.

    Parameters
    ----------
    adata0
        Input AnnData; it is copied, so the caller's object is not modified.
    name
        Unused here; kept so the function matches the `func(adata, name)`
        call made by `prepare_for_test`.

    Returns
    -------
    numpy.ndarray
        Velocity matrix with non-finite values replaced by 0.
    """
    adata = adata0.copy()
    return _fit_velvet_sde(adata, x_layer='total', n_layer='new')


def velvetSDE_pipeline_with_smoothing(adata0, name):
    """Run the velvetSDE pipeline on neighbourhood-smoothed layers.

    The 'total' and 'new' layers are first passed through `vt.pp.moments`
    using connectivities computed with 30 neighbours, stored as
    'total_smooth' / 'new_smooth', and the model is then fitted on those.

    Parameters
    ----------
    adata0
        Input AnnData; it is copied, so the caller's object is not modified.
    name
        Unused here; kept so the function matches the `func(adata, name)`
        call made by `prepare_for_test`.

    Returns
    -------
    numpy.ndarray
        Velocity matrix with non-finite values replaced by 0.
    """
    adata = adata0.copy()

    smoothing_cnx = vt.pp.connectivities(adata, n_neighbors=30)

    adata.layers['total_smooth'] = vt.pp.moments(
        X=adata.layers['total'],
        cnx=smoothing_cnx
    )

    adata.layers['new_smooth'] = vt.pp.moments(
        X=adata.layers['new'],
        cnx=smoothing_cnx
    )

    return _fit_velvet_sde(adata, x_layer='total_smooth', n_layer='new_smooth')

In [5]:
## functions used in preparing data for benchmarking

def project_to_pca(adata):
    """Project the velocity layer of `adata` into log-space PCA coordinates.

    A PCA is fitted on log1p of the 'total' layer. The extrapolated state
    `total + velocity` (clipped to [0, 1000]) is log1p-transformed and
    projected with the same PCA; the returned velocity is the difference
    between the projected extrapolated state and the projected current state.

    Parameters
    ----------
    adata
        Object exposing `layers['total']` and `layers['velocity']`, each
        either a dense array or a SciPy sparse matrix.

    Returns
    -------
    numpy.ndarray
        One row per cell, one column per principal component.
    """
    X = adata.layers['total']
    V = adata.layers['velocity']

    # `.toarray()` instead of `.A`: the `.A` shortcut is deprecated and no
    # longer available in recent SciPy releases.
    X = np.array(X.toarray() if issparse(X) else X)
    V = np.array(V.toarray() if issparse(V) else V)
    V = np.nan_to_num(V, nan=0, neginf=0, posinf=0)
    # Extrapolated state; clipping keeps log1p defined (no negative values)
    # and caps the upper end at 1000.
    Y = np.clip(X + V, 0, 1000)

    Xlog = np.log1p(X)
    pca = PCA()
    Xpca = pca.fit_transform(Xlog)

    # Reuse the PCA fitted on the current state so both live in one basis.
    Ylog = np.log1p(Y)
    Ypca = pca.transform(Ylog)
    V = Ypca - Xpca
    return V

def prepare_for_test(
    adata,
    name,
    func,
    ndims=50,
    pt=True,
):
    """Build the AnnData object the benchmark metrics are evaluated on.

    `func(adata, name)` is called to obtain a velocity matrix, which is
    stored as the 'velocity' layer of a new AnnData sharing X, obs, var and
    the 'total' layer with the input. The first `ndims` columns of the PCA
    embedding, of the baselines and of the PCA-projected velocity are stored
    in `.obsm`, and `scv.pp.neighbors` is run on the result.

    Parameters
    ----------
    adata
        Source AnnData with `obsm['X_pca']`, `obsm['velocity_cr_pca']`,
        `layers['total']` and, when `pt` is true, `obsm['velocity_pst']`.
    name
        Dataset name, forwarded to `func`.
    func
        Velocity pipeline called as `func(adata, name)`.
    ndims
        Number of leading embedding columns to keep.
    pt
        Whether a pseudotime baseline is available in `adata`.

    Returns
    -------
    The newly assembled AnnData object.
    """
    pca_coords = adata.obsm['X_pca']
    estimated_velocity = func(adata, name)

    test_layers = {
        'total': adata.layers['total'],
        'velocity': estimated_velocity,
    }
    test = ann.AnnData(X=adata.X, obs=adata.obs, var=adata.var, layers=test_layers)

    test.obsm['X_pca'] = pca_coords[:, :ndims]
    test.obsm['cellrank_baseline'] = adata.obsm['velocity_cr_pca'][:, :ndims]

    if not pt:
        # Placeholder only: an all-zero baseline yields a meaningless
        # comparison, but the resulting scores are never saved. It exists for
        # the maxi dataset, which has no good pseudotime trajectory skeleton.
        test.obsm['pseudotime_baseline'] = np.zeros_like(test.obsm['cellrank_baseline'])
    else:
        test.obsm['pseudotime_baseline'] = adata.obsm['velocity_pst'][:, :ndims]

    test.obsm['velocity_pca'] = project_to_pca(test)[:, :ndims]

    scv.pp.neighbors(test)
    return test


In [6]:
# the object that will contain the data and data-specific parameters for benchmarking

class BenchMarkingData:
    """Benchmark dataset together with its dataset-specific evaluation settings.

    Attributes set on construction:
      name           -- dataset name, also the stem of the .h5ad file that is read
      adata          -- result of `prepare_for_test` for this dataset
      obs            -- name of the obs column holding the cluster labels
      cluster_edges  -- list of (cluster, cluster) label pairs
                        (presumably source -> target transitions; confirm
                        against the evaluation functions that consume them)
    """

    # dataset name -> (obs column with cluster labels, cluster label pairs)
    _CLUSTER_CONFIG = {
        "mini_V3": ('leiden', [
            ('5','14'),
            ('14','8'),
            ('8','21'),
        ]),
        "mini_MN": ('leiden', [
            ('16','15'),
            ('20','23'),
            ('13','18'),
        ]),
        "mini_MD": ('leiden', [
            ('9','12'),
            ('25','4'),
            ('4','6'),
            ('6','22'),
        ]),
        "midi_NM": ('cell_annotation', [
            ('Early_Neural','Neural'),
            ('NMP','Early_Neural'),
            ('NMP','Mesoderm'),
        ]),
        "midi_Ne": ('cell_annotation', [
            ('Neural','pMN'),
            ('pMN','MN'),
            ('pMN','p3'),
            ('p3','V3'),
        ]),
        "maxi": ('cell_annotation', [
            ('Early_Neural','Neural'),
            ('NMP','Early_Neural'),
            ('NMP','Mesoderm'),
            ('Neural','pMN'),
            ('pMN','MN'),
            ('pMN','p3'),
            ('p3','V3'),
        ]),
    }

    def __init__(self, name, func, pt=True):
        """Load dataset `name` and prepare it for benchmarking.

        Parameters
        ----------
        name
            Dataset name; must be a key of `_CLUSTER_CONFIG`.
        func
            Velocity pipeline, forwarded to `prepare_for_test`.
        pt
            Forwarded to `prepare_for_test`.

        Raises
        ------
        ValueError
            If `name` has no cluster configuration.
        """
        self.name = name
        # Resolve the cluster settings first so that an unknown dataset name
        # fails immediately instead of after loading data and running `func`.
        self._assign_cluster_edges()

        adata = sc.read_h5ad(f'../data/benchmarking_data/{name}.h5ad')

        self.adata = prepare_for_test(
            adata,
            name,
            func,
            pt=pt
        )

    def _assign_cluster_edges(self):
        """Set `self.obs` and `self.cluster_edges` from `self.name`.

        This used to be a method called `cluster_edges` that overwrote itself
        with the list it assigned; for an unknown name nothing was assigned
        and `self.cluster_edges` silently remained a bound method.
        """
        try:
            obs_key, edges = self._CLUSTER_CONFIG[self.name]
        except KeyError:
            known = ', '.join(sorted(self._CLUSTER_CONFIG))
            raise ValueError(
                f"No cluster configuration for dataset {self.name!r}; "
                f"known datasets: {known}"
            ) from None
        self.obs = obs_key
        # Copy so that instances never share (and cannot mutate) the table.
        self.cluster_edges = list(edges)

In [8]:
def _rowwise_cosine(A, B):
    """Cosine similarity between matching rows of two equally shaped 2-D arrays.

    Rows with zero norm are treated as having norm 1, so their similarity is
    0 rather than NaN.
    """
    A = np.asarray(A)
    B = np.asarray(B)
    norm_a = np.linalg.norm(A, axis=1)
    norm_b = np.linalg.norm(B, axis=1)
    norm_a[norm_a == 0] = 1.0
    norm_b[norm_b == 0] = 1.0
    return (A * B).sum(axis=1) / (norm_a * norm_b)


def baseline_scores(
    adata
):
    """Per-cell cosine similarity between the estimated and baseline velocities.

    Compares `obsm['velocity_pca']` row by row with `obsm['cellrank_baseline']`
    and with `obsm['pseudotime_baseline']`.

    Only matching rows are compared. The previous implementation computed the
    full cells x cells similarity matrix and kept just its diagonal, which
    needs memory quadratic in the number of cells; the values returned here
    are the same up to floating-point rounding.

    Parameters
    ----------
    adata
        Object exposing the three `.obsm` entries named above, all with the
        same shape.

    Returns
    -------
    (cr_scores, pt_scores)
        Two 1-D arrays with one score per cell.
    """
    X = adata.obsm['velocity_pca']
    Y1 = adata.obsm['cellrank_baseline']
    Y2 = adata.obsm['pseudotime_baseline']
    cr_scores = _rowwise_cosine(X, Y1)
    pt_scores = _rowwise_cosine(X, Y2)
    return cr_scores, pt_scores

def run_tests(bm):
    """Evaluate all benchmark metrics for one prepared dataset.

    Parameters
    ----------
    bm
        Object exposing `adata`, `obs` (cluster column name) and
        `cluster_edges`, e.g. a `BenchMarkingData` instance.

    Returns
    -------
    (cbd, icc, crs, pts)
        Element 1 of the `cross_boundary_correctness` result, element 1 of
        the `inner_cluster_coh` result, and the two arrays returned by
        `baseline_scores`.
    """
    adata = bm.adata
    cluster_key = bm.obs

    cbd_output = cross_boundary_correctness(
        adata,
        k_cluster=cluster_key,
        k_velocity='velocity',
        x_emb='X_pca',
        cluster_edges=bm.cluster_edges,
    )
    cbd = cbd_output[1]

    icc_output = inner_cluster_coh(
        adata,
        k_cluster=cluster_key,
        k_velocity='velocity',
    )
    icc = icc_output[1]

    crs, pts = baseline_scores(adata)

    return cbd, icc, crs, pts

def perform_benchmark(
    pipeline_name,
    velocity_pipeline, 
    output_folder,
    datasets=None,
):
    """Run a velocity pipeline on the benchmark datasets and save the scores.

    For every dataset the scores are written with `np.save` to
    `<output_folder>/<dataset>_<pipeline_name>_<METRIC>.npy`, where METRIC is
    CBD, ICC, CRS and, except for the 'maxi' dataset, PTS.

    Parameters
    ----------
    pipeline_name
        Label embedded in the output file names.
    velocity_pipeline
        Callable forwarded to `BenchMarkingData` as the velocity function.
    output_folder
        Directory for the .npy files; created if it does not exist.
    datasets
        Optional iterable of dataset names. Defaults to the six standard
        benchmark datasets.
    """
    import os

    if datasets is None:
        datasets = ['mini_V3', 'mini_MN', 'mini_MD',
                    'midi_NM', 'midi_Ne', 'maxi']

    # Create the output directory before the loop: otherwise a missing folder
    # is only discovered at the first np.save, after a full training run.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    for ds in tqdm(datasets):
        # 'maxi' has no pseudotime baseline, so its PTS score is not saved.
        has_pseudotime = ds != 'maxi'
        bm_data = BenchMarkingData(ds, velocity_pipeline, pt=has_pseudotime)
        print(ds)
        cbd, icc, crs, pts = run_tests(bm_data)

        results = {'CBD': cbd, 'ICC': icc, 'CRS': crs}
        if has_pseudotime:
            results['PTS'] = pts

        for metric, values in results.items():
            out_path = os.path.join(output_folder, f'{ds}_{pipeline_name}_{metric}.npy')
            np.save(out_path, values)
        

In [9]:
# Benchmark the velvetSDE pipeline that uses the raw (unsmoothed) layers and
# write the score arrays, tagged 'velvetSDE_RAW', to ../output_data/.
perform_benchmark(
    pipeline_name='velvetSDE_RAW',
    velocity_pipeline=velvetSDE_pipeline, 
    output_folder='../output_data/'
)

  0%|          | 0/6 [00:00<?, ?it/s]

Using ScanPy methods to calculate distances, indices, connectivities, transitions, ... done! 
KNN indices for Velvet stored in .obsm['knn_index'].
Dense similarity transition matrix for Velvet stored in .obsm['Ts'].
[34mINFO    [0m Generating sequential column names                                                                        
[34mINFO    [0m Generating sequential column names                                                                        


Multiprocessing is handled by SLURM.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 1000/1000: 100%|██████████| 1000/1000 [01:42<00:00,  9.77it/s, loss=1.59, v_num=1]  

`Trainer.fit` stopped: `max_epochs=1000` reached.


Epoch 1000/1000: 100%|██████████| 1000/1000 [01:42<00:00,  9.74it/s, loss=1.59, v_num=1]
KNN indices for Velvet stored in .obsm['knn_index'].


Multiprocessing is handled by SLURM.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 250/250: 100%|██████████| 250/250 [00:55<00:00,  4.55it/s, loss=168, v_num=1]

`Trainer.fit` stopped: `max_epochs=250` reached.


Epoch 250/250: 100%|██████████| 250/250 [00:55<00:00,  4.53it/s, loss=168, v_num=1]
computing neighbors
    finished (0:00:08) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
mini_V3


 17%|█▋        | 1/6 [03:15<16:18, 195.62s/it]

Using ScanPy methods to calculate distances, indices, connectivities, transitions, ... done! 
KNN indices for Velvet stored in .obsm['knn_index'].
Dense similarity transition matrix for Velvet stored in .obsm['Ts'].
[34mINFO    [0m Generating sequential column names                                                                        
[34mINFO    [0m Generating sequential column names                                                                        


Multiprocessing is handled by SLURM.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 1000/1000: 100%|██████████| 1000/1000 [02:26<00:00,  6.64it/s, loss=1.69, v_num=1]  

`Trainer.fit` stopped: `max_epochs=1000` reached.


Epoch 1000/1000: 100%|██████████| 1000/1000 [02:26<00:00,  6.85it/s, loss=1.69, v_num=1]
KNN indices for Velvet stored in .obsm['knn_index'].


Multiprocessing is handled by SLURM.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 250/250: 100%|██████████| 250/250 [01:06<00:00,  3.75it/s, loss=228, v_num=1]

`Trainer.fit` stopped: `max_epochs=250` reached.


Epoch 250/250: 100%|██████████| 250/250 [01:06<00:00,  3.76it/s, loss=228, v_num=1]
computing neighbors
    finished (0:00:17) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
mini_MN


 33%|███▎      | 2/6 [07:46<16:00, 240.09s/it]

Using ScanPy methods to calculate distances, indices, connectivities, transitions, ... done! 
KNN indices for Velvet stored in .obsm['knn_index'].
Dense similarity transition matrix for Velvet stored in .obsm['Ts'].
[34mINFO    [0m Generating sequential column names                                                                        
[34mINFO    [0m Generating sequential column names                                                                        


Multiprocessing is handled by SLURM.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 1000/1000: 100%|██████████| 1000/1000 [01:56<00:00,  8.32it/s, loss=1.85, v_num=1]  

`Trainer.fit` stopped: `max_epochs=1000` reached.


Epoch 1000/1000: 100%|██████████| 1000/1000 [01:56<00:00,  8.56it/s, loss=1.85, v_num=1]
KNN indices for Velvet stored in .obsm['knn_index'].


Multiprocessing is handled by SLURM.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 250/250: 100%|██████████| 250/250 [01:04<00:00,  3.85it/s, loss=101, v_num=1] 

`Trainer.fit` stopped: `max_epochs=250` reached.


Epoch 250/250: 100%|██████████| 250/250 [01:04<00:00,  3.87it/s, loss=101, v_num=1]
computing neighbors
    finished (0:00:01) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
mini_MD


 50%|█████     | 3/6 [11:28<11:35, 231.80s/it]

Using ScanPy methods to calculate distances, indices, connectivities, transitions, ... done! 
KNN indices for Velvet stored in .obsm['knn_index'].
Dense similarity transition matrix for Velvet stored in .obsm['Ts'].
[34mINFO    [0m Generating sequential column names                                                                        
[34mINFO    [0m Generating sequential column names                                                                        


Multiprocessing is handled by SLURM.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 1000/1000: 100%|██████████| 1000/1000 [03:56<00:00,  4.15it/s, loss=1.96, v_num=1]  

`Trainer.fit` stopped: `max_epochs=1000` reached.


Epoch 1000/1000: 100%|██████████| 1000/1000 [03:56<00:00,  4.24it/s, loss=1.96, v_num=1]
KNN indices for Velvet stored in .obsm['knn_index'].


Multiprocessing is handled by SLURM.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 250/250: 100%|██████████| 250/250 [01:29<00:00,  2.81it/s, loss=117, v_num=1] 

`Trainer.fit` stopped: `max_epochs=250` reached.


Epoch 250/250: 100%|██████████| 250/250 [01:29<00:00,  2.80it/s, loss=117, v_num=1]
computing neighbors
    finished (0:00:03) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
midi_NM


 67%|██████▋   | 4/6 [18:26<10:10, 305.38s/it]

Using ScanPy methods to calculate distances, indices, connectivities, transitions, ... done! 
KNN indices for Velvet stored in .obsm['knn_index'].
Dense similarity transition matrix for Velvet stored in .obsm['Ts'].
[34mINFO    [0m Generating sequential column names                                                                        
[34mINFO    [0m Generating sequential column names                                                                        


Multiprocessing is handled by SLURM.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 1000/1000: 100%|██████████| 1000/1000 [04:49<00:00,  3.33it/s, loss=1.78, v_num=1]  

`Trainer.fit` stopped: `max_epochs=1000` reached.


Epoch 1000/1000: 100%|██████████| 1000/1000 [04:49<00:00,  3.45it/s, loss=1.78, v_num=1]
KNN indices for Velvet stored in .obsm['knn_index'].


Multiprocessing is handled by SLURM.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 250/250: 100%|██████████| 250/250 [01:38<00:00,  2.53it/s, loss=118, v_num=1]

`Trainer.fit` stopped: `max_epochs=250` reached.


Epoch 250/250: 100%|██████████| 250/250 [01:38<00:00,  2.54it/s, loss=118, v_num=1]
computing neighbors
    finished (0:00:03) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
midi_Ne


 83%|████████▎ | 5/6 [26:41<06:13, 373.51s/it]

Using ScanPy methods to calculate distances, indices, connectivities, transitions, ... done! 
KNN indices for Velvet stored in .obsm['knn_index'].
Dense similarity transition matrix for Velvet stored in .obsm['Ts'].
[34mINFO    [0m Generating sequential column names                                                                        
[34mINFO    [0m Generating sequential column names                                                                        


Multiprocessing is handled by SLURM.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 1000/1000: 100%|██████████| 1000/1000 [10:56<00:00,  1.48it/s, loss=1.88, v_num=1]  

`Trainer.fit` stopped: `max_epochs=1000` reached.


Epoch 1000/1000: 100%|██████████| 1000/1000 [10:56<00:00,  1.52it/s, loss=1.88, v_num=1]
KNN indices for Velvet stored in .obsm['knn_index'].


Multiprocessing is handled by SLURM.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 250/250: 100%|██████████| 250/250 [02:58<00:00,  1.37it/s, loss=102, v_num=1] 

`Trainer.fit` stopped: `max_epochs=250` reached.


Epoch 250/250: 100%|██████████| 250/250 [02:58<00:00,  1.40it/s, loss=102, v_num=1]
computing neighbors
    finished (0:00:08) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
maxi


100%|██████████| 6/6 [47:14<00:00, 472.47s/it]
