In [1]:
import velvet as vt

# general packages
import numpy as np
import pandas as pd
import torch
from scipy.sparse import issparse

# velocity packages
import scanpy as sc
import scvelo as scv
import anndata as ann

# plotting packages
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm, trange
from IPython.display import clear_output

# color palette object
from colors import colorpalette as colpal

# we use UniTVelo's evaluation metrics, re-implemented in the local `eval_functions` module
# originally from https://github.com/StatBiomed/UniTVelo/blob/main/unitvelo/eval_utils.py
# paper: https://www.nature.com/articles/s41467-022-34188-7
# authors: Mingze Gao, Chen Qiao & Yuanhua Huang 

from eval_functions import unitvelo_cross_boundary_correctness as cross_boundary_correctness
from  eval_functions import unitvelo_inner_cluster_coh as inner_cluster_coh

from sklearn.metrics.pairwise import cosine_similarity

# the object that will contain the data and data-specific parameters for benchmarking

Global seed set to 0
  new_rank_zero_deprecation(
  return new_rank_zero_deprecation(*args, **kwargs)


In [23]:
def gene_specific_benchmark(output_folder, pipeline, pipeline_name):
    """Score how often a pipeline's velocity has the expected sign for chosen genes.

    For every dataset in the table below, the file
    ``../../data/benchmarking/<name>.h5ad`` is read, ``pipeline(adata, name)``
    is called and its result is stored as the ``'velocity'`` layer. Each
    expectation ``[obs column, obs value, gene, sign]`` then selects the cells
    whose ``obs`` column equals the value and records the fraction of those
    cells whose velocity for the gene is strictly positive (``'+'``) or
    strictly negative (``'-'``).

    Parameters
    ----------
    output_folder : str
        Directory that receives ``<pipeline_name>_gene_specific_scores.npy``.
    pipeline : callable
        Called as ``pipeline(adata, dataset_name)``; the return value is
        assigned to ``adata.layers['velocity']``.
    pipeline_name : str
        Prefix of the saved file name.

    Returns
    -------
    numpy.ndarray
        One score per expectation, in table order (the same array that is
        saved). An expectation that matches no cells scores ``nan``.

    Raises
    ------
    ValueError
        If an expectation's sign is neither ``'+'`` nor ``'-'``.
    """
    # dataset name -> list of [obs column, obs value, gene, expected velocity sign]
    gene_results = {
        'mini_MN':[
            ['leiden','4','Olig2','-'],
            ['leiden','4','Tubb3','+'],
            ['leiden','2','Neurog2','+'],
            ['leiden','3','Isl2','+'],
        ],
        'mini_V3':[
            ['leiden','1','Sim1','+'],
            ['leiden','1','Sox2','-'],
            ['leiden','3','Tubb3','+'],
            ['leiden','1','Map2','+'],
        ],
        'mini_MD':[
            ['leiden','3','Sox2','-'],
            ['leiden','3','Nkx1-2','-'],
            ['leiden','3','T','-'],
            ['leiden','2','Meox1','+'],
        ],
        'midi_NM':[
            ['cell_annotation','Neural','Olig2','+'],
            ['cell_annotation','Neural','T','-'],
            ['cell_annotation','Mesoderm','Meox1','+'],
            ['cell_annotation','Early_Neural','Irx3','+'],
        ],
        'midi_Ne':[
            ['cell_annotation','Neural','Olig2','+'],
            ['cell_annotation','FP','Shh','+'],
            # NOTE(review): the consistency benchmarks in this notebook spell this
            # annotation 'p3' -- confirm which spelling the data actually uses.
            ['cell_annotation','P3','Nkx2-2','+'],
            ['cell_annotation','pMN','Irx3','-'],
        ]
    }

    scores = []
    for name, settings in gene_results.items():
        print(f"GENE SCORE: {name}")
        adata = sc.read_h5ad(f'../../data/benchmarking/{name}.h5ad')
        adata.layers['velocity'] = pipeline(adata, name)

        for obs_key, obs_value, gene, direction in settings:
            # Fail loudly instead of silently reusing the previous score.
            if direction not in ('+', '-'):
                raise ValueError(
                    f"unknown direction {direction!r} for {gene} in {name}; expected '+' or '-'"
                )

            sub = adata[adata.obs[obs_key] == obs_value]
            vel = sub[:, gene].layers['velocity']
            # Sparse layers have no .flatten(); matrix layers flatten to 2-D.
            if issparse(vel):
                vel = vel.toarray()
            vel = np.asarray(vel).ravel()

            if vel.size == 0:
                # Same value np.mean would give, but reported instead of hidden.
                print(f"WARNING: no cells with {obs_key} == {obs_value!r} in {name}; score is nan")
                scores.append(np.nan)
                continue

            scores.append(np.mean(vel > 0) if direction == '+' else np.mean(vel < 0))

    scores = np.array(scores)
    np.save(f'{output_folder}/{pipeline_name}_gene_specific_scores.npy', scores)
    return scores

def consistency_benchmark(output_folder, pipeline, pipeline_name):
    """Correlate per-cell velocities of the same cells across two datasets.

    For each cell type in the table below, both listed datasets are read from
    ``../../data/benchmarking/<dataset>.h5ad`` and ``pipeline`` is run on each
    to fill the ``'velocity'`` layer. Cells annotated with that cell type
    (``obs.cell_annotation``) and present in both datasets are kept, and for
    each listed gene the Pearson correlation between the two datasets'
    velocities over those cells is recorded.

    Parameters
    ----------
    output_folder : str
        Currently unused: this variant returns the scores instead of saving them.
    pipeline : callable
        Called as ``pipeline(adata, pipeline_name)``; the return value is
        assigned to ``adata.layers['velocity']``.
    pipeline_name : str
        Forwarded to ``pipeline`` as its second argument.

    Returns
    -------
    numpy.ndarray
        One correlation per (cell type, gene), in table order. Entries are
        ``nan`` when either velocity vector is constant.

    Notes
    -----
    A gene missing from either dataset is not handled here; the lookup error
    propagates to the caller.
    """
    # cell type -> [[first dataset, second dataset], [genes to compare]]
    consistency_results = {
        'Neural':[
            ['midi_NM','midi_Ne'],
            ['Olig2','Irx3','Sema3e','Nkx1-2']
        ],
        'pMN':[
            ['mini_MN','midi_Ne'],
            ['Olig2','Neurog2','Mnx1','Isl2']
        ],
        'MN':[
            ['mini_MN','midi_Ne'],
            ['Tubb3','Neurog2','Map2','Olig2']
        ],
        'V3':[
            ['mini_V3','midi_Ne'],
            ['Tubb3','Sim1','Map2','Stmn2']
        ],
        'p3':[
            ['mini_V3','midi_Ne'],
            ['Sim1','Nfia','Sox9','Nfib']
        ],
        'Mesoderm':[
            ['mini_MD','midi_NM'],
            ['Meox1','T','Rspo3','Cyp26a1']
        ],
        'NMP':[
            ['midi_NM','maxi'],
            ['Rspo3','T','Sema3e','Fgf8']
        ],
        'FP':[
            ['midi_Ne','maxi'],
            ['Shh','Arx','Olig2','Foxa2']
        ],
        'Early_Neural':[
            ['midi_NM','maxi'],
            ['Nkx1-2','Irx3','Sema3e','Olig2']
        ]
    }
    scores = []
    for cell, (datasets, genes) in consistency_results.items():
        first_name, second_name = datasets
        print(f"CONSISTENCY: {cell}")
        fir = sc.read_h5ad(f'../../data/benchmarking/{first_name}.h5ad')
        sec = sc.read_h5ad(f'../../data/benchmarking/{second_name}.h5ad')

        # NOTE(review): the gene-specific benchmarks pass the dataset name as the
        # second pipeline argument; here it is pipeline_name -- confirm intent.
        fir.layers['velocity'] = pipeline(fir, pipeline_name)
        sec.layers['velocity'] = pipeline(sec, pipeline_name)

        fir_sub = fir[fir.obs.cell_annotation == cell]
        sec_sub = sec[sec.obs.cell_annotation == cell]
        # Sorted so the cell order (and float rounding) is the same on every run.
        shared_cells = sorted(set(fir_sub.obs_names).intersection(sec_sub.obs_names))
        fir_sub = fir_sub[shared_cells]
        sec_sub = sec_sub[shared_cells]

        for gene in genes:
            firvel = np.array(fir_sub[:, gene].layers['velocity']).flatten()
            secvel = np.array(sec_sub[:, gene].layers['velocity']).flatten()

            score = np.corrcoef(firvel, secvel)[0, 1]
            # The original message lacked the f-prefix and printed the raw
            # placeholders; it now also names the single gene being scored.
            print(f"Score for ({cell}, {gene}, {first_name},{second_name})")
            print(score)
            scores.append(score)

    scores = np.array(scores)
    return scores

In [2]:
def gene_specific_benchmark_splicing(output_folder, pipeline, pipeline_name):
    """Gene-sign benchmark on the ``splicing_*`` datasets.

    For every dataset in the table below, the file
    ``../../data/benchmarking/<name>.h5ad`` is read, ``pipeline(adata, name)``
    is called and its result is stored as the ``'velocity'`` layer. Each
    expectation ``[obs column, obs value, gene, sign]`` then selects the cells
    whose ``obs`` column equals the value and records the fraction of those
    cells whose velocity for the gene is strictly positive (``'+'``) or
    strictly negative (``'-'``).

    Parameters
    ----------
    output_folder : str
        Directory that receives ``<pipeline_name>_gene_specific_scores.npy``.
    pipeline : callable
        Called as ``pipeline(adata, dataset_name)``; the return value is
        assigned to ``adata.layers['velocity']``.
    pipeline_name : str
        Prefix of the saved file name.

    Returns
    -------
    numpy.ndarray
        One score per expectation, in table order (the same array that is
        saved). An expectation that matches no cells scores ``nan``.

    Raises
    ------
    ValueError
        If an expectation's sign is neither ``'+'`` nor ``'-'``.

    Notes
    -----
    The output file name has no "splicing" marker, so it matches the pattern
    written by ``gene_specific_benchmark``; use a distinct ``pipeline_name`` or
    ``output_folder`` to keep the two results apart.
    """
    # dataset name -> list of [obs column, obs value, gene, expected velocity sign]
    gene_results = {
        'splicing_mini_MN':[
            ['leiden','4','Olig2','-'],
            ['leiden','4','Tubb3','+'],
            ['leiden','2','Neurog2','+'],
            ['leiden','3','Isl2','+'],
        ],
        'splicing_mini_V3':[
            ['leiden','1','Sim1','+'],
            ['leiden','1','Sox2','-'],
            ['leiden','3','Tubb3','+'],
            ['leiden','1','Map2','+'],
        ],
        'splicing_mini_MD':[
            ['leiden','3','Sox2','-'],
            ['leiden','3','Nkx1-2','-'],
            ['leiden','3','T','-'],
            ['leiden','2','Meox1','+'],
        ],
        'splicing_midi_NM':[
            ['cell_annotation','Neural','Olig2','+'],
            ['cell_annotation','Neural','T','-'],
            ['cell_annotation','Mesoderm','Meox1','+'],
            ['cell_annotation','Early_Neural','Irx3','+'],
        ],
        'splicing_midi_Ne':[
            ['cell_annotation','Neural','Olig2','+'],
            ['cell_annotation','FP','Shh','+'],
            # NOTE(review): the consistency benchmarks in this notebook spell this
            # annotation 'p3' -- confirm which spelling the data actually uses.
            ['cell_annotation','P3','Nkx2-2','+'],
            ['cell_annotation','pMN','Irx3','-'],
        ]
    }

    scores = []
    for name, settings in gene_results.items():
        print(f"GENE SCORE: {name}")
        adata = sc.read_h5ad(f'../../data/benchmarking/{name}.h5ad')
        adata.layers['velocity'] = pipeline(adata, name)

        for obs_key, obs_value, gene, direction in settings:
            # Fail loudly instead of silently reusing the previous score.
            if direction not in ('+', '-'):
                raise ValueError(
                    f"unknown direction {direction!r} for {gene} in {name}; expected '+' or '-'"
                )

            sub = adata[adata.obs[obs_key] == obs_value]
            vel = sub[:, gene].layers['velocity']
            # Sparse layers have no .flatten(); matrix layers flatten to 2-D.
            if issparse(vel):
                vel = vel.toarray()
            vel = np.asarray(vel).ravel()

            if vel.size == 0:
                # Same value np.mean would give, but reported instead of hidden.
                print(f"WARNING: no cells with {obs_key} == {obs_value!r} in {name}; score is nan")
                scores.append(np.nan)
                continue

            scores.append(np.mean(vel > 0) if direction == '+' else np.mean(vel < 0))

    scores = np.array(scores)
    np.save(f'{output_folder}/{pipeline_name}_gene_specific_scores.npy', scores)
    return scores

In [30]:
def consistency_benchmark_splicing(output_folder, pipeline, pipeline_name):
    """Cross-dataset velocity consistency on the ``splicing_*`` datasets.

    For each cell type in the table below, both listed datasets are read from
    ``../../data/benchmarking/<dataset>.h5ad`` and ``pipeline`` is run on each
    to fill the ``'velocity'`` layer. Cells annotated with that cell type
    (``obs.cell_annotation``) and present in both datasets are kept, and for
    each listed gene the Pearson correlation between the two datasets'
    velocities over those cells is recorded.

    Parameters
    ----------
    output_folder : str
        Directory that receives ``<pipeline_name>_consistency_scores.npy``.
    pipeline : callable
        Called as ``pipeline(adata, pipeline_name)``; the return value is
        assigned to ``adata.layers['velocity']``.
    pipeline_name : str
        Forwarded to ``pipeline`` and used as the saved file's prefix.

    Returns
    -------
    numpy.ndarray
        One correlation per (cell type, gene), in table order, with ``nan``
        replaced by 0 (the same array that is saved). The function previously
        returned ``None`` although the notebook binds its result to ``scores``.

    Notes
    -----
    A gene missing from either dataset is not handled here; the lookup error
    propagates to the caller.
    """
    # cell type -> [[first dataset, second dataset], [genes to compare]]
    consistency_results = {
        'Neural':[
            ['splicing_midi_NM','splicing_midi_Ne'],
            ['Olig2','Irx3','Sema3e','Nkx1-2']
        ],
        'pMN':[
            ['splicing_mini_MN','splicing_midi_Ne'],
            ['Olig2','Neurog2','Mnx1','Isl2']
        ],
        'MN':[
            ['splicing_mini_MN','splicing_midi_Ne'],
            ['Tubb3','Neurog2','Map2','Olig2']
        ],
        'V3':[
            ['splicing_mini_V3','splicing_midi_Ne'],
            ['Tubb3','Sim1','Map2','Stmn2']
        ],
        'p3':[
            ['splicing_mini_V3','splicing_midi_Ne'],
            ['Sim1','Nfia','Sox9','Nfib']
        ],
        'Mesoderm':[
            ['splicing_mini_MD','splicing_midi_NM'],
            ['Meox1','T','Rspo3','Cyp26a1']
        ],
        'NMP':[
            ['splicing_midi_NM','splicing_maxi'],
            ['Rspo3','T','Sema3e','Fgf8']
        ],
        'FP':[
            ['splicing_midi_Ne','splicing_maxi'],
            ['Shh','Arx','Olig2','Foxa2']
        ],
        'Early_Neural':[
            ['splicing_midi_NM','splicing_maxi'],
            ['Nkx1-2','Irx3','Sema3e','Olig2']
        ]
    }
    scores = []
    for cell, (datasets, genes) in consistency_results.items():
        first_name, second_name = datasets
        print(f"CONSISTENCY: {cell}")
        fir = sc.read_h5ad(f'../../data/benchmarking/{first_name}.h5ad')
        sec = sc.read_h5ad(f'../../data/benchmarking/{second_name}.h5ad')

        # NOTE(review): the gene-specific benchmarks pass the dataset name as the
        # second pipeline argument; here it is pipeline_name -- confirm intent.
        fir.layers['velocity'] = pipeline(fir, pipeline_name)
        sec.layers['velocity'] = pipeline(sec, pipeline_name)

        fir_sub = fir[fir.obs.cell_annotation == cell]
        sec_sub = sec[sec.obs.cell_annotation == cell]
        # Sorted so the cell order (and float rounding) is the same on every run.
        shared_cells = sorted(set(fir_sub.obs_names).intersection(sec_sub.obs_names))
        fir_sub = fir_sub[shared_cells]
        sec_sub = sec_sub[shared_cells]

        for gene in genes:
            firvel = np.array(fir_sub[:, gene].layers['velocity']).flatten()
            secvel = np.array(sec_sub[:, gene].layers['velocity']).flatten()

            score = np.corrcoef(firvel, secvel)[0, 1]
            print('score: ', score)
            scores.append(score)

    # corrcoef yields nan for constant velocity vectors; those are stored as 0.
    scores = np.nan_to_num(scores)
    np.save(f'{output_folder}/{pipeline_name}_consistency_scores.npy', scores)
    return scores

In [31]:
# Benchmark configuration: which pipeline to score and on which data flavour.
PIPELINE_NAME = 'scvelo'
# NOTE(review): `scvelo_pipeline` is not defined in any cell visible here; it is
# assumed to come from another cell -- confirm it exists on a fresh kernel.
pipeline = scvelo_pipeline
data = 'splicing'

# Pick the benchmark functions matching the data flavour.
if data == 'splicing':
    genes_func = gene_specific_benchmark_splicing
    csist_func = consistency_benchmark_splicing
elif data == 'labelling':
    genes_func = gene_specific_benchmark
    csist_func = consistency_benchmark
else:
    # Without this, a typo would leave genes_func/csist_func unset or stale
    # from an earlier run of this cell.
    raise ValueError(f"unknown data flavour {data!r}; expected 'splicing' or 'labelling'")

In [32]:
# Run the selected consistency benchmark. `scores` is whatever that function
# returns (None if the selected function only saves its results to disk).
scores = csist_func(
    output_folder='../../output_data/benchmarking_scores', 
    pipeline=pipeline, 
    pipeline_name=f'{PIPELINE_NAME}_CON'
)


CONSISTENCY: Neural
computing neighbors
    finished (0:00:03) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:01) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
computing velocities
    finished (0:00:03) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
computing neighbors
    finished (0:00:04) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:02) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
computing velocities
    finished (0:00:06) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
score:  0.5715172423553837
score:  0.4456480165395385
score:  0.7991163601734586
score:  nan
CONSISTENCY: pMN
computing neighbors
    finished (0:00:01) --> added 
    'dis

KeyboardInterrupt: 

In [33]:
# Scratch check: np.nan_to_num accepts a plain Python list and returns an ndarray.
np.nan_to_num([1,2,34])

array([ 1,  2, 34])

In [None]:
def consistency_benchmark(output_folder, pipeline, pipeline_name):
    """Correlate per-cell velocities of the same cells across two datasets.

    For each cell type in the table below, both listed datasets are read from
    ``../../data/benchmarking/<dataset>.h5ad`` and ``pipeline`` is run on each
    to fill the ``'velocity'`` layer. Cells annotated with that cell type
    (``obs.cell_annotation``) and present in both datasets are kept, and for
    each listed gene the Pearson correlation between the two datasets'
    velocities over those cells is recorded. A gene that cannot be looked up
    in either dataset is reported and skipped.

    NOTE(review): an earlier cell of this notebook defines a function with the
    same name; whichever cell ran last is the one in effect.

    Parameters
    ----------
    output_folder : str
        Directory that receives ``<pipeline_name>_consistency_scores.npy``.
    pipeline : callable
        Called as ``pipeline(adata, pipeline_name)``; the return value is
        assigned to ``adata.layers['velocity']``.
    pipeline_name : str
        Forwarded to ``pipeline`` and used as the saved file's prefix.

    Returns
    -------
    numpy.ndarray
        One correlation per scored (cell type, gene), in table order (the same
        array that is saved). Entries may be ``nan`` for constant vectors.
    """
    # cell type -> [[first dataset, second dataset], [genes to compare]]
    consistency_results = {
        'Neural':[
            ['midi_NM','midi_Ne'],
            ['Olig2','Irx3','Sema3e','Nkx1-2']
        ],
        'pMN':[
            ['mini_MN','midi_Ne'],
            ['Olig2','Neurog2','Mnx1','Isl2']
        ],
        'MN':[
            ['mini_MN','midi_Ne'],
            ['Tubb3','Neurog2','Map2','Olig2']
        ],
        'V3':[
            ['mini_V3','midi_Ne'],
            ['Tubb3','Sim1','Map2','Stmn2']
        ],
        'p3':[
            ['mini_V3','midi_Ne'],
            ['Sim1','Nfia','Sox9','Nfib']
        ],
        'Mesoderm':[
            ['mini_MD','midi_NM'],
            ['Meox1','T','Rspo3','Cyp26a1']
        ],
        'NMP':[
            ['midi_NM','maxi'],
            ['Rspo3','T','Sema3e','Fgf8']
        ],
        'FP':[
            ['midi_Ne','maxi'],
            ['Shh','Arx','Olig2','Foxa2']
        ],
        'Early_Neural':[
            ['midi_NM','maxi'],
            ['Nkx1-2','Irx3','Sema3e','Olig2']
        ]
    }
    scores = []
    for cell, (datasets, genes) in consistency_results.items():
        first_name, second_name = datasets
        print(f"CONSISTENCY: {cell}")
        fir = sc.read_h5ad(f'../../data/benchmarking/{first_name}.h5ad')
        sec = sc.read_h5ad(f'../../data/benchmarking/{second_name}.h5ad')

        # NOTE(review): the gene-specific benchmarks pass the dataset name as the
        # second pipeline argument; here it is pipeline_name -- confirm intent.
        fir.layers['velocity'] = pipeline(fir, pipeline_name)
        sec.layers['velocity'] = pipeline(sec, pipeline_name)

        fir_sub = fir[fir.obs.cell_annotation == cell]
        sec_sub = sec[sec.obs.cell_annotation == cell]
        # Sorted so the cell order (and float rounding) is the same on every run.
        shared_cells = sorted(set(fir_sub.obs_names).intersection(sec_sub.obs_names))
        fir_sub = fir_sub[shared_cells]
        sec_sub = sec_sub[shared_cells]

        for gene in genes:
            velocities = []
            # Both datasets are always tried so every missing gene is reported.
            for subset, dataset_name in ((fir_sub, first_name), (sec_sub, second_name)):
                try:
                    velocities.append(np.array(subset[:, gene].layers['velocity']).flatten())
                except (KeyError, IndexError):
                    # Only failed lookups are skipped; a bare `except:` would
                    # also hide real bugs and KeyboardInterrupt.
                    print(f"KEY ERROR NOTE: {gene} not found in {dataset_name}!")
            if len(velocities) == 2:
                scores.append(np.corrcoef(velocities[0], velocities[1])[0, 1])

    scores = np.array(scores)
    np.save(f'{output_folder}/{pipeline_name}_consistency_scores.npy', scores)
    return scores
    
def consistency_benchmark_splicing(output_folder, pipeline, pipeline_name):
    """Cross-dataset velocity consistency on the ``splicing_*`` datasets.

    For each cell type in the table below, both listed datasets are read from
    ``../../data/benchmarking/<dataset>.h5ad`` and ``pipeline`` is run on each
    to fill the ``'velocity'`` layer. Cells annotated with that cell type
    (``obs.cell_annotation``) and present in both datasets are kept, and for
    each listed gene the Pearson correlation between the two datasets'
    velocities over those cells is recorded. A gene that cannot be looked up
    in either dataset is reported and skipped.

    NOTE(review): an earlier cell of this notebook defines a function with the
    same name; whichever cell ran last is the one in effect.

    Parameters
    ----------
    output_folder : str
        Directory that receives ``<pipeline_name>_consistency_scores.npy``.
    pipeline : callable
        Called as ``pipeline(adata, pipeline_name)``; the return value is
        assigned to ``adata.layers['velocity']``.
    pipeline_name : str
        Forwarded to ``pipeline`` and used as the saved file's prefix.

    Returns
    -------
    numpy.ndarray
        One correlation per scored (cell type, gene), in table order (the same
        array that is saved). Entries may be ``nan`` for constant vectors.
    """
    # cell type -> [[first dataset, second dataset], [genes to compare]]
    consistency_results = {
        'Neural':[
            ['splicing_midi_NM','splicing_midi_Ne'],
            ['Olig2','Irx3','Sema3e','Nkx1-2']
        ],
        'pMN':[
            ['splicing_mini_MN','splicing_midi_Ne'],
            ['Olig2','Neurog2','Mnx1','Isl2']
        ],
        'MN':[
            ['splicing_mini_MN','splicing_midi_Ne'],
            ['Tubb3','Neurog2','Map2','Olig2']
        ],
        'V3':[
            ['splicing_mini_V3','splicing_midi_Ne'],
            ['Tubb3','Sim1','Map2','Stmn2']
        ],
        'p3':[
            ['splicing_mini_V3','splicing_midi_Ne'],
            ['Sim1','Nfia','Sox9','Nfib']
        ],
        'Mesoderm':[
            ['splicing_mini_MD','splicing_midi_NM'],
            ['Meox1','T','Rspo3','Cyp26a1']
        ],
        'NMP':[
            ['splicing_midi_NM','splicing_maxi'],
            ['Rspo3','T','Sema3e','Fgf8']
        ],
        'FP':[
            ['splicing_midi_Ne','splicing_maxi'],
            ['Shh','Arx','Olig2','Foxa2']
        ],
        'Early_Neural':[
            ['splicing_midi_NM','splicing_maxi'],
            ['Nkx1-2','Irx3','Sema3e','Olig2']
        ]
    }
    scores = []
    for cell, (datasets, genes) in consistency_results.items():
        first_name, second_name = datasets
        print(f"CONSISTENCY: {cell}")
        fir = sc.read_h5ad(f'../../data/benchmarking/{first_name}.h5ad')
        sec = sc.read_h5ad(f'../../data/benchmarking/{second_name}.h5ad')

        # NOTE(review): the gene-specific benchmarks pass the dataset name as the
        # second pipeline argument; here it is pipeline_name -- confirm intent.
        fir.layers['velocity'] = pipeline(fir, pipeline_name)
        sec.layers['velocity'] = pipeline(sec, pipeline_name)

        fir_sub = fir[fir.obs.cell_annotation == cell]
        sec_sub = sec[sec.obs.cell_annotation == cell]
        # Sorted so the cell order (and float rounding) is the same on every run.
        shared_cells = sorted(set(fir_sub.obs_names).intersection(sec_sub.obs_names))
        fir_sub = fir_sub[shared_cells]
        sec_sub = sec_sub[shared_cells]

        for gene in genes:
            velocities = []
            # Both datasets are always tried so every missing gene is reported.
            for subset, dataset_name in ((fir_sub, first_name), (sec_sub, second_name)):
                try:
                    velocities.append(np.array(subset[:, gene].layers['velocity']).flatten())
                except (KeyError, IndexError):
                    # Only failed lookups are skipped; a bare `except:` would
                    # also hide real bugs and KeyboardInterrupt.
                    print(f"KEY ERROR NOTE: {gene} not found in {dataset_name}!")
            if len(velocities) == 2:
                scores.append(np.corrcoef(velocities[0], velocities[1])[0, 1])

    scores = np.array(scores)
    np.save(f'{output_folder}/{pipeline_name}_consistency_scores.npy', scores)
    return scores