In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
import scBiMapping
import anndata as ad
import scanpy as sc
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the human cancer cell line ATAC dataset.
# `ad.read_h5ad` is the documented reader; the bare `ad.read` alias is deprecated.
adata = ad.read_h5ad('/data/work/test_data/Human_Cancer_cell_lines_ATAC.h5ad')
adata.var_names_make_unique()

# Shuffle the cells with a fixed seed so the reference/query split is reproducible.
# The global NumPy seed is kept (rather than a local generator) so the permutation
# is the same as before; it may also affect later code that draws from NumPy's
# global RNG.
np.random.seed(seed=1)
# Named `cell_order` instead of `id` so the builtin `id()` is not shadowed
# for the rest of the kernel session.
cell_order = np.arange(adata.n_obs)
np.random.shuffle(cell_order)
adata = adata[cell_order, :]

# The first half (rounded up) becomes the reference set, the remainder the query set.
half_length = (adata.n_obs + 1) // 2
adata_ref = adata[:half_length, :]
adata_query = adata[half_length:, :]
print(adata_ref)
print(adata_query)

View of AnnData object with n_obs × n_vars = 27038 × 606219
    obs: 'n_fragment', 'frac_dup', 'frac_mito', 'doublet_probability', 'doublet_score', 'cell_annotation', 'batch'
View of AnnData object with n_obs × n_vars = 27037 × 606219
    obs: 'n_fragment', 'frac_dup', 'frac_mito', 'doublet_probability', 'doublet_score', 'cell_annotation', 'batch'


## Case 1: knnMethod = 'HNSW'

In [3]:
%%time
n_embedding = 30; # number of embeddings
K = 30; # this parameter will heavily influence the speed. Don't be too large
K_majority = 5; # parameter for marjority voting
normalization = True; 
knnMethod = 'HNSW'; # 'HNSW','NNDescent'
metric = 'euclidean'
reduction_method_on_cells_only = 'BiMapping'; # 'BiMapping','SnapATAC2','None','minHash'

CellType_Key_for_ref = 'cell_annotation' # this setting is Necessary, which denotes the cell type key of the reference dataset
scBiMapping.scBiMapping_annotation(adata_ref, adata_query,n_embedding = n_embedding,normalization = normalization,K = K,K_majority = K_majority,knnMethod = knnMethod,reduction_method_on_cells_only = reduction_method_on_cells_only,CellType_Key_for_ref =CellType_Key_for_ref)
print(adata_query.obs['cell_type_predicted'].head()) # predicted cell types for query cells

n_embedding:  30
normalization:  True
K:  30
knnMethod:  HNSW
for each cell, find K nearest genes in the co-embedded space......
for each cell, find K nearest genes in the co-embedded space......
knn_based_Sim_ref:
(27038, 606219)
knn_based_Sim_query: 
(27037, 606219)

Direct merge softmax-weighted coded reference and query dataset......
(54075, 606219)

 reduction on sparse cell-markerGenes softmax-weighted matrix, to get low-embedding of cells only, using BiMapping
v5...
(27038, 30)
(27037, 30)

K_majority = 5 (for majority voting)
find knn...
voting...
CL100169138_L02_BC2084_N02_13494            LoVo
CL100169139_L01_BC3288_N01_38386            Hap1
DP8400011418BR_L01_5_BC01297_N02_31537      A549
CL100169139_L02_BC0078_N01_32926          Caco-2
CL100169139_L02_BC0961_N02_32974           SNB75
Name: cell_type_predicted, dtype: object
CPU times: user 53min 7s, sys: 1min 20s, total: 54min 27s
Wall time: 17min 38s


In [4]:
# Evaluate: compare the annotated labels of the query cells with the predicted ones.
accuracy = accuracy_score(
    y_true=adata_query.obs['cell_annotation'],
    y_pred=adata_query.obs['cell_type_predicted'],
)
print(f'accuracy: {accuracy}')

accuracy: 0.5975515034952102


## Case 2: knnMethod = 'NNDescent'

In [5]:
%%time
n_embedding = 30; # number of embeddings
K = 30; # this parameter will heavily influence the speed. Don't be too large
K_majority = 5; # parameter for marjority voting
normalization = True; 
knnMethod = 'NNDescent'; # 'HNSW','NNDescent'
metric = 'euclidean'
reduction_method_on_cells_only = 'BiMapping'; # 'BiMapping','SnapATAC2','None','minHash'

CellType_Key_for_ref = 'cell_annotation' # this setting is Necessary, which denotes the cell type key of the reference dataset
scBiMapping.scBiMapping_annotation(adata_ref, adata_query,n_embedding = n_embedding,normalization = normalization,K = K,K_majority = K_majority,knnMethod = knnMethod,reduction_method_on_cells_only = reduction_method_on_cells_only,CellType_Key_for_ref =CellType_Key_for_ref)
print(adata_query.obs['cell_type_predicted'].head()) # predicted cell types for query cells

n_embedding:  30
normalization:  True
K:  30
knnMethod:  NNDescent
for each cell, find K nearest genes in the co-embedded space......
for each cell, find K nearest genes in the co-embedded space......
knn_based_Sim_ref:
(27038, 606219)
knn_based_Sim_query: 
(27037, 606219)

Direct merge softmax-weighted coded reference and query dataset......
(54075, 606219)

 reduction on sparse cell-markerGenes softmax-weighted matrix, to get low-embedding of cells only, using BiMapping
v5...
(27038, 30)
(27037, 30)

K_majority = 5 (for majority voting)
find knn...
voting...
CL100169138_L02_BC2084_N02_13494           LoVo
CL100169139_L01_BC3288_N01_38386           Hap1
DP8400011418BR_L01_5_BC01297_N02_31537     A549
CL100169139_L02_BC0078_N01_32926          786-O
CL100169139_L02_BC0961_N02_32974           HK-2
Name: cell_type_predicted, dtype: object
CPU times: user 1h 58min 40s, sys: 10min 22s, total: 2h 9min 2s
Wall time: 24min 46s


In [6]:
# Evaluate: compare the annotated labels of the query cells with the predicted ones.
accuracy = accuracy_score(
    y_true=adata_query.obs['cell_annotation'],
    y_pred=adata_query.obs['cell_type_predicted'],
)
print(f'accuracy: {accuracy}')

accuracy: 0.9501793838073751
