In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
import scBiMapping
import anndata as ad
import scanpy as sc
import numpy as np
import warnings
warnings.filterwarnings("ignore")

## reference dataset: scRNA

In [2]:
# Load the reference file and keep only the features tagged 'GEX'.
# `ad.read_h5ad` is used instead of the deprecated `ad.read` alias.
multiome1 = ad.read_h5ad('/data/work/test_data/NIPS-s1d1.h5ad')
# Column-subsetting yields a view of `multiome1`; copy it explicitly so the
# modification below acts on an independent object instead of relying on an implicit copy.
adata_ref = multiome1[:, multiome1.var['feature_types'] == 'GEX'].copy()
adata_ref.var_names_make_unique()
# Optional preprocessing, left disabled in this notebook:
# sc.pp.normalize_total(adata_ref, target_sum=1e4)
# sc.pp.log1p(adata_ref)
# sc.pp.highly_variable_genes(adata_ref, n_top_genes=4000,subset=True,flavor='cell_ranger') #
 

## query dataset: scRNA

In [3]:
# Load the query file and keep only the features tagged 'GEX'.
multiome2 = ad.read_h5ad('/data/work/test_data/NIPS-s2d1.h5ad')
# Column-subsetting yields a view of `multiome2`; copy it explicitly so later
# modifications (var-name de-duplication here, and the `cell_type_predicted` column
# read from `.obs` after annotation) act on an independent object.
adata_query = multiome2[:, multiome2.var['feature_types'] == 'GEX'].copy()
adata_query.var_names_make_unique()
# Optional preprocessing, left disabled in this notebook:
# sc.pp.normalize_total(adata_query, target_sum=1e4)
# sc.pp.log1p(adata_query)
# sc.pp.highly_variable_genes(adata_query, n_top_genes=4000,subset=True,flavor='cell_ranger') # 

## Case 1: knnMethod = 'HNSW'

In [4]:
%%time
# Settings for this annotation run.
n_embedding = 20  # number of embeddings
K = 50  # heavily influences the speed; do not make it too large
K_majority = 10  # parameter for majority voting
normalization = True
knnMethod = 'HNSW'
# NOTE(review): `metric` is assigned but not passed to the call below -- confirm whether it is needed.
metric = 'euclidean'
reduction_method_on_cells_only = 'BiMapping'  # 'BiMapping','SnapATAC2','None','minHash'

# Required setting: the cell type key of the reference dataset.
CellType_Key_for_ref = 'cell_type'

scBiMapping.scBiMapping_annotation(
    adata_ref,
    adata_query,
    n_embedding=n_embedding,
    normalization=normalization,
    K=K,
    K_majority=K_majority,
    knnMethod=knnMethod,
    reduction_method_on_cells_only=reduction_method_on_cells_only,
    CellType_Key_for_ref=CellType_Key_for_ref,
)
# Predicted cell types for the first few query cells.
print(adata_query.obs['cell_type_predicted'].head())

n_embedding:  20
normalization:  True
K:  50
knnMethod:  HNSW
for each cell, find K nearest genes in the co-embedded space......
for each cell, find K nearest genes in the co-embedded space......
knn_based_Sim_ref:
(6224, 13431)
knn_based_Sim_query: 
(4220, 13431)

Direct merge softmax-weighted coded reference and query dataset......
(10444, 13431)

 reduction on sparse cell-markerGenes softmax-weighted matrix, to get low-embedding of cells only, using BiMapping
v5...
(6224, 20)
(4220, 20)

K_majority = 10 (for majority voting)
find knn...
voting...
ACGTTACAGGCATTAC-4-s2d1       CD16+ Mono
GGTGATTTCGCTAGAT-4-s2d1     Erythroblast
ACAGGATCACTAAGAA-4-s2d1             cDC2
CGCTACTTCATCCACC-4-s2d1       CD14+ Mono
CTTTGGTGTGCTAGAC-4-s2d1    Naive CD20+ B
Name: cell_type_predicted, dtype: object
CPU times: user 34.6 s, sys: 2.05 s, total: 36.7 s
Wall time: 10.7 s


In [5]:
# Evaluate 
accuracy = accuracy_score(adata_query.obs['cell_type'], adata_query.obs['cell_type_predicted'])
print(f'accuracy: {accuracy}')

accuracy: 0.8627962085308057


## Case 2: knnMethod = 'NNDescent'

In [6]:
%%time
# Settings for this annotation run.
n_embedding = 20  # number of embeddings
K = 50  # heavily influences the speed; do not make it too large
K_majority = 10  # parameter for majority voting
normalization = True
knnMethod = 'NNDescent'
# NOTE(review): `metric` is assigned but not passed to the call below -- confirm whether it is needed.
metric = 'euclidean'
reduction_method_on_cells_only = 'BiMapping'  # 'BiMapping','SnapATAC2','None','minHash'

# Required setting: the cell type key of the reference dataset.
CellType_Key_for_ref = 'cell_type'

scBiMapping.scBiMapping_annotation(
    adata_ref,
    adata_query,
    n_embedding=n_embedding,
    normalization=normalization,
    K=K,
    K_majority=K_majority,
    knnMethod=knnMethod,
    reduction_method_on_cells_only=reduction_method_on_cells_only,
    CellType_Key_for_ref=CellType_Key_for_ref,
)
# Predicted cell types for the first few query cells.
print(adata_query.obs['cell_type_predicted'].head())

n_embedding:  20
normalization:  True
K:  50
knnMethod:  NNDescent
for each cell, find K nearest genes in the co-embedded space......
for each cell, find K nearest genes in the co-embedded space......
knn_based_Sim_ref:
(6224, 13431)
knn_based_Sim_query: 
(4220, 13431)

Direct merge softmax-weighted coded reference and query dataset......
(10444, 13431)

 reduction on sparse cell-markerGenes softmax-weighted matrix, to get low-embedding of cells only, using BiMapping
v5...
(6224, 20)
(4220, 20)

K_majority = 10 (for majority voting)
find knn...
voting...
ACGTTACAGGCATTAC-4-s2d1          CD16+ Mono
GGTGATTTCGCTAGAT-4-s2d1        Erythroblast
ACAGGATCACTAAGAA-4-s2d1    CD4+ T activated
CGCTACTTCATCCACC-4-s2d1          CD14+ Mono
CTTTGGTGTGCTAGAC-4-s2d1       Naive CD20+ B
Name: cell_type_predicted, dtype: object
CPU times: user 3min 55s, sys: 10.1 s, total: 4min 5s
Wall time: 1min 54s


In [7]:
# Evaluate 
accuracy = accuracy_score(adata_query.obs['cell_type'], adata_query.obs['cell_type_predicted'])
print(f'accuracy: {accuracy}')

accuracy: 0.8514218009478673
