In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
import scBiMapping
import anndata as ad
import scanpy as sc
import numpy as np
import warnings
warnings.filterwarnings("ignore")

## reference data: scATAC

In [2]:
# Load the first multiome dataset and keep only its ATAC features as the reference.
# `ad.read` is a deprecated alias of `ad.read_h5ad`; call the supported reader directly.
multiome1 = ad.read_h5ad('/data/work/test_data/NIPS-s1d1.h5ad')
# Boolean mask over `var['feature_types']` selects the ATAC columns (result is a view of multiome1).
adata_ref = multiome1[:, multiome1.var['feature_types'] == 'ATAC']

## query dataset: scATAC

In [3]:
# Load the second multiome dataset and keep only its ATAC features as the query.
# `ad.read` is a deprecated alias of `ad.read_h5ad`; call the supported reader directly.
multiome2 = ad.read_h5ad('/data/work/test_data/NIPS-s2d1.h5ad')
# Boolean mask over `var['feature_types']` selects the ATAC columns (result is a view of multiome2).
adata_query = multiome2[:, multiome2.var['feature_types'] == 'ATAC']

In [4]:
# Show the summary (dimensions and annotation keys) of the ATAC-only query subset.
print(adata_query)

View of AnnData object with n_obs × n_vars = 4220 × 116490
    obs: 'GEX_pct_counts_mt', 'GEX_n_counts', 'GEX_n_genes', 'GEX_size_factors', 'GEX_phase', 'ATAC_nCount_peaks', 'ATAC_atac_fragments', 'ATAC_reads_in_peaks_frac', 'ATAC_blacklist_fraction', 'ATAC_nucleosome_signal', 'cell_type', 'batch', 'ATAC_pseudotime_order', 'GEX_pseudotime_order', 'Samplename', 'Site', 'DonorNumber', 'Modality', 'VendorLot', 'DonorID', 'DonorAge', 'DonorBMI', 'DonorBloodType', 'DonorRace', 'Ethnicity', 'DonorGender', 'QCMeds', 'DonorSmoker'
    var: 'feature_types', 'gene_id'
    uns: 'ATAC_gene_activity_var_names', 'dataset_id', 'genome', 'organism'
    obsm: 'ATAC_gene_activity', 'ATAC_lsi_full', 'ATAC_lsi_red', 'ATAC_umap', 'GEX_X_pca', 'GEX_X_umap'
    layers: 'counts'


In [5]:
# Show the summary (dimensions and annotation keys) of the ATAC-only reference subset.
print(adata_ref)

View of AnnData object with n_obs × n_vars = 6224 × 116490
    obs: 'GEX_pct_counts_mt', 'GEX_n_counts', 'GEX_n_genes', 'GEX_size_factors', 'GEX_phase', 'ATAC_nCount_peaks', 'ATAC_atac_fragments', 'ATAC_reads_in_peaks_frac', 'ATAC_blacklist_fraction', 'ATAC_nucleosome_signal', 'cell_type', 'batch', 'ATAC_pseudotime_order', 'GEX_pseudotime_order', 'Samplename', 'Site', 'DonorNumber', 'Modality', 'VendorLot', 'DonorID', 'DonorAge', 'DonorBMI', 'DonorBloodType', 'DonorRace', 'Ethnicity', 'DonorGender', 'QCMeds', 'DonorSmoker'
    var: 'feature_types', 'gene_id'
    uns: 'ATAC_gene_activity_var_names', 'dataset_id', 'genome', 'organism'
    obsm: 'ATAC_gene_activity', 'ATAC_lsi_full', 'ATAC_lsi_red', 'ATAC_umap', 'GEX_X_pca', 'GEX_X_umap'
    layers: 'counts'


## Case 1: knnMethod = 'HNSW'

In [6]:
%%time
# Settings for this annotation run (kNN backend: HNSW).
n_embedding = 20  # number of embeddings
K = 50  # this parameter heavily influences the speed; don't make it too large
K_majority = 30  # parameter for majority voting
normalization = True
knnMethod = 'HNSW'
metric = 'euclidean'  # NOTE(review): assigned here but not passed to the call below
reduction_method_on_cells_only = 'BiMapping'  # 'BiMapping','SnapATAC2','None','minHash'

# Necessary setting: the cell type key of the reference dataset.
CellType_Key_for_ref = 'cell_type'

# Annotate the query cells against the reference.
scBiMapping.scBiMapping_annotation(
    adata_ref,
    adata_query,
    n_embedding=n_embedding,
    normalization=normalization,
    K=K,
    K_majority=K_majority,
    knnMethod=knnMethod,
    reduction_method_on_cells_only=reduction_method_on_cells_only,
    CellType_Key_for_ref=CellType_Key_for_ref,
)

# Predicted cell types for the first few query cells.
print(adata_query.obs['cell_type_predicted'].head())

n_embedding:  20
normalization:  True
K:  50
knnMethod:  HNSW
for each cell, find K nearest genes in the co-embedded space......
for each cell, find K nearest genes in the co-embedded space......
knn_based_Sim_ref:
(6224, 116490)
knn_based_Sim_query: 
(4220, 116490)

Direct merge softmax-weighted coded reference and query dataset......
(10444, 116490)

 reduction on sparse cell-markerGenes softmax-weighted matrix, to get low-embedding of cells only, using BiMapping
v5...
(6224, 20)
(4220, 20)

K_majority = 30 (for majority voting)
find knn...
voting...
ACGTTACAGGCATTAC-4-s2d1      CD16+ Mono
GGTGATTTCGCTAGAT-4-s2d1    Erythroblast
ACAGGATCACTAAGAA-4-s2d1            cDC2
CGCTACTTCATCCACC-4-s2d1      CD14+ Mono
CTTTGGTGTGCTAGAC-4-s2d1            B1 B
Name: cell_type_predicted, dtype: object
CPU times: user 3min 4s, sys: 6.13 s, total: 3min 10s
Wall time: 48.8 s


In [7]:
# Evaluate 
accuracy = accuracy_score(adata_query.obs['cell_type'], adata_query.obs['cell_type_predicted'])
print(f'accuracy: {accuracy}')

accuracy: 0.8056872037914692


## Case 2: knnMethod = 'NNDescent'

In [8]:
%%time
# Settings for this annotation run (kNN backend: NNDescent).
n_embedding = 20  # number of embeddings
K = 50  # this parameter heavily influences the speed; don't make it too large
K_majority = 30  # parameter for majority voting
normalization = True
knnMethod = 'NNDescent'
metric = 'euclidean'  # NOTE(review): assigned here but not passed to the call below
reduction_method_on_cells_only = 'BiMapping'  # 'BiMapping','SnapATAC2','None','minHash'

# Necessary setting: the cell type key of the reference dataset.
CellType_Key_for_ref = 'cell_type'

# Annotate the query cells against the reference.
scBiMapping.scBiMapping_annotation(
    adata_ref,
    adata_query,
    n_embedding=n_embedding,
    normalization=normalization,
    K=K,
    K_majority=K_majority,
    knnMethod=knnMethod,
    reduction_method_on_cells_only=reduction_method_on_cells_only,
    CellType_Key_for_ref=CellType_Key_for_ref,
)

# Predicted cell types for the first few query cells.
print(adata_query.obs['cell_type_predicted'].head())

n_embedding:  20
normalization:  True
K:  50
knnMethod:  NNDescent
for each cell, find K nearest genes in the co-embedded space......
for each cell, find K nearest genes in the co-embedded space......
knn_based_Sim_ref:
(6224, 116490)
knn_based_Sim_query: 
(4220, 116490)

Direct merge softmax-weighted coded reference and query dataset......
(10444, 116490)

 reduction on sparse cell-markerGenes softmax-weighted matrix, to get low-embedding of cells only, using BiMapping
v5...
(6224, 20)
(4220, 20)

K_majority = 30 (for majority voting)
find knn...
voting...
ACGTTACAGGCATTAC-4-s2d1      CD16+ Mono
GGTGATTTCGCTAGAT-4-s2d1    Erythroblast
ACAGGATCACTAAGAA-4-s2d1            cDC2
CGCTACTTCATCCACC-4-s2d1      CD14+ Mono
CTTTGGTGTGCTAGAC-4-s2d1            B1 B
Name: cell_type_predicted, dtype: object
CPU times: user 17min 10s, sys: 56.6 s, total: 18min 6s
Wall time: 3min 22s


In [9]:
# Evaluate: fraction of query cells whose predicted label equals the annotated label.
accuracy = accuracy_score(
    y_true=adata_query.obs['cell_type'],
    y_pred=adata_query.obs['cell_type_predicted'],
)
print(f'accuracy: {accuracy}')

accuracy: 0.8063981042654028
