In [2]:
import scHash
import anndata as ad

from sklearn.metrics import f1_score, precision_score, recall_score
from statistics import median

# Tutorial for scATAC-seq cell annotations
Here is a demonstration on a scATAC-seq dataset. For ATAC data, we used the mapping method adapted from [GLUE](https://www.nature.com/articles/s41587-022-01284-4) to map peaks to genes. The dataset can also be obtained from the GLUE GitHub page. We used the Muto-2021 dataset for this demonstration.

## Load data

In [3]:
# Path to the Muto-2021 .h5ad file used throughout this tutorial
data_dir = '../../../../share_data/Symphony/symphony_reproducibility/scATAC/atac2rna/Muto-2021/Muto-2021-FRAGS2RNA.h5ad'

# A single AnnData object holds every batch, i.e. both reference and query cells
data = ad.read_h5ad(data_dir)

# Hold out one (arbitrarily chosen) batch as the query;
# every remaining batch forms the reference / training set
query = '5028f75a-8c09-4155-a232-ad7dbfa6042e'
batch_labels = data.obs['batch']
train = data[batch_labels != query]
test = data[batch_labels == query]

## Training Model

In [None]:
# Wrap the reference cells in a scHash datamodule, naming the obs columns
# that hold the cell-type labels and the batch identifiers
datamodule = scHash.setup_training_data(
    train_data=train,
    cell_type_key='cell_type',
    batch_key='batch',
)

# Directory in which the trained model is saved
checkpointPath = '../checkpoint/'

# Initialize the scHash model from the datamodule and train it (max_epochs = 70)
model = scHash.scHashModel(datamodule)
trainer, best_model_path, training_time = scHash.training(
    model=model,
    datamodule=datamodule,
    checkpointPath=checkpointPath,
    max_epochs=70,
)

## Test Model

In [6]:
# Register the held-out query batch with the datamodule
datamodule.setup_test_data(test)

# Run the trained model on the query cells
pred_labels, hash_codes = scHash.testing(trainer, model, best_model_path)

# Score the predictions: average=None yields one F1 value per cell type,
# and the median of those values (rounded to 3 decimals) is the reported metric
labels_true = test.obs['cell_type']
per_class_f1 = f1_score(labels_true, pred_labels, average=None)
f1_median = round(median(per_class_f1), 3)

print(f'F1 Median: {f1_median}')

F1 Median: 0.965
