In [1]:
import scHash
import anndata as ad

## Pancreas Dataset
We demonstrate how scHash encodes multiple datasets into hash codes, using six publicly available pancreas datasets.

The raw data for the first five datasets can be obtained from [Harmony](https://github.com/immunogenomics/harmony2019/tree/master/data/figure5).

The sixth Pancreas dataset, from [(Wang et al., 2016)](https://diabetesjournals.org/diabetes/article/65/10/3028/34922/Single-Cell-Transcriptomics-of-the-Human-Endocrine), is publicly available at [GSE83139](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE83139).

We compiled the six datasets into one AnnData object for easy demonstration. The processed data can be downloaded [here](https://drive.google.com/file/d/1shc4OYIbq2FwbyGUaYuzizuvzW-giSTs/view?usp=share_link).

In [3]:
# Location of the combined AnnData file (six pancreas datasets in one object).
data_dir = '../../share_data/Pancreas_Wang/fivepancreas_wang_raw.h5ad'

# Hold out one dataset as the query (test) set; every other dataset is used
# as the reference (training) set.
query = 'wang'
full = ad.read_h5ad(data_dir)
is_query = full.obs.dataset == query
train = full[~is_query]
test = full[is_query]

# Build the training datamodule from the reference cells.
datamodule = scHash.util.setup_training_data(
    train_data=train,
    cell_type_key='cell_type',
    batch_key='dataset',
)

# Register the query data (original note: this can also be done after training).
datamodule.setup_test_data(test)

# TODO: consider wrapping the remaining steps in a helper function.
# Directory handed to the training helper for model checkpoints.
checkpointPath = '../checkpoint/'

# Initialise the model and train it.
model = scHash.scHashModel(datamodule)
trainer, best_model_path = scHash.util.training(
    model=model,
    datamodule=datamodule,
    checkpointPath=checkpointPath,
    max_epochs=50,
)

# Test the best model found during training.
scHash.util.testing(trainer, model, best_model_path, datamodule)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type       | Params
------------------------------------------
0 | hash_layer | Sequential | 641 K 
------------------------------------------
641 K     Trainable params
0         Non-trainable params
641 K     Total params
2.567     Total estimated model params size (MB)
Epoch 9, global step 9: 'Val_F1_score_median_CHC_epoch' reached 0.93064 (best 0.93064), saving model to '/project/6061845/shaoc/checkpoint/scHash-epoch=09-Val_F1_score_median_CHC_epoch=0.931.ckpt' as top 1
Epoch 19, global step 19: 'Val_F1_score_median_CHC_epoch' reached 0.95844 (best 0.95844), saving model to '/project/6061845/shaoc/checkpoint/scHash-epoch=19-Val_F1_score_median_CHC_epoch=0.958.ckpt' as top 1
Epoch 29, global step 29: 'Val_F1_score_median_CHC_epoch' reached 0.96209 (best 0.96209), saving model to '/

Epoch: 49, Val_loss_epoch: 0.03


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


# Atlas Dataset

Here is a demonstration on an atlas-level dataset. We demonstrate atlas-level annotation with the Tabula Muris Senis dataset, which can be downloaded from https://figshare.com/projects/Tabula_Muris_Senis/64982. We followed scArches' preprocessing pipeline, and the preprocessed data can be downloaded from https://drive.google.com/file/d/1lfDu-TGsUvHrmXoSWkj0tptvWNYFgs2x/view?usp=share_link. The dataset contains 356213 cells with 5000 highly variable genes with cell type, method, and tissue annotations.

The steps are the same for atlas-level datasets.

In [None]:
from sklearn.model_selection import train_test_split

data_dir = '../../share_data/Tabula_Muris_Senis(TM)/tabula_senis_normalized_all_hvg.h5ad'

# This single file holds both the reference and the query cells.
data = ad.read_h5ad(data_dir)

# Randomly split the row indices 80/20 into reference and query, stratified
# by the cell-type label and with a fixed random_state.
n_cells = data.shape[0]
reference_indicies, query_indicies = train_test_split(
    list(range(n_cells)),
    train_size=0.8,
    stratify=data.obs.cell_ontology_class,
    random_state=42,
)

train = data[reference_indicies]
test = data[query_indicies]

# Build the training datamodule from the reference cells (no batch key here).
datamodule = scHash.util.setup_training_data(
    train_data=train,
    cell_type_key='cell_ontology_class',
)

# Register the query data (original note: this can also be done after training).
datamodule.setup_test_data(test)

# TODO: consider wrapping the remaining steps in a helper function.
# Directory handed to the training helper for model checkpoints.
checkpointPath = '../checkpoint/'

# Initialise the model with bit = 128 and train it.
model = scHash.scHashModel(datamodule, bit=128)
trainer, best_model_path = scHash.util.training(
    model=model,
    datamodule=datamodule,
    checkpointPath=checkpointPath,
    max_epochs=100,
)

# Test the best model found during training.
scHash.util.testing(trainer, model, best_model_path, datamodule)