In [1]:
import anndata as ad
import scHash
from sklearn.model_selection import train_test_split

## Pancreas Dataset
We demonstrate how scHash encodes multiple datasets into hash codes, using six publicly available Pancreas datasets.

The raw data for the first five datasets can be obtained from [Harmony](https://github.com/immunogenomics/harmony2019/tree/master/data/figure5).

The sixth Pancreas dataset is publicly available at GSE83139.

We compiled the six datasets into one AnnData object for easy demonstration. The processed data can be downloaded [here](https://drive.google.com/file/d/1shc4OYIbq2FwbyGUaYuzizuvzW-giSTs/view?usp=share_link).

In [None]:
# Path to the single AnnData file that packs all six pancreas datasets.
data_dir = '../../share_data/Pancreas_Wang/fivepancreas_wang_raw.h5ad'

# Hold one dataset out as the query (test) set; the remaining ones form
# the reference (training) set.
query = 'wang'
full = ad.read_h5ad(data_dir)
is_query = full.obs['dataset'] == query
train = full[~is_query]
test = full[is_query]

# Build the datamodule from the reference cells.
datamodule = scHash.setup_training_data(
    train_data=train,
    cell_type_key='cell_type',
    batch_key='dataset',
)

# Register the query data now; this could equally be done after training.
datamodule.setup_test_data(test)

# TODO: consider wrapping the checkpoint / train / test steps in a function.
# Location passed to scHash.training as `checkpointPath`.
checkpointPath = '../checkpoint/'

# Create the model and train it.
model = scHash.scHashModel(datamodule)
trainer, best_model_path = scHash.training(
    model=model,
    datamodule=datamodule,
    checkpointPath=checkpointPath,
    max_epochs=100,
)

# Test the best model.
scHash.testing(trainer, model, best_model_path, datamodule)

# Atlas Dataset

Here is a demonstration on an atlas-level dataset. We demonstrate atlas-level annotation with the Tabula Muris Senis dataset, which can be downloaded [here](https://figshare.com/projects/Tabula_Muris_Senis/64982). We followed scArches' preprocessing pipeline, and the preprocessed data can be downloaded [here](https://drive.google.com/file/d/1lfDu-TGsUvHrmXoSWkj0tptvWNYFgs2x/view?usp=share_link). The dataset contains 356213 cells with 5000 highly variable genes with cell type, method, and tissue annotations.

The steps are the same for atlas-level datasets.

In [None]:
# Path to the preprocessed Tabula Muris Senis AnnData file.
data_dir = '../../share_data/Tabula_Muris_Senis(TM)/tabula_senis_normalized_all_hvg.h5ad'

# This file contains both the reference and the query cells.
data = ad.read_h5ad(data_dir)

# Randomly split the cell positions into reference (80%) and query (20%).
# The split is stratified on `cell_ontology_class` and uses a fixed
# `random_state` so that it is reproducible across runs.
reference_indicies, query_indicies = train_test_split(
    list(range(data.shape[0])),
    train_size=0.8,
    stratify=data.obs.cell_ontology_class,
    random_state=42,
)

train = data[reference_indicies]
test = data[query_indicies]

# Build the datamodule from the reference cells (no batch key is passed here).
datamodule = scHash.setup_training_data(
    train_data=train,
    cell_type_key='cell_ontology_class',
)

# Register the query data now; this could equally be done after training.
datamodule.setup_test_data(test)

# TODO: consider wrapping the checkpoint / train / test steps in a function.
# Location passed to scHash.training as `checkpointPath`.
checkpointPath = '../checkpoint/'

# Create the model and train it.
# NOTE(review): `bit = 256` presumably sets the hash-code length -- confirm
# against the scHash documentation.
model = scHash.scHashModel(datamodule, bit=256)
trainer, best_model_path = scHash.training(
    model=model,
    datamodule=datamodule,
    checkpointPath=checkpointPath,
    max_epochs=100,
)

# Test the best model.
scHash.testing(trainer, model, best_model_path, datamodule)