In [1]:
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [2]:
from polyphony import Polyphony
from polyphony.data import QryAnnDataManager, RefAnnDataManager
from polyphony.tool import projection

Global seed set to 0


# Use your own dataset 

You may create your own dataset from AnnData files (.h5ad). \
This example uses the `Pancreas` dataset shown in the `1. Basic Usage` example. 

In [3]:
import scanpy as sc
import gdown

# `condition_key` names the `adata.obs` column that is compared against
# `target_conditions`; `output_filename` is where the dataset is stored locally.
condition_key = 'study'
target_conditions = ['Pancreas inDrop']
output_filename = './pancreas.h5ad'

# Download only when no file exists at the target path, so re-running the
# notebook does not fetch the dataset again.
if not os.path.exists(output_filename):
    url = 'https://drive.google.com/uc?confirm=pbef&id=1ehxgfHTsMZXy6YzlFKGJOsBKQ5rrvMnd'
    gdown.download(url=url, output=output_filename, quiet=False)

The `Pancreas` dataset contains sequencing results from five experiments with different sequencing techniques. \
We separate the dataset into a reference set (cells generated using a plate-based protocol) and a query set (cells generated using a droplet-based protocol, i.e., `adata.obs['study'].isin(['Pancreas inDrop'])`).

In [4]:
# Read the downloaded file and rebuild an AnnData object from its `.raw` attribute.
adata = sc.read(output_filename).raw.to_adata()

# Boolean mask over cells: True where the `condition_key` value is one of the
# target (query) conditions.
is_query_cell = adata.obs[condition_key].isin(target_conditions)

# Cells outside the target conditions form the reference set; the rest form the
# query set. `.copy()` turns each slice into an independent object.
ref_adata = adata[~is_query_cell].copy()
qry_adata = adata[is_query_cell].copy()

In [5]:
# Preview the first rows of the reference set's per-cell metadata table (`obs`).
ref_adata.obs.head()

Unnamed: 0_level_0,batch,study,cell_type,size_factors
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0-1-0-0-0,0,Pancreas CelSeq2,Pancreas Alpha,15213.94043
2-1-0-0-0,0,Pancreas CelSeq2,Pancreas Delta,11714.072266
3-1-0-0-0,0,Pancreas CelSeq2,Pancreas Beta,26895.630859
4-1-0-0-0,0,Pancreas CelSeq2,Pancreas Ductal,12899.861328
5-1-0-0-0,0,Pancreas CelSeq2,Pancreas Ductal,17666.962891


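When building a Reference Dataset or a Query Dataset for Polyphony, you need to specify at least the following `key` names (in this example, the reference dataset is given `batch_key` and `cell_type_key`, and the query dataset is given `batch_key` and `pred_key`).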
* `batch_key`: the name of the **batch name** field in `adata.obs`
* `cell_type_key`: the name of the **cell type** field in `adata.obs`
* `pred_key`: the name of a reserved field in `adata.obs` for cell type predictions

In [6]:
# Names of the `adata.obs` fields handed to the dataset managers.
batch_key = 'study'
cell_type_key = 'cell_type'
pred_key = 'cell_type_pred'

# The reference manager is given the batch and cell-type field names.
ref_dataset = RefAnnDataManager(
    ref_adata,
    {'batch_key': batch_key, 'cell_type_key': cell_type_key},
)

# The query manager is given the batch field name and the prediction field name.
qry_dataset = QryAnnDataManager(
    qry_adata,
    {'batch_key': batch_key, 'pred_key': pred_key},
)

In [7]:
# Create the Polyphony instance from the reference and query datasets.
# NOTE(review): 'test' is presumably the instance/experiment name — confirm
# against the Polyphony constructor.
pp = Polyphony('test', ref_dataset, qry_dataset)