# Tabula Data Loader

This file contains instructions for creating scanpy anndata versions of:
    
    
|Dataset|Paper|
|-------|-----|
|Tabula Sapiens|https://www.science.org/stoken/author-tokens/ST-495/full|
|Tabula Microcebus|https://www.biorxiv.org/content/10.1101/2021.12.12.469460v1|
|Tabula Muris|https://www.nature.com/articles/s41586-018-0590-4|

# Downloads

Download Sapiens data from https://figshare.com/projects/Tabula_Sapiens/100973

Download Microcebus data from https://figshare.com/articles/dataset/Tabula_Microcebus_v1_0/14468196?file=31777475

Download Muris data from 


In [None]:
## Sapiens (~15gb)

In [None]:
!wget -r "https://figshare.com/ndownloader/files/34702114" -O ./data/sapiens.h5ad.zip

will be placed in the single file you specified.

--2022-11-14 11:51:57--  https://figshare.com/ndownloader/files/34702114
Resolving figshare.com (figshare.com)... 34.252.180.148, 34.250.174.243, 2a05:d018:1f4:d003:64d9:8f4f:2f30:52f7, ...
Connecting to figshare.com (figshare.com)|34.252.180.148|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/34702114/TabulaSapiens.h5ad.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20221114/eu-west-1/s3/aws4_request&X-Amz-Date=20221114T195158Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=46ae7c1a3efd18c334b46996de69cf49e3c15293cd9b631acb29f7cd1f521ba8 [following]
--2022-11-14 11:51:58--  https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/34702114/TabulaSapiens.h5ad.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20221114/eu-west-1/s3/aws4_request&X-Amz-Date=20221114T195158Z&X-Amz-Expires=10&X-Amz-Sig

In [None]:
## Microcebus (~9 GB)

In [None]:
!wget -r "https://figshare.com/ndownloader/files/31777475" -O ./data/mouse_lemur.h5ad

In [None]:
## Muris

In [None]:
!wget -r "https://figshare.com/ndownloader/files/31777475" -O ./data/sapiens.h5ad.zip

In [1]:
import scanpy as sc
from glob import glob
import pandas as pd
import numpy as np

# Coarsen / Map Tissues

In [2]:
cell_type_number_filter = 200
tissue_subset = True
ten_x_subset = True

In [3]:
# list of tissues
all_tissues = ["liver", "trachea", "tounge", "spleen", 
               "skin", "bladder", "bone_marrow",
               "heart_and_aorta", "lung", "blood",
               "mammary", "bone", "intestine", "uterus",
               "fat", "kidney", "pancreas", "eye", "prostate", 
               "muscle", "thymus", "brain", "colon", "endocrine",  "testes",
               "lymph_node", "salivary_gland"
              ]

human_tissue_map = {
    "Liver": "liver",
    "Trachea": "trachea",
    "Blood": "blood",
    "Lymph_Node": "lymph_node",
    "Salivary_Gland": "salivary_gland",
    "Spleen": "spleen",
    "Tongue": "tounge",
    "Mammary": "mammary",
    "Uterus": "uterus",
    "Eye": "eye",
    "Fat": "fat",
    "Skin": "skin",
    "Bone_Marrow": "bone_marrow",
    "Heart": "heart_and_aorta",
    "Pancreas": "pancreas",
    "Prostate": "prostate",
    "Muscle": "muscle",
    "Thymus": "thymus",
    "Bladder": "bladder",
    "Large_Intestine": "intestine",
    "Lung": "lung",
    "Small_Intestine": "intestine",
    "Vasculature": "muscle",
    "Kidney": "kidney"
}


mouse_tissue_map = {
    "Tongue": "tounge",
    "Heart_and_Aorta": "heart_and_aorta",
    "Marrow": "bone_marrow",
    "Mammary_Gland": "mammary",
    "Fat": "fat",
    "Kidney": "kidney",
    "Liver": "liver",
    "Lung": "lung",
    "Limb_Muscle": "muscle",
    "Pancreas": "pancreas",
    "Spleen": "spleen",
    "Thymus": "thymus",
    "Bladder": "bladder",
    "Skin": "skin",
    "Large_Intestine": "intestine",
    "Trachea": "trachea"
}


lemur_tissue_map = {
    "Testes": "testes",
    "Heart": "heart_and_aorta",
    "Liver": "liver",
    "Thymus": "thymus",
    "Eye_retina": "eye",
    "Brain_cortex": "brain",
    "Brainstem": "brain",
    "Pancreas": "pancreas",
    "Small_intestine": "intestine",
    "Lung": "lung",
    "Kidney": "kidney",
    "Tongue": "tounge",
    "Diaphragm": "muscle",
    "Limb_muscle": "muscle",
    "Spleen": "spleen",
    "Blood": "blood",
    "Bone": "bone",
    "Bone_marrow": "bone_marrow",
    "Bladder": "bladder",
    "Skin": "skin",
    "Colon": "colon",
    "Aorta": "heart_and_aorta",
    "Hypothalamus_Pituitary": "endocrine",
    "Mammary_gland": "mammary",
    "Fat": "fat",
    "Uterus": "uterus",
    "Trachea": "trachea"
}





keep_tissues = ["bone_marrow", "muscle", "pancreas", "spleen", "thymus", "trachea", "bladder", 
                "lung", "kidney"] # _full
                #"heart_and_aorta", "intestine", "skin", "muscle"] # full_more_tissues
                # causes a weird bug such that HV genes doesn't work for lemur heart and aorta due to to few cells
keep_tissues

['bone_marrow',
 'muscle',
 'pancreas',
 'spleen',
 'thymus',
 'trachea',
 'bladder',
 'lung',
 'kidney']

# Coarsen Labels

In [4]:
obo_loc = "/dfs/project/cross-species/yanay/data/tabula/cl.obo.txt"
with open(obo_loc, "r", encoding='utf-8') as f:
    obo = f.readlines()

In [5]:
obo_term_idxs = np.where([o.startswith('[Term]') for o in obo])[0]
obo_term_idxs

array([   17,    33,    45, ..., 22772, 22782, 22792])

In [6]:
def parse_group(lines):
    term_id = lines[[o.startswith("id:") for o in lines]][0].strip().split()[1]
    try:
        is_a = lines[[o.startswith("is_a:") for o in lines]][0].strip().split()[1]
    except:
        is_a = None
    name = lines[[o.startswith("name:") for o in lines]][0].strip().split("name:")[1].strip()
    return name, is_a, term_id

In [7]:
obo[33:45]

['[Term]\n',
 'id: CL:0000001\n',
 'name: primary cultured cell\n',
 'namespace: cell\n',
 'def: "A cultured cell that is freshly isolated from a organismal source, or derives in culture from such a cell prior to the culture being passaged." [ReO:mhb]\n',
 'comment: Covers cells actively being cultured or stored in a quiescent state for future use.\n',
 'synonym: "primary cell culture cell" EXACT []\n',
 'synonym: "primary cell line cell" RELATED []\n',
 'synonym: "unpassaged cultured cell" EXACT []\n',
 'xref: BTO:0002290\n',
 'is_a: CL:0000010 ! cultured cell\n',
 '\n']

In [8]:
parse_group(np.array(obo[33:45]))

('primary cultured cell', 'CL:0000010', 'CL:0000001')

In [9]:
all_rows = []
for i in range(1, len(obo_term_idxs) - 1):
    ls = np.array(obo[obo_term_idxs[i]:obo_term_idxs[i+1]])
    r = parse_group(ls)
    all_rows.append(r)

In [10]:
obo_tbl = pd.DataFrame(all_rows, columns=["name", "is_a", "id"]).set_index("id")
obo_tbl.index = obo_tbl.index.astype(str)
obo_tbl.head(20)

Unnamed: 0_level_0,name,is_a
id,Unnamed: 1_level_1,Unnamed: 2_level_1
CL:0000001,primary cultured cell,CL:0000010
CL:0000002,obsolete immortal cell line cell,
CL:0000003,native cell,CL:0000000
CL:0000004,obsolete cell by organism,
CL:0000005,fibroblast neural crest derived,CL:0000057
CL:0000006,neuronal receptor cell,CL:0000101
CL:0000007,early embryonic cell,CL:0002321
CL:0000008,migratory cranial neural crest cell,CL:0000333
CL:0000009,obsolete fusiform initial,
CL:0000010,cultured cell,CL:0000578


In [11]:
COARSE_MIN_CT = 4
EXCLUDED_CTS = [1, 2, 3, 4, 255, 10, 548, 325, 2371, 630, 219, 11115, 473, 145, 62, 7010]
def coarsen(cl_id):
    if not int(cl_id.split(":")[1]) in EXCLUDED_CTS:
        r = obo_tbl.loc[cl_id]
        new_id = r["is_a"]
        new_name = obo_tbl.loc[new_id]["name"]
        if int(new_id.split(":")[1]) in EXCLUDED_CTS:
            # don't over coarsen
            new_id = cl_id
            new_name = obo_tbl.loc[cl_id]["name"]
    else:
        new_id = cl_id
        new_name = obo_tbl.loc[cl_id]["name"]
    return new_name, new_id

In [12]:
def coarsen_labels(labs, max_level=2):
    coarsened_ids_dict = {}
    coarsened_names_dict = {}
    for cl in labs.unique():
        try:
            new_name, new_id = coarsen(cl)
            i = 0
            cl_og = cl
            while new_id != cl:
                cl_store = new_id
                new_name, new_id = coarsen(new_id) 
                cl = cl_store
                i += 1
                if i > max_level:
                    break
            coarsened_ids_dict[cl_og] = new_id
            coarsened_names_dict[cl_og] = new_name
        except:
            next
    return coarsened_ids_dict, coarsened_names_dict

# Sapiens Data

In [13]:
human = sc.read("/dfs/project/cross-species/yanay/data/tabula/sapiens.h5ad")

In [14]:
human.X = human.layers["decontXcounts"]

In [15]:
human.var_names = human.var["feature_name"]

AnnData expects .var.index to contain strings, but got values like:
    ['DDX11L1', 'WASH7P', 'MIR6859-1', 'MIR1302-2HG', 'MIR1302-2']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "var")


In [16]:
old_set = {}
for level in np.arange(20, 0, -1):
    coarsened_ids_dict, coarsened_names_dict = coarsen_labels(human.obs["cell_type_ontology_term_id"], max_level=level)
    new_set = set(coarsened_names_dict.values())
    diff = new_set.difference(old_set)
    print(f"Level {level}: added {diff}")
    old_set = new_set
    

Level 20: added {'neural cell', 'leukocyte', 'fat cell', 'barrier cell', 'cell of skeletal muscle', 'hematopoietic cell', 'stem cell', 'electrically active cell', 'secretory cell', 'salivary gland cell', 'connective tissue cell', 'contractile cell', 'pigment cell', 'adventitial cell', 'embryonic cell', 'ciliated cell', 'phagocyte', 'epithelial cell'}
Level 19: added set()
Level 18: added set()
Level 17: added set()
Level 16: added set()
Level 15: added set()
Level 14: added set()
Level 13: added set()
Level 12: added set()
Level 11: added set()
Level 10: added set()
Level 9: added set()
Level 8: added set()
Level 7: added set()
Level 6: added set()
Level 5: added {'nongranular leukocyte'}
Level 4: added {'lymphocyte', 'electrically responsive cell'}
Level 3: added {'T cell', 'neuron', 'endo-epithelial cell'}
Level 2: added {'hematopoietic precursor cell', 'neuron associated cell', 'muscle cell', 'respiratory epithelial cell', 'alpha-beta T cell', 'electrically responsive cell', 'affere

In [17]:
coarsened_ids_dict, coarsened_names_dict = coarsen_labels(human.obs["cell_type_ontology_term_id"], max_level=2)

In [18]:
list(zip(coarsened_ids_dict.values(), coarsened_names_dict.values()))

[('CL:0000234', 'phagocyte'),
 ('CL:0000738', 'leukocyte'),
 ('CL:0000215', 'barrier cell'),
 ('CL:0000738', 'leukocyte'),
 ('CL:0000542', 'lymphocyte'),
 ('CL:0002320', 'connective tissue cell'),
 ('CL:0000066', 'epithelial cell'),
 ('CL:0000066', 'epithelial cell'),
 ('CL:0000738', 'leukocyte'),
 ('CL:0000988', 'hematopoietic cell'),
 ('CL:0002087', 'nongranular leukocyte'),
 ('CL:0002368', 'respiratory epithelial cell'),
 ('CL:0000215', 'barrier cell'),
 ('CL:0000183', 'contractile cell'),
 ('CL:0000064', 'ciliated cell'),
 ('CL:0000066', 'epithelial cell'),
 ('CL:0000151', 'secretory cell'),
 ('CL:0000034', 'stem cell'),
 ('CL:0000738', 'leukocyte'),
 ('CL:0000151', 'secretory cell'),
 ('CL:0000542', 'lymphocyte'),
 ('CL:0000151', 'secretory cell'),
 ('CL:0000738', 'leukocyte'),
 ('CL:0000542', 'lymphocyte'),
 ('CL:0000542', 'lymphocyte'),
 ('CL:0000988', 'hematopoietic cell'),
 ('CL:0000084', 'T cell'),
 ('CL:0000084', 'T cell'),
 ('CL:0000234', 'phagocyte'),
 ('CL:0000542', 'lymp

In [19]:
np.unique(list(coarsened_names_dict.values())), np.unique(list(coarsened_names_dict.values())).shape

(array(['T cell', 'adventitial cell', 'afferent neuron',
        'alpha-beta T cell', 'barrier cell', 'cell of skeletal muscle',
        'ciliated cell', 'connective tissue cell', 'contractile cell',
        'electrically active cell', 'electrically responsive cell',
        'embryonic cell', 'epithelial cell', 'fat cell',
        'hematopoietic cell', 'hematopoietic precursor cell', 'leukocyte',
        'lymphocyte', 'muscle cell', 'neural cell',
        'neuron associated cell', 'nongranular leukocyte', 'phagocyte',
        'pigment cell', 'respiratory epithelial cell',
        'salivary gland cell', 'secretory cell', 'stem cell'], dtype='<U28'),
 (28,))

In [20]:
[print(a) for a in np.unique(list(coarsened_names_dict.values()))];

T cell
adventitial cell
afferent neuron
alpha-beta T cell
barrier cell
cell of skeletal muscle
ciliated cell
connective tissue cell
contractile cell
electrically active cell
electrically responsive cell
embryonic cell
epithelial cell
fat cell
hematopoietic cell
hematopoietic precursor cell
leukocyte
lymphocyte
muscle cell
neural cell
neuron associated cell
nongranular leukocyte
phagocyte
pigment cell
respiratory epithelial cell
salivary gland cell
secretory cell
stem cell


In [21]:
[print(a) for a in np.unique(list(coarsened_names_dict.values()))];

T cell
adventitial cell
afferent neuron
alpha-beta T cell
barrier cell
cell of skeletal muscle
ciliated cell
connective tissue cell
contractile cell
electrically active cell
electrically responsive cell
embryonic cell
epithelial cell
fat cell
hematopoietic cell
hematopoietic precursor cell
leukocyte
lymphocyte
muscle cell
neural cell
neuron associated cell
nongranular leukocyte
phagocyte
pigment cell
respiratory epithelial cell
salivary gland cell
secretory cell
stem cell


In [22]:
human

AnnData object with n_obs × n_vars = 483152 × 58559
    obs: 'tissue_in_publication', 'assay_ontology_term_id', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'tissue_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity', 'development_stage'
    var: 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'feature_biotype', 'feature_is_filtered', 'feature_name', 'feature_reference'
    uns: 'X_normalization', '_scvi', '_training_mode', 'compartment_colors', 'default_embedding', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'dendrogram_consensus_prediction', 'dendrogram_tissu

In [23]:
len(np.unique(list(coarsened_names_dict.values())))

28

In [24]:
len(np.unique(list(coarsened_names_dict.values())))

28

In [25]:
not_mapped = {}
for ctid, ctname in zip(human.obs["cell_type_ontology_term_id"], human.obs["cell_type"]):
    if ctid not in coarsened_names_dict.keys():
        not_mapped[ctname] = ctid
set(not_mapped.keys())

{'fibroblast of breast',
 'intestinal crypt stem cell of large intestine',
 'intestinal crypt stem cell of small intestine',
 'intestinal tuft cell',
 'paneth cell of colon',
 'pulmonary ionocyte',
 'transit amplifying cell of colon',
 'transit amplifying cell of small intestine'}

In [26]:
not_mapped

{'fibroblast of breast': 'CL:4006000',
 'paneth cell of colon': 'CL:0009009',
 'transit amplifying cell of colon': 'CL:0009011',
 'intestinal crypt stem cell of large intestine': 'CL:0009016',
 'intestinal tuft cell': 'CL:0019032',
 'transit amplifying cell of small intestine': 'CL:0009012',
 'intestinal crypt stem cell of small intestine': 'CL:0009017',
 'pulmonary ionocyte': 'CL:0017000'}

In [27]:
coarsened_names_dict = {**coarsened_names_dict, **{not_mapped['fibroblast of breast']:"connective tissue cell",
 not_mapped['intestinal crypt stem cell of large intestine']:"stem cell",
 not_mapped['intestinal crypt stem cell of small intestine']:"stem cell",
 not_mapped['intestinal tuft cell']:"epithelial cell",
 not_mapped['paneth cell of colon']:"epithelial cell",
 not_mapped['pulmonary ionocyte']:"epithelial cell",
 not_mapped['transit amplifying cell of colon']:"stem cell",
 not_mapped['transit amplifying cell of small intestine']:"stem cell"}}

In [28]:
coarsened_ids = [coarsened_ids_dict.get(ctid, ctid) for ctid in human.obs["cell_type_ontology_term_id"]]
coarsened_names = [coarsened_names_dict.get(ctid, ctname) for ctid, ctname in zip(human.obs["cell_type_ontology_term_id"], human.obs["cell_type"])]

In [29]:
human.obs["coarse_cell_id"] = coarsened_ids
human.obs["coarse_cell_type"] = coarsened_names

In [30]:
sc.pp.filter_genes(human, min_counts=500)
sc.pp.filter_cells(human, min_counts=1000)



In [31]:
if ten_x_subset:
    human = human[human.obs["assay"] == "10x 3' v3"]

In [32]:
human.obs["tissue_type"] = [human_tissue_map[t] for t in human.obs["tissue_in_publication"]]
if tissue_subset:
    human = human[human.obs["tissue_type"].isin(keep_tissues)]

  human.obs["tissue_type"] = [human_tissue_map[t] for t in human.obs["tissue_in_publication"]]


In [33]:
human

View of AnnData object with n_obs × n_vars = 204155 × 42499
    obs: 'tissue_in_publication', 'assay_ontology_term_id', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'tissue_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity', 'development_stage', 'coarse_cell_id', 'coarse_cell_type', 'n_counts', 'tissue_type'
    var: 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'feature_biotype', 'feature_is_filtered', 'feature_name', 'feature_reference', 'n_counts'
    uns: 'X_normalization', '_scvi', '_training_mode', 'default_embedding', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_as

In [34]:
# filter to large cell types
if cell_type_number_filter > 0:
    human_keep_cell_types = human.obs["cell_ontology_class"].value_counts()[np.where(human.obs["cell_ontology_class"].value_counts() > cell_type_number_filter)[0]].index
    human = human[human.obs["cell_ontology_class"].isin(human_keep_cell_types)]

    print(len(human_keep_cell_types))

71


In [35]:
62

62

In [36]:
len(np.unique(coarsened_names))

28

In [37]:
human

View of AnnData object with n_obs × n_vars = 202355 × 42499
    obs: 'tissue_in_publication', 'assay_ontology_term_id', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'tissue_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity', 'development_stage', 'coarse_cell_id', 'coarse_cell_type', 'n_counts', 'tissue_type'
    var: 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'feature_biotype', 'feature_is_filtered', 'feature_name', 'feature_reference', 'n_counts'
    uns: 'X_normalization', '_scvi', '_training_mode', 'default_embedding', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_as

In [38]:
human_subset = human
human_subset.obs = human.obs[["coarse_cell_type", "cell_ontology_class", "tissue_type"]]
human_subset.obs["cell_type"] = human.obs["cell_ontology_class"]

In [39]:
del human_subset.raw # annoying subset rule for anndatas, to work with SAMap Comparison

In [40]:
human_subset.raw

In [41]:
# human_subset.write("/dfs/project/cross-species/yanay/data/tabula/finished/human.h5ad") # 500 cells
# human_subset.write("/dfs/project/cross-species/yanay/data/tabula/finished/human_all.h5ad") # 3
human_subset.write(f"/dfs/project/cross-species/yanay/data/tabula/finished/human_ct{cell_type_number_filter}_tissue{tissue_subset}_10x{ten_x_subset}.h5ad")

In [50]:
f"/dfs/project/cross-species/yanay/data/tabula/finished/human_ct{cell_type_number_filter}_tissue{tissue_subset}_10x{ten_x_subset}.h5ad"

'/dfs/project/cross-species/yanay/data/tabula/finished/human_ct0_tissueFalse_10xFalse.h5ad'

In [None]:
human_subset.obs["coarse_cell_type"].unique()

In [None]:
[print(a) for a in sorted(human_subset.obs["tissue_type"].unique())];

In [None]:
human_bladder_subset = human_subset[human_subset.obs["tissue_type"] == "bladder"]

In [None]:
human_bladder_subset.write("/dfs/project/cross-species/yanay/data/tabula/bladder/human.h5ad")

In [None]:
[print(a) for a in human_bladder_subset.obs["cell_type"].unique()];

In [None]:
human_bladder_subset

In [43]:
human = human_subset = human_bladder_subset = None 

# Tabula Microcebus

In [42]:
lemur = sc.read("/dfs/project/cross-species/yanay/data/tabula/microcebus/mouse_lemur.h5ad")
lemur

AnnData object with n_obs × n_vars = 244081 × 31509
    obs: 'nCount_RNA', 'nFeature_RNA', 'cell_name', 'cell_barcode_10x', 'sequencing_run_10x', 'channel_10x', 'possibly_contaminated_barcode_10x', 'method', 'individual', 'age', 'sex', 'tissue', 'tissue_system', 'tissue_order', 'subtissue', 'compartment_v1', 'cell_ontology_class_v1', 'free_annotation_v1', 'tissue__cell_ontology_class_v1', 'tissue__free_annotation_v1', 'mix_hybrid', 'low_quality', 'dendrogram_annotation_number', 'dendrogram_annotation_order', 'order__compartment_freeannotation_tissue', 'order__tissue_compartment_freeannotation', 'Mimu_168', 'Mimu_W03', 'Mimu_W04', 'Mimu_180ps', 'Mimu_191', 'Mimu_202', 'Mimu_208', 'Mimu_218', 'Mimu_229ps', 'Mimu_239ps', 'Mimu_249', 'Mimu_DMA', 'Mimu_DMB', 'Mimu_DPA', 'Mimu_DPB', 'Mimu_DQA', 'Mimu_DQB', 'Mimu_DRA', 'Mimu_DRB', 'MHC_C_I', 'MHC_NC_I', 'MHC_all_II', 'nMimu_168', 'nMimu_W03', 'nMimu_W04', 'nMimu_180ps', 'nMimu_191', 'nMimu_202', 'nMimu_208', 'nMimu_218', 'nMimu_229ps', 'nMimu

In [43]:
lemur.X = lemur.layers["raw_counts"]

In [44]:
lemur

AnnData object with n_obs × n_vars = 244081 × 31509
    obs: 'nCount_RNA', 'nFeature_RNA', 'cell_name', 'cell_barcode_10x', 'sequencing_run_10x', 'channel_10x', 'possibly_contaminated_barcode_10x', 'method', 'individual', 'age', 'sex', 'tissue', 'tissue_system', 'tissue_order', 'subtissue', 'compartment_v1', 'cell_ontology_class_v1', 'free_annotation_v1', 'tissue__cell_ontology_class_v1', 'tissue__free_annotation_v1', 'mix_hybrid', 'low_quality', 'dendrogram_annotation_number', 'dendrogram_annotation_order', 'order__compartment_freeannotation_tissue', 'order__tissue_compartment_freeannotation', 'Mimu_168', 'Mimu_W03', 'Mimu_W04', 'Mimu_180ps', 'Mimu_191', 'Mimu_202', 'Mimu_208', 'Mimu_218', 'Mimu_229ps', 'Mimu_239ps', 'Mimu_249', 'Mimu_DMA', 'Mimu_DMB', 'Mimu_DPA', 'Mimu_DPB', 'Mimu_DQA', 'Mimu_DQB', 'Mimu_DRA', 'Mimu_DRB', 'MHC_C_I', 'MHC_NC_I', 'MHC_all_II', 'nMimu_168', 'nMimu_W03', 'nMimu_W04', 'nMimu_180ps', 'nMimu_191', 'nMimu_202', 'nMimu_208', 'nMimu_218', 'nMimu_229ps', 'nMimu

In [45]:
lemur_ontology_names = lemur.obs["cell_ontology_class_v1"].unique()
len(lemur_ontology_names)

145

In [46]:
lemur_ontology_names_to_id = {}
for lon in lemur_ontology_names:
    r = obo_tbl[obo_tbl["name"] == lon]
    if r.shape[0] != 0:
        lemur_ontology_names_to_id[lon] = obo_tbl[obo_tbl["name"] == lon].index[0]

In [47]:
len(lemur_ontology_names_to_id)

136

In [48]:
lemur_ct_ids = [lemur_ontology_names_to_id.get(ctname, "na") for ctname in lemur.obs["cell_ontology_class_v1"]]
lemur.obs["cell_ontology_id"] = lemur_ct_ids

In [49]:
old_set = {}
for level in np.arange(20, 0, -1):
    coarsened_ids_dict, coarsened_names_dict = coarsen_labels(lemur.obs["cell_ontology_id"], max_level=level)
    new_set = set(coarsened_names_dict.values())
    diff = new_set.difference(old_set)
    print(f"Level {level}: added {diff}")
    old_set = new_set

Level 20: added {'neural cell', 'hair follicle cell', 'leukocyte', 'M cell of gut', 'germ line cell', 'fat cell', 'barrier cell', 'osteoblast', 'cell of skeletal muscle', 'hematopoietic cell', 'stem cell', 'electrically active cell', 'kidney cell', 'secretory cell', 'preosteoblast', 'connective tissue cell', 'contractile cell', 'pigment cell', 'ciliated cell', 'phagocyte', 'epithelial cell'}
Level 19: added set()
Level 18: added set()
Level 17: added set()
Level 16: added set()
Level 15: added set()
Level 14: added set()
Level 13: added set()
Level 12: added set()
Level 11: added set()
Level 10: added set()
Level 9: added set()
Level 8: added set()
Level 7: added set()
Level 6: added {'electrically responsive cell'}
Level 5: added {'neuron'}
Level 4: added {'afferent neuron', 'kidney epithelial cell'}
Level 3: added {'nongranular leukocyte', 'sensory neuron', 'endo-epithelial cell', 'electrically responsive cell', 'epithelial cell of nephron'}
Level 2: added {'kidney epithelial cell', 

In [50]:
coarsened_ids_dict, coarsened_names_dict = coarsen_labels(lemur.obs["cell_ontology_id"], max_level=5)

In [51]:
coarsened_names_dict

{'CL:0000018': 'germ line cell',
 'CL:0000017': 'germ line cell',
 'CL:0000771': 'hematopoietic cell',
 'CL:0000020': 'germ line cell',
 'CL:0000235': 'phagocyte',
 'CL:0002144': 'epithelial cell',
 'CL:0000542': 'leukocyte',
 'CL:0000499': 'connective tissue cell',
 'CL:0000077': 'epithelial cell',
 'CL:0000764': 'hematopoietic cell',
 'CL:0000084': 'leukocyte',
 'CL:0002131': 'contractile cell',
 'CL:0000763': 'hematopoietic cell',
 'CL:0000669': 'contractile cell',
 'CL:0000775': 'hematopoietic cell',
 'CL:0000623': 'leukocyte',
 'CL:0002129': 'contractile cell',
 'CL:0000115': 'barrier cell',
 'CL:0002063': 'secretory cell',
 'CL:0000057': 'connective tissue cell',
 'CL:0000540': 'electrically active cell',
 'CL:0002068': 'contractile cell',
 'CL:0002072': 'contractile cell',
 'CL:0000625': 'leukocyte',
 'CL:0000814': 'leukocyte',
 'CL:0000624': 'leukocyte',
 'CL:0002138': 'barrier cell',
 'CL:1000413': 'epithelial cell',
 'CL:0000136': 'fat cell',
 'CL:0000604': 'neuron',
 'CL:000

In [52]:
np.unique(list(coarsened_names_dict.values())), len(np.unique(list(coarsened_names_dict.values())))

(array(['M cell of gut', 'barrier cell', 'cell of skeletal muscle',
        'ciliated cell', 'connective tissue cell', 'contractile cell',
        'electrically active cell', 'epithelial cell', 'fat cell',
        'germ line cell', 'hair follicle cell', 'hematopoietic cell',
        'kidney cell', 'leukocyte', 'neural cell', 'neuron', 'osteoblast',
        'phagocyte', 'pigment cell', 'preosteoblast', 'secretory cell',
        'stem cell'], dtype='<U24'),
 22)

In [53]:
[print(a) for a in np.unique(list(coarsened_names_dict.values()))];

M cell of gut
barrier cell
cell of skeletal muscle
ciliated cell
connective tissue cell
contractile cell
electrically active cell
epithelial cell
fat cell
germ line cell
hair follicle cell
hematopoietic cell
kidney cell
leukocyte
neural cell
neuron
osteoblast
phagocyte
pigment cell
preosteoblast
secretory cell
stem cell


In [54]:
coarsened_names_dict

{'CL:0000018': 'germ line cell',
 'CL:0000017': 'germ line cell',
 'CL:0000771': 'hematopoietic cell',
 'CL:0000020': 'germ line cell',
 'CL:0000235': 'phagocyte',
 'CL:0002144': 'epithelial cell',
 'CL:0000542': 'leukocyte',
 'CL:0000499': 'connective tissue cell',
 'CL:0000077': 'epithelial cell',
 'CL:0000764': 'hematopoietic cell',
 'CL:0000084': 'leukocyte',
 'CL:0002131': 'contractile cell',
 'CL:0000763': 'hematopoietic cell',
 'CL:0000669': 'contractile cell',
 'CL:0000775': 'hematopoietic cell',
 'CL:0000623': 'leukocyte',
 'CL:0002129': 'contractile cell',
 'CL:0000115': 'barrier cell',
 'CL:0002063': 'secretory cell',
 'CL:0000057': 'connective tissue cell',
 'CL:0000540': 'electrically active cell',
 'CL:0002068': 'contractile cell',
 'CL:0002072': 'contractile cell',
 'CL:0000625': 'leukocyte',
 'CL:0000814': 'leukocyte',
 'CL:0000624': 'leukocyte',
 'CL:0002138': 'barrier cell',
 'CL:1000413': 'epithelial cell',
 'CL:0000136': 'fat cell',
 'CL:0000604': 'neuron',
 'CL:000

In [55]:
not_mapped = {}
for ctid, ctname in zip(lemur.obs["cell_ontology_id"], lemur.obs["cell_ontology_class_v1"]):
    if ctid not in coarsened_names_dict.keys():
        not_mapped[ctname] = ctid
set(not_mapped.keys())

{'cardiomyocyte',
 'cell',
 'gonadotroph',
 'intestinal tuft cell',
 'lactotroph',
 'mesothelial cell of epicardium',
 'pancreatic B cell',
 'podocyte',
 'unassigned'}

In [56]:
coarsened_names_dict = {**coarsened_names_dict, 
                        **{  not_mapped['cardiomyocyte']: 'contractile cell',
                             not_mapped['gonadotroph']: 'secretory cell',
                             not_mapped['intestinal tuft cell']: 'epithelial cell',
                             not_mapped['lactotroph']: 'secretory cell',
                             not_mapped['mesothelial cell of epicardium']: '',
                             not_mapped['pancreatic B cell']: 'secretory cell',
                             not_mapped['podocyte']: 'secretory cell'}}
                           
to_remove = ["cell", "unassigned"]

In [57]:
coarsened_ids = [coarsened_ids_dict.get(ctid, ctid) for ctid in lemur.obs["cell_ontology_id"]]
coarsened_names = [coarsened_names_dict.get(ctid, ctname) for ctid, ctname in zip(lemur.obs["cell_ontology_id"], lemur.obs["cell_ontology_class_v1"])]

In [58]:
lemur.obs["coarse_cell_id"] = coarsened_ids
lemur.obs["coarse_cell_type"] = coarsened_names

In [59]:
if ten_x_subset:
    lemur = lemur[lemur.obs["method"] == "10x"]
lemur

View of AnnData object with n_obs × n_vars = 231752 × 31509
    obs: 'nCount_RNA', 'nFeature_RNA', 'cell_name', 'cell_barcode_10x', 'sequencing_run_10x', 'channel_10x', 'possibly_contaminated_barcode_10x', 'method', 'individual', 'age', 'sex', 'tissue', 'tissue_system', 'tissue_order', 'subtissue', 'compartment_v1', 'cell_ontology_class_v1', 'free_annotation_v1', 'tissue__cell_ontology_class_v1', 'tissue__free_annotation_v1', 'mix_hybrid', 'low_quality', 'dendrogram_annotation_number', 'dendrogram_annotation_order', 'order__compartment_freeannotation_tissue', 'order__tissue_compartment_freeannotation', 'Mimu_168', 'Mimu_W03', 'Mimu_W04', 'Mimu_180ps', 'Mimu_191', 'Mimu_202', 'Mimu_208', 'Mimu_218', 'Mimu_229ps', 'Mimu_239ps', 'Mimu_249', 'Mimu_DMA', 'Mimu_DMB', 'Mimu_DPA', 'Mimu_DPB', 'Mimu_DQA', 'Mimu_DQB', 'Mimu_DRA', 'Mimu_DRB', 'MHC_C_I', 'MHC_NC_I', 'MHC_all_II', 'nMimu_168', 'nMimu_W03', 'nMimu_W04', 'nMimu_180ps', 'nMimu_191', 'nMimu_202', 'nMimu_208', 'nMimu_218', 'nMimu_229ps'

In [60]:
lemur.obs["coarse_cell_type"].unique()

array(['germ line cell', 'phagocyte', 'epithelial cell', 'leukocyte',
       'connective tissue cell', 'barrier cell', 'contractile cell',
       'fat cell', 'hematopoietic cell', 'secretory cell', 'neuron',
       'electrically active cell', 'neural cell', 'ciliated cell',
       'kidney cell', 'stem cell', 'cell of skeletal muscle',
       'M cell of gut', 'hair follicle cell', 'pigment cell',
       'preosteoblast', 'osteoblast'], dtype=object)

In [61]:
lemur

View of AnnData object with n_obs × n_vars = 231752 × 31509
    obs: 'nCount_RNA', 'nFeature_RNA', 'cell_name', 'cell_barcode_10x', 'sequencing_run_10x', 'channel_10x', 'possibly_contaminated_barcode_10x', 'method', 'individual', 'age', 'sex', 'tissue', 'tissue_system', 'tissue_order', 'subtissue', 'compartment_v1', 'cell_ontology_class_v1', 'free_annotation_v1', 'tissue__cell_ontology_class_v1', 'tissue__free_annotation_v1', 'mix_hybrid', 'low_quality', 'dendrogram_annotation_number', 'dendrogram_annotation_order', 'order__compartment_freeannotation_tissue', 'order__tissue_compartment_freeannotation', 'Mimu_168', 'Mimu_W03', 'Mimu_W04', 'Mimu_180ps', 'Mimu_191', 'Mimu_202', 'Mimu_208', 'Mimu_218', 'Mimu_229ps', 'Mimu_239ps', 'Mimu_249', 'Mimu_DMA', 'Mimu_DMB', 'Mimu_DPA', 'Mimu_DPB', 'Mimu_DQA', 'Mimu_DQB', 'Mimu_DRA', 'Mimu_DRB', 'MHC_C_I', 'MHC_NC_I', 'MHC_all_II', 'nMimu_168', 'nMimu_W03', 'nMimu_W04', 'nMimu_180ps', 'nMimu_191', 'nMimu_202', 'nMimu_208', 'nMimu_218', 'nMimu_229ps'

In [62]:
sc.pp.filter_genes(lemur, min_counts=500)
sc.pp.filter_cells(lemur, min_counts=1000)

  adata.var['n_counts'] = number


In [63]:
lemur.obs["tissue_type"] = [lemur_tissue_map[t] for t in lemur.obs["tissue"]]

if tissue_subset:
    lemur = lemur[lemur.obs["tissue_type"].isin(keep_tissues)]

In [64]:
lemur

View of AnnData object with n_obs × n_vars = 109498 × 19691
    obs: 'nCount_RNA', 'nFeature_RNA', 'cell_name', 'cell_barcode_10x', 'sequencing_run_10x', 'channel_10x', 'possibly_contaminated_barcode_10x', 'method', 'individual', 'age', 'sex', 'tissue', 'tissue_system', 'tissue_order', 'subtissue', 'compartment_v1', 'cell_ontology_class_v1', 'free_annotation_v1', 'tissue__cell_ontology_class_v1', 'tissue__free_annotation_v1', 'mix_hybrid', 'low_quality', 'dendrogram_annotation_number', 'dendrogram_annotation_order', 'order__compartment_freeannotation_tissue', 'order__tissue_compartment_freeannotation', 'Mimu_168', 'Mimu_W03', 'Mimu_W04', 'Mimu_180ps', 'Mimu_191', 'Mimu_202', 'Mimu_208', 'Mimu_218', 'Mimu_229ps', 'Mimu_239ps', 'Mimu_249', 'Mimu_DMA', 'Mimu_DMB', 'Mimu_DPA', 'Mimu_DPB', 'Mimu_DQA', 'Mimu_DQB', 'Mimu_DRA', 'Mimu_DRB', 'MHC_C_I', 'MHC_NC_I', 'MHC_all_II', 'nMimu_168', 'nMimu_W03', 'nMimu_W04', 'nMimu_180ps', 'nMimu_191', 'nMimu_202', 'nMimu_208', 'nMimu_218', 'nMimu_229ps'

In [65]:
# filter to large cell types
if cell_type_number_filter is not 0:
    lemur_keep_cell_types = lemur.obs["cell_ontology_class_v1"].value_counts()[np.where(lemur.obs["cell_ontology_class_v1"].value_counts() > cell_type_number_filter)[0]].index
    lemur = lemur[lemur.obs["cell_ontology_class_v1"].isin(lemur_keep_cell_types)]

    print(len(lemur_keep_cell_types))

  if cell_type_number_filter is not 0:


45


In [66]:
40

40

In [67]:
lemur.obs[["cell_ontology_class_v1", "coarse_cell_type", "tissue_type"]]

Unnamed: 0,cell_ontology_class_v1,coarse_cell_type,tissue_type
L2_Pancreas_10X_AAACCTGAGAGACGAA,pancreatic acinar cell,epithelial cell,pancreas
L2_Pancreas_10X_AAACCTGAGGATATAC,macrophage,phagocyte,pancreas
L2_Pancreas_10X_AAACCTGCAAAGTGCG,fat cell,fat cell,pancreas
L2_Pancreas_10X_AAACCTGCAGTATGCT,neutrophil,hematopoietic cell,pancreas
L2_Pancreas_10X_AAACCTGCATGGTCAT,neutrophil,hematopoietic cell,pancreas
...,...,...,...
L4_Spleen_10X_TTTGTCAGTACCGGCT,B cell,leukocyte,spleen
L4_Spleen_10X_TTTGTCAGTCAGAAGC,neutrophil,hematopoietic cell,spleen
L4_Spleen_10X_TTTGTCAGTTGAGGTG,unassigned,secretory cell,spleen
L4_Spleen_10X_TTTGTCATCGAATGCT,B cell,leukocyte,spleen


In [68]:
from copy import deepcopy
lemur_subset = lemur
lemur_subset.obs = lemur.obs[["cell_ontology_class_v1", "coarse_cell_type", "tissue_type"]]
lemur_subset.obs["cell_type"] = lemur.obs["cell_ontology_class_v1"]
to_remove = ["cell", "unassigned"]
lemur_subset = lemur_subset[~lemur_subset.obs["cell_ontology_class_v1"].isin(to_remove)]

In [69]:
lemur_subset

View of AnnData object with n_obs × n_vars = 99039 × 19691
    obs: 'cell_ontology_class_v1', 'coarse_cell_type', 'tissue_type', 'cell_type'
    var: 'name', 'highly_variable', 'n_counts'
    uns: 'compartment_update_colors'
    obsm: 'X_pca', 'X_umap'
    layers: 'raw_counts'

In [70]:
lemur_susbet_raw = sc.AnnData(lemur_subset.layers["raw_counts"].toarray())

In [71]:
lemur_susbet_raw.obs = lemur_subset.obs

In [72]:
lemur_susbet = lemur_susbet_raw

In [73]:
len(lemur_subset.var_names)

19691

In [74]:
del lemur_subset.raw # annoying subset rule for anndatas, to work with SAMap Comparison

In [75]:
# lemur_subset.write("/dfs/project/cross-species/yanay/data/tabula/finished/mouse_lemur.h5ad") 500
#lemur_subset.write("/dfs/project/cross-species/yanay/data/tabula/finished/mouse_lemur_all.h5ad") # 250
lemur_subset.write(f"/dfs/project/cross-species/yanay/data/tabula/finished/mouse_lemur_ct{cell_type_number_filter}_tissue{tissue_subset}_10x{ten_x_subset}.h5ad")

In [None]:
lemur_bladder_subset = lemur_subset[lemur_subset.obs["tissue_type"] == "bladder"]
lemur_bladder_subset

In [None]:
lemur_bladder_subset.write("/dfs/project/cross-species/yanay/data/tabula/bladder/mouse_lemur.h5ad")

In [None]:
lemur_bladder_subset.obs["cell_type"].unique()

In [None]:
lemur_bladder_subset.obs["cell_type"].value_counts()

In [None]:
[print(a) for a in lemur_bladder_subset.obs["cell_type"].unique()];

In [None]:
lemur_bladder_subset

In [None]:
lemur_brain_subset = lemur_subset[lemur_subset.obs["tissue"].isin(["Brain_cortex", "Brainstem"])]
lemur_brain_subset = lemur_brain_subset[lemur_brain_subset.obs["cell_type"] != "unassigned"]
lemur_brain_subset

In [None]:
lemur_brain_subset.write("/dfs/project/cross-species/yanay/data/tabula/brain/mouse_lemur.h5ad")

In [None]:
lemur_brain_subset.obs["cell_type"].unique()

In [None]:
lemur_brain_subset.obs["cell_type"].value_counts()

In [None]:
[print(a) for a in lemur_brain_subset.obs["cell_type"].unique()];

In [None]:
lemur_brain_subset

In [78]:
lemur = lemur_subset = None

# Tabule Muris (outdated)

```
wget https://figshare.com/ndownloader/articles/5968960/versions/3
unzip 3
unzip droplet.zip

```

In [76]:
mouse_tissue_files = glob("/dfs/project/cross-species/yanay/data/tabula/muris/droplet/*")
mouse_tissue_files

['/dfs/project/cross-species/yanay/data/tabula/muris/droplet/Kidney-10X_P7_5',
 '/dfs/project/cross-species/yanay/data/tabula/muris/droplet/Kidney-10X_P4_6',
 '/dfs/project/cross-species/yanay/data/tabula/muris/droplet/Lung-10X_P7_9',
 '/dfs/project/cross-species/yanay/data/tabula/muris/droplet/Marrow-10X_P7_3',
 '/dfs/project/cross-species/yanay/data/tabula/muris/droplet/Mammary_Gland-10X_P7_13',
 '/dfs/project/cross-species/yanay/data/tabula/muris/droplet/Spleen-10X_P4_7',
 '/dfs/project/cross-species/yanay/data/tabula/muris/droplet/Tongue-10X_P7_10',
 '/dfs/project/cross-species/yanay/data/tabula/muris/droplet/Thymus-10X_P7_11',
 '/dfs/project/cross-species/yanay/data/tabula/muris/droplet/Kidney-10X_P4_5',
 '/dfs/project/cross-species/yanay/data/tabula/muris/droplet/Trachea-10X_P8_14',
 '/dfs/project/cross-species/yanay/data/tabula/muris/droplet/Bladder-10X_P4_4',
 '/dfs/project/cross-species/yanay/data/tabula/muris/droplet/Lung-10X_P8_12',
 '/dfs/project/cross-species/yanay/data/ta

In [77]:
mouse_all_ads = []
tissue_names = []
for tissue_file in mouse_tissue_files:
    t_ad = sc.read_10x_mtx(tissue_file)
    mouse_all_ads.append(t_ad)
    tissue_name = tissue_file.split("/")[-1]
    tissue_names.append(tissue_name)

In [78]:
mouse_all_tissues = sc.concat(mouse_all_ads, label="tissue", keys=tissue_names)

  utils.warn_names_duplicates("obs")


In [79]:
sc.pp.filter_genes(mouse_all_tissues, min_counts=500)
sc.pp.filter_cells(mouse_all_tissues, min_counts=1000)

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


In [80]:
barcodes = pd.Series(mouse_all_tissues.obs_names).str.split("-", expand=True)[0]

In [81]:
tissue_ids = mouse_all_tissues.obs["tissue"].str.split("-", expand=True)[1]
new_obs_names = tissue_ids.reset_index()[1].str.cat(barcodes, sep="_")
mouse_all_tissues.obs_names = new_obs_names

In [82]:
mouse_annot = pd.read_csv("/dfs/project/cross-species/yanay/data/tabula/muris/annotations_droplet.csv").set_index("cell")
display(mouse_annot)

  mouse_annot = pd.read_csv("/dfs/project/cross-species/yanay/data/tabula/muris/annotations_droplet.csv").set_index("cell")


Unnamed: 0_level_0,cell_ontology_class,cell_ontology_id,channel,cluster.ids,free_annotation,mouse.id,mouse.sex,subsetA,subsetA_cluster.ids,subsetB,subsetB_cluster.ids,subsetC,subsetC_cluster.ids,subsetD,subsetD_cluster.ids,subtissue,tissue,tissue_tSNE_1,tissue_tSNE_2
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
10X_P7_8_AAACGGGAGGATATAC,myeloid cell,CL:0000763,10X_P7_8,20,dendritic cells and interstital macrophages,3-F-56,F,,,,,,,,,,Lung,17.024721,-32.902836
10X_P7_8_AAACGGGTCTCGTATT,alveolar macrophage,CL:0000583,10X_P7_8,5,,3-F-56,F,,,,,,,,,,Lung,25.160619,25.066566
10X_P7_8_AAAGATGCAGATCTGT,B cell,CL:0000236,10X_P7_8,12,,3-F-56,F,,,,,,,,,,Lung,1.740567,46.488878
10X_P7_8_AAATGCCAGATAGTCA,natural killer cell,CL:0000623,10X_P7_8,7,,3-F-56,F,,,,,,,,,,Lung,-31.647934,-2.208061
10X_P7_8_AAATGCCCAAACTGCT,T cell,CL:0000084,10X_P7_8,21,,3-F-56,F,,,,,,,,,,Lung,-37.281266,-5.619565
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10X_P7_15_TTTGTCAAGCCAGAAC,endothelial cell,CL:0000115,10X_P7_15,4,,3-F-57,F,,,,,,,,,,Limb_Muscle,21.778547,-15.239181
10X_P7_15_TTTGTCACAGCCTTGG,endothelial cell,CL:0000115,10X_P7_15,10,,3-F-57,F,,,,,,,,,,Limb_Muscle,37.977851,-10.079247
10X_P7_15_TTTGTCAGTAAGGGCT,mesenchymal stem cell,CL:0000134,10X_P7_15,9,,3-F-57,F,,,,,,,,,,Limb_Muscle,-27.254255,-10.505882
10X_P7_15_TTTGTCAGTCTCCACT,mesenchymal stem cell,CL:0000134,10X_P7_15,8,,3-F-57,F,,,,,,,,,,Limb_Muscle,-15.538574,-4.647427


In [83]:
keep_barcodes = set(np.unique(mouse_annot.index)).intersection(set(mouse_all_tissues.obs_names))
len(keep_barcodes)

55652

In [84]:
mouse = mouse_all_tissues[list(keep_barcodes), :]
mouse.obs["cell_type"] = mouse_annot["cell_ontology_class"]
mouse.obs["cell_ontology_id"] = mouse_annot["cell_ontology_id"]

  mouse.obs["cell_type"] = mouse_annot["cell_ontology_class"]


In [85]:
sc.pp.filter_genes(mouse, min_counts=500)
sc.pp.filter_cells(mouse, min_counts=1000)

In [86]:
mouse.obs["tissue_type"] = [mouse_tissue_map[t.split("-")[0]] for t in mouse.obs["tissue"]]
if tissue_subset:
    mouse = mouse[mouse.obs["tissue_type"].isin(keep_tissues)]

In [87]:
mouse.obs

Unnamed: 0,tissue,n_counts,cell_type,cell_ontology_id,tissue_type
10X_P4_5_TCGAGGCGTACATGTC,Kidney-10X_P4_5,16026.0,kidney proximal straight tubule epithelial cell,CL:1000839,kidney
10X_P8_14_TACGGGCGTACGAAAT,Trachea-10X_P8_14,2732.0,mesenchymal cell,CL:0008019,trachea
10X_P7_14_TGGCTGGCATGTAAGA,Limb_Muscle-10X_P7_14,3343.0,B cell,CL:0000236,muscle
10X_P7_6_CACATAGAGAAACCAT,Spleen-10X_P7_6,4244.0,B cell,CL:0000236,spleen
10X_P8_15_GTATTCTGTTTCCACC,Trachea-10X_P8_15,2974.0,mesenchymal cell,CL:0008019,trachea
...,...,...,...,...,...
10X_P7_6_GCAGCCAAGGTTACCT,Spleen-10X_P7_6,2051.0,B cell,CL:0000236,spleen
10X_P8_14_GACGCGTCACGAGAGT,Trachea-10X_P8_14,3208.0,endothelial cell,CL:0000115,trachea
10X_P8_14_ACGGGCTGTCAGCTAT,Trachea-10X_P8_14,2991.0,endothelial cell,CL:0000115,trachea
10X_P7_2_GGTGAAGGTGCATCTA,Marrow-10X_P7_2,7201.0,erythroblast,CL:0000765,bone_marrow


In [88]:
# filter to large cell types
if cell_type_number_filter is not 0:
    mouse_keep_cell_types = mouse.obs["cell_type"].value_counts()[np.where(mouse.obs["cell_type"].value_counts() > cell_type_number_filter)[0]].index
    mouse = mouse[mouse.obs["cell_type"].isin(mouse_keep_cell_types)]
    print(len(mouse_keep_cell_types))

30


  if cell_type_number_filter is not 0:


In [89]:
29

29

In [90]:
mouse

View of AnnData object with n_obs × n_vars = 39420 × 13921
    obs: 'tissue', 'n_counts', 'cell_type', 'cell_ontology_id', 'tissue_type'
    var: 'n_counts'

In [91]:
coarsened_ids_dict, coarsened_names_dict = coarsen_labels(mouse.obs["cell_ontology_id"], max_level=1)

In [92]:
np.unique(list(coarsened_names_dict.values()))

array(['barrier cell', 'cell of skeletal muscle',
       'ciliated epithelial cell', 'connective tissue cell',
       'epithelial cell', 'hematopoietic cell',
       'hematopoietic precursor cell', 'leukocyte',
       'lower urinary tract cell', 'lymphocyte',
       'lymphocyte of B lineage', 'mesenchymal cell',
       'nongranular leukocyte', 'phagocyte', 'secretory cell',
       'stem cell'], dtype='<U28')

In [93]:
coarsened_ids = [coarsened_ids_dict.get(ctid, ctid) for ctid in mouse.obs["cell_ontology_id"]]
coarsened_names = [coarsened_names_dict.get(ctid, ctname) for ctid, ctname in zip(mouse.obs["cell_ontology_id"], mouse.obs["cell_type"])]

In [94]:
mouse.obs["coarse_cell_id"] = coarsened_ids
mouse.obs["coarse_cell_type"] = coarsened_names

  mouse.obs["coarse_cell_id"] = coarsened_ids


In [95]:
mouse.write(f"/dfs/project/cross-species/yanay/data/tabula/finished/muris_ct{cell_type_number_filter}_tissue{tissue_subset}.h5ad")

In [96]:
mouse = None

# Tabula Muris Senis

In [None]:
mouse = sc.read("/dfs/project/cross-species/yanay/data/tabula/muris_senis/muris_senis_droplet_unproc.h5ad")

In [None]:
mouse_ontology_names = mouse.obs["cell_ontology_class"].unique()
len(mouse_ontology_names)

In [None]:
mouse_ontology_names_to_id = {}
for mon in mouse_ontology_names:
    r = obo_tbl[obo_tbl["name"] == mon]
    if r.shape[0] != 0:
        mouse_ontology_names_to_id[mon] = obo_tbl[obo_tbl["name"] == mon].index[0]

In [None]:
len(mouse_ontology_names_to_id)

In [None]:
mouse_ct_ids = [mouse_ontology_names_to_id.get(ctname, "na") for ctname in mouse.obs["cell_ontology_class"]]
mouse.obs["cell_ontology_id"] = mouse_ct_ids

In [None]:
coarsened_ids_dict, coarsened_names_dict = coarsen_labels(mouse.obs["cell_ontology_id"], max_level=2)

In [None]:
list(zip(coarsened_ids_dict.values(), coarsened_names_dict.values()))

In [None]:
np.unique(list(coarsened_names_dict.values())), len(np.unique(list(coarsened_names_dict.values())))

In [None]:
[print(a) for a in np.unique(list(coarsened_names_dict.values()))];

In [None]:
not_mapped = {}
for ctid, ctname in zip(mouse.obs["cell_ontology_id"], mouse.obs["cell_ontology_class"]):
    if ctid not in coarsened_names_dict.keys():
        not_mapped[ctname] = ctid
set(not_mapped.keys())

In [None]:
coarsened_names_dict = {**coarsened_names_dict, 
                        **{  not_mapped['cardiomyocyte']: 'muscle cell',
                             not_mapped['NK cell']: 'T cell',
                             not_mapped['erythroid progenitor']: 'hematopoietic cell',
                             not_mapped['kidney mesangial cell']: 'contractile cell',
                             not_mapped['podocyte']: 'epithelial cell',
                             not_mapped['club cell of bronchiole']: 'epithelial cell',
                             not_mapped['skeletal muscle cell']: 'cell of skeletal muscle',
                             not_mapped['pancreatic B cell']: 'secretory cell',
                             not_mapped['pancreatic ductal cel']: 'epithelial cell',
                             not_mapped['immature NKT cell']: 'T cell',
                             not_mapped['double negative T cell']: 'T cell',
                             not_mapped['mesenchymal progenitor cell']: 'stem cell'}}

In [None]:
coarsened_ids = [coarsened_ids_dict.get(ctid, ctid) for ctid in mouse.obs["cell_ontology_id"]]
coarsened_names = [coarsened_names_dict.get(ctid, ctname) for ctid, ctname in zip(mouse.obs["cell_ontology_id"], mouse.obs["cell_ontology_class"])]

In [None]:
mouse.obs["coarse_cell_id"] = coarsened_ids
mouse.obs["coarse_cell_type"] = coarsened_names

In [None]:
len(np.unique(coarsened_names))

In [None]:
mouse

In [None]:
mouse.obs["tissue_type"] = [mouse_tissue_map[t] for t in mouse.obs["tissue"]]
#mouse = mouse[mouse.obs["tissue_type"].isin(keep_tissues)]

In [None]:
mouse

In [None]:
# filter to large cell types
mouse_keep_cell_types = mouse.obs["cell_ontology_class"].value_counts()[np.where(mouse.obs["cell_ontology_class"].value_counts() > 0)[0]].index
#mouse = mouse[mouse.obs["cell_ontology_class"].isin(mouse_keep_cell_types)]

In [None]:
mouse

In [None]:
from copy import deepcopy
mouse_subset = mouse
mouse_subset.obs = mouse_subset.obs[["coarse_cell_type", "cell_ontology_class", "tissue_type"]]
mouse_subset.obs["cell_type"] = mouse.obs["cell_ontology_class"]

In [None]:
mouse_subset.write("/dfs/project/cross-species/yanay/data/tabula/finished/muris_senis_all.h5ad")

In [None]:
mouse_subset.obs["tissue_type"].unique()

In [None]:
mouse_bladder_subset = mouse_subset[mouse_subset.obs["tissue_type"] == "bladder"]
#sc.pp.filter_cells(mouse_bladder_subset, min_counts=500)
mouse_bladder_subset

In [None]:
mouse_bladder_subset.write("/dfs/project/cross-species/yanay/data/tabula/bladder/mouse.h5ad")

In [None]:
[print(a) for a in mouse_bladder_subset.obs["cell_type"].unique()];

In [None]:
mouse_bladder_subset

In [None]:
[print(a) for a in np.unique(mouse_subset.obs["tissue_type"])]

In [None]:
mouse = mouse_subset = None

# Fly (Drosophilia) Data

In [None]:
import loompy

In [None]:
# https://github.com/scverse/anndata/issues/627
with loompy.connect("/dfs/project/cross-species/yanay/data/tabula/drosophilia/r_fca_biohub_all_wo_blood_10x.loom", validate=False) as ds:
    print(ds.shape)
    print(ds.ra.keys())
    print(ds.ca.keys())
    genes = ds.ra["Gene"]
    cells = ds.ca["CellID"]
    gene_names = ds.ra["Gene"]
    
    # extras for obs
    tissue = ds.ca["tissue"]
    annotation = ds.ca["annotation"]
    annotation__ontology_id = ds.ca["annotation__ontology_id"]
    annotation_broad = ds.ca["annotation_broad"]
    annotation_broad__ontology_id = ds.ca["annotation_broad__ontology_id"]
    
    
    df = pd.DataFrame(ds[:, :], index=genes, columns=cells).T

fly = sc.AnnData(df)
fly.obs_names = df.index
fly.var_names = df.columns

fly.obs["tissue"] = tissue
fly.obs["cell_type"] = annotation
#fly.obs["annotation__ontology_id"] = annotation__ontology_id
fly.obs["coarse_cell_type"] = annotation_broad
#fly.obs["annotation_broad__ontology_id"] = annotation_broad__ontology_id

In [None]:
fly.obs

In [None]:
fly.obs["coarse_cell_type"].unique()

In [None]:
fly.obs["cell_type"].unique()

In [None]:
fly.X

In [None]:
fly_subset = fly[fly.obs["coarse_cell_type"] != "unannotated"]
fly_subset = fly_subset[fly_subset.obs["coarse_cell_type"] != "artefact"]

In [None]:
fly_subset.write("/dfs/project/cross-species/yanay/data/tabula/finished/fly.h5ad")

In [None]:
fly_subset.obs["coarse_cell_type"].value_counts()

In [None]:
fly_subset.obs["coarse_cell_type"].value_counts()

In [None]:
[print(a) for a in sorted(fly_subset.obs["coarse_cell_type"].unique())];

In [None]:
[print(a) for a in sorted(fly_subset.obs["tissue"].unique())];

In [None]:
fly_head_subset = fly_subset[fly_subset.obs["tissue"] == "head"]
fly_head_subset

In [None]:
fly_head_subset.obs["coarse_cell_type"].unique()

In [None]:
fly_head_subset.write("/dfs/project/cross-species/yanay/data/tabula/brain/fly.h5ad")

### Redo with tissue type

In [None]:
fly_subset = sc.read("/dfs/project/cross-species/yanay/data/tabula/finished/fly.h5ad")

In [None]:
[print(f'"{a}":"",') for a in fly_subset.obs["tissue"].unique()];

In [None]:
np.unique(list(human_tissue_map.values()) + list(lemur_tissue_map.values()) + list(mouse_tissue_map.values()))

In [None]:
fly_tissue_type_map = {
    "oenocyte":"endocrine",
    "fat_body":"fat",
    "haltere":"muscle",
    "proboscis_and_maxpalp":"tounge",
    "antenna":"brain",
    "trachea":"trachea",
    "testis":"testes",
    "ovary":"uterus",
    "gut":"intestine",
    "malpighian_tubule":"kidney",
    "body_wall":"skin",
    "heart":"heart_and_aorta",
    "male_reproductive_glands":"testes",
    "leg":"muscle",
    "wing":"muscle",
    "head":"brain",
    "body":"muscle",
}

In [None]:
fly_subset.obs["tissue_type"] = [fly_tissue_type_map[a] for a in fly_subset.obs["tissue"]]

In [None]:
fly_subset.write("/dfs/project/cross-species/yanay/data/tabula/finished/fly.h5ad")

In [None]:
fly_subset[fly_subset.obs["tissue_type"] == "heart_and_aorta"].obs["cell_type"].value_counts()

# Determining tissue Subset

In [None]:
[print('"' + a + '": "",') for a in human.obs["tissue_in_publication"].unique()];

In [None]:
[print('"' + a + '": "",') for a in mouse.obs["tissue"].unique()];

In [None]:
[print('"' + a + '": "",') for a in lemur.obs["tissue"].unique()];