In [1]:
import pathlib

import pandas as pd
import xarray as xr
from tqdm import tqdm

In [2]:
# Locate every cut-site zarr store of the two datasets.
# Path.glob yields entries in arbitrary (filesystem-dependent) order, so each
# listing is sorted to make the processing order -- and therefore the row order
# of the table built from these stores -- reproducible across runs and machines.
zhang_paths = sorted(pathlib.Path("/wmb/zarr/Zhang2021Cell.CutSites/").glob("*.zarr"))
domcke_paths = sorted(pathlib.Path("/wmb/zarr/Domcke2020Science.CutSites/").glob("*.zarr"))
# Zhang stores first, then Domcke stores.
zarr_paths = zhang_paths + domcke_paths

In [3]:
# Hand-written lookup from a metadata `tissue` label (key) to the hyphenated
# sample name used for the corresponding zarr store (value).
# The values are later normalised by replacing "-" with "_" before being
# compared with the sample names parsed from the stores' barcodes.
tissue_sample_to_zarr_cate = {
    "Human_brain_1": "UMB4540-snATAC-frontal-cortex-rep1",
    "Human_brain_2": "UMB4540-snATAC-frontal-cortex-rep2",
    "LungMap_D122": "LungMap-D122-rep1",
    "LungMap_D175": "LungMap-D175-rep1",
    "LungMap_D231": "LungMap-D231-rep1",
    "adipose_omentum_SM-ADYHB": "adipose-omentum-SM-ADYHB-rep1",
    "adipose_omentum_SM-CHZRM": "adipose-omentum-SM-CHZRM-rep1",
    "adipose_omentum_SM-CSSD4": "adipose-omentum-SM-CSSD4-rep1",
    "adipose_omentum_SM-IOBHJ": "adipose-omentum-SM-IOBHJ-rep1",
    "adrenal_gland_SM-A8WNO": "adrenal-gland-SM-A8WNO-rep1",
    "artery_aorta_SM-C1MLC": "artery-aorta-SM-C1MLC-rep1",
    "artery_aorta_SM-C1PX3": "artery-aorta-SM-C1PX3-rep1",
    "artery_aorta_SM-CR89M": "artery-aorta-SM-CR89M-rep1",
    "artery_aorta_SM-JF1NU": "artery-aorta-SM-JF1NU-rep1",
    "artery_tibial_SM-CHLWW": "artery-tibial-SM-CHLWW-rep1",
    "artery_tibial_SM-IOBHK": "artery-tibial-SM-IOBHK-rep1",
    "colon_sigmoid_SM-AZPYO": "colon-sigmoid-SM-AZPYO-rep1",
    "colon_sigmoid_SM-JF1O8": "colon-sigmoid-SM-JF1O8-rep1",
    "colon_transverse_SM-A9HOW": "colon-transverse-SM-A9HOW-rep1",
    "colon_transverse_SM-A9VP4": "colon-transverse-SM-A9VP4-rep1",
    "colon_transverse_SM-ACCQ1": "colon-transverse-SM-ACCQ1-rep1",
    "colon_transverse_SM-BZ2ZS": "colon-transverse-SM-BZ2ZS-rep1",
    "colon_transverse_SM-CSSDA": "colon-transverse-SM-CSSDA-rep1",
    "esophagus_ge_junction_SM-CTD24": "esophagus-ge-junction-SM-CTD24-rep1",
    "esophagus_ge_junction_SM-IOERG": "esophagus-ge-junction-SM-IOERG-rep1",
    "esophagus_mucosa_SM-A9HOR": "esophagus-mucosa-SM-A9HOR-rep1",
    "esophagus_mucosa_SM-A9VPA": "esophagus-mucosa-SM-A9VPA-rep1",
    "esophagus_mucosa_SM-AZPYJ": "esophagus-mucosa-SM-AZPYJ-rep1",
    "esophagus_muscularis_SM-A8CPH": "esophagus-muscularis-SM-A8CPH-rep1",
    "esophagus_muscularis_SM-CSSCV": "esophagus-muscularis-SM-CSSCV-rep1",
    "esophagus_muscularis_SM-IOBHM": "esophagus-muscularis-SM-IOBHM-rep1",
    "esophagus_muscularis_SM-IQYD1": "esophagus-muscularis-SM-IQYD1-rep1",
    "heart_atrial_appendage_SM-IOBHN": "heart-atrial-appendage-SM-IOBHN-rep1",
    "heart_atrial_appendage_SM-JF1NX": "heart-atrial-appendage-SM-JF1NX-rep1",
    "heart_la_CARE181125_3C": "CARE181125-3C-rep1",
    "heart_la_CARE190307_10C": "CARE190307-10C-rep1",
    "heart_la_CARE191122_2C": "CARE191122-2C-rep1",
    "heart_lv_CARE181125_3D": "CARE181125-3D-rep1",
    "heart_lv_CARE190307_10D": "CARE190307-10D-rep1",
    "heart_lv_CARE190331_11D": "CARE190331-11D-rep1",
    "heart_lv_CARE191122_2D": "CARE191122-2D-rep1",
    "heart_lv_CARE191122_3D": "CARE191122-3D-rep1",
    "heart_lv_SM-IOBHO": "heart-lv-SM-IOBHO-rep1",
    "heart_lv_SM-JF1NY": "heart-lv-SM-JF1NY-rep1",
    "heart_ra_CARE181213_2A": "CARE181213-2A-rep1",
    "heart_ra_CARE190307_10A": "CARE190307-10A-rep1",
    "heart_rv_CARE181125_3B": "CARE181125-3B-rep1",
    "heart_rv_CARE181213_2B": "CARE181213-2B-rep1",
    "heart_rv_CARE190307_10B": "CARE190307-10B-rep1",
    "heart_rv_CARE190331_11B": "CARE190331-11B-rep1",
    "islet_CB1": "islet-CB1-rep1",
    "islet_CB2": "islet-CB2-rep1",
    "islet_CB3": "islet-CB3-rep1",
    "liver_SM-A8WNZ": "liver-SM-A8WNZ-rep1",
    "lung_SM-A62E9": "lung-SM-A62E9-rep1",
    "lung_SM-A8WNH": "lung-SM-A8WNH-rep1",
    "lung_SM-ACCPU": "lung-SM-ACCPU-rep1",
    "lung_SM-JF1NZ": "lung-SM-JF1NZ-rep1",
    "mammary_tissue_SM-IOBHL": "mammary-tissue-SM-IOBHL-rep1",
    "mammary_tissue_SM-JF1NV": "mammary-tissue-SM-JF1NV-rep1",
    "muscle_SM-ADA6L": "muscle-SM-ADA6L-rep1",
    "muscle_SM-C1MKW": "muscle-SM-C1MKW-rep1",
    "muscle_SM-C1PWV": "muscle-SM-C1PWV-rep1",
    "muscle_SM-IOBHP": "muscle-SM-IOBHP-rep1",
    "muscle_SM-JF1O9": "muscle-SM-JF1O9-rep1",
    "nerve_tibial_SM-CHLWU": "nerve-tibial-SM-CHLWU-rep1",
    "nerve_tibial_SM-CP2V6": "nerve-tibial-SM-CP2V6-rep1",
    "nerve_tibial_SM-IOBHQ": "nerve-tibial-SM-IOBHQ-rep1",
    "ovary_SM-IOBHR": "ovary-SM-IOBHR-rep1",
    "pancreas_SM-ADRUQ": "pancreas-SM-ADRUQ-rep1",
    "pancreas_SM-IOBHS": "pancreas-SM-IOBHS-rep1",
    "pancreas_SM-JF1NS": "pancreas-SM-JF1NS-rep1",
    "pancreas_SM-JF1O6": "pancreas-SM-JF1O6-rep1",
    "skin_SM-IOBHT": "skin-SM-IOBHT-rep1",
    "skin_SM-JF1O1": "skin-SM-JF1O1-rep1",
    "skin_sun_exposed_SM-ADYHK": "skin-sun-exposed-SM-ADYHK-rep1",
    "skin_sun_exposed_SM-IOBHU": "skin-sun-exposed-SM-IOBHU-rep1",
    "skin_sun_exposed_SM-IQYCP": "skin-sun-exposed-SM-IQYCP-rep1",
    "skin_sun_exposed_SM-JF1NT": "skin-sun-exposed-SM-JF1NT-rep1",
    "small_intestine_SM-A62GO": "small-intestine-SM-A62GO-rep1",
    "small_intestine_SM-ADA5F": "small-intestine-SM-ADA5F-rep1",
    "small_intestine_SM-JF1O2": "small-intestine-SM-JF1O2-rep1",
    "stomach_SM-CHLWL": "stomach-SM-CHLWL-rep1",
    "stomach_SM-IOBHV": "stomach-SM-IOBHV-rep1",
    "stomach_SM-JF1NP": "stomach-SM-JF1NP-rep1",
    "stomach_SM-JF1O3": "stomach-SM-JF1O3-rep1",
    "thyroid_SM-C1MKY": "thyroid-SM-C1MKY-rep1",
    "thyroid_SM-IOBHW": "thyroid-SM-IOBHW-rep1",
    "thyroid_SM-JF1O4": "thyroid-SM-JF1O4-rep1",
    "uterus_SM-A87A2": "uterus-SM-A87A2-rep1",
    "uterus_SM-IOBHX": "uterus-SM-IOBHX-rep1",
    "vagina_SM-A9HOS": "vagina-SM-A9HOS-rep1",
}

# Extend the lookup with one entry per Domcke store. The part of the store file
# name before the first "." is "<sample>_<tissue>", while the metadata labels
# the same sample "<tissue>_<sample>": move the last "_"-separated token to the front.
for domcke_store in domcke_paths:
    store_stem = domcke_store.name.split(".")[0]
    sample_part, _sep, tissue_part = store_stem.rpartition("_")
    tissue_sample_to_zarr_cate[f"{tissue_part}_{sample_part}"] = store_stem

In [4]:
# Load the per-cell annotation table and derive the two keys used to match
# its rows against the zarr stores.
meta = pd.read_csv("Cell_metadata.tsv.gz", sep="\t")

# Translate each metadata tissue label into its zarr sample name, with "-" turned into "_".
zarr_sample_names = meta["tissue"].map(tissue_sample_to_zarr_cate)
meta["zarr_tissue"] = zarr_sample_names.str.replace("-", "_")
# Every tissue label must have a translation.
assert not meta["zarr_tissue"].isna().any()

# cellID looks like "<prefix>+<barcode>"; keep the field that follows the first "+".
meta["barcode"] = [cell_id.split("+")[1] for cell_id in meta["cellID"]]

In [16]:
# Show the distinct values of the metadata `tissue` column in sorted order.
sorted(meta["tissue"].unique())

['Human_brain_1',
 'Human_brain_2',
 'LungMap_D122',
 'LungMap_D175',
 'LungMap_D231',
 'adipose_omentum_SM-ADYHB',
 'adipose_omentum_SM-CHZRM',
 'adipose_omentum_SM-CSSD4',
 'adipose_omentum_SM-IOBHJ',
 'adrenal_gland_SM-A8WNO',
 'adrenal_sample_1',
 'adrenal_sample_16',
 'adrenal_sample_27',
 'adrenal_sample_31',
 'artery_aorta_SM-C1MLC',
 'artery_aorta_SM-C1PX3',
 'artery_aorta_SM-CR89M',
 'artery_aorta_SM-JF1NU',
 'artery_tibial_SM-CHLWW',
 'artery_tibial_SM-IOBHK',
 'cerebellum_sample_58',
 'cerebrum_sample_36',
 'cerebrum_sample_6',
 'cerebrum_sample_64',
 'cerebrum_sample_66',
 'cerebrum_sample_69',
 'cerebrum_sample_71',
 'colon_sigmoid_SM-AZPYO',
 'colon_sigmoid_SM-JF1O8',
 'colon_transverse_SM-A9HOW',
 'colon_transverse_SM-A9VP4',
 'colon_transverse_SM-ACCQ1',
 'colon_transverse_SM-BZ2ZS',
 'colon_transverse_SM-CSSDA',
 'esophagus_ge_junction_SM-CTD24',
 'esophagus_ge_junction_SM-IOERG',
 'esophagus_mucosa_SM-A9HOR',
 'esophagus_mucosa_SM-A9VPA',
 'esophagus_mucosa_SM-AZPYJ',

In [17]:
# Collect, for every zarr store, the rows of its barcode map that belong to
# cells annotated in `meta`.
records = []
for zarr_path in tqdm(zarr_paths):
    # Only the barcode lookup is needed; copy it into pandas and release the
    # dataset handle right away instead of keeping every store open.
    with xr.open_zarr(zarr_path) as ds:
        barcode_map = ds["barcode_map"].to_pandas().reset_index()
    barcode_map.columns = ["barcode", "barcode_idx"]
    barcode_map["zarr_path"] = str(zarr_path)
    # Store barcodes look like "<sample>[.<suffix>]:<sequence>".
    barcode_map["zarr_tissue"] = barcode_map["barcode"].map(lambda i: i.split(":")[0].split(".")[0].replace("-", "_"))
    barcode_map["barcode_seq"] = barcode_map["barcode"].map(lambda i: i.split(":")[1])

    # Metadata rows belonging to the sample(s) contained in this store.
    store_samples = barcode_map["zarr_tissue"].unique()
    store_meta = meta.loc[meta["zarr_tissue"].isin(store_samples), ["zarr_tissue", "barcode"]]
    if store_meta.empty:
        # No annotated cell for this store: report the sample name(s).
        print(store_samples)

    # Match on the (sample, barcode) pair rather than on the barcode alone, so
    # a sequence annotated only under a different sample is not kept.
    # ":" is a safe separator: both parts were obtained by splitting on it.
    store_keys = barcode_map["zarr_tissue"].astype(str) + ":" + barcode_map["barcode_seq"].astype(str)
    meta_keys = store_meta["zarr_tissue"].astype(str) + ":" + store_meta["barcode"].astype(str)
    barcode_map = barcode_map[store_keys.isin(meta_keys)].copy()
    records.append(barcode_map)

# One long table, keyed by (sample, barcode sequence).
barcode_records = pd.concat(records)
barcode_records["barcode_idx"] = barcode_records["barcode_idx"].astype("uint32")
barcode_records["zarr_path"] = barcode_records["zarr_path"].astype("category")
barcode_records = barcode_records.set_index(["zarr_tissue", "barcode_seq"])

 34%|███▎      | 99/295 [00:54<05:35,  1.71s/it]

['sample_11_cerebrum']


 34%|███▍      | 100/295 [00:56<05:39,  1.74s/it]

['sample_11_cerebrum']


 34%|███▍      | 101/295 [00:58<05:37,  1.74s/it]

['sample_11_cerebrum']


 35%|███▍      | 102/295 [01:00<05:46,  1.80s/it]

['sample_11_cerebrum']


 35%|███▍      | 103/295 [01:02<05:56,  1.85s/it]

['sample_11_cerebrum']


 35%|███▌      | 104/295 [01:04<06:08,  1.93s/it]

['sample_11_cerebrum']


 36%|███▌      | 105/295 [01:06<06:05,  1.92s/it]

['sample_11_cerebrum']


 43%|████▎     | 128/295 [01:56<06:24,  2.30s/it]

['sample_17_cerebellum']


 44%|████▎     | 129/295 [01:59<06:50,  2.47s/it]

['sample_17_cerebellum']


 44%|████▍     | 130/295 [02:02<07:11,  2.62s/it]

['sample_17_cerebellum']


 76%|███████▋  | 225/295 [07:05<03:50,  3.29s/it]

['sample_45_cerebrum']


 77%|███████▋  | 226/295 [07:08<03:40,  3.20s/it]

['sample_45_cerebrum']


 77%|███████▋  | 227/295 [07:11<03:34,  3.15s/it]

['sample_45_cerebrum']


 77%|███████▋  | 228/295 [07:14<03:30,  3.14s/it]

['sample_45_cerebrum']


 78%|███████▊  | 229/295 [07:17<03:25,  3.11s/it]

['sample_45_cerebrum']


 78%|███████▊  | 231/295 [07:21<02:31,  2.37s/it]

['sample_45_cerebrum']


 84%|████████▎ | 247/295 [08:12<01:49,  2.28s/it]

['sample_50_bonemarrow']
['sample_51_gonad']


 84%|████████▍ | 249/295 [08:17<01:47,  2.34s/it]

['sample_51_gonad']


 85%|████████▍ | 250/295 [08:19<01:38,  2.19s/it]

['sample_52_pancreas']


 86%|████████▌ | 254/295 [08:27<01:20,  1.97s/it]

['sample_56_spleen']


 87%|████████▋ | 257/295 [08:36<01:30,  2.38s/it]

['sample_59_bonemarrow']


 90%|█████████ | 266/295 [08:56<01:02,  2.15s/it]

['sample_63_gonad']


100%|██████████| 295/295 [10:27<00:00,  2.13s/it]


In [19]:
# Inspect the combined table, indexed by (zarr_tissue, barcode_seq).
barcode_records

Unnamed: 0_level_0,Unnamed: 1_level_0,barcode,barcode_idx,zarr_path
zarr_tissue,barcode_seq,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CARE181125_3B_rep1,AACGAGAGCTAAAGCACTAGCG,CARE181125_3B_rep1:AACGAGAGCTAAAGCACTAGCG,0,/wmb/zarr/Zhang2021Cell.CutSites/CARE181125_3B...
CARE181125_3B_rep1,AACGAGAGCTAACCACCTAAAG,CARE181125_3B_rep1:AACGAGAGCTAACCACCTAAAG,6,/wmb/zarr/Zhang2021Cell.CutSites/CARE181125_3B...
CARE181125_3B_rep1,AACGAGAGCTAACGAGCTGTGA,CARE181125_3B_rep1:AACGAGAGCTAACGAGCTGTGA,8,/wmb/zarr/Zhang2021Cell.CutSites/CARE181125_3B...
CARE181125_3B_rep1,AACGAGAGCTAAGCTCAACGCA,CARE181125_3B_rep1:AACGAGAGCTAAGCTCAACGCA,13,/wmb/zarr/Zhang2021Cell.CutSites/CARE181125_3B...
CARE181125_3B_rep1,AACGAGAGCTAAGTTGTGCTAC,CARE181125_3B_rep1:AACGAGAGCTAAGTTGTGCTAC,14,/wmb/zarr/Zhang2021Cell.CutSites/CARE181125_3B...
...,...,...,...,...
sample_9_liver,TTGGTTGGTATCATCAGCCATTGGCCAGGTGAGAAGACCA,sample_9_liver.ac:TTGGTTGGTATCATCAGCCATTGGCCAG...,1078995,/wmb/zarr/Domcke2020Science.CutSites/sample_9_...
sample_9_liver,TTGGTTGGTATCTGACGAACGCCAAGGCAATGGCAAGTCA,sample_9_liver.ac:TTGGTTGGTATCTGACGAACGCCAAGGC...,1079290,/wmb/zarr/Domcke2020Science.CutSites/sample_9_...
sample_9_liver,TTGGTTGGTATGACGCGACCTATTCCTTGCTTAATTCGTA,sample_9_liver.ac:TTGGTTGGTATGACGCGACCTATTCCTT...,1079540,/wmb/zarr/Domcke2020Science.CutSites/sample_9_...
sample_9_liver,TTGGTTGGTATGCGAATCGGATGAATACCATTACCTTGCA,sample_9_liver.ac:TTGGTTGGTATGCGAATCGGATGAATAC...,1079883,/wmb/zarr/Domcke2020Science.CutSites/sample_9_...


In [22]:
# Key the metadata by (sample, barcode) -- the same pair of values that indexes
# barcode_records -- so metadata columns can be looked up through that index.
cell_key_columns = ["zarr_tissue", "barcode"]
meta = meta.set_index(cell_key_columns)

In [33]:
# Preview the first rows of the re-indexed metadata table.
meta.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,cellID,logUMI,tsse,tissue,cell type,Life stage
zarr_tissue,barcode,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
sample_1_adrenal,AACCAATAACACCGGTCTGCTTATTGCTAAGCTCCGCTTG,adrenal_sample_1_1+AACCAATAACACCGGTCTGCTTATTGC...,3.012837,20.926244,adrenal_sample_1,Fetal Adrenal Cortical Cell,Fetal
sample_1_adrenal,AACCAATAACATCCTTCAACCTTCAGTAGTAAGTAGACTA,adrenal_sample_1_1+AACCAATAACATCCTTCAACCTTCAGT...,4.724988,8.408811,adrenal_sample_1,Fetal Adrenal Cortical Cell,Fetal
sample_1_adrenal,AACCAATAACATGCTAACCTCTAGTTAAGTGCTCCGCTTG,adrenal_sample_1_1+AACCAATAACATGCTAACCTCTAGTTA...,3.889918,12.594235,adrenal_sample_1,Fetal Adrenal Cortical Cell,Fetal
sample_1_adrenal,AACCAATAACATGCTAACCTTGGCCGTCTTGCTCCGCTTG,adrenal_sample_1_1+AACCAATAACATGCTAACCTTGGCCGT...,4.210024,11.216235,adrenal_sample_1,Fetal Adrenal Cortical Cell,Fetal
sample_1_adrenal,AACCAATAACCAACGGTCTTACCAGTTCAGCGGCGCTCAA,adrenal_sample_1_1+AACCAATAACCAACGGTCTTACCAGTT...,4.496418,7.467532,adrenal_sample_1,Fetal Adrenal Cortical Cell,Fetal


In [28]:
# Copy the per-cell annotations onto barcode_records by looking up each
# (zarr_tissue, barcode_seq) index entry in the identically keyed metadata.
for meta_column in ("logUMI", "tsse", "cell type", "Life stage"):
    barcode_records[meta_column] = barcode_records.index.map(meta[meta_column])

In [34]:
# Turn the (zarr_tissue, barcode_seq) index levels back into ordinary columns
# and fall back to a default integer index.
barcode_records = barcode_records.reset_index()

In [36]:
# Preview the table with the metadata columns attached.
barcode_records.head()

Unnamed: 0,zarr_tissue,barcode_seq,barcode,barcode_idx,zarr_path,logUMI,tsse,cell type,Life stage
0,CARE181125_3B_rep1,AACGAGAGCTAAAGCACTAGCG,CARE181125_3B_rep1:AACGAGAGCTAAAGCACTAGCG,0,/wmb/zarr/Zhang2021Cell.CutSites/CARE181125_3B...,3.459091,16.240071,Ventricular Cardiomyocyte,Adult
1,CARE181125_3B_rep1,AACGAGAGCTAACCACCTAAAG,CARE181125_3B_rep1:AACGAGAGCTAACCACCTAAAG,6,/wmb/zarr/Zhang2021Cell.CutSites/CARE181125_3B...,3.256237,11.706102,Ventricular Cardiomyocyte,Adult
2,CARE181125_3B_rep1,AACGAGAGCTAACGAGCTGTGA,CARE181125_3B_rep1:AACGAGAGCTAACGAGCTGTGA,8,/wmb/zarr/Zhang2021Cell.CutSites/CARE181125_3B...,3.423737,10.909091,Cardiac Fibroblasts,Adult
3,CARE181125_3B_rep1,AACGAGAGCTAAGCTCAACGCA,CARE181125_3B_rep1:AACGAGAGCTAAGCTCAACGCA,13,/wmb/zarr/Zhang2021Cell.CutSites/CARE181125_3B...,3.623973,12.5,Cardiac Fibroblasts,Adult
4,CARE181125_3B_rep1,AACGAGAGCTAAGTTGTGCTAC,CARE181125_3B_rep1:AACGAGAGCTAAGTTGTGCTAC,14,/wmb/zarr/Zhang2021Cell.CutSites/CARE181125_3B...,3.429106,11.079545,Cardiac Fibroblasts,Adult


In [38]:
# Shrink the table: repeated labels become categoricals, the two numeric QC
# columns become half precision, and the combined "<sample>:<sequence>"
# barcode column is dropped.
# NOTE(review): float16 keeps roughly three significant digits (the displayed
# logUMI 3.459091 becomes 3.458984) -- confirm that this is acceptable downstream.
compact_dtypes = {
    "zarr_tissue": "category",
    "barcode_seq": str,
    "zarr_path": "category",
    "logUMI": "float16",
    "tsse": "float16",
    "cell type": "category",
    "Life stage": "category",
}
barcode_records = barcode_records.astype(compact_dtypes).drop(columns=["barcode"])

In [50]:
# Switch to the final column names: snake_case, with "sample" for the store
# sample name and "barcode" for the bare sequence.
final_column_names = {
    "barcode_seq": "barcode",
    "cell type": "cell_type",
    "Life stage": "life_stage",
    "zarr_tissue": "sample",
}
barcode_records = barcode_records.rename(columns=final_column_names)

In [51]:
# Hand-written lookup from sample name (the `sample` column) to a tissue label.
# NOTE(review): the LungMap samples are labelled "Lung" while every other lung
# sample is "lung", and the skin_sun_exposed samples are folded into "skin" --
# confirm both are intentional.
sample_to_tissue = {
    "CARE181125_3B_rep1": "heart_rv",
    "CARE181125_3C_rep1": "heart_la",
    # Fixed: was "heart_la". This sample is the left-ventricle one
    # (tissue_sample_to_zarr_cate lists it as "heart_lv_CARE181125_3D").
    "CARE181125_3D_rep1": "heart_lv",
    "CARE181213_2A_rep1": "heart_ra",
    "CARE181213_2B_rep1": "heart_rv",
    "CARE190307_10A_rep1": "heart_ra",
    "CARE190307_10B_rep1": "heart_rv",
    "CARE190307_10C_rep1": "heart_la",
    "CARE190307_10D_rep1": "heart_lv",
    "CARE190331_11B_rep1": "heart_rv",
    "CARE190331_11D_rep1": "heart_lv",
    "CARE191122_2C_rep1": "heart_la",
    "CARE191122_2D_rep1": "heart_lv",
    "CARE191122_3D_rep1": "heart_lv",
    "LungMap_D122_rep1": "Lung",
    "LungMap_D175_rep1": "Lung",
    "LungMap_D231_rep1": "Lung",
    "UMB4540_snATAC_frontal_cortex_rep1": "frontal_cortex",
    "UMB4540_snATAC_frontal_cortex_rep2": "frontal_cortex",
    "adipose_omentum_SM_ADYHB_rep1": "adipose_omentum",
    "adipose_omentum_SM_CHZRM_rep1": "adipose_omentum",
    "adipose_omentum_SM_CSSD4_rep1": "adipose_omentum",
    "adipose_omentum_SM_IOBHJ_rep1": "adipose_omentum",
    "adrenal_gland_SM_A8WNO_rep1": "adrenal_gland",
    "artery_aorta_SM_C1MLC_rep1": "artery_aorta",
    "artery_aorta_SM_C1PX3_rep1": "artery_aorta",
    "artery_aorta_SM_CR89M_rep1": "artery_aorta",
    "artery_aorta_SM_JF1NU_rep1": "artery_aorta",
    "artery_tibial_SM_CHLWW_rep1": "artery_tibial",
    "artery_tibial_SM_IOBHK_rep1": "artery_tibial",
    "colon_sigmoid_SM_AZPYO_rep1": "colon_sigmoid",
    "colon_sigmoid_SM_JF1O8_rep1": "colon_sigmoid",
    "colon_transverse_SM_A9HOW_rep1": "colon_transverse",
    "colon_transverse_SM_A9VP4_rep1": "colon_transverse",
    "colon_transverse_SM_ACCQ1_rep1": "colon_transverse",
    "colon_transverse_SM_BZ2ZS_rep1": "colon_transverse",
    "colon_transverse_SM_CSSDA_rep1": "colon_transverse",
    "esophagus_ge_junction_SM_CTD24_rep1": "esophagus_ge_junction",
    "esophagus_ge_junction_SM_IOERG_rep1": "esophagus_ge_junction",
    "esophagus_mucosa_SM_A9HOR_rep1": "esophagus_mucosa",
    "esophagus_mucosa_SM_A9VPA_rep1": "esophagus_mucosa",
    "esophagus_mucosa_SM_AZPYJ_rep1": "esophagus_mucosa",
    "esophagus_muscularis_SM_A8CPH_rep1": "esophagus_muscularis",
    "esophagus_muscularis_SM_CSSCV_rep1": "esophagus_muscularis",
    "esophagus_muscularis_SM_IOBHM_rep1": "esophagus_muscularis",
    "esophagus_muscularis_SM_IQYD1_rep1": "esophagus_muscularis",
    "heart_atrial_appendage_SM_IOBHN_rep1": "heart_atrial_appendage",
    "heart_atrial_appendage_SM_JF1NX_rep1": "heart_atrial_appendage",
    "heart_lv_SM_IOBHO_rep1": "heart_lv",
    "heart_lv_SM_JF1NY_rep1": "heart_lv",
    "islet_CB1_rep1": "islet",
    "islet_CB2_rep1": "islet",
    "islet_CB3_rep1": "islet",
    "liver_SM_A8WNZ_rep1": "liver",
    "lung_SM_A62E9_rep1": "lung",
    "lung_SM_A8WNH_rep1": "lung",
    "lung_SM_ACCPU_rep1": "lung",
    "lung_SM_JF1NZ_rep1": "lung",
    "mammary_tissue_SM_IOBHL_rep1": "mammary_tissue",
    "mammary_tissue_SM_JF1NV_rep1": "mammary_tissue",
    "muscle_SM_ADA6L_rep1": "muscle",
    "muscle_SM_C1MKW_rep1": "muscle",
    "muscle_SM_C1PWV_rep1": "muscle",
    "muscle_SM_IOBHP_rep1": "muscle",
    "muscle_SM_JF1O9_rep1": "muscle",
    "nerve_tibial_SM_CHLWU_rep1": "nerve_tibial",
    "nerve_tibial_SM_CP2V6_rep1": "nerve_tibial",
    "nerve_tibial_SM_IOBHQ_rep1": "nerve_tibial",
    "ovary_SM_IOBHR_rep1": "ovary",
    "pancreas_SM_ADRUQ_rep1": "pancreas",
    "pancreas_SM_IOBHS_rep1": "pancreas",
    "pancreas_SM_JF1NS_rep1": "pancreas",
    "pancreas_SM_JF1O6_rep1": "pancreas",
    "skin_SM_IOBHT_rep1": "skin",
    "skin_SM_JF1O1_rep1": "skin",
    "skin_sun_exposed_SM_ADYHK_rep1": "skin",
    "skin_sun_exposed_SM_IOBHU_rep1": "skin",
    "skin_sun_exposed_SM_IQYCP_rep1": "skin",
    "skin_sun_exposed_SM_JF1NT_rep1": "skin",
    "small_intestine_SM_A62GO_rep1": "small_intestine",
    "small_intestine_SM_ADA5F_rep1": "small_intestine",
    "small_intestine_SM_JF1O2_rep1": "small_intestine",
    "stomach_SM_CHLWL_rep1": "stomach",
    "stomach_SM_IOBHV_rep1": "stomach",
    "stomach_SM_JF1NP_rep1": "stomach",
    "stomach_SM_JF1O3_rep1": "stomach",
    "thyroid_SM_C1MKY_rep1": "thyroid",
    "thyroid_SM_IOBHW_rep1": "thyroid",
    "thyroid_SM_JF1O4_rep1": "thyroid",
    "uterus_SM_A87A2_rep1": "uterus",
    "uterus_SM_IOBHX_rep1": "uterus",
    "vagina_SM_A9HOS_rep1": "vagina",
    "sample_10_muscle": "muscle",
    "sample_12_heart": "heart",
    "sample_13_placenta": "placenta",
    "sample_14_heart": "heart",
    "sample_15_placenta": "placenta",
    "sample_16_adrenal": "adrenal",
    "sample_18_eye": "eye",
    "sample_19_intestine": "intestine",
    "sample_1_adrenal": "adrenal",
    "sample_20_kidney": "kidney",
    "sample_21_intestine": "intestine",
    "sample_22_kidney": "kidney",
    "sample_23_muscle": "muscle",
    "sample_24_standard": "standard",
    "sample_25_kidney": "kidney",
    "sample_26_placenta": "placenta",
    "sample_27_adrenal": "adrenal",
    "sample_28_intestine": "intestine",
    "sample_29_placenta": "placenta",
    "sample_2_thymus": "thymus",
    "sample_30_placenta": "placenta",
    "sample_31_adrenal": "adrenal",
    "sample_32_heart": "heart",
    "sample_33_lung": "lung",
    "sample_34_kidney": "kidney",
    "sample_35_liver": "liver",
    "sample_36_cerebrum": "cerebrum",
    "sample_37_liver": "liver",
    "sample_38_lung": "lung",
    "sample_39_heart": "heart",
    "sample_3_kidney": "kidney",
    "sample_40_liver": "liver",
    "sample_41_thymus": "thymus",
    "sample_42_heart": "heart",
    "sample_43_liver": "liver",
    "sample_44_lung": "lung",
    "sample_46_liver": "liver",
    "sample_47_lung": "lung",
    "sample_48_standard": "standard",
    "sample_49_stomach": "stomach",
    "sample_4_lung": "lung",
    "sample_53_eye": "eye",
    "sample_54_thymus": "thymus",
    "sample_55_eye": "eye",
    "sample_57_spleen": "spleen",
    "sample_58_cerebellum": "cerebellum",
    "sample_5_kidney": "kidney",
    "sample_60_thymus": "thymus",
    "sample_61_pancreas": "pancreas",
    "sample_62_stomach": "stomach",
    "sample_64_cerebrum": "cerebrum",
    "sample_65_kidney": "kidney",
    "sample_66_cerebrum": "cerebrum",
    "sample_67_kidney": "kidney",
    "sample_68_lung": "lung",
    "sample_69_cerebrum": "cerebrum",
    "sample_6_cerebrum": "cerebrum",
    "sample_70_lung": "lung",
    "sample_71_cerebrum": "cerebrum",
    "sample_72_standard": "standard",
    "sample_7_liver": "liver",
    "sample_8_lung": "lung",
    "sample_9_liver": "liver",
}
barcode_records["tissue"] = barcode_records["sample"].map(sample_to_tissue)

# A sample missing from the lookup would silently get a null tissue in the
# saved table; fail loudly instead and name the offenders.
unmapped_samples = barcode_records.loc[barcode_records["tissue"].isna(), "sample"].unique()
assert len(unmapped_samples) == 0, f"samples missing from sample_to_tissue: {list(unmapped_samples)}"

In [52]:
# Inspect the final table, now including the tissue column.
barcode_records

Unnamed: 0,sample,barcode,barcode_idx,zarr_path,logUMI,tsse,cell_type,life_stage,tissue
0,CARE181125_3B_rep1,AACGAGAGCTAAAGCACTAGCG,0,/wmb/zarr/Zhang2021Cell.CutSites/CARE181125_3B...,3.458984,16.234375,Ventricular Cardiomyocyte,Adult,heart_rv
1,CARE181125_3B_rep1,AACGAGAGCTAACCACCTAAAG,6,/wmb/zarr/Zhang2021Cell.CutSites/CARE181125_3B...,3.255859,11.703125,Ventricular Cardiomyocyte,Adult,heart_rv
2,CARE181125_3B_rep1,AACGAGAGCTAACGAGCTGTGA,8,/wmb/zarr/Zhang2021Cell.CutSites/CARE181125_3B...,3.423828,10.906250,Cardiac Fibroblasts,Adult,heart_rv
3,CARE181125_3B_rep1,AACGAGAGCTAAGCTCAACGCA,13,/wmb/zarr/Zhang2021Cell.CutSites/CARE181125_3B...,3.623047,12.500000,Cardiac Fibroblasts,Adult,heart_rv
4,CARE181125_3B_rep1,AACGAGAGCTAAGTTGTGCTAC,14,/wmb/zarr/Zhang2021Cell.CutSites/CARE181125_3B...,3.429688,11.078125,Cardiac Fibroblasts,Adult,heart_rv
...,...,...,...,...,...,...,...,...,...
1323089,sample_9_liver,TTGGTTGGTATCATCAGCCATTGGCCAGGTGAGAAGACCA,1078995,/wmb/zarr/Domcke2020Science.CutSites/sample_9_...,4.062500,14.585938,Fetal Erythroblast 2,Fetal,liver
1323090,sample_9_liver,TTGGTTGGTATCTGACGAACGCCAAGGCAATGGCAAGTCA,1079290,/wmb/zarr/Domcke2020Science.CutSites/sample_9_...,3.923828,15.617188,Fetal Erythroblast 1,Fetal,liver
1323091,sample_9_liver,TTGGTTGGTATGACGCGACCTATTCCTTGCTTAATTCGTA,1079540,/wmb/zarr/Domcke2020Science.CutSites/sample_9_...,3.275391,21.265625,Fetal Erythroblast 2,Fetal,liver
1323092,sample_9_liver,TTGGTTGGTATGCGAATCGGATGAATACCATTACCTTGCA,1079883,/wmb/zarr/Domcke2020Science.CutSites/sample_9_...,3.753906,11.257812,Fetal Fibroblast (General) 3,Fetal,liver


In [53]:
# Persist the assembled per-cell table as a Feather file.
barcode_records.to_feather("/wmb/Zhang2021Cell/Zhang2021Cell.CellMetadata.feather")