In [1]:
import pathlib

import joblib
import pandas as pd
import xarray as xr
from tqdm import tqdm

## Barcode to idx in Cut Sites Zarr

In [2]:
# Build one table that maps every cell barcode to its integer position inside
# the cut-site Zarr store it was read from, plus the path of that store.
# NOTE(review): "Nautre" in the directory name looks like a typo of "Nature",
# but it is a runtime path and is kept as-is — confirm against the filesystem.
# Sorting makes the processing order (and the row order of `barcode_records`)
# reproducible; raw glob order depends on the filesystem.
zarr_paths = sorted(pathlib.Path("/wmb/zarr/Zu2023Nautre.CutSites/").glob("*.zarr"))
# Fail early with a readable message instead of pd.concat's "No objects to concatenate".
assert len(zarr_paths) > 0, "no *.zarr stores found in the cut-sites directory"

records = []
for zarr_path in tqdm(zarr_paths):
    ds = xr.open_zarr(zarr_path)
    # `barcode_map` is turned into a two-column frame: the former index
    # (barcode) and the stored value (position of the barcode in this store).
    barcode_map = ds["barcode_map"].to_pandas().reset_index()
    barcode_map.columns = ["barcode", "barcode_idx"]
    barcode_map["zarr_path"] = str(zarr_path)
    records.append(barcode_map)
barcode_records = pd.concat(records)
barcode_records["barcode_idx"] = barcode_records["barcode_idx"].astype("uint32")
barcode_records["zarr_path"] = barcode_records["zarr_path"].astype("category")
barcode_records = barcode_records.set_index("barcode")
# Later cells align columns of this table onto the metadata by barcode; a
# duplicated barcode would make that alignment fail with an opaque reindex
# error, so check uniqueness here where the cause is obvious.
assert barcode_records.index.is_unique, "the same barcode appears in more than one Zarr store"

100%|██████████| 234/234 [01:01<00:00,  3.84it/s]


## Metadata

In [3]:
# Download from catlas
atac_meta = pd.read_csv(
    "meta/SI Table 2 Metadata table for all the 2.3 million nuclei in the snATAC-seq data.txt", sep="\t"
)
meta = pd.read_hdf("meta/CEMBA.snATAC.Metadata.hdf")

# Every sample must belong to exactly one dissection region; otherwise the
# sample -> region lookup built below would silently keep an arbitrary one.
for s, df in meta.groupby("Sample"):
    assert df["DissectionRegion"].unique().size == 1, f"sample {s!r} has more than one DissectionRegion"
sample_to_region = meta.set_index("Sample")["DissectionRegion"].to_dict()
atac_meta["DissectionRegion"] = atac_meta["Sample"].map(sample_to_region)
# Convert the published CellID into the id format used as barcodes in the Zarr
# stores: "." becomes ":" and "_" becomes "-".
# regex=False is passed explicitly because "." is a regex metacharacter and the
# default of `regex` differs between pandas versions; both replacements are
# meant to be literal.
atac_meta["new_cell_id"] = (
    atac_meta["CellID"].str.replace(".", ":", regex=False).str.replace("_", "-", regex=False)
)

# Every barcode found in the Zarr stores must have a metadata row.
has_meta = barcode_records.index.isin(atac_meta["new_cell_id"])
assert has_meta.all(), f"{(~has_meta).sum()} Zarr barcodes have no row in the metadata table"

In [7]:
# Series of the `Subclass` column, keyed by the converted cell id.
zu_annot = atac_meta.set_index("new_cell_id").loc[:, "Subclass"]

In [8]:
# Re-key the metadata by the converted cell id, keep only the columns that are
# exported, and attach each cell's Zarr store path and position within that
# store (both aligned on the cell id).
atac_meta = (
    atac_meta.set_index("new_cell_id")
    .loc[:, ["Sample", "# of Fragments", "TSSe", "DissectionRegion"]]
    .copy()
    .assign(
        ZarrPath=barcode_records["zarr_path"],
        ZarrBarcodeIdx=barcode_records["barcode_idx"],
    )
    .rename_axis("cell")
)
# No cell may be left without a value in any column (e.g. a cell that is
# missing from `barcode_records`).
assert not atac_meta.isna().any().any()

In [9]:
# Text columns are normalised to str and then stored as categoricals.
for text_column in ("Sample", "DissectionRegion", "ZarrPath"):
    atac_meta[text_column] = atac_meta[text_column].astype(str).astype("category")

# Numeric columns are cast to fixed 32-bit dtypes.
for numeric_column, numeric_dtype in (
    ("# of Fragments", "uint32"),
    ("TSSe", "float32"),
    ("ZarrBarcodeIdx", "uint32"),
):
    atac_meta[numeric_column] = atac_meta[numeric_column].astype(numeric_dtype)

In [10]:
# Total size reported by `memory_usage()` for the table, converted from bytes to MiB.
atac_meta.memory_usage().sum() / 1024**2

56.19189643859863

In [26]:
# Persist the cell metadata table. The `cell` index is moved into a regular
# column first because pandas' feather writer requires a default index.
atac_meta.reset_index().to_feather("/wmb/Zu2023Nature/Zu2023Nature.CellMetadata.feather")

## ATAC to mC Map

In [11]:
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code
# while loading — only use it on a file from a trusted source.
# Presumably maps ATAC cell ids to mC clusters (going by the file name) — confirm.
atac_to_mc = joblib.load("atac_cell_to_mc_clusters.dict")

In [12]:
def _to_zarr_cell_id(cell_id):
    """Turn a "_"-separated cell id into "<sample>:<barcode>".

    The text after the last "_" is the barcode; any "_" left in the sample
    part becomes "-".
    """
    sample_part, _, barcode_part = cell_id.rpartition("_")
    return sample_part.replace("_", "-") + ":" + barcode_part


# Same values as `atac_to_mc`, re-keyed with the converted cell ids.
new_atac_to_mc = {_to_zarr_cell_id(cell_id): clusters for cell_id, clusters in atac_to_mc.items()}

In [25]:
# Write the re-keyed ATAC-to-mC map next to the other exported files.
# NOTE(review): the recorded output of this cell lists
# `atac_cell_to_mc_clusters.new.dict`, which does not match the path below —
# the output looks stale; confirm which file name downstream code expects.
joblib.dump(new_atac_to_mc, "/wmb/Zu2023Nature/atac_cell_to_mc_clusters.dict", compress=1)

['/wmb/Zu2023Nature/atac_cell_to_mc_clusters.new.dict']

In [13]:
# Index of all converted cell ids that have an entry in the ATAC-to-mC map.
has_mc = pd.Index(list(new_atac_to_mc))

In [14]:
# Number of cells in the metadata table that have an entry in the ATAC-to-mC map.
atac_meta.index.isin(has_mc).sum()

1972156

In [15]:
# Number of keys in the ATAC-to-mC map, for comparison with the overlap count.
len(has_mc)

2065820

In [16]:
# Cell ids present in the metadata table but absent from the ATAC-to-mC map.
no_mc = atac_meta.index[~atac_meta.index.isin(has_mc)]

In [22]:
# Preview the cell ids that have no entry in the ATAC-to-mC map.
no_mc

Index(['CEMBA171206-3C:AGCGATAGACTGAGCGGTAAGGAGCCTATCCT',
       'CEMBA171206-3C:ATTACTCGGAACGGTAAATGACGTGTACTGAC',
       'CEMBA171206-3C:ATTCAGAAAACCAGGTAGGATAACAGGCGAAG',
       'CEMBA171206-3C:ATTCAGAACAGAACTGATAGCCTTCCTATCCT',
       'CEMBA171206-3C:CGCTCATTAACCAGGTCGAATTCCTAATCTTA',
       'CEMBA171206-3C:CGCTCATTAACCAGGTTTGGAAGTATAGAGGC',
       'CEMBA171206-3C:CGCTCATTCAAGTTCAATAGCCTTCCTATCCT',
       'CEMBA171206-3C:CGCTCATTCGAGGCTGTTATGCGAGTACTGAC',
       'CEMBA171206-3C:CGCTCATTGCCTCAATTTACGACCGTACTGAC',
       'CEMBA171206-3C:CGCTCATTTATGGTCGTTACGACCATAGAGGC',
       ...
       'CEMBA191031-11D:CAGAAGACTATGTCAGGAACAC',
       'CEMBA191031-11D:CTTCTGATGACGCCCTGCTATA',
       'CEMBA191031-11D:GCATAACCTACTCCGATATGAC',
       'CEMBA191031-11D:GCCAACGATTGTGTCATGAAAG',
       'CEMBA191031-11D:GGAACACTAGTCCACACCTCCA',
       'CEMBA191031-11D:TATACTCCGCTCGACCAGATCT',
       'CEMBA191031-11D:TCTATTCCTGATGTGCATCTGG',
       'CEMBA191031-11D:TGTACAGCGAACGCATAGGCAC',
       'CEMBA1910