In [5]:
import pathlib
import re

import pandas as pd
import xarray as xr
from tqdm import tqdm

In [2]:
# Per-nucleus annotation from Supp Table 3. The file is tab-separated and is
# read without a header row, so the three column names are supplied here.
# Rows are keyed by "<sample>.<barcode>".
annot = (
    pd.read_csv(
        "Table S3 – Metatable and annotation of single nuclei.txt",
        sep="\t",
        header=None,
        names=["sample", "barcode", "cell_type"],
    )
    .assign(cell_id=lambda table: table["sample"] + "." + table["barcode"])
    .set_index("cell_id")
)

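# NOTE: the Table S3 annotation loaded above is incomplete; presumably this is
# why the cell metadata is rebuilt from the zarr stores below (TODO confirm).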

In [None]:
# Build one row per barcode by collecting the `barcode_map` variable from every
# zarr store in the cut-sites directory.
cut_sites_dir = pathlib.Path("/wmb/zarr/Li2023Science.CutSites/")
# Sort the glob result: directory listing order is filesystem-dependent, and the
# row order of the table saved later should be reproducible across runs.
zarr_paths = sorted(cut_sites_dir.glob("*.zarr"))
# Fail with a clear message instead of the opaque "No objects to concatenate"
# error that pd.concat raises on an empty list.
assert zarr_paths, f"no *.zarr stores found under {cut_sites_dir}"

records = []
for zarr_path in tqdm(zarr_paths):
    # The context manager closes the store as soon as the barcode table has
    # been materialized in memory.
    with xr.open_zarr(zarr_path) as ds:
        barcode_map = ds["barcode_map"].to_pandas().reset_index()
    barcode_map.columns = ["barcode", "barcode_idx"]
    # Remember which store each barcode came from.
    barcode_map["zarr_path"] = str(zarr_path)
    records.append(barcode_map)

barcode_records = pd.concat(records)
# Compact dtypes: the table holds one row per barcode across all stores.
barcode_records["barcode_idx"] = barcode_records["barcode_idx"].astype("uint32")
barcode_records["zarr_path"] = barcode_records["zarr_path"].astype("category")
barcode_records = barcode_records.set_index("barcode")

# The sample name is the part of the barcode before the first ".".
barcode_records["sample"] = barcode_records.index.map(lambda i: i.split(".")[0])
# The cell type is the store's file name without the 5-character ".zarr" extension ...
barcode_records["cell_type"] = barcode_records["zarr_path"].map(lambda i: i.split("/")[-1][:-5])
# ... and without a trailing "_a<char>" suffix, when one is present.
suffix = re.compile(r"_a\w$")
barcode_records["cell_type"] = barcode_records["cell_type"].map(lambda i: suffix.split(i)[0])
# Back to a default integer index, with "barcode" as a regular column.
barcode_records = barcode_records.reset_index()

In [28]:
# Store the two label columns as categoricals.
for column in ("sample", "cell_type"):
    barcode_records[column] = barcode_records[column].astype("category")

In [31]:
# Persist the per-cell metadata table in Feather format.
cell_metadata_path = "/wmb/Li2023Science/Li2023Science.CellMetadata.feather"
barcode_records.to_feather(cell_metadata_path)

In [40]:
# Number of rows (one per cell) in the metadata table.
len(barcode_records), "cells"

(1136505, 'cells')

In [42]:
# Number of distinct cell-type labels.
len(barcode_records["cell_type"].unique()), "cell_types"

(108, 'cell_types')

In [10]:
# Per-sample metadata, indexed by the first CSV column.
sample_meta = pd.read_csv("sample_meta.csv", index_col=0)
# Every sample that appears in the cell table must have a metadata row.
assert barcode_records["sample"].isin(sample_meta.index).all()

In [33]:
# Map the sample-sheet column names to the CamelCase names written to the
# saved sample metadata table.
# NOTE(review): the "dissetion" spelling is kept as found; the key presumably
# mirrors the CSV header, so confirm before correcting either side.
name_map = {
    "Brain dissetion ID": "DissetionId",
    "donor": "Donor",
    "Brain structure": "MajorRegion",
    "Brain region": "SubRegion",
    "Sample quality": "SampleQuality",
}
# `columns=` is required: a bare mapper passed to DataFrame.rename is applied
# to the row index (axis 0), so the previous call left every column name
# unchanged. reset_index() then turns the index into a regular column.
sample_meta = sample_meta.rename(columns=name_map).reset_index()

In [36]:
# Persist the per-sample metadata table in Feather format.
sample_metadata_path = "/wmb/Li2023Science/Li2023Science.SampleMetadata.feather"
sample_meta.to_feather(sample_metadata_path)