In [1]:
# ruff: noqa
import argparse
import logging
import os
from os.path import join
import sys
import warnings

from spatialdata import read_zarr

sys.path.insert(1, "/dss/dsshome1/0C/ra98gaq/Git/cellseg-benchmark")

from cellseg_benchmark import sdata_utils as su

  return module_get_attr_redirect(attr_name, deprecated_mapping=_DEPRECATED)


In [2]:
# warnings.filterwarnings("ignore")

In [3]:
logger = logging.getLogger("shape_mapping")
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s]: %(message)s"))
logger.addHandler(handler)

In [4]:
def get_args(test_args=None):  # noqa: D103
    p = argparse.ArgumentParser(
        description="Creates a master sdata for a given sample, containing multiple segmentation results."
    )
    p.add_argument("sample", help="Sample name.")
    p.add_argument(
        "data_path",
        help="Path to folder with merscope output data (e.g. /cohort1/slide2/region0).",
    )
    p.add_argument(
        "zmode",
        choices=["z3"],
        help="Mode of master sdata. Either 'z3' or '3d' (currently only z3 is implemented).",
    )
    p.add_argument("data_dir", help="Output data folder.")
    p.add_argument(
        "--n_ficture",
        default=21,
        type=int,
        help="Consider Ficture model with n_ficture factors.",
    )
    p.add_argument("--run_date", type=str, help="run date (YYYYMMDD).", default=None)
    p.add_argument("--organism", type=str, help="organism.", default=None)
    p.add_argument("--slide", type=str, help="slide.", default=None)
    p.add_argument("--region", type=str, help="region.", default=None)
    p.add_argument("--cohort", type=str, help="cohort.", default=None)
    p.add_argument(
        "--obs",
        action="append",
        default=[],
        metavar="KEY=VAL",
        help="Extra covariates to add to adata.obs (repeatable), e.g. --obs tissue=brain.",
    )
    if test_args is not None:
        return p.parse_args(test_args)
    else:
        return p.parse_args()

In [5]:
# Simulate CLI arguments inside Jupyter
args = get_args(
    [
        "SynergyLung_s1_r0",
        "/dss/dssfs03/pn52re/pn52re-dss-0000/Synergy-projects-temp/MouseLung-Sijia/output-20250704_AGLiesz-Sijia-MouseLung-Slide1-Stroke_VMSC03901/region_R1-Stroke",
        "z3",
        "/dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark",
        "--cohort",
        "SynergyLung",
        "--slide",
        "1",
        "--region",
        "0",
        "--organism",
        "mouse",
        "--run_date",
        "20250704",
        "--obs",
        "condition=stroke",
    ]
)

In [6]:
# Simulate CLI arguments inside Jupyter
args = get_args(
    [
        "aging_s11_r0",
        "/dss/dssfs03/pn52re/pn52re-dss-0000/202405-Htra1-and-Aging/merfish_output/20250526_Aging-Slide11-cp-WT232_18m-WT999_24m-WT888_24m/region_3-WT232_18m",
        "z3",
        "/dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark",
        "--cohort",
        "aging",
        "--slide",
        "11",
        "--region",
        "0",
        "--organism",
        "mouse",
        "--run_date",
        "20250526",
        "--obs",
        "genotype=WT",
        "--obs",
        "age_months=18",
        "--obs",
        "animal_id=232",
    ]
)

In [7]:
# Simulate CLI arguments inside Jupyter
args = get_args(
    [
        "htra1_s4_r1",
        "/dss/dssfs03/pn52re/pn52re-dss-0000/202405-Htra1-and-Aging/merfish_output/20240829_Htra1-Slide04-cp-TG69-KO305/region_1-TG69",
        "z3",
        "/dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark",
        "--cohort",
        "htra1",
        "--slide",
        "4",
        "--region",
        "1",
        "--organism",
        "mouse",
        "--run_date",
        "20240829",
        "--obs",
        "genotype=TG",
        "--obs",
        "age_months=6",
        "--obs",
        "animal_id=69",
    ]
)

In [8]:
# Simulate CLI arguments inside Jupyter
args = get_args(
    [
        "aging_s11_r0",
        "/dss/dssfs03/pn52re/pn52re-dss-0000/202405-Htra1-and-Aging/merfish_output/20250526_Aging-Slide11-cp-WT232_18m-WT999_24m-WT888_24m/region_3-WT232_18m",
        "z3",
        "/dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark",
        "--cohort",
        "aging",
        "--slide",
        "11",
        "--region",
        "0",
        "--organism",
        "mouse",
        "--run_date",
        "20250526",
        "--obs",
        "genotype=WT",
        "--obs",
        "age_months=18",
        "--obs",
        "animal_id=232",
    ]
)

In [9]:
args

Namespace(sample='aging_s11_r0', data_path='/dss/dssfs03/pn52re/pn52re-dss-0000/202405-Htra1-and-Aging/merfish_output/20250526_Aging-Slide11-cp-WT232_18m-WT999_24m-WT888_24m/region_3-WT232_18m', zmode='z3', data_dir='/dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark', n_ficture=21, run_date='20250526', organism='mouse', slide='11', region='0', cohort='aging', obs=['genotype=WT', 'age_months=18', 'animal_id=232'])

In [10]:
NONES = {"", "None", "none", "null", "NULL", None}
for k in ["organism", "slide", "region", "cohort"]:
    if getattr(args, k) in NONES:
        setattr(args, k, None)

In [11]:
extra_obs = {}
for kv in args.obs:
    k, v = kv.split("=", 1)
    extra_obs[k] = None if v in NONES else v

In [12]:
extra_obs

{'genotype': 'WT', 'age_months': '18', 'animal_id': '232'}

In [13]:
%%time
logger.info("Importing images and points...")
su.process_merscope(args.sample, args.data_dir, args.data_path, zmode=args.zmode)

2025-11-21 12:54:01,761 [INFO]: Importing images and points...


Skipping aging_s11_r0: z3 file already exists
CPU times: user 0 ns, sys: 3.42 ms, total: 3.42 ms
Wall time: 4.24 ms


In [14]:
sdata_path = join(args.data_dir, "samples", args.sample)
sdata_path

'/dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/samples/aging_s11_r0'

In [15]:
sdata_main = read_zarr(join(sdata_path, "sdata_z3.zarr"))

version mismatch: detected: RasterFormatV02, requested: FormatV04
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)
  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)


In [16]:
sdata_main

SpatialData object, with associated Zarr store: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/samples/aging_s11_r0/sdata_z3.zarr
├── Images
│     └── 'aging_s11_r0_z3': DataTree[cyx] (8, 35656, 65455), (8, 17828, 32727), (8, 8914, 16363), (8, 4457, 8181), (8, 2228, 4090)
└── Points
      └── 'aging_s11_r0_transcripts': DataFrame with shape: (<Delayed>, 9) (2D points)
with coordinate systems:
    ▸ 'global', with elements:
        aging_s11_r0_z3 (Images), aging_s11_r0_transcripts (Points)
    ▸ 'micron', with elements:
        aging_s11_r0_z3 (Images), aging_s11_r0_transcripts (Points)
    ▸ 'pixel', with elements:
        aging_s11_r0_z3 (Images), aging_s11_r0_transcripts (Points)

In [None]:
sdata_main.tables["adata_Cellpose_1_nuclei_model"].obs

In [None]:
sdata_main.tables["adata_Cellpose_1_nuclei_model"].obs.columns

In [None]:
sdata_main.tables["adata_Cellpose_1_nuclei_model"].obs["age_months"]

In [None]:
sdata_main.tables["adata_Cellpose_1_nuclei_model"].obs["genotype"]

In [None]:
sdata_main.tables["adata_Cellpose_1_nuclei_model"].obs["animal_id"]

In [17]:
logger.info("Integrating segmentation data from available methods into main sdata...")
seg_methods = [
    method
    for method in os.listdir(join(sdata_path, "results"))
    if os.path.isdir(join(sdata_path, "results", method, "sdata.zarr"))
]

2025-11-21 12:54:17,080 [INFO]: Integrating segmentation data from available methods into main sdata...


In [18]:
seg_methods

['Proseg_3D_Cellpose_1_nuclei_model',
 'Baysor_2D_Cellpose_2_DAPI_Transcripts_0.8',
 'vpt_2D_DAPI_nuclei',
 'Cellpose_1_nuclei_model',
 'Baysor_2D_Cellpose_2_DAPI_PolyT_0.2',
 'Baysor_2D_Cellpose_1_DAPI_Transcripts_0.8',
 'Negative_Control_Rastered_5',
 'Cellpose_2_DAPI_PolyT',
 'vpt_3D_DAPI_PolyT_nuclei',
 'Proseg_3D_Cellpose_1_DAPI_Transcripts',
 'Proseg_Cellpose_1_DAPI_Transcripts',
 'Baysor_2D_Cellpose_1_DAPI_Transcripts_0.2',
 'Baysor_2D_Cellpose_1_DAPI_PolyT_0.2',
 'Baysor_2D_Cellpose_2_DAPI_Transcripts_0.2',
 'Cellpose_1_DAPI_PolyT',
 'Negative_Control_Rastered_10',
 'Negative_Control_Voronoi',
 'Proseg_3D_Cellpose_2_DAPI_PolyT',
 'vpt_3D_DAPI_nuclei',
 'Proseg_Cellpose_2_DAPI_Transcripts',
 'ComSeg_Cellpose_1_nuclei_model',
 'Proseg_Cellpose_1_nuclei_model',
 'Cellpose_2_DAPI_Transcripts',
 'Cellpose_1_Merlin',
 'Negative_Control_Rastered_25',
 'vpt_2D_DAPI_PolyT',
 'Cellpose_1_DAPI_Transcripts',
 'Proseg_3D_Cellpose_1_DAPI_PolyT',
 'Baysor_2D_Cellpose_2_DAPI_PolyT_0.8',
 'Bays

In [19]:
# temp workaround
seg_methods = [
    s
    for s in seg_methods
    if not (s.startswith("Proseg_") and not s.startswith("Proseg_3D"))
]

In [20]:
seg_methods

['Proseg_3D_Cellpose_1_nuclei_model',
 'Baysor_2D_Cellpose_2_DAPI_Transcripts_0.8',
 'vpt_2D_DAPI_nuclei',
 'Cellpose_1_nuclei_model',
 'Baysor_2D_Cellpose_2_DAPI_PolyT_0.2',
 'Baysor_2D_Cellpose_1_DAPI_Transcripts_0.8',
 'Negative_Control_Rastered_5',
 'Cellpose_2_DAPI_PolyT',
 'vpt_3D_DAPI_PolyT_nuclei',
 'Proseg_3D_Cellpose_1_DAPI_Transcripts',
 'Baysor_2D_Cellpose_1_DAPI_Transcripts_0.2',
 'Baysor_2D_Cellpose_1_DAPI_PolyT_0.2',
 'Baysor_2D_Cellpose_2_DAPI_Transcripts_0.2',
 'Cellpose_1_DAPI_PolyT',
 'Negative_Control_Rastered_10',
 'Negative_Control_Voronoi',
 'Proseg_3D_Cellpose_2_DAPI_PolyT',
 'vpt_3D_DAPI_nuclei',
 'ComSeg_Cellpose_1_nuclei_model',
 'Cellpose_2_DAPI_Transcripts',
 'Cellpose_1_Merlin',
 'Negative_Control_Rastered_25',
 'vpt_2D_DAPI_PolyT',
 'Cellpose_1_DAPI_Transcripts',
 'Proseg_3D_Cellpose_1_DAPI_PolyT',
 'Baysor_2D_Cellpose_2_DAPI_PolyT_0.8',
 'Baysor_2D_Cellpose_1_nuclei_model_1.0',
 'vpt_2D_DAPI_PolyT_nuclei',
 'vpt_3D_DAPI_PolyT',
 'Baysor_2D_Cellpose_1_DAP

In [21]:
extra_obs

{'genotype': 'WT', 'age_months': '18', 'animal_id': '232'}

In [22]:
args.data_path

'/dss/dssfs03/pn52re/pn52re-dss-0000/202405-Htra1-and-Aging/merfish_output/20250526_Aging-Slide11-cp-WT232_18m-WT999_24m-WT888_24m/region_3-WT232_18m'

In [23]:
su.integrate_segmentation_data(
    sdata_path,
    seg_methods,
    sdata_main,
    run_date=args.run_date,
    organism=args.organism,
    slide=args.slide,
    region=args.region,
    cohort=args.cohort,
    write_to_disk=True,
    data_path=args.data_path,
    logger=logger,
    **extra_obs,
)

  0%|          | 0/30 [00:00<?, ?it/s]2025-11-21 12:54:24,276 [INFO]: Adding Proseg_3D_Cellpose_1_nuclei_model...
2025-11-21 12:54:26,217 [INFO]: Adding shapes of Proseg_3D_Cellpose_1_nuclei_model...
2025-11-21 12:55:14,671 [INFO]: Adding adata for Proseg_3D_Cellpose_1_nuclei_model...
  self._check_key(key, self.keys(), self._shared_keys)
2025-11-21 12:55:14,744 [INFO]: Adding cell type annotations for Proseg_3D_Cellpose_1_nuclei_model...
2025-11-21 12:55:22,987 [INFO]: Collecting volume metadata for Proseg_3D_Cellpose_1_nuclei_model
  return module_get_attr_redirect(attr_name, deprecated_mapping=_DEPRECATED)
  return module_get_attr_redirect(attr_name, deprecated_mapping=_DEPRECATED)
  return module_get_attr_redirect(attr_name, deprecated_mapping=_DEPRECATED)
  return module_get_attr_redirect(attr_name, deprecated_mapping=_DEPRECATED)
  return module_get_attr_redirect(attr_name, deprecated_mapping=_DEPRECATED)
  self._check_key(key, self.keys(), self._shared_keys)
2025-11-21 12:56:16,

SpatialData object, with associated Zarr store: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/samples/aging_s11_r0/sdata_z3.zarr
├── Images
│     └── 'aging_s11_r0_z3': DataTree[cyx] (8, 35656, 65455), (8, 17828, 32727), (8, 8914, 16363), (8, 4457, 8181), (8, 2228, 4090)
├── Points
│     └── 'aging_s11_r0_transcripts': DataFrame with shape: (<Delayed>, 9) (2D points)
├── Shapes
│     ├── 'boundaries_Baysor_2D_Cellpose_1_DAPI_PolyT_0.2': GeoDataFrame shape: (128972, 1) (2D shapes)
│     ├── 'boundaries_Baysor_2D_Cellpose_1_DAPI_PolyT_0.8': GeoDataFrame shape: (113076, 1) (2D shapes)
│     ├── 'boundaries_Baysor_2D_Cellpose_1_DAPI_Transcripts_0.2': GeoDataFrame shape: (122471, 1) (2D shapes)
│     ├── 'boundaries_Baysor_2D_Cellpose_1_DAPI_Transcripts_0.8': GeoDataFrame shape: (92417, 1) (2D shapes)
│     ├── 'boundaries_Baysor_2D_Cellpose_1_nuclei_model_1.0': GeoDataFrame shape: (121914, 1) (2D shapes)
│     ├── 'boundaries_Baysor_2D_Cellpose_2_DAPI_PolyT_0.2': GeoDataFrame shape

In [24]:
# test sdata #################

In [25]:
sdata_main.tables["adata_Cellpose_1_nuclei_model"].obs

Unnamed: 0,region,slide,spt_region,cell_type_incl_low_quality_revised,cell_type_mmc_incl_low_quality_clusters,cell_type_mmc_incl_low_quality,cell_type_incl_mixed_revised,cell_type_mmc_incl_mixed_clusters,cell_type_mmc_incl_mixed,cell_type_revised,...,solidity,elongation,genotype,age_months,run_date,organism,cohort,sample,animal_id,condition
aaaaaaaa-1,0,11,boundaries_Cellpose_1_nuclei_model,Oligodendrocytes,Choroid-Plexus,Astrocytes,Oligodendrocytes,Mixed,Mixed,Oligodendrocytes,...,0.956813,0.337908,WT,18,20250526,mouse,aging,aging_s11_r0,232,WT_18
aaaaaaab-1,0,11,boundaries_Cellpose_1_nuclei_model,Oligodendrocytes,Oligodendrocytes,Astrocytes,Oligodendrocytes,Oligodendrocytes,Mixed,Oligodendrocytes,...,1.000000,0.113391,WT,18,20250526,mouse,aging,aging_s11_r0,232,WT_18
aaaaaaac-1,0,11,boundaries_Cellpose_1_nuclei_model,Oligodendrocytes,Oligodendrocytes,Oligodendrocytes,Oligodendrocytes,Oligodendrocytes,Oligodendrocytes,Oligodendrocytes,...,1.000000,0.098323,WT,18,20250526,mouse,aging,aging_s11_r0,232,WT_18
aaaaaaad-1,0,11,boundaries_Cellpose_1_nuclei_model,Oligodendrocytes,Oligodendrocytes,Oligodendrocytes,Oligodendrocytes,Oligodendrocytes,Oligodendrocytes,Oligodendrocytes,...,1.000000,0.228436,WT,18,20250526,mouse,aging,aging_s11_r0,232,WT_18
aaaaaaae-1,0,11,boundaries_Cellpose_1_nuclei_model,Oligodendrocytes,Choroid-Plexus,Oligodendrocytes,Oligodendrocytes,Mixed,Oligodendrocytes,Oligodendrocytes,...,1.000000,0.083886,WT,18,20250526,mouse,aging,aging_s11_r0,232,WT_18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
aaaaildm-1,0,11,boundaries_Cellpose_1_nuclei_model,Neurons-Glut,Neurons-Glut,Neurons-Glut,Neurons-Glut,Neurons-Glut,Neurons-Glut,Neurons-Glut,...,0.997871,0.108961,WT,18,20250526,mouse,aging,aging_s11_r0,232,WT_18
aaaaildn-1,0,11,boundaries_Cellpose_1_nuclei_model,Oligodendrocytes,Neurons-Glut,Neurons-Glut,Oligodendrocytes,Neurons-Glut,Neurons-Glut,Oligodendrocytes,...,0.989197,0.240081,WT,18,20250526,mouse,aging,aging_s11_r0,232,WT_18
aaaaildo-1,0,11,boundaries_Cellpose_1_nuclei_model,ECs,ECs,ECs,ECs,ECs,ECs,ECs,...,1.000000,0.018043,WT,18,20250526,mouse,aging,aging_s11_r0,232,WT_18
aaaaildp-1,0,11,boundaries_Cellpose_1_nuclei_model,Neurons-Glut,Neurons-Glut,Neurons-Glut,Neurons-Glut,Neurons-Glut,Neurons-Glut,Neurons-Glut,...,0.968743,0.055256,WT,18,20250526,mouse,aging,aging_s11_r0,232,WT_18


In [26]:
sdata_main.tables["adata_Cellpose_1_nuclei_model"].obs.columns

Index(['region', 'slide', 'spt_region', 'cell_type_incl_low_quality_revised',
       'cell_type_mmc_incl_low_quality_clusters',
       'cell_type_mmc_incl_low_quality', 'cell_type_incl_mixed_revised',
       'cell_type_mmc_incl_mixed_clusters', 'cell_type_mmc_incl_mixed',
       'cell_type_revised', 'cell_type_mmc_raw_clusters', 'cell_type_mmc_raw',
       'cell_id', 'area', 'volume_sum', 'volume_final', 'num_z_planes',
       'size_normalized', 'surface_to_volume_ratio', 'sphericity', 'solidity',
       'elongation', 'genotype', 'age_months', 'run_date', 'organism',
       'cohort', 'sample', 'animal_id', 'condition'],
      dtype='object')

In [None]:
sdata_main.tables["adata_Cellpose_1_nuclei_model"].obs["age_months"]

In [None]:
sdata_main.tables["adata_Cellpose_1_nuclei_model"].obs["condition"]

In [27]:
sdata_main.tables["adata_Cellpose_1_Merlin"].obs.columns

Index(['fov', 'volume', 'center_x', 'center_y', 'min_x', 'min_y', 'max_x',
       'max_y', 'anisotropy', 'transcript_count', 'perimeter_area_ratio',
       'Fth1_raw', 'Fth1_high_pass', 'DAPI_raw', 'DAPI_high_pass', 'App_raw',
       'App_high_pass', 'Aldoc_raw', 'Aldoc_high_pass', 'Sst_raw',
       'Sst_high_pass', 'Plp1_raw', 'Plp1_high_pass', 'PolyT_raw',
       'PolyT_high_pass', 'region', 'slide', 'dataset_id', 'cells_region',
       'spt_region', 'cell_type_incl_low_quality_revised',
       'cell_type_mmc_incl_low_quality_clusters',
       'cell_type_mmc_incl_low_quality', 'cell_type_incl_mixed_revised',
       'cell_type_mmc_incl_mixed_clusters', 'cell_type_mmc_incl_mixed',
       'cell_type_revised', 'cell_type_mmc_raw_clusters', 'cell_type_mmc_raw',
       'cell_id', 'area', 'volume_sum', 'volume_final', 'num_z_planes',
       'size_normalized', 'surface_to_volume_ratio', 'sphericity', 'solidity',
       'elongation', 'genotype', 'age_months', 'run_date', 'organism',
       'c

In [28]:
sdata_main.tables["adata_Cellpose_1_Merlin"].obs

Unnamed: 0_level_0,fov,volume,center_x,center_y,min_x,min_y,max_x,max_y,anisotropy,transcript_count,...,solidity,elongation,genotype,age_months,run_date,organism,cohort,sample,animal_id,condition
EntityID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
248711000003100001,417,808.562111,7231.107147,7198.372446,7226.604490,7193.764156,7235.853920,7202.938191,1.013078,0,...,1.000000,0.096341,WT,18,20250526,mouse,aging,aging_s11_r0,232,WT_18
248711000003100002,417,266.361792,7248.592975,7195.647341,7244.526919,7193.764156,7252.066519,7198.191981,1.704450,0,...,1.000000,0.210849,WT,18,20250526,mouse,aging,aging_s11_r0,232,WT_18
248711000003100005,417,198.258304,7272.734791,7195.519228,7269.911928,7193.764156,7275.837170,7197.394793,1.647894,0,...,1.000000,0.006583,WT,18,20250526,mouse,aging,aging_s11_r0,232,WT_18
248711000003100006,417,360.645661,7285.561699,7196.026506,7281.029287,7193.788223,7289.651378,7198.719600,1.755867,0,...,1.000000,0.256885,WT,18,20250526,mouse,aging,aging_s11_r0,232,WT_18
248711000003100010,417,259.700561,7246.363524,7199.820553,7243.445775,7196.495046,7249.041249,7203.355593,1.636452,0,...,1.000000,0.375512,WT,18,20250526,mouse,aging,aging_s11_r0,232,WT_18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248711000110100127,40,1136.350842,11615.779651,10804.572500,11610.101084,10798.694083,11622.200637,10810.839648,1.242970,1,...,0.985538,0.216094,WT,18,20250526,mouse,aging,aging_s11_r0,232,WT_18
248711000110100128,40,1429.615413,11598.700329,10813.752208,11591.881941,10806.229373,11605.488999,10820.481903,1.017308,2,...,0.995633,0.040381,WT,18,20250526,mouse,aging,aging_s11_r0,232,WT_18
248711000110100129,40,1275.092968,11571.393604,10818.085659,11565.165457,10810.545320,11576.349561,10824.972732,1.289438,2,...,0.990718,0.100072,WT,18,20250526,mouse,aging,aging_s11_r0,232,WT_18
248711000110100130,40,459.810753,11475.060957,10830.734862,11470.911481,10827.032940,11479.611647,10834.051085,1.323253,12,...,1.000000,0.183094,WT,18,20250526,mouse,aging,aging_s11_r0,232,WT_18


In [29]:
logger.info("Done.")

2025-11-21 13:25:21,492 [INFO]: Done.


In [23]:
sdata_path = join(args.data_dir, "samples", "aging_s1_r0")

In [None]:
# test boundaries #################

In [33]:
import gzip
import io
import logging
import geopandas as gpd

seg_method = "Proseg_3D_Cellpose_1_DAPI_Transcripts"
boundary_file = join(
    sdata_path,
    "results",
    seg_method,
    "sdata.zarr",
    ".sopa_cache",
    "transcript_patches",
    "0",
    "cell-polygons-layers.geojson.gz",
)

# 1. Load boundary GeoJSON as DataFrame
with gzip.open(boundary_file, "rt", encoding="utf-8") as f:
    geojson_text = f.read()
gdf = gpd.read_file(io.StringIO(geojson_text))

In [34]:
print("Loaded boundaries shape:", gdf.shape)
print("Boundary columns:", gdf.columns)
print(gdf.head())

Loaded boundaries shape: (744836, 3)
Boundary columns: Index(['cell', 'layer', 'geometry'], dtype='object')
   cell  layer                                           geometry
0     0      0  MULTIPOLYGON (((5065.7813 5726.773, 5065.7813 ...
1     0      1  MULTIPOLYGON (((5065.7813 5724.773, 5065.7813 ...
2     0      2  MULTIPOLYGON (((5064.7813 5726.773, 5064.7813 ...
3     0      3  MULTIPOLYGON (((5066.7813 5725.773, 5066.7813 ...
4     0      4  MULTIPOLYGON (((5068.7813 5725.773, 5068.7813 ...


In [35]:
gdf.shape

(744836, 3)

In [31]:
print("Loaded boundaries shape:", gdf.shape)
print("Boundary columns:", gdf.columns)
print(gdf.head())

Loaded boundaries shape: (338478, 3)
Boundary columns: Index(['cell', 'layer', 'geometry'], dtype='object')
   cell  layer                                           geometry
0     0      0  MULTIPOLYGON (((7609 8946, 7609 8945, 7610 894...
1     0      1  MULTIPOLYGON (((7608 8949, 7608 8950, 7609 895...
2     1      0  MULTIPOLYGON (((6640 8576, 6640 8580, 6642 858...
3     1      1  MULTIPOLYGON (((6641 8579, 6641 8581, 6642 858...
4     1      2  MULTIPOLYGON (((6645 8581, 6645 8586, 6648 858...


In [32]:
gdf.shape

(338478, 3)

In [26]:
# 2. Check for required columns
expected_cols = ["cell", "cell_id"]
missing = [col for col in expected_cols if col not in gdf.columns]
print("Missing columns in boundaries:", missing)

Missing columns in boundaries: ['cell_id']


In [None]:
# 4. Simulate annotation table (replace with your actual)
adata_obs = pd.DataFrame(
    {
        "cell": gdf["cell"].unique(),
        "cell_id": gdf["cell_id"].unique(),
        "dummy": range(len(gdf["cell"].unique())),
    }
)
print("Annotation table shape:", adata_obs.shape)

# 5. Merge boundaries with annotation
merged = gdf.merge(adata_obs[["cell", "cell_id"]], on="cell")
print("Merged boundaries shape:", merged.shape)
print(merged.head())

# 6. Any cells missing from boundaries?
missing_cells = set(adata_obs["cell"]) - set(gdf["cell"])
print("Cells in annotation not in boundaries:", len(missing_cells))

# 7. Downstream: Create dummy morphology DataFrame and test merge
df_morph = pd.DataFrame(
    {
        "cell_id": gdf["cell_id"],
        "volume": [42] * len(gdf),  # dummy
    }
)
merged_obs = adata_obs.merge(df_morph, on="cell_id", how="left")
print("Merge with morphology (for .obs) shape:", merged_obs.shape)
print(merged_obs.head())
print("Number of missing volume values:", merged_obs["volume"].isnull().sum())