# Combine MERFISH FICTURE results in spatialdata object
- for a given sample, create spatial data object
- iterate over all available FICTURE outputs:
    - add pixel-level factors to sdata object
    - calculate transcript-level factors, merge them, and add to sdata object

In [1]:
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
import json
import os
from datetime import date

import dask

dask.config.set({"dataframe.query-planning": False})

<dask.config.set at 0x7fc4a53bd350>

In [3]:
import dask.dataframe as dd
import numpy as np
import spatialdata
import spatialdata_io
from scipy.spatial import KDTree
from spatialdata.models import PointsModel

In [4]:
import sys

sys.path.append("/home")
import src.ficture_utils as ficture_utils

In [5]:
# Root folder of the benchmark data; the long listing shows where the path points
data_dir = os.path.abspath("/home/data")
!ls -l $data_dir

lrwxrwxrwx 1 ra98gaq pn52ra 53 Mar 12 17:01 /home/data -> /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark


In [6]:
# Root folder of the archive; the long listing shows where the path points
archive_dir = os.path.abspath("/home/archive")
!ls -l $archive_dir

lrwxrwxrwx 1 ra98gaq pn52ra 35 Mar 12 17:02 /home/archive -> /dss/dssfs03/pn52re/pn52re-dss-0000


In [7]:
# Sample identifier is "<cohort>_<slide>_<region>"
cohort_name, slide_name, region_name = "foxf2", "s2", "r1"
sample_name = "_".join([cohort_name, slide_name, region_name])

# Zarr store holding the FICTURE results of this sample
ficture_results_dir = os.path.join(
    data_dir, "samples", sample_name, "results", "Ficture"
)
sdata_file = os.path.join(ficture_results_dir, "sdata.zarr")
sample_name

'foxf2_s2_r1'

In [9]:
# Load the mapping from sample name to its MERSCOPE output path
sample_paths_json = os.path.join(data_dir, "sample_paths.json")
with open(sample_paths_json) as file:
    sample_paths = json.load(file)

## Create spatialdata object
Note: this overwrites any existing sdata object stored at the target path.

In [13]:
# Build the sdata object from the MERSCOPE output of this sample
merscope_dir = sample_paths[sample_name]
sdata = spatialdata_io.merscope(
    merscope_dir,
    z_layers=None,
    cells_boundaries=True,
    mosaic_images=False,
    slide_name=f"{cohort_name}_{slide_name}",
    region_name=region_name,
)

# Make sure the parent folder exists, then write, replacing any existing store
sdata_parent_dir = os.path.dirname(sdata_file)
os.makedirs(sdata_parent_dir, exist_ok=True)
sdata.write(sdata_file, overwrite=True)

[34mINFO    [0m The column [32m"global_x"[0m has now been renamed to [32m"x"[0m; the column [32m"x"[0m was already present in the dataframe,   
         and will be dropped.                                                                                      
[34mINFO    [0m The column [32m"global_y"[0m has now been renamed to [32m"y"[0m; the column [32m"y"[0m was already present in the dataframe,   
         and will be dropped.                                                                                      


  return method.__get__(obj, cls)(*args, **kwargs)


[34mINFO    [0m The Zarr backing store has been changed from [3;35mNone[0m the new file path:                                      
         [35m/dss/dsshome1/0C/ra98gaq/Git/st-bsb--finish-then-update/data/cellseg-benchmark/spatialdata-objects/foxf2_s[0m
         [35m2_r1/[0m[95msdata.zarr[0m                                                                                           


In [10]:
# Reuse the in-memory object if an earlier cell created it; otherwise load it from disk
if "sdata" not in globals():
    sdata = spatialdata.read_zarr(sdata_file)

In [11]:
sdata

SpatialData object, with associated Zarr store: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/samples/foxf2_s2_r1/results/Ficture/sdata.zarr
├── Points
│     ├── 'foxf2_s2_r1_all_transcript_factors': DataFrame with shape: (<Delayed>, 13) (2D points)
│     ├── 'foxf2_s2_r1_nF5_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF10_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF20_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF21_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF25_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF28_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF30_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF50_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D

In [12]:
sdata[f"{sample_name}_transcripts"].head(3)

Unnamed: 0.1,x,y,gene,global_z,transcript_id,fov,barcode_id,cell_id,Unnamed: 0
0,7136.4756,2356.3298,Igf2,0.0,ENSMUST00000000033,461,0,1865558700069100825,107
1,7138.1094,2360.3838,Cfp,0.0,ENSMUST00000001156,461,8,1865558700069100825,22
2,7136.9727,2356.9917,Slc3a2,0.0,ENSMUST00000010239,461,44,1865558700069100825,84


In [13]:
# Reduce inside dask so only the maximum, not the whole column, is brought into memory
sdata[f"{sample_name}_transcripts"]["fov"].max().compute()

np.int64(1302)

## Get pixel-level and transcript-level factors
- iterate over all available factors for the given sample
- add the pixel-level factors of each run to sdata as a separate "points" element
- extract the transcript-level factors, export them as one merged table, and add them to sdata as a "points" element

In [14]:
# One sub-folder per FICTURE run lives under <sample>/results/Ficture/output
ficture_runs_root = os.path.join(
    data_dir, "samples", sample_name, "results", "Ficture", "output"
)
all_factors_dir = os.listdir(ficture_runs_root)
all_factors_dir

['nF50.d_6',
 'nF28.d_6-bulkRNAseq-inclzonation',
 'nF10.d_6',
 'nF30.d_6',
 'nF5.d_6',
 'nF25.d_6',
 'nF21.d_6-bulkRNAseq-exclMBP',
 'nF20.d_6']

In [15]:
# Keep only the part of each folder name before the first "." (e.g. "nF50.d_6" -> "nF50")
all_factors = [folder_name.partition(".")[0] for folder_name in all_factors_dir]
all_factors

['nF50', 'nF28', 'nF10', 'nF30', 'nF5', 'nF25', 'nF21', 'nF20']

In [20]:
base_cols = ["x", "y", "global_z", "gene", "transcript_id"]

# Folder that holds one sub-folder per FICTURE run of this sample
ficture_output_dir = os.path.join(
    data_dir, "samples", sample_name, "results", "Ficture", "output"
)


def _pixel_factors_file(factor):
    """Return the path of the pixel-level factor TSV for ``factor`` (e.g. ``"nF20"``)."""
    # Match on the "<factor>." prefix because the folder name may carry a text label
    factor_folder = next(f for f in all_factors_dir if f.startswith(factor + "."))
    return os.path.join(
        ficture_output_dir,
        factor_folder,
        f"{factor}.d_6.prj_6.r_4_5.pixel.sorted.tsv.gz",
    )


# Start from the raw transcripts; every factor below is chained onto this lazy dataframe
all_transcript_factors = sdata[f"{sample_name}_transcripts"]

# Subset genes for test run
# test_geneset = ["Igf2", "Cfp"]
# all_transcript_factors = all_transcript_factors[all_transcript_factors["gene"].isin(test_geneset)]

# Process every factor the same way (no special case for the first one)
for factor in all_factors:
    # Parse and process pixel-level factors
    pixel_level_factors_file = _pixel_factors_file(factor)
    metadata = ficture_utils.parse_metadata(pixel_level_factors_file)
    df = ficture_utils.load_pixel_tsv(pixel_level_factors_file)
    df = ficture_utils.process_coordinates(df, metadata)

    # Add pixel-level factors to sdata and write the element to disk
    pixel_element_name = f"{sample_name}_{factor}_pixel_factors"
    dask_df = dd.from_pandas(df, npartitions=96)
    dask_df = PointsModel.parse(dask_df)
    sdata[pixel_element_name] = dask_df
    sdata.write_element(pixel_element_name, overwrite=True)

    # Create KDTree for this iteration's pixels
    coords = np.array([df["x"], df["y"]]).T
    tree = KDTree(
        coords, leafsize=10, compact_nodes=True, copy_data=False, balanced_tree=True
    )

    # Get transcript-level factors; the helper is expected to provide the
    # "<factor>_factors" column that is selected in the final cleanup below
    all_transcript_factors = all_transcript_factors.map_partitions(
        ficture_utils.get_transcript_level_factors,
        tree=tree,
        df=df,
        metadata=metadata,
        current_factor=factor,
    )

# Final cleanup: keep the base columns plus one factor column per run
result = all_transcript_factors[
    base_cols + [f"{factor}_factors" for factor in all_factors]
].reset_index(drop=True)

# Validate that each factor has >= 2 unique values (in a subset for speed)
sample = result.head(1000)
validation = {
    col: len(sample[col].unique()) > 1 for col in sample.columns if "_factors" in col
}
assert all(validation.values()), f"Invalid factors: {validation}"

Loading data: 9304it [00:57, 162.32it/s]                          
Loading data: 6378it [00:36, 175.11it/s]                          
Loading data: 9243it [00:51, 179.73it/s]                          
Loading data: 9261it [00:54, 170.10it/s]                          
Loading data: 9230it [00:50, 184.53it/s]                          
Loading data: 9224it [00:53, 171.14it/s]                          
Loading data: 6349it [00:36, 175.04it/s]                          
Loading data: 9229it [00:53, 172.20it/s]                          


In [21]:
result.head()

Unnamed: 0,x,y,global_z,gene,transcript_id,nF50_factors,nF28_factors,nF10_factors,nF30_factors,nF5_factors,nF25_factors,nF21_factors,nF20_factors
0,7136.4756,2356.3298,0.0,Igf2,ENSMUST00000000033,50,28,10,30,5,25,21,20
1,7138.1094,2360.3838,0.0,Cfp,ENSMUST00000001156,50,28,10,30,5,25,21,20
2,7136.9727,2356.9917,0.0,Slc3a2,ENSMUST00000010239,50,28,10,30,5,25,21,20
3,7135.136,2357.4858,0.0,Slc3a2,ENSMUST00000010239,50,28,10,30,5,25,21,20
4,7137.8364,2359.2458,0.0,Slc47a1,ENSMUST00000010267,50,28,10,30,5,25,21,20


In [22]:
# Deduplicate inside dask so only the unique values, not every row and column, are materialized
result[all_factors[0] + "_factors"].unique().compute().to_numpy()

array([50,  3, 30, 42, 37, 29,  4, 44, 21, 45, 20, 35, 41, 28, 38, 43, 47,
       48, 32, 18, 12, 33, 22, 39, 13, 25,  2, 10, 34, 27, 31, 49,  0, 17,
        1,  9,  5, 15, 14, 26, 46, 24, 11, 36, 40, 23, 16,  6, 19,  7,  8])

In [23]:
# Number of rows in the merged table; .compute() evaluates the lazy dataframe
result.index.size.compute()

np.int64(66676355)

In [24]:
# Export merged transcript-level factors as a single gzip-compressed CSV
analysis_dir = os.path.join(
    data_dir, "samples", sample_name, "results", "Ficture", "analysis"
)
# Create the output folder first, mirroring the makedirs call used before sdata.write
os.makedirs(analysis_dir, exist_ok=True)
result.to_csv(
    os.path.join(
        analysis_dir,
        # File name is prefixed with today's date (YYYYMMDD)
        date.today().strftime("%Y%m%d") + "_all_transcript_factors.csv.gz",
    ),
    compression="gzip",
    index=False,
    single_file=True,
)

['/dss/dsshome1/0C/ra98gaq/Git/st-bsb--finish-then-update/data/cellseg-benchmark/FICTURE/foxf2_s2_r1/analysis/20250207_all_transcript_factors.csv.gz']

In [25]:
sdata

SpatialData object, with associated Zarr store: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/spatialdata-objects/foxf2_s2_r1/sdata.zarr
├── Points
│     ├── 'foxf2_s2_r1_nF5_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF10_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF20_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF21_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF25_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF28_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF30_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF50_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     └── 'foxf2_s2_r1_transcripts': DataFrame with shape: (<Delayed>, 9) (2D points)
├── Sha

In [26]:
# Add merged transcript-level factors to sdata and write the element to disk
sdata[f"{sample_name}_all_transcript_factors"] = result
sdata.write_element(f"{sample_name}_all_transcript_factors", overwrite=True)

In [27]:
sdata

SpatialData object, with associated Zarr store: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/spatialdata-objects/foxf2_s2_r1/sdata.zarr
├── Points
│     ├── 'foxf2_s2_r1_all_transcript_factors': DataFrame with shape: (<Delayed>, 13) (2D points)
│     ├── 'foxf2_s2_r1_nF5_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF10_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF20_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF21_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF25_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF28_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF30_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF50_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D poi

## Add shapes from other segmentations

In [26]:
# Path of the sdata store under the Proseg results folder of the same sample
msdata_file = os.path.join(
    data_dir, "samples", sample_name, "results", "Proseg", "sdata.zarr"
)

In [27]:
# Load the Proseg sdata object only if it is not already in memory
if "msdata" not in globals():
    msdata = spatialdata.read_zarr(msdata_file)

In [28]:
msdata

SpatialData object, with associated Zarr store: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/samples/foxf2_s2_r1/results/Proseg/sdata.zarr
├── Shapes
│     ├── '2D_boundaries': GeoDataFrame shape: (87915, 2) (2D shapes)
│     └── '3D_boundaries': GeoDataFrame shape: (351599, 3) (2D shapes)
└── Tables
      └── 'table': AnnData (87915, 550)
with coordinate systems:
    ▸ 'global', with elements:
        2D_boundaries (Shapes), 3D_boundaries (Shapes)

In [29]:
msdata['2D_boundaries']

Unnamed: 0,cell,geometry
0,0,"MULTIPOLYGON (((30237.76019 28486.51151, 30237..."
1,1,"MULTIPOLYGON (((14070.9021 40857.20164, 14070...."
2,2,"MULTIPOLYGON (((18969.10825 24856.78806, 18969..."
3,3,"MULTIPOLYGON (((26635.86569 28477.25201, 26635..."
4,4,"MULTIPOLYGON (((48237.97333 17588.08167, 48237..."
...,...,...
87913,87913,"MULTIPOLYGON (((46367.58081 51625.9985, 46367...."
87914,87914,"MULTIPOLYGON (((59877.00003 35097.79351, 59877..."
87915,87915,"MULTIPOLYGON (((53182.47632 37310.81367, 53182..."
87916,87916,"MULTIPOLYGON (((26265.49094 39181.23239, 26265..."


In [30]:
# Attach the Proseg 2D boundaries to the FICTURE sdata under a prefixed name
proseg_boundaries = msdata["2D_boundaries"]
sdata["Proseg_2D_boundaries"] = proseg_boundaries

In [32]:
sdata

SpatialData object, with associated Zarr store: /dss/dssfs03/pn52re/pn52re-dss-0001/cellseg-benchmark/samples/foxf2_s2_r1/results/Ficture/sdata.zarr
├── Points
│     ├── 'foxf2_s2_r1_all_transcript_factors': DataFrame with shape: (<Delayed>, 13) (2D points)
│     ├── 'foxf2_s2_r1_nF5_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF10_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF20_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF21_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF25_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF28_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF30_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'foxf2_s2_r1_nF50_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D

In [33]:
# Pass overwrite=True like the other write_element calls in this notebook
sdata.write_element("Proseg_2D_boundaries", overwrite=True)

## Update sdata

In [None]:
# not needed

In [None]:
def update_element(sdata, element_name):
    """
    Rewrite a backed element of sdata on disk and return the reloaded sdata.

    Workaround for updating a backed element in sdata.
    Adapted from https://github.com/scverse/spatialdata/blob/main/tests/io/test_readwrite.py#L156

    The element is first written to a temporary backup named
    ``f"{element_name}_tmp"``, the original on-disk element is replaced from
    that backup, and the backup is removed again.

    Parameters
    ----------
    sdata
        Object with an associated store on disk (``sdata.path`` is read).
    element_name
        Name of the element in ``sdata`` to rewrite.

    Returns
    -------
    The object re-read from ``sdata.path`` after the rewrite. Use it in place
    of the object passed in, e.g. ``sdata = update_element(sdata, name)``:
    the passed-in object is not reloaded, and its element was assigned from
    the backup copy that is deleted at the end.
    """
    backup_name = f"{element_name}_tmp"
    name = element_name
    # a1. write a backup copy of the data
    sdata[backup_name] = sdata[name]
    sdata.write_element(backup_name)
    # a2. remove the in-memory copy from the SpatialData object (note,
    # at this point the backup copy still exists on-disk)
    del sdata[backup_name]
    del sdata[name]
    # a3. load the backup copy into memory
    sdata_copy = spatialdata.read_zarr(sdata.path)
    # b1. rewrite the original data
    sdata.delete_element_from_disk(name)
    sdata[name] = sdata_copy[backup_name]
    sdata.write_element(name)
    # b2. reload the new data into memory (because it has been written but in-memory it
    # still points to the backup location)
    sdata = spatialdata.read_zarr(sdata.path)
    # c. remove the backup copy
    del sdata[backup_name]
    sdata.delete_element_from_disk(backup_name)
    # Hand the reloaded object back; rebinding the local name alone does not reach the caller
    return sdata

In [None]:
# update_element(sdata, f"{sample_name}_all_transcript_factors")