# Combine MERFISH FICTURE results in spatialdata object
- create spatial data object
- add pixel level factors
- calculate transcript level factors

In [5]:
import spatialdata
import spatialdata_io
import pandas as pd
import gzip
import dask.dataframe as dd
from scipy.spatial import KDTree
import numpy as np
from spatialdata.models import PointsModel
import os

In [11]:
sdata_file = '/home/hspitzer/projects/cellseg_benchmark/data/processed_data/FOXF2/SLIDE2/REGION1/sdata.zarr'
slide_name = 'slide2'
region_name = 'region1'

## Function definitions

In [None]:
def update_element(sdata, element_name):
    """
    Workaround for updating a backed element in sdata.
    Adapted from https://github.com/scverse/spatialdata/blob/main/tests/io/test_readwrite.py#L156
    """
    new_name = f'{element_name}_tmp'
    name = element_name
    #a a. write a backup copy of the data
    sdata[new_name] = sdata[name]
    sdata.write_element(new_name)
    # a2. remove the in-memory copy from the SpatialData object (note,
    # at this point the backup copy still exists on-disk)
    del sdata[new_name]
    del sdata[name]
    # a3 load the backup copy into memory
    sdata_copy = spatialdata.read_zarr(sdata.path)
    # b1. rewrite the original data
    sdata.delete_element_from_disk(name)
    sdata[name] = sdata_copy[new_name]
    sdata.write_element(name)
    # b2. reload the new data into memory (because it has been written but in-memory it still points
    # from the backup location)
    sdata = spatialdata.read_zarr(sdata.path)
    # c. remove the backup copy
    del sdata[new_name]
    sdata.delete_element_from_disk(new_name)

## 1. Create spatial data object
Will overwrite existing objects

In [None]:
# needed for "1. Create spatial data object"
merfish_results = '/home/hspitzer/projects/cellseg_benchmark/data/merfish_output/20240322_mousebrain-Slide02-cp-WT-PCKO/region_1'

In [23]:
sdata = spatialdata_io.merscope(merfish_results, z_layers=None, cells_boundaries=True, mosaic_images=False, slide_name=slide_name, region_name=region_name)
# save sdata
os.makedirs(os.path.dirname(sdata_file), exist_ok=True)
sdata.write(sdata_file, overwrite=True)

[34mINFO    [0m The column [32m"global_x"[0m has now been renamed to [32m"x"[0m; the column [32m"x"[0m was already present in the dataframe,   
         and will be dropped.                                                                                      
[34mINFO    [0m The column [32m"global_y"[0m has now been renamed to [32m"y"[0m; the column [32m"y"[0m was already present in the dataframe,   
         and will be dropped.                                                                                      


  return method.__get__(obj, cls)(*args, **kwargs)


[34mINFO    [0m The Zarr backing store has been changed from [3;35mNone[0m the new file path:                                      
         [35m/home/hspitzer/projects/cellseg_benchmark/data/processed_data/FOXF2/SLIDE2/REGION1/[0m[95msdata.zarr[0m             


## 2. Add pixel level factors

In [45]:
# needed for "2. Add pixel level factors and 3. Calculate transcript level factors"
#ficture_name = 'nF25_d5'
#pixel_level_factors_file = '/home/hspitzer/projects/cellseg_benchmark/data/ficture_output/r1/analysis/nF25.d_5/nF25.d_5.prj_6.r_4_5.pixel.sorted.tsv.gz'

ficture_name = 'nF25_d6'
pixel_level_factors_file = '/home/hspitzer/projects/cellseg_benchmark/data/ficture_output/r1/analysis/nF25.d_6/nF25.d_6.prj_6.r_4_5.pixel.sorted.tsv.gz'

In [46]:
sdata = spatialdata.read_zarr(sdata_file)

In [47]:
# read metadata
metadata = {}
with gzip.open(pixel_level_factors_file,'rb') as f:
    i = 0
    for line in f:
        line = line.decode()
        if i < 3:
            # read metadata from file
            for s in line.strip().strip('#').split(';'):
                k,v = s.split('=')
                metadata[k] = v
        if i == 3:
            break
        i+=1 

metadata  

{'K': '25',
 'TOPK': '3',
 'BLOCK_SIZE': '2000',
 'BLOCK_AXIS': 'X',
 'INDEX_AXIS': 'Y',
 'OFFSET_X': '173.92',
 'OFFSET_Y': '-47.43',
 'SIZE_X': '7065',
 'SIZE_Y': '5849',
 'SCALE': '100'}

In [48]:
# read data
df = pd.read_table(gzip.open(pixel_level_factors_file, 'rb'), skiprows=3, header=0, engine='c')

# create um coords in df
scale = float(metadata['SCALE'])
offset_x = float(metadata['OFFSET_X'])
offset_y = float(metadata['OFFSET_Y'])

df['X_um'] = df['X'] / scale + offset_x
df['Y_um'] = df['Y'] / scale + offset_y

# sort for potentially more efficieny
df = df.sort_values(['X_um', 'Y_um'])

# rename for adding to spatialdata
df = df.rename(columns={'X_um':'x', 'Y_um':'y', 'Y':'Y_px', 'X':'X_px'})

In [49]:
# create dask dataframe
dask_df = dd.from_pandas(df, npartitions=96)
parsed = PointsModel.parse(dask_df)

In [50]:
parsed

Unnamed: 0_level_0,#BLOCK,X_px,Y_px,K1,K2,K3,P1,P2,P3,x,y
npartitions=96,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,int64,int64,int64,int64,int64,int64,float64,float64,float64,float64,float64
960757,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...
91271884,...,...,...,...,...,...,...,...,...,...,...
92232639,...,...,...,...,...,...,...,...,...,...,...


In [51]:
sdata[f'{slide_name}_{region_name}_{ficture_name}_pixel_factors'] = parsed

In [52]:
sdata

SpatialData object, with associated Zarr store: /home/hspitzer/projects/cellseg_benchmark/data/processed_data/FOXF2/SLIDE2/REGION1/sdata.zarr
├── Points
│     ├── 'slide2_region1_nF25_d5_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'slide2_region1_nF25_d6_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     └── 'slide2_region1_transcripts': DataFrame with shape: (<Delayed>, 10) (2D points)
├── Shapes
│     └── 'slide2_region1_polygons': GeoDataFrame shape: (107013, 9) (2D shapes)
└── Tables
      └── 'table': AnnData (107013, 500)
with coordinate systems:
    ▸ 'global', with elements:
        slide2_region1_nF25_d5_pixel_factors (Points), slide2_region1_nF25_d6_pixel_factors (Points), slide2_region1_transcripts (Points), slide2_region1_polygons (Shapes)
with the following elements not in the Zarr store:
    ▸ slide2_region1_nF25_d6_pixel_factors (Points)

In [53]:
# update sdata
sdata.write_element(f'{slide_name}_{region_name}_{ficture_name}_pixel_factors', overwrite=True)

## 3. Add transcript level factors

In [54]:
# set up KDTree for efficient querying
coords = np.array([df['x'], df['y']]).T
tree = KDTree(coords, leafsize=10, compact_nodes=True, copy_data=False, balanced_tree=True)

In [55]:
def get_transcript_level_factors(transcripts):
    # query tree to get nearest pixels and according factor assignment
    query = np.array([transcripts['x'], transcripts['y']]).T
    dd, ii = tree.query(query)
    # get factor prediction from df
    factor = np.array(df.iloc[ii]['K1'])
    # where distance > 5 um set factor to max_factor to indicate that this transcript was not mapped
    factor[dd > 5] = int(metadata['K'])
    kwargs = {f'{ficture_name}_factors': factor}
    return transcripts.assign(**kwargs)
    
# calculate transcript_level_factors
res = sdata[f'{slide_name}_{region_name}_transcripts'].map_partitions(get_transcript_level_factors)
# update transcripts table
sdata[f'{slide_name}_{region_name}_transcripts'] = res

  self._check_key(key, self.keys(), self._shared_keys)


In [56]:
# update sdata (and calculate transcript level factors)
update_element(sdata, f'{slide_name}_{region_name}_transcripts')

In [57]:
sdata

SpatialData object, with associated Zarr store: /home/hspitzer/projects/cellseg_benchmark/data/processed_data/FOXF2/SLIDE2/REGION1/sdata.zarr
├── Points
│     ├── 'slide2_region1_nF25_d5_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     ├── 'slide2_region1_nF25_d6_pixel_factors': DataFrame with shape: (<Delayed>, 11) (2D points)
│     └── 'slide2_region1_transcripts': DataFrame with shape: (<Delayed>, 11) (2D points)
├── Shapes
│     └── 'slide2_region1_polygons': GeoDataFrame shape: (107013, 9) (2D shapes)
└── Tables
      └── 'table': AnnData (107013, 500)
with coordinate systems:
    ▸ 'global', with elements:
        slide2_region1_nF25_d5_pixel_factors (Points), slide2_region1_nF25_d6_pixel_factors (Points), slide2_region1_transcripts (Points), slide2_region1_polygons (Shapes)
with the following Dask-backed elements not being self-contained:
    ▸ slide2_region1_transcripts: /home/hspitzer/projects/cellseg_benchmark/data/processed_data/FOXF2/SLIDE2/REGION1/sd

In [58]:
sdata[f'{slide_name}_{region_name}_transcripts']

Unnamed: 0_level_0,x,y,barcode_id,Unnamed: 0,cell_id,global_z,gene,fov,transcript_id,nF25_d5_factors,nF25_d6_factors
npartitions=96,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
,float64,float64,int64,int64,int64,float64,category[known],int64,string,int64,int64
,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...


## Read images that were downloaded later