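# Run FlowSOM for pixel and cell clustering

This notebook uses `harpy` to run FlowSOM pixel clustering on two fields of view (`fov0`, `fov1`) of the Pixie example dataset, then runs FlowSOM cell clustering on the resulting pixel clusters, and finally exports the results as ark-analysis-style tables.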

In [1]:
%load_ext autoreload
%autoreload 2

import harpy
from harpy.datasets import pixie_example
from harpy.table.cell_clustering._utils import _export_to_ark_format as _export_to_ark_format_cells
from harpy.table.pixel_clustering._cluster_intensity import _export_to_ark_format as _export_to_ark_format_pixels
from harpy.utils._keys import ClusteringKey



## Load example dataset

In [2]:
# Load the Pixie example dataset, restricted to the two fields of view "fov0" and "fov1".
# The returned SpatialData object is the single container that every later cell
# reads from and writes new layers/tables into.
sdata_ark_analysis = pixie_example(["fov0", "fov1"])
# Bare last expression: display the object's summary as the cell output.
sdata_ark_analysis

  from .autonotebook import tqdm as notebook_tqdm
2024-12-10 09:46:33,401 - harpy.image._manager - INFO - Writing results to layer 'raw_image_fov0'
2024-12-10 09:46:33,408 - harpy.image._manager - INFO - Writing results to layer 'label_nuclear_fov0'
2024-12-10 09:46:33,414 - harpy.image._manager - INFO - Writing results to layer 'label_whole_fov0'


/Users/arnedf/.cache/huggingface/datasets/downloads/extracted/ed276a09a07145a5c25cd3c0a3fd99368fc2f3387300f55927c0b600c043de39/post_clustering


2024-12-10 09:46:33,539 - harpy.image._manager - INFO - Writing results to layer 'raw_image_fov1'
2024-12-10 09:46:33,546 - harpy.image._manager - INFO - Writing results to layer 'label_nuclear_fov1'
2024-12-10 09:46:33,553 - harpy.image._manager - INFO - Writing results to layer 'label_whole_fov1'
  adata.uns[cls.ATTRS_KEY] = attr


SpatialData object
├── Images
│     ├── 'raw_image_fov0': DataArray[cyx] (22, 512, 512)
│     └── 'raw_image_fov1': DataArray[cyx] (22, 1024, 1024)
├── Labels
│     ├── 'label_nuclear_fov0': DataArray[yx] (512, 512)
│     ├── 'label_nuclear_fov1': DataArray[yx] (1024, 1024)
│     ├── 'label_whole_fov0': DataArray[yx] (512, 512)
│     └── 'label_whole_fov1': DataArray[yx] (1024, 1024)
└── Tables
      └── 'table': AnnData (1414, 22)
with coordinate systems:
    ▸ 'fov0', with elements:
        raw_image_fov0 (Images), label_nuclear_fov0 (Labels), label_whole_fov0 (Labels)
    ▸ 'fov1', with elements:
        raw_image_fov1 (Images), label_nuclear_fov1 (Labels), label_whole_fov1 (Labels)

In [3]:
# Markers used for clustering: 16 of the 22 channels present in the raw images.
# Passed as `channels=` to harpy.im.pixel_clustering_preprocess in the next cell.
channels = [
    "CD3",
    "CD4",
    "CD8",
    "CD14",
    "CD20",
    "CD31",
    "CD45",
    "CD68",
    "CD163",
    "CK17",
    "Collagen1",
    "Fibronectin",
    "ECAD",
    "HLADR",
    "SMA",
    "Vim",
]

In [4]:
# Preprocess both raw images for pixel clustering. Only the markers listed in
# `channels` are kept, and one new image layer is written per field of view
# ("raw_image_fov0_processed", "raw_image_fov1_processed").
sdata_ark_analysis = harpy.im.pixel_clustering_preprocess(
    sdata_ark_analysis,
    img_layer=["raw_image_fov0", "raw_image_fov1"],
    output_layer=["raw_image_fov0_processed", "raw_image_fov1_processed"],
    channels=channels,
    # NOTE(review): presumably the chunk size used for the output arrays — confirm in the harpy docs.
    chunks=2048,
    # Lets the cell be re-run when the output layers already exist (presumably; confirm).
    overwrite=True,
    # NOTE(review): presumably the sigma of a Gaussian blur applied before clustering — confirm.
    sigma=2.0,
)
# Display the updated object; the two "*_processed" image layers should now be listed.
sdata_ark_analysis

2024-12-10 09:46:34,839 - harpy.image._manager - INFO - Writing results to layer 'raw_image_fov0_processed'
2024-12-10 09:46:35,665 - harpy.image._manager - INFO - Writing results to layer 'raw_image_fov1_processed'


SpatialData object
├── Images
│     ├── 'raw_image_fov0': DataArray[cyx] (22, 512, 512)
│     ├── 'raw_image_fov0_processed': DataArray[cyx] (16, 512, 512)
│     ├── 'raw_image_fov1': DataArray[cyx] (22, 1024, 1024)
│     └── 'raw_image_fov1_processed': DataArray[cyx] (16, 1024, 1024)
├── Labels
│     ├── 'label_nuclear_fov0': DataArray[yx] (512, 512)
│     ├── 'label_nuclear_fov1': DataArray[yx] (1024, 1024)
│     ├── 'label_whole_fov0': DataArray[yx] (512, 512)
│     └── 'label_whole_fov1': DataArray[yx] (1024, 1024)
└── Tables
      └── 'table': AnnData (1414, 22)
with coordinate systems:
    ▸ 'fov0', with elements:
        raw_image_fov0 (Images), raw_image_fov0_processed (Images), label_nuclear_fov0 (Labels), label_whole_fov0 (Labels)
    ▸ 'fov1', with elements:
        raw_image_fov1 (Images), raw_image_fov1_processed (Images), label_nuclear_fov1 (Labels), label_whole_fov1 (Labels)

In [5]:
# Pixel clustering: run FlowSOM on the preprocessed images of both fields of view.
# Returns the updated SpatialData object, the fitted FlowSOM object (`fsom`) and
# `mapping`, which is passed on to harpy.tb.cluster_intensity in the next cell.
sdata_ark_analysis, fsom, mapping = harpy.im.flowsom(
    sdata_ark_analysis,
    img_layer=["raw_image_fov0_processed", "raw_image_fov1_processed"],
    # Both `output_layer_clusters` and `output_layer_metaclusters` are required;
    # each is written to the SpatialData object as a labels layer (one per field of view).
    output_layer_clusters=[
        "raw_image_fov0_flowsom_clusters",
        "raw_image_fov1_flowsom_clusters",
    ],
    output_layer_metaclusters=["raw_image_fov0_flowsom_metaclusters", "raw_image_fov1_flowsom_metaclusters"],
    # NOTE(review): presumably the number of metaclusters requested — confirm in the harpy/FlowSOM docs.
    n_clusters=20,
    # Fixed seed so the clustering is reproducible across runs.
    random_state=111,
    chunks=512,
    overwrite=True,
)
# Display the updated object; four new labels layers should now be listed.
sdata_ark_analysis

[32m2024-12-10 09:46:35.835[0m | [34m[1mDEBUG   [0m | [36mflowsom.main[0m:[36m__init__[0m:[36m84[0m - [34m[1mReading input.[0m
[32m2024-12-10 09:46:35.836[0m | [34m[1mDEBUG   [0m | [36mflowsom.main[0m:[36m__init__[0m:[36m86[0m - [34m[1mFitting model: clustering and metaclustering.[0m
[32m2024-12-10 09:46:37.821[0m | [34m[1mDEBUG   [0m | [36mflowsom.main[0m:[36m__init__[0m:[36m88[0m - [34m[1mUpdating derived values.[0m
2024-12-10 09:46:38,618 - harpy.image._manager - INFO - Writing results to layer 'raw_image_fov0_flowsom_clusters'
2024-12-10 09:46:38,904 - harpy.image._manager - INFO - Writing results to layer 'raw_image_fov0_flowsom_metaclusters'
2024-12-10 09:46:39,650 - harpy.image._manager - INFO - Writing results to layer 'raw_image_fov1_flowsom_clusters'
2024-12-10 09:46:40,267 - harpy.image._manager - INFO - Writing results to layer 'raw_image_fov1_flowsom_metaclusters'


SpatialData object
├── Images
│     ├── 'raw_image_fov0': DataArray[cyx] (22, 512, 512)
│     ├── 'raw_image_fov0_processed': DataArray[cyx] (16, 512, 512)
│     ├── 'raw_image_fov1': DataArray[cyx] (22, 1024, 1024)
│     └── 'raw_image_fov1_processed': DataArray[cyx] (16, 1024, 1024)
├── Labels
│     ├── 'label_nuclear_fov0': DataArray[yx] (512, 512)
│     ├── 'label_nuclear_fov1': DataArray[yx] (1024, 1024)
│     ├── 'label_whole_fov0': DataArray[yx] (512, 512)
│     ├── 'label_whole_fov1': DataArray[yx] (1024, 1024)
│     ├── 'raw_image_fov0_flowsom_clusters': DataArray[yx] (512, 512)
│     ├── 'raw_image_fov0_flowsom_metaclusters': DataArray[yx] (512, 512)
│     ├── 'raw_image_fov1_flowsom_clusters': DataArray[yx] (1024, 1024)
│     └── 'raw_image_fov1_flowsom_metaclusters': DataArray[yx] (1024, 1024)
└── Tables
      └── 'table': AnnData (1414, 22)
with coordinate systems:
    ▸ 'fov0', with elements:
        raw_image_fov0 (Images), raw_image_fov0_processed (Images), label_nuclea

In [6]:
# Summarise channel intensity per pixel cluster: combines the preprocessed images
# with the FlowSOM cluster labels layers and stores the result as the table
# "counts_clusters" (one row per pixel cluster, one column per channel).
sdata_ark_analysis = harpy.tb.cluster_intensity(
    sdata_ark_analysis,
    # `mapping` is the third value returned by harpy.im.flowsom above.
    # NOTE(review): presumably the cluster -> metacluster assignment — confirm.
    mapping=mapping,
    img_layer=["raw_image_fov0_processed", "raw_image_fov1_processed"],
    labels_layer=["raw_image_fov0_flowsom_clusters", "raw_image_fov1_flowsom_clusters"],
    # One coordinate system per image/labels pair, given in the same order as the layers.
    to_coordinate_system=[ "fov0", "fov1" ],
    output_layer="counts_clusters",
    overwrite=True,
)
# Display the updated object; the "counts_clusters" table should now be listed.
sdata_ark_analysis

  adata.obsm[_SPATIAL] = coordinates
  adata.obsm[_SPATIAL] = coordinates
  self._check_key(key, self.keys(), self._shared_keys)
2024-12-10 09:46:42,395 - harpy.table._preprocess - INFO - Calculating cell size from provided labels_layer 'raw_image_fov0_flowsom_clusters'
2024-12-10 09:46:42,412 - harpy.table._preprocess - INFO - Calculating cell size from provided labels_layer 'raw_image_fov1_flowsom_clusters'
  return convert_region_column_to_categorical(adata)
  self._check_key(key, self.keys(), self._shared_keys)
  self._check_key(key, self.keys(), self._shared_keys)


SpatialData object
├── Images
│     ├── 'raw_image_fov0': DataArray[cyx] (22, 512, 512)
│     ├── 'raw_image_fov0_processed': DataArray[cyx] (16, 512, 512)
│     ├── 'raw_image_fov1': DataArray[cyx] (22, 1024, 1024)
│     └── 'raw_image_fov1_processed': DataArray[cyx] (16, 1024, 1024)
├── Labels
│     ├── 'label_nuclear_fov0': DataArray[yx] (512, 512)
│     ├── 'label_nuclear_fov1': DataArray[yx] (1024, 1024)
│     ├── 'label_whole_fov0': DataArray[yx] (512, 512)
│     ├── 'label_whole_fov1': DataArray[yx] (1024, 1024)
│     ├── 'raw_image_fov0_flowsom_clusters': DataArray[yx] (512, 512)
│     ├── 'raw_image_fov0_flowsom_metaclusters': DataArray[yx] (512, 512)
│     ├── 'raw_image_fov1_flowsom_clusters': DataArray[yx] (1024, 1024)
│     └── 'raw_image_fov1_flowsom_metaclusters': DataArray[yx] (1024, 1024)
└── Tables
      ├── 'counts_clusters': AnnData (100, 16)
      └── 'table': AnnData (1414, 22)
with coordinate systems:
    ▸ 'fov0', with elements:
        raw_image_fov0 (Images), 

In [7]:
# Cell clustering: run FlowSOM on the cells defined by the whole-cell segmentation
# masks, using the pixel metacluster labels layers as input. The result is stored
# as the table "table_cell_clustering_flowsom".
# NOTE: this rebinds `fsom`, replacing the pixel-clustering FlowSOM object returned
# by harpy.im.flowsom above.
sdata_ark_analysis, fsom = harpy.tb.flowsom(
    sdata_ark_analysis,
    labels_layer_cells=["label_whole_fov0", "label_whole_fov1"],
    # NOTE(review): the original comment suggested "ark_pixel_som_cluster" as an
    # alternative input, but no layer with that name exists in this SpatialData
    # object; presumably the "raw_image_fov*_flowsom_clusters" layers are the
    # equivalent here — confirm.
    labels_layer_clusters=[
        "raw_image_fov0_flowsom_metaclusters",
        "raw_image_fov1_flowsom_metaclusters",
    ],
    output_layer="table_cell_clustering_flowsom",
    chunks=512,
    overwrite=True,
    # Fixed seed so the cell clustering is reproducible across runs.
    random_state=100,
)
# Display the updated object; the new cell-clustering table should now be listed.
sdata_ark_analysis

2024-12-10 09:46:42,627 - harpy.table._preprocess - INFO - Calculating cell size from provided labels_layer 'label_whole_fov0'
2024-12-10 09:46:42,643 - harpy.table._preprocess - INFO - Calculating cell size from provided labels_layer 'label_whole_fov1'
  return convert_region_column_to_categorical(adata)
  self._check_key(key, self.keys(), self._shared_keys)
[32m2024-12-10 09:46:42.688[0m | [34m[1mDEBUG   [0m | [36mflowsom.main[0m:[36m__init__[0m:[36m84[0m - [34m[1mReading input.[0m
[32m2024-12-10 09:46:42.689[0m | [34m[1mDEBUG   [0m | [36mflowsom.main[0m:[36m__init__[0m:[36m86[0m - [34m[1mFitting model: clustering and metaclustering.[0m
[32m2024-12-10 09:46:42.711[0m | [34m[1mDEBUG   [0m | [36mflowsom.main[0m:[36m__init__[0m:[36m88[0m - [34m[1mUpdating derived values.[0m
2024-12-10 09:46:42,840 - harpy.table.cell_clustering._clustering - INFO - Adding mean cluster intensity to '.uns['clustering']'
2024-12-10 09:46:42,853 - harpy.table.cell_cl

SpatialData object
├── Images
│     ├── 'raw_image_fov0': DataArray[cyx] (22, 512, 512)
│     ├── 'raw_image_fov0_processed': DataArray[cyx] (16, 512, 512)
│     ├── 'raw_image_fov1': DataArray[cyx] (22, 1024, 1024)
│     └── 'raw_image_fov1_processed': DataArray[cyx] (16, 1024, 1024)
├── Labels
│     ├── 'label_nuclear_fov0': DataArray[yx] (512, 512)
│     ├── 'label_nuclear_fov1': DataArray[yx] (1024, 1024)
│     ├── 'label_whole_fov0': DataArray[yx] (512, 512)
│     ├── 'label_whole_fov1': DataArray[yx] (1024, 1024)
│     ├── 'raw_image_fov0_flowsom_clusters': DataArray[yx] (512, 512)
│     ├── 'raw_image_fov0_flowsom_metaclusters': DataArray[yx] (512, 512)
│     ├── 'raw_image_fov1_flowsom_clusters': DataArray[yx] (1024, 1024)
│     └── 'raw_image_fov1_flowsom_metaclusters': DataArray[yx] (1024, 1024)
└── Tables
      ├── 'counts_clusters': AnnData (100, 16)
      ├── 'table': AnnData (1414, 22)
      └── 'table_cell_clustering_flowsom': AnnData (1409, 20)
with coordinate systems:


In [8]:
# Weighted channel average per cell, for visualisation. It is computed from the
# FlowSOM cell-clustering table together with the per-pixel-cluster intensity table.
# `output_layer` equals the input cell-clustering table and `overwrite=True`, so the
# results are added to "table_cell_clustering_flowsom" itself rather than to a new table.
sdata_ark_analysis = harpy.tb.weighted_channel_expression(
    sdata_ark_analysis,
    table_layer_cell_clustering="table_cell_clustering_flowsom",
    table_layer_pixel_cluster_intensity="counts_clusters",
    output_layer="table_cell_clustering_flowsom",
    clustering_key=ClusteringKey._METACLUSTERING_KEY,
    overwrite=True,
)
# Display the updated object.
sdata_ark_analysis

2024-12-10 09:46:42,910 - harpy.table.cell_clustering._weighted_channel_expression - INFO - Adding mean over obtained cell clusters '(clustering)' of the average marker expression for each cell weighted by pixel cluster count to '.uns[ 'clustering_channels' ]' of table layer 'table_cell_clustering_flowsom'
2024-12-10 09:46:42,931 - harpy.table.cell_clustering._weighted_channel_expression - INFO - Adding mean over obtained cell clusters '(metaclustering)' of the average marker expression for each cell weighted by pixel cluster count to '.uns[ 'metaclustering_channels' ]' of table layer 'table_cell_clustering_flowsom'
2024-12-10 09:46:42,933 - harpy.table.cell_clustering._weighted_channel_expression - INFO - Adding average marker expression for each cell weighted by pixel cluster count to '.obs' of table layer 'table_cell_clustering_flowsom'
  self._check_key(key, self.keys(), self._shared_keys)


SpatialData object
├── Images
│     ├── 'raw_image_fov0': DataArray[cyx] (22, 512, 512)
│     ├── 'raw_image_fov0_processed': DataArray[cyx] (16, 512, 512)
│     ├── 'raw_image_fov1': DataArray[cyx] (22, 1024, 1024)
│     └── 'raw_image_fov1_processed': DataArray[cyx] (16, 1024, 1024)
├── Labels
│     ├── 'label_nuclear_fov0': DataArray[yx] (512, 512)
│     ├── 'label_nuclear_fov1': DataArray[yx] (1024, 1024)
│     ├── 'label_whole_fov0': DataArray[yx] (512, 512)
│     ├── 'label_whole_fov1': DataArray[yx] (1024, 1024)
│     ├── 'raw_image_fov0_flowsom_clusters': DataArray[yx] (512, 512)
│     ├── 'raw_image_fov0_flowsom_metaclusters': DataArray[yx] (512, 512)
│     ├── 'raw_image_fov1_flowsom_clusters': DataArray[yx] (1024, 1024)
│     └── 'raw_image_fov1_flowsom_metaclusters': DataArray[yx] (1024, 1024)
└── Tables
      ├── 'counts_clusters': AnnData (100, 16)
      ├── 'table': AnnData (1414, 22)
      └── 'table_cell_clustering_flowsom': AnnData (1409, 20)
with coordinate systems:


In [9]:
# Export the clustering results as ark-analysis-style DataFrames.
# NOTE: both helpers are private (underscore-prefixed) harpy functions, so their
# signatures may change between harpy versions.
# NOTE(review): `output=None` presumably means nothing is written to disk and the
# DataFrames are only returned — confirm.

# Pixel level: one row per pixel cluster. `df` is a generic name; consider a more
# descriptive one if this cell is extended.
df = _export_to_ark_format_pixels(adata=sdata_ark_analysis["counts_clusters"], output=None)
# Cell level: three DataFrames, unpacked in the order returned by the helper.
(
    df_cell_som_cluster_count_avg,
    df_cell_som_cluster_channel_avg,
    df_cell_meta_cluster_channel_avg,
) = _export_to_ark_format_cells(sdata_ark_analysis, table_layer="table_cell_clustering_flowsom", output=None)
# Display the pixel-level table as the cell output.
df



channels,CD3,CD4,CD8,CD14,CD20,CD31,CD45,CD68,CD163,CK17,Collagen1,Fibronectin,ECAD,HLADR,SMA,Vim,pixel_meta_cluster,pixel_som_cluster,count
cells,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1_counts_clusters_fe90513d,37.843970,2.887842,97.043740,5.178658,1.315880,0.589464,30.252512,1.639205,2.432772,2.077082,7.781796,4.139287,1.638660,2.509107,2.552425,7.210590,2,1,9930
2_counts_clusters_fe90513d,38.690112,9.945175,49.737972,5.014744,8.713191,1.219908,56.096997,2.077967,4.129621,2.686037,5.455420,5.524063,2.271283,3.385231,0.894532,8.693713,1,2,12134
3_counts_clusters_fe90513d,73.091492,17.922896,18.220559,4.631251,4.585831,1.025564,52.106125,2.153796,4.097220,2.331725,7.267913,6.139904,2.960410,2.698596,1.826165,9.665717,1,3,5392
4_counts_clusters_fe90513d,76.282287,61.344410,2.660174,3.988683,3.950231,0.583871,39.929262,1.899014,2.709320,1.705489,7.024916,4.346849,2.103179,2.492270,1.044199,5.976883,9,4,12617
5_counts_clusters_fe90513d,47.305260,59.382376,3.534356,5.237534,8.039006,0.985423,55.497515,2.064406,4.395945,2.033076,4.772678,4.934692,2.381851,4.444559,0.383196,5.731354,9,5,16815
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96_counts_clusters_fe90513d,3.381922,10.202954,2.722567,19.882297,2.628480,1.435061,12.535290,6.423287,7.808438,1.716372,37.751931,11.812116,2.634270,31.108982,6.185559,9.853439,5,96,11251
97_counts_clusters_fe90513d,2.470278,8.337522,2.250204,45.239891,0.850247,0.568664,8.317586,3.651513,7.229076,1.053102,55.057834,11.486410,1.136371,6.642796,3.817631,4.767680,5,97,13194
98_counts_clusters_fe90513d,3.030046,12.170709,2.506028,59.447868,1.032256,0.860999,11.455320,4.825058,9.948482,1.224571,33.234845,12.156787,1.475615,9.946127,4.933891,6.348947,5,98,15672
99_counts_clusters_fe90513d,3.106052,14.744514,2.634652,77.775664,1.103696,0.984863,14.039679,5.912146,12.283513,1.081740,13.259535,10.251130,1.735470,12.797073,6.092375,8.267192,5,99,16393


In [10]:
# Display the cell-level export with one row per cell metacluster and one column per channel.
df_cell_meta_cluster_channel_avg

channels,cell_meta_cluster,CD3,CD4,CD8,CD14,CD20,CD31,CD45,CD68,CD163,CK17,Collagen1,Fibronectin,ECAD,HLADR,SMA,Vim,cell_meta_cluster_rename
0,1,4.732977,12.334994,3.82658,27.80726,2.461496,1.329181,10.076252,33.944057,17.968353,1.629934,16.61867,9.723362,2.770627,12.875719,5.481965,15.619396,1
1,2,7.075747,16.851241,4.782487,23.933335,4.768343,1.419091,18.424167,8.391063,9.533554,1.756295,15.256073,10.150131,3.817615,33.035568,5.489962,15.135321,2
2,3,5.31087,6.326648,4.552025,6.782602,1.889759,1.057797,7.021682,3.08574,4.636659,1.31156,10.039122,7.425196,4.031896,3.593263,3.935879,7.82934,3
3,4,4.588128,7.504651,3.670254,12.055852,4.338342,13.443183,9.397056,4.727704,6.856803,2.037149,21.240715,24.963244,3.788241,5.20855,9.801442,20.021242,4
4,5,4.078594,5.858771,3.888499,9.472374,2.372011,1.703166,6.518079,3.563198,5.230883,1.530075,38.030225,15.434641,3.098016,3.948902,7.7597,12.790385,5
5,6,18.033245,20.708114,8.375777,10.894179,22.905814,2.071226,36.052011,4.270251,8.48945,2.859756,11.291614,9.799747,4.532029,10.463147,3.164878,12.215809,6
6,7,35.246133,45.755326,5.143688,8.575204,12.245368,1.517205,40.691112,3.552398,7.167313,2.519683,8.262509,7.113883,3.989649,6.796181,1.706669,9.4814,7
7,8,22.038232,6.726384,48.541199,12.794302,2.148181,1.414991,22.117769,4.839054,6.926617,2.439495,17.799409,11.264155,2.942548,5.373507,5.779891,13.53331,8
8,9,19.287928,16.044287,13.400287,12.406111,8.926173,2.135102,31.626518,4.361473,9.713586,4.407114,13.297999,12.033864,6.729911,8.253084,5.336903,13.898886,9
9,10,10.383163,15.077001,4.677668,9.39819,39.115224,1.869134,37.112992,3.799953,7.84245,2.654522,12.000378,8.996242,3.813376,11.426639,2.557836,11.440348,10


In [11]:
# "table_cell_clustering_flowsom" is annotated by segmentation masks, so they can be visualised using napari-spatialdata
sdata_ark_analysis[ "table_cell_clustering_flowsom" ].uns[ "spatialdata_attrs" ]

# Optional: uncomment the two lines below to explore the result interactively in
# napari (requires the napari-spatialdata plugin to be installed).
# from napari_spatialdata import Interactive
# Interactive(sdata_ark_analysis)

{'region': ['label_whole_fov0', 'label_whole_fov1'],
 'region_key': 'fov_labels',
 'instance_key': 'cell_ID'}