From 8df352159ee494387f8479d752ed1bf0bd314c12 Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Wed, 19 Apr 2023 14:48:07 +0200 Subject: [PATCH 01/17] merfish reader v0 (handle vpt outputs) --- src/spatialdata_io/__init__.py | 2 + src/spatialdata_io/_constants/_constants.py | 26 ++++ src/spatialdata_io/readers/merfish.py | 126 ++++++++++++++++++++ 3 files changed, 154 insertions(+) create mode 100644 src/spatialdata_io/readers/merfish.py diff --git a/src/spatialdata_io/__init__.py b/src/spatialdata_io/__init__.py index 297c4435..739ca41c 100644 --- a/src/spatialdata_io/__init__.py +++ b/src/spatialdata_io/__init__.py @@ -2,6 +2,7 @@ from spatialdata_io.readers.cosmx import cosmx from spatialdata_io.readers.mcmicro import mcmicro +from spatialdata_io.readers.merfish import merfish from spatialdata_io.readers.steinbock import steinbock from spatialdata_io.readers.visium import visium from spatialdata_io.readers.xenium import xenium @@ -12,6 +13,7 @@ "cosmx", "mcmicro", "steinbock", + "merfish", ] __version__ = version("spatialdata-io") diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index aee085d5..ba3d6cd4 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -120,3 +120,29 @@ class McmicroKeys(ModeEnum): COORDS_X = "X_centroid" COORDS_Y = "Y_centroid" INSTANCE_KEY = "CellID" + + +@unique +class MerfishKeys(ModeEnum): + """Keys for *MERFISH* data (*MERSCOPE* plateform from Vizgen)""" + + # files and directories + IMAGES_DIR = "images" + TRANSFORMATION_FILE = "micron_to_mosaic_pixel_transform.csv" + TRANSCRIPTS_FILE = "detected_transcripts.csv" + BOUNDARIES_FILE = "cell_boundaries.parquet" + COUNTS_FILE = "cell_by_gene.csv" + CELL_METADATA_FILE = "cell_metadata.csv" + + # VPT default outputs + CELLPOSE_BOUNDARIES = "cellpose_micron_space.parquet" + WATERSHED_BOUNDARIES = "watershed_micron_space.parquet" + + # metadata + INSTANCE_KEY = "EntityID" + COUNTS_CELL_KEY = "cell" + CELL_X = "center_x" + CELL_Y = "center_y" + GLOBAL_X = "global_x" + GLOBAL_Y = "global_y" + GLOBAL_Z = "global_z" diff --git a/src/spatialdata_io/readers/merfish.py b/src/spatialdata_io/readers/merfish.py new file mode 100644 index 00000000..84251137 --- /dev/null +++ b/src/spatialdata_io/readers/merfish.py @@ -0,0 +1,126 @@ +import re +from pathlib import Path +from typing import Optional, Tuple, Union + +import anndata +import geopandas +import numpy as np +import pandas as pd +from dask import array as da +from dask.dataframe import read_csv +from dask_image.imread import imread +from spatialdata import SpatialData +from spatialdata.models import Image2DModel, PointsModel, ShapesModel, TableModel +from spatialdata.transformations import Affine, Identity + +from spatialdata_io._constants._constants import MerfishKeys +from spatialdata_io._docs import inject_docs + + +def _scan_images(images_dir: Path) -> Tuple[set]: + exp = r"mosaic_(?P[\w|-]+[0-9]?)_z(?P[0-9]+).tif" + matches = [re.search(exp, file.name) for file in images_dir.iterdir()] + + stains = set(match.group("stain") for match in matches if match) + z_levels = set(match.group("z") for match in matches if match) + + return stains, z_levels + + +def _get_file_paths(path: Path, vpt_outputs: Optional[Union[Path, str, dict]]): + """Gets the file paths to (i) the file of transcript per cell, (ii) the cell metadata file, and (iii) the cell boundary file""" + if vpt_outputs is None: + return path / MerfishKeys.COUNTS_FILE, path / MerfishKeys.CELL_METADATA_FILE, path / MerfishKeys.BOUNDARIES_FILE + + if isinstance(vpt_outputs, str) or isinstance(vpt_outputs, Path): + vpt_outputs = Path(vpt_outputs) + + plausible_boundaries = [ + vpt_outputs / MerfishKeys.CELLPOSE_BOUNDARIES, + vpt_outputs / MerfishKeys.WATERSHED_BOUNDARIES, + ] + valid_boundaries = [path for path in plausible_boundaries if path.exists()] + + assert ( + not valid_boundaries + ), f"Boundary file not found - expected to find one of these files: {', '.join(plausible_boundaries)}" + + return vpt_outputs / MerfishKeys.COUNTS_FILE, vpt_outputs / MerfishKeys.CELL_METADATA_FILE, valid_boundaries[0] + + if isinstance(vpt_outputs, dict): + raise NotImplementedError() + + raise ValueError(f"'vpt_outputs' has to be either None, a str, a Path, or a dict. Found type {type(vpt_outputs)}.") + + +@inject_docs(mf=MerfishKeys) +def merfish(path: Union[str, Path], vpt_outputs: Optional[Union[Path, str, dict]]) -> SpatialData: + """ + Read *MERFISH* data from Vizgen. + + Parameters + ---------- + path + Path to the root directory containing the *Merfish* files (e.g., `detected_transcripts.csv`). + vpt_outputs + Optional arguments to indicate the output of the vizgen-postprocessing-tool (VPT), when used. If a folder path is provided, it looks inside the folder for the following files: ``{mf.COUNTS_FILE!r}``, ``{mf.CELL_METADATA_FILE!r}``, and a boundary parquet file. If a dictionnary, then the following keys can be provided: `cell_by_gene`, `cell_metadata`, `boundaries` with the desired path as the value. + + Returns + ------- + :class:`spatialdata.SpatialData` + """ + path = Path(path) + count_path, obs_path, boundaries_path = _get_file_paths(path, vpt_outputs) + images_dir = path / MerfishKeys.IMAGES_DIR + + microns_to_pixels = np.genfromtxt(images_dir / MerfishKeys.TRANSFORMATION_FILE) + microns_to_pixels = Affine(microns_to_pixels, input_axes=("x", "y"), output_axes=("x", "y")) + + ### Images + images = {} + + stains, z_levels = _scan_images(images_dir) + for z in z_levels: + im = da.stack([imread(images_dir / f"mosaic_{stain}_z{z}.tif").squeeze() for stain in stains], axis=0) + parsed_im = Image2DModel.parse( + im, dims=("c", "y", "x"), transformations={"pixels": Identity(), "microns": microns_to_pixels.inverse()} + ) + images[f"z{z}"] = parsed_im + + ### Transcripts + transcript_df = read_csv(path / MerfishKeys.TRANSCRIPTS_FILE) + transcripts = PointsModel.parse( + transcript_df, + coordinates={"x": MerfishKeys.GLOBAL_X, "y": MerfishKeys.GLOBAL_Y, "z": MerfishKeys.GLOBAL_Z}, + transformations={"microns": Identity(), "pixels": microns_to_pixels}, + ) + points = {"transcripts": transcripts} + + ### Polygons + geo_df = geopandas.read_parquet(boundaries_path) + geo_df = geo_df.rename_geometry("geometry") + geo_df.index = geo_df[MerfishKeys.INSTANCE_KEY].astype(str) + + polygons = ShapesModel.parse(geo_df, transformations={"microns": Identity(), "pixels": microns_to_pixels}) + shapes = {"polygons": polygons} + + ### Table + data = pd.read_csv(count_path, index_col=0, dtype={MerfishKeys.COUNTS_CELL_KEY: str}) + obs = pd.read_csv(obs_path, index_col=0, dtype={MerfishKeys.INSTANCE_KEY: str}) + + is_gene = ~data.columns.str.lower().str.contains("blank") + adata = anndata.AnnData(data.loc[:, is_gene], dtype=data.values.dtype, obs=obs) + + adata.obsm["blank"] = data.loc[:, ~is_gene] # blank fields are excluded from adata.X + adata.obsm["spatial"] = adata.obs[[MerfishKeys.CELL_X, MerfishKeys.CELL_Y]].values + adata.obs["region"] = pd.Series(path.stem, index=adata.obs_names, dtype="category") + adata.obs[MerfishKeys.INSTANCE_KEY] = adata.obs.index + + table = TableModel.parse( + adata, + region_key="region", + region=adata.obs["region"].cat.categories.tolist(), + instance_key=MerfishKeys.INSTANCE_KEY, + ) + + return SpatialData(table=table, shapes=shapes, points=points, images=images) From 713b37c7b2d4f2e03ce7e9b1131ef1354090021f Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Wed, 19 Apr 2023 16:49:43 +0200 Subject: [PATCH 02/17] merfish reader: minor fixes (assert and KEY.value) --- src/spatialdata_io/readers/merfish.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/spatialdata_io/readers/merfish.py b/src/spatialdata_io/readers/merfish.py index 84251137..465236b6 100644 --- a/src/spatialdata_io/readers/merfish.py +++ b/src/spatialdata_io/readers/merfish.py @@ -42,7 +42,7 @@ def _get_file_paths(path: Path, vpt_outputs: Optional[Union[Path, str, dict]]): valid_boundaries = [path for path in plausible_boundaries if path.exists()] assert ( - not valid_boundaries + valid_boundaries ), f"Boundary file not found - expected to find one of these files: {', '.join(plausible_boundaries)}" return vpt_outputs / MerfishKeys.COUNTS_FILE, vpt_outputs / MerfishKeys.CELL_METADATA_FILE, valid_boundaries[0] @@ -120,7 +120,7 @@ def merfish(path: Union[str, Path], vpt_outputs: Optional[Union[Path, str, dict] adata, region_key="region", region=adata.obs["region"].cat.categories.tolist(), - instance_key=MerfishKeys.INSTANCE_KEY, + instance_key=MerfishKeys.INSTANCE_KEY.value, ) return SpatialData(table=table, shapes=shapes, points=points, images=images) From 511412d41e167a998d0f06b0f08121b16b18f35f Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Thu, 20 Apr 2023 11:04:39 +0200 Subject: [PATCH 03/17] add staining channel names to merfish reader --- src/spatialdata_io/readers/merfish.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/spatialdata_io/readers/merfish.py b/src/spatialdata_io/readers/merfish.py index 465236b6..76a6fb1d 100644 --- a/src/spatialdata_io/readers/merfish.py +++ b/src/spatialdata_io/readers/merfish.py @@ -1,6 +1,6 @@ import re from pathlib import Path -from typing import Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import anndata import geopandas @@ -17,14 +17,15 @@ from spatialdata_io._docs import inject_docs -def _scan_images(images_dir: Path) -> Tuple[set]: +def _scan_images(images_dir: Path) -> Tuple[List]: + """Searches inside the image directory and get all the different channels (stainings) and all the z-levels (usually 0...6)""" exp = r"mosaic_(?P[\w|-]+[0-9]?)_z(?P[0-9]+).tif" matches = [re.search(exp, file.name) for file in images_dir.iterdir()] - stains = set(match.group("stain") for match in matches if match) + stainings = set(match.group("stain") for match in matches if match) z_levels = set(match.group("z") for match in matches if match) - return stains, z_levels + return list(stainings), list(z_levels) def _get_file_paths(path: Path, vpt_outputs: Optional[Union[Path, str, dict]]): @@ -43,7 +44,7 @@ def _get_file_paths(path: Path, vpt_outputs: Optional[Union[Path, str, dict]]): assert ( valid_boundaries - ), f"Boundary file not found - expected to find one of these files: {', '.join(plausible_boundaries)}" + ), f"Boundary file not found - expected to find one of these files: {', '.join(map(str, plausible_boundaries))}" return vpt_outputs / MerfishKeys.COUNTS_FILE, vpt_outputs / MerfishKeys.CELL_METADATA_FILE, valid_boundaries[0] @@ -79,11 +80,14 @@ def merfish(path: Union[str, Path], vpt_outputs: Optional[Union[Path, str, dict] ### Images images = {} - stains, z_levels = _scan_images(images_dir) + stainings, z_levels = _scan_images(images_dir) for z in z_levels: - im = da.stack([imread(images_dir / f"mosaic_{stain}_z{z}.tif").squeeze() for stain in stains], axis=0) + im = da.stack([imread(images_dir / f"mosaic_{stain}_z{z}.tif").squeeze() for stain in stainings], axis=0) parsed_im = Image2DModel.parse( - im, dims=("c", "y", "x"), transformations={"pixels": Identity(), "microns": microns_to_pixels.inverse()} + im, + dims=("c", "y", "x"), + transformations={"pixels": Identity(), "microns": microns_to_pixels.inverse()}, + c_coords=stainings, ) images[f"z{z}"] = parsed_im @@ -123,4 +127,4 @@ def merfish(path: Union[str, Path], vpt_outputs: Optional[Union[Path, str, dict] instance_key=MerfishKeys.INSTANCE_KEY.value, ) - return SpatialData(table=table, shapes=shapes, points=points, images=images) + return SpatialData(shapes=shapes, points=points, images=images, table=table) From 71a958e43a3fb50e003906e53f934fc4ed053212 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 20 Apr 2023 09:21:41 +0000 Subject: [PATCH 04/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/spatialdata_io/readers/merfish.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/spatialdata_io/readers/merfish.py b/src/spatialdata_io/readers/merfish.py index 76a6fb1d..79948e02 100644 --- a/src/spatialdata_io/readers/merfish.py +++ b/src/spatialdata_io/readers/merfish.py @@ -17,13 +17,13 @@ from spatialdata_io._docs import inject_docs -def _scan_images(images_dir: Path) -> Tuple[List]: +def _scan_images(images_dir: Path) -> tuple[list]: """Searches inside the image directory and get all the different channels (stainings) and all the z-levels (usually 0...6)""" exp = r"mosaic_(?P[\w|-]+[0-9]?)_z(?P[0-9]+).tif" matches = [re.search(exp, file.name) for file in images_dir.iterdir()] - stainings = set(match.group("stain") for match in matches if match) - z_levels = set(match.group("z") for match in matches if match) + stainings = {match.group("stain") for match in matches if match} + z_levels = {match.group("z") for match in matches if match} return list(stainings), list(z_levels) From a0dc6ec3525c79347d4acbb5143a71e469df8a4b Mon Sep 17 00:00:00 2001 From: Luca Marconato <2664412+LucaMarconato@users.noreply.github.com> Date: Fri, 21 Apr 2023 20:49:58 +0200 Subject: [PATCH 05/17] points to categorical, splitting points by z value, debugging the transformations --- src/spatialdata_io/readers/merfish.py | 32 ++++++++++++++++++++------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/src/spatialdata_io/readers/merfish.py b/src/spatialdata_io/readers/merfish.py index 79948e02..5d8868d9 100644 --- a/src/spatialdata_io/readers/merfish.py +++ b/src/spatialdata_io/readers/merfish.py @@ -1,13 +1,13 @@ import re from pathlib import Path -from typing import List, Optional, Tuple, Union +from typing import Optional, Union import anndata import geopandas import numpy as np import pandas as pd from dask import array as da -from dask.dataframe import read_csv +import dask.dataframe as dd from dask_image.imread import imread from spatialdata import SpatialData from spatialdata.models import Image2DModel, PointsModel, ShapesModel, TableModel @@ -17,7 +17,7 @@ from spatialdata_io._docs import inject_docs -def _scan_images(images_dir: Path) -> tuple[list]: +def _scan_images(images_dir: Path) -> tuple[list, list]: """Searches inside the image directory and get all the different channels (stainings) and all the z-levels (usually 0...6)""" exp = r"mosaic_(?P[\w|-]+[0-9]?)_z(?P[0-9]+).tif" matches = [re.search(exp, file.name) for file in images_dir.iterdir()] @@ -86,26 +86,42 @@ def merfish(path: Union[str, Path], vpt_outputs: Optional[Union[Path, str, dict] parsed_im = Image2DModel.parse( im, dims=("c", "y", "x"), - transformations={"pixels": Identity(), "microns": microns_to_pixels.inverse()}, + transformations={"pixels": Identity()}, + # transformations={"pixels": Identity(), "microns": microns_to_pixels.inverse()}, c_coords=stainings, ) images[f"z{z}"] = parsed_im ### Transcripts - transcript_df = read_csv(path / MerfishKeys.TRANSCRIPTS_FILE) + transcript_df = dd.read_csv(path / MerfishKeys.TRANSCRIPTS_FILE) transcripts = PointsModel.parse( transcript_df, coordinates={"x": MerfishKeys.GLOBAL_X, "y": MerfishKeys.GLOBAL_Y, "z": MerfishKeys.GLOBAL_Z}, - transformations={"microns": Identity(), "pixels": microns_to_pixels}, + transformations={"pixels": Identity()}, + # transformations={"microns": Identity(), "pixels": microns_to_pixels}, ) - points = {"transcripts": transcripts} + # points = {"transcripts": transcripts} + points = {} + gene_categorical = dd.from_pandas(transcripts['gene'].compute().astype('category'), npartitions=transcripts.npartitions).reset_index(drop=True) + transcripts['gene'] = gene_categorical + + # split the transcripts into the different z-levels + z = transcripts['z'].compute() + z_levels = z.value_counts().index + z_levels = sorted(z_levels, key=lambda x: int(x)) + for z_level in z_levels: + transcripts_subset = transcripts[z == z_level] + # temporary solution until the 3D support is better developed + transcripts_subset = transcripts_subset.drop('z', axis=1) + points[f"transcripts_z{int(z_level)}"] = transcripts_subset ### Polygons geo_df = geopandas.read_parquet(boundaries_path) geo_df = geo_df.rename_geometry("geometry") geo_df.index = geo_df[MerfishKeys.INSTANCE_KEY].astype(str) - polygons = ShapesModel.parse(geo_df, transformations={"microns": Identity(), "pixels": microns_to_pixels}) + polygons = ShapesModel.parse(geo_df, transformations={"pixels": microns_to_pixels}) + # polygons = ShapesModel.parse(geo_df, transformations={"microns": Identity(), "pixels": microns_to_pixels}) shapes = {"polygons": polygons} ### Table From ca89463e4691429872ba32b2cc3190b3f9f09b89 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 21 Apr 2023 18:50:16 +0000 Subject: [PATCH 06/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/spatialdata_io/readers/merfish.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/spatialdata_io/readers/merfish.py b/src/spatialdata_io/readers/merfish.py index 5d8868d9..21c69485 100644 --- a/src/spatialdata_io/readers/merfish.py +++ b/src/spatialdata_io/readers/merfish.py @@ -3,11 +3,11 @@ from typing import Optional, Union import anndata +import dask.dataframe as dd import geopandas import numpy as np import pandas as pd from dask import array as da -import dask.dataframe as dd from dask_image.imread import imread from spatialdata import SpatialData from spatialdata.models import Image2DModel, PointsModel, ShapesModel, TableModel @@ -102,17 +102,19 @@ def merfish(path: Union[str, Path], vpt_outputs: Optional[Union[Path, str, dict] ) # points = {"transcripts": transcripts} points = {} - gene_categorical = dd.from_pandas(transcripts['gene'].compute().astype('category'), npartitions=transcripts.npartitions).reset_index(drop=True) - transcripts['gene'] = gene_categorical + gene_categorical = dd.from_pandas( + transcripts["gene"].compute().astype("category"), npartitions=transcripts.npartitions + ).reset_index(drop=True) + transcripts["gene"] = gene_categorical # split the transcripts into the different z-levels - z = transcripts['z'].compute() + z = transcripts["z"].compute() z_levels = z.value_counts().index z_levels = sorted(z_levels, key=lambda x: int(x)) for z_level in z_levels: transcripts_subset = transcripts[z == z_level] # temporary solution until the 3D support is better developed - transcripts_subset = transcripts_subset.drop('z', axis=1) + transcripts_subset = transcripts_subset.drop("z", axis=1) points[f"transcripts_z{int(z_level)}"] = transcripts_subset ### Polygons From 663a4066753213134b4ac31d6e16f2ea331fdd58 Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Thu, 11 May 2023 14:48:26 +0200 Subject: [PATCH 07/17] merfish reader support dict paths inputs --- src/spatialdata_io/_constants/_constants.py | 4 ++++ src/spatialdata_io/readers/merfish.py | 9 +++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index 8ec28f30..c9bb3e9b 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -138,6 +138,9 @@ class MerfishKeys(ModeEnum): # VPT default outputs CELLPOSE_BOUNDARIES = "cellpose_micron_space.parquet" WATERSHED_BOUNDARIES = "watershed_micron_space.parquet" + VPT_NAME_COUNTS = "cell_by_gene" + VPT_NAME_OBS = "cell_metadata" + VPT_NAME_BOUNDARIES = "cell_boundaries" # metadata INSTANCE_KEY = "EntityID" @@ -147,3 +150,4 @@ class MerfishKeys(ModeEnum): GLOBAL_X = "global_x" GLOBAL_Y = "global_y" GLOBAL_Z = "global_z" + Z_INDEX = "ZIndex" diff --git a/src/spatialdata_io/readers/merfish.py b/src/spatialdata_io/readers/merfish.py index 76a6fb1d..f9517898 100644 --- a/src/spatialdata_io/readers/merfish.py +++ b/src/spatialdata_io/readers/merfish.py @@ -49,7 +49,11 @@ def _get_file_paths(path: Path, vpt_outputs: Optional[Union[Path, str, dict]]): return vpt_outputs / MerfishKeys.COUNTS_FILE, vpt_outputs / MerfishKeys.CELL_METADATA_FILE, valid_boundaries[0] if isinstance(vpt_outputs, dict): - raise NotImplementedError() + return ( + vpt_outputs[MerfishKeys.VPT_NAME_COUNTS], + vpt_outputs[MerfishKeys.VPT_NAME_OBS], + vpt_outputs[MerfishKeys.VPT_NAME_BOUNDARIES], + ) raise ValueError(f"'vpt_outputs' has to be either None, a str, a Path, or a dict. Found type {type(vpt_outputs)}.") @@ -64,7 +68,7 @@ def merfish(path: Union[str, Path], vpt_outputs: Optional[Union[Path, str, dict] path Path to the root directory containing the *Merfish* files (e.g., `detected_transcripts.csv`). vpt_outputs - Optional arguments to indicate the output of the vizgen-postprocessing-tool (VPT), when used. If a folder path is provided, it looks inside the folder for the following files: ``{mf.COUNTS_FILE!r}``, ``{mf.CELL_METADATA_FILE!r}``, and a boundary parquet file. If a dictionnary, then the following keys can be provided: `cell_by_gene`, `cell_metadata`, `boundaries` with the desired path as the value. + Optional arguments to indicate the output of the vizgen-postprocessing-tool (VPT), when used. If a folder path is provided, it looks inside the folder for the following files: ``{mf.COUNTS_FILE!r}``, ``{mf.CELL_METADATA_FILE!r}``, and a boundary parquet file. If a dictionnary, then the following keys can be provided: ``{mf.VPT_NAME_COUNTS!r}``, ``{mf.VPT_NAME_OBS!r}``, ``{mf.VPT_NAME_BOUNDARIES!r}`` with the desired path as the value. Returns ------- @@ -103,6 +107,7 @@ def merfish(path: Union[str, Path], vpt_outputs: Optional[Union[Path, str, dict] ### Polygons geo_df = geopandas.read_parquet(boundaries_path) geo_df = geo_df.rename_geometry("geometry") + geo_df = geo_df[geo_df[MerfishKeys.Z_INDEX] == 0] # Avoid duplicate boundaries on all z-levels geo_df.index = geo_df[MerfishKeys.INSTANCE_KEY].astype(str) polygons = ShapesModel.parse(geo_df, transformations={"microns": Identity(), "pixels": microns_to_pixels}) From 8453fa7065520061b6ba74f7ad223b8b43699760 Mon Sep 17 00:00:00 2001 From: giovp Date: Fri, 9 Jun 2023 20:49:27 +0000 Subject: [PATCH 08/17] fix some (but not all) pre-commit checks --- src/spatialdata_io/readers/merfish.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/spatialdata_io/readers/merfish.py b/src/spatialdata_io/readers/merfish.py index 21c69485..fad275eb 100644 --- a/src/spatialdata_io/readers/merfish.py +++ b/src/spatialdata_io/readers/merfish.py @@ -77,7 +77,7 @@ def merfish(path: Union[str, Path], vpt_outputs: Optional[Union[Path, str, dict] microns_to_pixels = np.genfromtxt(images_dir / MerfishKeys.TRANSFORMATION_FILE) microns_to_pixels = Affine(microns_to_pixels, input_axes=("x", "y"), output_axes=("x", "y")) - ### Images + # Images images = {} stainings, z_levels = _scan_images(images_dir) @@ -92,7 +92,7 @@ def merfish(path: Union[str, Path], vpt_outputs: Optional[Union[Path, str, dict] ) images[f"z{z}"] = parsed_im - ### Transcripts + # Transcripts transcript_df = dd.read_csv(path / MerfishKeys.TRANSCRIPTS_FILE) transcripts = PointsModel.parse( transcript_df, @@ -117,7 +117,7 @@ def merfish(path: Union[str, Path], vpt_outputs: Optional[Union[Path, str, dict] transcripts_subset = transcripts_subset.drop("z", axis=1) points[f"transcripts_z{int(z_level)}"] = transcripts_subset - ### Polygons + # Polygons geo_df = geopandas.read_parquet(boundaries_path) geo_df = geo_df.rename_geometry("geometry") geo_df.index = geo_df[MerfishKeys.INSTANCE_KEY].astype(str) @@ -126,7 +126,7 @@ def merfish(path: Union[str, Path], vpt_outputs: Optional[Union[Path, str, dict] # polygons = ShapesModel.parse(geo_df, transformations={"microns": Identity(), "pixels": microns_to_pixels}) shapes = {"polygons": polygons} - ### Table + # Table data = pd.read_csv(count_path, index_col=0, dtype={MerfishKeys.COUNTS_CELL_KEY: str}) obs = pd.read_csv(obs_path, index_col=0, dtype={MerfishKeys.INSTANCE_KEY: str}) From 7bc0efdd258cd88e8ac4ee8ffd3ea13e8b36ed3e Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Tue, 13 Jun 2023 14:40:59 +0200 Subject: [PATCH 09/17] merscope: renaming + solving mypy type errors --- src/spatialdata_io/__init__.py | 4 +- src/spatialdata_io/_constants/_constants.py | 4 +- src/spatialdata_io/readers/merfish.py | 153 ----------------- src/spatialdata_io/readers/merscope.py | 178 ++++++++++++++++++++ 4 files changed, 182 insertions(+), 157 deletions(-) delete mode 100644 src/spatialdata_io/readers/merfish.py create mode 100644 src/spatialdata_io/readers/merscope.py diff --git a/src/spatialdata_io/__init__.py b/src/spatialdata_io/__init__.py index 5f7d57e2..90a29147 100644 --- a/src/spatialdata_io/__init__.py +++ b/src/spatialdata_io/__init__.py @@ -3,7 +3,7 @@ from spatialdata_io.readers.cosmx import cosmx from spatialdata_io.readers.curio import curio from spatialdata_io.readers.mcmicro import mcmicro -from spatialdata_io.readers.merfish import merfish +from spatialdata_io.readers.merscope import merscope from spatialdata_io.readers.steinbock import steinbock from spatialdata_io.readers.visium import visium from spatialdata_io.readers.xenium import xenium @@ -15,7 +15,7 @@ "cosmx", "mcmicro", "steinbock", - "merfish", + "merscope", ] __version__ = version("spatialdata-io") diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index f0106a55..13372d9f 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -141,8 +141,8 @@ class McmicroKeys(ModeEnum): @unique -class MerfishKeys(ModeEnum): - """Keys for *MERFISH* data (*MERSCOPE* plateform from Vizgen)""" +class MerscopeKeys(ModeEnum): + """Keys for *MERSCOPE* data (Vizgen plateform)""" # files and directories IMAGES_DIR = "images" diff --git a/src/spatialdata_io/readers/merfish.py b/src/spatialdata_io/readers/merfish.py deleted file mode 100644 index 142a47c1..00000000 --- a/src/spatialdata_io/readers/merfish.py +++ /dev/null @@ -1,153 +0,0 @@ -import re -from pathlib import Path -from typing import Optional, Union - -import anndata -import dask.dataframe as dd -import geopandas -import numpy as np -import pandas as pd -from dask import array as da -from dask_image.imread import imread -from spatialdata import SpatialData -from spatialdata.models import Image2DModel, PointsModel, ShapesModel, TableModel -from spatialdata.transformations import Affine, Identity - -from spatialdata_io._constants._constants import MerfishKeys -from spatialdata_io._docs import inject_docs - - -def _scan_images(images_dir: Path) -> tuple[list, list]: - """Searches inside the image directory and get all the different channels (stainings) and all the z-levels (usually 0...6)""" - exp = r"mosaic_(?P[\w|-]+[0-9]?)_z(?P[0-9]+).tif" - matches = [re.search(exp, file.name) for file in images_dir.iterdir()] - - stainings = {match.group("stain") for match in matches if match} - z_levels = {match.group("z") for match in matches if match} - - return list(stainings), list(z_levels) - - -def _get_file_paths(path: Path, vpt_outputs: Optional[Union[Path, str, dict]]): - """Gets the file paths to (i) the file of transcript per cell, (ii) the cell metadata file, and (iii) the cell boundary file""" - if vpt_outputs is None: - return path / MerfishKeys.COUNTS_FILE, path / MerfishKeys.CELL_METADATA_FILE, path / MerfishKeys.BOUNDARIES_FILE - - if isinstance(vpt_outputs, str) or isinstance(vpt_outputs, Path): - vpt_outputs = Path(vpt_outputs) - - plausible_boundaries = [ - vpt_outputs / MerfishKeys.CELLPOSE_BOUNDARIES, - vpt_outputs / MerfishKeys.WATERSHED_BOUNDARIES, - ] - valid_boundaries = [path for path in plausible_boundaries if path.exists()] - - assert ( - valid_boundaries - ), f"Boundary file not found - expected to find one of these files: {', '.join(map(str, plausible_boundaries))}" - - return vpt_outputs / MerfishKeys.COUNTS_FILE, vpt_outputs / MerfishKeys.CELL_METADATA_FILE, valid_boundaries[0] - - if isinstance(vpt_outputs, dict): - return ( - vpt_outputs[MerfishKeys.VPT_NAME_COUNTS], - vpt_outputs[MerfishKeys.VPT_NAME_OBS], - vpt_outputs[MerfishKeys.VPT_NAME_BOUNDARIES], - ) - - raise ValueError(f"'vpt_outputs' has to be either None, a str, a Path, or a dict. Found type {type(vpt_outputs)}.") - - -@inject_docs(mf=MerfishKeys) -def merfish(path: Union[str, Path], vpt_outputs: Optional[Union[Path, str, dict]]) -> SpatialData: - """ - Read *MERFISH* data from Vizgen. - - Parameters - ---------- - path - Path to the root directory containing the *Merfish* files (e.g., `detected_transcripts.csv`). - vpt_outputs - Optional arguments to indicate the output of the vizgen-postprocessing-tool (VPT), when used. If a folder path is provided, it looks inside the folder for the following files: ``{mf.COUNTS_FILE!r}``, ``{mf.CELL_METADATA_FILE!r}``, and a boundary parquet file. If a dictionnary, then the following keys can be provided: ``{mf.VPT_NAME_COUNTS!r}``, ``{mf.VPT_NAME_OBS!r}``, ``{mf.VPT_NAME_BOUNDARIES!r}`` with the desired path as the value. - - Returns - ------- - :class:`spatialdata.SpatialData` - """ - path = Path(path) - count_path, obs_path, boundaries_path = _get_file_paths(path, vpt_outputs) - images_dir = path / MerfishKeys.IMAGES_DIR - - microns_to_pixels = np.genfromtxt(images_dir / MerfishKeys.TRANSFORMATION_FILE) - microns_to_pixels = Affine(microns_to_pixels, input_axes=("x", "y"), output_axes=("x", "y")) - - # Images - images = {} - - stainings, z_levels = _scan_images(images_dir) - for z in z_levels: - im = da.stack([imread(images_dir / f"mosaic_{stain}_z{z}.tif").squeeze() for stain in stainings], axis=0) - parsed_im = Image2DModel.parse( - im, - dims=("c", "y", "x"), - transformations={"pixels": Identity()}, - # transformations={"pixels": Identity(), "microns": microns_to_pixels.inverse()}, - c_coords=stainings, - ) - images[f"z{z}"] = parsed_im - - # Transcripts - transcript_df = dd.read_csv(path / MerfishKeys.TRANSCRIPTS_FILE) - transcripts = PointsModel.parse( - transcript_df, - coordinates={"x": MerfishKeys.GLOBAL_X, "y": MerfishKeys.GLOBAL_Y, "z": MerfishKeys.GLOBAL_Z}, - transformations={"pixels": Identity()}, - # transformations={"microns": Identity(), "pixels": microns_to_pixels}, - ) - # points = {"transcripts": transcripts} - points = {} - gene_categorical = dd.from_pandas( - transcripts["gene"].compute().astype("category"), npartitions=transcripts.npartitions - ).reset_index(drop=True) - transcripts["gene"] = gene_categorical - - # split the transcripts into the different z-levels - z = transcripts["z"].compute() - z_levels = z.value_counts().index - z_levels = sorted(z_levels, key=lambda x: int(x)) - for z_level in z_levels: - transcripts_subset = transcripts[z == z_level] - # temporary solution until the 3D support is better developed - transcripts_subset = transcripts_subset.drop("z", axis=1) - points[f"transcripts_z{int(z_level)}"] = transcripts_subset - - # Polygons - geo_df = geopandas.read_parquet(boundaries_path) - geo_df = geo_df.rename_geometry("geometry") - geo_df = geo_df[geo_df[MerfishKeys.Z_INDEX] == 0] # Avoid duplicate boundaries on all z-levels - geo_df.index = geo_df[MerfishKeys.INSTANCE_KEY].astype(str) - - polygons = ShapesModel.parse(geo_df, transformations={"pixels": microns_to_pixels}) - # polygons = ShapesModel.parse(geo_df, transformations={"microns": Identity(), "pixels": microns_to_pixels}) - shapes = {"polygons": polygons} - - # Table - data = pd.read_csv(count_path, index_col=0, dtype={MerfishKeys.COUNTS_CELL_KEY: str}) - obs = pd.read_csv(obs_path, index_col=0, dtype={MerfishKeys.INSTANCE_KEY: str}) - - is_gene = ~data.columns.str.lower().str.contains("blank") - adata = anndata.AnnData(data.loc[:, is_gene], dtype=data.values.dtype, obs=obs) - - adata.obsm["blank"] = data.loc[:, ~is_gene] # blank fields are excluded from adata.X - adata.obsm["spatial"] = adata.obs[[MerfishKeys.CELL_X, MerfishKeys.CELL_Y]].values - adata.obs["region"] = pd.Series(path.stem, index=adata.obs_names, dtype="category") - adata.obs[MerfishKeys.INSTANCE_KEY] = adata.obs.index - - table = TableModel.parse( - adata, - region_key="region", - region=adata.obs["region"].cat.categories.tolist(), - instance_key=MerfishKeys.INSTANCE_KEY.value, - ) - - return SpatialData(shapes=shapes, points=points, images=images, table=table) diff --git a/src/spatialdata_io/readers/merscope.py b/src/spatialdata_io/readers/merscope.py new file mode 100644 index 00000000..a412d407 --- /dev/null +++ b/src/spatialdata_io/readers/merscope.py @@ -0,0 +1,178 @@ +import re +from pathlib import Path +from typing import Any, Optional, Union + +import anndata +import dask.dataframe as dd +import geopandas +import numpy as np +import pandas as pd +from dask import array as da +from dask_image.imread import imread +from spatialdata import SpatialData +from spatialdata.models import Image3DModel, PointsModel, ShapesModel, TableModel +from spatialdata.transformations import Affine, Identity + +from spatialdata_io._constants._constants import MerscopeKeys +from spatialdata_io._docs import inject_docs + + +def _scan_images(images_dir: Path) -> tuple[list[str], list[str]]: + """Searches inside the image directory and get all the different channels (stainings) + and all the z-levels (usually 0...6)""" + exp = r"mosaic_(?P[\w|-]+[0-9]?)_z(?P[0-9]+).tif" + matches = [re.search(exp, file.name) for file in images_dir.iterdir()] + + stainings = {match.group("stain") for match in matches if match} + z_levels = {match.group("z") for match in matches if match} + + return list(stainings), list(z_levels) + + +def _get_file_paths(path: Path, vpt_outputs: Optional[Union[Path, str, dict[str, Any]]]) -> tuple[Path, Path, Path]: + """Gets the file paths to (i) the file of transcript per cell, + (ii) the cell metadata file, and (iii) the cell boundary file""" + if vpt_outputs is None: + return ( + path / MerscopeKeys.COUNTS_FILE, + path / MerscopeKeys.CELL_METADATA_FILE, + path / MerscopeKeys.BOUNDARIES_FILE, + ) + + if isinstance(vpt_outputs, str) or isinstance(vpt_outputs, Path): + vpt_outputs = Path(vpt_outputs) + + plausible_boundaries = [ + vpt_outputs / MerscopeKeys.CELLPOSE_BOUNDARIES, + vpt_outputs / MerscopeKeys.WATERSHED_BOUNDARIES, + ] + valid_boundaries = [path for path in plausible_boundaries if path.exists()] + + assert ( + valid_boundaries + ), f"Boundary file not found - expected to find one of these files: {', '.join(map(str, plausible_boundaries))}" + + return ( + vpt_outputs / MerscopeKeys.COUNTS_FILE, + vpt_outputs / MerscopeKeys.CELL_METADATA_FILE, + valid_boundaries[0], + ) + + if isinstance(vpt_outputs, dict): + return ( + vpt_outputs[MerscopeKeys.VPT_NAME_COUNTS], + vpt_outputs[MerscopeKeys.VPT_NAME_OBS], + vpt_outputs[MerscopeKeys.VPT_NAME_BOUNDARIES], + ) + + raise ValueError( + f"`vpt_outputs` has to be either `None`, `str`, `Path`, or `dict`. Found type {type(vpt_outputs)}." + ) + + +@inject_docs(ms=MerscopeKeys) +def merscope( + path: Union[str, Path], vpt_outputs: Optional[Union[Path, str, dict[str, Any]]] = None, read_tif: bool = True +) -> SpatialData: + """ + Read *MERSCOPE* data from Vizgen. + + This function reads the following files: + + - ``{ms.COUNTS_FILE!r}``: Counts file. + - ``{ms.TRANSCRIPTS_FILE!r}``: Transcript file. + - ``{ms.CELL_METADATA_FILE!r}``: Per-cell metadata file. + - ``{ms.BOUNDARIES_FILE!r}``: Cell polygon boundaries. + - `mosaic_**_z*.tif` images inside the ``{ms.IMAGES_DIR!r}`` directory. + + Parameters + ---------- + path + Path to the root directory containing the *Merscope* files (e.g., `detected_transcripts.csv`). + vpt_outputs + Optional arguments to indicate the output of the vizgen-postprocessing-tool (VPT), when used. + If a folder path is provided, it looks inside the folder for the following files: + ``{ms.COUNTS_FILE!r}``, ``{ms.CELL_METADATA_FILE!r}``, and a boundary parquet file. + If a dictionnary, then the following keys can be provided: + ``{ms.VPT_NAME_COUNTS!r}``, ``{ms.VPT_NAME_OBS!r}``, ``{ms.VPT_NAME_BOUNDARIES!r}`` with the desired path as the value. + read_tif + Whether to read the tif images or not + + Returns + ------- + :class:`spatialdata.SpatialData` + """ + path = Path(path) + count_path, obs_path, boundaries_path = _get_file_paths(path, vpt_outputs) + images_dir = path / MerscopeKeys.IMAGES_DIR + + microns_to_pixels = np.genfromtxt(images_dir / MerscopeKeys.TRANSFORMATION_FILE) + microns_to_pixels = Affine(microns_to_pixels, input_axes=("x", "y"), output_axes=("x", "y")) + + # Images + images = {} + + if read_tif: + stainings, z_levels = _scan_images(images_dir) + for z in z_levels: + im = da.stack([imread(images_dir / f"mosaic_{stain}_z{z}.tif").squeeze() for stain in stainings], axis=0) + parsed_im = Image3DModel.parse( + im, + dims=("c", "z", "y", "x"), + transformations={"pixels": Identity()}, + c_coords=stainings, + ) + images[f"z{z}"] = parsed_im + + # Transcripts + transcript_df = dd.read_csv(path / MerscopeKeys.TRANSCRIPTS_FILE) + transcripts = PointsModel.parse( + transcript_df, + coordinates={"x": MerscopeKeys.GLOBAL_X, "y": MerscopeKeys.GLOBAL_Y, "z": MerscopeKeys.GLOBAL_Z}, + transformations={"pixels": Identity()}, + ) + points = {} + gene_categorical = dd.from_pandas( + transcripts["gene"].compute().astype("category"), npartitions=transcripts.npartitions + ).reset_index(drop=True) + transcripts["gene"] = gene_categorical + + # split the transcripts into the different z-levels + z = transcripts["z"].compute() + z_levels = z.value_counts().index + z_levels = sorted(z_levels, key=lambda x: int(x)) + for z_level in z_levels: + transcripts_subset = transcripts[z == z_level] + # temporary solution until the 3D support is better developed + transcripts_subset = transcripts_subset.drop("z", axis=1) + points[f"transcripts_z{int(z_level)}"] = transcripts_subset + + # Polygons + geo_df = geopandas.read_parquet(boundaries_path) + geo_df = geo_df.rename_geometry("geometry") + geo_df = geo_df[geo_df[MerscopeKeys.Z_INDEX] == 0] # Avoid duplicate boundaries on all z-levels + geo_df.index = geo_df[MerscopeKeys.INSTANCE_KEY].astype(str) + + polygons = ShapesModel.parse(geo_df, transformations={"pixels": microns_to_pixels}) + shapes = {"polygons": polygons} + + # Table + data = pd.read_csv(count_path, index_col=0, dtype={MerscopeKeys.COUNTS_CELL_KEY: str}) + obs = pd.read_csv(obs_path, index_col=0, dtype={MerscopeKeys.INSTANCE_KEY: str}) + + is_gene = ~data.columns.str.lower().str.contains("blank") + adata = anndata.AnnData(data.loc[:, is_gene], dtype=data.values.dtype, obs=obs) + + adata.obsm["blank"] = data.loc[:, ~is_gene] # blank fields are excluded from adata.X + adata.obsm["spatial"] = adata.obs[[MerscopeKeys.CELL_X, MerscopeKeys.CELL_Y]].values + adata.obs["region"] = pd.Series(path.stem, index=adata.obs_names, dtype="category") + adata.obs[MerscopeKeys.INSTANCE_KEY] = adata.obs.index + + table = TableModel.parse( + adata, + region_key="region", + region=adata.obs["region"].cat.categories.tolist(), + instance_key=MerscopeKeys.INSTANCE_KEY.value, + ) + + return SpatialData(shapes=shapes, points=points, images=images, table=table) From 9257b5501db5656be63ec57301560383e0cc601e Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Tue, 13 Jun 2023 15:09:32 +0200 Subject: [PATCH 10/17] fix flake8 errors --- src/spatialdata_io/readers/merscope.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/spatialdata_io/readers/merscope.py b/src/spatialdata_io/readers/merscope.py index a412d407..8561e8ed 100644 --- a/src/spatialdata_io/readers/merscope.py +++ b/src/spatialdata_io/readers/merscope.py @@ -18,8 +18,11 @@ def _scan_images(images_dir: Path) -> tuple[list[str], list[str]]: - """Searches inside the image directory and get all the different channels (stainings) - and all the z-levels (usually 0...6)""" + """ + Gets images names inside a directory + + It returns all the different channels (stainings) and all the z-levels (usually 0...6) + """ exp = r"mosaic_(?P[\w|-]+[0-9]?)_z(?P[0-9]+).tif" matches = [re.search(exp, file.name) for file in images_dir.iterdir()] @@ -30,8 +33,11 @@ def _scan_images(images_dir: Path) -> tuple[list[str], list[str]]: def _get_file_paths(path: Path, vpt_outputs: Optional[Union[Path, str, dict[str, Any]]]) -> tuple[Path, Path, Path]: - """Gets the file paths to (i) the file of transcript per cell, - (ii) the cell metadata file, and (iii) the cell boundary file""" + """ + Gets the MERSCOPE file paths when vpt_outputs is provided + + That is, (i) the file of transcript per cell, (ii) the cell metadata file, and (iii) the cell boundary file + """ if vpt_outputs is None: return ( path / MerscopeKeys.COUNTS_FILE, @@ -114,15 +120,17 @@ def merscope( if read_tif: stainings, z_levels = _scan_images(images_dir) - for z in z_levels: - im = da.stack([imread(images_dir / f"mosaic_{stain}_z{z}.tif").squeeze() for stain in stainings], axis=0) + for z_level in z_levels: + im = da.stack( + [imread(images_dir / f"mosaic_{stain}_z{z_level}.tif").squeeze() for stain in stainings], axis=0 + ) parsed_im = Image3DModel.parse( im, dims=("c", "z", "y", "x"), transformations={"pixels": Identity()}, c_coords=stainings, ) - images[f"z{z}"] = parsed_im + images[f"z{z_level}"] = parsed_im # Transcripts transcript_df = dd.read_csv(path / MerscopeKeys.TRANSCRIPTS_FILE) From ca7ae968ecef823421e48932a9a90f9debdb5af1 Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Wed, 14 Jun 2023 10:39:26 +0200 Subject: [PATCH 11/17] z-layers argument + consider only 2D stacks --- src/spatialdata_io/readers/merscope.py | 60 +++++++++++--------------- 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/src/spatialdata_io/readers/merscope.py b/src/spatialdata_io/readers/merscope.py index 8561e8ed..7d4dfa5c 100644 --- a/src/spatialdata_io/readers/merscope.py +++ b/src/spatialdata_io/readers/merscope.py @@ -10,26 +10,20 @@ from dask import array as da from dask_image.imread import imread from spatialdata import SpatialData -from spatialdata.models import Image3DModel, PointsModel, ShapesModel, TableModel +from spatialdata.models import Image2DModel, PointsModel, ShapesModel, TableModel from spatialdata.transformations import Affine, Identity from spatialdata_io._constants._constants import MerscopeKeys from spatialdata_io._docs import inject_docs -def _scan_images(images_dir: Path) -> tuple[list[str], list[str]]: - """ - Gets images names inside a directory - - It returns all the different channels (stainings) and all the z-levels (usually 0...6) - """ +def _get_channel_names(images_dir: Path) -> list[str]: exp = r"mosaic_(?P[\w|-]+[0-9]?)_z(?P[0-9]+).tif" matches = [re.search(exp, file.name) for file in images_dir.iterdir()] stainings = {match.group("stain") for match in matches if match} - z_levels = {match.group("z") for match in matches if match} - return list(stainings), list(z_levels) + return list(stainings) def _get_file_paths(path: Path, vpt_outputs: Optional[Union[Path, str, dict[str, Any]]]) -> tuple[Path, Path, Path]: @@ -78,7 +72,9 @@ def _get_file_paths(path: Path, vpt_outputs: Optional[Union[Path, str, dict[str, @inject_docs(ms=MerscopeKeys) def merscope( - path: Union[str, Path], vpt_outputs: Optional[Union[Path, str, dict[str, Any]]] = None, read_tif: bool = True + path: str | Path, + vpt_outputs: Optional[Path | str | dict[str, Any]] = None, + z_layers: int | list[int] | None = 3, ) -> SpatialData: """ Read *MERSCOPE* data from Vizgen. @@ -101,8 +97,9 @@ def merscope( ``{ms.COUNTS_FILE!r}``, ``{ms.CELL_METADATA_FILE!r}``, and a boundary parquet file. If a dictionnary, then the following keys can be provided: ``{ms.VPT_NAME_COUNTS!r}``, ``{ms.VPT_NAME_OBS!r}``, ``{ms.VPT_NAME_BOUNDARIES!r}`` with the desired path as the value. - read_tif - Whether to read the tif images or not + z_layers + Indices of the z-layers to consider. Either one `int` index, or a list of `int` indices. If `None`, then no image is loaded. + By default, only the middle layer is considered (that is, layer 3). Returns ------- @@ -118,42 +115,32 @@ def merscope( # Images images = {} - if read_tif: - stainings, z_levels = _scan_images(images_dir) - for z_level in z_levels: - im = da.stack( - [imread(images_dir / f"mosaic_{stain}_z{z_level}.tif").squeeze() for stain in stainings], axis=0 - ) - parsed_im = Image3DModel.parse( - im, - dims=("c", "z", "y", "x"), - transformations={"pixels": Identity()}, - c_coords=stainings, - ) - images[f"z{z_level}"] = parsed_im + z_layers = [z_layers] if isinstance(z_layers, int) else z_layers or [] + + stainings = _get_channel_names(images_dir) + for z_layer in z_layers: + im = da.stack([imread(images_dir / f"mosaic_{stain}_z{z_layer}.tif").squeeze() for stain in stainings], axis=0) + parsed_im = Image2DModel.parse( + im, + dims=("c", "y", "x"), + transformations={"pixels": Identity()}, + c_coords=stainings, + ) + images[f"z{z_layer}"] = parsed_im # Transcripts transcript_df = dd.read_csv(path / MerscopeKeys.TRANSCRIPTS_FILE) transcripts = PointsModel.parse( transcript_df, - coordinates={"x": MerscopeKeys.GLOBAL_X, "y": MerscopeKeys.GLOBAL_Y, "z": MerscopeKeys.GLOBAL_Z}, + coordinates={"x": MerscopeKeys.GLOBAL_X, "y": MerscopeKeys.GLOBAL_Y}, transformations={"pixels": Identity()}, ) - points = {} gene_categorical = dd.from_pandas( transcripts["gene"].compute().astype("category"), npartitions=transcripts.npartitions ).reset_index(drop=True) transcripts["gene"] = gene_categorical - # split the transcripts into the different z-levels - z = transcripts["z"].compute() - z_levels = z.value_counts().index - z_levels = sorted(z_levels, key=lambda x: int(x)) - for z_level in z_levels: - transcripts_subset = transcripts[z == z_level] - # temporary solution until the 3D support is better developed - transcripts_subset = transcripts_subset.drop("z", axis=1) - points[f"transcripts_z{int(z_level)}"] = transcripts_subset + points = {"transcripts": transcripts} # Polygons geo_df = geopandas.read_parquet(boundaries_path) @@ -162,6 +149,7 @@ def merscope( geo_df.index = geo_df[MerscopeKeys.INSTANCE_KEY].astype(str) polygons = ShapesModel.parse(geo_df, transformations={"pixels": microns_to_pixels}) + shapes = {"polygons": polygons} # Table From 2452a792005117ef0d0dc5ba7a9a619775576695 Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Wed, 14 Jun 2023 10:49:26 +0200 Subject: [PATCH 12/17] back to py3.9 Union typing syntax --- src/spatialdata_io/readers/merscope.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/spatialdata_io/readers/merscope.py b/src/spatialdata_io/readers/merscope.py index 7d4dfa5c..fc82c8df 100644 --- a/src/spatialdata_io/readers/merscope.py +++ b/src/spatialdata_io/readers/merscope.py @@ -72,9 +72,9 @@ def _get_file_paths(path: Path, vpt_outputs: Optional[Union[Path, str, dict[str, @inject_docs(ms=MerscopeKeys) def merscope( - path: str | Path, - vpt_outputs: Optional[Path | str | dict[str, Any]] = None, - z_layers: int | list[int] | None = 3, + path: Union[str, Path], + vpt_outputs: Optional[Union[Path, str, dict[str, Any]]] = None, + z_layers: Union[int, list[int], None] = 3, ) -> SpatialData: """ Read *MERSCOPE* data from Vizgen. From 0226e53c16679a23ce267d01ac3e7e0ed0a9501e Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Mon, 19 Jun 2023 17:29:28 +0200 Subject: [PATCH 13/17] infer the dataset id using the directory path --- src/spatialdata_io/readers/merscope.py | 44 ++++++++++++++++++-------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/src/spatialdata_io/readers/merscope.py b/src/spatialdata_io/readers/merscope.py index fc82c8df..2f3731f1 100644 --- a/src/spatialdata_io/readers/merscope.py +++ b/src/spatialdata_io/readers/merscope.py @@ -1,6 +1,8 @@ +from __future__ import annotations + import re from pathlib import Path -from typing import Any, Optional, Union +from typing import Any import anndata import dask.dataframe as dd @@ -26,7 +28,7 @@ def _get_channel_names(images_dir: Path) -> list[str]: return list(stainings) -def _get_file_paths(path: Path, vpt_outputs: Optional[Union[Path, str, dict[str, Any]]]) -> tuple[Path, Path, Path]: +def _get_file_paths(path: Path, vpt_outputs: Path | str | dict[str, Any] | None) -> tuple[Path, Path, Path]: """ Gets the MERSCOPE file paths when vpt_outputs is provided @@ -72,9 +74,11 @@ def _get_file_paths(path: Path, vpt_outputs: Optional[Union[Path, str, dict[str, @inject_docs(ms=MerscopeKeys) def merscope( - path: Union[str, Path], - vpt_outputs: Optional[Union[Path, str, dict[str, Any]]] = None, - z_layers: Union[int, list[int], None] = 3, + path: str | Path, + vpt_outputs: Path | str | dict[str, Any] | None = None, + z_layers: int | list[int] | None = 3, + region_name: str | None = None, + slide_name: str | None = None, ) -> SpatialData: """ Read *MERSCOPE* data from Vizgen. @@ -90,16 +94,24 @@ def merscope( Parameters ---------- path - Path to the root directory containing the *Merscope* files (e.g., `detected_transcripts.csv`). + Path to the region/root directory containing the *Merscope* files (e.g., `detected_transcripts.csv`). vpt_outputs Optional arguments to indicate the output of the vizgen-postprocessing-tool (VPT), when used. If a folder path is provided, it looks inside the folder for the following files: - ``{ms.COUNTS_FILE!r}``, ``{ms.CELL_METADATA_FILE!r}``, and a boundary parquet file. - If a dictionnary, then the following keys can be provided: - ``{ms.VPT_NAME_COUNTS!r}``, ``{ms.VPT_NAME_OBS!r}``, ``{ms.VPT_NAME_BOUNDARIES!r}`` with the desired path as the value. + - ``{ms.COUNTS_FILE!r}`` + - ``{ms.CELL_METADATA_FILE!r}`` + - ``{ms.BOUNDARIES_FILE!r}`` + If a dictionnary, then the following keys should be provided with the desired path: + - ``{ms.VPT_NAME_COUNTS!r}`` + - ``{ms.VPT_NAME_OBS!r}`` + - ``{ms.VPT_NAME_BOUNDARIES!r}`` z_layers Indices of the z-layers to consider. Either one `int` index, or a list of `int` indices. If `None`, then no image is loaded. By default, only the middle layer is considered (that is, layer 3). + region_name + Name of the region of interest, e.g., `'region_0'`. If `None` then the name of the `path` directory is used. + slide_name + Name of the slide/run. If `None` then the name of the parent directory of `path` is used (whose name starts with a date). Returns ------- @@ -112,6 +124,10 @@ def merscope( microns_to_pixels = np.genfromtxt(images_dir / MerscopeKeys.TRANSFORMATION_FILE) microns_to_pixels = Affine(microns_to_pixels, input_axes=("x", "y"), output_axes=("x", "y")) + region_name = path.name if region_name is None else region_name + slide_name = path.parent.name if slide_name is None else slide_name + dataset_id = f"{slide_name}_{region_name}" + # Images images = {} @@ -126,7 +142,7 @@ def merscope( transformations={"pixels": Identity()}, c_coords=stainings, ) - images[f"z{z_layer}"] = parsed_im + images[f"{dataset_id}_z{z_layer}"] = parsed_im # Transcripts transcript_df = dd.read_csv(path / MerscopeKeys.TRANSCRIPTS_FILE) @@ -140,7 +156,7 @@ def merscope( ).reset_index(drop=True) transcripts["gene"] = gene_categorical - points = {"transcripts": transcripts} + points = {f"{dataset_id}_transcripts": transcripts} # Polygons geo_df = geopandas.read_parquet(boundaries_path) @@ -150,7 +166,7 @@ def merscope( polygons = ShapesModel.parse(geo_df, transformations={"pixels": microns_to_pixels}) - shapes = {"polygons": polygons} + shapes = {f"{dataset_id}_polygons": polygons} # Table data = pd.read_csv(count_path, index_col=0, dtype={MerscopeKeys.COUNTS_CELL_KEY: str}) @@ -161,7 +177,9 @@ def merscope( adata.obsm["blank"] = data.loc[:, ~is_gene] # blank fields are excluded from adata.X adata.obsm["spatial"] = adata.obs[[MerscopeKeys.CELL_X, MerscopeKeys.CELL_Y]].values - adata.obs["region"] = pd.Series(path.stem, index=adata.obs_names, dtype="category") + adata.obs["region"] = pd.Series(region_name, index=adata.obs_names, dtype="category") + adata.obs["slide"] = pd.Series(slide_name, index=adata.obs_names, dtype="category") + adata.obs["dataset_id"] = pd.Series(dataset_id, index=adata.obs_names, dtype="category") adata.obs[MerscopeKeys.INSTANCE_KEY] = adata.obs.index table = TableModel.parse( From 71f075a1e26cf1406525fa6c365d049e6175a811 Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Mon, 19 Jun 2023 17:34:16 +0200 Subject: [PATCH 14/17] fix flake8 unindent warning --- src/spatialdata_io/readers/merscope.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/spatialdata_io/readers/merscope.py b/src/spatialdata_io/readers/merscope.py index 2f3731f1..1ff2c736 100644 --- a/src/spatialdata_io/readers/merscope.py +++ b/src/spatialdata_io/readers/merscope.py @@ -98,13 +98,16 @@ def merscope( vpt_outputs Optional arguments to indicate the output of the vizgen-postprocessing-tool (VPT), when used. If a folder path is provided, it looks inside the folder for the following files: - - ``{ms.COUNTS_FILE!r}`` - - ``{ms.CELL_METADATA_FILE!r}`` - - ``{ms.BOUNDARIES_FILE!r}`` + + - ``{ms.COUNTS_FILE!r}`` + - ``{ms.CELL_METADATA_FILE!r}`` + - ``{ms.BOUNDARIES_FILE!r}`` + If a dictionnary, then the following keys should be provided with the desired path: - - ``{ms.VPT_NAME_COUNTS!r}`` - - ``{ms.VPT_NAME_OBS!r}`` - - ``{ms.VPT_NAME_BOUNDARIES!r}`` + + - ``{ms.VPT_NAME_COUNTS!r}`` + - ``{ms.VPT_NAME_OBS!r}`` + - ``{ms.VPT_NAME_BOUNDARIES!r}`` z_layers Indices of the z-layers to consider. Either one `int` index, or a list of `int` indices. If `None`, then no image is loaded. By default, only the middle layer is considered (that is, layer 3). From c488ccb5b04eaf3a51b0262a28d827e482f978d0 Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Tue, 20 Jun 2023 10:15:36 +0200 Subject: [PATCH 15/17] adding the merscope reader to the docs API --- README.md | 2 +- docs/api.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 441f05b4..d2cbc080 100644 --- a/README.md +++ b/README.md @@ -17,10 +17,10 @@ This package contains reader functions to load common spatial omics formats into - 10x Genomics Visium - 10x Genomics Xenium - Curio Seeker +- Vizgen MERSCOPE (MERFISH) Coming soon: -- Vizgen MERSCOPE (MERFISH) - Spatial Genomics seqFISH - Akoya PhenoCycler (formerly CODEX) diff --git a/docs/api.md b/docs/api.md index 63c1e0cf..d39ebd07 100644 --- a/docs/api.md +++ b/docs/api.md @@ -17,5 +17,6 @@ I/O for the `spatialdata` project. visium xenium steinbock + merscope mcmicro ``` From 0c9158098d04cc142a7f549b78089747928d4862 Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Mon, 31 Jul 2023 10:02:54 +0200 Subject: [PATCH 16/17] merscope: handle big image + fix TableModel region --- src/spatialdata_io/_constants/_constants.py | 3 +- src/spatialdata_io/readers/merscope.py | 52 +++++++++++++++------ 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index e540e324..c572fced 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -174,7 +174,7 @@ class MerscopeKeys(ModeEnum): VPT_NAME_BOUNDARIES = "cell_boundaries" # metadata - INSTANCE_KEY = "EntityID" + METADATA_CELL_KEY = "EntityID" COUNTS_CELL_KEY = "cell" CELL_X = "center_x" CELL_Y = "center_y" @@ -182,3 +182,4 @@ class MerscopeKeys(ModeEnum): GLOBAL_Y = "global_y" GLOBAL_Z = "global_z" Z_INDEX = "ZIndex" + REGION_KEY = "cells_region" diff --git a/src/spatialdata_io/readers/merscope.py b/src/spatialdata_io/readers/merscope.py index da347f9e..0487d5d9 100644 --- a/src/spatialdata_io/readers/merscope.py +++ b/src/spatialdata_io/readers/merscope.py @@ -1,7 +1,9 @@ from __future__ import annotations import re +from collections.abc import Mapping from pathlib import Path +from types import MappingProxyType from typing import Any import anndata @@ -79,6 +81,8 @@ def merscope( z_layers: int | list[int] | None = 3, region_name: str | None = None, slide_name: str | None = None, + imread_kwargs: Mapping[str, Any] = MappingProxyType({}), + image_models_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> SpatialData: """ Read *MERSCOPE* data from Vizgen. @@ -115,11 +119,26 @@ def merscope( Name of the region of interest, e.g., `'region_0'`. If `None` then the name of the `path` directory is used. slide_name Name of the slide/run. If `None` then the name of the parent directory of `path` is used (whose name starts with a date). + imread_kwargs + Keyword arguments to pass to the image reader. + image_models_kwargs + Keyword arguments to pass to the image models. Returns ------- :class:`spatialdata.SpatialData` """ + if "chunks" not in image_models_kwargs: + if isinstance(image_models_kwargs, MappingProxyType): + image_models_kwargs = {} + assert isinstance(image_models_kwargs, dict) + image_models_kwargs["chunks"] = (1, 4096, 4096) + if "scale_factors" not in image_models_kwargs: + if isinstance(image_models_kwargs, MappingProxyType): + image_models_kwargs = {} + assert isinstance(image_models_kwargs, dict) + image_models_kwargs["scale_factors"] = [2, 2, 2, 2] + path = Path(path) count_path, obs_path, boundaries_path = _get_file_paths(path, vpt_outputs) images_dir = path / MerscopeKeys.IMAGES_DIR @@ -127,9 +146,10 @@ def merscope( microns_to_pixels = np.genfromtxt(images_dir / MerscopeKeys.TRANSFORMATION_FILE) microns_to_pixels = Affine(microns_to_pixels, input_axes=("x", "y"), output_axes=("x", "y")) - region_name = path.name if region_name is None else region_name + vizgen_region = path.name if region_name is None else region_name slide_name = path.parent.name if slide_name is None else slide_name - dataset_id = f"{slide_name}_{region_name}" + dataset_id = f"{slide_name}_{vizgen_region}" + region = f"{dataset_id}_polygons" # Images images = {} @@ -138,12 +158,16 @@ def merscope( stainings = _get_channel_names(images_dir) for z_layer in z_layers: - im = da.stack([imread(images_dir / f"mosaic_{stain}_z{z_layer}.tif").squeeze() for stain in stainings], axis=0) + im = da.stack( + [imread(images_dir / f"mosaic_{stain}_z{z_layer}.tif", **imread_kwargs).squeeze() for stain in stainings], + axis=0, + ) parsed_im = Image2DModel.parse( im, dims=("c", "y", "x"), - transformations={"pixels": Identity()}, + transformations={"microns": microns_to_pixels.inverse()}, c_coords=stainings, + **image_models_kwargs, ) images[f"{dataset_id}_z{z_layer}"] = parsed_im @@ -152,7 +176,7 @@ def merscope( transcripts = PointsModel.parse( transcript_df, coordinates={"x": MerscopeKeys.GLOBAL_X, "y": MerscopeKeys.GLOBAL_Y}, - transformations={"pixels": microns_to_pixels}, + transformations={"microns": Identity()}, ) categories = transcripts["gene"].compute().astype("category") gene_categorical = dd.from_pandas(categories, npartitions=transcripts.npartitions).reset_index(drop=True) @@ -164,31 +188,33 @@ def merscope( geo_df = geopandas.read_parquet(boundaries_path) geo_df = geo_df.rename_geometry("geometry") geo_df = geo_df[geo_df[MerscopeKeys.Z_INDEX] == 0] # Avoid duplicate boundaries on all z-levels - geo_df.index = geo_df[MerscopeKeys.INSTANCE_KEY].astype(str) + geo_df.geometry = geo_df.geometry.map(lambda x: x.geoms[0]) # The MultiPolygons contain only one polygon + geo_df.index = geo_df[MerscopeKeys.METADATA_CELL_KEY].astype(str) - polygons = ShapesModel.parse(geo_df, transformations={"pixels": microns_to_pixels}) + polygons = ShapesModel.parse(geo_df, transformations={"microns": Identity()}) shapes = {f"{dataset_id}_polygons": polygons} # Table data = pd.read_csv(count_path, index_col=0, dtype={MerscopeKeys.COUNTS_CELL_KEY: str}) - obs = pd.read_csv(obs_path, index_col=0, dtype={MerscopeKeys.INSTANCE_KEY: str}) + obs = pd.read_csv(obs_path, index_col=0, dtype={MerscopeKeys.METADATA_CELL_KEY: str}) is_gene = ~data.columns.str.lower().str.contains("blank") adata = anndata.AnnData(data.loc[:, is_gene], dtype=data.values.dtype, obs=obs) adata.obsm["blank"] = data.loc[:, ~is_gene] # blank fields are excluded from adata.X adata.obsm["spatial"] = adata.obs[[MerscopeKeys.CELL_X, MerscopeKeys.CELL_Y]].values - adata.obs["region"] = pd.Series(region_name, index=adata.obs_names, dtype="category") + adata.obs["region"] = pd.Series(vizgen_region, index=adata.obs_names, dtype="category") adata.obs["slide"] = pd.Series(slide_name, index=adata.obs_names, dtype="category") adata.obs["dataset_id"] = pd.Series(dataset_id, index=adata.obs_names, dtype="category") - adata.obs[MerscopeKeys.INSTANCE_KEY] = adata.obs.index + adata.obs[MerscopeKeys.REGION_KEY] = pd.Series(region, index=adata.obs_names, dtype="category") + adata.obs[MerscopeKeys.METADATA_CELL_KEY] = adata.obs.index table = TableModel.parse( adata, - region_key="region", - region=adata.obs["region"].cat.categories.tolist(), - instance_key=MerscopeKeys.INSTANCE_KEY.value, + region_key=MerscopeKeys.REGION_KEY.value, + region=region, + instance_key=MerscopeKeys.METADATA_CELL_KEY.value, ) return SpatialData(shapes=shapes, points=points, images=images, table=table) From 137fe8a55868da8c200cd158c0dc2382712482f3 Mon Sep 17 00:00:00 2001 From: Blampey Quentin Date: Mon, 31 Jul 2023 10:08:49 +0200 Subject: [PATCH 17/17] microns_to_pixel: in one line for mypy --- src/spatialdata_io/readers/merscope.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/spatialdata_io/readers/merscope.py b/src/spatialdata_io/readers/merscope.py index 0487d5d9..686dcf7c 100644 --- a/src/spatialdata_io/readers/merscope.py +++ b/src/spatialdata_io/readers/merscope.py @@ -143,8 +143,9 @@ def merscope( count_path, obs_path, boundaries_path = _get_file_paths(path, vpt_outputs) images_dir = path / MerscopeKeys.IMAGES_DIR - microns_to_pixels = np.genfromtxt(images_dir / MerscopeKeys.TRANSFORMATION_FILE) - microns_to_pixels = Affine(microns_to_pixels, input_axes=("x", "y"), output_axes=("x", "y")) + microns_to_pixels = Affine( + np.genfromtxt(images_dir / MerscopeKeys.TRANSFORMATION_FILE), input_axes=("x", "y"), output_axes=("x", "y") + ) vizgen_region = path.name if region_name is None else region_name slide_name = path.parent.name if slide_name is None else slide_name