In [2]:
import h5py
import numpy as np
import pandas as pd
import re
import yaml
import os
from collections.abc import MutableMapping
from typing import Dict, List, Tuple
from exceptions import *
from copy import deepcopy
import collections
from parser_base import Parser
from imaris import ImarisDataObject
import ray
from functools import partial
import time

In [3]:
# Imaris file used while developing the surface parser.
data_path = (
    "../../data/surface_parser_dev_data/P1 DHBR Roi2 6x6_TileScan_001_Merging_Crop_0_batch.ims"
)
# Alternative dataset, kept for reference:
# data_path = (
#     "../../data/multi_surface_track_parser_dev_data/GFP #1 Sec1 Roi2 2x2 1h30min.ims"
# )

# Last expression of the cell: shows whether the file is present on disk.
os.path.isfile(data_path)

True

In [4]:
# @ray.remote
class SurfaceParser(Parser):
    """
    Extracts Surface Level Information From Imaris File

    On construction the parser opens the file through ``ImarisDataObject``
    and caches, for every surface, its stats names, stats values, factor
    table and object ids. ``process`` turns the cached data of one surface
    into a table with one row per object and one column per statistic.

    Inherits from ``Parser`` (the parser abstract base class).
    """

    def __init__(self, ims_file_path: str) -> None:
        """
        Opens the Imaris file and caches the per-surface information.

        Args:
            ims_file_path (str): path passed to ``ImarisDataObject``
        """
        self.ims_file_path = ims_file_path
        self.ims = ImarisDataObject(self.ims_file_path)
        self.configure_instance()

    def configure_instance(self) -> None:
        """
        Extracts relevant information from ims object and
        instantiates it as instance variables for fast recall.

        Every per-surface dictionary is keyed by the position of the
        surface in ``self.surface_names``.

        Currently Extracts:
            - all the surface names -- List
            - all the stats_names -- {id: pd.DataFrame}
            - all the stats values -- {id: pd.DataFrame}
            - all the factor info -- {id: pd.DataFrame}
            - all the object ids -- {id: values returned by get_object_ids}
        """
        # TODO: check to ensure surfaces exist or raise error
        # extract all information and saves it as a instance var
        self.surface_names = self.ims.get_object_names("Surface")

        # get all the stats names for every surface {surf_id: stats_name_df}
        self.stats_names = {
            surface_id: self.ims.get_stats_names(surface_name)
            for surface_id, surface_name in enumerate(self.surface_names)
        }

        # get all the stats values for every surface {surf_id: stats_values_df}
        self.stats_values = {
            surface_id: self.ims.get_stats_values(surface_name)
            for surface_id, surface_name in enumerate(self.surface_names)
        }

        # get all the factor table info for every surface {surf_id: factor_df}
        self.factors = {
            surface_id: self.ims.get_object_factor(surface_name)
            for surface_id, surface_name in enumerate(self.surface_names)
        }

        # get all the object ids for every surface {surf_id: object_ids}
        self.object_ids = {
            surface_id: self.ims.get_object_ids(surface_name)
            for surface_id, surface_name in enumerate(self.surface_names)
        }

    def organize_stats(self, stats_values: pd.DataFrame) -> Dict:
        """Organizes the data such that it looks like
        {ID_Object: {ID_StatisticsType: Value}}

        The outer keys are sorted by ID_Object and rows whose ID_Object is
        missing are ignored, which mirrors what a ``groupby("ID_Object")``
        would produce. When an object has several rows for the same
        statistic the last value wins.

        Args:
            stats_values (pd.DataFrame): a single dataframe that contains
                the statistics for a single surface; it must have the
                columns "ID_Object", "ID_StatisticsType" and "Value"

        Returns:
            Dict: nested mapping {ID_Object: {ID_StatisticsType: Value}};
                empty when ``stats_values`` has no rows
        """
        # drop rows without an object id (a groupby would skip them as well)
        stats_values = stats_values[stats_values["ID_Object"].notna()]

        # single pass over the rows instead of one pandas call per object:
        # building a small frame for each object dominated the run time
        grouped_stats: Dict = {}
        rows = zip(
            stats_values["ID_Object"],
            stats_values["ID_StatisticsType"],
            stats_values["Value"],
        )
        for object_id, stat_id, value in rows:
            grouped_stats.setdefault(object_id, {})[stat_id] = value

        # keep the objects ordered by id
        return {object_id: grouped_stats[object_id] for object_id in sorted(grouped_stats)}

    def generate_csv(
        self, stats_values: Dict, stat_names: pd.DataFrame
    ) -> pd.DataFrame:
        """Builds the per-object statistics table.

        Despite the name nothing is written to disk here; the table is
        returned as a dataframe.

        Args:
            stats_values (Dict): organized stats as returned by
                ``organize_stats`` ({ID_Object: {ID_StatisticsType: Value}})
            stat_names (pd.DataFrame): stats names table; its "ID" and
                "Name" columns are used to translate statistic ids into
                column names

        Returns:
            pd.DataFrame: one row per object, one column per statistic
                plus an "Object_ID" column holding the row index
        """
        # create a dict that maps stat_id to stat_name
        column_names_dict = dict(zip(stat_names["ID"], stat_names["Name"]))
        # the outer keys (objects) become rows after the transpose
        dataframe = pd.DataFrame(stats_values).transpose()

        # replaces id columns with respective stat name and add idx
        dataframe = dataframe.rename(column_names_dict, axis=1)
        dataframe["Object_ID"] = dataframe.index
        return dataframe

    def save_csv(self):
        """Placeholder for writing the csv information to disk (not implemented)."""
        pass

    def process(self, surface_id: int) -> pd.DataFrame:
        """
        Runs a single end to end parser pipeline on a single surface
        Steps:
            - get stat names for a single surface
            - get stat values for a single surface
            - add channel / surface information to the stat names
            - filter stat values to keep only rows whose ID_Object is one
              of the surface's object ids
            - organize the filtered stats
            - generate the table

        The elapsed time since the start of the call is printed after
        every step.

        Args:
            surface_id (int): position of the surface in
                ``self.surface_names``

        Returns:
            pd.DataFrame: the table produced by ``generate_csv``

        Raises:
            IndexError: if ``surface_id`` is out of range
        """
        # gather info for current surface
        start = time.perf_counter()
        # indexing (unlike the .get calls below) fails for an unknown surface
        surface_name = self.surface_names[surface_id]
        print(f"surface_name: {time.perf_counter() - start}")
        stat_names = self.stats_names.get(surface_id)
        print(f"stat_names: {time.perf_counter() - start}")
        stat_values = self.stats_values.get(surface_id)
        print(f"stat_values: {time.perf_counter() - start}")
        object_id = self.object_ids.get(surface_id)
        print(f"object_id: {time.perf_counter() - start}")
        factor = self.factors.get(surface_id)
        print(f"factor: {time.perf_counter() - start}")

        # update channel and surface names (both return new frames, the
        # cached stats names are left untouched so repeated calls agree)
        stat_names = self.update_channel_info(stats_names=stat_names, factor=factor)
        print(f"stat_names_channel: {time.perf_counter() - start}")
        stat_names = self.update_surface_info(stats_names=stat_names, factor=factor)
        print(f"stat_names_surfaces: {time.perf_counter() - start}")

        # filter stats values by object ids (ie: ignore info related to trackids)
        stat_values = self.filter_stats(
            stats_values=stat_values,
            filter_col_names=["ID_Object"],
            filter_values=[object_id],
        )
        print(f"filtered_stat_values: {time.perf_counter() - start}")

        # organize stats values
        organized_stats = self.organize_stats(stat_values)
        print(f"organized_stats: {time.perf_counter() - start}")

        # generate csv
        stats_df = self.generate_csv(organized_stats, stat_names=stat_names)
        print(f"stats_df: {time.perf_counter() - start}")

        return stats_df

    def filter_stats(
        self,
        stats_values: pd.DataFrame,
        filter_col_names: List[str],
        filter_values: List[pd.Series],
    ) -> pd.DataFrame:
        """
        Filters the stats values dataframe. For every
        (column name, values) pair only the rows whose column value is
        contained in the given values are kept; the filters are applied
        one after the other.

        Args:
            stats_values (pd.DataFrame): stats values of a single surface
            filter_col_names (List[str]): names of the columns used to filter
            filter_values (List[pd.Series]): for each column name, the
                values that we want to keep

        Returns:
            pd.DataFrame: the rows that passed every filter
        """
        # for surface parser need to filter out track id information
        # and statistics related to track information.
        for col_names, values in zip(filter_col_names, filter_values):
            stats_values = stats_values[stats_values[col_names].isin(values=values)]

        return stats_values

    def extract_and_save(self):
        """Placeholder for the externally called entry point (not implemented)."""
        # this function is the funtion that gets called externally
        # we can have this function as a ray method to help with distributed execution
        pass

    def get_available_stat_names(self):
        """Placeholder for returning the available stat names (not implemented)."""
        # interacts with data object and returns requested data for inspection
        pass

    def update_stats_with_real_names(
        self, surface_name: str, stats_names: Dict, user_defined_list: List
    ) -> Dict:
        """
        Update the stats names according to the real surface names found
        inside Contents->SurfaceName->Factor

        ``stats_names`` is updated in place and also returned.

        Args:
            surface_name (str): the name of the surface to extract data from
            stats_names (Dict): stats_names dictionary
            user_defined_list (List): list of stats name given by ...
                ...the user to be replaced by the real surface names

        Returns:
            Dict: stats name dict with the updated surface names
        """
        real_stats_names = self.ims.get_real_surface_names(surface_name)
        filtered_dicts = [
            self.get_filtered_stat_names(stats_names, keyword, exact=True)
            for keyword in user_defined_list
        ]
        # the n-th key of every filtered dict receives the n-th real name
        for filtered in filtered_dicts:
            for idx, key in enumerate(filtered):
                stats_names[key] = real_stats_names[idx]

        return stats_names

    @staticmethod
    def _rename_by_factor(
        stats_names: pd.DataFrame, factor: pd.DataFrame, factor_name: str, make_name
    ) -> pd.DataFrame:
        """
        Returns a copy of ``stats_names`` whose "Name" column is rewritten
        for every row whose ID_FactorList matches an ID_List entry of the
        ``factor`` rows named ``factor_name``.

        Args:
            stats_names (pd.DataFrame): table with "ID_FactorList" and "Name"
            factor (pd.DataFrame): table with "Name", "ID_List" and "Level"
            factor_name (str): value of the factor "Name" column to select
            make_name (callable): ``make_name(name, level)`` builds the new
                name of a matching row

        Returns:
            pd.DataFrame: renamed copy; ``stats_names`` itself is untouched
        """
        # the factor rows of interest do not depend on the stats row, so
        # they are selected once instead of once per row
        factor_rows = factor[factor["Name"] == factor_name]
        factor_ids = factor_rows["ID_List"].to_list()

        def rename(factor_id, name):
            # rows that are not linked to this factor keep their name
            if factor_id not in factor_ids:
                return name
            # .item() insists on exactly one matching factor row
            level = factor_rows.loc[factor_rows["ID_List"] == factor_id, "Level"].item()
            return make_name(name, level)

        # work on a copy: the caller usually passes the frame cached on the
        # instance, and renaming it in place would stack up suffixes on
        # every call
        renamed = stats_names.copy()
        renamed["Name"] = [
            rename(factor_id, name)
            for factor_id, name in zip(stats_names["ID_FactorList"], stats_names["Name"])
        ]
        return renamed

    def update_channel_info(
        self, stats_names: pd.DataFrame, factor: pd.DataFrame
    ) -> pd.DataFrame:
        """
        Updates the channel information for the relevant rows
        based on the ID_FactorList information in stats_names.

        A row whose ID_FactorList matches the ID_List of a "Channel" factor
        row gets " Channel_<Level>" appended to its name; all other rows
        are left as they are. The input frame is not modified.

        Args:
            stats_names (pd.DataFrame): stats names table with the columns
                "ID_FactorList" and "Name"
            factor (pd.DataFrame): factor table with the columns "Name",
                "ID_List" and "Level"

        Returns:
            pd.DataFrame: copy of ``stats_names`` with the updated names
        """
        return self._rename_by_factor(
            stats_names,
            factor,
            "Channel",
            lambda name, level: f"{name} Channel_{level}",
        )

    def update_surface_info(
        self, stats_names: pd.DataFrame, factor: pd.DataFrame
    ) -> pd.DataFrame:
        """
        Updates the surface name information for the relevant rows
        based on the ID_FactorList information in stats_names.

        A row whose ID_FactorList matches the ID_List of a "Surfaces"
        factor row has its name replaced by that factor row's Level; all
        other rows are left as they are. The input frame is not modified.

        Args:
            stats_names (pd.DataFrame): stats names table with the columns
                "ID_FactorList" and "Name"
            factor (pd.DataFrame): factor table with the columns "Name",
                "ID_List" and "Level"

        Returns:
            pd.DataFrame: copy of ``stats_names`` with the updated names
        """
        return self._rename_by_factor(
            stats_names,
            factor,
            "Surfaces",
            lambda name, level: level,
        )

    def inspect(self, surface_id: int) -> Dict:
        """
        Used to inspect intermediate steps in the
        parser's process.

        Runs the same steps as ``process`` (without the timing output) and
        collects every intermediate result.

        Args:
            surface_id (int): position of the surface in
                ``self.surface_names``

        Returns:
            Dict: intermediate results under the keys "surface_name",
                "stat_names_raw", "stat_values_raw", "object_id", "factor",
                "stat_names_channel_added", "stat_names_surface_added",
                "filtered_stat_values", "organized_stats" and "stats_df"
        """
        storage = {}
        surface_name = self.surface_names[surface_id]
        storage["surface_name"] = surface_name
        stat_names = self.stats_names.get(surface_id)
        storage["stat_names_raw"] = deepcopy(stat_names)
        stat_values = self.stats_values.get(surface_id)
        storage["stat_values_raw"] = stat_values
        object_id = self.object_ids.get(surface_id)
        storage["object_id"] = object_id
        factor = self.factors.get(surface_id)
        storage["factor"] = factor

        # update channel and surface names
        stat_names = self.update_channel_info(stats_names=stat_names, factor=factor)
        storage["stat_names_channel_added"] = deepcopy(stat_names)
        stat_names = self.update_surface_info(stats_names=stat_names, factor=factor)
        storage["stat_names_surface_added"] = deepcopy(stat_names)

        # filter stats values by object ids (ie: ignore info related to trackids)
        stat_values = self.filter_stats(
            stats_values=stat_values,
            filter_col_names=["ID_Object"],
            filter_values=[object_id],
        )
        storage["filtered_stat_values"] = deepcopy(stat_values)

        # organize stats values
        organized_stats = self.organize_stats(stat_values)
        storage["organized_stats"] = organized_stats

        # generate csv
        stats_df = self.generate_csv(organized_stats, stat_names=stat_names)
        storage["stats_df"] = stats_df

        return storage

In [5]:
# Build the parser; construction opens the file and caches the per-surface tables.
parser = SurfaceParser(ims_file_path=data_path)

In [5]:
# Run the pipeline on the first surface; each step prints its elapsed time.
df = parser.process(0)

surface_name: 1.8440186977386475e-06
stat_names: 0.0005665420321747661
stat_values: 0.0005804289830848575
object_id: 0.0005901750409975648
factor: 0.0005994890816509724
stat_names_channel: 0.08809218904934824
stat_names_surfaces: 0.12369912106078118
filtered_stat_values: 0.2227470630314201
organized_stats: 7.501630506012589
stats_df: 8.530978473019786


In [6]:
# working surface parser

Unnamed: 0,Area,BoundingBoxAA Length X,BoundingBoxAA Length Y,BoundingBoxAA Length Z,BoundingBoxOO Length A,BoundingBoxOO Length B,BoundingBoxOO Length C,Center of Homogeneous Mass X,Center of Homogeneous Mass Y,Center of Homogeneous Mass Z,...,Position Z,Surfaces Unnamed 2 CB.icsx_[ibrx_2022-06-27T12-49-56.078],"Surfaces Unnamed 1 CB.icsx_[ibrx_2022-06-27T12-49-56.078] Selection [""Intensity Max Ch=6 Img=1"" above 7.26 ]","Surfaces Unnamed 1 CB.icsx_[ibrx_2022-06-27T12-49-56.078] Selection 2 [""Intensity Max Ch=6 Img=1"" below 7.26 ]","Surfaces Unnamed 2 CB.icsx_[ibrx_2022-06-27T12-49-56.078] Selection [""Intensity Max Ch=5 Img=1"" above 138 ]",Sphericity,Time,Time Index,Volume,Object_ID
0,1416.449463,15.445801,14.537109,49.483913,12.435013,14.216797,39.194008,2201.750977,1098.804321,8.719857,...,8.719857,103.984497,70.556183,0.0,103.984497,0.728735,0.0,1.0,3118.384521,0
1,1439.677612,4.542725,18.171509,49.483913,4.381836,16.885925,48.666321,2215.124268,1084.293945,11.086892,...,11.086892,108.208221,83.030891,0.0,108.208221,0.407894,0.0,1.0,1338.113403,1
2,4187.246094,22.714355,24.531494,69.277481,19.336182,28.558105,62.139992,2206.579102,1086.499512,18.480581,...,18.480581,99.872391,74.137321,0.0,99.872391,0.627070,0.0,1.0,12651.461914,2
3,1999.841187,19.080078,15.445801,49.483917,14.625244,18.227539,47.446346,2207.949219,1075.563354,38.868191,...,38.868191,102.785751,80.786064,0.0,102.785751,0.757118,0.0,1.0,5540.032715,3
4,1048.769287,8.177002,13.628540,49.483917,7.214600,13.120605,38.128593,2213.926758,1059.961060,57.340115,...,57.340115,114.359413,96.914055,0.0,114.359413,0.655737,0.0,1.0,1695.852051,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34189,1317.524536,7.268604,9.994263,108.864609,8.381348,8.808960,93.008011,1.756580,1114.789185,23.244741,...,23.244741,50.536652,34.976704,0.0,50.536652,0.487673,0.0,1.0,1531.459106,34189
34190,1717.538818,23.623047,13.628418,39.587128,11.199585,21.944458,38.052277,2169.155762,1112.963501,101.627502,...,101.627502,60.343472,47.800472,0.0,60.343472,0.617823,0.0,1.0,3250.347412,34190
34191,1569.860229,19.988647,15.445557,39.587132,14.575073,17.861328,33.454041,779.363342,1117.380249,6.088751,...,6.088751,62.313019,73.274330,0.0,92.715706,0.830725,0.0,1.0,4428.447754,34191
34192,910.736938,12.720093,13.628418,39.587143,11.377655,12.901367,31.624489,725.254150,1113.939209,90.908417,...,90.908417,62.687698,89.263580,0.0,102.113998,0.853312,0.0,1.0,2037.160156,34192
