In [1]:
import h5py
import numpy as np
import pandas as pd
import re
import yaml
import os
from collections.abc import MutableMapping
from typing import Dict, List, Tuple
from exceptions import *
from copy import deepcopy
import collections
from parser_base import Parser
from imaris import ImarisDataObject
import ray

In [35]:
# Relative path of the Imaris (.ims) file handed to SurfaceParser further down.
data_path = (
    "../../data/multi_surface_track_parser_dev_data/YFP #1 Sec2 Roi1 2x2 1h shehan.ims"
)
# Sanity check shown as the cell output: True when the file exists at that path.
os.path.isfile(data_path)

True

In [36]:
# @ray.remote
class SurfaceParser(Parser):
    """
    Extracts Surface Level Information From Imaris File

    The parser wraps an ``ImarisDataObject`` and reshapes the statistics of
    one surface (one row per object/statistic pair) into a table with one
    row per object and one column per statistic name.

    Args:
        Parser (ABCMeta): Parser Abstract Base Class
    """

    def __init__(self, ims_file_path: str) -> None:
        """
        Wrap the given file and look up the names of its Surface objects.

        Args:
            ims_file_path (str): path of the Imaris file to parse
        """
        self.ims_file_path = ims_file_path
        self.ims = ImarisDataObject(self.ims_file_path)
        self.surface_names = self.ims.get_object_names("Surface")

    def organize_stats(self, stats_values: pd.DataFrame) -> Dict:
        """
        Organize the data such that it looks like
        {ID_Object: {Stats Name: Value}}

        Args:
            stats_values (pd.DataFrame): a single dataframe that contains
                the statistics for a single surface; the columns
                ``ID_Object``, ``Names`` and ``Value`` are read

        Returns:
            Dict: one entry per object id holding that object's
            {stat name: value} mapping. Repeated names within one object
            collapse to a single entry. An empty input yields an empty dict.
        """
        if stats_values.empty:
            # groupby/apply on an empty frame does not produce the nested
            # {"Value": {...}} structure unpacked below, so short-circuit.
            return {}

        grouped_stats = (
            stats_values.groupby("ID_Object")[["Names", "Value"]]
            .apply(lambda x: x.set_index("Names").to_dict(orient="dict"))
            .to_dict()
        )
        # Each group was converted to {"Value": {name: value}}; drop that
        # outer level so callers get {ID_Object: {name: value}} directly.
        return {k: v["Value"] for k, v in grouped_stats.items()}

    def generate_csv(self, stats_values: Dict) -> pd.DataFrame:
        """
        Turn organized statistics into a table with one row per object.

        Args:
            stats_values (Dict): {ID_Object: {Stats Name: Value}} as
                produced by ``organize_stats``

        Returns:
            pd.DataFrame: rows indexed by object id, one column per
            statistic name, plus an ``Object_ID`` column mirroring the index
        """
        dataframe = pd.DataFrame(stats_values).transpose()
        dataframe["Object_ID"] = dataframe.index
        return dataframe

    def save_csv(self):
        """Placeholder for writing csv information to disk (not implemented)."""
        # a function to write csv information to disk
        pass

    def process(self, surface_id: int) -> pd.DataFrame:
        """
        Runs a single end to end parser pipeline on a single surface

        Steps currently implemented:
            - get stat names for a single surface
            - get stat values for a single surface
            - attach the stat names to the stat values
            - keep only rows whose ``ID_Object`` is one of the ids returned
              by ``get_object_ids`` for the surface
            - organize the filtered stats
            - build the per-object table

        Not implemented yet: removing track level stat information,
        renaming certain columns (channel info) and saving the csv.

        Args:
            surface_id (int): index into ``self.surface_names``

        Returns:
            pd.DataFrame: the table built by ``generate_csv``
        """
        surface_name = self.surface_names[surface_id]
        stat_names = self.ims.get_stats_names(surface_name)  # stat names
        if isinstance(stat_names, tuple):
            # The interactive cells in this notebook unpack get_stats_names
            # into (names_dict, raw_table); only the {stat_id: stat_name}
            # mapping can be used with Series.map below.
            stat_names = stat_names[0]
        stat_values = self.ims.get_stats_values(surface_name)  # stat values
        stat_values = self.map_names_to_id(stat_values, stat_names)  # maps name to id
        object_ids = self.ims.get_object_ids(surface_name)  # ids to keep
        filtered_stat_values = self.filter_stats(
            stats_values=stat_values,
            filter_col_names=["ID_Object"],
            filter_values=[object_ids],
        )
        organized_stats = self.organize_stats(filtered_stat_values)
        stats_df = self.generate_csv(organized_stats)

        return stats_df

    def filter_stats(
        self,
        stats_values: pd.DataFrame,
        filter_col_names: List[str],
        filter_values: List[pd.Series],
    ) -> pd.DataFrame:
        """
        Filters the stats values dataframe. For every (column, values) pair
        only the rows whose column entry is contained in values are kept;
        the pairs are applied one after another.

        Args:
            stats_values (pd.DataFrame): dataframe to filter
            filter_col_names (List[str]): names of the columns to filter on
            filter_values (List[pd.Series]): values to keep, one collection
                per entry of ``filter_col_names``

        Raises:
            ValueError: if the two lists differ in length

        Returns:
            pd.DataFrame: the rows that passed every filter
        """
        # for surface parser need to filter out track id information
        # and statistics related to track information.
        if len(filter_col_names) != len(filter_values):
            # zip() would silently ignore the surplus entries otherwise.
            raise ValueError(
                "filter_col_names and filter_values must have the same length"
            )

        for col_name, values in zip(filter_col_names, filter_values):
            stats_values = stats_values[stats_values[col_name].isin(values)]

        return stats_values

    def extract_and_save(self):
        """Placeholder for the externally called entry point (not implemented)."""
        # this function is the function that gets called externally
        # we can have this function as a ray method to help with distributed execution
        pass

    def get_available_stat_names(self):
        """Placeholder for returning stat names for inspection (not implemented)."""
        # interacts with data object and returns requested data for inspection
        pass

    def get_filtered_stat_names(
        self,
        stats_names: Dict,
        keyword: bytes = b"Track",
        exact: bool = False,
        include: bool = True,
    ) -> Dict:
        """
        Filter a {stat_id: stat_name} dict by keyword.

        Args:
            stats_names (Dict): {stat_id: stat_name}
            keyword (bytes, optional): text to look for. Defaults to b"Track".
                It must be of the same type (bytes or str) as the names.
                When ``exact`` is False it is used as a regular expression
                pattern, so special characters keep their regex meaning.
            exact (bool, optional): keep only names equal to ``keyword``.
                Defaults to False.
            include (bool, optional): only used when ``exact`` is False.
                True keeps the names that match ``keyword``, False keeps
                the names that do not. Defaults to True.

        Returns:
            Dict: the filtered {stat_id: stat_name} dict
        """
        if exact:
            # returns only the exact match to keyword
            return {
                stat_id: stat_name
                for stat_id, stat_name in stats_names.items()
                if keyword == stat_name
            }

        # returns even partial matches to keyword (or, with include=False,
        # everything that does not match)
        return {
            stat_id: stat_name
            for stat_id, stat_name in stats_names.items()
            if (re.search(keyword, stat_name) is not None) == include
        }

    def map_names_to_id(
        self, stat_values: pd.DataFrame, stat_names: Dict
    ) -> pd.DataFrame:
        """
        Adds a ``Names`` column holding the stat name that belongs to each
        row's ``ID_StatisticsType``.

        The input dataframe is left untouched; a new dataframe is returned.

        Args:
            stat_values (pd.DataFrame): dataframe with an
                ``ID_StatisticsType`` column
            stat_names (Dict): {stats_ids: stats_name}

        Returns:
            pd.DataFrame: copy of ``stat_values`` with the ``Names`` column;
            ids missing from ``stat_names`` map to NaN
        """
        return stat_values.assign(
            Names=stat_values["ID_StatisticsType"].map(stat_names)
        )

    def update_stats_with_real_names(
        self, surface_name: str, stats_names: Dict, user_defined_list: List
    ) -> Dict:
        """
        Update the stats names according to the real surface names found
        inside Contents->SurfaceName->Factor

        For every keyword, the stats whose name equals the keyword are
        looked up in ``stats_names`` and the idx-th match is renamed to the
        idx-th real surface name. The input dict is not modified.

        Args:
            surface_name (str): the name of the surface to extract data from
            stats_names (Dict): stats_names dictionary
            user_defined_list (List): list of stats name given by ...
                ...the user to be replaced by the real surface names

        Returns:
            Dict: stats name dict with the updated surface names
        """
        real_stats_names = self.ims.get_real_surface_names(surface_name)
        updated_names = dict(stats_names)
        for keyword in user_defined_list:
            # Match against the unmodified input so an earlier replacement
            # can never influence a later keyword lookup.
            matches = self.get_filtered_stat_names(stats_names, keyword, exact=True)
            # NOTE(review): assumes at least as many real surface names as
            # matches per keyword; otherwise this raises IndexError - confirm.
            for idx, stat_id in enumerate(matches):
                updated_names[stat_id] = real_stats_names[idx]

        return updated_names

In [37]:
# Build the parser for the development file; the constructor wraps the file in
# an ImarisDataObject and looks up its Surface object names.
parser = SurfaceParser(data_path)

In [60]:
# Fetch the statistic names of surface "MegaSurfaces1". The call is unpacked
# into two values: an {id: name} mapping and a table; both are displayed below.
stats_names, stats_df = parser.ims.get_stats_names("MegaSurfaces1")

In [61]:
# Display the {statistic id: name} mapping; the names are bytes and some names
# (e.g. b'Center of Image Mass X') appear under several ids.
stats_names

{42793: b'Acceleration',
 42940: b'Acceleration X',
 42941: b'Acceleration Y',
 42942: b'Acceleration Z',
 42801: b'Area',
 43113: b'BoundingBoxAA Length X',
 43114: b'BoundingBoxAA Length Y',
 43115: b'BoundingBoxAA Length Z',
 43116: b'BoundingBoxOO Length A',
 43117: b'BoundingBoxOO Length B',
 43118: b'BoundingBoxOO Length C',
 43119: b'Center of Homogeneous Mass X',
 43120: b'Center of Homogeneous Mass Y',
 43121: b'Center of Homogeneous Mass Z',
 43122: b'Center of Image Mass X',
 43123: b'Center of Image Mass X',
 43124: b'Center of Image Mass X',
 43125: b'Center of Image Mass X',
 47408: b'Center of Image Mass X',
 43126: b'Center of Image Mass Y',
 43127: b'Center of Image Mass Y',
 43128: b'Center of Image Mass Y',
 43129: b'Center of Image Mass Y',
 47409: b'Center of Image Mass Y',
 43130: b'Center of Image Mass Z',
 43131: b'Center of Image Mass Z',
 43132: b'Center of Image Mass Z',
 43133: b'Center of Image Mass Z',
 47410: b'Center of Image Mass Z',
 42819: b'Displacem

In [66]:
# Display the table returned alongside the names (columns ID, ID_Category,
# ID_FactorList, Name, Unit).
stats_df

Unnamed: 0,ID,ID_Category,ID_FactorList,Name,Unit
0,42793,0,0,b'Acceleration',b''
1,42793,0,0,b'Acceleration',b''
2,42793,0,0,b'Acceleration',b''
3,42793,0,0,b'Acceleration',b''
4,42793,0,0,b'Acceleration',b''
...,...,...,...,...,...
9101,42938,0,0,b'Volume',b''
9102,42938,0,0,b'Volume',b''
9103,42938,0,0,b'Volume',b''
9104,42938,0,0,b'Volume',b''


In [58]:
# Distinct ID_FactorList values that occur in the table.
np.unique(stats_df["ID_FactorList"].to_numpy())

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40])

In [62]:
# FactorList entry of MegaSurfaces1 as a NumPy array; the output shows records
# with a single 'ID' field.
np.array(parser.ims.data["Scene8"]["Content"]["MegaSurfaces1"]["FactorList"])

array([( 1,), ( 2,), ( 3,), ( 4,), ( 5,), ( 6,), ( 7,), ( 8,), ( 9,),
       (10,), (11,), (12,), (13,), (14,), (15,), (16,), (17,), (18,),
       (19,), (20,), (21,), (22,), (23,), (24,), (25,), (26,), (27,),
       (28,), (29,), (30,), (31,), (32,), (33,), (34,), (35,), (36,),
       (37,), (38,), (39,), (40,)], dtype=[('ID', '<i8')])

In [63]:
# Factor entry of MegaSurfaces1 as a NumPy array; the output shows records made
# of an id, a kind (b'Collection', b'Channel', b'Image') and a value.
np.array(parser.ims.data["Scene8"]["Content"]["MegaSurfaces1"]["Factor"])

array([( 1, b'Collection', b'Acceleration Vector'),
       ( 2, b'Collection', b'BoundingBoxAA Length'),
       ( 3, b'Collection', b'BoundingBoxOO Length'),
       ( 4, b'Collection', b'Center of Homogeneous Mass'),
       ( 5, b'Channel', b'1'),
       ( 5, b'Collection', b'Center of Image Mass'),
       ( 5, b'Image', b'Image 1'), ( 6, b'Channel', b'2'),
       ( 6, b'Collection', b'Center of Image Mass'),
       ( 6, b'Image', b'Image 1'), ( 7, b'Channel', b'3'),
       ( 7, b'Collection', b'Center of Image Mass'),
       ( 7, b'Image', b'Image 1'), ( 8, b'Channel', b'4'),
       ( 8, b'Collection', b'Center of Image Mass'),
       ( 8, b'Image', b'Image 1'), ( 9, b'Channel', b'5'),
       ( 9, b'Collection', b'Center of Image Mass'),
       ( 9, b'Image', b'Image 1'),
       (10, b'Collection', b'Displacement Delta'),
       (11, b'Collection', b'Displacement'),
       (12, b'Collection', b'Distance from Origin'),
       (13, b'Image', b'Image 1'),
       (14, b'Collection', b'Ell

In [124]:
# Statistic names that should be replaced by the real surface names.
l = [
    b"Overlapped Volume Ratio to Surfaces",
    b"Overlapped Volume to Surfaces",
    b"Shortest Distance to Surfaces",
]
# NOTE(review): the visible cell that loads stats_names uses "MegaSurfaces1",
# while this call passes "MegaSurfaces0" - confirm stats_names was loaded for
# the same surface before running this cell.
out = parser.update_stats_with_real_names("MegaSurfaces0", stats_names, user_defined_list=l)

In [125]:
def add_channel_info(stats_values: Dict) -> Dict:
    """
    Add in channel information to each redundant variable name.

    Names that occur only once are kept unchanged. When several stat ids
    share a name, the ids are sorted in ascending order and the name of the
    n-th id (counting from 1) becomes ``<name>_channel_<n>``. Example: if
    there are multiple ImageMean strings, each ImageMean in ascending id
    order is renamed ImageMean_channel_1, ImageMean_channel_2, etc.

    Assumes there are no nested dictionaries. The suffix keeps the type of
    the name: bytes names get a bytes suffix, so a mapping of bytes names
    stays a mapping of bytes names.

    Args:
        stats_values (Dict): a single dictionary with key = stat id,
            value = stat name (str or bytes)

    Return:
        Dict: a new dictionary with key = stat id, value = stat name, where
        every repeated name carries its ``_channel_<n>`` suffix. The input
        is not modified.
    """
    # map every name to the ids that carry it (first-seen order of names)
    ids_by_name = collections.defaultdict(list)
    for stat_id, name in stats_values.items():
        ids_by_name[name].append(stat_id)

    # rename each id with the updated string
    updated_stats_values = {}
    for name, stat_ids in ids_by_name.items():
        if len(stat_ids) == 1:
            updated_stats_values[stat_ids[0]] = name
            continue

        for idx, stat_id in enumerate(sorted(stat_ids), start=1):
            suffix = f"_channel_{idx}"
            if isinstance(name, bytes):
                # An f-string would embed the repr (b'...') of a bytes name,
                # so concatenate an encoded suffix instead.
                updated_stats_values[stat_id] = name + suffix.encode("utf-8")
            else:
                updated_stats_values[stat_id] = f"{name}{suffix}"

    return updated_stats_values

In [141]:
# Keep only the statistics whose name does NOT match b"Surface"
# (exact=False with include=False selects the non-matching entries).
filtered_out = parser.get_filtered_stat_names(out, keyword=b"Surface", exact=False, include=False)

In [142]:
# Suffix repeated names with channel information. Note that `g` is rebound by
# a later cell that decodes the names.
g =  add_channel_info(filtered_out)

In [153]:
# Display stats_names again.
# NOTE(review): the ids shown here differ from the earlier display, so
# stats_names was presumably re-assigned by a cell run that is not visible in
# this order - confirm before relying on it.
stats_names

{26071: b'Acceleration',
 26072: b'Acceleration X',
 26073: b'Acceleration Y',
 26074: b'Acceleration Z',
 24927: b'Area',
 26075: b'BoundingBoxAA Length X',
 26076: b'BoundingBoxAA Length Y',
 26077: b'BoundingBoxAA Length Z',
 26078: b'BoundingBoxOO Length A',
 26079: b'BoundingBoxOO Length B',
 26080: b'BoundingBoxOO Length C',
 26081: b'Center of Homogeneous Mass X',
 26082: b'Center of Homogeneous Mass Y',
 26083: b'Center of Homogeneous Mass Z',
 26084: b'Center of Image Mass X',
 26085: b'Center of Image Mass X',
 26086: b'Center of Image Mass X',
 26087: b'Center of Image Mass X',
 26088: b'Center of Image Mass X',
 26089: b'Center of Image Mass Y',
 26090: b'Center of Image Mass Y',
 26091: b'Center of Image Mass Y',
 26092: b'Center of Image Mass Y',
 26093: b'Center of Image Mass Y',
 26094: b'Center of Image Mass Z',
 26095: b'Center of Image Mass Z',
 26096: b'Center of Image Mass Z',
 26097: b'Center of Image Mass Z',
 26098: b'Center of Image Mass Z',
 26099: b'Displacem

In [150]:
# Collect just the names (the dict values) of the filtered mapping.
h = list(filtered_out.values())

In [151]:
# Decode the bytes names to str. This rebinds `g`, replacing the
# add_channel_info result assigned to it in an earlier cell.
g = [i.decode("utf-8") for i in h]

In [152]:
# Display the decoded names.
g

['Acceleration',
 'Acceleration X',
 'Acceleration Y',
 'Acceleration Z',
 'Area',
 'BoundingBoxAA Length X',
 'BoundingBoxAA Length Y',
 'BoundingBoxAA Length Z',
 'BoundingBoxOO Length A',
 'BoundingBoxOO Length B',
 'BoundingBoxOO Length C',
 'Center of Homogeneous Mass X',
 'Center of Homogeneous Mass Y',
 'Center of Homogeneous Mass Z',
 'Center of Image Mass X',
 'Center of Image Mass X',
 'Center of Image Mass X',
 'Center of Image Mass X',
 'Center of Image Mass X',
 'Center of Image Mass Y',
 'Center of Image Mass Y',
 'Center of Image Mass Y',
 'Center of Image Mass Y',
 'Center of Image Mass Y',
 'Center of Image Mass Z',
 'Center of Image Mass Z',
 'Center of Image Mass Z',
 'Center of Image Mass Z',
 'Center of Image Mass Z',
 'Displacement Delta Length',
 'Displacement Delta X',
 'Displacement Delta Y',
 'Displacement Delta Z',
 'Displacement Length',
 'Displacement X',
 'Displacement Y',
 'Displacement Z',
 'Displacement^2',
 'Distance from Origin',
 'Distance to Image B

In [None]:
# out = ray.get([p.process.remote(i) for i, p in enumerate(parser_ray)])