In [1]:
import h5py
import numpy as np
import pandas as pd
import re
import yaml
import os
from collections.abc import MutableMapping
from typing import Dict, List
from exceptions import *
from copy import deepcopy
import collections

In [101]:
# data_path = (
#     "../../data/multi_surface_track_parser_dev_data/GFP #1 Sec1 Roi2 2x2 1h30min.ims"
# )

In [102]:
data_path = (
    "../../data/multi_surface_track_parser_dev_data/GFP #1 Sec2 Roi1 2x2 1h30min.ims"
)

In [103]:
class ImarisDataParser(object):
    def __init__(self, ims_file_path: str) -> None:
        self.data = self.load_ims(ims_file_path)
        self.surface_names = self.get_object_names(search_for="Surface")

    def load_ims(self, ims_file_path: str) -> h5py.File:
        """
        loads a imaris file with .ims extension using h5py

        Args:
            ims_file_path (str): path to .ims file

        Returns:
            h5py.File: returns the data contained within the .ims file
        """
        return h5py.File(ims_file_path, "r+")

    def get_object_names(self, search_for: str) -> List:
        """
        extracts the object names that we are interested in.
        objects are usually surfaces( is this always true?)

        Args:
            full_data_file (h5py.File): full imaris file in h5py File format
            search_for (str): string containing full or partial filename to search for

        Returns:
            list: a list of all the object names that match search_for parameter
        """
        try:
            values = self.data.get("Scene8").get("Content").keys()
            storage = list()
            for item in values:
                if len(re.findall(search_for, item)):
                    storage.append(item)
            return storage
        except AttributeError:
            raise NoSurfaceException

    def get_statistics_names(self, object_name: str) -> Dict:
        """
        for a given object_name, extracts the statistics names and ids into a dict
        ex: statistics name = mean intensity, associated id=404

        Args:
            full_data_file (h5py._hl.files.File): full imaris file in h5py File format
            object_name (str): name of the object to get statistic names from

        Returns:
            dict: a dict where the keys=unique stats ID, value=static name
        """

        # get object specific data
        obj_specific_data = self.data.get("Scene8").get("Content").get(object_name)

        # rearrange data
        statistics_name = np.asarray(obj_specific_data["StatisticsType"])
        statistics_name = pd.DataFrame(statistics_name)

        # extract statistics names
        stats_name = statistics_name["Name"]

        # extract statistics ID names
        stats_type = statistics_name["ID"]

        # combine stats type and stats names
        return dict(zip(stats_type, stats_name))

    def get_stats_values(self, object_name: str) -> pd.DataFrame:
        """
        for a given object_name, extracts the statistics values for all object ids
        within the object

        Args:
            full_data_file (h5py._hl.files.File): full imaris file in h5py File format
            object_name (str): name of the object to get statistic names from

        Returns:
            pd.DataFrame: a pandas data frame that contains information about each object id
            where each object id has a stats id and associated stats value.
        """
        # get the track ids
        obj_specific_stats = self.data.get("Scene8").get("Content")[object_name][
            "StatisticsValue"
        ]
        obj_specific_stats = np.asarray(obj_specific_stats)
        return pd.DataFrame(obj_specific_stats)

    def add_channel_info(self, stats_values: Dict) -> Dict:
        """
        Add in channel information to each redundant variable name
        Assumes there are no nested dictionaries. Example: if theres multiple ImageMean strings then each
        ImageMean in acending id order will be renamed ImageMean_channel_1 etc.

        Return:
            stats_values (Dict): a single dictionary with key = stat id, value = state name
        """
        # get counts
        counts = collections.Counter(list(stats_values.values()))

        # map names to id numbers
        inverse_dict = {k: [] for k in counts.keys()}
        for id, name in stats_values.items():
            inverse_dict[name].append(id)

        # rename each id with the updated string
        updated_stats_values = {}
        for name, data in inverse_dict.items():
            if len(data) == 1:
                updated_stats_values[data.pop()] = name
            elif len(data) > 1:
                data = sorted(data)
                for idx, stat_id in enumerate(data):
                    updated_stats_values[stat_id] = f"{name}_channel_{idx+1}"
            else:
                raise ValueError

        return updated_stats_values

    def get_track_ids(self, object_name: str) -> pd.Series:
        """
        Gets all the track ids for a given surface
        Args:
            full_data_file (h5py.File): _description_
            object_name (str): _description_
        """
        try:
            track_ids = pd.DataFrame(
                np.asarray(
                    self.data.get("Scene8").get("Content")[object_name]["Track0"]
                )
            )
            return track_ids["ID"]
        except KeyError:
            raise NoTrackException

    def flatten(self, input_dict: dict, parent_key="", sep="_") -> Dict:
        """
        flattens a nested dictionary

        Args:
            input_dict (dict): _description_
            parent_key (str, optional): _description_. Defaults to ''.
            sep (str, optional): _description_. Defaults to '_'.

        Returns:
            dict: flattened dictionary
        """

        items = []
        for k, v in input_dict.items():
            new_key = parent_key + sep + k if parent_key else k
            if isinstance(v, MutableMapping):
                items.extend(self.flatten(v, new_key, sep=sep).items())
            else:
                items.append((new_key, v))
        return dict(items)

    def generate_stats_names(self) -> Dict[str, Dict[int, str]]:
        """Uses the surface name information from the imaris data file
        and extracts the numerical statistics id and the string
        statistics names for each Surface

        Returns:
            Dict[str, Dict[int, str]]: key=SurfaceName, value=Statistics Dict[id, name]
        """
        object_stats_dict = {}
        for _, name in enumerate(self.surface_names):
            stats_dict = self.get_statistics_names(object_name=name)
            object_stats_dict[name] = stats_dict

        return object_stats_dict

    def generate_stats_values(self) -> Dict[str, Dict[int, pd.DataFrame]]:
        """Uses the surface name information from the imaris data file
        and extracts the numerical statistics id and the associated
        statistics values for each surface

        Returns:
            Dict[str, Dict[int, pd.DataFrame]]: _description_
        """
        stats_values_dict = {}
        for _, name in enumerate(self.surface_names):
            stats_value_dict = self.get_stats_values(object_name=name)
            stats_values_dict[name] = stats_value_dict

        return stats_values_dict

    def map_name_to_id(self, stats_values: Dict, stats_names: Dict):
        storage = {}
        for name in stats_names:
            mapping_dict = stats_names.get(name)
            dataframe = stats_values[name]
            dataframe["Names"] = dataframe["ID_StatisticsType"].map(mapping_dict)
            storage[name] = dataframe
        return storage

    def get_surface_names(self, surface_name: str):
        arr = self.data.get("Scene8").get("Content").get(surface_name).get("Factor")
        surface_names = []
        for item in arr:
            if len(re.findall(b"Surface", item)):
                surface_names.append(item[-1])
        return surface_names

In [104]:
parser = ImarisDataParser(data_path)
parser.surface_names

['MegaSurfaces0',
 'MegaSurfaces1',
 'MegaSurfaces2',
 'MegaSurfaces3',
 'MegaSurfaces4',
 'MegaSurfaces5',
 'MegaSurfaces6',
 'MegaSurfaces7']

In [105]:
stats_dict_storage = parser.generate_stats_names()
stats_value_storage = parser.generate_stats_values()
unique_surface_names = {name: parser.get_surface_names(name) for name in parser.surface_names}

In [106]:
stats_dict_storage["MegaSurfaces0"]

{26071: b'Acceleration',
 26072: b'Acceleration X',
 26073: b'Acceleration Y',
 26074: b'Acceleration Z',
 24927: b'Area',
 26075: b'BoundingBoxAA Length X',
 26076: b'BoundingBoxAA Length Y',
 26077: b'BoundingBoxAA Length Z',
 26078: b'BoundingBoxOO Length A',
 26079: b'BoundingBoxOO Length B',
 26080: b'BoundingBoxOO Length C',
 26081: b'Center of Homogeneous Mass X',
 26082: b'Center of Homogeneous Mass Y',
 26083: b'Center of Homogeneous Mass Z',
 26084: b'Center of Image Mass X',
 26085: b'Center of Image Mass X',
 26086: b'Center of Image Mass X',
 26087: b'Center of Image Mass X',
 26088: b'Center of Image Mass X',
 26089: b'Center of Image Mass Y',
 26090: b'Center of Image Mass Y',
 26091: b'Center of Image Mass Y',
 26092: b'Center of Image Mass Y',
 26093: b'Center of Image Mass Y',
 26094: b'Center of Image Mass Z',
 26095: b'Center of Image Mass Z',
 26096: b'Center of Image Mass Z',
 26097: b'Center of Image Mass Z',
 26098: b'Center of Image Mass Z',
 26099: b'Displacem

In [107]:
unique_surface_names['MegaSurfaces0']

[b'Surfaces Unnamed 2 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
 b'Surfaces Unnamed 3 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
 b'Surfaces Unnamed 4 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
 b'Surfaces Unnamed 1 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
 b'Surfaces Unnamed 2 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
 b'Surfaces Unnamed 3 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
 b'Surfaces Unnamed 4 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]']

In [96]:
unique_surface_names['MegaSurfaces1']

[b'Surfaces Unnamed 3 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
 b'Surfaces Unnamed 4 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
 b'Surfaces Unnamed 1 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
 b'Surfaces Unnamed 1 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
 b'Surfaces Unnamed 2 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
 b'Surfaces Unnamed 3 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
 b'Surfaces Unnamed 4 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]']

In [97]:
unique_surface_names['MegaSurfaces2']

[b'Surfaces Unnamed 2 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
 b'Surfaces Unnamed 4 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
 b'Surfaces Unnamed 1 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
 b'Surfaces Unnamed 1 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
 b'Surfaces Unnamed 2 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
 b'Surfaces Unnamed 3 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
 b'Surfaces Unnamed 4 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]']

In [98]:
unique_surface_names['MegaSurfaces3']

[b'Surfaces Unnamed 2 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
 b'Surfaces Unnamed 3 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
 b'Surfaces Unnamed 1 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
 b'Surfaces Unnamed 1 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
 b'Surfaces Unnamed 2 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
 b'Surfaces Unnamed 3 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
 b'Surfaces Unnamed 4 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]']

In [67]:
unique_surface_names['MegaSurfaces3']

[b'Surfaces Unnamed 3 BCGmR.icsx_[ibrx_2023-06-23T16-55-31.963]',
 b'Surfaces Unnamed 1 BCGmR.icsx_[ibrx_2023-06-23T16-55-31.963]',
 b'Surfaces Unnamed 2 BCGmR.icsx_[ibrx_2023-06-23T16-55-31.963]']

In [39]:
def get_surface_names(data):
    arr = data.get("Scene8").get("Content").get("MegaSurfaces1").get("Factor")
    surface_names = []
    for item in arr:
        if len(re.findall(b"Surface", item)):
            surface_names.append(item[-1])

    return surface_names


surface_names = get_surface_names(data)

In [40]:
surface_names

[b'Surfaces Unnamed 3 BCGmR.icsx_[ibrx_2023-06-23T16-55-31.963]',
 b'Surfaces Unnamed 4 BCGmR.icsx_[ibrx_2023-06-23T16-55-31.963]',
 b'Surfaces Unnamed 1 BCGmR.icsx_[ibrx_2023-06-23T16-55-31.963]']

In [107]:
%timeit list(names.values()).index(b"Center of Image Mass Y")

1.7 µs ± 1.68 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [13]:
[9].pop()

9