* Develop a Parser Object that serves as the main interface for extracting data from Imaris (`.ims`) files
* Parser types such as the Surface Parser and the Track Parser are all built on top of the Parser Object
    * This is because a specialised parser (e.g. the Surface Parser) only reformats and processes the data that the Parser Object extracts from the Imaris file

In [56]:
import h5py
import numpy as np
import pandas as pd
import re
import yaml
import os
from collections.abc import MutableMapping
from typing import Dict, List
from exceptions import *
from copy import deepcopy
import collections
import polars

In [57]:
# Path (relative to the notebook's working directory) of the .ims file used
# by the cells below.
data_path = "../data/YFP #1 Sec2 Roi1 2x2 1h shehan.ims"
# Sanity check: evaluates to True when the file exists at that path.
os.path.isfile(data_path)

True

In [1]:
class ImarisDataObject:
    """Read-only accessor for the scene content of an Imaris ``.ims`` file.

    The file is opened once with h5py and kept in ``self.data``; every getter
    reads from the ``Scene8/Content`` group of that file.
    """

    def __init__(self, ims_file_path: str) -> None:
        self.data = self.load_ims(ims_file_path)

    def load_ims(self, ims_file_path: str) -> "h5py.File":
        """
        loads a imaris file with .ims extension using h5py

        Args:
            ims_file_path (str): path to .ims file

        Returns:
            h5py.File: returns the data contained within the .ims file
        """
        # Open read-only: nothing in this class writes to the file, and "r+"
        # would needlessly require write permission on the data file.
        return h5py.File(ims_file_path, "r")

    def _content(self):
        """Return the ``Scene8/Content`` group that all getters read from."""
        return self.data.get("Scene8").get("Content")

    def get_object_names(self, object_name: str) -> List:
        """
        extracts the object names that we are interested in.
        objects are usually surfaces( is this always true?)

        Args:
            object_name (str): regular expression (a plain full or partial
                name also works) searched for in every key of the content group

        Returns:
            list: all content keys in which ``object_name`` matches

        Raises:
            NoSurfaceException: if ``Scene8`` or ``Content`` is missing
        """
        try:
            content_keys = self._content().keys()
        except AttributeError as err:
            # .get() returned None somewhere along Scene8/Content
            raise NoSurfaceException from err
        return [key for key in content_keys if re.search(object_name, key)]

    def get_stats_names(self, object_name: str) -> Dict:
        """
        for a given object_name, extracts the statistics names and ids into a dict
        ex: statistics name = mean intensity, associated id=404

        Args:
            object_name (str): name of the object to get statistic names from

        Returns:
            dict: a dict where the keys=unique stats ID, value=statistic name
        """
        # the "StatisticsType" table is converted to a DataFrame so the
        # "ID" and "Name" fields can be addressed as columns
        stats_table = pd.DataFrame(
            np.asarray(self._content().get(object_name)["StatisticsType"])
        )
        return dict(zip(stats_table["ID"], stats_table["Name"]))

    def get_stats_values(self, object_name: str) -> pd.DataFrame:
        """
        for a given object_name, extracts the statistics values for all object ids
        within the object

        Args:
            object_name (str): name of the object to get statistic values from

        Returns:
            pd.DataFrame: the object's "StatisticsValue" table as a data frame,
            one row per stored value
        """
        stats_values = self._content()[object_name]["StatisticsValue"]
        return pd.DataFrame(np.asarray(stats_values))

    def get_track_ids(self, object_name: str) -> pd.Series:
        """
        Gets all the track ids for a given surface

        Args:
            object_name (str): name of the object whose "Track0" table is read

        Returns:
            pd.Series: the "ID" column of the object's "Track0" table

        Raises:
            NoTrackException: if a key lookup fails (e.g. the object has no
                "Track0" table)
        """
        try:
            track_table = self._content()[object_name]["Track0"]
            return pd.DataFrame(np.asarray(track_table))["ID"]
        except KeyError as err:
            raise NoTrackException from err

    def get_real_surface_names(self, surface_name: str) -> List:
        """Gets all the real surface names for given surface name

        Args:
            surface_name (str): example: MegaSurface0

        Returns:
            List: the last field of every row of the object's "Factor" table
            that contains the bytes ``b"Surface"``
        """
        factor_rows = self._content().get(surface_name).get("Factor")
        # NOTE(review): presumably the last field of a Factor row holds the
        # user-visible surface name; confirm against the file layout.
        return [row[-1] for row in factor_rows if re.search(b"Surface", row)]

NameError: name 'h5py' is not defined

In [59]:
class SurfaceParser:
    """Builds per-object statistics tables for the surfaces of an Imaris file.

    On construction the file is opened through ``ImarisDataObject`` and the
    names of all content objects matching "Surface" are cached in
    ``self.surface_names``; the other methods address a surface by its
    0-based position in that list.
    """

    def __init__(self, ims_file_path: str, config) -> None:
        self.data_obj = ImarisDataObject(ims_file_path)
        self.surface_names = self.data_obj.get_object_names(object_name="Surface")
        self.config = config

    def group_stats(self, surface_stats_values: pd.DataFrame) -> Dict:
        """Groups the data such that it looks like
        {ID_Object: {ID_StatisticsType: Value}}

        Args:
            surface_stats_values (pd.DataFrame): a single dataframe
            that contains the statistics for a single surface; must have the
            columns "ID_Object", "ID_StatisticsType" and "Value"

        Returns:
            Dict: one entry per object id, mapping each statistic id stored
            for that object to its value
        """
        nested = (
            surface_stats_values.groupby("ID_Object")[["ID_StatisticsType", "Value"]]
            .apply(lambda grp: grp.set_index("ID_StatisticsType").to_dict(orient="dict"))
            .to_dict()
        )
        # every group came back as {"Value": {stat_id: value}}; unwrap it
        return {object_id: stats["Value"] for object_id, stats in nested.items()}

    def get_filtered_stats_list(self, column_names: Dict, surface_id: int) -> Dict:
        """
        Helper function to generate lists to remove

        Args:
            column_names (Dict): statistic id -> statistic name (bytes)
            surface_id (int): 0-based index into ``self.surface_names``

        Returns:
            Dict: "track_column_names" holds every name containing b"Track",
            "track_ids" holds the track ids of the selected surface
        """
        track_column_names = [
            name for name in column_names.values() if re.search(b"Track", name)
        ]

        # rows whose id is a track id are dropped later as well
        surface_name = self.surface_names[surface_id]
        track_ids = self.data_obj.get_track_ids(surface_name).to_list()

        return {
            "track_column_names": track_column_names,
            "track_ids": track_ids,
        }

    def generate_csv(
        self, data_dict: Dict, column_names: Dict, filter_info: Dict
    ) -> pd.DataFrame:
        """Turn grouped statistics into a table with one row per object.

        Args:
            data_dict (Dict): {object id: {statistic id: value}}
            column_names (Dict): statistic id -> column label to rename to
            filter_info (Dict): needs the keys "track_column_names" (column
                labels to drop) and "track_ids" (row labels to drop)

        Returns:
            pd.DataFrame: rows are object ids, columns are the renamed
            statistics plus an "ID" column copied from the index
        """
        # from_dict yields one column per object, so transpose to get rows
        table = pd.DataFrame.from_dict(data_dict).transpose()

        table = table.rename(column_names, axis=1)
        table["ID"] = table.index

        # remove the track-level columns and the rows belonging to tracks
        table = table.drop(labels=filter_info["track_column_names"], axis=1)
        table = table.drop(labels=filter_info["track_ids"], axis=0)

        return table

    def save_csv(self, data: pd.DataFrame, surface_id: int) -> None:
        """
        Saves given dataframe for the given surface (not implemented yet)

        Args:
            data (pd.DataFrame): Data to be saved
            surface_id (int): 0-based index of the surface the data belongs to

        Raises:
            NotImplementedError: always
        """
        raise NotImplementedError

    def process(self, surface_id: int) -> Dict:
        """Collect the statistics of one surface.

        Args:
            surface_id (int): 0-based index into ``self.surface_names``

        Returns:
            Dict: "grouped_stats" is the output of ``group_stats``;
            "column_names" maps every statistic id that occurs in the values
            table to its name
        """
        surface_name = self.surface_names[surface_id]  # should be 0 indexed

        available_stats_names = self.data_obj.get_stats_names(surface_name)
        available_stats_values = self.data_obj.get_stats_values(surface_name)
        grouped_stats = self.group_stats(available_stats_values)
        # since we are working on surface should we drop the track level
        # information?

        # only ids that actually have values become columns
        stats_ids = available_stats_values["ID_StatisticsType"].unique()
        column_names = {
            stat_id: available_stats_names[stat_id] for stat_id in stats_ids
        }

        return {"grouped_stats": grouped_stats, "column_names": column_names}

    def extract_and_save(self, surface_id: int) -> pd.DataFrame:
        """Main access point: build the statistics table of one surface.

        Args:
            surface_id (int): 0-based index into ``self.surface_names``

        Returns:
            pd.DataFrame: the table produced by ``generate_csv``
        """
        processed_out = self.process(surface_id=surface_id)
        filter_info = self.get_filtered_stats_list(
            column_names=processed_out["column_names"],
            surface_id=surface_id,
        )
        generated_csv = self.generate_csv(
            data_dict=processed_out["grouped_stats"],
            column_names=processed_out["column_names"],
            filter_info=filter_info,
        )
        # Saving is not wired in yet because save_csv is unimplemented:
        # self.save_csv(data=generated_csv, surface_id=surface_id)
        return generated_csv

    def get_available_stats(self):
        """Not implemented yet."""
        raise NotImplementedError

    def generate_summary(self):
        """Not implemented yet."""
        raise NotImplementedError


In [51]:
surface_parser = SurfaceParser(data_path, config=None)

In [41]:
# Statistics of the first surface: a dict with "grouped_stats" and "column_names".
out = surface_parser.process(0)
# Track-related column names and track ids that generate_csv would drop.
filters = surface_parser.get_filtered_stats_list(out["column_names"], 0)

In [52]:
df = surface_parser.extract_and_save(0)


In [55]:
df.columns.to_list()


[b'Total Number of Disconnected Components',
 b'Total Number of Surfaces',
 b'Total Number of Triangles',
 b'Total Number of Voxels',
 b'Number of Disconnected Components per Time Point',
 b'Number of Surfaces per Time Point',
 b'Acceleration',
 b'Acceleration X',
 b'Acceleration Y',
 b'Acceleration Z',
 b'Area',
 b'BoundingBoxAA Length X',
 b'BoundingBoxAA Length Y',
 b'BoundingBoxAA Length Z',
 b'BoundingBoxOO Length A',
 b'BoundingBoxOO Length B',
 b'BoundingBoxOO Length C',
 b'Center of Homogeneous Mass X',
 b'Center of Homogeneous Mass Y',
 b'Center of Homogeneous Mass Z',
 b'Center of Image Mass X',
 b'Center of Image Mass X',
 b'Center of Image Mass X',
 b'Center of Image Mass X',
 b'Center of Image Mass X',
 b'Center of Image Mass Y',
 b'Center of Image Mass Y',
 b'Center of Image Mass Y',
 b'Center of Image Mass Y',
 b'Center of Image Mass Y',
 b'Center of Image Mass Z',
 b'Center of Image Mass Z',
 b'Center of Image Mass Z',
 b'Center of Image Mass Z',
 b'Center of Image Mass

In [14]:
column_names = out["column_names"]

In [15]:
# Keep only the statistic names that do NOT contain b"Track"
# (the names are bytes, hence the bytes pattern).
filtered_column_names = list()
for item in column_names.values():
    if not len(re.findall(b"Track", item)):
        filtered_column_names.append(item)

In [16]:
filtered_column_names

[b'Total Number of Disconnected Components',
 b'Total Number of Surfaces',
 b'Total Number of Triangles',
 b'Total Number of Voxels',
 b'Acceleration',
 b'Acceleration X',
 b'Acceleration Y',
 b'Acceleration Z',
 b'Area',
 b'BoundingBoxAA Length X',
 b'BoundingBoxAA Length Y',
 b'BoundingBoxAA Length Z',
 b'BoundingBoxOO Length A',
 b'BoundingBoxOO Length B',
 b'BoundingBoxOO Length C',
 b'Center of Homogeneous Mass X',
 b'Center of Homogeneous Mass Y',
 b'Center of Homogeneous Mass Z',
 b'Center of Image Mass X',
 b'Center of Image Mass X',
 b'Center of Image Mass X',
 b'Center of Image Mass X',
 b'Center of Image Mass X',
 b'Center of Image Mass Y',
 b'Center of Image Mass Y',
 b'Center of Image Mass Y',
 b'Center of Image Mass Y',
 b'Center of Image Mass Y',
 b'Center of Image Mass Z',
 b'Center of Image Mass Z',
 b'Center of Image Mass Z',
 b'Center of Image Mass Z',
 b'Center of Image Mass Z',
 b'Displacement Delta Length',
 b'Displacement Delta X',
 b'Displacement Delta Y',
 b'Di

In [None]:
parser = SurfaceParser(data_path, config=None)


In [None]:
j = parser.process(0)


In [None]:
j.keys()


In [None]:
stats_value_storage["MegaSurfaces0"]["ID_StatisticsType"].unique()


In [None]:
%timeit set(stats_value_storage["MegaSurfaces0"]["ID_StatisticsType"])

In [None]:
# Same grouping expression as SurfaceParser.group_stats: one entry per
# ID_Object, each built from that object's (ID_StatisticsType, Value) rows.
# NOTE(review): stats_value_storage is not defined in any visible cell.
extracted_stats = (
    stats_value_storage["MegaSurfaces0"]
    .groupby("ID_Object")[["ID_StatisticsType", "Value"]]
    .apply(lambda x: x.set_index("ID_StatisticsType").to_dict(orient="dict"))
    .to_dict()
)

In [None]:
extracted_stats


In [None]:
extracted_stats2 = {k: v["Value"] for k, v in extracted_stats.items()}  # c


In [None]:
extracted_stats2


In [None]:
out = pd.DataFrame.from_dict(extracted_stats2).transpose()
out


In [None]:
# Look up a name for every column label of `out`.
# NOTE(review): stats_dict_storage is not defined in any visible cell;
# presumably it maps statistic id -> name for this surface - confirm.
new_column_names = {key: stats_dict_storage[key] for key in out.columns}

# rename columns
dataframe = out.rename(new_column_names, axis=1)
# keep the row labels available as a regular column
dataframe["ID"] = dataframe.index


In [None]:
dataframe


In [None]:
# "ID" column of the Track0 table of MegaSurfaces0, as a plain list.
# NOTE(review): data_obj is not defined in any visible cell; presumably an
# ImarisDataObject instance - confirm.
ids = pd.DataFrame(
    np.asarray(data_obj.data.get("Scene8").get("Content")["MegaSurfaces0"]["Track0"])
)["ID"].to_list()

In [None]:
ids

In [None]:
dataframe.drop(labels=ids)


In [None]:
# NOTE(review): despite its name, this list collects the names that do NOT
# contain b"Track" (the condition is negated).
track_stat_names = list()
for item in new_column_names.values():
    if not len(re.findall(b"Track", item)):
        track_stat_names.append(item)


In [None]:
new_column_names

In [None]:
# storage = {}
# df = stats_value_storage["MegaSurfaces0"]
# for row_idx in range(df.shape[0]):
#     data = df.iloc[row_idx]
#     object_id = data["ID_Object"]
#     object_stat_type = data["ID_StatisticsType"]
#     object_stat_value = data["Value"]
#     if object_id not in storage.keys():
#         storage[object_id] = {}
#         storage[object_id][object_stat_type] = object_stat_value
#     else:
#         storage[object_id][object_stat_type] = object_stat_value


In [None]:
# unique_surface_names["MegaSurfaces3"]

In [None]:
stats_values = stats_value_storage.get("MegaSurfaces3")

In [None]:
stats_values[stats_values["ID_StatisticsType"] == 47436].sort_values("ID_Object")


In [None]:
# Dump the Factor table of MegaSurfaces1.
# NOTE(review): SurfaceParser as defined in this file keeps the file handle
# on `data_obj.data` and has no `data` attribute - this cell looks stale.
np.array(parser.data.get("Scene8").get("Content").get("MegaSurfaces1").get("Factor"))


In [None]:
# Surface 0
#  26173: b'Shortest Distance to Surfaces',
#  26174: b'Shortest Distance to Surfaces',
#  26175: b'Shortest Distance to Surfaces',
#  25034: b'Shortest Distance to Surfaces',
#  25293: b'Shortest Distance to Surfaces',
#  25552: b'Shortest Distance to Surfaces',
#  25811: b'Shortest Distance to Surfaces',

# [b'Surfaces Unnamed 2 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
#  b'Surfaces Unnamed 3 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
#  b'Surfaces Unnamed 4 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
#  b'Surfaces Unnamed 1 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
#  b'Surfaces Unnamed 2 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
#  b'Surfaces Unnamed 3 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
#  b'Surfaces Unnamed 4 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]']

# Surface 1
#  26174: b'Shortest Distance to Surfaces',
#  26175: b'Shortest Distance to Surfaces',
#  26495: b'Shortest Distance to Surfaces',
#  25034: b'Shortest Distance to Surfaces',
#  25293: b'Shortest Distance to Surfaces',
#  25552: b'Shortest Distance to Surfaces',
#  25811: b'Shortest Distance to Surfaces',

# [b'Surfaces Unnamed 3 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
# b'Surfaces Unnamed 4 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
# b'Surfaces Unnamed 1 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
# b'Surfaces Unnamed 1 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
# b'Surfaces Unnamed 2 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
# b'Surfaces Unnamed 3 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
# b'Surfaces Unnamed 4 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]']


In [None]:
# Surface 3

#  26173: b'Shortest Distance to Surfaces',
#  26174: b'Shortest Distance to Surfaces',
#  26495: b'Shortest Distance to Surfaces',
#  25034: b'Shortest Distance to Surfaces',
#  25293: b'Shortest Distance to Surfaces',
#  25552: b'Shortest Distance to Surfaces',
#  25811: b'Shortest Distance to Surfaces',

#  [b'Surfaces Unnamed 2 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
#  b'Surfaces Unnamed 3 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
#  b'Surfaces Unnamed 1 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
#  b'Surfaces Unnamed 1 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
#  b'Surfaces Unnamed 2 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
#  b'Surfaces Unnamed 3 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
#  b'Surfaces Unnamed 4 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]']

In [None]:
# Surface 7

#  26173: b'Shortest Distance to Surfaces',
#  26174: b'Shortest Distance to Surfaces',
#  26175: b'Shortest Distance to Surfaces',
#  26495: b'Shortest Distance to Surfaces',
#  25034: b'Shortest Distance to Surfaces',
#  25293: b'Shortest Distance to Surfaces',
#  25552: b'Shortest Distance to Surfaces',

#  [b'Surfaces Unnamed 2 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
#  b'Surfaces Unnamed 3 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
#  b'Surfaces Unnamed 4 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
#  b'Surfaces Unnamed 1 BCGmR.icsx_[ibrx_2023-06-26T09-47-24.240]',
#  b'Surfaces Unnamed 1 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
#  b'Surfaces Unnamed 2 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]',
#  b'Surfaces Unnamed 3 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364]']


In [None]:
# def get_surface_names(data):
#     arr = data.get("Scene8").get("Content").get("MegaSurfaces1").get("Factor")
#     surface_names = []
#     for item in arr:
#         if len(re.findall(b"Surface", item)):
#             surface_names.append(item[-1])

#     return surface_names


# surface_names = get_surface_names(data)


In [None]:
# surface_names

In [None]:
    def get_all_stats_names(self) -> Dict[str, Dict[int, str]]:
        """Collect the statistics-name lookup of every known surface.

        Calls ``self.get_statistics_names`` once per entry of
        ``self.surface_names`` and stores each result under the surface name.

        NOTE(review): ``ImarisDataObject`` in this file exposes
        ``get_stats_names``, not ``get_statistics_names`` - confirm that the
        enclosing class provides the method called here.

        Returns:
            Dict[str, Dict[int, str]]: key=SurfaceName, value=Statistics Dict[id, name]
        """
        return {
            surface: self.get_statistics_names(object_name=surface)
            for surface in self.surface_names
        }

    def get_all_stats_values(self) -> Dict[str, Dict[int, pd.DataFrame]]:
        """Collect the statistics values of every known surface.

        Calls ``self.get_stats_values`` once per entry of
        ``self.surface_names`` and stores each result under the surface name.

        Returns:
            Dict[str, Dict[int, pd.DataFrame]]: key=SurfaceName, value=whatever
            ``get_stats_values`` returned for that surface
        """
        return {
            surface: self.get_stats_values(object_name=surface)
            for surface in self.surface_names
        }

    def map_name_to_id(self, stats_values: Dict, stats_names: Dict):
        """Attach a "Names" column to the statistics table of every surface.

        Args:
            stats_values (Dict): surface name -> data frame with an
                "ID_StatisticsType" column
            stats_names (Dict): surface name -> {statistic id: statistic name}

        Returns:
            Dict: surface name -> copy of the surface's data frame with an
            extra "Names" column; ids without a name map to NaN. The frames
            passed in are left unmodified.

        Raises:
            KeyError: if a surface in ``stats_names`` is missing from
                ``stats_values``
        """
        # TODO: Incomplete
        storage = {}
        for name in stats_names:
            mapping_dict = stats_names.get(name)
            dataframe = stats_values[name]
            # assign() returns a new frame, so the caller's data is not
            # mutated as a side effect
            storage[name] = dataframe.assign(
                Names=dataframe["ID_StatisticsType"].map(mapping_dict)
            )
        return storage

In [None]:
    def add_channel_info(self, stats_values: Dict) -> Dict:
        """
        Add in channel information to each redundant variable name
        Assumes there are no nested dictionaries. Example: if there are multiple ImageMean names then each
        ImageMean in ascending id order will be renamed ImageMean_channel_1 etc.

        Names that occur only once are kept unchanged. The suffix is appended
        in the type of the name itself, so ``bytes`` names stay ``bytes``
        instead of being turned into the text of their ``repr``.

        Args:
            stats_values (Dict): key = stat id, value = stat name (str or bytes)

        Return:
            Dict: a new dictionary with key = stat id, value = stat name, where
            repeated names carry a ``_channel_<n>`` suffix
        """
        # collect the ids that share each name
        ids_by_name = collections.defaultdict(list)
        for stat_id, name in stats_values.items():
            ids_by_name[name].append(stat_id)

        updated_stats_values = {}
        for name, stat_ids in ids_by_name.items():
            if len(stat_ids) == 1:
                updated_stats_values[stat_ids[0]] = name
                continue
            # channels are numbered from 1 in ascending id order
            for position, stat_id in enumerate(sorted(stat_ids), start=1):
                suffix = f"_channel_{position}"
                if isinstance(name, bytes):
                    updated_stats_values[stat_id] = name + suffix.encode()
                else:
                    updated_stats_values[stat_id] = f"{name}{suffix}"

        return updated_stats_values