In [1]:
import h5py
import numpy as np
import pandas as pd
import re
import yaml
import os
from collections.abc import MutableMapping
from typing import Dict, List, Tuple
from exceptions import *
from copy import deepcopy
import collections
from parser_base import Parser
from imaris import ImarisDataObject
import ray
from functools import partial
import time
import polars as pl

In [2]:
data_path = (
    "../../data/multi_surface_track_parser_dev_data/GFP #1 Sec2 Roi1 2x2 1h30min.ims"
)
os.path.isfile(data_path)

True

In [3]:
# @ray.remote
class TrackParser(Parser):
    """
    Extracts Track Level Information From Imaris File

    Args:
        Parser (ABCMeta): Parser Abstract Base Class
    """

    def __init__(self, ims_file_path: str) -> None:
        self.ims_file_path = ims_file_path
        self.ims = ImarisDataObject(self.ims_file_path)
        self.configure_instance()

    def configure_instance(self) -> None:
        """
        Extracts relevant information from ims object and
        instantiates it as instance variables for fast recall.

        Currently Extracts:
            - all the surface names -- List
            - all the stats_names -- {id: pd.DataFrame}
            - all the stats values -- {id: pd.DataFrame}
            - all the factor info -- {id: pd.DataFrame}
        """
        # TODO: check to ensure surfaces exist or raise error
        # extract all information and saves it as a instance var
        self.surface_names = self.ims.get_object_names("Surface")

        # get all the stats names for every surface {surf_id: stats_name_df}
        self.stats_names = {
            surface_id: self.ims.get_stats_names(surface_name)
            for surface_id, surface_name in enumerate(self.surface_names)
        }

        # get all the stats values for every surface {surf_id: stats_values_df}
        self.stats_values = {
            surface_id: self.ims.get_stats_values(surface_name)
            for surface_id, surface_name in enumerate(self.surface_names)
        }

        # get all the factor table info for every surface {surf_id: factor_df}
        self.factors = {
            surface_id: self.ims.get_object_factor(surface_name)
            for surface_id, surface_name in enumerate(self.surface_names)
        }

        # get all the factor table info for every surface {surf_id: factor_df}
        self.track_ids = {
            surface_id: self.ims.get_track_ids(surface_name)
            for surface_id, surface_name in enumerate(self.surface_names)
        }

    def organize_stats(self, stats_values: pd.DataFrame) -> Dict:
        """Organized the data such that it looks like
        {ID_Object: {Stats Name: Value}}

        Args:
            surface_stats_values (pd.DataFrame): a single dataframe
            that contains the statistics for a single surface

        Returns:
            Dict: _description_
        """
        grouped_stats = (
            stats_values.groupby("ID_Object")[["ID_StatisticsType", "Value"]]
            .apply(lambda x: x.set_index("ID_StatisticsType").to_dict(orient="dict"))
            .to_dict()
        )
        grouped_stats = {k: v["Value"] for k, v in grouped_stats.items()}
        return grouped_stats

    def generate_csv(
        self, stats_values: Dict, stat_names: pd.DataFrame
    ) -> pd.DataFrame:
        """_summary_

        Args:
            organized_stats (Dict): _description_

        Returns:
            pd.DataFrame: _description_
        """
        # create a dict that maps stat_id to stat_name
        column_names_dict = dict(zip(stat_names["ID"], stat_names["Name"]))
        dataframe = pd.DataFrame(stats_values).transpose()

        # replaces id columns with respective stat name and add idx
        dataframe = dataframe.rename(column_names_dict, axis=1)
        dataframe["Object_ID"] = dataframe.index
        return dataframe

    def save_csv(self):
        # a function to write csv information to disk
        pass

    def process(self, surface_id: int) -> None:
        """
        Runs a single end to end parser pipeline on a single surface
        Steps:
            - get stat names for a single surface
            - get stat values for a single surface
            - filter stat values to keep only track ids
            - filter stats values to remove track level stat information
            - rename certian columns (if needed)(need a custom func for this to add channel info)
            - organize the filtered stats
            - generate csv
            - save csv

        Args:
            surface_id (int): _description_
        """
        # gather info for current surface
        start = time.perf_counter()
        surface_name = self.surface_names[surface_id]
        print(f"surface_name: {time.perf_counter() - start}")
        stat_names = self.stats_names.get(surface_id)
        print(f"stat_names: {time.perf_counter() - start}")
        stat_values = self.stats_values.get(surface_id)
        print(f"stat_values: {time.perf_counter() - start}")
        track_id = self.track_ids.get(surface_id)
        print(f"object_id: {time.perf_counter() - start}")
        factor = self.factors.get(surface_id)
        print(f"factor: {time.perf_counter() - start}")

        # update channel and surface names
        stat_names = self.update_channel_info(stats_names=stat_names, factor=factor)
        print(f"stat_names_channel: {time.perf_counter() - start}")
        stat_names = self.update_surface_info(stats_names=stat_names, factor=factor)
        print(f"stat_names_surfaces: {time.perf_counter() - start}")

        # filter stats values by object ids (ie: ignore info related to trackids)
        filtered_stat_values = self.filter_stats(
            stats_values=stat_values,
            filter_col_names=["ID_Object"],
            filter_values=[track_id],
        )
        print(f"filtered_stat_values: {time.perf_counter() - start}")

        # organize stats values
        organized_stats = self.organize_stats(filtered_stat_values)
        print(f"organized_stats: {time.perf_counter() - start}")

        # generate csv
        stats_df = self.generate_csv(organized_stats, stat_names=stat_names)
        print(f"stats_df: {time.perf_counter() - start}")

        return stats_df

    def filter_stats(
        self,
        stats_values: pd.DataFrame,
        filter_col_names: List[str],
        filter_values: List[pd.Series],
    ) -> pd.DataFrame:
        """
        Filters the stats values dataframe. It keeps information
        from col_names and filter_values that is passed in as arguments.

        Args:
            stats_values (pd.DataFrame): _description_
            filter_col_name (str): name of the column we want to use to filter
            filter_values (str): values that we want to keep

        Returns:
            pd.DataFrame: _description_
        """
        # for surface parser need to filter out track id information
        # and statistics related to track information.
        for col_names, values in zip(filter_col_names, filter_values):
            stats_values = stats_values[stats_values[col_names].isin(values=values)]

        return stats_values

    def extract_and_save(self):
        # this function is the funtion that gets called externally
        # we can have this function as a ray method to help with distributed execution
        pass

    def get_available_stat_names(self):
        # interacts with data object and returns requested data for inspection
        pass

    def update_channel_info(
        self, stats_names: pd.DataFrame, factor: pd.DataFrame
    ) -> pd.DataFrame:
        """
        Updates the channel information for the relavent rows
        based on th ID_FactorList information in stats_names

        Args:
            stats_names (pd.DataFrame): _description_
            factor (pd.DataFrame): _description_

        Returns:
            pd.DataFrame: _description_
        """

        # create function get channel number from a pandas row from stats_names
        # inner func
        def get_channel_id(row_info, factor: pd.DataFrame):
            factor_id = row_info["ID_FactorList"]  # factor id
            name = row_info["Name"]  # stat name

            # filter factor to only include items related to Channel
            channel_info = factor[factor["Name"] == "Channel"]

            # main logic to select the right channel given the factor id
            if factor_id in channel_info["ID_List"].to_list():
                channel = channel_info[channel_info["ID_List"] == factor_id][
                    "Level"
                ].item()
                return f"{name} Channel_{channel}"
            # if factor id is not in the channel list no channel info is needed
            else:
                return name

        # create partial
        get_channel_id_partial = partial(get_channel_id, factor=factor)

        # update stats name with the newly mapped stats names values
        stats_names["Name"] = stats_names.apply(func=get_channel_id_partial, axis=1)

        return stats_names

    def update_surface_info(
        self, stats_names: pd.DataFrame, factor: pd.DataFrame
    ) -> pd.DataFrame:
        """
        Updates the surface name information for the relavent rows
        based on th ID_FactorList information in stats_names

        Args:
            stats_names (pd.DataFrame): _description_
            factor (pd.DataFrame): _description_

        Returns:
            pd.DataFrame: _description_
        """

        # create function get channel number from a pandas row from stats_names
        # inner func
        def get_surface_name(row_info, factor: pd.DataFrame):
            factor_id = row_info["ID_FactorList"]  # factor id
            name = row_info["Name"]  # stat name

            # filter factor to only include items related to Channel
            channel_info = factor[factor["Name"] == "Surfaces"]

            # main logic to select the right channel given the factor id
            if factor_id in channel_info["ID_List"].to_list():
                channel = channel_info[channel_info["ID_List"] == factor_id][
                    "Level"
                ].item()
                return channel
            # if factor id is not in the channel list no channel info is needed
            else:
                return name

        # create partial
        get_surface_name_partial = partial(get_surface_name, factor=factor)

        # update stats name with the newly mapped stats names values
        stats_names["Name"] = stats_names.apply(func=get_surface_name_partial, axis=1)

        return stats_names

In [4]:
parser = TrackParser(data_path)

In [5]:
df = parser.process(0)

surface_name: 2.079061232507229e-06
stat_names: 0.0004632690688595176
stat_values: 0.0004776510177180171
object_id: 0.0004885890521109104
factor: 0.0004991560708731413
stat_names_channel: 3.0527786220191047
stat_names_surfaces: 5.464682255988009
filtered_stat_values: 5.625629409099929
organized_stats: 6.286501188063994
stats_df: 6.370199923054315


In [6]:
df

Unnamed: 0,Track Ar1 Mean,Track Ar1 X,Track Ar1 Y,Track Ar1 Z,Track Area Mean,Track Center of Homogeneous Mass X,Track Center of Homogeneous Mass Y,Track Center of Homogeneous Mass Z,Track Center of Image Mass X Channel_1,Track Center of Image Mass X Channel_2,...,Surfaces Unnamed 4 CG.GC.C-.G-.icsx_[ibrx_2023-09-27T11-04-45.364],Track Speed Max,Track Speed Mean,Track Speed Min,Track Speed StdDev,Track Speed Variation,Track Sphericity Mean,Track Straightness,Track Volume Mean,Object_ID
1000000000,-0.550184,-0.591558,-0.706574,-0.352421,184.048630,57305.355469,38205.421875,38.931080,57305.492188,57305.222656,...,58.586376,0.041609,0.022603,0.010553,0.007755,0.343085,0.730995,0.027259,145.742294,1000000000
1000000001,-0.476468,-0.305364,-0.885106,-0.238933,173.122635,57245.156250,38255.199219,4.065064,57245.222656,57245.167969,...,81.714478,0.026032,0.021547,0.016280,0.003183,0.147718,0.785525,0.140231,149.012894,1000000001
1000000003,-0.451769,-0.417780,-0.491516,-0.446013,3441.188721,57338.707031,38082.734375,26.736328,57339.503906,57338.050781,...,22.588594,0.072173,0.027683,0.010563,0.011372,0.410796,0.426913,0.035962,5119.760742,1000000003
1000000005,0.461317,0.605533,0.359849,0.418570,340.100220,57337.539062,38049.863281,37.823231,57337.769531,57338.199219,...,7.265651,0.030330,0.019375,0.015690,0.005034,0.259816,0.641248,0.401058,328.207428,1000000005
1000000006,,,,,3731.450195,57325.714844,38074.410156,23.871647,57325.816406,57325.382812,...,19.686106,0.040204,0.033214,0.026096,0.004988,0.150175,0.381708,0.937294,5147.984375,1000000006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000222046,,,,,179.472153,56780.675781,37800.671875,39.741333,56780.726562,56780.429688,...,42.001484,0.008724,0.006214,0.003706,0.001774,0.285481,0.859895,0.420234,180.827316,1000222046
1000222137,,,,,436.964447,56510.496094,37968.679688,11.268145,56510.390625,56511.085938,...,13.002812,0.101189,0.094573,0.087954,0.004679,0.049478,0.835397,0.300250,656.148987,1000222137
1000222184,,,,,280.326080,56928.988281,37485.148438,28.417866,56928.992188,56929.156250,...,58.057255,0.007953,0.007820,0.007687,0.000094,0.012023,0.773547,0.709031,300.273499,1000222184
1000222245,,,,,943.829407,56642.617188,37709.156250,19.873278,56642.761719,56642.667969,...,34.618137,0.012492,0.011468,0.010444,0.000724,0.063117,0.613272,0.507340,1304.447754,1000222245
