In [1]:
from utils import *
from rich import print
import os
from numba import jit, njit
import copy

In [2]:
# Open the Imaris .ims file with the `load_data` helper (not defined in this
# notebook; presumably provided by `from utils import *`).
# The path is relative to the notebook's working directory.
data = load_data('../data/P1 DHBR Roi3 6x6_TileScan_001_Merging_Crop_0_batch.ims')

In [3]:
# list the top-level groups of the opened file
data.keys()


<KeysViewHDF5 ['DataSet', 'DataSetEvents', 'DataSetInfo', 'DataSetTimes', 'Scene', 'Scene8', 'Thumbnail']>

In [4]:
# list the object names stored under Scene/Content
data.get('Scene').get('Content').keys()

<KeysViewHDF5 ['MegaSurfaces0', 'MegaSurfaces1', 'MegaSurfaces2', 'MegaSurfaces3', 'MegaSurfaces4', 'MegaSurfaces5', 'MegaSurfaces6']>

In [5]:
# list everything stored for the MegaSurfaces6 object under Scene8/Content
data.get('Scene8').get('Content').get('MegaSurfaces6').keys()

<KeysViewHDF5 ['BlockData', 'BlockInfo', 'BlockPath', 'Category', 'CategoryFunction', 'CreationParameters', 'Factor', 'FactorFunction', 'FactorList', 'FactorListFunction', 'LabelColor', 'LabelColorData', 'LabelColorLabelGroupNames', 'LabelColorLabelValues', 'LabelGroupNames', 'LabelSetLabelIDs', 'LabelSetObjectIDs', 'LabelSets', 'LabelValues', 'LevelInfo', 'MainTrackSegmentTable', 'MainTrackSegmentTable_Focus', 'SplitOffset', 'StatisticsType', 'StatisticsTypeFunction', 'StatisticsValue', 'StatisticsValueFunction', 'StatisticsValueTimeOffset', 'StatisticsValueTimeOffsetFunction', 'SurfaceModel', 'SurfaceModelInfo', 'SurfaceTimeOffset', 'Time', 'TimeBegin', 'TrackSegment0', 'TrackSegment0_Focus']>

In [6]:
# Can we list the surface objects by adapting the following function?
def get_object_names(full_data_file: h5py.File, search_for: str) -> list:
    """
    Collect the object names under ``Scene/Content`` that match a pattern.

    Args:
        full_data_file (h5py._hl.files.File): full imaris file in h5py File format
        search_for (str): full or partial object name to look for; it is
            passed to ``re`` as a regular expression pattern

    Returns:
        (list): every object name in which ``search_for`` matches somewhere,
        in the order the file lists them
    """

    content_names = full_data_file.get('Scene').get('Content').keys()
    # a name is kept as soon as the pattern matches anywhere inside it
    return [name for name in content_names if re.search(search_for, name)]

# test function -- working
# NOTE(review): `out` holds a list of names here, but the same name is later
# reassigned to the DataFrame returned by get_stats_values.
out = get_object_names(data, 'Surface')
print(out)

In [7]:
# once we know which object we want to extract data from, use the following function
# this function doesn't grab any data values, just the statistics names and their ids
def get_statistics_names(full_data_file: h5py.File, object_name: str) -> dict:
    """
    Build an ID -> name lookup of the statistics stored for one object.
    ex: statistics name = mean intensity, associated id=404

    Args:
        full_data_file (h5py._hl.files.File): full imaris file in h5py File format
        object_name (str): name of the object to get statistic names from

    Returns:
        dict: keys come from the 'ID' column and values from the 'Name' column
        of the object's 'StatisticsType' table under Scene8/Content
    """

    # read the object's StatisticsType table and view it as a DataFrame
    stats_table = pd.DataFrame(
        np.asarray(full_data_file['Scene8']['Content'][object_name]['StatisticsType'])
    )

    # pair every statistics ID with its name
    return dict(zip(stats_table['ID'], stats_table['Name']))

# test -- working
# ID -> name lookup for the MegaSurfaces6 object; later cells reuse it
out_stats_names = get_statistics_names(data, 'MegaSurfaces6')


In [8]:
# Dump the statistics names to a text file, one name per line.
# The names come out of the file as bytes (e.g. b'Volume'); with fmt='%s' their
# repr -- including the b'...' wrapper -- would be written, so decode them first.
# NOTE(review): assumes the names are UTF-8 encoded; undecodable bytes are replaced.
np.savetxt(
    'test.txt',
    [
        name.decode('utf-8', errors='replace') if isinstance(name, bytes) else str(name)
        for name in out_stats_names.values()
    ],
    fmt='%s',
    encoding='utf-8',
)

In [9]:
# once we have the statistics names we can get numerical statistics values
# for each object id within the specified object
def get_stats_values(full_data_file: h5py.File, object_name: str) -> pd.DataFrame:
    """
    Load the 'StatisticsValue' table of one object as a DataFrame.

    Args:
        full_data_file (h5py._hl.files.File): full imaris file in h5py File format
        object_name (str): name of the object to get statistic values from

    Returns:
        pd.DataFrame: the object's 'StatisticsValue' table from Scene8/Content,
        one row per stored value (the sample output in this notebook shows the
        columns ID_Time, ID_Object, ID_StatisticsType and Value).
    """
    content = full_data_file.get('Scene8').get('Content')
    raw_values = content[object_name]['StatisticsValue']
    # materialise the dataset as an array before handing it to pandas
    return pd.DataFrame(np.asarray(raw_values))

# all statistics values of MegaSurfaces6; from here on `out` is this DataFrame
out = get_stats_values(data, 'MegaSurfaces6')
print(out)

Unnamed: 0,ID_Time,ID_Object,ID_StatisticsType,Value
0,-1,-1,4,0.0
1,-1,-1,5,26013.0
2,-1,-1,6,26013.0
3,-1,-1,7,38798504.0
4,-1,-1,8,17930884.0
2185097,0,-1,389,26013.0
2185098,0,-1,388,26013.0


In [44]:
# template row: one slot per numeric stats id, value = None until filled in
empty_stats_dict = {key: None for key in out_stats_names.keys()}

# one independent copy of the template per distinct object id.
# `out` holds one row per stored value, so the same object id repeats many
# times; build from the unique ids (first-appearance order is preserved)
# instead of copying the template once per row. A shallow copy is sufficient
# because every template value is None.
empty_data_dict = {
    key: dict(empty_stats_dict)
    for key in out['ID_Object'].unique().tolist()
}

In [45]:
# Fill the per-object templates: for every row of `out`, store Value under
# empty_data_dict[ID_Object][ID_StatisticsType].
# The three columns are walked as plain Python lists; indexing `out.iloc[idx]`
# built a whole Series per row (very slow on millions of rows) and upcast the
# integer ids to float along the way.
rows = zip(
    out['ID_Object'].tolist(),
    out['ID_StatisticsType'].tolist(),
    out['Value'].tolist(),
)
for idx, (object_id, stats_type, value) in enumerate(rows):
    stats_for_object = empty_data_dict.get(object_id)
    if stats_for_object is None:
        # object id has no template entry: report the row and skip it
        print('here', idx)
        continue
    stats_for_object[stats_type] = value

# empty_data_dict is done after this cell executes

In [46]:
# spot check: value stored for object 27816 under statistics id 499
empty_data_dict[27816][499]

1902.9423828125

In [47]:
def invert_stats_dict(stats_dict: dict=None):
    '''
    Invert a ``{stats id: stats name}`` mapping into ``{stats name: stats id}``.

    Ids are visited in ascending order. A name that occurs once maps directly
    to its id. A name that occurs several times maps to a dict
    ``{'channel_1': first_id, 'channel_2': second_id, ...}`` numbered in that
    ascending-id order.

    Args:
        stats_dict (dict): stats id -> stats name. Names may be ``bytes`` (as
            read from the file) or ``str``. ``None`` or an empty dict yields
            an empty result.

    Returns:
        dict: stats name (str) -> stats id, or -> ``channel_N`` dict of ids.
    '''
    if not stats_dict:
        return {}

    def _as_text(name):
        # Decode bytes properly. Slicing the "b'...'" repr mangled plain str
        # names ('Volume' -> 'olum') and left non-ASCII bytes as escapes.
        # NOTE(review): assumes UTF-8 names; undecodable bytes are replaced.
        if isinstance(name, bytes):
            return name.decode('utf-8', errors='replace')
        return str(name)

    # group the ids by decoded name, walking the ids in ascending order
    ids_by_name = {}
    for stats_id in sorted(stats_dict):
        ids_by_name.setdefault(_as_text(stats_dict[stats_id]), []).append(stats_id)

    storage = {}
    for name, ids in ids_by_name.items():
        if len(ids) == 1:
            # unique name: map straight to its id
            storage[name] = ids[0]
        else:
            # repeated name: one numbered channel entry per id
            storage[name] = {
                f"channel_{position}": stats_id
                for position, stats_id in enumerate(ids, start=1)
            }

    return storage
# we can directly use this to invert the dict
# this inverted dict can be used to create the final excel file

In [48]:
# name -> stats id lookup (names repeated across ids become channel_N sub-dicts)
inverted_dict =  invert_stats_dict(out_stats_names)
# statistics to keep in the final table; the '_channel_N' suffixes match the
# keys produced by flattening inverted_dict with the '_' separator
categories = [
    'Position X',
    'Position Y',
    'Position Z',
    'Intensity Mean_channel_1',
    'Intensity Mean_channel_2',
    'Intensity Mean_channel_3',
    'Intensity Mean_channel_4',
    'Intensity Mean_channel_5',
    'Volume'
]

In [49]:
# spot check: name stored for statistics id 499 (still raw bytes at this point)
out_stats_names[499]

b'Volume'

In [50]:
# spot check: statistics id that the inverted lookup gives for 'Position X'
inverted_dict['Position X']

493

In [51]:
# flatten dict 
from collections.abc import MutableMapping

def flatten(d, parent_key='', sep='_'):
    """
    Collapse a nested mapping into a single-level dict.

    Nested keys are joined onto their parent's key with ``sep``; values that
    are not mappings are copied through unchanged.

    Args:
        d: mapping to flatten (values may themselves be mappings)
        parent_key (str): prefix for every key at this level ('' for none)
        sep (str): separator placed between a parent key and a child key

    Returns:
        dict: flattened key -> leaf value
    """
    flat = {}
    for key, value in d.items():
        full_key = key if not parent_key else parent_key + sep + key
        if isinstance(value, MutableMapping):
            # recurse into the nested mapping, carrying the joined key along
            flat.update(flatten(value, full_key, sep=sep))
        else:
            flat[full_key] = value
    return flat

In [52]:
inverted_flatten = flatten(inverted_dict)
# can we provide an editable list before anything starts so he can manually select which ones he wants?
# that way the names match exactly because I control it.

# reverse dict again, key=num, value=name
final_dict = {v: k for k, v in inverted_flatten.items()}

# stats ids of every statistic that was NOT requested in `categories`;
# these are the columns to drop later.
# Built with a comprehension instead of a list preallocated to
# len(inverted_flatten) - len(categories): that size is only right when every
# requested category exists in the file. Otherwise the fill either ran past the
# end (IndexError) or left trailing None placeholders in the list.
del_list = [
    v for k, v in inverted_flatten.items()
    if k not in categories and v is not None
]
    

In [53]:
# show the flattened name -> stats id lookup
inverted_flatten

{'Number of Voxels': 2,
 'Number of Tracks': 4,
 'Total Number of Disconnected Components': 5,
 'Total Number of Surfaces': 6,
 'Total Number of Triangles': 7,
 'Total Number of Voxels': 8,
 'Number of Surfaces per Time Point': 388,
 'Number of Disconnected Components per Time Point': 389,
 'Area': 408,
 'BoundingBoxAA Length X': 409,
 'BoundingBoxAA Length Y': 410,
 'BoundingBoxAA Length Z': 411,
 'BoundingBoxOO Length A': 412,
 'BoundingBoxOO Length B': 413,
 'BoundingBoxOO Length C': 414,
 'Center of Homogeneous Mass X': 415,
 'Center of Homogeneous Mass Y': 416,
 'Center of Homogeneous Mass Z': 417,
 'Center of Image Mass X_channel_1': 418,
 'Center of Image Mass X_channel_2': 419,
 'Center of Image Mass X_channel_3': 420,
 'Center of Image Mass X_channel_4': 421,
 'Center of Image Mass X_channel_5': 422,
 'Center of Image Mass Y_channel_1': 423,
 'Center of Image Mass Y_channel_2': 424,
 'Center of Image Mass Y_channel_3': 425,
 'Center of Image Mass Y_channel_4': 426,
 'Center of

In [54]:
# outer keys (object ids) become rows, inner keys (stats ids) become columns
final_df = pd.DataFrame(empty_data_dict).T
# discard every statistics column listed in del_list
f = final_df.drop(columns=del_list)
print(f)


In [55]:
# clean up df: map the remaining numeric column labels to their statistics names
columns = f.columns
print(columns)

# stats id -> readable name, for exactly the columns that survived the drop
new_names = dict((key, final_dict[key]) for key in columns)
print(new_names)

# rename dict


In [56]:
# swap the numeric column labels for their statistics names
final_df = f.rename(columns=new_names)
print(final_df.head())

In [57]:
# expose the row index as an explicit ID column
final_df['ID'] = final_df.index

# column order of the final table
# NOTE(review): 'Volume' is requested in `categories` but is not listed here,
# so it does not appear in `final` -- confirm this is intended.
final_order = [
    'ID',
    'Position X',
    'Position Y',
    'Position Z',
    'Intensity Mean_channel_1',
    'Intensity Mean_channel_2',
    'Intensity Mean_channel_3',
    'Intensity Mean_channel_4',
    'Intensity Mean_channel_5',
]

# select (and thereby reorder) the requested columns
final = final_df.loc[:, final_order]
print(final)