In [2]:
from utils import *
from rich import print
import os
from numba import jit, njit
import copy

In [3]:
# load h5py data
# `load_data` is not defined in this notebook; presumably it comes from
# `from utils import *` -- TODO confirm.
# NOTE(review): the .ims path is hard-coded and relative to the notebook's
# working directory.
data = load_data('../Surface_parser_column_swap_issue/Exp116-BACH1IKO42dpi_clec4f+tim4_10202022_New_[ii39_F480-AF532_FMO_3D_Merged_Image_40]_[ims1_2022-11-14T13-54-07.151].ims')

In [4]:
# list the top-level groups of the opened file
data.keys()


<KeysViewHDF5 ['DataSet', 'DataSetEvents', 'DataSetInfo', 'DataSetTimes', 'Scene', 'Scene8', 'Thumbnail']>

In [5]:
# list the objects stored under Scene/Content
data.get('Scene').get('Content').keys()

<KeysViewHDF5 ['MegaSurfaces0', 'MegaSurfaces1', 'MegaSurfaces2', 'MegaSurfaces3', 'MegaSurfaces4', 'MegaSurfaces5', 'MegaSurfaces6', 'MegaSurfaces7']>

In [6]:
# list the datasets stored for the MegaSurfaces0 object under Scene8/Content
data.get('Scene8').get('Content').get('MegaSurfaces0').keys()

<KeysViewHDF5 ['BlockData', 'BlockInfo', 'BlockPath', 'Category', 'CategoryFunction', 'CreationParameters', 'Factor', 'FactorFunction', 'FactorList', 'FactorListFunction', 'LabelColor', 'LabelColorData', 'LabelColorLabelGroupNames', 'LabelColorLabelValues', 'LabelGroupNames', 'LabelSetLabelIDs', 'LabelSetObjectIDs', 'LabelSets', 'LabelValues', 'LevelInfo', 'MainTrackSegmentTable', 'MainTrackSegmentTable_Focus', 'SplitOffset', 'StatisticsType', 'StatisticsTypeFunction', 'StatisticsValue', 'StatisticsValueFunction', 'StatisticsValueTimeOffset', 'StatisticsValueTimeOffsetFunction', 'SurfaceModel', 'SurfaceModelInfo', 'SurfaceTimeOffset', 'Time', 'TimeBegin', 'TrackSegment0', 'TrackSegment0_Focus']>

In [8]:
## Can we get the surface objects by modifying the following function?
def get_object_names(full_data_file: h5py.File, search_for: str) -> list:
    """
    Collect the names of the scene objects whose name matches a pattern.

    Args:
        full_data_file (h5py._hl.files.File): full imaris file in h5py File format
        search_for (str): full or partial object name to look for; it is used
            as a regular expression and may match anywhere in the name

    Returns:
        (list): every key of Scene/Content that matches search_for, in the
        order the keys are iterated
    """

    scene_objects = full_data_file.get('Scene').get('Content').keys()

    # a name is kept as soon as the pattern matches somewhere inside it
    return [name for name in scene_objects if re.search(search_for, name) is not None]

# test function -- working
# smoke test: list every object under Scene/Content whose name contains 'Surface'
# NOTE(review): `out` is reassigned to the statistics DataFrame in a later
# cell, so this list is only valid until that cell runs.
out = get_object_names(data, 'Surface')
print(out)

In [39]:
# once we know the names of the objects we want to extract data from, use the following function
# this function doesn't grab any data values, just the statistic names
def get_statistics_names(full_data_file: h5py.File, object_name: str) -> dict:
    """
    Build the statistic-ID -> statistic-name lookup for one object.
    Only names are read here, no statistic values.
    ex: statistics name = mean intensity, associated id=404

    Args:
        full_data_file (h5py._hl.files.File): full imaris file in h5py File format
        object_name (str): name of the object to get statistic names from

    Returns:
        dict: a dict where the keys=unique stats ID, value=statistic name
        exactly as stored in the file
    """

    object_group = full_data_file['Scene8']['Content'][object_name]

    # StatisticsType is turned into an array and then a frame so that its
    # 'Name' and 'ID' fields can be read as columns
    stats_table = pd.DataFrame(np.asarray(object_group['StatisticsType']))

    names = stats_table['Name']
    ids = stats_table['ID']

    return {stat_id: stat_name for stat_id, stat_name in zip(ids, names)}

# test -- working
# the values are bytes, and the same name can appear under several IDs
# (e.g. b'Center of Image Mass X' for IDs 417-421 in the output below)
out_stats_names = get_statistics_names(data, 'MegaSurfaces0')

out_stats_names


{407: b'Area',
 408: b'BoundingBoxAA Length X',
 409: b'BoundingBoxAA Length Y',
 410: b'BoundingBoxAA Length Z',
 411: b'BoundingBoxOO Length A',
 412: b'BoundingBoxOO Length B',
 413: b'BoundingBoxOO Length C',
 414: b'Center of Homogeneous Mass X',
 415: b'Center of Homogeneous Mass Y',
 416: b'Center of Homogeneous Mass Z',
 417: b'Center of Image Mass X',
 418: b'Center of Image Mass X',
 419: b'Center of Image Mass X',
 420: b'Center of Image Mass X',
 421: b'Center of Image Mass X',
 422: b'Center of Image Mass Y',
 423: b'Center of Image Mass Y',
 424: b'Center of Image Mass Y',
 425: b'Center of Image Mass Y',
 426: b'Center of Image Mass Y',
 427: b'Center of Image Mass Z',
 428: b'Center of Image Mass Z',
 429: b'Center of Image Mass Z',
 430: b'Center of Image Mass Z',
 431: b'Center of Image Mass Z',
 432: b'Distance from Origin',
 433: b'Distance to Image Border XY',
 434: b'Distance to Image Border XYZ',
 435: b'Ellipsoid Axis A X',
 436: b'Ellipsoid Axis A Y',
 437: b'E

In [8]:
#np.savetxt('test.txt', list(out_stats_names.values()), fmt='%s')

In [11]:
# once we have the statistics names we can get numerical statistics values
# for each object id within the specified object
def get_stats_values(full_data_file: h5py.File, object_name: str) -> pd.DataFrame:
    """
    Read the raw statistics table of one object into a data frame.

    Args:
        full_data_file (h5py._hl.files.File): full imaris file in h5py File format
        object_name (str): name of the object to get statistic values from

    Returns:
        pd.DataFrame: the object's StatisticsValue dataset as a frame, one row
        per stored value. For the file used in this notebook the columns
        include ID_Object, ID_StatisticsType, ID_Time and Value.
    """
    scene_content = full_data_file.get('Scene8').get('Content')
    raw_values = scene_content[object_name]['StatisticsValue']

    # go through an array first, then hand it to pandas
    return pd.DataFrame(np.asarray(raw_values))

# long-format statistics table for MegaSurfaces0; the ID_Time column is
# dropped because the later cells only use object ID, statistic ID and value
out = get_stats_values(data, 'MegaSurfaces0').drop('ID_Time', axis=1)
print(out)

In [40]:
# rows holding statistic ID 460, ordered by object ID
# NOTE(review): 460 is a magic number here -- look it up in out_stats_names
# to confirm which statistic it is.
out[out['ID_StatisticsType'] == 460].sort_values('ID_Object')

Unnamed: 0,ID_Object,ID_StatisticsType,Value
174907,2,460,1.864865
174908,7,460,0.873950
174909,10,460,0.939024
174910,14,460,1.077029
174911,16,460,1.421053
...,...,...,...
173420,6627,460,1.775000
173421,6629,460,1.607143
173422,6630,460,1.903226
173423,6632,460,0.850000


In [12]:
# regroup the long-format table into a nested dict keyed by object ID;
# after this statement each entry looks like {'Value': {statistic ID: value}}
extracted_stats = out.groupby('ID_Object')[
    ['ID_StatisticsType', 'Value']].apply(lambda x: x.set_index('ID_StatisticsType').to_dict(orient='dict')).to_dict()
# strip the intermediate 'Value' level -> {object ID: {statistic ID: value}}
extracted_stats = {k: v['Value'] for k, v in extracted_stats.items()} # clean up step for line above

In [13]:
def invert_stats_dict(stats_dict: dict=None):
    '''
    Invert a {statistic ID: statistic name} dict into {statistic name: ID}.

    IDs are visited in ascending order. A name that occurs once maps straight
    to its ID. A name that occurs under several IDs maps to a nested dict
    {'channel_1': first ID, 'channel_2': second ID, ...}, numbered in
    ascending ID order (the repeated names are assumed to be one entry per
    channel -- TODO confirm against the file).

    Args:
        stats_dict (dict): key=statistic ID, value=statistic name as bytes
            (as returned by get_statistics_names) or as str

    Returns:
        dict: key=statistic name as str, value=ID or {'channel_n': ID} dict;
        an empty dict when stats_dict is None or empty
    '''
    # the signature advertises None as the default, so treat it like "nothing to invert"
    if not stats_dict:
        return {}

    def _as_text(name):
        # decode real bytes instead of slicing the repr "b'...'" apart: the
        # slicing trick mangles plain str names and leaves non-ASCII bytes
        # as escape sequences
        if isinstance(name, bytes):
            return name.decode('utf-8', errors='replace')
        return str(name)

    storage = {}

    for stats_id in sorted(stats_dict):
        name = _as_text(stats_dict[stats_id])

        # first time this name is seen: store the bare ID
        if name not in storage:
            storage[name] = stats_id
            continue

        current_value = storage[name]

        # second occurrence: promote the bare ID to a per-channel dict
        if not isinstance(current_value, dict):
            current_value = {'channel_1': current_value}
            storage[name] = current_value

        # append under the next free channel number
        current_value[f"channel_{len(current_value) + 1}"] = stats_id

    return storage
# we can directly use this to invert the dict
# this inverted dict can be used to create the final excel file

In [14]:
# name -> ID lookup (nested per channel for repeated names)
inverted_dict =  invert_stats_dict(out_stats_names)
# statistics the user wants in the final table. Each entry is later used as a
# key into the flattened lookup, so it must match a flattened name exactly:
# '<name>' for single statistics, '<name>_channel_<n>' for repeated ones.
categories = [
    'Position X',
    'Position Y',
    'Position Z',
    'Intensity Mean_channel_1',
    'Intensity Mean_channel_2',
    'Intensity Mean_channel_3',
    'Intensity Mean_channel_4',
    'Intensity Mean_channel_5',
    'Volume'
]

In [15]:
# flatten dict 
from collections.abc import MutableMapping

def flatten(d, parent_key='', sep='_'):
    """
    Collapse a nested mapping into a single-level dict.

    The key of a nested value is the chain of keys leading to it, joined
    with `sep` (e.g. {'a': {'b': 1}} -> {'a_b': 1}). Keys below a non-empty
    prefix are concatenated to it, so they must be strings.

    Args:
        d: mapping to flatten; values that are MutableMapping are descended into
        parent_key (str): prefix put in front of every key; empty for no prefix
        sep (str): separator placed between key levels

    Returns:
        dict: flattened key -> leaf value
    """
    flat = {}
    for name, value in d.items():
        if parent_key:
            path = parent_key + sep + name
        else:
            path = name

        if isinstance(value, MutableMapping):
            # merge the flattened sub-mapping, carrying the path as prefix
            flat.update(flatten(value, path, sep=sep))
        else:
            flat[path] = value
    return flat

In [16]:
# single-level lookup: flattened statistic name -> statistic ID
inverted_flatten = flatten(inverted_dict)
# can we provide an editable list before anything starts so he can manually select which ones he wants
# that way the names match exactly because I control it. 

# reverse dict again, key=num, value=name
final_dict = {v: k for k,v in inverted_flatten.items()}

# can we use the categories requested to create a list of numbers we dont want?
# Collect the statistic IDs that were NOT requested in `categories`.
# The list is built directly rather than pre-allocated with None: a
# pre-sized list overflows (IndexError) when a category is missing from
# `inverted_flatten`, and keeps stray None placeholders when an entry is
# skipped, which DataFrame.drop then rejects.
del_list = [
    v for k, v in inverted_flatten.items()
    if k not in categories and v is not None
]
    

In [28]:
# spot check: name stored for statistic ID 170 in the ID -> name lookup
final_dict[170]

'Intensity Mean_channel_1'

In [17]:
# translate every requested category name into its statistic ID
user_request_numeric = [inverted_flatten[name] for name in categories]

# work on a copy of the ID -> name lookup and strike out the requested IDs;
# whatever is left are the IDs that were not asked for
final_dict_copy = copy.deepcopy(final_dict)
for value in user_request_numeric:
    del final_dict_copy[value]

new_del_list = list(final_dict_copy)

In [18]:
# sanity check: leftover IDs, all IDs, requested IDs
# (leftover + requested should equal all, provided the requested IDs are unique)
print(len(final_dict_copy.keys()), len(final_dict.keys()), len(user_request_numeric))

In [21]:
# wide table: one row per object ID, one column per statistic ID
final_df = pd.DataFrame(extracted_stats).T
# remove the statistic columns that were not requested
f = final_df.drop(columns=del_list)
print(f)


In [23]:
# NOTE(review): diagnostic cell -- dropping the label None raises the KeyError
# shown in the output below, so the assignment to `f` never happens and `f`
# keeps whatever an earlier cell bound it to. This cell stops a
# "Restart & Run All"; consider deleting it once the None question is settled.
final_df = pd.DataFrame(extracted_stats).transpose()
f = final_df.drop(labels=[None], axis=1)
print(f)

KeyError: '[None] not found in axis'

In [25]:
# clean up df by changing number to names and rearranging columns
# the column labels of `f` are the numeric statistic IDs that survived the drop
columns = f.columns
print(columns)

# statistic ID -> flattened statistic name for every remaining column;
# raises KeyError if a column ID is missing from final_dict
new_names = {key: final_dict[key] for key in columns}
print(new_names)

# rename dict


In [26]:
# swap the numeric statistic IDs in the header for their names
final_df = f.rename(columns=new_names)
print(final_df.head())

In [57]:
# create the final id column
# the index holds the object IDs; copy it into a regular column
# (this adds the column to `final_df` in place)
final_df['ID'] = final_df.index


# rearrange
# column order of the exported table
# NOTE(review): 'Volume' is requested in `categories` but is not listed here,
# so it is left out of `final` -- confirm this is intended.
final_order = [
    'ID',
    'Position X',
    'Position Y',
    'Position Z',
    'Intensity Mean_channel_1',
    'Intensity Mean_channel_2',
    'Intensity Mean_channel_3',
    'Intensity Mean_channel_4',
    'Intensity Mean_channel_5',
]

# selecting with a list both filters and reorders the columns
final = final_df[final_order]
print(final)