In [1]:
import h5py
import numpy as np
from numba import jit, njit
import matplotlib.pyplot as plt
from rich import print
import copy
import pandas as pd
import re
from tqdm import tqdm
import timeit
import re
from utils import *

from stats import Stats
from statscols import StatsColumns
import yaml
import glob
import os
import time

In [2]:
def run(config: dict, data_path: str, categories: list, drop_duplicates: bool):
    '''
    Build one table of per-object statistics for a single imaris file.

    Every point returned by ``get_points`` is handled independently: its
    'Track0', 'TrackObject0' and 'StatisticsValue' records are read, wrapped
    in a ``StatsColumns`` helper, and one column is produced for each entry
    of the functions dict. A point for which a ``KeyError`` or
    ``AttributeError`` is raised while doing so is reported as having no
    track and is skipped.

    args:
        config: configuration dictionary; the keys read here are
            'remove_list_path' and 'special_items'
        data_path: path to imaris file
        categories: statistics categories, forwarded to
            ``create_functions_dict``
        drop_duplicates: kept for interface compatibility; it is not used
            inside this function (``main`` removes duplicates itself)

    returns:
        (data_frame, metadata): the per-point frames joined with
        ``pd.concat``, and a dict mapping each point for which a
        ``StatsColumns`` object could be built to
        {'num_obj_ids': ..., 'num_track_ids': ...}

    raises:
        ValueError: if no point produced a dataframe
    '''
    # load the data
    # NOTE(review): if load_data returns an open file handle it is never
    # closed here -- confirm against utils.load_data and close it if needed
    full_data_file = load_data(data_path)

    # get the points info inside the file
    points = get_points(full_data_file)

    # one dataframe per successfully processed point
    dataframe_storage = []

    # metadata storage
    metadata_storage = {}

    # loop over each point; tqdm wraps the iterable itself (not enumerate)
    # so that it can pick up a length, when there is one, and show a total
    for idx, point in enumerate(tqdm(points)):

        # statistics dictionary for this point
        stats_dict = get_statistics_dict(full_data_file, point)

        # create the functions dict
        # NOTE(review): the remove list is re-read for every point; it looks
        # loop-invariant, but hoisting is only safe if create_functions_dict
        # does not modify the list it is given -- confirm before moving it
        remove_list = read_txt(config['remove_list_path'])
        functions_dict = create_functions_dict(categories, remove_list, stats_dict)

        try:
            # get the track information
            track_id_data = get_stats(full_data_file, point, 'Track0')

            # get the track object information
            track_object_data = get_stats(full_data_file, point, 'TrackObject0')

            # get the statistics value information
            stats_values = get_stats(full_data_file, point, 'StatisticsValue')

            # get the track and object id information in one np array
            track_and_object_id_info = convert_to_matrix(track_id_data, track_object_data)

            # create a dict to extract the data
            stats_data = extract_data(track_and_object_id_info, stats_values)

            # invert the stats dict ie: swap key and values
            inv_stats_dict = invert_stats_dict(stats_dict)

            # initialize the class to create all the necessary columns
            statscols = StatsColumns(
                idx,
                stats_dict,
                track_id_data,
                track_object_data,
                stats_values,
                track_and_object_id_info,
                stats_data,
                inv_stats_dict)

            # get the number of object in current point
            num_points = statscols.obj_ids.shape[0]

            # update metadata
            metadata_storage[point] = {
                'num_obj_ids': num_points,
                'num_track_ids': statscols.track_and_object_id_info.shape[0]}

            # columns of the current point only; nothing is kept across
            # points except the finished dataframe
            storage_dict = {}
            for key, spec in functions_dict.items():
                if isinstance(spec, list):
                    # a list holds the positional arguments of the
                    # track/channel value column builder
                    storage_dict[key] = statscols.universal_create_track_channel_value_column(*spec)
                elif key not in config['special_items']:
                    storage_dict[key] = statscols.universal_create_stats_column(spec)
                else:
                    # special items name a StatsColumns method to call
                    storage_dict[key] = getattr(statscols, spec)()

            # update dataframe
            points_data_arr = pd.DataFrame(
                data=np.hstack(list(storage_dict.values())),
                columns=list(functions_dict.keys()))

            dataframe_storage.append(points_data_arr)

            print(f'info: found track')

        except (KeyError, AttributeError):
            # deliberate best effort: a point that cannot be read is skipped
            print(f'info: no track')

    # pd.concat would raise ValueError on an empty list anyway; raising the
    # same exception type explicitly keeps the caller's handling unchanged
    # while making the reason obvious
    if not dataframe_storage:
        raise ValueError(f'no tracks found in any point of {data_path}')

    # concatenate all the points
    return pd.concat(dataframe_storage), metadata_storage

In [3]:
def main(config_path: str, drop_duplicates: bool=True) -> None:
    '''
    Convert every ``*.ims`` file of the configured data directory into a csv
    of statistics plus a yaml metadata file, both written to the save
    directory and named after the imaris file.

    args:
        config_path: path to the config yaml file; the keys read here are
            'stats_category_path', 'save_dir' and 'data_dir'
        drop_duplicates: when True, rows sharing the same 'ID' are collapsed,
            keeping the last occurrence
    '''
    # load yaml file as a dictionary
    config = load_yaml(config_path)

    # get the statistics categories
    categories = read_txt(config['stats_category_path'])

    # create saving directory; makedirs also creates missing parents and
    # does not fail when the directory already exists
    os.makedirs(config['save_dir'], exist_ok=True)

    # sorted so that files are processed in a reproducible order
    for data_path in sorted(glob.glob(os.path.join(config['data_dir'], '*.ims'))):

        print(f"\ninfo: data path -- {data_path}")
        # get the name of the imaris file; splitext removes only the final
        # extension, so names containing dots are not truncated (and two
        # such files no longer overwrite each other's output)
        imaris_name = os.path.splitext(os.path.basename(data_path))[0]
        csv_name = f"{imaris_name}.csv"
        metadata_name = f"{imaris_name}.yaml"

        print(f"info: file name -- {imaris_name}")

        # create storage file
        try:
            data_frame, metadata = run(config, data_path, categories, drop_duplicates)

            # remove unwanted columns with no values
            data_frame = data_frame.dropna(how='all', axis=1)

            if drop_duplicates:
                if 'ID' in data_frame.columns:
                    data_frame = data_frame.drop_duplicates(
                        subset=['ID'], keep='last', ignore_index=True)
                else:
                    # without the guard a missing 'ID' column raises a
                    # KeyError that is not caught below and stops the batch
                    print("info: no 'ID' column -- duplicates not removed")

            # save a csv
            data_frame.to_csv(os.path.join(config['save_dir'], csv_name), index=False)
            # save metadata
            dict_to_yaml(metadata, os.path.join(config['save_dir'], metadata_name))

        except (ValueError, AttributeError):
            # NOTE(review): this also catches ValueError/AttributeError
            # raised while cleaning or saving, which would then be reported
            # as "No Tracks Found" -- consider narrowing the try block
            print(f'info: Skipping File - No Tracks Found\n')
        

         

In [4]:
# Run the whole conversion and report how long it took, in seconds.
# perf_counter is monotonic, so the measured duration is not affected by
# system clock adjustments during the (multi-minute) run.
start = time.perf_counter()
main('config.yaml', True)
end = time.perf_counter()
print(end - start)

0it [00:00, ?it/s]

1it [00:56, 56.95s/it]

2it [02:05, 64.00s/it]

3it [02:47, 55.92s/it]


0it [00:00, ?it/s]

1it [00:30, 30.98s/it]

2it [00:51, 24.68s/it]

4it [00:59, 11.78s/it]

5it [00:59, 11.97s/it]
