* Notes
    * Filename: launch_parser_mp_files.ipynb
    * Multiprocessing performed for each directory
    * Example: within each directory, all surfaces of every file are processed in parallel
    * High memory consumption if the Imaris files are large

In [1]:
import sys

sys.path.append(".")

import os
import gc
import ray
import glob
import time
import numpy as np
from utils import utils
import concurrent.futures
from tqdm.notebook import tqdm

Get Available Categories

In [2]:
def generate_available_categories(config_path: str, save_path: str):
    """Write the available statistics categories for one sample file.

    Reads the YAML config at ``config_path``, resolves the category
    extraction function for the configured ``parser_type`` and runs it on the
    first ``.ims`` file found in the first entry of ``data_dir``, using the
    first entry of ``valid_surface`` converted to a zero-based index.

    Args:
        config_path: Path to the YAML configuration file.
        save_path: Location forwarded to the category extraction function.
    """
    config = utils.load_yaml(config_path)

    # resolve the extraction routine that matches the configured parser
    category_extractor = utils.get_category_function(config["parser_type"])

    search_dirs = config["data_dir"]

    # config surfaces are one-based; the extractor is given a zero-based index
    surface_index = int(config["valid_surface"][0]) - 1

    for search_dir in search_dirs:
        ims_files = glob.glob(os.path.join(search_dir, "*.ims"))

        # a single sample file is enough to list the categories
        if ims_files:
            category_extractor(ims_files[0], surface_index, save_path)

        # only the first directory is ever inspected
        break

    print("[info] Please Edit The Statistics File")

In [3]:
# RUN GENERATE CATEGORIES
# generate_available_categories("configs/config.yaml", ".")

In [4]:
# stats_categories = utils.read_txt('stats_categories_track.txt')

Generate Statistics 

In [5]:
def _load_parser_functions(parser_type: str):
    """Return ``(extract_data, process_and_save)`` for ``parser_type``.

    Raises:
        ValueError: If ``parser_type`` is not "track", "surface" or "first".
    """
    if parser_type == "track":
        from parsers.track_parser import extract_data, process_and_save
    elif parser_type == "surface":
        from parsers.surface_parser import extract_data, process_and_save
    elif parser_type == "first":
        from parsers.first_surface_parser import extract_data, process_and_save
    else:
        raise ValueError("Invalid Parser Type")

    print(f"[info] -- parser mode: {parser_type}")
    return extract_data, process_and_save


def _submit_file_surfaces(
    filename,
    valid_surfaces,
    save_dir,
    parser_type,
    extract_categories,
    extract_data,
    process_and_save,
    task_refs,
):
    """Extract each requested surface of one file and submit it to Ray.

    Task references are appended to ``task_refs`` in place so that tasks
    submitted before a failure on a later surface are still waited on by the
    caller. The loaded file is a local of this function and is therefore
    released as soon as the function returns or raises.
    """
    print(f"\n[info] -- processing file {os.path.basename(filename)}\n")

    # load the imaris file
    data = utils.load_ims(filename)

    for surface in valid_surfaces:
        surface_number = int(surface)

        # get the stats categories available for current surface
        # NOTE(review): the one-based surface number is passed here, while
        # generate_available_categories passes a zero-based index to the same
        # function -- confirm which convention the extractor expects.
        stats_categories = extract_categories(
            filename, surface_number, save_path="."
        )

        # create the per-surface output folder
        folder_path = os.path.join(save_dir, str(surface))
        os.makedirs(folder_path, exist_ok=True)

        # convert to zero indexed surface value
        current_surface = surface_number - 1

        save_path = utils.get_save_filepath(
            parser_type, folder_path, filename, surface_number
        )

        extracted_data = extract_data(filename, data, current_surface, save_path)

        if extracted_data:
            # append stats categories to extracted data
            extracted_data.update({"categories_list": stats_categories})
            task_refs.append(process_and_save.remote(extracted_data))
        elif not os.listdir(folder_path):
            # there is no surface: remove the folder if nothing was written
            os.rmdir(folder_path)


def generate_statistics(config_path: str):
    """Extract and save statistics for every ``.ims`` file listed in the config.

    For each directory in ``data_dir`` the ``.ims`` files are split into
    chunks of roughly ``chunk_size`` files. Every valid surface of every file
    in a chunk is extracted in this process and handed to
    ``process_and_save.remote``; the function then waits for the chunk's
    tasks before moving on. Files that raise during loading or extraction are
    skipped and listed in ``skipped_files.txt``. Directories without ``.ims``
    files are skipped with a message.

    Args:
        config_path: Path to the YAML configuration file. The keys read are
            ``parser_type``, ``data_dir``, ``save_dir`` and ``valid_surface``.

    Raises:
        ValueError: If ``parser_type`` is not a supported parser.
    """
    # init ray
    ray.init(
        ignore_reinit_error=True,
    )

    try:
        chunk_size = 4

        # load config path
        config = utils.load_yaml(config_path)

        # type of parser
        parser_type = config["parser_type"]
        extract_data, process_and_save = _load_parser_functions(parser_type)

        # loop-invariant: resolve the category extractor once
        extract_categories = utils.get_category_function(parser_type)

        # files to scan
        directories = config["data_dir"]

        # valid surface
        valid_surfaces = config["valid_surface"]

        # iterate, process and save
        # parallel processes all files within 1 directory at a time
        skip_list = []
        # total= is required: enumerate() hides the length from tqdm
        for idx, directory in tqdm(
            enumerate(directories), total=len(directories)
        ):
            print(
                f"[info] -- processing directory index: {idx+1}/{len(directories)}"
            )

            # save dir
            save_dir = config["save_dir"][idx]

            # grab all the files in the directory w/ .ims
            filenames = glob.glob(os.path.join(directory, "*.ims"))
            if not filenames:
                # nothing to index or split; avoid IndexError / ValueError
                print(f"[info] -- no .ims files found in: {directory}")
                continue
            print("Len Files: ", len(filenames), filenames[0])

            num_chunks = int(np.ceil(len(filenames) / chunk_size))
            file_chunks = np.array_split(
                np.array(filenames, dtype=str), num_chunks
            )

            for file_chunk in file_chunks:
                # ray task references for this chunk
                processes = []

                for filename in file_chunk.tolist():
                    try:
                        _submit_file_surfaces(
                            filename,
                            valid_surfaces,
                            save_dir,
                            parser_type,
                            extract_categories,
                            extract_data,
                            process_and_save,
                            processes,
                        )
                    except Exception as e:
                        # deliberate best-effort: a file that cannot be
                        # processed for any reason is reported and skipped
                        print(f"\t[info] -- error file: {filename}")
                        print(f"\t[info] -- raised Exception [{e}]")
                        print("\t[info] -- skipping file\n")
                        skip_list.append(filename)

                print("\n\t[info] -- finished data extraction")
                print(f"\t[info] -- found {len(processes)} surfaces")
                print(f"\t[info] -- processing {len(processes)} surfaces\n")

                # NOTE(review): ray.wait only blocks until the tasks finish;
                # it presumably does not re-raise errors from inside
                # process_and_save -- confirm whether ray.get is wanted.
                if processes:
                    ray.wait(processes, num_returns=len(processes))

                # memory clean up
                del processes
                gc.collect()

        np.savetxt("skipped_files.txt", np.array(skip_list, dtype=str), fmt="%s")
    finally:
        # always release ray, even if the run fails part-way
        ray.shutdown()

    print("\n[info] -- DONE")

In [6]:
# RUN GENERATE STATISTICS
# Time the full run with time.perf_counter() and report the elapsed seconds.
# The config path below selects configs/config_first.yaml.
start = time.perf_counter()
generate_statistics(config_path="configs/config_first.yaml")
stop = time.perf_counter()
print(f"Total Run Time: {stop - start}")

[info] -- parser mode: first


0it [00:00, ?it/s]

[info] -- processing directory index: 1/1
Len Files:  3 ../../data/multi_surface_track_parser_dev_data/GFP #1 Sec1 Roi2 2x2 1h30min.ims

[info] -- processing file GFP #1 Sec1 Roi2 2x2 1h30min.ims

	[info] extracting data: GFP #1 Sec1 Roi2 2x2 1h30min.ims -- surface: 0
[2m[36m(process_and_save pid=209975)[0m 	[info] working on file: GFP #1 Sec1 Roi2 2x2 1h30min.ims -- surface 0
[2m[36m(process_and_save pid=209975)[0m 	[info] finished: GFP #1 Sec1 Roi2 2x2 1h30min.ims -- surface 0 -- processed 1098 items
	[info] extracting data: GFP #1 Sec1 Roi2 2x2 1h30min.ims -- surface: 1
[2m[36m(process_and_save pid=209975)[0m 	[info] working on file: GFP #1 Sec1 Roi2 2x2 1h30min.ims -- surface 1
[2m[36m(process_and_save pid=209975)[0m 	[info] finished: GFP #1 Sec1 Roi2 2x2 1h30min.ims -- surface 1 -- processed 293 items
	[info] extracting data: GFP #1 Sec1 Roi2 2x2 1h30min.ims -- surface: 2
[2m[36m(process_and_save pid=209975)[0m 	[info] working on file: GFP #1 Sec1 Roi2 2x2 1h30min.i

In [None]:
# Working