* Notes
    * Filename: launch_parser_mp_surface.ipynb
    * Multiprocessing is performed per file
    * Example: for each file, all surfaces are processed in parallel
    * Better for memory consumption if the Imaris files are large

In [1]:
import sys

# Add the current working directory to the import search path.
# NOTE(review): presumably needed so the project-local `utils` / `parsers`
# packages resolve when the notebook is started from the repo root — confirm.
sys.path.append(".")

import os
import gc
import ray
import glob
import time
import numpy as np
from utils import utils
import concurrent.futures
from tqdm.notebook import tqdm

# Set before any .ims (HDF5) file is opened in this kernel.
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"  # disables read filelock

Get Available Categories

In [2]:
def generate_available_categories(config_path: str, save_path: str):
    """Export the available statistics categories from one sample ``.ims`` file.

    Loads the YAML config at ``config_path``, obtains the category extractor
    for its ``parser_type`` and runs that extractor on the first ``.ims`` file
    found in the configured ``data_dir`` directories. The surface used is the
    first entry of ``valid_surface``, converted to a zero-based index.

    Directories are searched in config order and files in sorted order, so the
    chosen file is deterministic, and a directory that contains no ``.ims``
    file is skipped instead of silently ending the search.

    Args:
        config_path: path to the YAML configuration file.
        save_path: output location handed to the extractor.

    Returns:
        None. A reminder to edit the statistics file is printed after a
        successful extraction; a warning is printed when no ``.ims`` file
        exists in any configured directory.
    """
    # load config path (named `config` to avoid shadowing a `yaml` module)
    config = utils.load_yaml(config_path)

    # extract category func for the configured parser type
    extract_categories = utils.get_category_function(config["parser_type"])

    # directories to scan
    directories = config["data_dir"]

    # config surfaces are one-based; the extractor receives a zero-based index
    valid_surface = int(config["valid_surface"][0]) - 1

    for directory in directories:
        # grab all the files in the directory w/ .ims (sorted => reproducible pick)
        filenames = sorted(glob.glob(os.path.join(directory, "*.ims")))
        if not filenames:
            # nothing usable here, keep looking in the next directory
            continue

        # one sample file is enough: get and save the available categories
        extract_categories(filenames[0], valid_surface, save_path)
        print("[info] Please Edit The Statistics File")
        return

    # no directory contained an .ims file, so nothing was generated
    print("[warning] No .ims file found in the configured data directories")

In [3]:
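# RUN GENERATE CATEGORIES
# Uncomment the call below to (re)generate the categories file, then edit it
# before running the statistics step.
# generate_available_categories("configs/config.yaml", ".")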

Generate Statistics 

In [4]:
import logging

# Root logger emits the bare message text (no level/timestamp prefix)
# for records at INFO level and above.
logging.basicConfig(format="%(message)s", level=logging.INFO)


def generate_statistics(config_path: str):
    """Extract and save statistics for every configured surface of every ``.ims`` file.

    Workflow, driven by the YAML config at ``config_path``:

    1. Start (or reuse) a local Ray instance.
    2. Pick the ``extract_data`` / ``process_and_save`` pair matching
       ``parser_type`` (``"track"``, ``"surface"`` or ``"first"``).
    3. For each directory in ``data_dir`` (paired by index with ``save_dir``)
       and each ``.ims`` file in it: load the file once, run ``extract_data``
       serially for every entry of ``valid_surface``, and submit one
       ``process_and_save`` Ray task per surface that yielded data.
    4. Block until all tasks of the current file are finished before moving
       on to the next file, then release the loaded file data.

    Output goes to ``<save_dir>/<surface>/``; a surface folder that is still
    empty after a failed extraction is removed again.

    Args:
        config_path: path to the YAML configuration file.

    Raises:
        ValueError: if ``parser_type`` is not one of the supported values.
    """
    # initialize ray
    ray.init(
        ignore_reinit_error=True,
    )

    # load config path (named `config` to avoid shadowing a `yaml` module)
    config = utils.load_yaml(config_path)

    # type of parser
    parser_type = config["parser_type"]

    # parser modules are imported lazily so only the selected one is loaded
    if parser_type == "track":
        from parsers.track_parser import extract_data, process_and_save
    elif parser_type == "surface":
        from parsers.surface_parser import extract_data, process_and_save
    elif parser_type == "first":
        from parsers.first_surface_parser import extract_data, process_and_save
    else:
        raise ValueError("Invalid Parser Type")

    print(f"[info] -- parser mode: {parser_type}")

    # files to scan
    directories = config["data_dir"]

    # get the stats categories
    stats_categories = utils.read_txt(config["stats_category_path"])

    # valid surfaces (one-based in the config)
    valid_surfaces = config["valid_surface"]

    # iterate, process and save
    # files are handled one at a time; the surfaces of a file run in parallel
    # `total` is required because enumerate() hides the length from tqdm
    for idx, directory in tqdm(enumerate(directories), total=len(directories)):
        print(f"[info] -- processing directory index: {idx+1}/{len(directories)}")
        # save dir paired with this data directory
        save_dir = config["save_dir"][idx]

        # grab all the files in the directory w/ .ims
        filenames = list(glob.glob(os.path.join(directory, "*.ims")))

        for filename in filenames:
            print(f"\n[info] -- processing file {os.path.basename(filename)}\n")
            # load the imaris file once; it is shared by all surfaces below
            data = utils.load_ims(filename)

            # references to the ray tasks submitted for this file
            processes = []
            for surface in valid_surfaces:
                # create the per-surface output folder
                folder_path = os.path.join(save_dir, str(surface))
                os.makedirs(folder_path, exist_ok=True)

                # convert to zero indexed surface value
                current_surface = int(surface) - 1

                # save_file_path (expects the one-based surface number)
                save_path = utils.get_save_filepath(
                    parser_type, folder_path, filename, (current_surface + 1)
                )

                # extract data
                extracted_data = extract_data(
                    filename, data, current_surface, save_path
                )

                # process and save
                if extracted_data:
                    # append stats categories to extracted data
                    extracted_data.update({"categories_list": stats_categories})
                    task_ref = process_and_save.remote(extracted_data)
                    processes.append(task_ref)

                else:
                    # no data for this surface: drop the folder if nothing is in it
                    if not os.listdir(folder_path):
                        os.rmdir(folder_path)

            print("\n\t[info] -- finished data extraction")
            print(f"\t[info] -- found {len(processes)} surfaces")
            print(f"\t[info] -- processing {len(processes)} surfaces\n")

            # block until every task of this file is done; skipped when no
            # surface produced data. NOTE: ray.wait only waits, it does not
            # re-raise exceptions that happened inside the tasks.
            if processes:
                ray.wait(processes, num_returns=len(processes))

            # memory clean up before the next (possibly large) file is loaded
            del data
            del processes
            gc.collect()

    print("\n[info] -- DONE")

In [5]:
# RUN GENERATE STATISTICS
# Time the complete statistics run end to end and report the elapsed seconds.
start = time.perf_counter()
generate_statistics(config_path="configs/config.yaml")
stop = time.perf_counter()
elapsed_seconds = stop - start
print(f"Total Run Time: {elapsed_seconds}")

2023-08-18 01:01:21,482	INFO worker.py:1621 -- Started a local Ray instance.


[info] -- parser mode: surface


0it [00:00, ?it/s]

[info] -- processing directory index: 1/1

[info] -- processing file ALN Sec2 Roi1 2x2 1h20min.ims

	[info] extracting data: ALN Sec2 Roi1 2x2 1h20min.ims -- surface: 0
	[info] extracting data: ALN Sec2 Roi1 2x2 1h20min.ims -- surface: 1
	[info] extracting data: ALN Sec2 Roi1 2x2 1h20min.ims -- surface: 2
	[info] extracting data: ALN Sec2 Roi1 2x2 1h20min.ims -- surface: 3
	[info] extracting data: ALN Sec2 Roi1 2x2 1h20min.ims -- surface: 4
		[info] -- raised index error
		[info] -- skipping file ALN Sec2 Roi1 2x2 1h20min.ims -- surface 4 not found
	[info] extracting data: ALN Sec2 Roi1 2x2 1h20min.ims -- surface: 5
		[info] -- raised index error
		[info] -- skipping file ALN Sec2 Roi1 2x2 1h20min.ims -- surface 5 not found
	[info] extracting data: ALN Sec2 Roi1 2x2 1h20min.ims -- surface: 7
		[info] -- raised index error
		[info] -- skipping file ALN Sec2 Roi1 2x2 1h20min.ims -- surface 7 not found
	[info] extracting data: ALN Sec2 Roi1 2x2 1h20min.ims -- surface: 9
		[info] -- raised

[2m[33m(raylet)[0m [2023-08-18 01:01:31,480 E 74732 74745] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-08-18_01-01-19_771660_74587 is over 95% full, available space: 5696790528; capacity: 1958874054656. Object creation will fail if spilling is required.


[2m[36m(process_and_save pid=74823)[0m 	ERROR in FILE: ILN with VLP Tiled 12x18 incomplete.ims - surface 2 raised Exception ['Value']


KeyboardInterrupt: 

[2m[36m(process_and_save pid=74829)[0m 	ERROR in FILE: ILN with VLP Tiled 12x18 incomplete.ims - surface 1 raised Exception ['Value']


[2m[33m(raylet)[0m [2023-08-18 01:01:41,493 E 74732 74745] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-08-18_01-01-19_771660_74587 is over 95% full, available space: 5696720896; capacity: 1958874054656. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-08-18 01:01:51,505 E 74732 74745] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-08-18_01-01-19_771660_74587 is over 95% full, available space: 5696962560; capacity: 1958874054656. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-08-18 01:02:01,516 E 74732 74745] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-08-18_01-01-19_771660_74587 is over 95% full, available space: 5696950272; capacity: 1958874054656. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-08-18 01:02:11,528 E 74732 74745] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-08-18_01-01-19_771660_74587 is over 95% full, available space: 

In [None]:
# Working