In [1]:
import os
import numpy as np
import pandas as pd

from parsers.surface_parser import SurfaceParser
from imaris.imaris import ImarisDataObject

from typing import List

# from utils.process import run_surface_parser_parallel

In [2]:
data_path = ["../data/multi_surface_track_parser_dev_data/"]
os.path.isdir(data_path[0])

True

In [3]:
# def configure_actor()

In [4]:
import os
import glob
import ray
from typing import List
from utils.utils import get_num_surfaces
from parsers.surface_parser import SurfaceParserDistributed
from imaris.exceptions import NoSurfaceException

# ray.init(num_cpus=28)


#########################################################################################
def run_surface_parser_parallel(
    data_paths: List[str],
    save_dir: str = "processed_data",
    cpu_cores: int = None,
) -> None:
    """
    Runs ALL surfaces in an ims file in parallel.
    Pipeline:
        - For every file in every directory
        - Create a folder a with the same name as the filename inside save_dir provided
        - For every surface inside each file we create a remote actor
        - Once all actors are created we can use the cpu cores provided by user to
            run a fixed chunk of actors in parallel so we dont start too many instances at one
        - If no cpu_cores provided we will run all actors in parallel

    Args:
        data_paths (List[str]): a list of directories
        save_dir (str, optional): dir to save files to. Defaults to "processed_data".
        cpu_cores (int, optional): num of cpu cores allocated for processing. Defaults to None.
    """
    actors = []
    for data_path in data_paths:
        # get all imaris files
        imaris_files = glob.glob(os.path.join(os.path.abspath(data_path), "*.ims"))

        if len(imaris_files) == 0:
            print(f"[info] -- skipping folder no files found ")
            pass

        else:
            for file_path in imaris_files:
                # get filename from path
                filename = os.path.basename(file_path).split(".")[0]

                # create dir with same name as filename
                save_path = os.path.join(save_dir, filename)
                if not os.path.isdir(save_path):
                    os.makedirs(save_path)

                # get num surfaces for file
                num_surfaces = get_num_surfaces(file_path)

                # create actors
                for idx in range(num_surfaces):
                    try:
                        actor = SurfaceParserDistributed.remote(
                            file_path, surface_id=idx, save_dir=save_path
                        )
                    except NoSurfaceException:
                        print(
                            f"[info] -- no surface found in {filename}..skipping file"
                        )
                    actors.append(actor)

                print("adding files to actors: ", len(actors))

    # generate results
    # split if too many actors vs cores else run all
    if cpu_cores and cpu_cores < len(actors):
        num_actors = len(actors)
        num_splits = np.round(num_actors / cpu_cores)
        splits = np.array_split(np.asarray(actors, dtype=object), num_splits)
        for split in splits:
            results = ray.get(
                [
                    actor.extract_and_save.remote(surface_id=0)
                    for _, actor in enumerate(split)
                ]
            )
    else:
        results = ray.get(
            [
                actor.extract_and_save.remote(surface_id=0)
                for _, actor in enumerate(actors)
            ]
        )

In [5]:
# out = run_surface_parser_parallel(
#     data_path,
#     "save_2",
# )

In [6]:
#########################################################################################
def run_surface_parser_parallel_index(
    data_paths: List[str],
    surface_ids: List[int],
    save_dir: str = "processed_data",
    cpu_cores: int = None,
) -> None:
    """
    Runs SPECIFIED surfaces in an ims file in parallel.
    Pipeline:
        - For every file in every directory
        - Create a folder a with the same name as the filename inside save_dir provided
        - For every surface specified in surface_ids we create a remote actor
        - Once all actors are created we can use the cpu cores provided by user to
            run a fixed chunk of actors in parallel so we dont start too many instances at one
        - If no cpu_cores provided we will run all actors in parallel.

    Args:
        data_paths (List[str]): _description_
        surface_ids (List[int]): _description_
        save_dir (str, optional): _description_. Defaults to "processed_data".
        cpu_cores (int, optional): _description_. Defaults to None.
    """
    actors = []
    for data_path in data_paths:
        # get all imaris files
        imaris_files = glob.glob(os.path.join(os.path.abspath(data_path), "*.ims"))

        if len(imaris_files) == 0:
            print(f"[info] -- skipping folder no files found ")
            pass

        else:
            for file_path in imaris_files:
                # get filename from path
                filename = os.path.basename(file_path).split(".")[0]

                # create dir with same name as filename
                save_path = os.path.join(save_dir, filename)
                if not os.path.isdir(save_path):
                    os.makedirs(save_path)

                # get num surfaces for file
                num_surfaces = get_num_surfaces(file_path)

                # create actors
                for idx in range(num_surfaces):
                    if (idx + 1) in surface_ids:
                        try:
                            actor = SurfaceParserDistributed.remote(
                                file_path, surface_id=idx, save_dir=save_path
                            )
                        except NoSurfaceException:
                            print(
                                f"[info] -- no surface found in {filename}..skipping file"
                            )
                        actors.append(actor)

                print("adding files to actors: ", len(actors))

    # generate results
    if cpu_cores and cpu_cores < len(actors):
        num_actors = len(actors)
        num_splits = np.round(num_actors / cpu_cores)
        splits = np.array_split(np.asarray(actors, dtype=object), num_splits)
        for split in splits:
            results = ray.get(
                [
                    actor.extract_and_save.remote(surface_id=0)
                    for _, actor in enumerate(split)
                ]
            )
    else:
        results = ray.get(
            [
                actor.extract_and_save.remote(surface_id=0)
                for _, actor in enumerate(actors)
            ]
        )

In [7]:
out = run_surface_parser_parallel_index(
    data_path,
    surface_ids=[1, 4, 7],  # STARTS AT 1
    save_dir="save_4",
)

2023-11-15 18:08:54,086	INFO worker.py:1621 -- Started a local Ray instance.


adding files to actors:  3
adding files to actors:  5
adding files to actors:  8


In [None]:
np.array_split(np.asarray(out), 3)

In [None]:
20 / 15

In [None]:
f = np.arange(1, 21)

d = np.array_split(f, 3)
d

In [None]:
for i in d:
    print(i.min(), i.max())
    for s in range(i.min() - 1, i.max() + 1):
        print(s)

In [None]:
parser = SurfaceParser(data_path, surface_id=0)
storage = parser.inspect(surface_id=0)

In [None]:
storage["surface_name"]

In [None]:
ims_obj = ImarisDataObject(data_path)
num_surfaces = len(ims_obj.get_object_names("Surface"))

In [None]:
def get_num_surfaces(data_path: str) -> int:
    ims_obj = ImarisDataObject(data_path)
    num_surfaces = len(ims_obj.get_object_names("Surface"))
    return num_surfaces


def surface_contains_tracks(
    data_path: str,
    surface_id: int = 0,
) -> bool:
    """Checks ims file to see if track information is avilable for a given surface.

    Args:
        data_path (str): path to ims file
        surface_id (int, optional): surface id value. Defaults to 0.

    Returns:
        bool: Returns True if track information is avilable else False
    """
    ims_obj = ImarisDataObject(data_path)
    surface_names = ims_obj.get_object_names("Surface")
    return ims_obj.contains_tracks(surface_names[surface_id])


def contains_sufaces(data_path: str, surface_id: int = 0) -> bool:
    """Checks ims file to see if surface information is avilable.

    Args:
        data_path (str): path to ims file
        surface_id (int, optional): surface id value. Defaults to 0.

    Returns:
        bool: Returns True if track information is avilable else False
    """
    ims_obj = ImarisDataObject(data_path)
    surface_names = ims_obj.get_object_names("Surface")
    return ims_obj.contains_sufaces(surface_names[surface_id])


def get_surface_stats(data_path: str, surface_id: int = -1) -> List[str]:
    """_summary_

    Args:
        data_path (str): _description_
        surface_id (int, optional): _description_. Defaults to -1.

    Raises:
        ValueError: _description_

    Returns:
        List[str]: _description_
    """
    num_surfaces = get_num_surfaces(data_path)  # num available surfaces
    parser = SurfaceParser(data_path)
    # grab all
    if surface_id == -1:  # grab all
        available_stats_storage = [
            parser.get_surface_info(surface_id=idx) for idx in range(num_surfaces)
        ]
        return available_stats_storage

    elif surface_id <= num_surfaces:
        available_stats_storage = parser.get_surface_info(surface_id)
        return available_stats_storage

    else:
        raise ValueError("requested surface id exceeeds number of surfaces available")

In [None]:
%timeit contains_tracks(data_path)

In [14]:
from typing import List 

d = [12]
assert (type(d) == List)

AssertionError: 

In [9]:
type(d)

list