In [1]:
import utils
import numpy as np
import glob
import os
import pandas as pd
import ray

Get Available Categories

In [2]:
def generate_available_categories(config_path: str):
    """Extract the available statistics categories from one sample ``.ims`` file.

    Loads the YAML configuration at ``config_path``, looks up the category
    extraction function for the configured ``parser_type`` and calls it on the
    first ``.ims`` file found in the configured data directories.

    Args:
        config_path: Path to the YAML configuration file. The keys read here
            are ``parser_type``, ``data_dir`` and ``valid_surface``.
    """
    # Honour the path supplied by the caller instead of a hard-coded file name.
    config = utils.load_yaml(config_path)

    # Category extraction function matching the configured parser type.
    extract_categories = utils.get_category_function(config["parser_type"])

    # Accept a single directory as well as a list of directories; iterating a
    # bare string would otherwise walk it character by character.
    directories = config["data_dir"]
    if isinstance(directories, (str, os.PathLike)):
        directories = [directories]
    directories = list(directories)

    # The extractor receives the configured value minus one (presumably
    # converting a 1-based surface number to a 0-based index -- confirm).
    valid_surface = int(config["valid_surface"]) - 1

    # Only one sample file is needed. Search the directories in order so an
    # empty first directory does not hide files in later ones, and sort the
    # matches because glob returns them in arbitrary order.
    sample_file = None
    for directory in directories:
        matches = sorted(glob.glob(os.path.join(directory, "*.ims")))
        if matches:
            sample_file = matches[0]
            break

    if sample_file is None:
        # Nothing was extracted, so do not ask the user to edit a file that
        # this call did not produce.
        print(
            "[warning] No .ims files found in: "
            + ", ".join(str(directory) for directory in directories)
        )
        return

    # Hand the sample file to the extractor selected above.
    extract_categories(sample_file, valid_surface)

    print("[info] Please Edit The Statistics File")

In [3]:
# Run the category-generation step against config.yaml.
generate_available_categories(config_path="config.yaml")

Saved Stats Categories in directory: /home/shehan/Documents/nih_projects/parsers/surface_track_parser/main
[info] Please Edit The Statistics File


Generate Statistics 

In [4]:
def generate_statistics(config_path: str):
    """Run the configured parser on every ``.ims`` file as parallel Ray tasks.

    Loads the YAML configuration at ``config_path``, selects the
    ``extract_and_save`` task for the configured ``parser_type``, submits one
    remote task per ``.ims`` file found in the configured data directories and
    blocks until all of them have finished.

    Args:
        config_path: Path to the YAML configuration file. The keys read here
            are ``parser_type``, ``data_dir``, ``stats_category_path`` and
            ``valid_surface``.

    Raises:
        ValueError: If ``parser_type`` is not ``"track"``, ``"surface"`` or
            ``"first"``.
    """
    config = utils.load_yaml(config_path)

    # Validate the parser type before starting Ray so that a bad config fails
    # fast without spinning up a local cluster.
    parser_type = config["parser_type"]

    if parser_type == "track":
        from track_parser import extract_and_save

    elif parser_type == "surface":
        from surface_parser import extract_and_save

    elif parser_type == "first":
        from first_surface_parser import extract_and_save

    else:
        raise ValueError("Invalid Parser Type")

    # Re-running this cell in the same kernel must not call ray.init() a
    # second time, which raises a RuntimeError; reuse the running instance.
    if not ray.is_initialized():
        ray.init()

    # Accept a single directory as well as a list of directories; iterating a
    # bare string would otherwise walk it character by character.
    directories = config["data_dir"]
    if isinstance(directories, (str, os.PathLike)):
        directories = [directories]

    # Statistics categories, read from the path named in the config.
    stats_categories = utils.read_txt(config["stats_category_path"])

    # The parser receives the configured value minus one (presumably
    # converting a 1-based surface number to a 0-based index -- confirm).
    valid_surface = int(config["valid_surface"]) - 1

    # Handles returned by the .remote() submissions, one per input file.
    pending = []

    for directory in directories:
        # glob already returns the full path of each match; sort the matches
        # so tasks are submitted in a stable order.
        for filename in sorted(glob.glob(os.path.join(directory, "*.ims"))):
            save_path = utils.get_save_filepath(parser_type, filename)

            pending.append(
                extract_and_save.remote(
                    filename, valid_surface, stats_categories, save_path
                )
            )

    print("info -- running subprocesses:")

    # Block until every submitted task has completed.
    ray.get(pending)

In [5]:
# Run the statistics-generation step against config.yaml.
generate_statistics("config.yaml")

2023-07-20 22:01:43,045	INFO worker.py:1636 -- Started a local Ray instance.


info -- running subprocesses:
[2m[36m(extract_and_save pid=39692)[0m [info] working on file ../../data/surface_track_parser_dev_data/ALN Sec2 Roi1 2x2 1h20min.ims


In [None]:
# Working

[2m[33m(raylet)[0m [2023-07-20 22:01:28,602 E 36282 36295] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-07-20_22-00-57_080349_35179 is over 95% full, available space: 74698108928; capacity: 1958874054656. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-07-20 22:01:38,614 E 36282 36295] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-07-20_22-00-57_080349_35179 is over 95% full, available space: 74698088448; capacity: 1958874054656. Object creation will fail if spilling is required.
