In [1]:
import os
import ray
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from imaris.imaris import ImarisDataObject
from parsers.surface_parser import SurfaceParserDistributed
from typing import List, Dict

In [2]:
# Location of the Imaris (.ims) file that the cells below parse and validate.
# NOTE(review): this is a hard-coded network-share path; consider moving it to a
# single configuration cell or reading it from an environment variable.
data_path = '//niaid-fas1/Bioimaging/Users/rogandc/04052023 SHIV aLN iLN 6-color/ALN P3 2x2 Roi1 1h20min.ims'
# Shown as the cell result: True only if the path points at an existing file.
os.path.isfile(data_path)

True

In [3]:
# Echo the configured path as the cell output.
data_path

'//niaid-fas1/Bioimaging/Users/rogandc/04052023 SHIV aLN iLN 6-color/ALN P3 2x2 Roi1 1h20min.ims'

In [4]:
def get_ims_obj(data_path):
    """Open the Imaris file at ``data_path`` and return its data object.

    Args:
        data_path: path handed unchanged to ``ImarisDataObject``.

    Returns:
        The ``ImarisDataObject`` built for ``data_path``.
    """
    ims_obj = ImarisDataObject(data_path)
    # TODO: assert ims_obj type
    return ims_obj


def get_final_df(data_path, init_surface_id=0):
    """Return the ``"stats_df"`` entry of the parser's inspect result.

    Args:
        data_path: path handed to ``SurfaceParserDistributed``.
        init_surface_id: surface id the parser actor is constructed with.
            The inspect call itself always asks for surface 0.

    Returns:
        Whatever the remote ``inspect`` call stores under ``"stats_df"``.
    """
    parser = SurfaceParserDistributed.remote(data_path, surface_id=init_surface_id)
    final_df = parser.inspect.remote(surface_id=0)
    # block until the remote call finishes, then keep only the stats table
    final_df = ray.get(final_df)["stats_df"]
    # TODO: assert data type
    return final_df


def get_stats_dict_reversed(data_path, init_surface_id=0):
    """Return the surface stats info mapping with keys and values swapped.

    Args:
        data_path: path handed to ``SurfaceParserDistributed``.
        init_surface_id: surface id the parser actor is constructed with.
            The stats info query itself always asks for surface 0.

    Returns:
        dict: ``{value: key}`` for every ``key: value`` pair returned by the
        remote ``get_surface_stats_info`` call. If several keys share a value,
        the last one seen wins.
    """
    parser = SurfaceParserDistributed.remote(data_path, surface_id=init_surface_id)
    stats_dict = parser.get_surface_stats_info.remote(surface_id=0)
    stats_dict = ray.get(stats_dict)
    stats_dict = {v: k for k, v in stats_dict.items()}
    # TODO: assert data type
    return stats_dict


def verify_stats(data_path, init_surface_id=0):
    """Gather the generated stats, the stats lookup and the raw stats for a file.

    NOTE(review): every value collected here is a local that is dropped when the
    function ends; nothing is compared and the function implicitly returns None.

    Args:
        data_path: path handed to both the parser actor and ``ImarisDataObject``.
        init_surface_id: surface id the parser actor is constructed with.
            Both remote queries always ask for surface 0.
    """
    actor = SurfaceParserDistributed.remote(data_path, surface_id=init_surface_id)

    # generated side: fetch the inspect result and transpose its stats table
    inspect_result = ray.get(actor.inspect.remote(surface_id=0))
    generated_df = inspect_result["stats_df"].T

    # lookup: swap keys and values of the stats info mapping
    stats_info = ray.get(actor.get_surface_stats_info.remote(surface_id=0))
    stats_dict = {value: key for key, value in stats_info.items()}

    # raw side: query the Imaris data object for the inspected surface name
    imaris_file = ImarisDataObject(data_path)
    raw_stats = imaris_file.get_stats_values(inspect_result["surface_name"])

In [5]:
# Create two parser actor handles over the same file, both constructed for surface 0.
# The log line emitted by this cell shows a local Ray instance being started here.
# NOTE(review): presumably two actors exist so the inspect and stats-info queries
# can be served independently of each other; confirm.
parser1 = SurfaceParserDistributed.remote(data_path, surface_id=0)
parser2 = SurfaceParserDistributed.remote(data_path, surface_id=0)

2023-11-28 20:21:20,955	INFO worker.py:1673 -- Started a local Ray instance.


In [6]:
# Submit both remote queries before blocking so the two actors can work at the
# same time; ray.get on a list returns the results in the same order.
inspect_ref = parser1.inspect.remote(surface_id=0)
stats_ref = parser2.get_surface_stats_info.remote(surface_id=0)
final_df, stats_dict = ray.get([inspect_ref, stats_ref])

# get generated df
generated_df = final_df["stats_df"]

# get original data
ims_obj = ImarisDataObject(data_path)
raw_stats = ims_obj.get_stats_values(final_df["surface_name"])

In [7]:
def validate(
    gen_stats: pd.DataFrame,
    raw_obj_stats: pd.DataFrame,
    stats_dict: Dict,
) -> None:
    """
    Compare the generated statistics of one object against its raw values.

    ``gen_stats`` holds a single row (one column per statistic plus
    ``Object_ID``); ``raw_obj_stats`` holds one row per statistic for the same
    object, obtained directly from the imaris file. Every statistic must be
    present on both sides and carry the same value. Two missing values (NaN)
    are treated as a match.

    Args:
        gen_stats (pd.DataFrame): dataframe for all the stats for a single object id
        raw_obj_stats (pd.DataFrame): raw stats values for a single object id
            (columns ``ID_Object``, ``ID_StatisticsType`` and ``Value``)
        stats_dict (Dict): dictionary that maps Stats ID Number to Stat Name

    Raises:
        AssertionError: the object ids differ, or the two sides do not cover
            the same set of statistic names.
        ValueError: a statistic value differs between the two sides, a
            statistic id occurs more than once in ``raw_obj_stats``, or either
            frame does not describe exactly one object.
        KeyError: a raw statistic id is missing from ``stats_dict``.
    """
    raw_obj_id = raw_obj_stats["ID_Object"].unique().item()
    gen_obj_id = gen_stats["Object_ID"].item()
    assert raw_obj_id == gen_obj_id, "Object IDs Do NOT Match"

    # key = stats name, value = generated stats value for that stats name
    gen_values = {
        stats_name: gen_stats[stats_name].item()
        for stats_name in gen_stats.columns.to_list()
    }
    gen_values.pop("Object_ID")

    # key = stats name, value = raw stats value; built in a single pass over
    # the rows instead of re-filtering the frame once per statistic
    raw_values = {}
    seen_stat_ids = set()
    stat_ids = raw_obj_stats["ID_StatisticsType"].to_list()
    stat_values = raw_obj_stats["Value"].to_list()
    for stat_id, stat_value in zip(stat_ids, stat_values):
        if stat_id in seen_stat_ids:
            raise ValueError(
                f"statistic id {stat_id!r} occurs more than once "
                f"for object {raw_obj_id}"
            )
        seen_stat_ids.add(stat_id)
        raw_values[stats_dict[stat_id]] = stat_value

    # double check both sides cover the same stat names; column order and raw
    # row order are unrelated, so the comparison ignores ordering
    unmatched = set(gen_values) ^ set(raw_values)
    assert not unmatched, (
        f"stat names differ for object {raw_obj_id}: "
        f"{sorted(map(str, unmatched))}"
    )

    for key, gen_value in gen_values.items():
        raw_value = raw_values[key]
        if raw_value == gen_value:
            continue
        # NaN never compares equal to itself; missing on both sides is a match
        if pd.isna(raw_value) and pd.isna(gen_value):
            continue
        raise ValueError(
            f"object {raw_obj_id}: stat {key!r} differs "
            f"(raw={raw_value!r}, generated={gen_value!r})"
        )

    print(f"[info] -- confirmed all values for object {raw_obj_id}")

In [8]:
import concurrent.futures


def run_stats_validation(
    generated_df: pd.DataFrame,
    raw_stats: pd.DataFrame,
    stats_dict: Dict,
) -> None:
    """Validate every object's generated statistics against the raw values.

    For each entry of ``generated_df["Object_ID"]`` the matching rows of both
    frames are handed to ``validate``. The first failure reported by
    ``validate`` is re-raised here once the submitted checks have finished.

    Args:
        generated_df (pd.DataFrame): generated stats, one row per object with
            an ``Object_ID`` column.
        raw_stats (pd.DataFrame): raw stats values with an ``ID_Object`` column.
        stats_dict (Dict): dictionary that maps Stats ID Number to Stat Name.

    Raises:
        AssertionError, ValueError, KeyError: propagated from ``validate``.
    """
    object_ids = generated_df["Object_ID"].to_list()

    # Split each frame once by object id instead of scanning the whole frame
    # again for every object.
    gen_by_id = {
        key: rows for key, rows in generated_df.groupby("Object_ID", sort=False)
    }
    raw_by_id = {
        key: rows for key, rows in raw_stats.groupby("ID_Object", sort=False)
    }
    # An id without rows still gets an (empty) frame so validate reports it.
    no_gen_rows = generated_df.iloc[0:0]
    no_raw_rows = raw_stats.iloc[0:0]

    # Threads rather than processes: worker processes must be able to import
    # the submitted function, which is not possible for a function defined in
    # an interactive session when processes are spawned.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(
                validate,
                gen_by_id.get(object_id, no_gen_rows),
                raw_by_id.get(object_id, no_raw_rows),
                stats_dict,
            )
            for object_id in object_ids
        ]
        # Progress follows completed checks; result() re-raises any failure.
        for future in tqdm(futures, total=len(futures)):
            future.result()

In [9]:
# Run the per-object comparison over everything gathered above.
# NOTE(review): only element 0 of the fetched stats info is passed as the
# ID -> name lookup; presumably that is the entry for surface 0. Confirm.
run_stats_validation(generated_df, raw_stats, stats_dict[0])

0it [00:00, ?it/s]