# Batch Process Quantification - part 12
--------------------

## OVERVIEW



## OBJECTIVE: ✅ Quantify organelle composition, morphology, distribution and contacts from multiple cells/images (batch process).
This notebook outlines the logic for batch processing the quantification of multiple images.



## IMPORTS

In [1]:
# top level imports
from pathlib import Path
import os, sys
from typing import Optional, Union, Dict, List
import itertools
import warnings

import time

import numpy as np
import pandas as pd

import napari

### import local python functions in ../infer_subc
sys.path.append(os.path.abspath((os.path.join(os.getcwd(), '..'))))

from infer_subc.core.file_io import (read_czi_image,
                                        export_inferred_organelle,
                                        import_inferred_organelle,
                                        export_tiff,
                                        list_image_files,
                                        read_tiff_image)


from infer_subc.constants import *
from infer_subc.organelles import *
from infer_subc.utils.stats import *
from infer_subc.utils.stats_helpers import *
from infer_subc.utils.stats_helpers import make_all_metrics_tables
from infer_subc.utils.stats import _assert_uint16_labels
from infer_subc.core.img import label_uint16
from infer_subc.utils.batch import explode_mask


%load_ext autoreload
%autoreload 2

_______________________

## 1. Get the paths for the images that will be analyzed

In [2]:
# root folder that holds every image type used in this notebook
data_root_path = Path(os.path.expanduser("~")).joinpath("Documents/Python_Scripts/Infer-subc")

# raw ".czi" images
raw_data_path = data_root_path.joinpath("raw")
raw_file_list = list_image_files(raw_data_path, ".czi")

# segmentation ".tiff" files that pair with the raw images
# NOTE(review): the extension is passed without a leading dot here but with one above -
# confirm list_image_files treats both forms the same way
seg_data_path = data_root_path.joinpath("out")
seg_file_list = list_image_files(seg_data_path, "tiff")

# the tables produced by this notebook go to their own "quant" folder
out_data_path = data_root_path.joinpath("quant")
if not out_data_path.exists():
    out_data_path.mkdir()
    print(f"making {out_data_path}")

raw_file_list, seg_file_list

([WindowsPath('C:/Users/Shannon/Documents/Python_Scripts/Infer-subc/raw/24hrs-Ctrl +oleicAcid 50uM_2_Unmixing.czi'),
  WindowsPath('C:/Users/Shannon/Documents/Python_Scripts/Infer-subc/raw/24hrs-Ctrl +oleicAcid 50uM_3_Unmixing.czi'),
  WindowsPath('C:/Users/Shannon/Documents/Python_Scripts/Infer-subc/raw/a24hrs-Ctrl +oleicAcid 50uM_10_Unmixing.czi'),
  WindowsPath('C:/Users/Shannon/Documents/Python_Scripts/Infer-subc/raw/a24hrs-Ctrl +oleicAcid 50uM_4_Unmixing.czi'),
  WindowsPath('C:/Users/Shannon/Documents/Python_Scripts/Infer-subc/raw/a24hrs-Ctrl_10_Unmixing.czi'),
  WindowsPath('C:/Users/Shannon/Documents/Python_Scripts/Infer-subc/raw/a24hrs-Ctrl_14_Unmixing.czi'),
  WindowsPath('C:/Users/Shannon/Documents/Python_Scripts/Infer-subc/raw/a48hrs-Ctrl + oleic acid_01_Unmixing.czi'),
  WindowsPath('C:/Users/Shannon/Documents/Python_Scripts/Infer-subc/raw/a48hrs-Ctrl + oleic acid_02_Unmixing.czi'),
  WindowsPath('C:/Users/Shannon/Documents/Python_Scripts/Infer-subc/raw/a48hrs-Ctrl + oleic

## 2. Import the matching raw and segmentation files

In [3]:
def _find_segmentation_tiff_files(prototype:Union[Path,str],
                                  name_list:List[str], 
                                  seg_path:Union[Path,str],
                                  suffix:Union[str, None]=None) -> Dict:
    """
    Find the segmentation files that match a raw image file, based on the raw image file name.

    Each expected segmentation file is named "<raw file stem><suffix><name>.tiff" and is looked up
    inside seg_path.

    Parameters:
    ---------
    prototype:Union[Path,str]
        the file path for one raw image file; this file should have matching segmentation 
        output files with the same file name root and different file name endings that match the strings 
        provided in name_list
    name_list:List[str]
        a list of file name endings that identify which segmentation each file holds
    seg_path:Union[Path,str]
        the path to the folder holding the matching segmentation files
    suffix:Union[str, None]=None
        any additional text that exists between the file root and the name_list ending; None (the default)
        means there is no additional text
        Ex) Prototype = "C:/Users/Shannon/Documents/Python_Scripts/Infer-subc/raw/a48hrs-Ctrl_9_Unmixing.czi"
            Name of organelle file = a48hrs-Ctrl_9_Unmixing-20230426_test_cell.tiff
            result of .stem = "a48hrs-Ctrl_9_Unmixing"
            organelle/cell area type = "cell"
            suffix = "-20230426_test_"
    
    Returns:
    ----------
    a dictionary of file paths for each image type: "raw" maps to the prototype and each entry of name_list
    maps to its segmentation file, or to None when that file does not exist.
    An empty dictionary is returned when the prototype does not exist, and a dictionary holding only "raw"
    is returned when seg_path is not an existing folder.

    """
    # raw
    prototype = Path(prototype)
    if not prototype.exists():
        print(f"bad prototype. please choose an existing `raw` file as prototype")
        return dict()

    out_files = {"raw":prototype}

    # segmentation folder
    seg_path = Path(seg_path)
    if not seg_path.is_dir():
        print(f"bad path argument. please choose an existing path containing organelle segmentations")
        return out_files

    # an omitted suffix must add nothing to the file name; formatting None directly
    # would insert the literal text "None" and no file would ever match
    infix = "" if suffix is None else suffix

    # segmentations
    for org_n in name_list:
        org_name = seg_path / f"{prototype.stem}{infix}{org_n}.tiff"
        if org_name.exists(): 
            out_files[org_n] = org_name
        else: 
            # the search carries on with the remaining names; the missing one is recorded as None
            print(f"{org_n} .tiff file not found in {seg_path} returning")
            out_files[org_n] = None
    
    return out_files 


In [4]:
# use one raw image as the prototype whose segmentation outputs will be looked up
prototype = raw_file_list[2]

# file name endings of the segmentation files to collect
organelles = [
    "lyso",
    "mito",
    "golgi",
    "perox",
    "ER",
    "LD",
    "masks",
]  # "cell", "cyto", "nuc"

# text that sits between the raw file stem and the ending in each segmentation file name
test_suffix = "-20230426_test_"

filez = _find_segmentation_tiff_files(prototype=prototype,
                                      name_list=organelles,
                                      seg_path=seg_data_path,
                                      suffix=test_suffix)

filez

{'raw': WindowsPath('C:/Users/Shannon/Documents/Python_Scripts/Infer-subc/raw/a24hrs-Ctrl +oleicAcid 50uM_10_Unmixing.czi'),
 'lyso': WindowsPath('C:/Users/Shannon/Documents/Python_Scripts/Infer-subc/out/a24hrs-Ctrl +oleicAcid 50uM_10_Unmixing-20230426_test_lyso.tiff'),
 'mito': WindowsPath('C:/Users/Shannon/Documents/Python_Scripts/Infer-subc/out/a24hrs-Ctrl +oleicAcid 50uM_10_Unmixing-20230426_test_mito.tiff'),
 'golgi': WindowsPath('C:/Users/Shannon/Documents/Python_Scripts/Infer-subc/out/a24hrs-Ctrl +oleicAcid 50uM_10_Unmixing-20230426_test_golgi.tiff'),
 'perox': WindowsPath('C:/Users/Shannon/Documents/Python_Scripts/Infer-subc/out/a24hrs-Ctrl +oleicAcid 50uM_10_Unmixing-20230426_test_perox.tiff'),
 'ER': WindowsPath('C:/Users/Shannon/Documents/Python_Scripts/Infer-subc/out/a24hrs-Ctrl +oleicAcid 50uM_10_Unmixing-20230426_test_ER.tiff'),
 'LD': WindowsPath('C:/Users/Shannon/Documents/Python_Scripts/Infer-subc/out/a24hrs-Ctrl +oleicAcid 50uM_10_Unmixing-20230426_test_LD.tiff'),
 'm

In [5]:
# packaged counterpart of the prototype function defined above
# NOTE(review): this import could live with the other imports at the top of the notebook
from infer_subc.utils.batch import find_segmentation_tiff_files

# run the packaged function on the same inputs as the prototype call
filez_final = find_segmentation_tiff_files(prototype, organelles, seg_data_path, test_suffix)

# True when both implementations return the same dictionary of file paths
filez==filez_final

True

## 3. Batch process the data from multiple images at one time

In [6]:
# names of organelles we have
organelle_names = ["lyso", "mito","golgi","perox","ER","LD"]
# regions are listed in the same order as in the batch configuration further down;
# the batch function indexes the "masks" file by position in this list
# NOTE(review): confirm this order against the channel order of the "masks" segmentation file
region_names = ["nuc", "cell", "cyto"]

# raw-image channel index for each organelle, in the same order as organelle_names
organelle_channels = [LYSO_CH,MITO_CH,GOLGI_CH,PEROX_CH,ER_CH,LD_CH]

In [12]:
# for a list of "prefixes"  collect stats + cross stats masked by cytosol (including nuclei masked by cellmask)

# NOTE: the convex hull regionprops error is a known issue that occurs when the objects being measured have too few voxels. 
# Here's the github reference: https://github.com/scikit-image/scikit-image/issues/5363

# TODO: it may be beneficial in the future to make the analysis possible without the analysis of the regions, but for now, it can stay

# these filters are installed globally for the rest of the session, not only while the function below runs
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", RuntimeWarning)

def _batch_process_quantification(out_file_name: str,
                                  seg_path: Union[Path,str],
                                  out_path: Union[Path, str], 
                                  raw_path: Union[Path,str], 
                                  raw_file_type: str,
                                  organelle_names: List[str],
                                  organelle_channels: List[int],
                                  region_names: List[str],
                                  masks_file_name: str,
                                  mask: str,
                                  dist_centering_obj:str, 
                                  dist_num_bins: int,
                                  dist_center_on: bool=False,
                                  dist_keep_center_as_bin: bool=True,
                                  dist_zernike_degrees: Union[int, None]=None,
                                  include_contact_dist: bool = True,
                                  scale:bool=True,
                                  seg_suffix:Union[str, None]=None) -> int :
    """  
    batch process segmentation quantification (morphology, distribution, contacts); this function is currently optimized to process images from one file folder per image type (e.g., raw, segmentation)
    the output csv files are saved to the indicated out_path folder

    Images whose segmentation files cannot all be found are skipped (a message is printed) so that one
    incomplete image does not discard the tables already collected for the others.

    Parameters:
    ----------
    out_file_name: str
        the prefix to use when naming the output datatables
    seg_path: Union[Path,str]
        Path or str to the folder that contains the segmentation tiff files
    out_path: Union[Path, str]
        Path or str to the folder that the output datatables will be saved to; it is created (including any
        missing parent folders) when it does not exist
    raw_path: Union[Path,str]
        Path or str to the folder that contains the raw image files
    raw_file_type: str
        the file type of the raw data; ex - ".tiff", ".czi"
    organelle_names: List[str]
        a list of all organelle names that will be analyzed; the names should be the same as the suffix used to name each of the tiff segmentation files
        Note: the intensity measurements collected per region (from get_region_morphology_3D function) will only be from channels associated to these organelles 
    organelle_channels: List[int]
        a list of channel indices associated to respective organelle staining in the raw image; the indices should be listed in the same order in which the respective segmentation name is listed in organelle_names
    region_names: List[str]
        a list of regions, or masks, to measure; the order should correlate to the order of the channels in the "masks" output segmentation file
    masks_file_name: str
        the suffix of the "masks" segmentation file; ex- "masks_B", "masks", etc.
        this function currently does not accept individual region segmentations 
    mask: str
        the name of the region to use as the mask when measuring the organelles; this should be one of the names listed in regions list; usually this will be the "cell" mask
    dist_centering_obj:str
        the name of the region or object to use as the centering object in the get_XY_distribution function
    dist_num_bins: int
        the number of bins for the get_XY_distribution function
    dist_center_on: bool=False,
        for get_XY_distribution:
        True = distribute the bins from the center of the centering object
        False = distribute the bins from the edge of the centering object
    dist_keep_center_as_bin: bool=True
        for get_XY_distribution:
        True = include the centering object area when creating the bins
        False = do not include the centering object area when creating the bins
    dist_zernike_degrees: Union[int, None]=None
        for get_XY_distribution:
        the number of zernike degrees to include for the zernike shape descriptors; if None, the zernike measurements will not 
        be included in the output
    include_contact_dist:bool=True
        whether to include the distribution of contact sites in get_contact_metrics_3d(); True = include contact distribution
    scale:bool=True
        True = pass the 'scale' entry of the raw image metadata on to the measurements
        False = pass no scale (None) to the measurements
    seg_suffix:Union[str, None]=None
        any additional text that is included in the segmentation tiff files between the file stem and the segmentation suffix


    Returns:
    ----------
    count: int
        the number of images processed (skipped images are not counted); 0 when nothing was processed, in
        which case no csv files are written
        
    """
    start = time.time()

    raw_path = Path(raw_path)
    seg_path = Path(seg_path)
    out_path = Path(out_path)

    if not out_path.exists():
        # parents=True so that a nested output folder can be created in one go
        out_path.mkdir(parents=True, exist_ok=True)
        print(f"making {out_path}")

    # reading list of files from the raw path
    img_file_list = list_image_files(raw_path, raw_file_type)

    # list of segmentation files to collect
    segs_to_collect = organelle_names + [masks_file_name]

    # containers to collect data tables
    org_tabs = []
    contact_tabs = []
    dist_tabs = []
    region_tabs = []

    count = 0
    for img_f in img_file_list:
        filez = find_segmentation_tiff_files(img_f, segs_to_collect, seg_path, seg_suffix)

        # an absent or None entry means the segmentation file was not found; reading it would fail
        missing = [name for name in segs_to_collect if filez.get(name) is None]
        if missing:
            print(f"Skipping {img_f}: missing segmentation file(s) for {missing}.")
            continue

        # read in raw file and metadata
        # NOTE(review): the raw image is always read with read_czi_image, whatever raw_file_type is -
        # confirm that other raw file types are supported by this reader
        img_data, meta_dict = read_czi_image(filez["raw"])

        # create intensities from raw file as list based on the channel order provided
        intensities = [img_data[ch] for ch in organelle_channels]

        # define the scale
        scale_tup = meta_dict['scale'] if scale else None

        # load regions as a list based on order in list (should match order in "masks" file)
        masks = read_tiff_image(filez[masks_file_name]) 
        regions = [masks[idx] for idx in range(len(region_names))] #TODO: add in option for multiple mask files

        # store organelle images as list
        organelles = [read_tiff_image(filez[org]) for org in organelle_names]

        org_metrics, contact_metrics, dist_metrics, region_metrics = make_all_metrics_tables(source_file=img_f,
                                                                                             list_obj_names=organelle_names,
                                                                                             list_obj_segs=organelles,
                                                                                             list_intensity_img=intensities, 
                                                                                             list_region_names=region_names,
                                                                                             list_region_segs=regions, 
                                                                                             mask=mask,
                                                                                             dist_centering_obj=dist_centering_obj,
                                                                                             dist_num_bins=dist_num_bins,
                                                                                             dist_center_on=dist_center_on,
                                                                                             dist_keep_center_as_bin=dist_keep_center_as_bin,
                                                                                             dist_zernike_degrees=dist_zernike_degrees,
                                                                                             scale=scale_tup,
                                                                                             include_contact_dist=include_contact_dist)

        org_tabs.append(org_metrics)
        contact_tabs.append(contact_metrics)
        dist_tabs.append(dist_metrics)
        region_tabs.append(region_metrics)

        count += 1
        end2 = time.time()
        print(f"Completed processing for {count} images in {(end2-start)/60} mins.")

    # pd.concat raises on an empty list, so stop here when there is nothing to save
    if count == 0:
        print(f"No images were quantified; no files were saved to '{out_path}'.")
        return count

    final_org = pd.concat(org_tabs, ignore_index=True)
    final_contact = pd.concat(contact_tabs, ignore_index=True)
    final_dist = pd.concat(dist_tabs, ignore_index=True)
    final_region = pd.concat(region_tabs, ignore_index=True)

    org_csv_path = out_path / f"{out_file_name}_organelles.csv"
    final_org.to_csv(org_csv_path)

    contact_csv_path = out_path / f"{out_file_name}_contacts.csv"
    final_contact.to_csv(contact_csv_path)

    dist_csv_path = out_path / f"{out_file_name}_distributions.csv"
    final_dist.to_csv(dist_csv_path)

    region_csv_path = out_path / f"{out_file_name}_regions.csv"
    final_region.to_csv(region_csv_path)

    end = time.time()
    print(f"Quantification for {count} files is COMPLETE! Files saved to '{out_path}'.")
    print(f"It took {(end - start)/60} minutes to quantify these files.")
    return count

In [13]:
# --- where things live ---
# root folder for all the imaging data
data_root_path = Path(os.path.expanduser("~")).joinpath("Documents/Python_Scripts/Infer-subc")
raw_data_path = data_root_path.joinpath("raw")      # linearly unmixed ".czi" files
seg_data_path = data_root_path.joinpath("out")      # segmentation ".tiff" files
out_data_path = data_root_path.joinpath("quant")    # stats tables are saved here

# text between the raw file stem and the segmentation name in each ".tiff" file name
seg_suffix = "-20230426_test_"

# --- what to quantify ---
organelle_names = ["lyso", "mito", "golgi", "perox", "ER", "LD"]
region_names = ["nuc", "cell", "cyto"]

# one raw-image channel per organelle, in the same order as organelle_names
organelle_channels = [LYSO_CH, MITO_CH, GOLGI_CH, PEROX_CH, ER_CH, LD_CH]

In [14]:
# run the quantification over every raw image in raw_data_path
n_files = _batch_process_quantification(
    # output naming and folders
    out_file_name="20231101_testa_",
    seg_path=seg_data_path,
    out_path=out_data_path,
    raw_path=raw_data_path,
    raw_file_type=".czi",
    # what to measure
    organelle_names=organelle_names,
    organelle_channels=organelle_channels,
    region_names=region_names,
    masks_file_name="masks",
    mask="cell",
    # distribution settings
    dist_centering_obj="nuc",
    dist_num_bins=5,
    dist_center_on=False,
    dist_keep_center_as_bin=True,
    dist_zernike_degrees=9,
    # contacts, scaling and file naming
    include_contact_dist=True,
    scale=True,
    seg_suffix=seg_suffix,
)

It took 1.3795927206675211 minutes to quantify one image.
Completed processing for 1 images in 1.3852489312489829 mins.
It took 1.5802902698516845 minutes to quantify one image.
Completed processing for 2 images in 2.9696336507797243 mins.
It took 2.0810999711354574 minutes to quantify one image.
Completed processing for 3 images in 5.056293797492981 mins.
It took 2.9021807511647544 minutes to quantify one image.
Completed processing for 4 images in 7.96484758456548 mins.
It took 1.4628190676371255 minutes to quantify one image.
Completed processing for 5 images in 9.431865572929382 mins.
It took 1.8559329668680826 minutes to quantify one image.
Completed processing for 6 images in 11.291857075691222 mins.
It took 1.075549558798472 minutes to quantify one image.
Completed processing for 7 images in 12.372037422657012 mins.
It took 2.0390569965044656 minutes to quantify one image.
Completed processing for 8 images in 14.417853037516275 mins.
It took 2.74224986632665 minutes to quantify 