# Batch process segmentation made with Ilastik

This notebook performs the same steps as notebook 3, but in a more streamlined way, allowing for batch processing without visual inspection.

The processing functions have been moved to the process_colonies.py script.

In [1]:
#main data analysis packages
import numpy as np
import pandas as pd
import dask.array as da

#path handling
import pathlib
import h5py

#dask cache
from dask.cache import Cache
cache = Cache(4e9)  # Leverage 4 gigabytes of memory
cache.register()    # Turn cache on globally

import process_colonies as pc

## Set Paths and Settings

In [3]:
#NOTE(review): the absolute paths below are specific to one machine; adapt them before running elsewhere

#set path to registered file
path_reg_im = pathlib.Path("/Users/simonvanvliet/TempData")

#set path to Ilastik output file
path_seg_im = pathlib.Path("/Users/simonvanvliet/TempData")

#set path to temporary store label images (use local machine folder for speed)
temp_data_path = pathlib.Path.home() / 'TempData'
temp_data_path.mkdir(parents=True, exist_ok=True)

#set path to output csv files
path_data_files = pathlib.Path('/Users/simonvanvliet/TempData/DataFiles/')
#parents=True so the folder is also created when its parent folder does not exist yet
#(without it mkdir raises FileNotFoundError in that case)
path_data_files.mkdir(parents=True, exist_ok=True)

#set experiment names; they are used to build the '<exp_name>-agar-pad-processed' folder
#and the 'agarpad_<exp_name>.csv' metadata file names
exp_name_list = ["20230411","20230427"]

#specify properties to extract (passed on to the processing code via settings['prop_list'])
prop_list = ['label',
            'area', 'centroid',
            'axis_major_length', 'axis_minor_length']

#specify processing settings
settings = {
    'calc_edge_dist'    : True, #set to true to calculate distance between colony edges, more accurate than center to center distance, but very slow
    'prop_list'         : prop_list,
    #specify the order of the strains in the Ilastik layers
    'idx_SA1'   : 1, #SA1 is GFP
    'idx_SA2'   : 0, #SA2 is RFP
    'idx_BG'    : 2,
    'idx_PA'    : 3,
    #sigma for gaussian filter
    #NOTE(review): 'sigma' used to be listed twice in this dict (1 under the pseudomonas
    #parameters, 2 under the staph parameters). A dict literal keeps only the last value of a
    #repeated key, so 2 was the value actually in effect for both species. It is kept at 2 here
    #so results do not change; species-specific values would need distinct keys that
    #process_colonies reads - TODO confirm the intended pseudomonas value.
    'sigma'             : 2,
    #specify the segmentation processing parameters for pseudomonas
    'threshold_PA'      : 0.5, # threshold for segmentation
    'closing_radius_PA' : 5, # radius for closing operation
    'min_cell_area_PA'  : 50, # minimum area for a cell to be considered
    'max_hole_area_PA'  : 5000, # maximum area for a hole to be filled
    #specify the segmentation processing parameters for staph
    'threshold_SA'      : 0.5, # threshold for segmentation
    'closing_radius_SA' : 5, # radius for closing operation
    'min_cell_area_SA'  : 50, # minimum area for a cell to be considered
    'max_hole_area_SA'  : 1000, # maximum area for a hole to be filled
    #store path metadata
    'temp_path'         : temp_data_path,
    'path_seg_im_root'  : path_seg_im,
    'path_data_files'   : path_data_files,
    'exp_name_list'     : exp_name_list
}

## Loop over experiments and positions
First, make sure that all positions are found.

In [3]:
#print the name of every Ilastik probability file found for each experiment
for exp_name in exp_name_list:
    seg_path = settings['path_seg_im_root'] / f'{exp_name}-agar-pad-processed'
    h5_files = sorted(seg_path.glob('*_Probabilities.h5'))
    pos_list = [h5_file.name for h5_file in h5_files]
    for pos in pos_list:
        print(pos)

20230411_reg_p000_Probabilities.h5
20230411_reg_p001_Probabilities.h5
20230411_reg_p002_Probabilities.h5
20230411_reg_p003_Probabilities.h5
20230411_reg_p004_Probabilities.h5
20230411_reg_p005_Probabilities.h5
20230411_reg_p006_Probabilities.h5
20230411_reg_p007_Probabilities.h5
20230411_reg_p008_Probabilities.h5
20230411_reg_p009_Probabilities.h5
20230411_reg_p010_Probabilities.h5
20230411_reg_p011_Probabilities.h5
20230411_reg_p012_Probabilities.h5
20230411_reg_p013_Probabilities.h5
20230411_reg_p014_Probabilities.h5
20230411_reg_p015_Probabilities.h5
20230411_reg_p016_Probabilities.h5
20230411_reg_p017_Probabilities.h5
20230411_reg_p018_Probabilities.h5
20230411_reg_p019_Probabilities.h5
20230411_reg_p020_Probabilities.h5
20230411_reg_p021_Probabilities.h5
20230411_reg_p022_Probabilities.h5
20230411_reg_p023_Probabilities.h5
20230427_reg_p000_Probabilities.h5
20230427_reg_p001_Probabilities.h5
20230427_reg_p002_Probabilities.h5
20230427_reg_p003_Probabilities.h5
20230427_reg_p004_Pr

Now we loop over all positions; this will take a while.

In [5]:
for exp_name in exp_name_list:
    #read the per-experiment metadata table; it is handed to the processing code through settings
    metadata_path = settings['path_data_files'] / f'agarpad_{exp_name}.csv'
    pos_metadata = pd.read_csv(metadata_path, index_col=0)

    #store the experiment specific entries in settings
    settings.update({
        'pos_metadata': pos_metadata,
        'exp_name': exp_name,
        'path_seg_im': settings['path_seg_im_root'] / f'{exp_name}-agar-pad-processed',
    })

    #folder holding one csv file per position of this experiment
    csv_dir_pos = settings['path_data_files'] / exp_name
    csv_dir_pos.mkdir(exist_ok=True)

    #get positions
    pos_list = [f.name for f in sorted(settings['path_seg_im'].glob('*_Probabilities.h5'))]

    #segment track and process all positions
    for pos in pos_list:
        #file names look like <name>_p<index>_Probabilities.h5: strip the suffix, then take the index after the last '_p'
        file_name = pos.partition('_Prob')[0]
        pos_idx = int(file_name.rpartition('_p')[2])

        csv_name = csv_dir_pos / f"{exp_name}_pos{pos_idx:03d}.csv"

        #skip positions whose csv file already exists (presumably written by pc.process_pos - confirm)
        if csv_name.exists():
            continue

        print(f"   Processing position nr {pos_idx} name {file_name}")
        #segment colony and store label image
        df = pc.process_pos(pos_idx, settings, store_2_disk=True, clean_disk=True, max_frame_e2e=20, ignore_edge_e2e=300)

   Processing position nr 0 name 20230504_reg_p000
   Processing position nr 1 name 20230504_reg_p001
   Processing position nr 2 name 20230504_reg_p002
   Processing position nr 3 name 20230504_reg_p003
   Processing position nr 4 name 20230504_reg_p004
   Processing position nr 5 name 20230504_reg_p005
   Processing position nr 6 name 20230504_reg_p006
   Processing position nr 7 name 20230504_reg_p007
   Processing position nr 8 name 20230504_reg_p008
   Processing position nr 9 name 20230504_reg_p009
   Processing position nr 10 name 20230504_reg_p010
   Processing position nr 11 name 20230504_reg_p011
   Processing position nr 12 name 20230504_reg_p012
   Processing position nr 13 name 20230504_reg_p013
   Processing position nr 14 name 20230504_reg_p014
   Processing position nr 15 name 20230504_reg_p015
   Processing position nr 16 name 20230504_reg_p016
   Processing position nr 17 name 20230504_reg_p017
   Processing position nr 18 name 20230504_reg_p018
   Processing position

In [4]:
#combine the per-position csv files of all experiments into a single table
df_all = []
for exp_name in exp_name_list:

    csv_dir_pos = settings['path_data_files'] / exp_name
    df_exp = [pd.read_csv(pos, index_col=0) for pos in sorted(csv_dir_pos.glob('*_pos*.csv'))]

    #pd.concat raises "No objects to concatenate" on an empty list, so skip experiments without csv files
    if not df_exp:
        print(f"   No position csv files found in {csv_dir_pos}, skipping {exp_name}")
        continue

    df_all.append(pd.concat(df_exp, ignore_index=True))

if not df_all:
    raise ValueError(f"No position csv files found for any experiment in {exp_name_list}")

#one row index running over all experiments
df_combined = pd.concat(df_all, ignore_index=True)
csv_name = settings['path_data_files'] / "all_data.csv"
df_combined.to_csv(csv_name)