# Create training Data

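In this notebook we create a training dataset by sampling individual frames from a large collection of TIFF image stacks. The sampled frames are saved both as a single HDF5 file and as one TIFF stack per channel.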

You will need to adapt this notebook (folder paths, channel layout, and which frames are sampled) to the way your own data is organized.

In [15]:
#path handling
import pathlib

#file handling
import h5py

import numpy as np

#out of memory computation
from dask_image.imread import imread
import dask.array as da

import tifffile

from glob import glob
import os


from dask.cache import Cache
# Opportunistic dask cache so repeated reads of the same data can be reused
cache_bytes = 2e9            # memory budget for the cache: two gigabytes
cache = Cache(cache_bytes)
cache.register()             # switch the cache on globally

In [26]:
# Specify the folder with the experimental data. We assume it contains a set of
# subfolders, each holding a set of tiff stacks.
# Each subfolder is assumed to correspond to a different strain/condition.

# set root dir
root = pathlib.Path('/Volumes/RG-UJ01-Data01/Andreas/pQFmcs-D11-scarREF datasets for Simon/CauloMutants_MoviesCropped/')

# find subfolders; glob also matches plain files, so keep directories only
folder_names = [f.name for f in sorted(root.glob('AKS*')) if f.is_dir()]
if not folder_names:
    # fail here with a clear message instead of an IndexError on folder_names[0] below
    raise FileNotFoundError("no 'AKS*' subfolders found in {}".format(root))
print(folder_names)

# show images in first subfolder
im_names = [f.name for f in sorted((root / folder_names[0]).glob('*.tif*'))]
print(im_names)

# set folder where to store output files (created if it does not exist yet,
# so that writing the output files later on does not fail on a missing directory)
process_dir = pathlib.Path('/Users/simonvanvliet/Andreas')
process_dir.mkdir(parents=True, exist_ok=True)

['AKS1088', 'AKS1089', 'AKS1090', 'AKS1091', 'AKS1092', 'AKS1093', 'AKS1094', 'AKS1095', 'AKS1096', 'AKS1097', 'AKS1098', 'AKS1099', 'AKS1101', 'AKS1102', 'AKS1103', 'AKS1104', 'AKS1105', 'AKS1106']
['20220311_AKS1088-1093_dense_TL01_01_R3D-1.tif', '20220311_AKS1088-1093_dense_TL01_02_R3D-1.tif', '20220311_AKS1088-1093_dense_TL01_03_R3D-1.tif', '20220311_AKS1088-1093_dense_TL01_04_R3D-1.tif', '20220311_AKS1088-1093_dense_TL01_05_R3D-1.tif']


In [30]:
# set number of color channels in image
n_channel = 3

# position (within one frame) of the two channels we extract
phase_channel = 0  # phase contrast channel
fluo_channel = 2   # fluorescence channel of constitutive marker

# set the sample interval, in each subfolder we will sample image stacks at this interval
# (e.g. with sample_int=3 we will sample image 0, 3, 6, etc)
# adjust this number to have ~2 images per subfolder
sample_int = 3

# here we set which frames we sample, adapt to your own images:
# we take the first frame plus the frames this many positions before the end of the movie
end_offsets = (31, 21, 11)


def pick_frames(n_frames, offsets=(31, 21, 11)):
    """Return the frame numbers to sample from a movie with `n_frames` frames.

    Frame 0 is always included, followed by `n_frames - offset` for each entry
    of `offsets`. Positions that would fall before the start of the movie are
    clamped to frame 0 (a negative index would silently wrap around to the end),
    and duplicates are dropped so that no frame is sampled twice.
    """
    picked = []
    for fr in [0] + [max(n_frames - off, 0) for off in offsets]:
        if fr not in picked:
            picked.append(fr)
    return picked


# we will extract two channels here: phase contrast + fluorescence
ch0 = []  # phase contrast channel
ch1 = []  # fluorescence channel of constitutive marker

for folder in folder_names:  # loop subfolders
    stack_paths = sorted((root / folder).glob('*.tif*'))

    for im_path in stack_paths[::sample_int]:  # loop every sample_int-th image stack
        im_stack = imread(im_path)

        # the planes of a stack are indexed as frame * n_channel + channel;
        # integer division keeps the frame count (and all indices below) an int
        nfr = im_stack.shape[0] // n_channel
        if nfr == 0:
            print('skipping {}: fewer than {} planes'.format(im_path.name, n_channel))
            continue

        for fr in pick_frames(nfr, end_offsets):
            first_plane = fr * n_channel
            ch0.append(np.squeeze(im_stack[first_plane + phase_channel, :, :]))
            ch1.append(np.squeeze(im_stack[first_plane + fluo_channel, :, :]))

if not ch0:
    # np.stack on an empty list fails with a much less helpful message
    raise ValueError('no frames were sampled; check root, folder_names and sample_int')

# stack samples along a new first axis, then combine both channels:
# mc_stack is indexed as (channel, sample, y, x)
ch0_stack = np.stack(ch0, axis=0)
ch1_stack = np.stack(ch1, axis=0)
mc_stack = np.stack([ch0_stack, ch1_stack], axis=0)

In [31]:
# store as hdf5: both channels go into a single dataset named 'dataset_1'
# NOTE(review): 'traning' in the file name looks like a typo for 'training'; it is
# kept unchanged here because other code may look for exactly this file name.
outname = process_dir / 'traning_data_excl_last.hdf5'

# the context manager closes the file even if writing the dataset fails
with h5py.File(outname, 'w') as h5f:
    h5f.create_dataset('dataset_1', data=mc_stack)

In [34]:
# store as tiff: one file per channel, written into process_dir
ch0n, ch1n = (process_dir / fname for fname in ('ph_training.tif', 'gfp_training.tif'))

# ch0n receives the phase contrast stack, ch1n the fluorescence stack
for out_path, stack in ((ch0n, ch0_stack), (ch1n, ch1_stack)):
    tifffile.imwrite(out_path, stack)