# Create training data

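In this notebook we create a training dataset by subsampling a large set of image data: we keep only every n-th position file per experiment folder and every n-th time point per position, and store the result in a single HDF5 file.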

In [27]:
#path handling
import pathlib
import numpy as np

#out of memory computation
import dask.array as da
import h5py 

# Set up the dask cache so that repeated computations on the same data can
# reuse previously computed results instead of recomputing them.
from dask.cache import Cache
cache = Cache(2e9)  # Leverage two gigabytes of memory
cache.register()    # Turn cache on globally

Set folders and settings

In [28]:
# --- Input location ---------------------------------------------------------
# Root directory that holds one sub-folder per experiment
root = pathlib.Path('/Volumes/ScientificData/Users/Giulia(botgiu00)/Collaborations/Ashley')

# Experiment sub-folders (relative to root) from which images are sampled
folders = [
    '20230411-agar-pad-processed',
    '20230427-agar-pad-processed',
    '20230504-agar-pad-processed',
]

# --- Output location --------------------------------------------------------
# Directory and file name of the HDF5 file that will hold the sampled images
# NOTE(review): 'traning' looks like a typo for 'training'; the name is kept
# as-is because other code may already rely on it -- confirm before renaming.
process_dir = pathlib.Path('/Users/simonvanvliet/TempData')
outname = 'traning_data_PASA.h5'

# --- Subsampling ------------------------------------------------------------
pos_int = 4   # interval of positions to export (every pos_int-th position file)
t_int = 15    # interval of timepoints to export (every t_int-th time point)

/Volumes/ScientificData/Users/Giulia(botgiu00)/Collaborations/Ashley


Create list of images to export

In [None]:
# Collect the full paths of the image files to export.
# Within each experiment folder the matching position files are sorted and
# then thinned out so that only every pos_int-th file is kept.
im_names = []
for folder in folders:
    exp_dir = root / folder
    position_files = sorted(exp_dir.glob('20??????_reg_*p???.h5'))
    im_names.extend(exp_dir / pos_file.name for pos_file in position_files[::pos_int])

Load images and save to file

In [30]:
# Lazily load every selected position file, keep every t_int-th entry along
# the first axis, concatenate all positions along that axis and write the
# result to one HDF5 file.
#
# The h5py files have to stay open until the data has been written, because
# the dask arrays only read from them when to_hdf5 triggers the computation.
# They are closed afterwards, so `all_data` can no longer be computed once
# this cell has finished; re-run the cell if it is needed again.
im_files = []  # open h5py handles, closed in the finally-clause below
stacks = []    # one lazy, subsampled dask array per position file
try:
    for file_name_im in im_names:
        dask_im_file = h5py.File(file_name_im, 'r')  # open read-only
        im_files.append(dask_im_file)
        images = dask_im_file['images']
        # Chunk length 1 along the first axis, full extent along the last three.
        # NOTE(review): assumes 'images' is 4-D with time as first axis -- confirm.
        chunk_size = (1, *images.shape[-3:])
        data = da.from_array(images, chunks=chunk_size)
        stacks.append(data[::t_int])

    # Fail with a clear message instead of a NameError on `all_data`
    if not stacks:
        raise ValueError('im_names is empty: no image files were found, nothing to export')

    # Concatenate once instead of pairwise inside the loop (flatter task graph)
    all_data = da.concatenate(stacks, axis=0)

    #store as hdf5
    outname = process_dir / outname
    all_data.to_hdf5(outname, '/images')
finally:
    # Release the input files even if loading or writing failed
    for im_file in im_files:
        im_file.close()