# General Nuclear Dataset Construction - Large

In [1]:
import os
import errno

import numpy as np

import deepcell

In [None]:
from deepcell_tracking.utils import load_trks
from sklearn.model_selection import train_test_split
from deepcell.utils.data_utils import reshape_movie
from deepcell.utils.transform_utils import erode_edges

# Tracked-movie files (.trks), one per cell line. They are read from
# `data_dir` below; nothing is downloaded by this cell.
hela_filename = 'HeLa_S3.trks'
hek_filename = 'HEK293.trks'
nih_filename = '3T3_NIH.trks'
raw_filename = 'RAW2647.trks'

filenames = [hela_filename, hek_filename, nih_filename, raw_filename]
data_dir = '/data/training_data'

# Random seed used for the train/test split.
seed = 0

# HeLa movies are cut down to their first HELA_MAX_FRAMES frames.
# NOTE(review): presumably so that all cell lines have equally long movies - confirm.
HELA_MAX_FRAMES = 30
# Fraction of the movies of every file that is held out for testing.
TEST_FRACTION = 0.2

# One entry per file, in the order of `filenames`.
X_train_list = []
X_test_list = []
y_train_list = []
y_test_list = []

# Not filled in by this cell; kept because other cells may rely on these names.
cell_type_list = []
train_batch_number_list = []
test_batch_number_list = []


for filename in filenames:
    path = os.path.join(data_dir, filename)
    training_data = load_trks(path)
    X = training_data['X']
    y = training_data['y']

    if filename == hela_filename:
        # Axis 1 is sliced here; keep only its first HELA_MAX_FRAMES entries.
        X = X[:, :HELA_MAX_FRAMES]
        y = y[:, :HELA_MAX_FRAMES]

    # Split along the first axis (whole movies), separately for every file.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_FRACTION, random_state=seed)

    X_train_list.append(X_train)
    y_train_list.append(y_train)
    X_test_list.append(X_test)
    y_test_list.append(y_test)


In [49]:
# Count the cells in the held-out movies and cut every test movie into
# 128x128 single-frame crops.
X_test_reshaped = []
y_test_reshaped = []

n_unique_test_cells = 0
n_test_cells = 0

for test_X, test_y in zip(X_test_list, y_test_list):
    n_movies, n_frames = test_y.shape[:2]

    # Distinct label values per movie, minus one.
    # NOTE(review): the "- 1" presumably discounts a background label that is
    # assumed to be present in every movie/frame - confirm.
    n_unique_test_cells += sum(
        len(np.unique(test_y[m])) - 1 for m in range(n_movies))

    # Distinct label values per individual frame, minus one.
    n_test_cells += sum(
        len(np.unique(test_y[m, f])) - 1
        for m in range(n_movies)
        for f in range(n_frames))

    crops_X, crops_y = reshape_movie(test_X, test_y, reshape_size=128)

    # Flatten the leading axes so that every frame of every crop is one sample.
    X_test_reshaped.append(crops_X.reshape((-1, 128, 128, 1)))
    y_test_reshaped.append(crops_y.reshape((-1, 128, 128, 1)))

X_test_reshaped = np.concatenate(X_test_reshaped, axis=0)
y_test_reshaped = np.concatenate(y_test_reshaped, axis=0)

for stacked_test in (X_test_reshaped, y_test_reshaped):
    print(stacked_test.shape)
print("n_unique_test_cells", n_unique_test_cells)
print("n_test_cells", n_test_cells)

Reshaped feature data from (36, 30, 216, 256, 1) to (144, 30, 128, 128, 1)
Reshaped training data from (36, 30, 216, 256, 1) to (144, 30, 128, 128, 1)
Reshaped feature data from (52, 30, 135, 160, 1) to (208, 30, 128, 128, 1)
Reshaped training data from (52, 30, 135, 160, 1) to (208, 30, 128, 128, 1)
Reshaped feature data from (48, 30, 154, 182, 1) to (192, 30, 128, 128, 1)
Reshaped training data from (48, 30, 154, 182, 1) to (192, 30, 128, 128, 1)
Reshaped feature data from (25, 30, 202, 240, 1) to (100, 30, 128, 128, 1)
Reshaped training data from (25, 30, 202, 240, 1) to (100, 30, 128, 128, 1)
(19320, 128, 128, 1)
(19320, 128, 128, 1)
n_unique_test_cells 2488
n_test_cells 57620


In [24]:
# Total number of training movies: the first-axis lengths of all per-file arrays, summed.
total_num_batches = np.sum([movies.shape[0] for movies in X_train_list])

def get_file_index(index, train_list=None):
    """Map a global training-movie index to ``(file_idx, local_idx)``.

    The per-file arrays are treated as if they were concatenated along their
    first axis; ``index`` addresses one entry of that virtual concatenation.

    Parameters
    ----------
    index : int
        Global index, ``0 <= index < sum(a.shape[0] for a in train_list)``.
    train_list : sequence of arrays, optional
        Per-file arrays; only ``shape[0]`` of each is used. Defaults to the
        notebook-level ``X_train_list``.

    Returns
    -------
    tuple
        ``(file_idx, local_idx)``: position of the array in ``train_list`` and
        the index along the first axis of that array.

    Raises
    ------
    IndexError
        If ``index`` is negative or not smaller than the total number of entries.
    """
    if train_list is None:
        train_list = X_train_list

    if index < 0:
        raise IndexError("index {} is negative".format(index))

    # Walk over the files, keeping track of how many entries precede each one.
    offset = 0
    for file_idx, movies in enumerate(train_list):
        n_movies = movies.shape[0]
        if index < offset + n_movies:
            return file_idx, index - offset
        offset += n_movies

    raise IndexError(
        "index {} is out of range for {} entries".format(index, offset))

In [44]:
# Build the training set in a random movie order and record, for several
# nested subset sizes, how many cells the first N movies contain.
total_num_batches = np.sum([x.shape[0] for x in X_train_list])
print("total_num_batches: ", total_num_batches)

# Shuffle with the notebook-level `seed` so that the movie order - and with it
# the saved arrays and the subset statistics - is reproducible across runs.
permutation = np.random.RandomState(seed).permutation(total_num_batches)

# Nested subset sizes, in movies. Sizes that are not smaller than the number of
# available movies are dropped; the full set is always the last entry.
batch_sizes = [n for n in (1, 10, 50, 100, 200, 400) if n < total_num_batches]
batch_sizes.append(total_num_batches)

# subset size -> cumulative count over the first `size` movies of the permutation
batch_n_unique_dict = dict()
batch_n_cells_dict = dict()

reshaped_X_train = []
reshaped_y_train = []

n_unique_so_far = 0
n_cells_so_far = 0
start = 0
for batch_size in batch_sizes:
    # Only the movies added since the previous subset size need processing.
    for j in range(start, batch_size):
        file_idx, batch_idx = get_file_index(permutation[j])
        y_movie = y_train_list[file_idx][batch_idx]

        # Distinct label values in the whole movie, minus one.
        # NOTE(review): the "- 1" presumably discounts a background label that
        # is assumed to be present in every movie/frame - confirm.
        n_unique_so_far += len(np.unique(y_movie)) - 1

        # Distinct label values per frame, minus one, summed over all frames.
        for k in range(y_movie.shape[0]):
            n_cells_so_far += len(np.unique(y_movie[k])) - 1

        # Slice with batch_idx:batch_idx+1 to keep the leading axis for reshape_movie.
        Xt, yt = reshape_movie(X_train_list[file_idx][batch_idx:batch_idx+1],
                               y_train_list[file_idx][batch_idx:batch_idx+1], reshape_size=128)

        # Flatten the leading axes so that every frame of every crop is one sample.
        Xt = Xt.reshape((-1,128,128,1))
        yt = yt.reshape((-1,128,128,1))
        reshaped_X_train.append(Xt)
        reshaped_y_train.append(yt)

    batch_n_unique_dict[batch_size] = n_unique_so_far
    batch_n_cells_dict[batch_size] = n_cells_so_far
    start = batch_size
    

total_num_batches:  642
Reshaped feature data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30

Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 202, 240, 1) to (4, 30

Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30

Reshaped feature data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 202, 240, 1) to (4, 30

Reshaped feature data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30

Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30

Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30

Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 216, 256, 1) to (4, 30

Reshaped feature data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30

Reshaped feature data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30

Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 216, 256, 1) to (4, 30

Reshaped feature data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 202, 240, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 154, 182, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 135, 160, 1) to (4, 30, 128, 128, 1)
Reshaped feature data from (1, 30, 216, 256, 1) to (4, 30, 128, 128, 1)
Reshaped training data from (1, 30, 216, 256, 1) to (4, 30

In [45]:
# Stack the per-movie crop arrays along the first axis, one array each for X and y.
reshaped_X_train_arr, reshaped_y_train_arr = (
    np.concatenate(chunks, axis=0)
    for chunks in (reshaped_X_train, reshaped_y_train)
)

# Both shapes should agree; print them for a visual check.
for stacked_train in (reshaped_X_train_arr, reshaped_y_train_arr):
    print(stacked_train.shape)

(77040, 128, 128, 1)
(77040, 128, 128, 1)


In [46]:
# Show the cumulative cell counts recorded for every subset size.
for label, counts in (("batch_n_unique_dict", batch_n_unique_dict),
                      ("batch_n_cells_dict", batch_n_cells_dict)):
    print(label, counts)

batch_n_unique_dict {1: 8, 10: 152, 50: 802, 100: 1481, 200: 2955, 400: 6164, 642: 10097}
batch_n_cells_dict {1: 119, 10: 3376, 50: 18365, 100: 34222, 200: 68284, 400: 142083, 642: 231641}


In [50]:
# Write the assembled train and test sets as .npz archives into `data_dir`.
train_name = 'general_nuclear_train_large_new.npz'
test_name = 'general_nuclear_test_large_new.npz'

DATA_FILE_TRAIN, DATA_FILE_TEST = (
    os.path.join(data_dir, name) for name in (train_name, test_name)
)

# NOTE(review): the two *_dict entries are plain Python dicts; np.savez
# presumably stores them as pickled object arrays, so readers likely need
# np.load(..., allow_pickle=True) and `.item()` - confirm against the consumers.
train_payload = dict(
    X=reshaped_X_train_arr,
    y=reshaped_y_train_arr,
    batch_n_unique_dict=batch_n_unique_dict,
    batch_n_cells_dict=batch_n_cells_dict,
)
test_payload = dict(
    X=X_test_reshaped,
    y=y_test_reshaped,
    n_unique_test_cells=n_unique_test_cells,
    n_test_cells=n_test_cells,
)

np.savez(DATA_FILE_TRAIN, **train_payload)
np.savez(DATA_FILE_TEST, **test_payload)

In [52]:
# Number of 128x128 training images in every nested subset.
# Per the reshape_movie log output above, each movie is cut into 4 crops of
# 30 frames, i.e. 120 images per movie.
CROPS_PER_MOVIE = 4
FRAMES_PER_MOVIE = 30
# Reuse `batch_sizes` (subset sizes in movies) so the two lists cannot drift apart.
dataset_sizes = CROPS_PER_MOVIE * FRAMES_PER_MOVIE * np.array(batch_sizes)


In [53]:
# Display the image count of every nested training subset.
dataset_sizes

array([  120,  1200,  6000, 12000, 24000, 48000, 77040])