In [2]:
from talk_hemato_utils import load_data
import h5py
import pickle
import numpy as np
from talk_hemato_utils import ismember

Using TensorFlow backend.


# Preprocessing

To make it more accessible, I took the data provided with the [original code](http://github.com/QSCD/HematoFatePrediction) and rearranged it somewhat.
Get the original data from [here](https://hmgubox.helmholtz-muenchen.de:8001/d/ccbfb5f1ac/).

# add inverted generations
Let's put the information about inverted generations directly into the latent dataset; currently it's stored in a separate HDF5 file.



In [3]:
# Paths: destination of the merged dataset, the original latent dataset, and
# the annotation file that holds the inverted generations.
outfile = '../data/images_round3_test_latent_inverted_generations.pickle'  # the merged dataset will end up here
latent_file = '../data_original/images_round3_test_latent.pickle'
annotation_file = '../data_original/anno_file.h5'

X_l, y_l, movement_l, cellIDs_l = load_data(latent_file, N=None, randomize=True)

# Open the annotation file in a context manager so the HDF5 handle is closed
# again once the values have been read (previously the file was left open).
# Assumes the `[:]` read copies the values into memory, as h5py reads do.
with h5py.File(annotation_file, 'r') as mat:
    gens = mat['anno']['latent'][:][0]
gens = np.array([gens[i] for i in cellIDs_l])  # expand into a vector, one element for each patch

# Every array must have exactly one entry per patch before they are stored together.
assert len(gens) == len(X_l) == len(y_l) == len(movement_l) == len(cellIDs_l), \
    "all merged arrays must have one entry per patch"

with open(outfile, 'wb') as fh:
    pickle.dump([X_l, y_l, movement_l, cellIDs_l, gens], fh)

# Shrink the dataset sizes
The original dataset (which is still only one experiment) is quite large (4GB+4GB).
Let's shrink it to more manageable pieces.

In [6]:
def create_subset_ix(cellids, Ncells, seed=None):
    """
    Select all image patches that belong to a random subset of cells.

    cellids: a vector of cellids.
             contains one element per image patch/sample, i.e. a single cellid will occur multiple times
    Ncells: the desired number of distinct cells to select at random (without replacement)
    seed: optional seed for a private random generator, giving a reproducible selection.
          If None (default), numpy's global random state is used.

    returns: a boolean array of cellids.shape; can be used to subindex the total dataset

    raises: ValueError (from numpy's choice) if Ncells exceeds the number of distinct cells
    """
    cellids = np.asarray(cellids)
    unique_cells = np.unique(cellids)

    # Draw from the global RNG unless the caller asked for a seeded, private one.
    rng = np.random if seed is None else np.random.RandomState(seed)
    cell_subset = rng.choice(unique_cells, Ncells, replace=False)

    # Vectorised membership test: True for every patch whose cell was selected.
    return np.isin(cellids, cell_subset)

## Annotated data
We use only the test set in the notebooks anyway. Still, the test set is quite large (in MB), so we pick out a few cells (remember that one cell has multiple image patches). 
**Note**: I retrained the CNN, so we have to use that particular data split and create a smaller version of it.

In [4]:
# Input: the full retrained data split. Output: where its reduced copy will be written.
full_annotated_split = '../data/retrained_datasplit.pickle'
outfile_annotated = '../data_small/small_retrained_datasplit.pickle'

# NOTE: pickle.load executes arbitrary code; only load files you created yourself.
with open(full_annotated_split, 'rb') as fh:
    (X_train, X_val, X_test,
     y_train, y_val, y_test,
     mov_train, mov_val, mov_test,
     cell_train, cell_val, cell_test) = pickle.load(fh)

# Report the number of samples in each part of the split.
split_sizes = (len(X_train), len(X_val), len(X_test))
print(" %d train data\n %d val. data\n %d test data" % split_sizes)

 251524 train data
 83842 val. data
 111789 test data


In [7]:
Ncells = 100
# create_subset_ix draws from numpy's global RNG; seed it so that re-running the
# notebook on the same data selects the same cells and yields the same small dataset.
np.random.seed(0)
subset_ix = create_subset_ix(cell_test, Ncells)  # extract patches of 100 cells

# Apply the same boolean mask to every per-patch array so they stay aligned.
X_test_subset = X_test[subset_ix]
y_test_subset = y_test[subset_ix]
mov_test_subset = mov_test[subset_ix]
cell_test_subset = cell_test[subset_ix]

# Sanity check: exactly the requested number of distinct cells was selected.
assert len(np.unique(cell_test_subset)) == Ncells

print("%d patches, %d cells in total" % (len(X_test_subset), len(np.unique(cell_test_subset))))

11647 patches, 100 cells in total


In [53]:
# Store the reduced test split as one list, in the order X, y, movement, cell ids.
small_annotated = [X_test_subset, y_test_subset, mov_test_subset, cell_test_subset]
with open(outfile_annotated, 'wb') as fh:
    pickle.dump(small_annotated, fh)

## Latent cells

In [8]:
# Read the merged latent dataset back from disk and unpack its five arrays.
outfile = '../data/images_round3_test_latent_inverted_generations.pickle'  # latent dataset + inverted generations
with open(outfile, 'rb') as fh:
    latent_merged = pickle.load(fh)
X_l, y_l, movement_l, cellIDs_l, gens = latent_merged

In [9]:
Ncells = 100
# create_subset_ix draws from numpy's global RNG; seed it so that re-running the
# notebook on the same data selects the same cells and yields the same small dataset.
np.random.seed(0)
subset_ix = create_subset_ix(cellIDs_l, Ncells)  # extract patches of 100 cells

# Apply the same boolean mask to every per-patch array so they stay aligned.
X_l_subset = X_l[subset_ix]
y_l_subset = y_l[subset_ix]
mov_l_subset = movement_l[subset_ix]
cell_l_subset = cellIDs_l[subset_ix]
gens_l_subset = gens[subset_ix]

# Sanity check: exactly the requested number of distinct cells was selected.
assert len(np.unique(cell_l_subset)) == Ncells

Some stats about that dataset. In particular, let's check how many cells we have in each inverted generation.

In [10]:
# Overall size of the reduced latent dataset.
n_patches = len(X_l_subset)
n_cells = len(np.unique(cell_l_subset))
print("%d patches, %d cells in total" % (n_patches, n_cells))

import toolz
# Number of patches carrying each inverted-generation value.
print('\npatches per inv.gen')
print(toolz.frequencies(gens_l_subset))

# Number of distinct cells with at least one patch in each inverted generation -8..-1.
print('\ncells per inv.gen')
cells_per_gen = {}
for gen in range(-8, 0):
    in_gen = gens_l_subset == gen
    cells_per_gen[gen] = np.unique(cell_l_subset[in_gen]).shape[0]
print(cells_per_gen)

46800 patches, 100 cells in total

patches per inv.gen
{-5.0: 2353, inf: 20237, -1.0: 10017, -3.0: 5851, -2.0: 6031, -4.0: 2311}

cells per inv.gen
{-8: 0, -7: 0, -6: 0, -5: 4, -4: 6, -3: 13, -2: 15, -1: 21}


In [30]:
# Store the reduced latent dataset as one list, in the order X, y, movement, cell ids, generations.
small_latent = [X_l_subset, y_l_subset, mov_l_subset, cell_l_subset, gens_l_subset]
with open('../data_small/small_images_round3_test_latent_inverted_generations.pickle', 'wb') as fh:
    pickle.dump(small_latent, fh)