In [1]:
# Imports
from __future__ import print_function
import numpy as np
import os
import sys
from IPython.display import display, Image
from scipy import ndimage
import scipy.misc
from six.moves import cPickle as pickle
import random

In [2]:
# Constants

# Where the merged pickle is written: directory and bare file name.
data_root = 'pickles'
final_pickle = 'blood_cells_temp.pickle'

# One image folder per blood-cell class. A folder's position in this list is
# the integer label its images receive when the classes are merged.
train_folders = ['TRAIN/EOSINOPHIL', 'TRAIN/LYMPHOCYTE', 'TRAIN/MONOCYTE', 'TRAIN/NEUTROPHIL']

# Derived from the folder list so the two cannot drift apart (currently 4).
num_classes = len(train_folders)

# Seed BOTH random generators used by this notebook: NumPy's (np.random.shuffle /
# np.random.permutation) and the stdlib one (random.shuffle in load_bloodcell).
# Seeding only NumPy leaves the per-class image order non-reproducible.
random_seed = 133
np.random.seed(random_seed)
random.seed(random_seed)

# Every image is resized to image_height x image_width before being stored.
image_height = 120
image_width = 160
# Divisor applied to the mean-centred pixel values.
pixel_depth = 255.0
num_channels = 1

# Total number of examples in each split, summed over all classes.
train_size = 800
valid_size = 160
test_size = 160

In [3]:
# Function for converting image from RGB to grayscale
def rgb2gray(rgb):
    """Collapse the colour axis of an image into a single luminance value.

    Only the first three entries of the last axis are used, so a fourth
    (e.g. alpha) channel is ignored. The last axis is weighted with
    0.299 / 0.587 / 0.114 and summed.

    Args:
        rgb: array whose last axis holds at least three colour channels.

    Returns:
        Array with the same leading dimensions as `rgb` and the last axis removed.
    """
    luma_weights = [0.299, 0.587, 0.114]
    colour_channels = rgb[..., :3]
    return np.dot(colour_channels, luma_weights)

# Function for loading data for a single bloodcell label
def load_bloodcell(folder):
    """Load every readable image of one blood-cell class into a single array.

    Files are visited in a random order (stdlib `random.shuffle`). Each image
    is read, resized to (image_height, image_width) and converted to
    grayscale with `rgb2gray`. Files that raise IOError are reported and
    skipped. The stacked images are then centred on their overall mean and
    divided by `pixel_depth`; note that the mean is computed over this folder
    only, not over the whole data set.

    Args:
        folder: directory containing the image files of one class.

    Returns:
        float32 array of shape (images_loaded, image_height, image_width).
    """
    filenames = os.listdir(folder)
    random.shuffle(filenames)
    target_shape = (image_height, image_width)
    # Allocate room for every file; unread slots are trimmed off afterwards.
    dataset = np.ndarray(shape=(len(filenames),) + target_shape, dtype=np.float32)
    print(folder)
    loaded = 0
    for filename in filenames:
        path = os.path.join(folder, filename)
        try:
            pixels = ndimage.imread(path).astype(float)
            pixels = scipy.misc.imresize(pixels, target_shape)
            dataset[loaded, :, :] = rgb2gray(pixels)
        except IOError as e:
            print('Could not read:', path, ':', e, '- it\'s ok, skipping.')
        else:
            # Only advance the write position once the image is stored.
            loaded += 1

    # Keep just the rows that were actually filled.
    dataset = dataset[:loaded, :, :]
    dataset -= np.mean(dataset)
    dataset /= pixel_depth
    print('Full dataset tensor:', dataset.shape)
    print('Mean:', np.mean(dataset))
    print('Standard deviation:', np.std(dataset))
    return dataset


In [4]:
# Function for converting data into separate pickle files for each label
def maybe_pickle(data_folders, force=False):
    """Cache each class folder as `<folder>.pickle` and return the pickle paths.

    A folder whose pickle already exists is skipped unless `force` is true;
    otherwise its images are loaded with `load_bloodcell` and dumped to disk.
    A failure while writing a pickle is reported but does not abort the
    loop, so the returned list may name a file that could not be written.

    Args:
        data_folders: iterable of image folder paths, one per class.
        force: when true, rebuild pickles even if they already exist.

    Returns:
        List of pickle file paths, in the same order as `data_folders`.
    """
    dataset_names = []
    for folder in data_folders:
        set_filename = folder + '.pickle'
        dataset_names.append(set_filename)
        if os.path.exists(set_filename) and not force:
            # You may override by setting force=True.
            print('%s already present - Skipping pickling.' % set_filename)
        else:
            print('Pickling %s.' % set_filename)
            # load_bloodcell takes only the folder. The previous call passed an
            # extra, undefined `min_num_images_per_class`, which raised
            # NameError whenever a pickle actually had to be built.
            dataset = load_bloodcell(folder)
            try:
                with open(set_filename, 'wb') as f:
                    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
            except Exception as e:
                print('Unable to save data to', set_filename, ':', e)
    return dataset_names

# Build (or reuse) one pickle per class folder; existing pickles are kept.
train_datasets = maybe_pickle(train_folders, force=False)


TRAIN/EOSINOPHIL.pickle already present - Skipping pickling.
TRAIN/LYMPHOCYTE.pickle already present - Skipping pickling.
TRAIN/MONOCYTE.pickle already present - Skipping pickling.
TRAIN/NEUTROPHIL.pickle already present - Skipping pickling.


In [5]:
# Function for formatting the data according to image size and number of examples
def make_arrays(nb_rows, img_height,img_width):
    """Allocate an image array and a matching label array.

    The arrays are allocated but NOT initialised; callers must fill every row.

    Args:
        nb_rows: number of examples; a falsy value (0 or None) means "no split".
        img_height: height of each image in pixels.
        img_width: width of each image in pixels.

    Returns:
        (dataset, labels) where dataset is float32 of shape
        (nb_rows, img_height, img_width) and labels is int32 of shape
        (nb_rows,), or (None, None) when nb_rows is falsy.
    """
    if not nb_rows:
        return None, None
    images = np.ndarray(shape=(nb_rows, img_height, img_width), dtype=np.float32)
    targets = np.ndarray(shape=nb_rows, dtype=np.int32)
    return images, targets

In [6]:
# Function for merging the pickle files of different labels and creating training, validation and test datasets
def merge_datasets(pickle_files, train_size,test_size, valid_size):
    """Merge per-class pickles into class-balanced validation, test and training sets.

    Each pickle is loaded and shuffled along its first axis, then cut into
    three consecutive, non-overlapping slices: validation, test, training.
    A class's label is its index in `pickle_files`. Every split receives
    `size // len(pickle_files)` examples per class, so each size should be a
    multiple of the number of classes; otherwise the trailing rows of that
    split are never written and keep whatever `make_arrays` allocated.

    A split whose size is 0 is returned as (None, None). This applies to all
    three splits (previously only the validation split was guarded, and a
    zero test or training size raised TypeError).

    Args:
        pickle_files: paths of the per-class pickle files, one per label.
        train_size: total number of training examples across all classes.
        test_size: total number of test examples across all classes.
        valid_size: total number of validation examples across all classes.

    Returns:
        (valid_dataset, valid_labels, test_dataset, test_labels,
         train_dataset, train_labels)

    Raises:
        Exception: any error while reading or slicing a pickle is printed
            and re-raised unchanged.
    """
    num_classes = len(pickle_files)
    valid_dataset, valid_labels = make_arrays(valid_size, image_height, image_width)
    test_dataset, test_labels = make_arrays(test_size, image_height, image_width)
    train_dataset, train_labels = make_arrays(train_size, image_height, image_width)
    vsize_per_class = valid_size // num_classes
    testsize_per_class = test_size // num_classes
    tsize_per_class = train_size // num_classes

    # Boundaries of the three slices taken from every (shuffled) class array.
    valid_end = vsize_per_class
    test_end = valid_end + testsize_per_class
    train_end = test_end + tsize_per_class

    for label, pickle_file in enumerate(pickle_files):
        try:
            # NOTE: pickle.load executes arbitrary code from the file; only
            # use pickles produced by this notebook.
            with open(pickle_file, 'rb') as f:
                bloodcell_set = pickle.load(f)
            # Shuffle so the three slices are a random partition of the class.
            np.random.shuffle(bloodcell_set)

            if valid_dataset is not None:
                rows = slice(label * vsize_per_class, (label + 1) * vsize_per_class)
                valid_dataset[rows, :, :] = bloodcell_set[:valid_end, :, :]
                valid_labels[rows] = label

            if test_dataset is not None:
                rows = slice(label * testsize_per_class, (label + 1) * testsize_per_class)
                test_dataset[rows, :, :] = bloodcell_set[valid_end:test_end, :, :]
                test_labels[rows] = label

            if train_dataset is not None:
                rows = slice(label * tsize_per_class, (label + 1) * tsize_per_class)
                train_dataset[rows, :, :] = bloodcell_set[test_end:train_end, :, :]
                train_labels[rows] = label
        except Exception as e:
            print('Unable to process data from', pickle_file, ':', e)
            raise

    return valid_dataset, valid_labels,test_dataset,test_labels, train_dataset, train_labels

# Merge the per-class pickles into the three splits and report their shapes.
(valid_dataset, valid_labels,
 test_dataset, test_labels,
 train_dataset, train_labels) = merge_datasets(
    pickle_files=train_datasets,
    train_size=train_size,
    test_size=test_size,
    valid_size=valid_size,
)
print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)

Training: (800, 120, 160) (800,)
Validation: (160, 120, 160) (160,)
Testing: (160, 120, 160) (160,)


In [7]:
# Randomly shuffling the data
def randomize(dataset, labels):
    """Return copies of `dataset` and `labels` shuffled with one shared permutation.

    A single random ordering of the example indices is drawn from NumPy's
    global generator and applied to the first axis of both arrays, so every
    image stays paired with its label.

    Args:
        dataset: array of examples, indexed by example along the first axis.
        labels: array of labels with one entry per example.

    Returns:
        (shuffled_dataset, shuffled_labels)
    """
    order = np.random.permutation(labels.shape[0])
    return dataset[order, :], labels[order]
# merge_datasets writes each split class by class, so shuffle every split to
# mix the classes. The call order is kept (train, test, valid) because each
# call draws from the same global NumPy generator.
train_dataset, train_labels = randomize(dataset=train_dataset, labels=train_labels)
test_dataset, test_labels = randomize(dataset=test_dataset, labels=test_labels)
valid_dataset, valid_labels = randomize(dataset=valid_dataset, labels=valid_labels)

In [8]:
# Saving the final pickle file containing training, validation and test data

# Prefix the file name with data_root only once, so that re-running this cell
# does not produce pickles/pickles/... paths.
if not final_pickle.startswith(os.path.join(data_root, '')):
    final_pickle = os.path.join(data_root, final_pickle)

# open() does not create missing directories, so make sure the target exists.
# (isdir + makedirs rather than exist_ok=True keeps this Python 2 compatible.)
final_pickle_dir = os.path.dirname(final_pickle)
if final_pickle_dir and not os.path.isdir(final_pickle_dir):
    os.makedirs(final_pickle_dir)

try:
    save = {
        'train_dataset': train_dataset,
        'train_labels': train_labels,
        'valid_dataset': valid_dataset,
        'valid_labels': valid_labels,
        'test_dataset': test_dataset,
        'test_labels': test_labels,
        }
    # The with-block closes the file on exit; no explicit close() is needed.
    with open(final_pickle, 'wb') as f:
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    print('Unable to save data to', final_pickle, ':', e)
    raise

statinfo = os.stat(final_pickle)
# Despite the label, no compression is applied: this is the raw pickle size in bytes.
print('Compressed pickle size:', statinfo.st_size)


Compressed pickle size: 86020913
