Dataset: notMNIST
=============



In [None]:
from __future__ import print_function
import os
import sys
from six.moves.urllib.request import urlretrieve

Download the compressed notMNIST files.

In [None]:
# Base URL passed to maybe_download, which appends each archive's file name to it.
url = 'http://commondatastorage.googleapis.com/books1000/'

def maybe_download(filename, source_url):
  """Download `filename` from `source_url` unless it is already on disk.

  The data is first written to `<filename>.part` and only renamed to
  `filename` once the transfer has finished. An interrupted download
  therefore never leaves a truncated file behind that a later run would
  mistake for a complete one.

  Args:
    filename: Name of the file; used both as the local path and as the
      suffix appended to `source_url`.
    source_url: URL prefix the file name is appended to.

  Returns:
    The local path of the file (identical to `filename`).

  Raises:
    Whatever `urlretrieve` raises when the transfer fails; the partial
    file is removed before the error propagates.
  """
  if os.path.exists(filename):
    print(filename + ' already exists - Skip downloading.')
    return filename

  partial = filename + '.part'
  try:
    urlretrieve(source_url + filename, partial)
    # Publish the file under its final name only after a complete transfer.
    os.rename(partial, filename)
  finally:
    # Still present only when the download or the rename failed.
    if os.path.exists(partial):
      os.remove(partial)

  statinfo = os.stat(filename)
  print('Successfully downloaded ' + filename + ' ' + str(statinfo.st_size) + ' bytes.')
  return filename

# Fetch the large and the small archive; a file that already exists locally is not downloaded again.
train_filename = maybe_download('notMNIST_large.tar.gz', url)
test_filename = maybe_download('notMNIST_small.tar.gz', url)

Extract the archives and collect the folders of the dataset.

In [None]:
import tarfile

def maybe_extract(filename):
  """Unpack a `.tar.gz` archive (once) and list the folders it contains.

  The extraction target is the archive path with both extensions removed.
  The archive is unpacked next to itself, so the result is the same
  whether `filename` is a bare name in the working directory or a path
  into another directory.

  Args:
    filename: Path of the `.tar.gz` archive. The archive is expected to
      contain a single top-level directory named like the archive
      without its `.tar.gz` suffix.

  Returns:
    Sorted list of paths of the sub-directories found directly inside the
    extracted root; plain files in the root are ignored.
  """
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz

  if os.path.exists(root):
    print('%s already presents - Skipping extraction of %s.' % (root, filename))
  else:
    print('Extracting data for %s. This may take a while. Please wait.' % root)
    # Flush first so the message is visible before the long-running extraction.
    sys.stdout.flush()
    # NOTE(security): extractall() trusts the member paths stored in the
    # archive; only use it on archives from a trusted source.
    with tarfile.open(filename) as tar:
      # Unpack beside the archive so that `root` is where the data ends up.
      tar.extractall(os.path.dirname(root) or '.')

  entries = (os.path.join(root, name) for name in sorted(os.listdir(root)))
  data_folders = [entry for entry in entries if os.path.isdir(entry)]

  return data_folders

# Unpack both archives (if not done yet) and collect the sub-folders of each - presumably one per letter.
train_folders = maybe_extract(train_filename)
test_folders = maybe_extract(test_filename)

Display a sample of the images.

In [None]:
from IPython.display import display, Image

def convert_to_filepath(folders, alphabet, idx):
    """Return the path of the `idx`-th image of a letter.

    Args:
        folders: Sequence of folder paths ordered by letter, i.e.
            `folders[0]` holds 'A', `folders[1]` holds 'B', and so on.
        alphabet: Single letter selecting the folder; upper and lower
            case are both accepted.
        idx: Position of the file within the alphabetically sorted
            listing of that folder.

    Returns:
        Path of the selected file inside the letter's folder.

    Raises:
        IndexError: If the letter or `idx` is out of range.
    """
    folder = folders[ord(alphabet.upper()) - ord('A')]
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # same idx select the same file on every run and every machine.
    file_name = sorted(os.listdir(folder))[idx]
    return os.path.join(folder, file_name)

# Pick the entry at index 10 of the 'B' folder of the training data and show where it lives.
file_path = convert_to_filepath(train_folders, 'B', 10)
print(file_path)
# Last expression of the cell - presumably what the notebook renders as the cell output.
Image(file_path)

Load the data for a single letter label. Scale the pixel values to the range -0.5 to 0.5 and skip images that cannot be read. All images of a letter are stored together in one 3-D array (image index, row, column).

In [None]:
import numpy as np
from scipy import ndimage

image_size = 28  # pixel width and height
pixel_depth = 255.0  # number of levels per pixel

def load_letter(folder, min_num_images):
  """Read every image of one letter folder into a single float32 array.

  Pixel values are shifted and scaled by `pixel_depth`, which maps the
  0..255 range onto -0.5..0.5. Files that fail with an IOError are
  reported and skipped; an image of the wrong shape aborts the load.

  Args:
    folder: Directory holding the image files of one letter.
    min_num_images: Minimum number of successfully read images required.

  Returns:
    Array of shape (number of readable images, image_size, image_size).

  Raises:
    Exception: If an image does not have the expected shape, or if fewer
      than `min_num_images` images could be read.
  """
  file_names = os.listdir(folder)
  expected_shape = (image_size, image_size)
  # Sized for the best case (every file readable); trimmed after the loop.
  dataset = np.ndarray(shape=(len(file_names),) + expected_shape, dtype=np.float32)

  num_images = 0
  for file_name in file_names:
    image_file = os.path.join(folder, file_name)
    try:
      # NOTE(review): scipy.ndimage.imread was deprecated in SciPy 1.0 and
      # removed in 1.2, so this call needs an older SciPy release.
      raw_pixels = ndimage.imread(image_file).astype(float)
      image_data = (raw_pixels - pixel_depth / 2) / pixel_depth
      if image_data.shape != expected_shape:
        raise Exception('Unexpected image shape: %s' % str(image_data.shape))
      dataset[num_images, :, :] = image_data
      num_images += 1
    except IOError as e:
      print('Could not read:', image_file, ':', e, '- skipping.')

  # Keep only the rows that were actually filled.
  dataset = dataset[:num_images, :, :]
  if num_images < min_num_images:
    raise Exception('Many fewer images than expected: %d < %d' % (num_images, min_num_images))

  print('Full dataset tensor:', dataset.shape)
  print('Mean:', np.mean(dataset))
  print('Standard deviation:', np.std(dataset))
  return dataset

Do this for every label and save each result as a pickle file.

In [None]:
from six.moves import cPickle as pickle

def maybe_pickle(data_folders, min_num_images_per_class):
  """Pickle the image array of every class folder, skipping existing pickles.

  For each folder the array returned by `load_letter` is written to
  `<folder>.pickle`. A pickle that already exists is left untouched.

  Args:
    data_folders: Folder paths, one per class.
    min_num_images_per_class: Forwarded to `load_letter` as the minimum
      number of readable images a class must provide.

  Returns:
    List of pickle file paths, in the order of `data_folders`.

  Raises:
    Whatever `load_letter` or the pickling raises; a partially written
    pickle is removed first so that it is not mistaken for a finished
    one on the next run.
  """
  dataset_names = []
  for folder in data_folders:
    set_filename = folder + '.pickle'
    dataset_names.append(set_filename)

    if os.path.exists(set_filename):
      print('%s already present - Skipping pickling.' % set_filename)
      continue

    print('Pickling %s.' % set_filename)
    dataset = load_letter(folder, min_num_images_per_class)

    try:
      # `with` closes the handle even when dumping fails.
      with open(set_filename, 'wb') as f:
        pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
    except BaseException:
      # Do not leave a truncated pickle behind: the existence check above
      # would skip it on every later run.
      if os.path.exists(set_filename):
        os.remove(set_filename)
      raise

  return dataset_names

# Pickle every class folder; the second argument is the minimum number of readable images each class must provide.
train_datasets = maybe_pickle(train_folders, 45000)
test_datasets = maybe_pickle(test_folders, 1800)

Merge the per-letter data into training, validation and test sets. Choose a manageable size for each set and distribute the labels uniformly. (Training and validation data come from the large set, test data from the small set.)

In [None]:
# Fix the seed of NumPy's global random generator so the shuffles and permutations below are repeatable.
np.random.seed(22)

def make_arrays(nb_rows, img_size):
  """Allocate an (uninitialised) image array and its label vector.

  Args:
    nb_rows: Number of samples; a falsy value (0 or None) means that no
      arrays are wanted.
    img_size: Width and height of the square images.

  Returns:
    A `(dataset, labels)` pair: a float32 array of shape
    (nb_rows, img_size, img_size) and an int32 vector of length nb_rows,
    or `(None, None)` when `nb_rows` is falsy.
  """
  if not nb_rows:
    return None, None

  image_shape = (nb_rows, img_size, img_size)
  return (np.ndarray(image_shape, dtype=np.float32),
          np.ndarray(nb_rows, dtype=np.int32))

def merge_datasets(pickle_files, train_size, valid_size=0):
  """Build class-balanced training (and optional validation) arrays.

  Each pickle file holds the image array of one class; its position in
  `pickle_files` becomes the integer label. Every class array is shuffled,
  then its first `valid_size // num_classes` images go to the validation
  set and the following `train_size // num_classes` images go to the
  training set.

  When a requested size is not a multiple of the number of classes, the
  rows that cannot be filled evenly are dropped from the result rather
  than returned as uninitialised memory.

  Args:
    pickle_files: Paths of the per-class pickle files, ordered by label.
    train_size: Total number of training samples wanted.
    valid_size: Total number of validation samples wanted; 0 disables the
      validation split.

  Returns:
    `(valid_dataset, valid_labels, train_dataset, train_labels)`. The
    arrays of a split are None when its requested size is 0.

  Raises:
    ValueError: If a class holds fewer images than needed for its share
      of both splits.
  """
  num_classes = len(pickle_files)
  valid_dataset, valid_labels = make_arrays(valid_size, image_size)
  train_dataset, train_labels = make_arrays(train_size, image_size)
  vsize_per_class = valid_size // num_classes
  tsize_per_class = train_size // num_classes
  needed_per_class = vsize_per_class + tsize_per_class

  for label, pickle_file in enumerate(pickle_files):
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # pickle files this notebook produced itself.
    with open(pickle_file, 'rb') as f:
      letter_set = pickle.load(f)

    # Fail loudly: a one-image remainder would otherwise be silently
    # broadcast over the whole slice by the assignments below.
    if len(letter_set) < needed_per_class:
      raise ValueError('%s holds %d images but %d are needed per class'
                       % (pickle_file, len(letter_set), needed_per_class))

    # let's shuffle the letters to have random validation and training set
    np.random.shuffle(letter_set)

    if valid_dataset is not None:
      start_v = label * vsize_per_class
      end_v = start_v + vsize_per_class
      valid_dataset[start_v:end_v, :, :] = letter_set[:vsize_per_class, :, :]
      valid_labels[start_v:end_v] = label

    if train_dataset is not None:
      start_t = label * tsize_per_class
      end_t = start_t + tsize_per_class
      # Training images start right after the validation share.
      train_dataset[start_t:end_t, :, :] = letter_set[vsize_per_class:needed_per_class, :, :]
      train_labels[start_t:end_t] = label

  # Rows beyond per_class * num_classes were never written; drop them.
  filled_v = vsize_per_class * num_classes
  if valid_dataset is not None and filled_v < len(valid_dataset):
    valid_dataset, valid_labels = valid_dataset[:filled_v], valid_labels[:filled_v]
  filled_t = tsize_per_class * num_classes
  if train_dataset is not None and filled_t < len(train_dataset):
    train_dataset, train_labels = train_dataset[:filled_t], train_labels[:filled_t]

  return valid_dataset, valid_labels, train_dataset, train_labels
                    
# Requested total number of samples in each split.
train_size = 100000
valid_size = 10000
test_size = 10000

# Training and validation sets are both drawn from the pickles of the large archive.
valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(train_datasets, train_size, valid_size)
# The test set comes from the pickles of the small archive; valid_size defaults to 0 here,
# so the first two return values are None and are discarded.
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

# Print the shapes of all arrays as a quick sanity check.
print('Training:\t', train_dataset.shape, train_labels.shape)
print('Validation:\t', valid_dataset.shape, valid_labels.shape)
print('Testing:\t', test_dataset.shape, test_labels.shape)

Shuffle each dataset.

In [None]:
def randomize(dataset, labels):
  """Return shuffled copies of a dataset and its labels.

  A single random permutation is applied to both arrays, so every image
  keeps its label. The inputs are not modified.

  Args:
    dataset: 3-D array of images, indexed by sample along the first axis.
    labels: 1-D array with one label per sample.

  Returns:
    A `(shuffled_dataset, shuffled_labels)` pair.
  """
  order = np.random.permutation(labels.shape[0])
  return dataset[order, :, :], labels[order]

# Shuffle each split on its own; images and labels of a split share one permutation and stay aligned.
train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)

Let's check that the data is balanced across classes.

In [None]:
# np.bincount counts how often each label value occurs, i.e. the number of samples per class in every split.
print("Training Set Frequency:\t", np.bincount(train_labels))
print("Valid Set Frequency:\t", np.bincount(valid_labels))
print("Test Set Frequency:\t", np.bincount(test_labels))

Last step. Save the data for reuse.

In [None]:
pickle_file = 'notMNIST.pickle'

# Bundle all splits in one dictionary so they can be restored with a single load.
# Built before the file is opened: a missing variable then fails without
# leaving an empty pickle file behind.
save = {
  'train_dataset': train_dataset,
  'train_labels': train_labels,
  'valid_dataset': valid_dataset,
  'valid_labels': valid_labels,
  'test_dataset': test_dataset,
  'test_labels': test_labels,
}

# `with` closes the file even when pickling fails.
with open(pickle_file, 'wb') as f:
  pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)