In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from six.moves import cPickle as pickle

In [2]:
train_datasets = os.listdir('notMNIST_large')
train_datasets = [i for i in train_datasets if i[-7:] == ".pickle"]
print(train_datasets)

test_datasets = os.listdir('notMNIST_small')
test_datasets = [i for i in test_datasets if i[-7:] == ".pickle"]
print(test_datasets)

['F.pickle', 'J.pickle', 'C.pickle', 'A.pickle', 'D.pickle', 'I.pickle', 'H.pickle', 'B.pickle', 'G.pickle', 'E.pickle']
['F.pickle', 'J.pickle', 'C.pickle', 'A.pickle', 'D.pickle', 'I.pickle', 'H.pickle', 'B.pickle', 'G.pickle', 'E.pickle']


In [3]:
image_size = 28

In [4]:
def make_arrays(nb_rows, img_size):
  if nb_rows:
    dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
    labels = np.ndarray(nb_rows, dtype=np.int32)
  else:
    dataset, labels = None, None
  return dataset, labels

In [5]:
def merge_datasets(pickle_files,train_size,valid_size=0):
    num_classes = len(pickle_files)
    valid_dataset, valid_labels = make_arrays(valid_size,image_size)
    train_dataset, train_labels = make_arrays(train_size,image_size)
    
    vsize_per_class = valid_size // num_classes
    tsize_per_class = train_size // num_classes
    
    start_v, start_t= 0, 0
    end_v, end_t = vsize_per_class, tsize_per_class
    end_l = vsize_per_class+tsize_per_class
    
    for label, pickle_file in enumerate(pickle_files): 
        try:
            
            if(valid_size is not 0):
                pickle_file = 'notMNIST_large/' + pickle_file
            else:
                pickle_file = 'notMNIST_small/' + pickle_file
            
            print(pickle_file)
            with open(pickle_file, 'rb') as f:
                letter_set = pickle.load(f)
                
                np.random.shuffle(letter_set)
                if valid_dataset is not None:
                    valid_letter = letter_set[:vsize_per_class, :, :]
                    valid_dataset[start_v:end_v, :, :] = valid_letter
                    valid_labels[start_v:end_v] = label
                    start_v += vsize_per_class
                    end_v += vsize_per_class
                    
                train_letter = letter_set[vsize_per_class:end_l, :, :]
                print(train_letter.shape)
                print(end_l-vsize_per_class)
                print(end_t-start_t)
                train_dataset[start_t:end_t, :, :] = train_letter
                train_labels[start_t:end_t] = label
                start_t += tsize_per_class
                end_t += tsize_per_class
        
        except Exception as e:
            print('Unable to process data from', pickle_file, ':', e)
            raise
            
    return valid_dataset, valid_labels, train_dataset, train_labels

In [6]:
train_size = 200000
valid_size = 10000
test_size = 10000

In [7]:
valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
  train_datasets, train_size, valid_size)

notMNIST_large/F.pickle
(20000, 28, 28)
20000
20000
notMNIST_large/J.pickle
(20000, 28, 28)
20000
20000
notMNIST_large/C.pickle
(20000, 28, 28)
20000
20000
notMNIST_large/A.pickle
(20000, 28, 28)
20000
20000
notMNIST_large/D.pickle
(20000, 28, 28)
20000
20000
notMNIST_large/I.pickle
(20000, 28, 28)
20000
20000
notMNIST_large/H.pickle
(20000, 28, 28)
20000
20000
notMNIST_large/B.pickle
(20000, 28, 28)
20000
20000
notMNIST_large/G.pickle
(20000, 28, 28)
20000
20000
notMNIST_large/E.pickle
(20000, 28, 28)
20000
20000


In [8]:
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

notMNIST_small/F.pickle
(1000, 28, 28)
1000
1000
notMNIST_small/J.pickle
(1000, 28, 28)
1000
1000
notMNIST_small/C.pickle
(1000, 28, 28)
1000
1000
notMNIST_small/A.pickle
(1000, 28, 28)
1000
1000
notMNIST_small/D.pickle
(1000, 28, 28)
1000
1000
notMNIST_small/I.pickle
(1000, 28, 28)
1000
1000
notMNIST_small/H.pickle
(1000, 28, 28)
1000
1000
notMNIST_small/B.pickle
(1000, 28, 28)
1000
1000
notMNIST_small/G.pickle
(1000, 28, 28)
1000
1000
notMNIST_small/E.pickle
(1000, 28, 28)
1000
1000


In [9]:
print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)

Training: (200000, 28, 28) (200000,)
Validation: (10000, 28, 28) (10000,)
Testing: (10000, 28, 28) (10000,)


In [10]:
def randomize(dataset, labels):
  print(labels.shape)
  permutation = np.random.permutation(labels.shape[0])
  print(permutation)
  shuffled_dataset = dataset[permutation,:,:]
  shuffled_labels = labels[permutation]
  return shuffled_dataset, shuffled_labels


In [11]:
train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)

(200000,)
[ 71901  68816 104943 ... 117402 126177  43233]
(10000,)
[7659 9240 8746 ... 1331 9960 8649]
(10000,)
[8605 4321 8246 ... 8529 2283 2263]


In [12]:
data_root = '.'
pickle_file = os.path.join(data_root, 'notMNIST.pickle')

In [13]:
try:
  f = open(pickle_file, 'wb')
  save = {
    'train_dataset': train_dataset,
    'train_labels': train_labels,
    'valid_dataset': valid_dataset,
    'valid_labels': valid_labels,
    'test_dataset': test_dataset,
    'test_labels': test_labels,
    }
  pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
  f.close()
except Exception as e:
  print('Unable to save data to', pickle_file, ':', e)
  raise

In [14]:
statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)

Compressed pickle size: 690800512
