# Preprocess the ubyte files into images

In [1]:
import gzip
import csv
import numpy as np
from scipy.misc import imsave
import os

In [2]:
# Create the output directory for the training images. exist_ok=True makes
# re-running the cell harmless while still surfacing real failures (e.g. a
# permission error) instead of silently swallowing them.
os.makedirs("train", exist_ok=True)

In [3]:
# Create the output directory for the test images. exist_ok=True makes
# re-running the cell harmless while still surfacing real failures (e.g. a
# permission error) instead of silently swallowing them.
os.makedirs("test", exist_ok=True)

In [5]:
def extract_data(filename, num_images):
    """Extract the images into a 4D tensor [image index, y, x, channels].

    Skips the 16-byte header of the gzip-compressed file, then reads
    ``num_images`` consecutive 28x28 images stored as one unsigned byte per
    pixel.

    Args:
        filename: Path to the gzip-compressed image file.
        num_images: Number of images to read from the start of the file.

    Returns:
        A float32 array of shape ``(num_images, 28, 28, 1)``. Pixel values
        are NOT rescaled; they keep their raw [0, 255] range.

    Raises:
        ValueError: If the file contains fewer than ``num_images`` images.
    """
    image_size = 28
    header_bytes = 16
    expected_bytes = image_size * image_size * num_images

    print('Extracting', filename)
    with gzip.open(filename) as bytestream:
        bytestream.read(header_bytes)  # discard the header
        buf = bytestream.read(expected_bytes)

    # Fail early with a clear message rather than letting reshape complain
    # about an array size that does not factor into the requested shape.
    if len(buf) != expected_bytes:
        raise ValueError(
            "%s: expected %d bytes of pixel data for %d images, got %d"
            % (filename, expected_bytes, num_images, len(buf))
        )

    data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)
    return data.reshape(num_images, image_size, image_size, 1)

In [3]:
def extract_labels(filename, num_images):
    """Read label IDs from a gzip-compressed label file.

    The first 8 bytes of the stream are skipped, then up to ``num_images``
    single-byte labels are read and widened to int64.

    Args:
        filename: Path to the gzip-compressed label file.
        num_images: Number of labels to read from the start of the file.

    Returns:
        A 1-D int64 array holding the labels that were read.
    """
    bytes_per_label = 1
    print('Extracting', filename)
    with gzip.open(filename) as stream:
        stream.read(8)  # discard the header
        raw = stream.read(bytes_per_label * num_images)
    as_bytes = np.frombuffer(raw, dtype=np.uint8)
    return as_bytes.astype(np.int64)

In [13]:
# Load the first 70000 training images and their labels from the EMNIST
# "byclass" archives under data/.
train_data = extract_data(
    filename="data/emnist-byclass-train-images-idx3-ubyte.gz",
    num_images=70000,
)
train_labels = extract_labels(
    filename="data/emnist-byclass-train-labels-idx1-ubyte.gz",
    num_images=70000,
)

Extracting data/emnist-byclass-train-images-idx3-ubyte.gz
Extracting data/emnist-byclass-train-labels-idx1-ubyte.gz


In [17]:
# Save every training image as train/<index>.jpg and record its path and
# label as one row of train-labels.csv.
# newline='' is required by the csv module: the writer emits its own line
# terminators, and letting the file object translate them as well produces
# malformed rows on platforms that rewrite "\n".
# NOTE(review): scipy.misc.imsave was removed from recent SciPy releases;
# this cell needs an old SciPy or a replacement image writer -- confirm.
with open("train-labels.csv", 'w', newline='') as csvFile:
    writer = csv.writer(csvFile, delimiter=',', quotechar='"')
    for i, image in enumerate(train_data):
        image_path = "train/" + str(i) + ".jpg"
        # Drop the trailing channel axis so a 2-D array is written.
        imsave(image_path, image[:, :, 0])
        writer.writerow([image_path, train_labels[i]])


In [9]:
# Load the first 15000 test images and their labels from the EMNIST
# "byclass" archives under data/.
test_data = extract_data(
    filename="data/emnist-byclass-test-images-idx3-ubyte.gz",
    num_images=15000,
)
test_labels = extract_labels(
    filename="data/emnist-byclass-test-labels-idx1-ubyte.gz",
    num_images=15000,
)

Extracting data/emnist-byclass-test-images-idx3-ubyte.gz
Extracting data/emnist-byclass-test-labels-idx1-ubyte.gz


In [12]:
# Save every test image as test/<index>.jpg and record its path and label
# as one row of test-labels.csv.
# newline='' is required by the csv module: the writer emits its own line
# terminators, and letting the file object translate them as well produces
# malformed rows on platforms that rewrite "\n".
# NOTE(review): scipy.misc.imsave was removed from recent SciPy releases;
# this cell needs an old SciPy or a replacement image writer -- confirm.
with open("test-labels.csv", 'w', newline='') as csvFile:
    writer = csv.writer(csvFile, delimiter=',', quotechar='"')
    for i, image in enumerate(test_data):
        image_path = "test/" + str(i) + ".jpg"
        # Drop the trailing channel axis so a 2-D array is written.
        imsave(image_path, image[:, :, 0])
        writer.writerow([image_path, test_labels[i]])