In [21]:
# For Fashion MNIST:
# strip the header bytes from the IDX files and combine each label with its image data.
# Outputs 2 files: a test dataset and a training dataset.
# The same code will work for regular MNIST too.
import numpy as np


# Prefix joined to every file name below by plain string concatenation;
# an empty prefix leaves the names as relative paths.
dir = ""

# Input files, each list ordered [images, labels].
testFiles = [
    "t10k-images-idx3-ubyte",
    "t10k-labels-idx1-ubyte",
]
trainFiles = [
    "train-images-idx3-ubyte",
    "train-labels-idx1-ubyte",
]

# Names of the combined output files.
newTestFile = "fashion_mnist_test_noheader"
newTrainFile = "fashion_mnist_train_noheader"

In [22]:
def _read32(bytestream):
  dt = np.dtype(np.uint32).newbyteorder('>')
  return np.frombuffer(bytestream.read(4), dtype=dt)[0]

In [23]:
def readLabels(file):
    """Load the label bytes from an IDX label file.

    The file starts with two big-endian unsigned 32-bit header fields
    (magic number, item count) followed by one unsigned byte per label.

    Args:
        file: path of the uncompressed IDX label file.

    Returns:
        1-D ``np.uint8`` array holding ``item count`` labels.

    Raises:
        ValueError: if the header is incomplete, the magic number is not the
            IDX label magic (2049), or the file holds fewer label bytes than
            the header declares.
    """
    with open(file, 'rb') as f:
        header = f.read(8)
        if len(header) != 8:
            raise ValueError('%s: incomplete IDX label header' % file)
        # int() converts the NumPy scalars so later size checks use Python ints.
        magicNum, numItems = (int(v) for v in np.frombuffer(header, dtype='>u4'))
        if magicNum != 2049:
            raise ValueError(
                '%s: bad magic number %d for an IDX label file (expected 2049)'
                % (file, magicNum))
        buf = f.read(numItems)
    # A short read means the file is truncated; fail instead of silently
    # returning fewer labels than the header promises.
    if len(buf) != numItems:
        raise ValueError(
            '%s: header declares %d labels but only %d bytes are present'
            % (file, numItems, len(buf)))
    return np.frombuffer(buf, dtype=np.uint8)

def readImages(file):
    """Load the pixel data from an IDX image file as one flat row per image.

    The file starts with four big-endian unsigned 32-bit header fields
    (magic number, image count, rows, cols) followed by one unsigned byte per
    pixel.

    Args:
        file: path of the uncompressed IDX image file.

    Returns:
        2-D ``np.uint8`` array of shape ``(image count, rows * cols)``;
        for 28x28 images that is ``(image count, 784)``.

    Raises:
        ValueError: if the header is incomplete, the magic number is not the
            IDX image magic (2051), or the file holds fewer pixel bytes than
            the header declares.
    """
    with open(file, 'rb') as f:
        header = f.read(16)
        if len(header) != 16:
            raise ValueError('%s: incomplete IDX image header' % file)
        # Convert to Python ints so the byte-count product below cannot wrap
        # around the way fixed-width uint32 arithmetic would.
        magicNum, num_images, rows, cols = (
            int(v) for v in np.frombuffer(header, dtype='>u4'))
        if magicNum != 2051:
            raise ValueError(
                '%s: bad magic number %d for an IDX image file (expected 2051)'
                % (file, magicNum))
        pixels = rows * cols
        expected = pixels * num_images
        buf = f.read(expected)
    if len(buf) != expected:
        raise ValueError(
            '%s: header declares %d pixel bytes but only %d are present'
            % (file, expected, len(buf)))
    # Row width comes from the header instead of a hard-coded 784, so images
    # of any size are supported.
    return np.frombuffer(buf, dtype=np.uint8).reshape(num_images, pixels)

In [24]:
def writeNewFile(file, labels, images):
    """Write labels and images interleaved into one header-free binary file.

    For every index ``i`` the raw bytes of ``labels[i]`` are written,
    immediately followed by the raw bytes of ``images[i]``. With ``uint8``
    inputs each record is therefore 1 label byte plus one byte per pixel.

    Args:
        file: path of the output file; an existing file is overwritten.
        labels: sequence of bytes-like items (e.g. a 1-D ``np.uint8`` array).
        images: sequence of bytes-like items (e.g. a 2-D ``np.uint8`` array
            with one row per image), the same length as ``labels``.

    Raises:
        ValueError: if ``labels`` and ``images`` differ in length. The check
            runs before the file is opened, so no partial file is left behind.
    """
    if len(labels) != len(images):
        raise ValueError(
            'labels and images must have the same length, got %d and %d'
            % (len(labels), len(images)))
    with open(file, 'wb') as nf:
        for label, image in zip(labels, images):
            nf.write(label)
            nf.write(image)

In [25]:
# Load labels first, then images, for the test and train splits in turn.
# Each file list is ordered [images, labels], hence index 1 for labels.
testLabels, trainLabels = (readLabels(dir + names[1]) for names in (testFiles, trainFiles))
testImages, trainImages = (readImages(dir + names[0]) for names in (testFiles, trainFiles))

In [26]:

# Sanity check: show the shape of every loaded array, one per line.
print(testLabels.shape, trainLabels.shape, testImages.shape, trainImages.shape, sep='\n')

(10000,)
(60000,)
(10000, 784)
(60000, 784)


## MNIST

In [27]:
# Read the label and image arrays again so this section can be run on its own
# (presumably after placing the desired dataset's files under the same names).
testLabels, trainLabels = (readLabels(dir + names[1]) for names in (testFiles, trainFiles))
testImages, trainImages = (readImages(dir + names[0]) for names in (testFiles, trainFiles))

In [28]:
# Write the combined label + image files for the test and training splits.
writeNewFile(file=dir + newTestFile, labels=testLabels, images=testImages)
writeNewFile(file=dir + newTrainFile, labels=trainLabels, images=trainImages)

## Big MNIST

In [45]:
# Replication factor for the enlarged dataset. A multiplier of 10 should give
# files roughly one tenth the size of the multiplier-100 figures quoted below.
multiplier = 10
newTestFile_big = "mnist_test_noheader_med"
newTrainFile_big = "mnist_train_noheader_med"

# Alternative "big" configuration: a multiplier of 100 produces roughly a
# 750 MB test file and a 4.4 GB train file.
#multiplier = 100
#newTestFile_big = 'mnist_test_noheader_big'
#newTrainFile_big = 'mnist_train_noheader_big'

In [46]:
# Load the base arrays that will be replicated: labels first, then images.
testLabels, trainLabels = (readLabels(dir + names[1]) for names in (testFiles, trainFiles))
testImages, trainImages = (readImages(dir + names[0]) for names in (testFiles, trainFiles))

print('Base Shapes: ', *(arr.shape for arr in (testLabels, trainLabels, testImages, trainImages)))

Base Shapes:  (10000,) (60000,) (10000, 784) (60000, 784)


In [47]:
# Repeat every base array `multiplier` times along the sample axis.
# np.tile allocates each result once; growing the arrays with np.append inside
# a loop re-copies the whole accumulated array on every pass, which is
# quadratic in `multiplier` and very slow for the multi-gigabyte train set.
# max(multiplier, 1) keeps the base data intact when multiplier <= 1, matching
# a loop that simply would not run.
testLabels_big = np.tile(testLabels, max(multiplier, 1))
trainLabels_big = np.tile(trainLabels, max(multiplier, 1))

# Images are 2-D (one row per image): repeat rows only, never columns.
testImages_big = np.tile(testImages, (max(multiplier, 1), 1))
trainImages_big = np.tile(trainImages, (max(multiplier, 1), 1))

In [48]:
# Report the shapes of the replicated arrays on a single line.
print(
    'Final shapes after resize: ',
    *(arr.shape for arr in (testLabels_big, trainLabels_big, testImages_big, trainImages_big)),
)


Final shapes after resize:  (100000,) (600000,) (100000, 784) (600000, 784)


In [49]:
# Write the enlarged test and training files.
writeNewFile(file=dir + newTestFile_big, labels=testLabels_big, images=testImages_big)
writeNewFile(file=dir + newTrainFile_big, labels=trainLabels_big, images=trainImages_big)

# Drop the references to the enlarged arrays so their memory can be reclaimed.
del (
    testLabels_big,
    trainLabels_big,
    testImages_big,
    trainImages_big,
)
print('MNIST_BIG files created')

MNIST_BIG files created
