### Process Datasets

In this notebook we process the raw datasets so that they can be used later to train the neural networks and to compute the metrics that evaluate the results those networks obtain.

Import All The Libraries

In [9]:
import gzip
import os, shutil
from os.path import isfile, join
from PIL import Image
import zipfile
import numpy as np
import cv2 as cv2

# Folder that holds the raw, gzip-compressed MNIST IDX files.
mnist_raw_path = './../raw_data/MNIST/raw/'
# NOTE(review): presumably the output folder for processed datasets; it is not
# referenced by any of the cells visible here — confirm before removing.
mnist_new_path = '../datasets/'

This function lists every regular file found directly inside the given directory; here it is used to locate the raw files of the MNIST dataset.

In [10]:
def list_files(path):
    """Return the paths of the regular files located directly inside `path`.

    Each returned entry is `path` joined with the file name. Sub-directories
    are skipped. Entries are sorted by name so the order is reproducible
    (`os.listdir` makes no ordering guarantee).
    """
    # Check each entry under `path` itself rather than under a module-level
    # constant, so the helper works for any directory it is given.
    return [join(path, f) for f in sorted(os.listdir(path)) if isfile(join(path, f))]

In [11]:
# Show every file found in the raw MNIST folder, one path per line.
for raw_file in list_files(mnist_raw_path):
    print(raw_file)

./../raw_data/MNIST/raw/t10k-images-idx3-ubyte.gz
./../raw_data/MNIST/raw/t10k-labels-idx1-ubyte.gz
./../raw_data/MNIST/raw/train-images-idx3-ubyte.gz
./../raw_data/MNIST/raw/train-labels-idx1-ubyte.gz


In [12]:
def _read_idx_images(path):
    """Decode one gzip-compressed IDX3 image file into a (count, rows, cols) uint8 array."""
    with gzip.open(path, 'rb') as stream:                      # 'rb' = read binary
        # Header: four big-endian 32-bit integers.
        magic = int.from_bytes(stream.read(4), 'big')
        if magic != 2051:                                      # 0x00000803 marks an idx3-ubyte image file
            raise ValueError("%s is not an IDX3 image file (magic number %d)" % (path, magic))
        num_imgs = int.from_bytes(stream.read(4), 'big')       # number of images
        rows = int.from_bytes(stream.read(4), 'big')           # rows per image
        cols = int.from_bytes(stream.read(4), 'big')           # columns per image
        pixels = np.frombuffer(stream.read(), dtype=np.uint8)  # everything left is pixel data
    # reshape raises ValueError when the payload size disagrees with the header.
    return pixels.reshape((num_imgs, rows, cols))


def get_images(mnist_path):
    """Load the MNIST test and train images found in `mnist_path` as one array.

    The directory is scanned with `list_files`; the file whose name contains
    "train-images" provides the training images and the one whose name
    contains "t10k-images" provides the test images.

    Returns a uint8 array of shape (n_test + n_train, rows, cols) with the
    test images first, followed by the training images.

    Raises FileNotFoundError if either image file is missing, and ValueError
    if a file does not carry the IDX3 magic number.
    """
    x_train = None
    x_test = None
    for f in list_files(mnist_path):
        # Match on the file name only, so a directory called e.g.
        # "train-images" cannot trigger a false positive.
        name = os.path.basename(f)
        if "train-images" in name:
            x_train = _read_idx_images(f)
        elif "t10k-images" in name:
            x_test = _read_idx_images(f)

    missing = [label for label, imgs in (("train-images", x_train), ("t10k-images", x_test))
               if imgs is None]
    if missing:
        raise FileNotFoundError("No %s file found in %s" % (" / ".join(missing), mnist_path))

    return np.concatenate((x_test, x_train), axis=0)


In [13]:
# Load every MNIST image from the raw folder. get_images concatenates the test
# images followed by the training images, so `data` holds both splits.
data = get_images(mnist_path=mnist_raw_path)
print("Training Shape", data.shape)

Training Shape (70000, 28, 28)


In [8]:
# Scratch cell: export the first 300 images as JPEG files to test the custom loader.
# NOTE(review): this cell reads `data`, which is created by the get_images cell,
# so it must be executed after that cell.
sample_dir = './../datasets/pruebas/pruebas2/'
os.makedirs(sample_dir, exist_ok=True)  # Image.save does not create missing folders
for counter, i in enumerate(data[:300], start=1):
    filename = "mnist_" + str(counter) + ".jpeg"
    im = Image.fromarray(i)
    im.save(sample_dir + filename)

In [14]:
def SaveZipImages(images, dest_folder, num_samples, dataset):
    '''
    Export up to `num_samples` images of a dataset as JPEG files and pack them
    into `<dest_folder>/<dataset>_<num_samples>.zip`.

    The archive contains a single sub-folder `<dataset>_<num_samples>/` holding
    files named `mnist_<i>.jpeg` or `celeba_<i>.jpeg`, with `i` starting at 1.
    An archive of the same name already present in `dest_folder` is overwritten.

    images:
            --> MNIST: array of Images
            --> CELEBA: path for Images Zip
    dest_folder: directory that receives the final zip archive
    num_samples: maximum number of images to export
    dataset: 'MNIST' or 'CELEBA'

    Raises ValueError for any other dataset name (before touching the disk).
    '''
    if dataset not in ('MNIST', 'CELEBA'):
        raise ValueError("Unsupported dataset: %r (expected 'MNIST' or 'CELEBA')" % (dataset,))

    archive_name = dataset + '_' + str(num_samples)
    # os.path.join keeps this working whether or not dest_folder ends with a
    # separator, and for absolute paths as well.
    new_folder = os.path.join(dest_folder, archive_name)  # staging dir, e.g. ../datasets/MNIST_xxxx
    folder = os.path.join(new_folder, archive_name)       # images live one level deeper
    os.makedirs(folder, exist_ok=True)

    try:
        if dataset == 'MNIST':
            for counter, i in enumerate(images[:num_samples], start=1):
                filename = "mnist_" + str(counter) + ".jpeg"
                im = Image.fromarray(i)
                im.save(os.path.join(folder, filename))
        else:  # 'CELEBA'
            with zipfile.ZipFile(images) as imgzip:
                # Skip directory entries explicitly instead of assuming the
                # first entry is one, so exactly num_samples images are taken.
                members = [m for m in imgzip.infolist() if not m.filename.endswith('/')]
                for counter, member in enumerate(members[:num_samples], start=1):
                    filename = 'celeba_' + str(counter) + '.jpeg'
                    with imgzip.open(member) as stream:
                        img = Image.open(stream)
                        img.save(os.path.join(folder, filename))

        # Write the archive straight into dest_folder; make_archive replaces an
        # existing file, so re-running the notebook does not fail.
        shutil.make_archive(os.path.join(dest_folder, archive_name), format='zip', root_dir=new_folder)
    finally:
        # Always drop the staging folder, even if an image failed to save.
        shutil.rmtree(new_folder, ignore_errors=True)


In [16]:
dest_folder = '../datasets/'
celeba_raw_path = './../raw_data/img_align_celeba.zip'

# Build one archive per dataset for each sample count: MNIST from the in-memory
# array, CELEBA from the raw zip file.
for sample_count in (70000, 30000):
    SaveZipImages(images=data, dest_folder=dest_folder, num_samples=sample_count, dataset='MNIST')
    SaveZipImages(images=celeba_raw_path, dest_folder=dest_folder, num_samples=sample_count, dataset='CELEBA')
