### Libraries

In [1]:
import requests    # send HTTP requests
import os          # interact with the operating system 

import gzip        # compress (or decompress) like the GNU program gzip (or gunzip)
import shutil      # perform high-level operations on files and collection of files

import numpy as np # numerical computation

import pickle      # save data structure

### Download the necessary files from the Modified National Institute of Standards and Technology ([MNIST](http://yann.lecun.com/exdb/mnist/)) database

Hypertext Transfer Protocol (HTTP) is designed to enable communication between clients and servers.  
**GET**  is used to *request* data from a specified resource.  
**POST** is used to *send* data to a server to create/update a resource.
[Ref.](https://www.w3schools.com/tags/ref_httpmethods.asp)

In [2]:
# Download the four compressed MNIST archives into ./data, skipping any file
# that is already present either compressed (.gz) or decompressed.
list_file = ['train-images-idx3-ubyte.gz',
             'train-labels-idx1-ubyte.gz',
             't10k-images-idx3-ubyte.gz',
             't10k-labels-idx1-ubyte.gz']

folder = './data'
os.makedirs(folder, exist_ok=True)                                  # create folder data if it does not exist

url = 'http://yann.lecun.com/exdb/mnist/'
for file in list_file:
    new_file = os.path.join(folder, file)                           # path of the compressed archive
    unzipped_file = os.path.join(folder, file.split('.')[0])        # path of the same file without its extension
    cond = os.path.isfile(new_file) or os.path.isfile(unzipped_file)  # present with or without its extension?
    if not cond:
        # Fetch first and fail loudly on an HTTP error: opening the target file
        # before the request would leave an empty or error-page archive behind
        # that the next run would wrongly report as "already there".
        r = requests.get(url + file, timeout=60)
        r.raise_for_status()
        with open(new_file, 'wb') as f:
            f.write(r.content)                                      # copy the downloaded bytes into the archive file

    # Measure whichever file actually exists: the archive may be absent when
    # only the decompressed file is present, and getsize() on it would raise.
    present_file = new_file if os.path.isfile(new_file) else unzipped_file
    print(os.path.basename(present_file),
          ' is already there' if cond else ' has been downloaded',  # In order to check if the sizes are the same as the ones indicated on the MNIST site.
          ' with a size of ', os.path.getsize(present_file), ' bytes')

train-images-idx3-ubyte.gz  has been downloaded  with a size of  9912422  bytes
train-labels-idx1-ubyte.gz  has been downloaded  with a size of  28881  bytes
t10k-images-idx3-ubyte.gz  has been downloaded  with a size of  1648877  bytes
t10k-labels-idx1-ubyte.gz  has been downloaded  with a size of  4542  bytes


### Decompress the files

In [3]:
# Decompress every archive of list_file next to it in `folder`, unless the
# decompressed file is already there.
for file in list_file:
    target_name = file.split('.gz')[0]                 # archive name without its '.gz' suffix
    unzip_file = os.path.join(folder, target_name)
    cond = os.path.isfile(unzip_file)
    if cond:
        status = 'Already unzip '
    else:
        zip_file = os.path.join(folder, file)
        # gzip.open yields a file object that decompresses on read;
        # copyfileobj streams its content into the output file.
        with gzip.open(zip_file, 'rb') as zip_f, open(unzip_file, 'wb') as unzip_f:
            shutil.copyfileobj(zip_f, unzip_f)
        status = 'Unzip '
    print(status, file, ' as ', target_name)

Unzip  train-images-idx3-ubyte.gz  as  train-images-idx3-ubyte
Unzip  train-labels-idx1-ubyte.gz  as  train-labels-idx1-ubyte
Unzip  t10k-images-idx3-ubyte.gz  as  t10k-images-idx3-ubyte
Unzip  t10k-labels-idx1-ubyte.gz  as  t10k-labels-idx1-ubyte


### Adapting the data in order to operate on it

This part is inspired by the work of **Ghosh 4 AI**: see his [video](https://www.youtube.com/watch?v=6xar6bxD80g) and his [GitHub repository](https://github.com/Ghosh4AI/Data-Processors). Thanks to him!

Organization of the binary files  
The layouts of the training files for images and labels are shown below; the test files follow the same layout, only the number of images/labels changes.
![training](./images/image_training.png)  

![training](./images/label_training.png)  

In [4]:
# Parse the decompressed MNIST binary files of `folder` into numpy arrays.
# dct_type receives the header information, dct_data the arrays themselves.
dct_type = {'images': {'train': 0, 'test': 0, 'row_nb': 0, 'col_nb': 0},              # to retrieve the header information of the binary files
            'labels': {'train': 0, 'test': 0}}

dct_data = {'images': {'train': None, 'test': None},                                  # to create the data structure
            'labels': {'train': None, 'test': None}}

idx_magic = {'images': 2051, 'labels': 2049}                                          # expected magic number (first 4 header bytes) per kind of file

# Consider only the decompressed MNIST files. Filtering on "not .gz" alone would
# also pick up anything else stored in the folder (e.g. the 'mnist_data.pkl'
# written by a previous run, or hidden files/directories) and parse it as labels.
list_file = [f for f in sorted(os.listdir(folder))
             if f.endswith('-ubyte') and os.path.isfile(os.path.join(folder, f))]
for file in list_file:
    nature = 'images' if 'images' in file else 'labels'
    category = 'train' if 'train' in file else 'test'
    offset = 16 if nature == 'images' else 8                                          # index from which the header information stops and the data appears in the binary files
    with open(os.path.join(folder, file), 'rb') as f:
        content = f.read()

    magic = int.from_bytes(content[0: 4], byteorder='big')
    if magic != idx_magic[nature]:                                                    # truncated, corrupted or foreign file
        raise ValueError('{}: unexpected magic number {} (expected {})'.format(file, magic, idx_magic[nature]))

    dct_type[nature][category] = int.from_bytes(content[4: 8], byteorder='big')       # fill out the 'train' and 'test' size information from the header
    tpl = (dct_type[nature][category], )                                              # tuple containing the number of images (or labels)
    if nature == 'images':
        dct_type[nature]['row_nb'] = int.from_bytes(content[8: 12], byteorder='big')  # for images the number of pixels by row
        dct_type[nature]['col_nb'] = int.from_bytes(content[12: 16], byteorder='big') # by column
        tpl += (dct_type[nature]['row_nb'], dct_type[nature]['col_nb'], )             # add the 2 last information to the tuple

    data = np.frombuffer(content, dtype=np.dtype('u1'), offset=offset)                # translate the binary data into integers (np.dtype('u1'): unsigned byte)
    dct_data[nature][category] = data.reshape(tpl)                                    # reshape the data as a vector (for labels) or a 3-D array (for images)

#### Save data structure

In [5]:
# Serialize dct_data to disk so that it can be reloaded directly later on,
# without parsing the binary files again.
pickle_path = os.path.join(folder, 'mnist_data.pkl')
with open(pickle_path, 'wb') as pickle_data:
    pickle.dump(dct_data, pickle_data)