In [1]:
import hashlib
import os
import pickle
from urllib.request import urlretrieve

import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import resample
from tqdm import tqdm
from zipfile import ZipFile

print("Modules successfully imported")

Modules successfully imported


In [2]:
# This function downloads the file at the given url and stores it under the given path
def download(url, file):
    """Download `url` to the local path `file` unless that file already exists.

    The payload is first written to a temporary `<file>.part` path and only
    renamed to `file` once the transfer has finished. An interrupted download
    therefore never leaves a truncated file at `file` that a later run would
    mistake for a complete one and skip.

    Args:
        url: Address to fetch; handed unchanged to `urlretrieve`.
        file: Destination path (string).

    Raises:
        Any exception raised by `urlretrieve` or `os.replace`; the temporary
        partial file is removed before the exception propagates.
    """
    # Nothing to do when the file is already present
    if os.path.isfile(file):
        return

    print('Downloading ' + file + ' ...')
    partial = file + '.part'
    try:
        urlretrieve(url, partial)
        # Publish the file under its final name only after a complete transfer
        os.replace(partial, file)
    finally:
        # On success the partial file was renamed away; on failure clean it up
        if os.path.isfile(partial):
            os.remove(partial)
    print('Downloading Complete')

# Datasets are provided by udacity
download('https://s3.amazonaws.com/udacity-sdc/notMNIST_train.zip', 'notMNIST_train.zip')
download('https://s3.amazonaws.com/udacity-sdc/notMNIST_test.zip', 'notMNIST_test.zip')


def _md5_hexdigest(path, chunk_size=1 << 20):
    """Return the hexadecimal MD5 digest of the file at `path`.

    The file is read in `chunk_size`-byte pieces inside a `with` block, so the
    handle is always closed and the whole archive is never held in memory.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as stream:
        # iter(callable, sentinel) keeps reading until read() returns b'' (EOF)
        for chunk in iter(lambda: stream.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


# Make sure that the files are not corrupted by checking their hashes
assert _md5_hexdigest('notMNIST_train.zip') == 'c8673b3f28f489e9cdf3a3d74e2ac8fa',\
        'notMNIST_train.zip file is corrupted.  Remove the file and try again.'
assert _md5_hexdigest('notMNIST_test.zip') == '5d3c7e653e63471c88df796156a9dfa9',\
        'notMNIST_test.zip file is corrupted.  Remove the file and try again.'

print('All files downloaded.')

All files downloaded.


In [8]:
# Read every image stored in a zip archive into arrays of features and labels
def uncompress_features_labels(file):
    """Load all images from the zip archive `file`.

    Each non-directory archive member is opened as an image, converted to a
    float32 array and flattened into a 1-D feature vector. Its label is the
    first character of the member's base file name.

    Args:
        file: Path of the zip archive to read.

    Returns:
        A tuple `(features, labels)` of NumPy arrays built from the collected
        feature vectors and label characters, in archive order.
    """
    pixel_rows = []
    letters = []

    with ZipFile(file) as archive:
        # tqdm wraps the member list only to display a progress bar
        progress = tqdm(archive.namelist(), unit='files')

        for member in progress:
            # Entries ending in '/' are directories, not images
            if member.endswith('/'):
                continue

            with archive.open(member) as handle:
                picture = Image.open(handle)
                # Force the pixel data to be read while the handle is still open
                picture.load()
                pixels = np.array(picture, dtype=np.float32).flatten()

            letter = os.path.basename(member)[0]
            pixel_rows.append(pixels)
            letters.append(letter)

    return np.array(pixel_rows), np.array(letters)
        
# Load both downloaded archives into in-memory (features, labels) arrays
train_archive = 'notMNIST_train.zip'
test_archive = 'notMNIST_test.zip'
train_features, train_labels = uncompress_features_labels(train_archive)
test_features, test_labels = uncompress_features_labels(test_archive)


  0%|          | 0/210001 [00:00<?, ?files/s][A
  0%|          | 2/210001 [00:00<4:01:59, 14.46files/s][A
  0%|          | 419/210001 [00:00<2:49:18, 20.63files/s][A
  0%|          | 856/210001 [00:00<1:58:30, 29.41files/s][A
  1%|          | 1297/210001 [00:00<1:23:01, 41.90files/s][A
  1%|          | 1745/210001 [00:00<58:13, 59.62files/s]  [A
  1%|          | 2185/210001 [00:00<40:54, 84.68files/s][A
  1%|▏         | 2645/210001 [00:00<28:47, 120.02files/s][A
  1%|▏         | 3073/210001 [00:00<20:21, 169.42files/s][A
  2%|▏         | 3528/210001 [00:00<14:26, 238.22files/s][A
  2%|▏         | 3948/210001 [00:01<10:22, 330.91files/s][A
  2%|▏         | 4366/210001 [00:01<07:29, 457.18files/s][A
  2%|▏         | 4818/210001 [00:01<05:27, 625.95files/s][A
  3%|▎         | 5267/210001 [00:01<04:02, 843.66files/s][A
  3%|▎         | 5696/210001 [00:01<03:04, 1110.27files/s][A
  3%|▎         | 6145/210001 [00:01<02:22, 1433.67files/s][A
  3%|▎         | 6607/210001 [00:0

 53%|█████▎    | 110481/210001 [00:26<00:22, 4357.99files/s][A
 53%|█████▎    | 110932/210001 [00:26<00:22, 4401.28files/s][A
 53%|█████▎    | 111387/210001 [00:27<00:22, 4440.02files/s][A
 53%|█████▎    | 111832/210001 [00:27<00:22, 4411.05files/s][A
 53%|█████▎    | 112288/210001 [00:27<00:21, 4454.46files/s][A
 54%|█████▎    | 112734/210001 [00:27<00:22, 4295.31files/s][A
 54%|█████▍    | 113171/210001 [00:27<00:22, 4315.49files/s][A
 54%|█████▍    | 113641/210001 [00:27<00:21, 4422.75files/s][A
 54%|█████▍    | 114085/210001 [00:27<00:21, 4390.93files/s][A
 55%|█████▍    | 114526/210001 [00:27<00:22, 4275.27files/s][A
 55%|█████▍    | 114978/210001 [00:27<00:21, 4343.48files/s][A
 55%|█████▍    | 115439/210001 [00:28<00:21, 4419.05files/s][A
 55%|█████▌    | 115883/210001 [00:28<00:21, 4387.24files/s][A
 55%|█████▌    | 116323/210001 [00:28<00:21, 4346.13files/s][A
 56%|█████▌    | 116779/210001 [00:28<00:21, 4405.58files/s][A
 56%|█████▌    | 117237/210001 [00:28<00

In [9]:
# Cap on the number of training samples kept
# (the name suggests a Docker memory constraint -- TODO confirm)
docker_size_limit = 150000
# Seeded so that the drawn subsample is identical on every Restart & Run All.
# NOTE: sklearn's resample draws with replacement by default, so the result
# may contain duplicate samples.
train_features, train_labels = resample(
    train_features, train_labels, n_samples=docker_size_limit, random_state=832289)

# Set flags which will be used later to avoid normalizing/encoding the data twice
is_features_normal = False
is_labels_encod = False

print('All features and labels uncompressed.')

All features and labels uncompressed.


In [10]:
# Min-max scaling: maps grayscale values in the range 0-255 onto 0.1-0.9
def normalize_grayscale(image_data):
    """Linearly rescale grayscale values from [0, 255] to [0.1, 0.9].

    Args:
        image_data: Scalar or NumPy array of grayscale intensities.

    Returns:
        The rescaled value(s), computed as
        `0.1 + (image_data - 0) * (0.9 - 0.1) / (255 - 0)`.
    """
    low, high = 0.1, 0.9
    pixel_min, pixel_max = 0, 255

    shifted = image_data - pixel_min
    scaled = (shifted * (high - low)) / (pixel_max - pixel_min)
    return low + scaled


# Normalize the train and test features exactly once. The flag is flipped
# afterwards so that re-running this cell does not rescale data that has
# already been normalized.
if not is_features_normal:
    train_features = normalize_grayscale(train_features)
    test_features = normalize_grayscale(test_features)
    is_features_normal = True

print('Data normalized')

Data normalized


In [11]:
# One-hot encode the labels with sklearn's LabelBinarizer.
# The encoder is fitted on the training labels only and then reused for the
# test labels; the flag prevents encoding twice when the cell is re-run.
if not is_labels_encod:
    encoder = LabelBinarizer()
    # fit_transform == fit followed by transform on the same labels
    train_labels = encoder.fit_transform(train_labels).astype(np.float32)
    test_labels = encoder.transform(test_labels).astype(np.float32)
    is_labels_encod = True

In [12]:
# Split off 5% of the training data (test_size=0.05) as a validation set.
# The fixed random_state makes the split reproducible.
train_features, valid_features, train_labels, valid_labels = train_test_split(
    train_features, train_labels, test_size=0.05, random_state=832289)


In [13]:
# Save all datasets in one pickle file which can later be retrieved for learning.
# The file is only written when it does not exist yet.
pickle_file = 'notMNIST.pickle'
if not os.path.isfile(pickle_file):
    print('creating pickle file')

    try:
        # 'wb': pickle.dump needs a binary handle opened for writing
        with open(pickle_file, 'wb') as pfile:
            pickle.dump(
                {
                    'train_dataset': train_features,
                    'train_labels': train_labels,
                    'valid_dataset': valid_features,
                    'valid_labels': valid_labels,
                    'test_dataset': test_features,
                    'test_labels': test_labels,
                },
                pfile, pickle.HIGHEST_PROTOCOL)
    except Exception as e:
        print('Unable to save data to', pickle_file, ':', e)
        # Remove a half-written cache; otherwise the isfile() guard above
        # would treat it as valid and skip regeneration on the next run.
        if os.path.isfile(pickle_file):
            os.remove(pickle_file)
        raise

print('Data cached in pickle file.')

Data cached in pickle file.
