# notMNIST Character Recognition using Tensorflow

In this small project, we develop a simple neural network to recognize characters in the [notMNIST](http://yaroslavvb.blogspot.ca/2011/09/notmnist-dataset.html) dataset using *TensorFlow*. 

The original example was presented by [Udacity](https://www.udacity.com/).

**Input:**
- A set of notMNIST character images of the letters A to J

**Goal:**
- Predict the letter shown in a sample notMNIST image

In [1]:
# Import required modules
import hashlib
import os
import pickle
from urllib.request import urlretrieve

import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import resample
from tqdm import tqdm
from zipfile import ZipFile

print('All modules imported.')

All modules imported.


## Download and uncompress the notMNIST dataset

In [None]:
def download(url, file):
    """
    Fetch <url> into a local file unless that file is already present
    :param url: URL to file
    :param file: Local file path
    """
    # An existing file is treated as an already-completed download.
    if os.path.isfile(file):
        return

    print('Downloading ' + file + '...')
    urlretrieve(url, file)
    print('Download Finished')

def _md5_hexdigest(path, chunk_size=1 << 20):
    """
    Compute the MD5 hex digest of a file without reading it into memory at once
    :param path: Local file path
    :param chunk_size: Number of bytes read per iteration
    :return: Hexadecimal MD5 digest of the file contents
    """
    digest = hashlib.md5()
    # The context manager closes the handle even if reading fails.
    with open(path, 'rb') as stream:
        for chunk in iter(lambda: stream.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


# Download the training and test dataset.
download('https://s3.amazonaws.com/udacity-sdc/notMNIST_train.zip', 'notMNIST_train.zip')
download('https://s3.amazonaws.com/udacity-sdc/notMNIST_test.zip', 'notMNIST_test.zip')

# Make sure the files aren't corrupted (MD5 is used here as a checksum only)
assert _md5_hexdigest('notMNIST_train.zip') == 'c8673b3f28f489e9cdf3a3d74e2ac8fa',\
        'notMNIST_train.zip file is corrupted.  Remove the file and try again.'
assert _md5_hexdigest('notMNIST_test.zip') == '5d3c7e653e63471c88df796156a9dfa9',\
        'notMNIST_test.zip file is corrupted.  Remove the file and try again.'

# Wait until you see that all files have been downloaded.
print('All files downloaded.')

In [None]:
def uncompress_features_labels(file):
    """
    Read every image stored in a zip archive and pair it with its letter label
    :param file: The zip file to extract the data from
    :return: Tuple (features, labels) of numpy arrays built from the non-directory entries
    """
    pixel_rows = []
    letters = []

    with ZipFile(file) as archive:
        # Wrap the member list so progress is reported while iterating
        for member in tqdm(archive.namelist(), unit='files'):
            # Entries ending with '/' are directories and are skipped
            if member.endswith('/'):
                continue

            with archive.open(member) as handle:
                picture = Image.open(handle)
                picture.load()
                # Flatten to one dimension; float32 keeps memory usage down
                pixels = np.array(picture, dtype=np.float32).flatten()

            # The first character of the member's base name is the letter of the image
            letter = os.path.basename(member)[0]

            pixel_rows.append(pixels)
            letters.append(letter)

    return np.array(pixel_rows), np.array(letters)

# Get the features and labels from the zip files
train_features, train_labels = uncompress_features_labels('notMNIST_train.zip')
test_features, test_labels = uncompress_features_labels('notMNIST_test.zip')

# Limit the amount of data to work with a docker container
# NOTE(review): sklearn's resample samples with replacement by default and no
# random_state is passed here, so the subset can contain duplicate images and
# differs between runs - confirm this is intended.
docker_size_limit = 150000
train_features, train_labels = resample(train_features, train_labels, n_samples=docker_size_limit)

# Set flags for feature engineering.  This will prevent you from skipping an important step.
# NOTE(review): in the visible part of the notebook only is_labels_encod is ever
# set to True; nothing calls normalize_grayscale or updates is_features_normal.
is_features_normal = False
is_labels_encod = False

# Wait until you see that all features and labels have been uncompressed.
print('All features and labels uncompressed.')

## Data preparation
### Normalize image values
For better convergence, the image values should be small and on a common scale (ideally zero mean and unit variance, i.e. $\mu = 0$ and $\sigma = 1$). In fact, here we use Min-Max scaling to map the values into the range between 0.1 and 0.9.

In [None]:
def normalize_grayscale(image_data):
    """
    Normalize the image data with Min-Max scaling to a range of [0.1, 0.9]

    The minimum and maximum are taken over the whole input, so a 2-D batch of
    flattened images is scaled with one shared range.
    :param image_data: The image data to be normalized (array-like of any shape)
    :return: Normalized image data as a floating-point numpy array of the same shape
    """
    data = np.asarray(image_data)
    float_dtype = np.result_type(data.dtype, np.float32)

    if data.size == 0:
        # Nothing to scale; min()/max() would raise on an empty array
        return data.astype(float_dtype)

    # ndarray.min()/max() reduce over every axis. The builtin min()/max() iterate
    # over rows instead and fail on arrays with more than one dimension.
    v_min = data.min()
    v_max = data.max()
    span = v_max - v_min

    if span == 0:
        # Constant input: avoid 0/0 and map every value to the lower bound
        return np.full(data.shape, 0.1, dtype=float_dtype)

    return 0.1 + ((data - v_min) / span * 0.8)

### Label encoding

In [None]:
# One-Hot encode the labels once; the flag keeps a re-run of this cell from encoding twice
if not is_labels_encod:
    # Learn the label classes from the training labels (fit returns the fitted encoder)
    encoder = LabelBinarizer().fit(train_labels)

    # Encode both sets and cast to float32 so they can be multiplied against
    # the float32 features in TensorFlow
    train_labels = encoder.transform(train_labels).astype(np.float32)
    test_labels = encoder.transform(test_labels).astype(np.float32)
    is_labels_encod = True

print('Labels One-Hot Encoded')

### Splitting data into training and validation sets

In [None]:
# Hold out 5% of the training data for validation; the fixed random_state
# makes the randomized split reproducible.
(train_features, valid_features,
 train_labels, valid_labels) = train_test_split(
    train_features, train_labels, test_size=0.05, random_state=832289)

print('Training features and labels randomized and split.')

In [None]:
# Save the data for easy access
pickle_file = 'notMNIST.pickle'
# An existing file is treated as a valid cache and is never overwritten
if not os.path.isfile(pickle_file):
    print('Saving data to pickle file...')
    try:
        # Write to the same path that was checked above instead of a second
        # hard-coded copy of the file name
        with open(pickle_file, 'wb') as pfile:
            pickle.dump(
                {
                    'train_dataset': train_features,
                    'train_labels': train_labels,
                    'valid_dataset': valid_features,
                    'valid_labels': valid_labels,
                    'test_dataset': test_features,
                    'test_labels': test_labels,
                },
                pfile, pickle.HIGHEST_PROTOCOL)
    except Exception as e:
        print('Unable to save data to', pickle_file, ':', e)
        # Remove a partially written file so the next run does not mistake it
        # for a complete cache
        if os.path.isfile(pickle_file):
            os.remove(pickle_file)
        raise

print('Data cached in pickle file.')

### Checkpoint
Let's reload all the data we saved to the pickle file earlier.

In [None]:
%matplotlib inline

# Load the modules
import pickle
import math

import numpy as np
import tensorflow as tf
from tqdm import tqdm
import matplotlib.pyplot as plt

# Reload the data
# The pickle file is expected to be the one written by this notebook; never
# unpickle files from an untrusted source.
pickle_file = 'notMNIST.pickle'
with open(pickle_file, 'rb') as f:
    pickle_data = pickle.load(f)

# Unpack the cached datasets into the names used by the rest of the notebook
train_features, train_labels = pickle_data['train_dataset'], pickle_data['train_labels']
valid_features, valid_labels = pickle_data['valid_dataset'], pickle_data['valid_labels']
test_features, test_labels = pickle_data['test_dataset'], pickle_data['test_labels']
del pickle_data  # Free up memory

print('Data and modules loaded.')

## Build the model

In [None]:
# Input placeholders, one per data split.  The first dimension is None so any
# batch size can be fed; the second is the number of columns in the feature arrays.
features = {'train': tf.placeholder(tf.float32, shape=(None, train_features.shape[1])),
            'valid': tf.placeholder(tf.float32, shape=(None, valid_features.shape[1])),
            'test': tf.placeholder(tf.float32, shape=(None, test_features.shape[1]))}

# Label placeholders, one per data split, sized to the One-Hot label width.
labels = {'train': tf.placeholder(tf.float32, shape=(None, train_labels.shape[1])),
            'valid': tf.placeholder(tf.float32, shape=(None, valid_labels.shape[1])),
            'test': tf.placeholder(tf.float32, shape=(None, test_labels.shape[1]))}

# Layer sizes.  The output width follows the label width so the logits always
# line up with the label placeholders.
n_hidden_units = 10
n_classes = train_labels.shape[1]

weights = {'hidden_layer_1': tf.Variable(tf.truncated_normal((train_features.shape[1], n_hidden_units))),
           'out': tf.Variable(tf.truncated_normal((n_hidden_units, n_classes)))}

# Bias shapes must be 1-D; note the trailing comma, since (10) is just the int 10.
biases = {'hidden_layer_1': tf.Variable(tf.truncated_normal((n_hidden_units,))),
          'out': tf.Variable(tf.truncated_normal((n_classes,)))}

# Hidden layer: relu(x . W1 + b1), wired to the training placeholder.
# NOTE(review): the 'valid' and 'test' placeholders are not connected to this
# graph; if all splits share the same feature width they can be fed through
# features['train'] as well - confirm how evaluation is meant to be wired.
hidden_layer_1 = tf.add(tf.matmul(features['train'], weights['hidden_layer_1']), biases['hidden_layer_1'])
hidden_layer_1 = tf.nn.relu(hidden_layer_1)

# Output layer: unnormalized class scores (logits)
logits = tf.add(tf.matmul(hidden_layer_1, weights['out']), biases['out'])

                                         
