## Context

This dataset was created by Yaroslav Bulatov by taking some publicly available fonts and extracting glyphs from them to make a dataset similar to MNIST. There are 10 classes, with letters A-J.

## Content

A set of training and test images of the letters A to J in various typefaces. Each image is 28x28 pixels.

## Acknowledgements

The dataset can be found on the TensorFlow GitHub page as well as on Yaroslav's blog, here.

## Inspiration

This is a pretty good dataset to train classifiers! According to Yaroslav:

Judging by the examples, one would expect this to be a harder task than MNIST. This seems to be the case -- logistic regression on top of stacked auto-encoder with fine-tuning gets about 89% accuracy whereas the same approach gives 98% on MNIST. Dataset consists of small hand-cleaned part, about 19k instances, and large uncleaned dataset, 500k instances. Two parts have approximately 0.5% and 6.5% label error rate. I got this by looking through glyphs and counting how often my guess of the letter didn't match its Unicode value in the font file.
Enjoy!

In [1]:
import numpy as np
import tensorflow as tf
import os
import pandas as pd


  return f(*args, **kwds)


In [2]:
import matplotlib.pyplot as plt
import cv2
from PIL import Image
from matplotlib import pyplot as plt


In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.preprocessing import OneHotEncoder

## Getting the path names to images and labeling into numpy array

In [5]:
# Walk notMNIST_small/<letter>/<file> and record every image that PIL can open
# as a [path, label] row, where the label is the name of the letter folder.
parentDir = 'notMNIST_small/'
print(parentDir)
data = []
total = 0  # files visited so far
good = 0   # files PIL was able to open
for folder in os.listdir(parentDir):
    class_dir = parentDir + folder
    # Only directories hold images; stray files such as macOS's '.DS_Store'
    # would make the inner os.listdir() raise, so skip them.
    if folder == '.DS_Store' or not os.path.isdir(class_dir):
        continue
    for file in os.listdir(class_dir):
        # Progress report every 10000 files visited.
        if total % 10000 == 0:
            print(total, good)
        total += 1
        img_path = class_dir + '/' + file
        try:
            # Opening is only a readability check; the context manager closes
            # the file handle immediately so thousands of handles are not
            # left open while the loop runs.
            with Image.open(img_path):
                pass
        except (OSError, ValueError):
            # Unreadable or unidentifiable image file: leave it out.
            continue
        data.append([img_path, folder])
        good += 1

dataset = pd.DataFrame(data)
dataset.head()


                                

notMNIST_small/
0 0
10000 9998


Unnamed: 0,0,1
0,notMNIST_small/I/Qml0d2lzZS50dGY=.png,I
1,notMNIST_small/I/RW5nbGFuZCBCb2xkSXRhbGljLnR0Z...,I
2,notMNIST_small/I/R3JlZWsgSXRhbGljLnR0Zg==.png,I
3,notMNIST_small/I/Rmx5d2hlZWxTcXVhcmUudHRm.png,I
4,notMNIST_small/I/SGFuZGljYXAub3Rm.png,I


In [6]:
# Report how many image paths were collected into the DataFrame.
num_images = len(dataset)
print(num_images)

18724


## Set batch size and epochs

The batch size should be neither too large nor too small.

In [7]:
# Number of examples per batch; passed to input_func for both training and evaluation.
batch_size = 32
# Number of training epochs -- presumably the number of passes over the
# training data; confirm against where it is consumed.
num_epochs = 100
def input_func(features, labels, batch_size):
    """Build a batched ``tf.data.Dataset`` of decoded images and their labels.

    Args:
        features: Sequence of PNG file paths (strings); each is read from disk
            and decoded inside the dataset pipeline.
        labels: Sequence of labels aligned with ``features``; passed through
            unchanged.
        batch_size: Number of examples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, labels)`` batches, where
        ``images`` is a float32 tensor of shape ``[batch, 28, 28, 1]``. The
        dataset makes a single pass over the inputs (no shuffle, no repeat).
    """

    def parser(image, label):
        """Read one PNG file and return it as a 28x28x1 float32 tensor."""
        # Force a single grayscale channel so the reshape below is always valid.
        img = tf.image.decode_png(tf.read_file(image), channels=1)
        # Resize spatially to 28x28. Resizing to [1, 784] and then reshaping
        # is NOT a flatten: it resamples the picture down to a single row, so
        # the model would never see the actual glyph.
        img = tf.image.resize_images(img, [28, 28])
        # Pin the static shape expected by the convolutional model.
        img = tf.reshape(img, [28, 28, 1])
        img = tf.cast(img, tf.float32, "cast")
        return img, label

    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    dataset = dataset.map(parser)
    dataset = dataset.batch(batch_size)
    return dataset
    
    

## Define model architecture
Uses a two-layer architecture, each layer consisting of a convolutional layer followed by a pooling layer. (Same architecture as the original MNIST CNN.)

In [8]:
def my_model(features, labels, mode, params):
    """Model function for ``tf.estimator.Estimator``: a two-stage CNN classifier.

    Architecture: (conv 5x5 -> max-pool 2x2) twice, then a 1024-unit dense
    layer with dropout and a final logits layer.

    Args:
        features: Float image tensor of rank 4 (batch, height, width,
            channels). The flatten step assumes 28x28 spatial input, which the
            two stride-2 poolings reduce to 7x7.
        labels: Integer class ids, one per example. Not used in PREDICT mode.
        mode: One of ``tf.estimator.ModeKeys``.
        params: Optional dict of hyperparameters. Recognised keys and their
            defaults: ``learning_rate`` (0.001), ``dropout_rate`` (0.4),
            ``n_classes`` (10).

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the requested mode: predictions
        for PREDICT, loss and train op for TRAIN, loss and accuracy metric
        for EVAL.
    """
    params = params or {}
    learning_rate = params.get("learning_rate", 0.001)
    dropout_rate = params.get("dropout_rate", 0.4)
    n_classes = params.get("n_classes", 10)

    # FIRST LAYER
    # ---conv layer with 32 filters, 5x5 kernel, and relu activation
    # ---pool layer with 2x2 pool window and stride of 2x2
    conv1 = tf.layers.conv2d(inputs=features, filters=32, kernel_size=(5, 5), padding="same", activation=tf.nn.relu)
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=(2, 2), strides=(2, 2))

    # SECOND LAYER
    # ---conv layer with 64 filters, 5x5 kernel, and relu activation
    # ---pool layer with 2x2 pool window and stride of 2x2
    conv2 = tf.layers.conv2d(inputs=pool1, filters=64, kernel_size=(5, 5), padding="same", activation=tf.nn.relu)
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=(2, 2), strides=(2, 2))

    # DENSE LAYER
    # ---flatten output into vector (7x7 spatial positions x 64 filters)
    # ---dropout to prevent overfitting, active only while training
    pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
    dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)
    dropout = tf.layers.dropout(inputs=dense, rate=dropout_rate, training=mode == tf.estimator.ModeKeys.TRAIN)

    logits = tf.layers.dense(inputs=dropout, units=n_classes)
    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # One-hot encode the integer labels; the reshape guarantees a
    # [batch, n_classes] shape that lines up with the logits.
    onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=n_classes)
    onehot_labels = tf.reshape(onehot_labels, [-1, n_classes])
    loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # EVAL mode: report accuracy of the arg-max class against the labels.
    eval_metric_ops = {"accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])}
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)



## Training and Evaluating
-> split data into train and test (2:1)
-> instantiate model with my_model as cnn
-> convert dataset (np array) to dataframe to use pd.factorize to get integer labels, then convert back to np array

In [None]:
# Fetch the data: split image paths and integer labels into train/test sets
# (67% / 33%). pd.factorize assigns each letter an integer id in order of
# first appearance; class_names[i] is the letter for id i.
label_ids, class_names = pd.factorize(dataset[1])
X_train, X_test, y_train, y_test = train_test_split(dataset[0], label_ids, test_size=0.33, random_state=42)

# Build CNN.
classifier = tf.estimator.Estimator(model_fn=my_model)

# Convert the pandas/NumPy split results to plain NumPy arrays before they
# are handed to the tf.data input pipeline.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

X_test = np.asarray(X_test)
y_test = np.asarray(y_test)

# Train the Model.
# input_func yields a single pass over the data, so it is repeated for
# num_epochs here; without the repeat, training stops as soon as the data
# runs out, which can be well before `steps` is reached.
classifier.train(
    input_fn=lambda: input_func(X_train, y_train, batch_size).repeat(num_epochs),
    steps=22000,
)

# Evaluate the model on exactly one pass over the held-out set.
eval_result = classifier.evaluate(input_fn=lambda: input_func(X_test, y_test, batch_size))

print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a2fc3e9b0>, '_save_summary_steps': 100, '_session_config': None, '_task_id': 0, '_keep_checkpoint_max': 5, '_save_checkpoints_secs': 600, '_global_id_in_cluster': 0, '_log_step_count_steps': 100, '_master': '', '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_save_checkpoints_steps': None, '_tf_random_seed': None, '_evaluation_master': '', '_is_chief': True, '_model_dir': '/var/folders/th/svpqqvhs62790bm9gczzcth40000gn/T/tmpu_543l74', '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_task_type': 'worker', '_train_distribute': None}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/th

In [11]:
# Show the held-out image paths and then their integer labels, each followed
# by its element count.
for held_out in (X_test, y_test):
    print(held_out, len(held_out))

['notMNIST_large/J/RmF0Ym95IFNsaW0gQkxUQyAyIEJSSy50dGY=.png'
 'notMNIST_large/F/Q29vcGVyIEJsYWNrIEJULnR0Zg==.png'
 'notMNIST_large/C/Q3VzaGluZy1IZWF2eS5vdGY=.png' ...
 'notMNIST_large/F/VGhyb2hhbmRQZW4tUm9tYW4ub3Rm.png'
 'notMNIST_large/G/UmFndGltZVN0ZC5vdGY=.png'
 'notMNIST_large/H/SW50ZXJzdGF0ZU1vbm8tTGd0Lm90Zg==.png'] 174608
[5 3 6 ... 3 1 4] 174608


In [12]:
# Show the training image paths and then their integer labels, each followed
# by its element count.
for train_part in (X_train, y_train):
    print(train_part, len(train_part))

['notMNIST_large/C/SXZhbGVuY2lhLUJvbGQub3Rm.png'
 'notMNIST_large/H/V2hpc3RsZSBTdG9wIEpMLnR0Zg==.png'
 'notMNIST_large/E/VHJhbnNpdGlvbmFsNTExQlQtQm9sZEl0YWxpYy5vdGY=.png' ...
 'notMNIST_large/C/T2xkQm9sZC1MaWdodC50dGY=.png'
 'notMNIST_large/A/UXVhTmF1dGljYWxlX0luaXRpYWxzX05vMS50dGY=.png'
 'notMNIST_large/A/Q292ZS50dGY=.png'] 354506
[6 4 8 ... 6 2 2] 354506
