## Context

This dataset was created by Yaroslav Bulatov by taking some publicly available fonts and extracting glyphs from them to make a dataset similar to MNIST. There are 10 classes, with letters A-J.

## Content

A set of training and test images of letters from A to J on various typefaces. The images size is 28x28 pixels.

## Acknowledgements

The dataset can be found on Tensorflow github page as well as on the blog from Yaroslav, here.

## Inspiration

This is a pretty good dataset to train classifiers! According to Yaroslav:

Judging by the examples, one would expect this to be a harder task than MNIST. This seems to be the case -- logistic regression on top of stacked auto-encoder with fine-tuning gets about 89% accuracy whereas same approach gives got 98% on MNIST. Dataset consists of small hand-cleaned part, about 19k instances, and large uncleaned dataset, 500k instances. Two parts have approximately 0.5% and 6.5% label error rate. I got this by looking through glyphs and counting how often my guess of the letter didn't match it's unicode value in the font file.
Enjoy!

In [3]:
import numpy as np
import tensorflow as tf
import os
import pandas as pd


  return f(*args, **kwds)


In [4]:
import matplotlib.pyplot as plt
import cv2
from PIL import Image
from matplotlib import pyplot as plt


In [5]:
from sklearn.model_selection import train_test_split

In [6]:
from sklearn.preprocessing import OneHotEncoder

## Getting the path names to images and labeling into numpy array

In [7]:
parentDir = 'notMNIST_large/'
labels = []
images = []
total = 0
bad = 0
for folder in os.listdir(parentDir):
    if folder != '.DS_Store':
        if total % 10000 == 0:
            print(total, bad)
            total += 1
        try:
            for file in os.listdir(parentDir + folder):
                img = Image.open(parentDir + folder + '/' + file)
                img = np.asarray(img).flatten()
                
                images.append(np.asarray(img))
                labels.append(folder)
        except:
            bad += 1
            pass
dataset = pd.DataFrame(images)
dataset['labels'] = np.asarray(labels)
dataset.head()


                                

0 0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,labels
0,112,149,180,199,214,229,224,215,197,176,...,0,0,0,4,2,0,0,0,0,I
1,0,0,4,1,0,0,1,0,20,110,...,84,84,68,34,4,0,0,0,0,I
2,0,0,0,0,0,0,0,1,2,0,...,10,4,0,0,0,0,0,0,0,I
3,132,243,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,254,240,I
4,199,215,231,235,240,246,254,255,255,255,...,255,251,243,234,227,220,217,170,120,I


In [6]:
print(len(dataset))
print(np.sum([len(os.listdir(parentDir+x)) for x in os.listdir(parentDir) if x != '.DS_Store']))
    
    

15933
18726


In [7]:
print(len(dataset))

15933


## Set batch size and epochs

Don't want batch size to be too large or not too small

In [8]:
batch_size = 16
num_epochs = 100
def input_func(features, labels, batch_size):
    
    def parser(image, label): 
        
        img = tf.image.decode_png(tf.read_file(image))
        img = tf.image.resize_images(img, tf.constant([1, 784]))
        img = tf.reshape(img, [28, 28, 1])
        img = tf.cast(img, tf.float32, "cast")

        return img, label

    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    dataset = dataset.map(parser)
    dataset = dataset.batch(batch_size)
    
    return dataset

## Define model architecture
Uses a two layer, each layer consisting of a convolutional and pooling layer, architecture. (Same architecture as original MNIST CNN)

In [9]:
def my_model(features, labels, mode, params):
    #initialize input by reshaping and casting for network
    #img = tf.image.decode_png(tf.read_file(features['x'][0]))
    # img = np.array( img, dtype='uint8' ).flatten()
    
    # FIRST LAYER
    # ---conv layer with 32 filters, 5x5 kernel, and relu activation
    # ---pool layer with 2x2 pool window and stride of 2x2
    
    input_layer = tf.reshape(features['x'], [-1, 28, 28, 1])
    input_layer = tf.cast(input_layer, tf.float32, "cast")
    conv1 = tf.layers.conv2d(inputs=input_layer, filters=32, kernel_size=(5, 5), padding="same", activation=tf.nn.relu)
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=(2, 2), strides=(2, 2))
    
    # SECOND LAYER
    # ---conv layer with 64 filters, 5x5 kernel, and relu activation
    # ---pool layer with 2x2 pool window and stride of 2x2
    conv2 = tf.layers.conv2d(inputs=pool1, filters=64, kernel_size=(5, 5), padding="same", activation=tf.nn.relu)
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=(2, 2), strides=(2, 2))
    
    # DENSE LAYER
    # ---flatten output into vector
    # ---dropout to prevent overfitting
    pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
    dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)
    dropout = tf.layers.dropout(inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)
    
    logits = tf.layers.dense(inputs=dropout, units=10)
    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    
    onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
    onehot_labels = tf.reshape(onehot_labels, [-1, 10])
    loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)
#     loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(loss=loss,global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
    print(labels.shape)
    print(predictions["classes"].shape)
    eval_metric_ops = {"accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])}
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)



## Training and Evaluating
-> split data into train and test (2:1) <br>
-> instantiate model with my_model as cnn <br>
-> convert dataset (np array) to dataframe to use pd.factorize to get integer labels, then convert back to np array

In [10]:
# Fetch the data
X_train, X_test, y_train, y_test = train_test_split(dataset[[i for i in range(784)]], pd.factorize(dataset['labels'])[0], test_size=0.33, random_state=42)

# Build CNN.
classifier = tf.estimator.Estimator(model_fn=my_model)

# Train the Model.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

X_test = np.asarray(X_test)
y_test = np.asarray(y_test)

print(X_train, y_train)

train_input_func = tf.estimator.inputs.numpy_input_fn(x = {'x' : X_train}, 
                                                      y = y_train, 
                                                      batch_size = batch_size, 
                                                      num_epochs = num_epochs, 
                                                      shuffle = True
                                                     )
classifier.train(input_fn=train_input_func, steps = 5000)

# Evaluate the model.
eval_input_func = tf.estimator.inputs.numpy_input_fn(x = {'x' : X_test}, 
                                                      y = y_test, 
                                                      batch_size = batch_size, 
                                                      num_epochs = num_epochs, 
                                                      shuffle = True
                                                     )
eval_result = classifier.evaluate(input_fn=eval_input_func)

print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_log_step_count_steps': 100, '_save_checkpoints_secs': 600, '_is_chief': True, '_service': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a418b9cf8>, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_task_id': 0, '_save_checkpoints_steps': None, '_save_summary_steps': 100, '_model_dir': '/var/folders/th/svpqqvhs62790bm9gczzcth40000gn/T/tmpv9ue6kpa', '_master': '', '_tf_random_seed': None, '_evaluation_master': '', '_task_type': 'worker', '_keep_checkpoint_max': 5, '_train_distribute': None, '_global_id_in_cluster': 0}
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [2 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] [7 5 5 ... 0 9 5]
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:t

In [72]:
print(X_test, len(X_test))
print(y_test, len(y_test))

['notMNIST_large/A/VW5pdmVyc0xULUV4dHJhQmxhY2tFeHQub3Rm.png'
 'notMNIST_large/F/Q3JheW9uIE5vcm1hbC50dGY=.png'
 'notMNIST_large/B/QnVyb2tyYXQtT25lLm90Zg==.png' ...
 'notMNIST_large/E/SW5zdGFsbGF0aW9uIFNTaSBCb2xkLnR0Zg==.png'
 'notMNIST_large/H/QmF1ZXIgQm9kb25pIEl0YWxpYy5wZmI=.png'
 'notMNIST_large/I/VHJpYW5nZWwudHRm.png'] 174610
[2 3 9 ... 8 4 0] 174610


In [29]:
print(X_train, len(X_train))
print(y_train, len(y_train))

[0 0 0 ... 0 2 0] 10675
[7 5 5 ... 0 9 5] 10675
