In [None]:
from __future__ import print_function
import numpy as np
import os
from scipy import ndimage
import pickle as pickle
from PIL import Image
import sys
import tensorflow as tf

In [1]:
data_root = '.'
num_classes = 10
np.random.seed(133)

test_folders = ['./notMNIST_small/A', './notMNIST_small/B',
'./notMNIST_small/C', './notMNIST_small/D',
'./notMNIST_small/E', './notMNIST_small/F',
'./notMNIST_small/G', './notMNIST_small/H',
'./notMNIST_small/I', './notMNIST_small/J']

train_folders = ['./notMNIST_large_v2/A', './notMNIST_large_v2/B',
'./notMNIST_large_v2/C', './notMNIST_large_v2/D',
'./notMNIST_large_v2/E', './notMNIST_large_v2/F',
'./notMNIST_large_v2/G', './notMNIST_large_v2/H',
'./notMNIST_large_v2/I', './notMNIST_large_v2/J']

image_size = 28 #pixel eidth and height
pixel_depth = 255.0
num_of_labels = 10
batch_size = 128
no_of_neurons = 1024

In [2]:
def load_letter(folder,min_num_images):
    image_files = os.listdir(folder)
    dataset = np.ndarray(shape=(len(image_files),image_size,image_size),dtype= np.float32)
    num_images = 0
    for image in image_files:
        image_file = os.path.join(folder,image)
        image_data = (ndimage.imread(image_file).astype(float) - 
                      pixel_depth/2)/pixel_depth
        dataset[num_images,:,:] = image_data
        num_images +=1
    dataset = dataset[0:num_images,:,:]
    if(num_images< min_num_images):
        print('few_imges than expected')
    print('dataset tensor:', dataset.shape)
    return dataset

In [3]:
def maybe_pickle(data_folders, min_num_imagesper_class,force = False):
    dataset_names = []
    for folder in data_folders:
        set_filename = folder+".pickle"
        dataset_names.append(set_filename)
        if(os.path.exists(set_filename) and not force):
            print(set_filename,"already present")
        else:
            print(set_filename)
            dataset = load_letter(folder,100)
            try:
                with open(set_filename,'wb') as f:
                    print(pickle.HIGHEST_PROTOCOL)
                    pickle.dump(dataset,f,2)
            except Exception as e:
                print('Unable to save data',set_filename,":",e)
    return dataset_names

In [4]:
def make_arrays(nb_rows, img_size):
    if nb_rows:
        dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
        labels = np.ndarray(nb_rows, dtype=np.int32)
    else:
        dataset, labels = None, None
    return dataset, labels

In [5]:
def merge_datasets(pickle_files, train_size, valid_size=0):
    num_classes = len(pickle_files)
    valid_dataset, valid_labels = make_arrays(valid_size, image_size)
    train_dataset, train_labels = make_arrays(train_size, image_size)
    vsize_per_class = valid_size // num_classes
    tsize_per_class = train_size // num_classes
    start_v, start_t = 0, 0
    end_v, end_t = vsize_per_class, tsize_per_class
    end_l = vsize_per_class+tsize_per_class\
    for label, pickle_file in enumerate(pickle_files):       
        try:
            with open(pickle_file, 'rb') as f:
                letter_set = pickle.load(f)
                np.random.shuffle(letter_set)
                if valid_dataset is not None:
                    valid_letter = letter_set[:vsize_per_class, :, :]
                    valid_dataset[start_v:end_v, :, :] = valid_letter
                    valid_labels[start_v:end_v] = label
                    start_v += vsize_per_class
                    end_v += vsize_per_class
                    
                train_letter = letter_set[vsize_per_class:end_l, :, :]
                train_dataset[start_t:end_t, :, :] = train_letter
                train_labels[start_t:end_t] = label
                start_t += tsize_per_class
                end_t += tsize_per_class
        except Exception as e:
            print('Unable to process data from', pickle_file, ':', e)
            raise
    return valid_dataset, valid_labels, train_dataset, train_labels

In [6]:
def randomize(dataset, labels):
    permutation = np.random.permutation(labels.shape[0])
    shuffled_dataset = dataset[permutation,:,:]
    shuffled_labels = labels[permutation]
    return shuffled_dataset, shuffled_labels

In [7]:
def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

train_datasets = maybe_pickle(train_folders,100)
test_datasets = maybe_pickle(test_folders,100)
train_size = 1000
valid_size = 500
test_size = 500
valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
    train_datasets, train_size, valid_size)

_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

./notMNIST_large_v2/A.pickle already present
./notMNIST_large_v2/B.pickle already present
./notMNIST_large_v2/C.pickle already present
./notMNIST_large_v2/D.pickle already present
./notMNIST_large_v2/E.pickle already present
./notMNIST_large_v2/F.pickle already present
./notMNIST_large_v2/G.pickle already present
./notMNIST_large_v2/H.pickle already present
./notMNIST_large_v2/I.pickle already present
./notMNIST_large_v2/J.pickle already present
./notMNIST_small/A.pickle already present
./notMNIST_small/B.pickle already present
./notMNIST_small/C.pickle already present
./notMNIST_small/D.pickle already present
./notMNIST_small/E.pickle already present
./notMNIST_small/F.pickle already present
./notMNIST_small/G.pickle already present
./notMNIST_small/H.pickle already present
./notMNIST_small/I.pickle already present
./notMNIST_small/J.pickle already present


In [8]:
print('Training dataset and labels shape:', train_dataset.shape, train_labels.shape)
print('Validation dataset and labels shape:', valid_dataset.shape, valid_labels.shape)
print('Testing dataset and labels shape:', test_dataset.shape, test_labels.shape)
train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)

pickle_file = os.path.join(data_root, 'notMNIST.pickle')

try:
    f = open(pickle_file, 'wb')
    save = {
      'train_dataset': train_dataset,
      'train_labels': train_labels,
      'valid_dataset': valid_dataset,
      'valid_labels': valid_labels,
      'test_dataset': test_dataset,
      'test_labels': test_labels,
    }

    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
    f.close()
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)
    raise

    
statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)



Training dataset and labels shape: (1000, 28, 28) (1000,)
Validation dataset and labels shape: (500, 28, 28) (500,)
Testing dataset and labels shape: (500, 28, 28) (500,)
Compressed pickle size: 6280500


In [9]:
def reformat(dataset,labels):
    dataset = dataset.reshape((-1,image_size*image_size)).astype(np.float32)
    labels = (np.arange(num_of_labels)==labels[:,None]).astype(np.float32)
    print(dataset.shape)
    print(labels.shape)
    return dataset,labels

In [14]:
with open('notMNIST.pickle','rb') as f:
    save = pickle.load(f)
    training_dataset = save['train_dataset']
    training_labels = save['train_labels']
    validation_dataset = save['valid_dataset']
    validation_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save

print('Training dataset and labels shape:', train_dataset.shape, train_labels.shape)
print('Validation dataset and labels shape:', valid_dataset.shape, valid_labels.shape)
print('Testing dataset and labels shape:', test_dataset.shape, test_labels.shape)
training_dataset,training_labels = reformat(training_dataset,training_labels)
valid_dataset,valid_labels = reformat(validation_dataset,validation_labels)
test_dataset,test_labels = reformat(test_dataset,test_labels)


Training dataset and labels shape: (1000, 28, 28) (1000,)
Validation dataset and labels shape: (500, 784) (500, 10)
Testing dataset and labels shape: (500, 28, 28) (500,)
(1000, 784)
(1000, 10)
(500, 784)
(500, 10)
(500, 784)
(500, 10)


In [16]:

graph = tf.Graph()
with graph.as_default():
    tf_train_dataset = tf.placeholder(tf.float32,shape = (
        batch_size,image_size*image_size))
    tf_train_labels = tf.placeholder(tf.float32,shape = (batch_size,
                                                         num_of_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    w1 = tf.Variable(tf.truncated_normal(([image_size*image_size, 
                                           no_of_neurons])))
    b1 = tf.Variable(tf.zeros([no_of_neurons]))    
    w2 = tf.Variable(tf.truncated_normal([no_of_neurons,num_of_labels]))
    b2 = tf.Variable(tf.zeros([num_of_labels]))
    hidden1 = tf.nn.relu(tf.matmul(tf_train_dataset,w1)+b1)
    logits =tf.matmul(hidden1,w2)+b2   
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
        labels = tf_train_labels,logits = logits))
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    train_prediction = tf.nn.softmax(logits)
    valid_prediction =  tf.nn.softmax(
            tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, w1) + b1), w2) + b2)
    test_prediction = tf.nn.softmax(
            tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, w1) + b1), w2) + b2)

num_steps = 101
x = np.arange(num_steps)
minibatch_acc = []
validation_acc = []

with tf.Session(graph=graph) as sess:
    tf.initialize_all_variables().run()
    print("initialized")
    for step in range(num_steps):
        offset = (step*batch_size)%(train_labels.shape[0]-batch_size)
        batch_data = training_dataset[offset:(batch_size+offset),:]
        batch_label = training_labels[offset:(batch_size+offset),:]
        feed_dicts = {tf_train_dataset:batch_data,tf_train_labels:batch_label}
        _,l, predictions = sess.run([optimizer,loss,train_prediction],feed_dicts)
        mini_batch_accu = accuracy(predictions,batch_label)
        valid_accu = accuracy(valid_prediction.eval(),valid_labels)
        if(step%10==0):
            print("mini_batch_accu ",step," ",mini_batch_accu)
            print("valid_accu ",step," ",valid_accu)
            minibatch_acc.append(mini_batch_accu)
            validation_acc.append(valid_accu)
            t=[np.array(minibatch_acc)]
            t.append(validation_acc)

    print("test ", accuracy(test_prediction.eval(),test_labels))



initialized
mini_batch_accu  0   12.5
valid_accu  0   19.0
mini_batch_accu  10   65.625
valid_accu  10   57.2
mini_batch_accu  20   73.4375
valid_accu  20   58.6
mini_batch_accu  30   90.625
valid_accu  30   62.0
mini_batch_accu  40   95.3125
valid_accu  40   63.0
mini_batch_accu  50   96.875
valid_accu  50   62.0
mini_batch_accu  60   99.21875
valid_accu  60   61.0
mini_batch_accu  70   98.4375
valid_accu  70   62.6
mini_batch_accu  80   96.09375
valid_accu  80   62.4
mini_batch_accu  90   97.65625
valid_accu  90   62.2
mini_batch_accu  100   99.21875
valid_accu  100   61.2
test  76.4
