In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range

First reload the data we generated in `1_notmnist.ipynb`.

In [2]:
pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']
  del save  # hint to help gc free up memory
  print('Training set', train_dataset.shape, train_labels.shape)
  print('Validation set', valid_dataset.shape, valid_labels.shape)
  print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 0 to [1.0, 0.0, 0.0 ...], 1 to [0.0, 1.0, 0.0 ...]
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

def variable_summaries(var, name,dorelu):
  with tf.name_scope("summaries"):
    mean = tf.reduce_mean(var)
    tf.scalar_summary('mean/' + name, mean)
    if dorelu==1:
        var = tf.nn.relu6(var)
        
    with tf.name_scope('stddev'):
      stddev = tf.sqrt(tf.reduce_sum(tf.square(var - mean)))
    tf.scalar_summary('sttdev/' + name, stddev)
    tf.scalar_summary('max/' + name, tf.reduce_max(var))
    tf.scalar_summary('min/' + name, tf.reduce_min(var))
    tf.scalar_summary('sparsity/'+ name, tf.nn.zero_fraction(var))
    tf.histogram_summary(name, var)
 

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [4]:

def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])




In this cell we are going to perturb a set of the training data set to reflect mislabeled data. 


In [5]:
fracBad = 0.;
nBad = np.int(fracBad*train_labels.shape[0])
rangeNess = np.arange(train_labels.shape[0])
np.random.shuffle(rangeNess)
randomBreak = rangeNess[:nBad]

for i, rb in enumerate(randomBreak):
    train_labels[rb][:] = train_labels[rb][np.random.permutation(10)];

We're first going to train a multinomial logistic regression using simple gradient descent.

TensorFlow works like this:
* First you describe the computation that you want to see performed: what the inputs, the variables, and the operations look like. These get created as nodes over a computation graph. This description is all contained within the block below:

      with graph.as_default():
          ...

* Then you can run the operations on this graph as many times as you want by calling `session.run()`, providing it outputs to fetch from the graph that get returned. This runtime operation is all contained in the block below:

      with tf.Session(graph=graph) as session:
          ...

Let's load all the data into TensorFlow and build the computation graph corresponding to our training:

In [None]:


# With gradient descent training, even this much data is prohibitive.
# Subset the training data for faster turnaround.
train_subset = 10000

graph = tf.Graph()
with graph.as_default():

  # Input data.
  # Load the training, validation and test data into constants that are
  # attached to the graph.
  tf_train_dataset = tf.constant(train_dataset[:train_subset, :])
  tf_train_labels = tf.constant(train_labels[:train_subset])
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  # These are the parameters that we are going to be training. The weight
  # matrix will be initialized using random valued following a (truncated)
  # normal distribution. The biases get initialized to zero.
  #with tf.name_scope('hidden') as scope:
  layer_name = 'layer1'
#   with tf.name_scope("weights"):
  weights = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_labels]),name='weights')
#   variable_summaries(weights, layer_name + '/weights',0)
#   with tf.name_scope("biases"):
  biases = tf.Variable(tf.zeros([num_labels]),name='biases')
#   variable_summaries(weights, layer_name + '/biases',0)
#   with tf.name_scope("priors"):
  priors = tf.Variable(tf.ones([train_subset])*0.9,name='priors')
#   variable_summaries(priors, layer_name + '/priors',1) 

  # Training computation.
  # Find the classification estimation 'z' value
  logits = tf.matmul(tf_train_dataset, weights) + biases
  # Calculate the cross-entropy loss. 
  loss = tf.reduce_mean(tf.mul(
            tf.nn.relu6(priors*6),tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)
                                )
                       )/tf.reduce_mean(tf.nn.relu6(priors*6))
  tf.scalar_summary('loss/' + layer_name, loss)

  # Optimizer.
  # We are going to find the minimum of this loss using gradient descent.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
#   optimizer = tf.train.AdagradOptimizer(learning_rate=0.5, initial_accumulator_value=0.1, use_locking=False, name='Adagrad')
#   optimizer = tf.train.AdamOptimizer(learning_rate=0.5) # Adam Optimizer
#   train_op = optimizer.apply_gradients(zip(grads, tvars))
  # Predictions for the training, validation, and test data.
  # These are not part of training, but merely here so that we can report
  # accuracy figures as we train.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(
    tf.matmul(tf_valid_dataset, weights) + biases)
  test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)
 



Let's run this computation and iterate:

In [None]:
num_steps = 50000

with tf.Session(graph=graph) as session:
  # This is a one-time operation which ensures the parameters get initialized as
  # we described in the graph: random weights for the matrix, zeros for the
  # biases. 
  tf.initialize_all_variables().run()

    
  summary_op = tf.merge_all_summaries()
  train_dir = 'tmp'
  summary_writer = tf.train.SummaryWriter(train_dir, session.graph)  
  print('Initialized')
  for step in range(num_steps):
    # Run the computations. We tell .run() that we want to run the optimizer,
    # and get the loss value and the training predictions returned as numpy
    # arrays.
    _, l, predictions = session.run([optimizer, loss, train_prediction])
    if (step % 100 == 0):
      print('Loss at step %d: %f' % (step, l))
      acc=accuracy(predictions, train_labels[:train_subset, :])
      print('Training accuracy: %.1f%%' % acc)
      # Calling .eval() on valid_prediction is basically like calling run(), but
      # just to get that one numpy array. Note that it recomputes all its graph
      # dependencies.
      print('Validation accuracy: %.1f%%' % accuracy(
        valid_prediction.eval(), valid_labels))
      tf.scalar_summary('train_acc/' + layer_name, acc)
      summary_str = session.run(summary_op)
      summary_writer.add_summary(summary_str, step)  

  print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))
  

ToDO: 
Fix Matplotlib
Generate script to calculate multiple versions of this so that 
   - ensure has a cut-off for when validation no longer improving
   - do 'control experiment' to validate how classification would do without mislabeled data (same randomly removed datas
   - do control experiment to validate how performance would be if all mislabeled data were just removed.
   - ensure that variables are recorded.

Implement with batch-learning techniques
Determine architecture for better NN to get higher classificaton accuracy. Plug into scripts above
Determine architecture for deep NN to get higher classification accuracy. 

Implement prior weighting matrix, not just mislabeled or properly labeled priors. 

In [None]:
#GOOD VERSION
num_steps = 5000

def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])



with tf.Session(graph=graph) as session:
    # This is a one-time operation which ensures the parameters get initialized as
    # we described in the graph: random weights for the matrix, zeros for the
    # biases. 
    tf.initialize_all_variables().run()

    
  summary_op = tf.merge_all_summaries()
  train_dir = 'tmp'
  summary_writer = tf.train.SummaryWriter(train_dir, session.graph)  
  print('Initialized')
  for step in range(num_steps):
    # Run the computations. We tell .run() that we want to run the optimizer,
    # and get the loss value and the training predictions returned as numpy
    # arrays.
    _, l, predictions = session.run([optimizer, loss, train_prediction])
    if (step % 100 == 0):
      summary_str = session.run(summary_op)
      summary_writer.add_summary(summary_str, step)  
      print('Loss at step %d: %f' % (step, l))
      print('Training accuracy: %.1f%%' % accuracy(
        predictions, train_labels[:train_subset, :]))
      # Calling .eval() on valid_prediction is basically like calling run(), but
      # just to get that one numpy array. Note that it recomputes all its graph
      # dependencies.
      print('Validation accuracy: %.1f%%' % accuracy(
        valid_prediction.eval(), valid_labels))
  print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))
  

All have 5k steps. No regularization


Using relu6 (6*priors) 'cause was thinking it was mnist
Mislabeled 10%
Loss at step 4900: 0.056321
Training accuracy: 81.2%
Validation accuracy: 79.4%
Test accuracy: 86.2%
(It ended up zeroing out about 15% of the inputs) 

Mislabeled 10% 
Training accuracy: 81.4%
Validation accuracy: 79.2%
Test accuracy: 86.2%

Mislabeled 20% 
Loss at step 4900: 0.099275
Training accuracy: 74.3%
Validation accuracy: 77.8%
Test accuracy: 84.7%
(was still increasing accuracy at each step)

Mislabeled 30% 
Loss at step 4900: 0.200973
Training accuracy: 66.7%
Validation accuracy: 76.2%
Test accuracy: 82.7%
(was still increasing accuracy at each step)

Mislabeled 60% 
Loss at step 4900: 1.156747
Training accuracy: 44.5%
Validation accuracy: 60.9%
Test accuracy: 68.3%
needed to stop early...
Loss at step 10000: 0.346632
Validation accuracy: 64.3%

Loss at step 19900: 0.137459
Training accuracy: 50.6%
Validation accuracy: 62.8%
Test accuracy: 70.0%

Mislabeled 0%
Loss at step 4900: 0.034090
Training accuracy: 88.6%
Validation accuracy: 80.2%
Test accuracy: 87.2%
(It ended up zeroing out about 8% of the inputs) 

Mislabled 0%
Loss at step 4900: 0.033416
Training accuracy: 88.7%
Validation accuracy: 80.6%
Test accuracy: 87.2%

Mislabeled 0% Using a sigmoid 
Loss at step 4900: 0.431781
Training accuracy: 87.5%
Validation accuracy: 77.9%
Test accuracy: 84.7%
This didn't have many events zeroing out... not changing quickly enough (it was 0.5% zeroed out at this value)

Mislabeles 0% using sigmoid and 10*prior
Loss at step 9900: 0.364327
Training accuracy: 90.8%
Validation accuracy: 77.5%
Test accuracy: 84.4%

Mislabeles 0% using sigmoid and 10*prior
Loss at step 19900: 0.292983
Training accuracy: 93.1%
Validation accuracy: 76.9%
Test accuracy: 83.7%
AT 10* prior it appeared to no longer chang ethe max val of the priors. There were some really odd steps in the training curves. 



Test using complete-prior matrix.

In [None]:
train_labels.shape

In [None]:
##### 
# With gradient descent training, even this much data is prohibitive.
# Subset the training data for faster turnaround.
train_subset = 10000

graph = tf.Graph()
with graph.as_default():

  # Input data.
  # Load the training, validation and test data into constants that are
  # attached to the graph.
  tf_train_dataset = tf.constant(train_dataset[:train_subset, :])
  tf_train_labels = tf.constant(train_labels[:train_subset])
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  layer_name = 'layer1'
#   with tf.name_scope("weights"):
  weights = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_labels]),name='weights')
  variable_summaries(weights, layer_name + '/weights',0)
#   with tf.name_scope("biases"):
  biases = tf.Variable(tf.zeros([num_labels]),name='biases')
  variable_summaries(weights, layer_name + '/biases',0)
#   with tf.name_scope("priors"):
#   priors = tf.Variable(tf.ones([num_labels,train_subset])*0.9,name='priors')
#   priors = tf.Variable(tf.ones([train_subset,num_labels])*0.9,name='priors')
  priors = tf.Variable((tf_train_labels)*0.8+.1,name='priors')
  variable_summaries(priors, layer_name + '/priors',1) 

  logits = tf.nn.softmax(tf.matmul(tf_train_dataset, weights) + biases)
# z * -log(logits) + (1 - z) * -log(1 - logits)
  allcost = tf.mul(tf_train_labels,-tf.log(logits))#+tf.mul((1-tf_train_labels),-tf.log(1-logits));
#     allcost = tf.mul(tf_train_labels,-tf.log(logits))#+tf.mul((1-tf_train_labels),-tf.log(1-logits));
#   print(allcost.get_shape())
  loss = tf.reduce_mean(tf.reduce_sum(tf.mul(tf.nn.relu6(priors*6),allcost),1),0)/tf.reduce_mean(tf.reduce_sum(tf.nn.relu6(priors*6),1),0)
  


#   tm = tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)
# x - x * z + log(1 + exp(-x)) and x = logits z = targets
#   print(priors.get_shape())
#   print(logits.get_shape())
#   print(train_labels.get_shape())
# this is calculating sigmoid with cross entropy not softmax_cross_entropy....
#   loss = tf.reduce_mean(tf.reduce_sum(
#             tf.mul(tf.nn.relu6(priors*6),logits-tf.mul(logits,tf_train_labels)+tf.log(1+tf.exp(-logits)))
#             ,1),0)/tf.reduce_mean(tf.reduce_sum(tf.nn.relu6(priors*6),1),0)

  
  
#   loss=tm
#     tf.reduce_mean(tf.mul(
#             tf.nn.relu6(priors*6),tm
#                                 )
#                        )/tf.reduce_mean(tf.nn.relu6(priors*6))
#   tf.scalar_summary('loss/' + layer_name, loss)
#   loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)                                
#                        )
  # Optimizer.
  # We are going to find the minimum of this loss using gradient descent.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
#   optimizer = tf.train.AdagradOptimizer(learning_rate=0.5, initial_accumulator_value=0.1, use_locking=False, name='Adagrad')
#   optimizer = tf.train.AdamOptimizer(learning_rate=0.5) # Adam Optimizer
#   train_op = optimizer.apply_gradients(zip(grads, tvars))
  # Predictions for the training, validation, and test data.
  # These are not part of training, but merely here so that we can report
  # accuracy figures as we train.
  train_prediction = logits
  valid_prediction = tf.nn.softmax(
    tf.matmul(tf_valid_dataset, weights) + biases)
  test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)
 



In [None]:
num_steps = 20000

with tf.Session(graph=graph) as session:
  # This is a one-time operation which ensures the parameters get initialized as
  # we described in the graph: random weights for the matrix, zeros for the
  # biases. 
  tf.initialize_all_variables().run()

    
  summary_op = tf.merge_all_summaries()
  train_dir = 'tmp'
  summary_writer = tf.train.SummaryWriter(train_dir, session.graph)  
  print('Initialized')
  for step in range(num_steps):
    # Run the computations. We tell .run() that we want to run the optimizer,
    # and get the loss value and the training predictions returned as numpy
    # arrays.
    _, l, predictions = session.run([optimizer, loss, train_prediction])
    if (step % 100 == 0):
      print('Loss at step %d: %f' % (step, l))
      acc=accuracy(predictions, train_labels[:train_subset, :])
      print('Training accuracy: %.1f%%' % acc)
      # Calling .eval() on valid_prediction is basically like calling run(), but
      # just to get that one numpy array. Note that it recomputes all its graph
      # dependencies.
      print('Validation accuracy: %.1f%%' % accuracy(
        valid_prediction.eval(), valid_labels))
      tf.scalar_summary('train_acc/' + layer_name, acc)
      summary_str = session.run(summary_op)
      summary_writer.add_summary(summary_str, step)  

  print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))
  

20% random reclassification 
Using the full prior matrix, it took  longer but got to:
Loss at step 7900: 0.067337
Training accuracy: 71.3%
Validation accuracy: 76.6%
Test accuracy: 83.5%
Loss at step 19900: 0.009899
Training accuracy: 73.9%
Validation accuracy: 78.9%
    Test accuracy: 86.3%

In [None]:
batch_size = 128

graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
    # Variables.
  # These are the parameters that we are going to be training. The weight
  # matrix will be initialized using random valued following a (truncated)
  # normal distribution. The biases get initialized to zero.
  #with tf.name_scope('hidden') as scope:
  layer_name = 'layer1'
#   with tf.name_scope("weights"):
  weights = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_labels]),name='weights')
  variable_summaries(weights, layer_name + '/weights',0)
#   with tf.name_scope("biases"):
  biases = tf.Variable(tf.zeros([num_labels]),name='biases')
  variable_summaries(weights, layer_name + '/biases',0)
#   with tf.name_scope("priors"):
  priors = tf.Variable(tf.ones([batch_size])*0.9,name='priors')
  variable_summaries(priors, layer_name + '/priors',1)

  # Training computation.x
  # We multiply the inputs with the weight matrix, and add biases. We compute
  # the softmax and cross-entropy (it's one operation in TensorFlow, because
  # it's very common, and it can be optimized). We take the average of this
  # cross-entropy across all training examples: that's our loss.
  logits = tf.matmul(tf_train_dataset, weights) + biases
  loss = tf.reduce_mean(tf.mul(
            tf.nn.relu6(priors*6),tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)
                                )
                       )/tf.reduce_mean(tf.nn.relu6(priors*6))
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(
    tf.matmul(tf_valid_dataset, weights) + biases)
  test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

Let's run it:

In [None]:
num_steps = 3001
num_rep = 10
with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    # submini_batch
    for j in range(num_rep):
        _, l, predictions = session.run(
          [optimizer, loss, train_prediction], feed_dict=feed_dict)
    
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Minibatch loss at step 3000: 0.078691
Minibatch accuracy: 96.9%
Validation accuracy: 78.4%
Test accuracy: 85.2%

In [None]:
xx need to subtract of mean and normalize the images!

In [None]:
num_neurons = 1024
num_labels=10
batch_size = 128
graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
 
  weights1 = tf.Variable(
    tf.random_normal([image_size * image_size, num_neurons], mean = 0,
                     stddev = np.sqrt(6/(image_size * image_size+num_neurons)))
    )
  biases1 = tf.Variable(tf.zeros([num_neurons]))
  weights2 = tf.Variable(tf.truncated_normal([num_neurons, num_labels], mean = 0,
                     stddev = np.sqrt(6/(num_neurons+ num_neurons)))
                        )
  biases2 = tf.Variable(tf.zeros([num_labels]))
  #Priors
  priors = tf.placeholder(tf.float32, shape=(([batch_size])))
#   variable_summaries(priors, layer_name + '/priors',1)

#   loss = tf.reduce_mean((hidden1-tf_train_labels))
  z1 = tf.nn.sigmoid(tf.matmul(tf_train_dataset, weights1) + biases1)
  z2 = tf.matmul(z1, weights2)+biases2
  loss = tf.reduce_mean(tf.mul(tf.nn.relu6(priors*6),tf.nn.softmax_cross_entropy_with_logits(z2,tf_train_labels))
                       )/tf.reduce_mean(tf.nn.relu6(priors*6))

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(z2)
  z1 = tf.matmul(tf_valid_dataset, weights1) + biases1
  z2 = tf.matmul(z1, weights2)+biases2
  valid_prediction = tf.nn.softmax(z2)
  z1 = tf.matmul(tf_valid_dataset, weights1) + biases1
  z2 = tf.matmul(z1, weights2)+biases2
  test_prediction = tf.nn.softmax(z2)
  mean_prior = tf.reduce_mean(priors)

In [None]:
num_steps = 2001
num_rep = 3

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    step = 0;
    these_priors= np.ones([batch_size,], dtype = 'float32')
    while step < num_steps:
        step=step+1
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        #     feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, priors_priors}
        # submini_batch
        #  initialize priors

        these_priors= np.ones([batch_size,], dtype = 'float32')

        for j in range(num_rep):
            feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, priors : these_priors}
            _, l, predictions = session.run(
              [optimizer, loss, train_prediction], feed_dict=feed_dict)

        if (step % 50 == 0):
          print("Minibatch loss at step %d: %f" % (step, l))
          print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
          print("Validation accuracy: %.1f%%" % accuracy(
            valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

In [None]:
reset_selective priors

In [None]:
batch_labels.shape

In [None]:
feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, priors : these_priors}


In [None]:
train_dataset.dtype

In [None]:
these_priors.dtype

In [None]:


# With gradient descent training, even this much data is prohibitive.
# Subset the training data for faster turnaround.
train_subset = 10000
num_neurons = image_size*image_size*4

graph = tf.Graph()
with graph.as_default():

  # Input data.
  # Load the training, validation and test data into constants that are
  # attached to the graph.
  tf_train_dataset = tf.constant(train_dataset[:train_subset, :])
  tf_train_labels = tf.constant(train_labels[:train_subset])
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  # These are the parameters that we are going to be training. The weight
  # matrix will be initialized using random valued following a (truncated)
  # normal distribution. The biases get initialized to zero.
  #with tf.name_scope('hidden') as scope:

  layer_name = 'layer1'
  weights1 = tf.Variable(
    tf.random_normal([image_size * image_size, num_neurons], mean = 0,
                     stddev = np.sqrt(6/(image_size * image_size+num_neurons)),name='weights1')
    )
  variable_summaries(weights1, layer_name + '/weights',0)
  biases1 = tf.Variable(tf.zeros([num_neurons]),name='biases1')
  variable_summaries(biases1, layer_name + '/biases1',0)  
  layer_name = 'layer2'
  weights2 = tf.Variable(tf.truncated_normal([num_neurons, num_labels], mean = 0,
                     stddev = np.sqrt(6/(num_neurons+ num_neurons)),name='weights2')
                        )
  variable_summaries(weights1, layer_name + '/weights2',0)
  biases2 = tf.Variable(tf.zeros([num_labels]),name='biases2')
  variable_summaries(biases1, layer_name + '/biases2',0)  
  #Priors
  priors = tf.Variable(tf.ones([train_subset])*0.90,name='priors')
  variable_summaries(priors, layer_name + '/priors',0)  

  beta = .001;
  #   loss = tf.reduce_mean((hidden1-tf_train_labels))
  z1 = tf.nn.sigmoid(tf.matmul(tf_train_dataset, weights1) + biases1)
  z2 = tf.matmul(z1, weights2)+biases2
  loss = tf.reduce_mean(tf.mul(tf.nn.relu6(priors*6),tf.nn.softmax_cross_entropy_with_logits(z2,tf_train_labels))
                       )/tf.reduce_mean(tf.nn.relu6(priors*6)+beta*tf.nn.l2_loss(weights1) + beta*tf.nn.l2_loss(weights2))

#   loss = tf.reduce_mean(tf.mul(
#             tf.abs(1*priors),tf.nn.softmax_cross_entropy_with_logits(z2, tf_train_labels)
#                                 )
#                        )/tf.reduce_mean(tf.abs(1*priors))
  

  # Optimizer.
  # We are going to find the minimum of this loss using gradient descent.
#   optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  global_step = tf.Variable(0, trainable=False)
  starter_learning_rate = 2.0
  learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                           1000, .2, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
#   optimizer = tf.train.AdagradOptimizer(learning_rate=0.5, initial_accumulator_value=0.1, use_locking=False, name='Adagrad')
#   optimizer = tf.train.AdamOptimizer(learning_rate=0.5).minimize(loss) # Adam Optimizer
  # Predictions for the training, validation, and test data.
  # These are not part of training, but merely here so that we can report
  # accuracy figures as we train.
 # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(z2)
  z1 = tf.matmul(tf_valid_dataset, weights1) + biases1
  z2 = tf.matmul(z1, weights2)+biases2
  valid_prediction = tf.nn.softmax(z2)
  z1 = tf.matmul(tf_test_dataset, weights1) + biases1
  z2 = tf.matmul(z1, weights2)+biases2
  test_prediction = tf.nn.softmax(z2)

In [None]:
#GOOD VERSION
num_steps = 50
training_accuracy = []
validation_accuracy = []
with tf.Session(graph=graph) as session:
    # This is a one-time operation which ensures the parameters get initialized as
    # we described in the graph: random weights for the matrix, zeros for the
    # biases. 
    tf.initialize_all_variables().run()

    
    summary_op = tf.merge_all_summaries()
    train_dir = 'tmp'
    summary_writer = tf.train.SummaryWriter(train_dir, session.graph)  
    print('Initialized')
    for step in range(num_steps):
        # Run the computations. We tell .run() that we want to run the optimizer,
        # and get the loss value and the training predictions returned as numpy
        # arrays.
        _, l, predictions = session.run([optimizer, loss, train_prediction])
        if (step % 10 == 0):
            summary_str = session.run(summary_op)
            summary_writer.add_summary(summary_str, step)  
            print('Loss at step %d: %f' % (step, l))
            acc = accuracy(predictions, train_labels[:train_subset, :])
            print('Training accuracy: %.1f%%' % acc)
            training_accuracy.append(acc)
            acc =  accuracy(valid_prediction.eval(), valid_labels)
            print('Validation accuracy: %.1f%%' % acc)
            validation_accuracy.append(acc)
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))



1024 hidden Neurons: same accuracy as logistic function?
Loss at step 190: 0.706768
Training accuracy: 82.2%
Validation accuracy: 80.0%
Test accuracy: 87.0%

1024 hidden Neurons: exponential decay at 2.0 start and 0.2 decay rate. Using regularization w/ beta = .1;
Loss at step 290: 0.029613
Training accuracy: 81.4%
Validation accuracy: 78.1%
Test accuracy: 84.7% (used 410 and it still was only ~85%) 

1024 hidden Neurons: exponential decay at 2.0 start and 0.2 decay rate. Using regularization w/ beta = .5;
Loss at step 290: 0.006009
Training accuracy: 77.5%
Validation accuracy: 75.1%

1024 hidden Neurons: exponential decay at 2.0 start and 0.2 decay rate. Using regularization w/ beta = .01;
Loss at step 290: 0.148112
Training accuracy: 79.6%
Validation accuracy: 78.4%
and at 
Loss at step 490: 0.108916
Training accuracy: 81.5%
Validation accuracy: 79.3%
Test accuracy: 88.0%

increasing still... 

1024 hidden Neurons: exponential decay at 5.0 start and 0.2 decay rate. Using regularization w/ beta = .1;
Using a sigmoid function for the priors, didn't go anywhere fast.


3136 hidden Neurons: Adam optimizer.  Using regularization w/ beta = .1;
Didn't move anywhere...

# Here is the attempt at an actually deep nn. 

In [None]:
# TRYING TO GET BETTER ACCURACY!

# With gradient descent training, even this much data is prohibitive.
# Subset the training data for faster turnaround.
train_subset = 10000
num_neurons = image_size*image_size*4
graph = tf.Graph()

def model_1d(data, dropout):
     # Hidden Layer Training computation.
  vals = tf.matmul(data, weights1) + biases1
  hidden1 = tf.nn.relu(vals)
  if dropout:
        hidden1 = tf.nn.dropout(tf.nn.relu(vals),.5)            
  vals2 = tf.matmul(hidden1,weights2)+biases2
  hidden2 = tf.nn.relu(vals2)
  if dropout:
        hidden2 = tf.nn.relu(vals2)
  vals3 = tf.matmul(tf.nn.dropout(hidden2,.25),weights3)+biases3
#  hidden3 = tf.nn.relu(vals3)
#  if dropout:
#        hidden3 = tf.nn.relu(vals3)
#  vals4 = tf.matmul(tf.nn.dropout(hidden3,.25),weights4)+biases4
  return vals3

with graph.as_default():

  # Input data.
  # Load the training, validation and test data into constants that are
  # attached to the graph.
  tf_train_dataset = tf.constant(train_dataset[:train_subset, :])
  tf_train_labels = tf.constant(train_labels[:train_subset])
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  # These are the parameters that we are going to be training. The weight
  # matrix will be initialized using random valued following a (truncated)
  # normal distribution. The biases get initialized to zero.
  #with tf.name_scope('hidden') as scope:

  layer_name = 'layer1'
  weights1 = tf.Variable(
    tf.random_normal([image_size * image_size, num_neurons], mean = 0,
                     stddev = np.sqrt(6/(image_size * image_size+num_neurons)),name='weights1')
    )
  variable_summaries(weights1, layer_name + '/weights',0)
  biases1 = tf.Variable(tf.zeros([num_neurons]),name='biases1')
  variable_summaries(biases1, layer_name + '/biases1',0)  
  layer_name = 'layer2'
  weights2 = tf.Variable(tf.truncated_normal([num_neurons, num_labels], mean = 0,
                     stddev = np.sqrt(6/(num_neurons+ num_neurons)),name='weights2')
                        )
  variable_summaries(weights1, layer_name + '/weights2',0)
  biases2 = tf.Variable(tf.zeros([num_labels]),name='biases2')  
#   variable_summaries(biases1, layer_name + '/biases2',0)  
  #Priors
  priors = tf.Variable(tf.ones([train_subset])*0.90,name='priors')
#   variable_summaries(priors, layer_name + '/priors',0)  

  beta = .001;
  #   loss = tf.reduce_mean((hidden1-tf_train_labels))
  z1 = tf.nn.sigmoid(tf.matmul(tf_train_dataset, weights1) + biases1)
  z2 = tf.matmul(z1, weights2)+biases2
  loss = tf.reduce_mean(tf.mul(tf.nn.relu6(priors*6),tf.nn.softmax_cross_entropy_with_logits(z2,tf_train_labels))
                       )/tf.reduce_mean(tf.nn.relu6(priors*6)+beta*tf.nn.l2_loss(weights1) + beta*tf.nn.l2_loss(weights2))

  # Optimizer.
  # We are going to find the minimum of this loss using gradient descent.
#   optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  global_step = tf.Variable(0, trainable=False)
  starter_learning_rate = 2.0
  learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                           1000, .2, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
#   optimizer = tf.train.AdagradOptimizer(learning_rate=0.5, initial_accumulator_value=0.1, use_locking=False, name='Adagrad')
#   optimizer = tf.train.AdamOptimizer(learning_rate=0.5).minimize(loss) # Adam Optimizer
  # Predictions for the training, validation, and test data.
  # These are not part of training, but merely here so that we can report
  # accuracy figures as we train.
 # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(z2)
  z1 = tf.matmul(tf_valid_dataset, weights1) + biases1
  z2 = tf.matmul(z1, weights2)+biases2
  valid_prediction = tf.nn.softmax(z2)
  z1 = tf.matmul(tf_test_dataset, weights1) + biases1
  z2 = tf.matmul(z1, weights2)+biases2
  test_prediction = tf.nn.softmax(z2)

num_steps = 50
training_accuracy = []
validation_accuracy = []
with tf.Session(graph=graph) as session:
    # This is a one-time operation which ensures the parameters get initialized as
    # we described in the graph: random weights for the matrix, zeros for the
    # biases. 
    tf.initialize_all_variables().run()

    
    summary_op = tf.merge_all_summaries()
    train_dir = 'tmp'
    summary_writer = tf.train.SummaryWriter(train_dir, session.graph)  
    print('Initialized')
    for step in range(num_steps):
        # Run the computations. We tell .run() that we want to run the optimizer,
        # and get the loss value and the training predictions returned as numpy
        # arrays.
        _, l, predictions = session.run([optimizer, loss, train_prediction])
        if (step % 10 == 0):
            summary_str = session.run(summary_op)
            summary_writer.add_summary(summary_str, step)  
            print('Loss at step %d: %f' % (step, l))
            acc = accuracy(predictions, train_labels[:train_subset, :])
            print('Training accuracy: %.1f%%' % acc)
            training_accuracy.append(acc)
            acc =  accuracy(valid_prediction.eval(), valid_labels)
            print('Validation accuracy: %.1f%%' % acc)
            validation_accuracy.append(acc)
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))


In [None]:
def model_1d(data, dropout):
     # Hidden Layer Training computation.
  vals = tf.matmul(data, weights1) + biases1
  hidden1 = tf.nn.relu(vals)
  if dropout:
        hidden1 = tf.nn.dropout(tf.nn.relu(vals),.5)            
  vals2 = tf.matmul(hidden1,weights2)+biases2
  hidden2 = tf.nn.relu(vals2)
  if dropout:
        hidden2 = tf.nn.relu(vals2)
  vals3 = tf.matmul(tf.nn.dropout(hidden2,.25),weights3)+biases3
#  hidden3 = tf.nn.relu(vals3)
#  if dropout:
#        hidden3 = tf.nn.relu(vals3)
#  vals4 = tf.matmul(tf.nn.dropout(hidden3,.25),weights4)+biases4
  return vals3

batch_size = 200
#num_neurons = [image_size * image_size, 1024, num_labels]
num_neurons = [image_size * image_size, 1024, 512, num_labels]
graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  weights1 = tf.Variable(
    tf.truncated_normal([num_neurons[0], num_neurons[1]], 0, 2.0/np.sqrt(num_neurons[0])))
  biases1 = tf.Variable(tf.zeros([num_neurons[1]]))
  
  weights2 = tf.Variable(
    tf.truncated_normal([num_neurons[1], num_neurons[2]], 0, 2.0/np.sqrt(num_neurons[1])))
  biases2 = tf.Variable(tf.zeros([num_neurons[2]]))

  weights3 = tf.Variable(
    tf.truncated_normal([num_neurons[2], num_neurons[3]], 0, 2.0/np.sqrt(num_neurons[2])))
  biases3 = tf.Variable(tf.zeros([num_neurons[3]]))

#  weights4 = tf.Variable(
#    tf.truncated_normal([num_neurons[3], num_neurons[4]], 0, 2.0/np.sqrt(num_neurons[3])))
#  biases4 = tf.Variable(tf.zeros([num_neurons[4]]))


  beta = 5e-4;
#  logits = (tf.matmul(tf_train_dataset, weights) + biases)
  logits = model_1d(tf_train_dataset, 1)
#  loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))
  loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)+
                        beta*tf.nn.l2_loss(weights1) + beta*tf.nn.l2_loss(weights2))
    
  # Optimizer.
  global_step = tf.Variable(0, trainable=False)
  starter_learning_rate = 1e-3
  learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                           1000, 0.98, staircase=True)
  #learning_rate = 1e-2
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model_1d(tf_valid_dataset, 0))
  test_prediction = tf.nn.softmax(model_1d(tf_test_dataset, 0))

In [None]:
things to do: switch up training/validation and only have those and then use test accuracy? 
    RANDOM MINI BATCHES AND SAVE P VALUE. MAKE SURE TO NOT LOSE DETAILS AND HAVE HIGH COVERAGE OF OBSERVATIONS

In [None]:
import numpy as np
import matplotlib as mpl
mpl.use('TkAgg')
import matplotlib.pyplot as plt

# .pyplot as plt
validation_accuracy = np.ones(10)
plt.plot([np.mean(validation_accuracy[i]) for i in range(len(validation_accuracy))])
plt.show()

In [None]:
TRYING WHAT THAT ONE PAPER DID