Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

  from ._conv import register_converters as _register_converters


First reload the data we generated in `1_notmnist.ipynb`.

In [2]:
pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']
  del save  # hint to help gc free up memory
  print('Training set', train_dataset.shape, train_labels.shape)
  print('Validation set', valid_dataset.shape, valid_labels.shape)
  print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 1 to [0.0, 1.0, 0.0 ...], 2 to [0.0, 0.0, 1.0 ...]
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [4]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

In [7]:
batch_size = 128

graph2 = tf.Graph()
with graph2.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  whx= tf.Variable(
    tf.truncated_normal([image_size * image_size,1024]))
  why=tf.Variable(
    tf.truncated_normal([1024, num_labels]))
  biases = tf.Variable(tf.zeros([num_labels]))
  
  # Training computation.
  hidden=tf.nn.relu(tf.matmul(tf_train_dataset,whx))
  logits = tf.matmul(hidden, why) + biases
  l2_loss = tf.nn.l2_loss(whx) + tf.nn.l2_loss(why)  # 正则项
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))+0.001 * l2_loss # 惩罚项系数
  
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(tf.matmul(
    tf.nn.relu(tf.matmul(tf_valid_dataset, whx)),why) + biases)
  test_prediction =  tf.nn.softmax(tf.matmul(
    tf.nn.relu(tf.matmul(tf_test_dataset, whx)),why) + biases)

In [8]:
num_steps = 3001

with tf.Session(graph=graph2) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
      print("loss is : ",l )
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 635.205200
Minibatch accuracy: 10.9%
Validation accuracy: 16.1%
loss is :  635.2052
Minibatch loss at step 500: 330.395172
Minibatch accuracy: 73.4%
Validation accuracy: 79.7%
loss is :  330.39517
Minibatch loss at step 1000: 291.061584
Minibatch accuracy: 81.2%
Validation accuracy: 81.7%
loss is :  291.06158
Minibatch loss at step 1500: 282.282501
Minibatch accuracy: 82.8%
Validation accuracy: 82.5%
loss is :  282.2825
Minibatch loss at step 2000: 263.089600
Minibatch accuracy: 85.2%
Validation accuracy: 83.5%
loss is :  263.0896
Minibatch loss at step 2500: 249.189240
Minibatch accuracy: 82.8%
Validation accuracy: 83.9%
loss is :  249.18924
Minibatch loss at step 3000: 236.119400
Minibatch accuracy: 83.6%
Validation accuracy: 83.7%
loss is :  236.1194
Test accuracy: 89.8%


比之前的test accuracy 提高了0.8%

---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

In [9]:
step=1
offset_range = 1000
(step * batch_size) % offset_range

128

In [12]:
step=2
offset_range = 1000
(step * batch_size) % offset_range

256

In [16]:
ls=[]
for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    #offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    offset_range = 1000
    offset = (step * batch_size) % offset_range
    ls.append( offset)
    print(offset)

0
128
256
384
512
640
768
896
24
152
280
408
536
664
792
920
48
176
304
432
560
688
816
944
72
200
328
456
584
712
840
968
96
224
352
480
608
736
864
992
120
248
376
504
632
760
888
16
144
272
400
528
656
784
912
40
168
296
424
552
680
808
936
64
192
320
448
576
704
832
960
88
216
344
472
600
728
856
984
112
240
368
496
624
752
880
8
136
264
392
520
648
776
904
32
160
288
416
544
672
800
928
56
184
312
440
568
696
824
952
80
208
336
464
592
720
848
976
104
232
360
488
616
744
872
0
128
256
384
512
640
768
896
24
152
280
408
536
664
792
920
48
176
304
432
560
688
816
944
72
200
328
456
584
712
840
968
96
224
352
480
608
736
864
992
120
248
376
504
632
760
888
16
144
272
400
528
656
784
912
40
168
296
424
552
680
808
936
64
192
320
448
576
704
832
960
88
216
344
472
600
728
856
984
112
240
368
496
624
752
880
8
136
264
392
520
648
776
904
32
160
288
416
544
672
800
928
56
184
312
440
568
696
824
952
80
208
336
464
592
720
848
976
104
232
360
488
616
744
872
0
128
256
384
512
640
768
896


336
464
592
720
848
976
104
232
360
488
616
744
872
0
128
256
384
512
640
768
896
24
152
280
408
536
664
792
920
48
176
304
432
560
688
816
944
72
200
328
456
584
712
840
968
96
224
352
480
608
736
864
992
120
248
376
504
632
760
888
16
144
272
400
528
656
784
912
40
168
296
424
552
680
808
936
64
192
320
448
576
704
832
960
88
216
344
472
600
728
856
984
112
240
368
496
624
752
880
8
136
264
392
520
648
776
904
32
160
288
416
544
672
800
928
56
184
312
440
568
696
824
952
80
208
336
464
592
720
848
976
104
232
360
488
616
744
872
0
128
256
384
512
640
768
896
24
152
280
408
536
664
792
920
48
176
304
432
560
688
816
944
72
200
328
456
584
712
840
968
96
224
352
480
608
736
864
992
120
248
376
504
632
760
888
16
144
272
400
528
656
784
912
40
168
296
424
552
680
808
936
64
192
320
448
576
704
832
960
88
216
344
472
600
728
856
984
112
240
368
496
624
752
880
8
136
264
392
520
648
776
904
32
160
288
416
544
672
800
928
56
184
312
440
568
696
824
952
80
208
336
464
592
720
848
976
104
23

In [21]:
min(set(ls))

0

In [20]:
max(set(ls))

992

In [13]:
# offset_range = 1000
# offset = (step * batch_size) % offset_range
# num_steps = 3001  
#缩小训练数据范围：将把batch数据的起点offset的可选范围变小（只能选择0-992之间的数据）：
with tf.Session(graph=graph2) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    #offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    offset_range = 1000
    offset = (step * batch_size) % offset_range
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
      print("loss is : ",l )
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 651.039490
Minibatch accuracy: 11.7%
Validation accuracy: 18.0%
loss is :  651.0395
Minibatch loss at step 500: 300.548981
Minibatch accuracy: 97.7%
Validation accuracy: 70.1%
loss is :  300.54898
Minibatch loss at step 1000: 284.719604
Minibatch accuracy: 100.0%
Validation accuracy: 70.3%
loss is :  284.7196
Minibatch loss at step 1500: 270.833313
Minibatch accuracy: 100.0%
Validation accuracy: 70.3%
loss is :  270.8333
Minibatch loss at step 2000: 257.624298
Minibatch accuracy: 100.0%
Validation accuracy: 70.4%
loss is :  257.6243
Minibatch loss at step 2500: 245.059509
Minibatch accuracy: 100.0%
Validation accuracy: 70.4%
loss is :  245.05951
Minibatch loss at step 3000: 233.107529
Minibatch accuracy: 100.0%
Validation accuracy: 70.4%
loss is :  233.10753
Test accuracy: 76.8%


过拟合了

---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

In [5]:
batch_size = 128

graph2 = tf.Graph()
with graph2.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  whx= tf.Variable(
    tf.truncated_normal([image_size * image_size,1024]))
  why=tf.Variable(
    tf.truncated_normal([1024, num_labels]))
  biases = tf.Variable(tf.zeros([num_labels]))
  
  # Training computation.
  hidden=tf.nn.relu(tf.matmul(tf_train_dataset,whx))
  hidden_drop=tf.nn.dropout(hidden,0.5)    # relu 出来的结果使得部分结果为0

  logits = tf.matmul(hidden_drop, why) + biases
  l2_loss = tf.nn.l2_loss(whx) + tf.nn.l2_loss(why)  # 正则项
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))+0.001 * l2_loss # 惩罚项系数
  
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(tf.matmul(
    tf.nn.relu(tf.matmul(tf_valid_dataset, whx)),why) + biases)
  test_prediction =  tf.nn.softmax(tf.matmul(
    tf.nn.relu(tf.matmul(tf_test_dataset, whx)),why) + biases)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [6]:
num_steps = 3001

with tf.Session(graph=graph2) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
      print("loss is : ",l )
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 838.317749
Minibatch accuracy: 10.2%
Validation accuracy: 16.6%
loss is :  838.31775
Minibatch loss at step 500: 346.784241
Minibatch accuracy: 72.7%
Validation accuracy: 83.7%
loss is :  346.78424
Minibatch loss at step 1000: 299.813782
Minibatch accuracy: 83.6%
Validation accuracy: 84.4%
loss is :  299.81378
Minibatch loss at step 1500: 287.622314
Minibatch accuracy: 81.2%
Validation accuracy: 84.6%
loss is :  287.6223
Minibatch loss at step 2000: 264.592499
Minibatch accuracy: 81.2%
Validation accuracy: 84.6%
loss is :  264.5925
Minibatch loss at step 2500: 248.298431
Minibatch accuracy: 80.5%
Validation accuracy: 84.1%
loss is :  248.29843
Minibatch loss at step 3000: 233.701950
Minibatch accuracy: 78.9%
Validation accuracy: 84.4%
loss is :  233.70195
Test accuracy: 89.9%


---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---


1.try learning rate decay

In [13]:
batch_size = 128

graph2 = tf.Graph()
with graph2.as_default():
    

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  whx= tf.Variable(
    tf.truncated_normal([image_size * image_size,1024]))
  why=tf.Variable(
    tf.truncated_normal([1024, num_labels]))
  biases = tf.Variable(tf.zeros([num_labels]))
  
  # Training computation.
  hidden=tf.nn.relu(tf.matmul(tf_train_dataset,whx))
  hidden_drop=tf.nn.dropout(hidden,0.5)    # relu 出来的结果使得部分结果为0

  logits = tf.matmul(hidden_drop, why) + biases
  l2_loss = tf.nn.l2_loss(whx) + tf.nn.l2_loss(why)  # 正则项
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))+0.001 * l2_loss # 惩罚项系数
    
  global_step = tf.Variable(0)  # count the number of steps taken.
  learning_rate = tf.train.exponential_decay(0.05, global_step,100,0.96)   # decay_rate=0.96； decay_step=100
  
  
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(tf.matmul(
    tf.nn.relu(tf.matmul(tf_train_dataset, whx)),why) + biases)    # 注意预测时不用dropout
  valid_prediction = tf.nn.softmax(tf.matmul(
    tf.nn.relu(tf.matmul(tf_valid_dataset, whx)),why) + biases)
  test_prediction =  tf.nn.softmax(tf.matmul(
    tf.nn.relu(tf.matmul(tf_test_dataset, whx)),why) + biases)

In [14]:
num_steps = 3001

with tf.Session(graph=graph2) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
      print("loss is : ",l )
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 840.418335
Minibatch accuracy: 4.7%
Validation accuracy: 16.6%
loss is :  840.41833
Minibatch loss at step 500: 344.335327
Minibatch accuracy: 78.9%
Validation accuracy: 83.4%
loss is :  344.33533
Minibatch loss at step 1000: 303.889282
Minibatch accuracy: 88.3%
Validation accuracy: 84.3%
loss is :  303.88928
Minibatch loss at step 1500: 295.816589
Minibatch accuracy: 85.9%
Validation accuracy: 84.8%
loss is :  295.8166
Minibatch loss at step 2000: 288.064484
Minibatch accuracy: 85.9%
Validation accuracy: 84.9%
loss is :  288.06448
Minibatch loss at step 2500: 276.814270
Minibatch accuracy: 86.7%
Validation accuracy: 85.1%
loss is :  276.81427
Minibatch loss at step 3000: 273.223267
Minibatch accuracy: 84.4%
Validation accuracy: 85.5%
loss is :  273.22327
Test accuracy: 91.0%


 看到准确率提高到了91%

2.深层神经网络

In [27]:
batch_size = 128

graph2 = tf.Graph()
with graph2.as_default():
    
    
# Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)


  #第一层的权重初始化 
  # Variables.
  whx1= tf.Variable(
    tf.truncated_normal([image_size * image_size,1024],stddev=2/image_size * image_size))
  bia1= tf.Variable(tf.zeros([1024]))

  # Training computation.

  weights=[]
  bias=[]
  hidden_cur_node=1024

  # 中间层的权重初始化
  for i in range(layers-2):
    if  hidden_cur_node>2:
        hidden_next_node=int(hidden_cur_node/2)
    else:
        hidden_next_node=2
    weight=tf.Variable(tf.truncated_normal([ hidden_cur_node, hidden_next_node],stddev=2/ hidden_cur_node))
    bia= tf.Variable(tf.zeros([hidden_next_node]))
    weights.append(weight)
    bias.append(bia)
    hidden_cur_node=hidden_next_node


   # 第一层的前向传播
  # first  for training 
  keep_prob=0.5
  hidden=tf.nn.relu(tf.matmul(tf_train_dataset,whx1)+bia1)
  hidden_drop=tf.nn.dropout(hidden,keep_prob)    # relu 出来的结果使得部分结果为0 \


#first for valid
  hidden_valid=tf.nn.relu(tf.matmul(tf_valid_dataset, whx1)+ bia1)


# first for test

  hidden_test=tf.nn.relu(tf.matmul(tf_test_dataset, whx1)+ bia1) 

#中间层的前向传播
  for i in range(layers-2):
        
        hidden=tf.nn.relu(tf.matmul(hidden,weights[i])+bias[i])
        
        hidden_drop=tf.nn.relu(tf.matmul(hidden_drop,weights[i])+bias[i])
        keep_prob += 0.5 * i / (layers + 1) # 每一次DropOut时，因为后面的layer得到的信息越重要，需要动态调整丢弃的比例，到后面的layer，丢弃的比例要减小
        hidden_drop==tf.nn.dropout(hidden_drop,keep_prob)
        
        
        hidden_valid=tf.nn.relu(tf.matmul(hidden_valid,weights[i])+ bias[i])
        hidden_test=tf.nn.relu(tf.matmul(hidden_test, weights[i])+ bias[i])
  
  #最后一层
  why= tf.Variable(
    tf.truncated_normal([hidden_cur_node,num_labels],stddev=2/hidden_cur_node))
  biay= tf.Variable(tf.zeros([num_labels]))
  
  logits = tf.matmul(hidden_drop, why) + biay
    
 # prediction:
  logits_pre = tf.matmul(hidden, why) + biay
  valid_pre=tf.matmul(hidden_valid, why) + biay
  test_pre=tf.matmul(hidden_test, why) + biay
  
  
   
  
  
  l2_loss = tf.nn.l2_loss(whx1) + tf.nn.l2_loss(why)  # 正则项
  for  i in range(layers-2):
    l2_loss+= tf.nn.l2_loss(weights[i])
    
    
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))+0.001 * l2_loss # 惩罚项系数

  global_step = tf.Variable(0)  # count the number of steps taken.
  learning_rate = tf.train.exponential_decay(0.05, global_step,100,0.96)   # decay_rate=0.96； decay_step=100


  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits_pre)
  valid_prediction = tf.nn.softmax(valid_pre)
  test_prediction =  tf.nn.softmax(test_pre)

In [28]:
layers=7

In [30]:
num_steps = 10000

with tf.Session(graph=graph2) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
      print("loss is : ",l )
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 1247.052368
Minibatch accuracy: 7.8%
Validation accuracy: 10.0%
loss is :  1247.0524
Minibatch loss at step 500: 1191.995605
Minibatch accuracy: 12.5%
Validation accuracy: 10.0%
loss is :  1191.9956
Minibatch loss at step 1000: 1148.437622
Minibatch accuracy: 22.7%
Validation accuracy: 22.8%
loss is :  1148.4376
Minibatch loss at step 1500: 1113.266602
Minibatch accuracy: 82.0%
Validation accuracy: 81.0%
loss is :  1113.2666
Minibatch loss at step 2000: 1086.358154
Minibatch accuracy: 85.2%
Validation accuracy: 83.5%
loss is :  1086.3582
Minibatch loss at step 2500: 1064.787720
Minibatch accuracy: 85.9%
Validation accuracy: 85.2%
loss is :  1064.7877
Minibatch loss at step 3000: 1047.593994
Minibatch accuracy: 84.4%
Validation accuracy: 85.6%
loss is :  1047.594
Minibatch loss at step 3500: 1033.870972
Minibatch accuracy: 82.8%
Validation accuracy: 86.3%
loss is :  1033.871
Minibatch loss at step 4000: 1022.716675
Minibatch accuracy: 78.1%
Validati

用7层网络增加了训练次数，准确率到了93