In [1]:
import copy

import tensorflow as tf

In [2]:
# Load MNIST through Keras. load_data() yields a (train, test) pair, and each
# element is itself an (images, labels) pair.
mnist = tf.keras.datasets.mnist
train_split, test_split = mnist.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

In [3]:
# Rescale the pixel intensities by 1/255 so the inputs become small floats
# (assumes the raw pixels are in 0-255 -- TODO confirm with min()/max()).
# NOTE: this cell rebinds x_train/x_test, so running it a second time without
# reloading the data divides by 255 again.
x_train = x_train / 255.0
x_test = x_test / 255.0

In [4]:
# Show how many training labels there are and what a single label looks like
# (one value per line, same as two separate print calls).
print(y_train.shape, y_train[0], sep='\n')

(60000,)
5


###  Building the Network...

In [5]:
# First network sketch, built layer by layer: flatten each 28x28 image, one
# 128-unit ReLU hidden layer, 20% dropout, then a 10-unit output layer with no
# activation.
# NOTE: `model` is reassigned in the "Model #1" section before this network is
# ever compiled or trained, so this definition is illustrative only.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(28, 28)))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(10))

In [6]:
# Scratch check of the flattening step, done on a copy so that `x_train` keeps
# its original shape: every image should turn into one row of 28*28 = 784 values.
# (`copy` is imported with the other imports at the top of the notebook.)
x_train_copy = copy.copy(x_train)
print(x_train_copy.shape)
print(x_train_copy[0].flatten().shape)
# Take the row count from the array itself and let -1 infer the column count,
# rather than hard-coding 60000; the result is the same for the full dataset
# and still works if a smaller sample is loaded.
x_train_copy = x_train_copy.reshape(x_train_copy.shape[0], -1)
print(x_train_copy.shape)
print(x_train.shape)

(60000, 28, 28)
(784,)
(60000, 784)
(60000, 28, 28)


In [7]:
# Flatten every image into a single row so the arrays match the
# input_shape=(784,) declared by the Dense-first models below.
# The sample count comes from the array and -1 infers the 28*28 = 784 columns,
# so nothing depends on the hard-coded 60000 / 10000 split sizes. Re-running
# this cell on already-flattened arrays leaves them unchanged.
x_train = x_train.reshape(len(x_train), -1)
x_test = x_test.reshape(len(x_test), -1)

### Model #1

In [8]:
# Loss shared by every model below. The labels are plain integer class ids
# (y_train[0] printed as 5 above), hence the "sparse" variant, and
# from_logits=True because each model ends in Dense(10) with no activation,
# i.e. the networks output raw scores rather than probabilities.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [9]:
# Fix TensorFlow's global seed before the layers are created so that the
# initial weights (and seed-dependent ops run afterwards) start from the same
# state on every run; without it the reported accuracy changes between runs.
# Bit-exact repeatability on GPU is not guaranteed by this alone.
tf.random.set_seed(42)

# Model #1: one 16-unit SELU hidden layer on the flattened 784-pixel input,
# followed by a 10-unit output layer that emits raw logits (no activation).
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(16, activation='selu', input_shape=(784,)),
    tf.keras.layers.Dense(10),
])

In [10]:
# Model #1 training setup: Adam optimizer, the shared sparse cross-entropy
# loss, and accuracy as the reported metric.
model.compile(
    optimizer='Adam',
    loss=loss_fn,
    metrics=['accuracy'],
)

In [11]:
# Train Model #1 for five epochs on the flattened training images.
model.fit(x=x_train, y=y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1ea4d244f08>

In [12]:
# Score Model #1 on the test split; the displayed result is [loss, accuracy].
model.evaluate(x=x_test, y=y_test, verbose=2)

313/313 - 0s - loss: 0.2092 - accuracy: 0.9428


[0.2091987580060959, 0.942799985408783]

### Model  #2

In [13]:
# Reset TensorFlow's global seed before building this model so its initial
# weights do not depend on how many models were created earlier in the session
# and the result can be repeated from run to run.
# Bit-exact repeatability on GPU is not guaranteed by this alone.
tf.random.set_seed(42)

# Model #2: one 16-unit ReLU hidden layer on the 784-value input, then a
# 10-unit output layer producing raw logits.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(16, activation='relu', input_shape=(784,)),
    tf.keras.layers.Dense(10),
])

In [14]:
# Model #2 training setup: RMSProp optimizer instead of Adam; loss and metric
# are the same as for the other models.
model.compile(
    optimizer='RMSProp',
    loss=loss_fn,
    metrics=['accuracy'],
)

In [15]:
# Train Model #2 for five epochs on the flattened training images.
model.fit(x=x_train, y=y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1ea4d2935c8>

In [16]:
# Score Model #2 on the test split; the displayed result is [loss, accuracy].
model.evaluate(x=x_test, y=y_test, verbose=2)

313/313 - 0s - loss: 0.1967 - accuracy: 0.9422


[0.1966744214296341, 0.9422000050544739]

### Model #3

In [17]:
# Reset TensorFlow's global seed before building this model so its initial
# weights do not depend on how many models were created earlier in the session
# and the result can be repeated from run to run.
# Bit-exact repeatability on GPU is not guaranteed by this alone.
tf.random.set_seed(42)

# Model #3: three ReLU hidden layers that narrow from 28 to 14 to 7 units,
# followed by the 10-unit logits layer.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(28, activation='relu', input_shape=(784,)),
    tf.keras.layers.Dense(14, activation='relu'),
    tf.keras.layers.Dense(7, activation='relu'),
    tf.keras.layers.Dense(10),
])

In [18]:
# Model #3 training setup: Adam optimizer, shared sparse cross-entropy loss,
# accuracy metric.
model.compile(
    optimizer='adam',
    loss=loss_fn,
    metrics=['accuracy'],
)

In [19]:
# Train Model #3 for five epochs on the flattened training images.
model.fit(x=x_train, y=y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1ea4f6e28c8>

In [20]:
# Score Model #3 on the test split; the displayed result is [loss, accuracy].
model.evaluate(x=x_test, y=y_test, verbose=2)

313/313 - 0s - loss: 0.1498 - accuracy: 0.9587


[0.14977861940860748, 0.9587000012397766]

### Model #4

In [21]:
# Reset TensorFlow's global seed before building this model so its initial
# weights do not depend on how many models were created earlier in the session
# and the result can be repeated from run to run.
# Bit-exact repeatability on GPU is not guaranteed by this alone.
tf.random.set_seed(42)

# Model #4: a single wide hidden layer (784 tanh units, as many as there are
# input pixels) followed by the 10-unit logits layer.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(784, activation='tanh', input_shape=(784,)),
    tf.keras.layers.Dense(10),
])

In [22]:
# Model #4 training setup: FTRL optimizer; loss and metric are the same as for
# the other models.
model.compile(
    optimizer='ftrl',
    loss=loss_fn,
    metrics=['accuracy'],
)

In [23]:
# Train Model #4 for five epochs on the flattened training images.
model.fit(x=x_train, y=y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1ea4fc52c08>

In [24]:
# Score Model #4 on the test split; the displayed result is [loss, accuracy].
model.evaluate(x=x_test, y=y_test, verbose=2)

313/313 - 0s - loss: 0.6225 - accuracy: 0.8219


[0.622535228729248, 0.8219000101089478]

### Model #5

In [25]:
# Reset TensorFlow's global seed before building this model so its initial
# weights do not depend on how many models were created earlier in the session
# and the result can be repeated from run to run.
# Bit-exact repeatability on GPU is not guaranteed by this alone.
tf.random.set_seed(42)

# Model #5: one 28-unit hidden layer with a sigmoid activation, followed by
# the 10-unit logits layer. The sigmoid is used only inside the network; the
# output layer still has no activation.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(28, activation='sigmoid', input_shape=(784,)),
    tf.keras.layers.Dense(10),
])

In [26]:
# Model #5 training setup: Adam optimizer, shared sparse cross-entropy loss,
# accuracy metric.
model.compile(
    optimizer='adam',
    loss=loss_fn,
    metrics=['accuracy'],
)

In [27]:
# Train Model #5 for five epochs on the flattened training images.
model.fit(x=x_train, y=y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1ea85015308>

In [28]:
# Score Model #5 on the test split; the displayed result is [loss, accuracy].
model.evaluate(x=x_test, y=y_test, verbose=2)

313/313 - 0s - loss: 0.1730 - accuracy: 0.9500


[0.17298318445682526, 0.949999988079071]

### The Answer to Homework Question #1 
<p>1. Based on this article about the sigmoid vs softmax functions, why do we not want to use sigmoid in our mnist example? <br>
https://glassboxmedicine.com/2019/05/26/classification-sigmoid-vs-softmax/<br><br>
The sigmoid function treats each output independently, whereas the softmax function treats the outputs as interrelated values that depend on one another: the softmax values always sum to one, so if one value increases, the others must decrease. Softmax therefore predicts one and only one answer, while sigmoid outputs can flag several different outcomes at once. In our MNIST example every image shows exactly one digit, so we want to predict a single outcome, not a set of possible solutions. Each number must be clearly identified, and we do not want an algorithm that produces an ambiguous result.</p>


### Results

<p>Model #1<br>
options: relu, adam<br> 
train time: 1 second<br> 
train acc: 0.9467<br>
test acc: 0.9460<br>
To get best results, you<br>
I did not alter anything for this test<br>

Model #2<br>
options: relu, RMSProp<br> 
train time: 1 second<br> 
train acc: 0.9445<br>
test acc: 0.9376<br>
To get best results, you<br>
Changed optimizer to RMSProp<br>

Model #3<br>
options: relu, adam<br> 
train time: 1 second<br> 
train acc: 0.9617<br>
test acc: 0.9534<br>
To get best results, you<br>
Defined three layers</p>

<p>Model #4<br>
options: tanh, ftrl<br> 
train time: 1 second<br> 
train acc: 0.8016<br>
test acc: 0.8246<br>
To get best results, you<br>
Changed optimizer to ftrl<br>

Model #5<br>
options: sigmoid, adam<br> 
train time: 1 second<br> 
train acc: 0.9532<br>
test acc: 0.9527<br>
To get best results, you<br>
used sigmoid activation function</p>