In [None]:
'''
    Original
'''
from __future__ import print_function
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD
from keras.utils import np_utils
np.random.seed(1671) # for reproducibility

# time is used to measure how long the call to model.fit takes to complete
import time


# parameters - original
NB_EPOCH = 200 # number of passes over the training data
BATCH_SIZE = 128 # number of samples per gradient update
VERBOSE = 1 # passed as verbose= to fit() and evaluate()
NB_CLASSES = 10 # number of outputs = number of digits

# Set the optimization algorithm used to train the model's weights to
# Stochastic Gradient Descent (SGD), constructed with its default settings.
# This impacts how the neural network learns.
OPTIMIZER = SGD()
N_HIDDEN = 128 # number of units in the hidden layer
VALIDATION_SPLIT = 0.2 # how much TRAIN is reserved for VALIDATION


# Load the MNIST images (X_*) together with their digit labels (y_*),
# already split into a training set and a test set.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Each image is 28x28 values --> flattened into one row of 784 values
RESHAPED = 784

# Transform the image data from the MNIST dataset into a 2-D array
# (one row per image) that can be fed to the Dense layers below.
# The number of rows is read from each array instead of being hard-coded,
# so the reshape stays correct if the number of samples changes.
X_train = X_train.reshape(X_train.shape[0], RESHAPED)
X_test = X_test.reshape(X_test.shape[0], RESHAPED)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Normalize the pixel values by dividing every pixel by 255 so that each
# value ends up between 0 and 1.
X_train /= 255
X_test /= 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# Convert the label data (y_train and y_test) from its original integer
# format into a categorical format suitable for training a multi-class
# classification model. Initially each label is a single digit (0 to 9)
# saying which handwritten digit the image represents. to_categorical takes
# the integer labels and the total number of classes (NB_CLASSES = 10) and
# turns each label into a "one-hot" vector: 0 in every position except a 1
# at the index of that digit's class. This matches the 10-way softmax output
# of the model, which is read as a probability distribution over the classes.
# The encoded labels are stored in Y_train / Y_test; y_train / y_test keep
# the original integers.
Y_train = np_utils.to_categorical(y_train, NB_CLASSES)
Y_test = np_utils.to_categorical(y_test, NB_CLASSES)

# set up model
model = Sequential()

# first hidden layer: N_HIDDEN units fed by the 784 flattened pixel values
model.add(Dense(N_HIDDEN, input_shape=(RESHAPED,)))
model.add(Activation('relu'))

# output: one unit per digit class, turned into probabilities by softmax
model.add(Dense(NB_CLASSES))
model.add(Activation('softmax'))
model.summary()

# Configure the model for training: categorical cross-entropy is the loss
# that gets minimized (it matches the one-hot labels and softmax output),
# OPTIMIZER decides how the weights are updated, and accuracy is reported
# as an additional metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer=OPTIMIZER,
    metrics=['accuracy'],
)

# start timer
start_time = time.time()

# Train the model for NB_EPOCH epochs in mini-batches of BATCH_SIZE samples.
# A VALIDATION_SPLIT fraction of the training data is held out and used only
# to report validation metrics. The returned object is kept in `history`.
history = model.fit(
    X_train, Y_train,
    batch_size=BATCH_SIZE, epochs=NB_EPOCH,
    verbose=VERBOSE, validation_split=VALIDATION_SPLIT,
)

# end timer
# NOTE: the timer brackets model.fit only, so final_time is the training
# time; the evaluation below is not included.
end_time = time.time()
final_time = end_time - start_time


# Evaluate the trained model on the test set, which was not used for training.
# With metrics=['accuracy'], score[0] is the loss and score[1] the accuracy.
score = model.evaluate(X_test, Y_test, verbose=VERBOSE)

print("Test score:", score[0])
print('Test accuracy:', score[1])
print(f"Test completion time: {final_time:.2f} seconds")


In [None]:
'''
    Updated
'''
from __future__ import print_function
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.optimizers import Adam
from keras.utils import np_utils
np.random.seed(1671) # for reproducibility

# time is used to measure how long the call to model.fit takes to complete.
# It is imported here so that this cell also runs on its own (for example
# after a kernel restart) instead of failing with a NameError at time.time().
import time

# parameters - updated
NB_EPOCH = 20 # number of passes over the training data
BATCH_SIZE = 128 # number of samples per gradient update
VERBOSE = 1 # passed as verbose= to fit() and evaluate()
NB_CLASSES = 10 # number of outputs = number of digits
OPTIMIZER = Adam() # Adam optimizer, constructed with its default settings
N_HIDDEN = 128 # number of units in each hidden layer
VALIDATION_SPLIT = 0.2 # how much TRAIN is reserved for VALIDATION
DROPOUT = 0.3 # dropout rate applied after each hidden layer


# TODO: 2nd round - add a screenshot of the results with the parameters above
# and add it to the markdown response. Be sure to comment on changing the
# optimizer, adding the dropout, and changing the number of epochs.

# Load the MNIST images (X_*) together with their digit labels (y_*),
# already split into a training set and a test set.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Each image is 28x28 values --> flattened into one row of 784 values
RESHAPED = 784


# Flatten every image into a single row of RESHAPED values and convert the
# data to float32 so it can be scaled and fed to the Dense layers below.
# The number of rows is read from each array instead of being hard-coded,
# so the reshape stays correct if the number of samples changes.
X_train = X_train.reshape(X_train.shape[0], RESHAPED)
X_test = X_test.reshape(X_test.shape[0], RESHAPED)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Normalize the pixel values by dividing every pixel by 255 so that each
# value ends up between 0 and 1.
X_train /= 255
X_test /= 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# One-hot encode the integer digit labels: each label becomes a vector of
# NB_CLASSES entries with a 1 at the index of its class and 0 elsewhere,
# matching the 10-way softmax output of the model.
Y_train = np_utils.to_categorical(y_train, NB_CLASSES)
Y_test = np_utils.to_categorical(y_test, NB_CLASSES)

# set up model
model = Sequential()

# first hidden layer: N_HIDDEN units fed by the 784 flattened pixel values,
# followed by dropout with rate DROPOUT
model.add(Dense(N_HIDDEN, input_shape=(RESHAPED,)))
model.add(Activation('relu'))
model.add(Dropout(DROPOUT))

# second hidden layer: same size as the first, again followed by dropout
model.add(Dense(N_HIDDEN))
model.add(Activation('relu'))
model.add(Dropout(DROPOUT))

# output: one unit per digit class, turned into probabilities by softmax
model.add(Dense(NB_CLASSES))
model.add(Activation('softmax'))
model.summary()

# Configure the model for training: categorical cross-entropy is the loss
# that gets minimized (it matches the one-hot labels and softmax output),
# OPTIMIZER decides how the weights are updated, and accuracy is reported
# as an additional metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer=OPTIMIZER,
    metrics=['accuracy'],
)

# start timer
start_time = time.time()

# Train the model for NB_EPOCH epochs in mini-batches of BATCH_SIZE samples.
# A VALIDATION_SPLIT fraction of the training data is held out and used only
# to report validation metrics. The returned object is kept in `history`.
history = model.fit(
    X_train, Y_train,
    batch_size=BATCH_SIZE, epochs=NB_EPOCH,
    verbose=VERBOSE, validation_split=VALIDATION_SPLIT,
)

# end timer
# NOTE: the timer brackets model.fit only, so final_time is the training
# time; the evaluation below is not included.
end_time = time.time()
final_time = end_time - start_time

# Evaluate the trained model on the test set, which was not used for training.
# With metrics=['accuracy'], score[0] is the loss and score[1] the accuracy.
score = model.evaluate(X_test, Y_test, verbose=VERBOSE)


print("Test score:", score[0])
print('Test accuracy:', score[1])
print(f"Test completion time: {final_time:.2f} seconds")

# Original Run vs. Updated Run

## Original Run Details

After training, the initial neural network script achieved about 98.6% accuracy on the training set, 97.2% on the validation set, and 97.4% on the test set, and it took 21.76 minutes to complete the training.

<img src="./2originalrun.png" alt="original run 2"  width="600"/>

## Updated Run Details

After changing the neural network by adding another hidden layer, switching the optimizer algorithm, adding dropout after each hidden layer, and lowering the number of epochs from 200 to 20, the model achieved about 98.1% accuracy on the training set, 97.8% on the validation set, and 97.9% on the test set, and it took 2.6 minutes to complete the training.

<img src="./2updatedrun.png" alt="updated run 2"  width="600"/>

## What happens to the accuracy rates for the training, validation, and test data sets as you change the parameters?

### Epochs
Neural networks train themselves on a set of data by being exposed to it repeatedly and gradually making sense of it. This repeated exposure is broken down into rounds, or cycles. The number of epochs is the number of cycles in which we want our neural network model to be exposed to the data before its training is complete. Initially, our neural network had a very basic configuration, so a high number of epochs made sense if we wanted to achieve higher accuracy. The drawback of a high number of epochs is that it takes longer for the neural network to complete its training. As you can see, with `NB_EPOCH = 200` it took 21.76 minutes to complete the training. So even though we reached 98.6% accuracy on training, 97.2% on validation, and 97.4% on the test set, it took a substantial amount of time to achieve this goal.

In our second round we lowered the setting to `NB_EPOCH = 20`. If that were all we changed, the training time would decrease drastically, but the results would fall well short of our accuracy goals. For example, adjusting the number of epochs to 20 and changing nothing else about the model script resulted in 93% accuracy on training, 93.4% on validation, and 93.4% on the test set, and it took 2.3 minutes to complete the training. The time it took to complete the training decreased, which is good. However, the rest of the training metrics decreased as well, which is not good. The objective is to reach our training goals while minimizing the time it takes to achieve them. So if we are going to set the number of epochs to 20, we have to make further optimizations to our neural network.

### Optimizers
The purpose of selecting an optimizer algorithm is to update the weights while the model goes through each round of training. Selecting an optimizer algorithm also requires an objective function (loss function), because the algorithm will execute this function in order to adjust the weights and biases to minimize the loss between the predicted and actual output. The initial optimizer algorithm used for the neural network was stochastic gradient descent, `OPTIMIZER = SGD()`. This form of gradient descent performs one weight update per mini-batch (here every `BATCH_SIZE = 128` samples), so each epoch consists of many small updates. Though each update is very cheap to compute, an update based on a small batch is noisy, so the model needs many more passes over the data to converge.

When we updated the model and lowered the number of epochs, we introduced a more advanced optimization algorithm known as Adam, `OPTIMIZER = Adam()`. This algorithm is similar to SGD in terms of acceleration, but it also includes the concept of momentum, in the form of velocity. Using Adam allows for faster convergence at the cost of more computation. Overall, since we are using a more advanced optimization algorithm, we can lower the number of epochs and still achieve our training goals.

Just to see what would happen, I set the optimization algorithm to Adam and kept the number of epochs high at 200, and the results were quite surprising 

### Additional Hidden Layer
What is a hidden layer, what is its purpose in the neural network, and how is it implemented? Also, what are the pros and cons of adding an additional hidden layer?

### Dropout 

What is dropout, what is its purpose in the neural network, and how is it implemented? Also, what are the pros and cons of adding dropout?

## Conclusion
Share why adding the additional changes made the neural network more accurate 