In [9]:
from __future__ import print_function
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD
from keras.utils import to_categorical
np.random.seed(1671)  # fix NumPy's global RNG seed for reproducibility
# NOTE(review): this seeds NumPy only. Presumably the Keras backend draws
# weight initialisation and batch shuffling from its own RNG, so runs may
# still differ -- confirm, and seed the backend too if exact repeats matter.

# ---- network and training settings ----
NB_EPOCH = 20           # passed to fit() as `epochs`
BATCH_SIZE = 128        # passed to fit() as `batch_size`
VERBOSE = 1             # verbosity flag for fit() and evaluate()
NB_CLASSES = 10         # number of outputs = number of digits
OPTIMIZER = SGD()       # SGD constructed with the library's default arguments
N_HIDDEN = 128          # units in each hidden Dense layer
VALIDATION_SPLIT = 0.2  # how much TRAIN is reserved for VALIDATION

# ---- data: MNIST, already split between train and test sets ----
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Each 28x28 image is flattened into one row of 784 values.
RESHAPED = 784
# The row count comes from the arrays themselves rather than a hard-coded
# 60000 / 10000, so the cell keeps working if the arrays are subsampled.
X_train = X_train.reshape(X_train.shape[0], RESHAPED).astype('float32')
X_test = X_test.reshape(X_test.shape[0], RESHAPED).astype('float32')

# normalize (assumes raw pixel values span 0-255, hence the divisor)
X_train /= 255
X_test /= 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
Y_train = to_categorical(y_train, NB_CLASSES)
Y_test = to_categorical(y_test, NB_CLASSES)

# Network: two hidden Dense layers of N_HIDDEN units, each followed by ReLU,
# then a Dense layer with NB_CLASSES (10) outputs; final stage is softmax.
model = Sequential()
model.add(Dense(N_HIDDEN, input_shape=(RESHAPED,)))
model.add(Activation('relu'))
model.add(Dense(N_HIDDEN))
model.add(Activation('relu'))
model.add(Dense(NB_CLASSES))
model.add(Activation('softmax'))
model.summary()

model.compile(
    loss='categorical_crossentropy',
    optimizer=OPTIMIZER,
    metrics=['accuracy'],
)

# Train; VALIDATION_SPLIT of the training data is held out for validation.
history = model.fit(
    X_train, Y_train,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCH,
    verbose=VERBOSE,
    validation_split=VALIDATION_SPLIT,
)

# Evaluate on the test set: score[0] is printed as the test score (loss),
# score[1] as the accuracy metric requested in compile().
score = model.evaluate(X_test, Y_test, verbose=VERBOSE)
print("Test score:", score[0])
print('Test accuracy:', score[1])



60000 train samples
10000 test samples
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 128)               100480    
                                                                 
 activation_3 (Activation)   (None, 128)               0         
                                                                 
 dense_4 (Dense)             (None, 128)               16512     
                                                                 
 activation_4 (Activation)   (None, 128)               0         
                                                                 
 dense_5 (Dense)             (None, 10)                1290      
                                                                 
 activation_5 (Activation)   (None, 10)                0         
                                                                 
Total params: 1

In [1]:
# increasing epochs from 20 to 60
from __future__ import print_function
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD
from keras.utils import to_categorical
np.random.seed(1671)  # fix NumPy's global RNG seed for reproducibility
# NOTE(review): this seeds NumPy only. Presumably the Keras backend draws
# weight initialisation and batch shuffling from its own RNG, so runs may
# still differ -- confirm, and seed the backend too if exact repeats matter.

# ---- network and training settings ----
NB_EPOCH = 60           # passed to fit() as `epochs`
BATCH_SIZE = 128        # passed to fit() as `batch_size`
VERBOSE = 1             # verbosity flag for fit() and evaluate()
NB_CLASSES = 10         # number of outputs = number of digits
OPTIMIZER = SGD()       # SGD constructed with the library's default arguments
N_HIDDEN = 128          # units in each hidden Dense layer
VALIDATION_SPLIT = 0.2  # how much TRAIN is reserved for VALIDATION

# ---- data: MNIST, already split between train and test sets ----
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Each 28x28 image is flattened into one row of 784 values.
RESHAPED = 784
# The row count comes from the arrays themselves rather than a hard-coded
# 60000 / 10000, so the cell keeps working if the arrays are subsampled.
X_train = X_train.reshape(X_train.shape[0], RESHAPED).astype('float32')
X_test = X_test.reshape(X_test.shape[0], RESHAPED).astype('float32')

# normalize (assumes raw pixel values span 0-255, hence the divisor)
X_train /= 255
X_test /= 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
Y_train = to_categorical(y_train, NB_CLASSES)
Y_test = to_categorical(y_test, NB_CLASSES)

# Network: two hidden Dense layers of N_HIDDEN units, each followed by ReLU,
# then a Dense layer with NB_CLASSES (10) outputs; final stage is softmax.
model = Sequential()
model.add(Dense(N_HIDDEN, input_shape=(RESHAPED,)))
model.add(Activation('relu'))
model.add(Dense(N_HIDDEN))
model.add(Activation('relu'))
model.add(Dense(NB_CLASSES))
model.add(Activation('softmax'))
model.summary()

model.compile(
    loss='categorical_crossentropy',
    optimizer=OPTIMIZER,
    metrics=['accuracy'],
)

# Train; VALIDATION_SPLIT of the training data is held out for validation.
history = model.fit(
    X_train, Y_train,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCH,
    verbose=VERBOSE,
    validation_split=VALIDATION_SPLIT,
)

# Evaluate on the test set: score[0] is printed as the test score (loss),
# score[1] as the accuracy metric requested in compile().
score = model.evaluate(X_test, Y_test, verbose=VERBOSE)
print("Test score:", score[0])
print('Test accuracy:', score[1])



60000 train samples
10000 test samples
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               100480    
                                                                 
 activation (Activation)     (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                                 
 activation_1 (Activation)   (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 10)                1290      
                                                                 
 activation_2 (Activation)   (None, 10)                0         
                                                                 
Total params: 118

In [4]:
# increasing epochs to 120
# 
from __future__ import print_function
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD
from keras.utils import to_categorical
np.random.seed(1671)  # fix NumPy's global RNG seed for reproducibility
# NOTE(review): this seeds NumPy only. Presumably the Keras backend draws
# weight initialisation and batch shuffling from its own RNG, so runs may
# still differ -- confirm, and seed the backend too if exact repeats matter.

# ---- network and training settings ----
NB_EPOCH = 120          # passed to fit() as `epochs`
BATCH_SIZE = 128        # passed to fit() as `batch_size`
VERBOSE = 1             # verbosity flag for fit() and evaluate()
NB_CLASSES = 10         # number of outputs = number of digits
OPTIMIZER = SGD()       # SGD constructed with the library's default arguments
N_HIDDEN = 128          # units in each hidden Dense layer
VALIDATION_SPLIT = 0.2  # how much TRAIN is reserved for VALIDATION

# ---- data: MNIST, already split between train and test sets ----
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Each 28x28 image is flattened into one row of 784 values.
RESHAPED = 784
# The row count comes from the arrays themselves rather than a hard-coded
# 60000 / 10000, so the cell keeps working if the arrays are subsampled.
X_train = X_train.reshape(X_train.shape[0], RESHAPED).astype('float32')
X_test = X_test.reshape(X_test.shape[0], RESHAPED).astype('float32')

# normalize (assumes raw pixel values span 0-255, hence the divisor)
X_train /= 255
X_test /= 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
Y_train = to_categorical(y_train, NB_CLASSES)
Y_test = to_categorical(y_test, NB_CLASSES)

# Network: two hidden Dense layers of N_HIDDEN units, each followed by ReLU,
# then a Dense layer with NB_CLASSES (10) outputs; final stage is softmax.
model = Sequential()
model.add(Dense(N_HIDDEN, input_shape=(RESHAPED,)))
model.add(Activation('relu'))
model.add(Dense(N_HIDDEN))
model.add(Activation('relu'))
model.add(Dense(NB_CLASSES))
model.add(Activation('softmax'))
model.summary()

model.compile(
    loss='categorical_crossentropy',
    optimizer=OPTIMIZER,
    metrics=['accuracy'],
)

# Train; VALIDATION_SPLIT of the training data is held out for validation.
history = model.fit(
    X_train, Y_train,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCH,
    verbose=VERBOSE,
    validation_split=VALIDATION_SPLIT,
)

# Evaluate on the test set: score[0] is printed as the test score (loss),
# score[1] as the accuracy metric requested in compile().
score = model.evaluate(X_test, Y_test, verbose=VERBOSE)
print("Test score:", score[0])
print('Test accuracy:', score[1])



60000 train samples
10000 test samples
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 128)               100480    
                                                                 
 activation_9 (Activation)   (None, 128)               0         
                                                                 
 dense_10 (Dense)            (None, 128)               16512     
                                                                 
 activation_10 (Activation)  (None, 128)               0         
                                                                 
 dense_11 (Dense)            (None, 10)                1290      
                                                                 
 activation_11 (Activation)  (None, 10)                0         
                                                                 
Total params: 1

The data for 20 epochs: 

- accuracy: 0.9461
- Test score: 0.19061198830604553
- Test accuracy: 0.9460999965667725
- 60000 train samples
- 10000 test samples

The data for 60 epochs:

- accuracy: 0.9674
- Test score: 0.10819873213768005
- Test accuracy: 0.9674000144004822
- 60000 train samples
- 10000 test samples

For 120 epochs: 

- accuracy: 0.9752
- Test score: 0.08114946633577347
- Test accuracy: 0.9751999974250793
- 60000 train samples
- 10000 test samples

This code was provided in the text *Deep Learning with Keras* (Gulli & Pal, 2017). The code trains a neural network to recognize handwritten digits using supervised learning. According to IBM, supervised learning uses labeled datasets to train models to classify data or reach a certain conclusion (What Is Supervised Learning? | IBM, n.d.).

In this code, we can observe that the test accuracy increases with the number of training epochs as the model becomes more experienced: it rises from 0.9461 at 20 epochs to 0.9674 at 60 epochs and 0.9752 at 120 epochs. Though I only tested up to 120 epochs, Gulli and Pal explain that accuracy keeps increasing with additional epochs up to a certain point, but beyond around 100 epochs the increases in accuracy begin to slow. In the figure on page 28, we see that as we train the model past approximately 100 epochs, the accuracy benefits of additional training begin to diminish (Gulli & Pal, 2017).

![image.png](attachment:image.png) 

Image credit: (Gulli & Pal, 2017)


In these cases, the model may be more likely to overfit the data, making generalizations that are not beneficial for predicting digits correctly. According to *Do Machine Learning Models Memorize or Generalize?* from the website Google PAIR, a “model overfits the training data when it performs well on the training data but poorly on the test data” (n.d.). Similarly, Investopedia defines overfitting as a “modeling error in statistics that occurs when a function is too closely aligned to a limited set of data points” (Twin, 2021). Overfitting may also involve making generalizations based on characteristics that are not appropriate, which is what we see in this model.

In this model, accuracy increases while the training sample size and validation split are kept constant. This happens as the model improves and "learns" patterns over many iterations. As we increase the number of epochs, we increase the number of full passes the model makes over the training dataset, and with each pass the model gains experience. As training progresses, the model becomes more and more experienced and effective in its predictions. The benefits, however, begin to taper off, and only modest improvements can be achieved after a certain number of epochs (around 100). After this point, the model begins overfitting and making mistakes and generalizations that prevent it from significantly improving in accuracy any further.

**References**

*Do machine learning models memorize or generalize?* (n.d.). https://pair.withgoogle.com/explorables/grokking/#:~:text=A%20model%20overfits%20the%20training,required%20to%20make%20more%20generalizations.

Gulli, A., & Pal, S. (2017). *Deep Learning with Keras*. Packt Publishing Limited. ISBN: 978-1-78712-842-2.

Twin, A. (2021, October 22). *Understanding overfitting and how to prevent it.* Investopedia. https://www.investopedia.com/terms/o/overfitting.asp

*What is Supervised Learning?* | IBM. (n.d.). https://www.ibm.com/topics/supervised-learning
