# Aim : To Compare different available `Optimizers` 

In [1]:
# TensorFlow and tf.keras
import tensorflow as tf

# Helper libraries
import numpy as np
from tensorflow.keras import initializers
from tensorflow.python.keras import activations

# Seed NumPy and TensorFlow once, up front, so that weight initialisation and
# batch shuffling are repeatable under "Restart Kernel -> Run All". Without a
# seed, part of the accuracy gap between optimizers is just run-to-run noise.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

print(tf.__version__)

# downloading fashion_mnist data (Keras caches it locally after the first call)
fashion_mnist = tf.keras.datasets.fashion_mnist

(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

# Human-readable label names, indexed by the integer class id.
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

# Divide by 255 so the 8-bit grayscale pixel values fall in [0, 1].
train_images = train_images / 255.0

test_images = test_images / 255.0

c:\users\sonu.ramkumar.jha\desktop\experiments\env\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll
c:\users\sonu.ramkumar.jha\desktop\experiments\env\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


2.5.0


In [3]:
# Hidden-layer non-linearity.
activation = tf.keras.activations.tanh

# Network: flatten the 28x28 image -> 128 hidden units -> 10 raw logits.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(28, 28)))
model.add(tf.keras.layers.Dense(128, activation=activation))
model.add(tf.keras.layers.Dense(10))

# Baseline optimizer: SGD with its default hyper-parameters.
optim = tf.keras.optimizers.SGD()
model.compile(
    optimizer=optim,
    # The last Dense layer has no activation, so the loss receives logits.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# model summary
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 784)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               100480    
_________________________________________________________________
dense_1 (Dense)              (None, 10)                1290      
Total params: 101,770
Trainable params: 101,770
Non-trainable params: 0
_________________________________________________________________


<img src="images/model.jpeg" height=50% width=50% alt="Model architecture: 784 inputs, 128 hidden units, 10 outputs">

**Forward Pass:**

$$\overbrace{\begin{bmatrix}
    i_{1}  \\
    i_{2}  \\
    \vdots  \\
    i_{784}
\end{bmatrix}\begin{bmatrix}
    w^{1}_{11} & w^{1}_{12} & w^{1}_{13} & \dots  & w^{1}_{1128} \\
    w^{1}_{21} & w^{1}_{22} & w^{1}_{23} & \dots  & w^{1}_{2128} \\
    \vdots & \vdots & \vdots & \ddots & \vdots \\
    w^{1}_{7841} & w^{1}_{7842} & w^{1}_{7843} & \dots  & w^{1}_{784128}
\end{bmatrix}+\begin{bmatrix}
    b_{1}  \\
    b_{2}  \\
    \vdots  \\
    b_{128}
\end{bmatrix}}^{Input of First Activation Function} =\begin{bmatrix}
    s_{1}  \\
    s_{2}  \\
    \vdots  \\
    s_{128}
\end{bmatrix}$$

$$Activation1 \ \left (\begin{bmatrix}
    s_{1}  \\
    s_{2}  \\
    \vdots  \\
    s_{128}
\end{bmatrix} \right)=\begin{bmatrix}
    i^{`}_{1}  \\
    i^{`}_{2}  \\
    \vdots  \\
    i^{`}_{128}
\end{bmatrix}$$

$$\overbrace{\begin{bmatrix}
    i^{`}_{1}  \\
    i^{`}_{2}  \\
    \vdots  \\
    i^{`}_{128}
\end{bmatrix}\begin{bmatrix}
    w^{2}_{11} & w^{2}_{12} & w^{2}_{13} & \dots  & w^{2}_{110} \\
    w^{2}_{21} & w^{2}_{22} & w^{2}_{23} & \dots  & w^{2}_{210} \\
    \vdots & \vdots & \vdots & \ddots & \vdots \\
    w^{2}_{1281} & w^{2}_{1282} & w^{2}_{1283} & \dots  & w^{2}_{12810}
\end{bmatrix}+\begin{bmatrix}
    b^{`}_{1}  \\
    b^{`}_{2}  \\
    \vdots  \\
    b^{`}_{10}
    \end{bmatrix}}^{Input of Second Activation Function}=\begin{bmatrix}
    s^{`}_{1}  \\
    s^{`}_{2}  \\
    \vdots  \\
    s^{`}_{10}
\end{bmatrix}$$


$$Activation2 \ \left (\begin{bmatrix}
    s^{`}_{1}  \\
    s^{`}_{2}  \\
    \vdots  \\
    s^{`}_{10}
\end{bmatrix}\  \right )=\begin{bmatrix}
    y^{`}_{1}  \\
    y^{`}_{2}  \\
    \vdots  \\
    y^{`}_{10}
\end{bmatrix}$$

**Whole Forward Pass:**

$$Activation2\left (Activation1\left (\overbrace{\begin{bmatrix}
    i_{1}  \\
    i_{2}  \\
    \vdots  \\
    i_{784}
\end{bmatrix}*\begin{bmatrix}
    w^{1}_{11} & w^{1}_{12} & w^{1}_{13} & \dots  & w^{1}_{1128} \\
    w^{1}_{21} & w^{1}_{22} & w^{1}_{23} & \dots  & w^{1}_{2128} \\
    \vdots & \vdots & \vdots & \ddots & \vdots \\
    w^{1}_{7841} & w^{1}_{7842} & w^{1}_{7843} & \dots  & w^{1}_{784128}
\end{bmatrix}+\begin{bmatrix}
    b_{1}  \\
    b_{2}  \\
    \vdots  \\
    b_{128}
\end{bmatrix}}^{Input of Hidden Layer}  \right )*\begin{bmatrix}
    w^{2}_{11} & w^{2}_{12} & w^{2}_{13} & \dots  & w^{2}_{110} \\
    w^{2}_{21} & w^{2}_{22} & w^{2}_{23} & \dots  & w^{2}_{210} \\
    \vdots & \vdots & \vdots & \ddots & \vdots \\
    w^{2}_{1281} & w^{2}_{1282} & w^{2}_{1283} & \dots  & w^{2}_{12810}
\end{bmatrix}+\begin{bmatrix}
    b^{`}_{1}  \\
    b^{`}_{2}  \\
    \vdots  \\
    b^{`}_{10}
    \end{bmatrix}\  \right )$$

**In Simple Form**

$$Activation2 \ (Activation1 \ (I_{1 \times 784} \ W^{1}_{784 \times 128}+B^{1}_{128}) \ W^{2}_{128 \times 10}+B^{2}_{10}) = y^{`}$$

**Cost Function (Error):**
$$C = (y-y^{`})^2$$

**Backward Pass:**

**We have 4 values to update before the 2nd forward pass -** 

$$\vec W^{1}, \vec B^{1}, \vec W^{2} and \ \vec B^{2}$$

## Gradient Descent

**Gradient Descent Formula**

$$\boxed{\vec{W_{new}} = \vec{W_{old}} - \eta \frac{\partial \vec{C}}{\partial \vec{W_{old}}}}$$

**Where**

\begin{equation}
 \left.\begin{aligned}
        \vec{W_{new}} = New \ Weight\\
        \vec{W_{old}} = Old \ Weight\\
        \eta = learning \ rate
       \end{aligned}
 \right\}
\end{equation}


**According to Chain Rule**

\begin{equation}
\begin{aligned}
\frac{\partial \vec C}{\partial \vec W^{1}} &= \frac{\partial C}{\partial y^{`}}\times\frac{\partial y^{`}}{\partial Activation2}\times\frac{\partial Activation2}{\partial Activation1}\times \frac{\partial Activation1}{\partial W^{1}}\\
\frac{\partial \vec C}{\partial \vec B^{1}} &= \frac{\partial C}{\partial y^{`}}\times\frac{\partial y^{`}}{\partial Activation2}\times\frac{\partial Activation2}{\partial Activation1}\times\frac{\partial Activation1}{\partial B^{1}}\\
\frac{\partial \vec C}{\partial \vec W^{2}} &= \frac{\partial C}{\partial y^{`}}\times\frac{\partial y^{`}}{\partial Activation2}\times\frac{\partial Activation2}{\partial W^{2}}\\
\frac{\partial \vec C}{\partial \vec B^{2}} &= \frac{\partial C}{\partial y^{`}}\times\frac{\partial y^{`}}{\partial Activation2}\times\frac{\partial Activation2}{\partial B^{2}}
\end{aligned}
\end{equation}

**Step:1**
$$When \ \vec W = W_{0}$$

$$\therefore W_{1} = \vec W_{0} - \eta \frac{\partial \vec{C}}{\partial \vec{W_{0}}}$$
<img src="images\gd1.png" height=50% width=50% alt-text="Case 1 Gradient Descent">

**Step:2**
    $$When \ \vec W = W_{1}$$

$$\therefore W_{2} = \vec W_{1} - \eta \frac{\partial \vec{C}}{\partial \vec{W_{1}}}$$
<img src="images\gd2.png" height=50% width=50% alt-text="Case 2 Gradient Descent">

`Similarly`
$$When \ \vec W = W_{n-1}$$

$$\therefore W_{n} = \vec W_{n-1} - \eta \frac{\partial \vec{C}}{\partial \vec{W_{n-1}}}$$
<img src="images\gdn.png" height=50% width=50% alt-text="Case n Gradient Descent">

**Observations**:

- As you can see, the size of each weight update ($W_{n} - W_{n-1}$) is directly proportional to the gradient $\frac{\partial C}{\partial W_{n-1}}$.

In [4]:
# Train for 10 epochs using mini-batches of 32 images.
model.fit(x=train_images, y=train_labels, batch_size=32, epochs=10)

# Score the trained model on the test split; evaluate() returns the loss
# followed by the metrics passed to compile() (here: accuracy).
test_loss, test_acc = model.evaluate(x=test_images, y=test_labels, verbose=2)

print('test_loss', test_loss)
print('test_accuracy', test_acc)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
313/313 - 0s - loss: 0.4070 - accuracy: 0.8539
test_loss 0.4069558382034302
test_accuracy 0.8539000153541565


# Momentum Optimizer

**With momentum the weight update formula becomes**

$$\vec{W_{new}} = \vec{W_{old}} - m_{new}$$
$$m_{new} = \beta \ m_{old}+\eta \frac{\partial \vec{C}}{\partial \vec{W_{old}}}$$

`Where m is called momentum and` 

$$\beta : coefficient \ of \ momentum$$

**Step:1**
$$When: \ \vec W_{old} = W_{0} \ and \ m = m_{0} = 0$$

$$\therefore m_{1} = \beta \ m_{0}+\eta \frac{\partial \vec{C}}{\partial \vec{W_{0}}}$$

$$\therefore m_{1} = \eta \frac{\partial \vec{C}}{\partial \vec{W_{0}}} \ \left (\because m_{0}=0\right )\tag{i}$$

$$\therefore \boxed{\vec W_{1} = W_{0}-m_{1} = \vec W_{0} - \eta \frac{\partial \vec
{C}}{\partial \vec{W_{0}}}}\tag{Same as Gradient Descent Formula}$$
<img src="images\gd1.png" height=50% width=50% alt-text="Case 1 Momentum">

**Observations**
- As you can see, the first weight update is identical to `Gradient Descent` because `m=0` initially.

**Step:2**
$$When: \ \vec W_{old} = W_{1} \ and \ m =  m_{1} = \eta \frac{\partial \vec{C}}{\partial \vec{W_{0}}}\tag{from eq(i)}$$

$$\therefore \vec m_{2} = \beta \ m_{1} + \eta \ \frac{\partial \vec{C}}{\partial \vec{W_{1}}}\tag{ii}$$

$$\therefore \vec W_{2} = \vec W_{1} - m_{2}$$

$$\therefore \vec W_{2} = \vec W_{1} - \left( \beta \ m_{1} + \eta \ \frac{\partial \vec{C}}{\partial \vec{W_{1}}} \right)$$

$$\therefore \vec W_{2} = \vec W_{1} - \left( \beta \left( \eta \frac{\partial \vec{C}}{\partial \vec{W_{0}}} \right)+ \eta \ \frac{\partial \vec{C}}{\partial \vec{W_{1}}} \right)$$

$$\therefore \boxed{\vec W_{2} = \vec W_{1} - \eta \left( \beta \ \frac{\partial \vec{C}}{\partial \vec{W_{0}}}+\frac{\partial \vec{C}}{\partial \vec{W_{1}}} \right)}$$

**Case:1**
$$For \ \beta = 0$$
$$\therefore \boxed{\vec W_{2} =\vec W_{1}-\eta \ \frac{\partial \vec{C}}{\partial \vec{W_{1}}}}\tag{same as Gradient Descent}$$
<img src="images\mc2.png" height=50% width=50% alt-text="Case 2 Momentum">

**Case:2**
$$For \ \beta = 0.9$$
$$\therefore \boxed{\vec W_{2} = \vec W_{1} - \eta \left( 0.9 \ \frac{\partial \vec{C}}{\partial \vec{W_{0}}}+\frac{\partial \vec{C}}{\partial \vec{W_{1}}} \right)}$$

**Observations**
- For the second update, the step is built from 90% of the first gradient plus 100% of the latest gradient.

**Step:3**
$$When: \ \vec W_{old} = W_{2} \ and \ m =  \vec m_{2} = \beta \ m_{1} + \eta \ \frac{\partial \vec{C}}{\partial \vec{W_{1}}}\tag{from eq(ii)}$$

$$\therefore \vec m_{3} = \beta \ m_{2} + \eta \ \frac{\partial \vec{C}}{\partial \vec{W_{2}}}\tag{iii}$$

$$\therefore \vec m_{3} = \beta \left( \beta \ m_{1} + \eta \ \frac{\partial \vec{C}}{\partial \vec{W_{1}}}\right) + \eta \ \frac{\partial \vec{C}}{\partial \vec{W_{2}}}$$

$$\therefore \vec m_{3} = \beta \left( \beta \ \eta \frac{\partial \vec{C}}{\partial \vec{W_{0}}} + \eta \ \frac{\partial \vec{C}}{\partial \vec{W_{1}}}\right) + \eta \ \frac{\partial \vec{C}}{\partial \vec{W_{2}}}\tag{from eq(i)}$$

$$\therefore \boxed{ \vec m_{3} = \eta \left (\beta^{2} \frac{\partial \vec{C}}{\partial \vec{W_{0}}}+ \beta \frac{\partial \vec{C}}{\partial \vec{W_{1}}}+\frac{\partial \vec{C}}{\partial \vec{W_{2}}} \right)}\tag{iv}$$

$$\therefore \vec W_{3} = \vec W_{2}-m_{3}$$

$$\therefore \boxed{\vec W_{3} = \vec W_{2}-\eta \left (\beta^{2} \frac{\partial \vec{C}}{\partial \vec{W_{0}}}+ \beta \frac{\partial \vec{C}}{\partial \vec{W_{1}}}+\frac{\partial \vec{C}}{\partial \vec{W_{2}}} \right)}$$

`Similarly`
$$When: \ \vec W_{old} = W_{n-1}$$

$$\vec W_{n} = \vec W_{n-1}-\eta \left (\sum_{j=0}^{n-1}
\beta^{n-1-j} \frac{\partial \vec{C}}{\partial \vec{W_{j}}}\right)$$

$$\vec W_{n} = \vec W_{n-1}-\eta \left (\beta^{n-1} \frac{\partial \vec{C}}{\partial \vec{W_{0}}}+\beta^{n-2} \frac{\partial \vec{C}}{\partial \vec{W_{1}}}+\dots+\beta \frac{\partial \vec{C}}{\partial \vec{W_{n-2}}}+\frac{\partial \vec{C}}{\partial \vec{W_{n-1}}}\right)$$
<img src="images\mn.png" height=50% width=50% alt-text="Case n Momentum">

**Case:1**
$$For \ \beta = 0$$
$$\boxed{\vec W_{n} = \vec W_{n-1} - \eta \ \frac{\partial \vec{C}}{\partial \vec{W_{n-1}}}}\tag{Same as Gradient Descent}$$

**Observations**
- As you can see in Case:1, for beta = 0 the weight update is the same as Gradient Descent.
- For beta = 0.9 (a good practical value), the third update uses 81% of the oldest gradient, 90% of the previous gradient and 100% of the latest gradient.
- Therefore the `momentum optimizer` has an advantage over `Gradient Descent`: thanks to the `impact of previous gradients` it can keep moving through a `saddle point`.

In [5]:
# Hidden-layer non-linearity.
activation = tf.keras.activations.tanh

# Fresh network: flatten 28x28 -> 128 hidden units -> 10 logits.
model = tf.keras.Sequential(layers=[
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(units=128, activation=activation),
    tf.keras.layers.Dense(units=10),
])

# SGD with a momentum coefficient (beta) of 0.9.
optim = tf.keras.optimizers.SGD(momentum=0.9)
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=optim,
    metrics=['accuracy'],
)

# model summary
model.summary()

# Train for 10 epochs with mini-batches of 32, then score on the test split.
model.fit(x=train_images, y=train_labels, batch_size=32, epochs=10)

test_loss, test_acc = model.evaluate(x=test_images, y=test_labels, verbose=2)

print('test_loss', test_loss)
print('test_accuracy', test_acc)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 784)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               100480    
_________________________________________________________________
dense_3 (Dense)              (None, 10)                1290      
Total params: 101,770
Trainable params: 101,770
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
313/313 - 0s - loss: 0.3553 - accuracy: 0.8704
test_loss 0.3552625775337219
test_accuracy 0.8704000115394592


## 3. Nesterov Accelerated Gradient (NAG)

**Weight Update Formula**

$$W_{new} = W_{old} - m_{new}$$

$$where \ m_{new} = \beta \ m_{old} + \eta \ \frac{\partial C}{\partial \left ( W_{old}-\beta \ m_{old}\right )}$$

**Step:1**

$$When \ m = m_{0} = 0 \ and \ \vec W = W_{0}\tag{i}$$

$$m_{1} = \beta \ m_{0} + \eta \ \frac{\partial C}{\partial \left ( W_{0}-\beta \ m_{0}\right )}$$

*since m_0 = 0*
$$\therefore m_{1} = \eta \ \frac{\partial C}{\partial  W_{0}}\tag{ii}$$

$$\vec W_{1} = \vec W_{0}-m_{1}$$

$$\boxed{\therefore \vec W_{1} = \vec W_{0}- \eta \ \frac{\partial C}{\partial  W_{0}}}\tag{from eq(ii)}$$
<img src="images\gd1.png" height=50% width=50% alt-text="Case 1 NAG">

**Step:2**
    $$When \ m = m_{1}, \  W = W_{1}$$

$$\therefore m_{2} = \beta \ m_{1} + \eta \ \frac{\partial C}{\partial \left ( W_{1}-\beta \ m_{1}\right )}$$

$$\therefore m_{2} = \eta \left (\beta \frac{\partial C}{\partial  W_{0}}+ \frac{\partial C}{\partial \left(W_{1}-\beta \ m_{1}\right) }\right)\tag{from eq(ii)}$$

`Now : `
$$\vec W_{2} = \vec W_{1}-m_{2}$$

$$\therefore \vec W_{2} = \vec W_{1} - \eta \left (\beta \frac{\partial C}{\partial  W_{0}}+ \frac{\partial C}{\partial \left(W_{1}-\beta \ m_{1}\right) }\right)\tag{from eq(ii)}$$
<img src="images\nag2.png" height=50% width=50% alt-text="Case 2 NAG">

In [6]:
# Hidden-layer non-linearity.
activation = tf.keras.activations.tanh

# Fresh network built layer by layer: flatten -> 128 hidden units -> 10 logits.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(28, 28)))
model.add(tf.keras.layers.Dense(128, activation=activation))
model.add(tf.keras.layers.Dense(10))

# SGD with momentum 0.9 and the Nesterov variant switched on.
optim = tf.keras.optimizers.SGD(nesterov=True, momentum=0.9)
model.compile(
    optimizer=optim,
    metrics=['accuracy'],
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)

# model summary
model.summary()

# Train for 10 epochs with mini-batches of 32, then score on the test split.
model.fit(train_images, train_labels, batch_size=32, epochs=10)

test_loss, test_acc = model.evaluate(test_images, test_labels, verbose=2)

print('test_loss', test_loss)
print('test_accuracy', test_acc)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 784)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 128)               100480    
_________________________________________________________________
dense_5 (Dense)              (None, 10)                1290      
Total params: 101,770
Trainable params: 101,770
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
313/313 - 1s - loss: 0.3505 - accuracy: 0.8717
test_loss 0.35048532485961914
test_accuracy 0.8716999888420105


**Observations**
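- As you can see in the run recorded above, NAG reaches a test accuracy of 0.8717, marginally above plain momentum (0.8704) and clearly above vanilla gradient descent (0.8539) after the same 10 epochs.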

## Elongated Bowl Problem

## Adagrad

**Weight Update Formula**

$$\vec W_{new} = \vec W_{old} -\eta \frac{ \frac{\partial \vec C}{\partial \vec W_{old}}}{\sqrt{S_{new}+ \epsilon}}$$

$$where \ S_{new} = \ S_{old} + \frac{\partial \vec C}{\partial \vec W_{old}} \cdot \frac{\partial \vec C}{\partial \vec W_{old}}$$

$$\therefore \ S_{new} = \ S_{old} +   \left (\frac{\partial C}{\partial W_{old}}\right)^{2}\tag{i}$$

**Case:1**
    $$When \ S=S_{0} = 0,\ and \ \vec W = \vec W_{0}$$

$$\therefore S_{1} = S_{0} + \left (\frac{\partial C}{\partial W_{0}}\right)^{2}$$

$$\therefore S_{1} = \left (\frac{\partial C}{\partial W_{0}}\right)^{2}\tag{ii}$$

$$\therefore \vec W_{1} = \vec W_{0} -\eta \frac{ \frac{\partial \vec C}{\partial \vec W_{0}}}{\sqrt{S_{1}+ \epsilon}}$$

$$\therefore \vec W_{1} = \vec W_{0} -\eta \frac{ \frac{\partial \vec C}{\partial \vec W_{0}}}{\sqrt{\left (\frac{\partial C}{\partial W_{0}}\right)^{2}+ \epsilon}}\tag{from eq(ii)}$$

$$\therefore \vec W_{1} = \vec W_{0} -\eta \frac{ \frac{\partial \vec C}{\partial \vec W_{0}}}{|\frac{\partial C}{\partial W_{0}}|}\tag{epsilon is nearly 0}$$

$$\therefore \vec W_{1} = \vec W_{0} - \eta \ \operatorname{sign}\left(\frac{\partial C}{\partial W_{0}}\right)$$

**Case:2**
    $$When \ S=S_{1},\ and \ \vec W = \vec W_{1}$$

$$\therefore S_{2} = S_{1} + \left (\frac{\partial C}{\partial W_{1}}\right)^{2}\tag{from eq(i)}$$

$$\therefore S_{2} = \left (\frac{\partial C}{\partial W_{0}}\right)^{2} + \left (\frac{\partial C}{\partial W_{1}}\right)^{2}\tag{from (ii)}$$

$$\therefore \vec W_{2} = \vec W_{1} -\eta \frac{ \frac{\partial \vec C}{\partial \vec W_{1}}}{\sqrt{S_{2}+ \epsilon}}$$

$$\therefore \vec W_{2} = \vec W_{1} -\eta \frac{ \frac{\partial \vec C}{\partial \vec W_{1}}}{\sqrt{\left (\frac{\partial C}{\partial W_{0}}\right)^{2} + \left (\frac{\partial C}{\partial W_{1}}\right)^{2}+ \epsilon}}$$

`Similarly when`
$$When \ S=S_{n-1},\ and \ \vec W = \vec W_{n-1}$$

$$\therefore S_{n} = \left (\frac{\partial C}{\partial W_{0}}\right)^{2} + \left (\frac{\partial C}{\partial W_{1}}\right)^{2}+\dots+\left (\frac{\partial C}{\partial W_{n-1}}\right)^{2}$$

$$\therefore S_{n} = \sum_{i=0}^{n-1}\left (\frac{\partial C}{\partial W_{i}}\right)^{2}$$

$$\vec W_{n} = \vec W_{n-1} -\eta \frac{ \frac{\partial \vec C}{\partial \vec W_{n-1}}}{\sqrt{S_{n}+ \epsilon}}$$

$$\vec W_{n} = \vec W_{n-1} -\eta \frac{ \frac{\partial \vec C}{\partial \vec W_{n-1}}}{\sqrt{\sum_{i=0}^{n-1}\left (\frac{\partial C}{\partial W_{i}}\right)^{2}+ \epsilon}}$$

In [7]:
# Hidden-layer non-linearity.
activation = tf.keras.activations.tanh

# Fresh network: flatten 28x28 -> 128 hidden units -> 10 logits.
model = tf.keras.Sequential(
    [
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation=activation),
        tf.keras.layers.Dense(10),
    ]
)

# Adagrad with its default hyper-parameters.
optim = tf.keras.optimizers.Adagrad()
model.compile(
    optimizer=optim,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# model summary
model.summary()

# Train for 10 epochs with mini-batches of 32, then score on the test split.
model.fit(train_images, train_labels, epochs=10, batch_size=32)

test_loss, test_acc = model.evaluate(test_images, test_labels, verbose=2)

print('test_loss', test_loss)
print('test_accuracy', test_acc)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_3 (Flatten)          (None, 784)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 128)               100480    
_________________________________________________________________
dense_7 (Dense)              (None, 10)                1290      
Total params: 101,770
Trainable params: 101,770
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
313/313 - 0s - loss: 0.5237 - accuracy: 0.8190
test_loss 0.5236613154411316
test_accuracy 0.8190000057220459


**Observations**
- As