# Aim: To compare the different available `Optimizers`

In [1]:
# TensorFlow and tf.keras
import tensorflow as tf

# Helper libraries
import numpy as np
from tensorflow.keras import initializers
from tensorflow.python.keras import activations

# Show which TensorFlow release this notebook is running against.
print(tf.__version__)

# Fashion-MNIST is bundled with Keras; load_data() hands back the
# (train, test) splits, each as an (images, labels) pair.
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

# Display names for the ten clothing categories.
class_names = [
    'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
    'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot',
]

# Divide every pixel by 255.0 (presumably mapping 0-255 intensities into
# [0, 1] - confirm against the raw dtype) for both splits at once.
train_images, test_images = train_images / 255.0, test_images / 255.0

c:\users\sonu.ramkumar.jha\desktop\experiments\env\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll
c:\users\sonu.ramkumar.jha\desktop\experiments\env\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


2.5.0


In [3]:
# Fix the random seeds before building the model so that weight
# initialisation is repeatable. When this cell is re-run with a different
# optimizer, training is then intended to start from the same initial
# weights, which keeps the optimizer comparison fair.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Non-linearity applied by the hidden layer.
activation = tf.keras.activations.tanh

# 28x28 image -> flattened 784-vector -> 128 hidden units -> 10 outputs.
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation=activation),
    tf.keras.layers.Dense(10),  # no activation: the outputs are raw logits
])

# Plain SGD. 0.01 is the Keras default learning rate; it is spelled out so
# the value being compared against other optimizers is visible in the cell.
optim = tf.keras.optimizers.SGD(learning_rate=0.01)
model.compile(
    optimizer=optim,
    # from_logits=True because the last Dense layer applies no softmax.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# model summary
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 784)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               100480    
_________________________________________________________________
dense_3 (Dense)              (None, 10)                1290      
Total params: 101,770
Trainable params: 101,770
Non-trainable params: 0
_________________________________________________________________


<img src="images/model.jpeg" height="50%" width="50%" alt="Model architecture">

**Gradient Descent**

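$$\vec{W_n} = \vec{W_o} - \eta\,\frac{\partial C}{\partial \vec{W_o}}$$

where $\vec{W_o}$ and $\vec{W_n}$ are the weights before and after the update, $C$ is the cost, and $\eta$ is the learning rate (`tf.keras.optimizers.SGD()` uses $\eta = 0.01$ by default).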


$$\begin{bmatrix}
    w_{1,1} & w_{1,2} & w_{1,3} & \dots  & w_{1,128} \\
    w_{2,1} & w_{2,2} & w_{2,3} & \dots  & w_{2,128} \\
    \vdots & \vdots & \vdots & \ddots & \vdots \\
    w_{784,1} & w_{784,2} & w_{784,3} & \dots  & w_{784,128}
\end{bmatrix}^{T}\begin{bmatrix}
    i_{1}  \\
    i_{2}  \\
    \vdots  \\
    i_{784}
\end{bmatrix}+\begin{bmatrix}
    b_{1}  \\
    b_{2}  \\
    \vdots  \\
    b_{128}
\end{bmatrix} =\begin{bmatrix}
    s_{1}  \\
    s_{2}  \\
    \vdots  \\
    s_{128}
\end{bmatrix}$$

$$\text{Activation}\left (\begin{bmatrix}
    s_{1}  \\
    s_{2}  \\
    \vdots  \\
    s_{128}
\end{bmatrix} \right)=\begin{bmatrix}
    i^{\prime}_{1}  \\
    i^{\prime}_{2}  \\
    \vdots  \\
    i^{\prime}_{128}
\end{bmatrix}$$

$$\begin{bmatrix}
    w^{\prime}_{1,1} & w^{\prime}_{1,2} & w^{\prime}_{1,3} & \dots  & w^{\prime}_{1,10} \\
    w^{\prime}_{2,1} & w^{\prime}_{2,2} & w^{\prime}_{2,3} & \dots  & w^{\prime}_{2,10} \\
    \vdots & \vdots & \vdots & \ddots & \vdots \\
    w^{\prime}_{128,1} & w^{\prime}_{128,2} & w^{\prime}_{128,3} & \dots  & w^{\prime}_{128,10}
\end{bmatrix}^{T}\begin{bmatrix}
    i^{\prime}_{1}  \\
    i^{\prime}_{2}  \\
    \vdots  \\
    i^{\prime}_{128}
\end{bmatrix}+\begin{bmatrix}
    b^{\prime}_{1}  \\
    b^{\prime}_{2}  \\
    \vdots  \\
    b^{\prime}_{10}
\end{bmatrix}=\begin{bmatrix}
    s^{\prime}_{1}  \\
    s^{\prime}_{2}  \\
    \vdots  \\
    s^{\prime}_{10}
\end{bmatrix}$$


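$$\text{Activation}\left (\begin{bmatrix}
    s^{\prime}_{1}  \\
    s^{\prime}_{2}  \\
    \vdots  \\
    s^{\prime}_{10}
\end{bmatrix} \right )=\begin{bmatrix}
    y^{\prime}_{1}  \\
    y^{\prime}_{2}  \\
    \vdots  \\
    y^{\prime}_{10}
\end{bmatrix}$$

Note: in the model above the output layer is `Dense(10)` with no activation, so here $y^{\prime} = s^{\prime}$ are raw logits; the softmax is applied inside the loss (`from_logits=True`).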

## Whole Forward Pass 

$$\text{Activation}\left (\begin{bmatrix}
    w^{\prime}_{1,1} & w^{\prime}_{1,2} & w^{\prime}_{1,3} & \dots  & w^{\prime}_{1,10} \\
    w^{\prime}_{2,1} & w^{\prime}_{2,2} & w^{\prime}_{2,3} & \dots  & w^{\prime}_{2,10} \\
    \vdots & \vdots & \vdots & \ddots & \vdots \\
    w^{\prime}_{128,1} & w^{\prime}_{128,2} & w^{\prime}_{128,3} & \dots  & w^{\prime}_{128,10}
\end{bmatrix}^{T}\ \text{Activation}\left (\begin{bmatrix}
    w_{1,1} & w_{1,2} & w_{1,3} & \dots  & w_{1,128} \\
    w_{2,1} & w_{2,2} & w_{2,3} & \dots  & w_{2,128} \\
    \vdots & \vdots & \vdots & \ddots & \vdots \\
    w_{784,1} & w_{784,2} & w_{784,3} & \dots  & w_{784,128}
\end{bmatrix}^{T}\begin{bmatrix}
    i_{1}  \\
    i_{2}  \\
    \vdots  \\
    i_{784}
\end{bmatrix}+\begin{bmatrix}
    b_{1}  \\
    b_{2}  \\
    \vdots  \\
    b_{128}
\end{bmatrix} \right )+\begin{bmatrix}
    b^{\prime}_{1}  \\
    b^{\prime}_{2}  \\
    \vdots  \\
    b^{\prime}_{10}
\end{bmatrix} \right )$$

**Case 1**
$$W_0 > 0$$

<img src="images/C1GD.jpeg" height="50%" width="50%" alt="Case 1 Gradient Descent">

In [None]:
# Train on the training split for ten epochs.
model.fit(x=train_images, y=train_labels, epochs=10)

# Score the trained model on the held-out test split; with the single
# 'accuracy' metric compiled in, evaluate() yields (loss, accuracy).
test_loss, test_acc = model.evaluate(x=test_images, y=test_labels, verbose=2)

# Report the final test metrics.
print('test_loss', test_loss)
print('test_accuracy', test_acc)