In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import matplotlib.pyplot as plt

cifar10 = keras.datasets.cifar10

# load_data() yields a (train, test) pair, each an (images, labels) tuple.
(train_images, train_labels), (test_images, test_labels) = cifar10.load_data()

print(train_images.shape) # 50000, 32, 32, 3

# Rescale pixel intensities from the 0..255 range down to 0..1.
train_images = train_images / 255.0
test_images = test_images / 255.0

# Display name for each class, looked up by the integer label.
class_names = [
    'airplane', 'automobile', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
]

def show():
    """Display the first 16 training images in a 4x4 grid, captioned by class name.

    Reads the module-level ``train_images``, ``train_labels`` and
    ``class_names``; returns nothing.
    """
    n_rows, n_cols = 4, 4
    plt.figure(figsize=(10,10))
    for idx in range(n_rows * n_cols):
        # Subplot positions are 1-based.
        plt.subplot(n_rows, n_cols, idx + 1)
        # Strip ticks and grid lines so only the picture is visible.
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)
        plt.imshow(train_images[idx], cmap=plt.cm.binary)
        # Each CIFAR label is itself an array, so element 0 holds the
        # actual class index.
        label_index = train_labels[idx][0]
        plt.xlabel(class_names[label_index])
    plt.show()

(50000, 32, 32, 3)


In [None]:
# Optimizer names to compare; each must match a branch in optimizer_fn().
optimizer_options = [
    "SGDNesterov",
    "Adagrad",
    "RMSProp",
    "AdaDelta",
    "Adam",
]
# Dropout settings to sweep over (currently just the no-dropout case).
dropout_options = [False]

In [None]:
def optimizer_fn(optimizer, lr, name='Optimizer'):
    """Build the Keras optimizer selected by ``optimizer``.

    Args:
        optimizer: One of 'SGDNesterov', 'Adagrad', 'RMSProp', 'AdaDelta'
            or 'Adam'.
        lr: Learning rate, forwarded unchanged to the optimizer constructor.
        name: Unused. Kept only so callers that already pass it keep working.

    Returns:
        A freshly constructed optimizer instance. 'Adam' maps to the
        notebook's own ``AdamOptimizer`` class rather than the stock Keras one.

    Raises:
        NotImplementedError: If ``optimizer`` is not one of the names above.
    """
    # The learning rate is deliberately a plain constant. A per-step
    # lr / sqrt(t) decay cannot be expressed by dividing by a step variable
    # once at construction time: in eager mode that expression is evaluated
    # immediately and frozen, and a frozen tensor also stops Keras from
    # creating an assignable learning-rate variable.
    if optimizer == 'SGDNesterov':
        return tf.keras.optimizers.SGD(learning_rate=lr, momentum=0.99, nesterov=True)
    if optimizer == 'Adagrad':
        return tf.keras.optimizers.Adagrad(learning_rate=lr)
    if optimizer == 'RMSProp':
        return tf.keras.optimizers.RMSprop(learning_rate=lr)
    if optimizer == 'AdaDelta':
        return tf.keras.optimizers.Adadelta(learning_rate=lr)
    if optimizer == 'Adam':
        # AdamOptimizer lives in a later cell; the name is resolved at call
        # time, so that cell must have been run before requesting 'Adam'.
        return AdamOptimizer(learning_rate=lr)

    # Unknown names are rejected without creating any TensorFlow state.
    raise NotImplementedError(" [*] Optimizer is not included in list!")

In [None]:
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.framework import ops
from tensorflow.python.training import optimizer
#from tensorflow.python.eager import context
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.training import training_ops

from keras import backend_config
from keras.optimizers.optimizer_v2 import optimizer_v2

class AdamOptimizer(optimizer_v2.OptimizerV2):
    """Adam optimizer written against the Keras ``OptimizerV2`` base class.

    Keeps two slot variables per trained variable ("m" and "v"), plus a third
    ("vhat") when ``amsgrad`` is enabled, and applies dense updates through
    the fused ``ResourceApplyAdam`` / ``ResourceApplyAdamWithAmsgrad`` raw ops.

    NOTE(review): no ``_resource_apply_sparse`` is defined here, so sparse
    gradients (e.g. from embedding lookups) fall through to whatever the base
    class does -- presumably unsupported; confirm before using with such layers.
    """

    def __init__(
        self,
        learning_rate=0.001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-7,
        amsgrad=False,
        name="Adam",
        **kwargs
    ):
        """Register the hyperparameters.

        Args:
            learning_rate: Step size; a legacy ``lr`` keyword in ``kwargs``
                takes precedence when present.
            beta_1: Stored as the "beta_1" hyperparameter.
            beta_2: Stored as the "beta_2" hyperparameter.
            epsilon: Passed to the update op; a falsy value falls back to
                ``backend_config.epsilon()``.
            amsgrad: When true, an extra "vhat" slot is created and the
                AMSGrad variant of the update op is used.
            name: Optimizer name handed to the base class.
            **kwargs: Forwarded to the base-class constructor.
        """
        super().__init__(name, **kwargs)
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
        self._set_hyper("decay", self._initial_decay)
        self._set_hyper("beta_1", beta_1)
        self._set_hyper("beta_2", beta_2)
        self.epsilon = epsilon or backend_config.epsilon()
        self.amsgrad = amsgrad

    def _create_slots(self, var_list):
        """Create the "m", "v" (and optionally "vhat") slots for each variable."""
        # Create slots for the first and second moments.
        # Separate for-loops to respect the ordering of slot variables from v1.
        for var in var_list:
            self.add_slot(var, "m")
        for var in var_list:
            self.add_slot(var, "v")
        if self.amsgrad:
            for var in var_list:
                self.add_slot(var, "vhat")

    def _prepare_local(self, var_device, var_dtype, apply_state):
        """Add the per-(device, dtype) Adam coefficients to ``apply_state``."""
        super()._prepare_local(var_device, var_dtype, apply_state)

        # The first update uses step 1, hence the +1 on the iteration counter.
        local_step = tf.cast(self.iterations + 1, var_dtype)
        beta_1_t = tf.identity(self._get_hyper("beta_1", var_dtype))
        beta_2_t = tf.identity(self._get_hyper("beta_2", var_dtype))
        beta_1_power = tf.pow(beta_1_t, local_step)
        beta_2_power = tf.pow(beta_2_t, local_step)
        # Bias-corrected step size. The dense update below does not read this
        # entry: it passes "lr_t" together with the beta powers to the raw op.
        lr = apply_state[(var_device, var_dtype)]["lr_t"] * (
            tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)
        )
        apply_state[(var_device, var_dtype)].update(
            dict(
                lr=lr,
                epsilon=tf.convert_to_tensor(self.epsilon, var_dtype),
                beta_1_t=beta_1_t,
                beta_1_power=beta_1_power,
                one_minus_beta_1_t=1 - beta_1_t,
                beta_2_t=beta_2_t,
                beta_2_power=beta_2_power,
                one_minus_beta_2_t=1 - beta_2_t,
            )
        )

    def set_weights(self, weights):
        """Set optimizer weights, dropping surplus V1-style entries if present."""
        params = self.weights
        # If the weights are generated by Keras V1 optimizer, it includes vhats
        # even without amsgrad, i.e, V1 optimizer has 3x + 1 variables, while V2
        # optimizer has 2x + 1 variables. Filter vhats out for compatibility.
        num_vars = int((len(params) - 1) / 2)
        if len(weights) == 3 * num_vars + 1:
            weights = weights[: len(params)]
        super().set_weights(weights)

    def _resource_apply_dense(self, grad, var, apply_state=None):
        """Apply one Adam (or AMSGrad) update to ``var`` for a dense gradient.

        Returns the op produced by the corresponding fused raw op.
        """
        var_device, var_dtype = var.device, var.dtype.base_dtype
        # Use the precomputed coefficients when available, otherwise ask the
        # base class to build them for this device/dtype pair.
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)

        m = self.get_slot(var, "m")
        v = self.get_slot(var, "v")

        if not self.amsgrad:
            return tf.raw_ops.ResourceApplyAdam(
                var=var.handle,
                m=m.handle,
                v=v.handle,
                beta1_power=coefficients["beta_1_power"],
                beta2_power=coefficients["beta_2_power"],
                lr=coefficients["lr_t"],
                beta1=coefficients["beta_1_t"],
                beta2=coefficients["beta_2_t"],
                epsilon=coefficients["epsilon"],
                grad=grad,
                use_locking=self._use_locking,
            )
        else:
            vhat = self.get_slot(var, "vhat")
            return tf.raw_ops.ResourceApplyAdamWithAmsgrad(
                var=var.handle,
                m=m.handle,
                v=v.handle,
                vhat=vhat.handle,
                beta1_power=coefficients["beta_1_power"],
                beta2_power=coefficients["beta_2_power"],
                lr=coefficients["lr_t"],
                beta1=coefficients["beta_1_t"],
                beta2=coefficients["beta_2_t"],
                epsilon=coefficients["epsilon"],
                grad=grad,
                use_locking=self._use_locking,
            )

    def get_config(self):
        """Return the optimizer configuration as a plain dict.

        Extends the base-class config with the hyperparameters registered in
        ``__init__`` so the optimizer can be serialised and rebuilt.
        """
        config = super().get_config()
        config.update(
            {
                "learning_rate": self._serialize_hyperparameter("learning_rate"),
                "decay": self._initial_decay,
                "beta_1": self._serialize_hyperparameter("beta_1"),
                "beta_2": self._serialize_hyperparameter("beta_2"),
                "epsilon": self.epsilon,
                "amsgrad": self.amsgrad,
            }
        )
        return config

In [None]:
def running_model(optimizer, lr=0.001, batch_size=128, epochs=10):
  """Build, train and evaluate the CNN with the requested optimizer.

  Uses the module-level ``train_images``/``train_labels`` for fitting and
  ``test_images``/``test_labels`` for evaluation.

  Args:
    optimizer: Optimizer name understood by ``optimizer_fn``.
    lr: Learning rate passed to ``optimizer_fn``.
    batch_size: Batch size for both ``fit`` and ``evaluate``.
    epochs: Number of training epochs.

  Returns:
    A ``(history, test_metrics)`` tuple: the object returned by ``model.fit``
    and the value returned by ``model.evaluate`` on the test set.
  """
  model = keras.models.Sequential()
  # Three identical conv + max-pool stages.
  for stage in range(3):
    # Only the first layer of a Sequential model needs the input shape.
    first_layer_kwargs = {'input_shape': (32,32,3)} if stage == 0 else {}
    model.add(layers.Conv2D(32, (5,5), strides=(1,1), padding="valid",
                            activation='relu', **first_layer_kwargs))
    model.add(layers.MaxPool2D(pool_size=(3, 3), strides=(2,2), padding="same"))
  model.add(layers.Flatten())
  model.add(layers.Dense(1000, activation='relu'))
  # Final layer emits raw logits (no softmax); the loss below expects that.
  model.add(layers.Dense(10))
  # summary() prints the table itself and returns None, so it must not be
  # wrapped in print() or a stray "None" line is emitted.
  model.summary()

  # loss and optimizer
  loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  optim = optimizer_fn(optimizer, lr)
  metrics = ["accuracy"]

  model.compile(optimizer=optim, loss=loss, metrics=metrics)

  # training
  history = model.fit(train_images, train_labels, epochs=epochs,
                      batch_size=batch_size, verbose=2)

  # evaluate
  test_metrics = model.evaluate(test_images, test_labels,
                                batch_size=batch_size, verbose=2)
  return history, test_metrics

In [None]:
# Run one full train/evaluate cycle per (optimizer, dropout) combination.
# Note: `dropout` is only echoed in the banner; running_model() is not given it.
for optimizer in optimizer_options:
  for dropout in dropout_options:
    banner = f'\nOptimizer: {optimizer}\tDropout option: {dropout}\n'
    print(banner)
    running_model(optimizer)


Optimizer: SGDNesterov	Dropout option: False

Model: "sequential_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_42 (Conv2D)          (None, 28, 28, 32)        2432      
                                                                 
 max_pooling2d_40 (MaxPoolin  (None, 14, 14, 32)       0         
 g2D)                                                            
                                                                 
 conv2d_43 (Conv2D)          (None, 10, 10, 32)        25632     
                                                                 
 max_pooling2d_41 (MaxPoolin  (None, 5, 5, 32)         0         
 g2D)                                                            
                                                                 
 conv2d_44 (Conv2D)          (None, 1, 1, 32)          25632     
                                                                 
 max_p

KeyboardInterrupt: ignored