In [1]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

## 1. Defining Custom Layers 

The fundamental data structure in neural networks is the layer. A Layer is an object that encapsulates some state (weights) and some computation (a forward pass). 

`tf.keras.layers.Layer` is the base class of all Keras layers, and it inherits from `tf.Module`

#### a)  Define a Layer

In [2]:
class MyDense(tf.keras.layers.Layer):
    """Fully connected layer with a ReLU activation and eagerly created weights.

    Both variables are allocated in the constructor, so the input width must
    be known when the layer is instantiated.

    Args:
        in_features: Size of the last axis of the tensors passed to the layer.
        out_features: Number of output units.
        **kwargs: Forwarded to the base Keras ``Layer`` (e.g. ``name``).
    """

    def __init__(self, in_features, out_features, **kwargs):
        super().__init__(**kwargs)
        kernel_init = tf.random.normal([in_features, out_features])
        bias_init = tf.zeros([out_features])
        self.w = tf.Variable(kernel_init, name='w')
        self.b = tf.Variable(bias_init, name='b')

    def call(self, x):
        """Forward pass: affine transform followed by ReLU."""
        pre_activation = tf.matmul(x, self.w) + self.b
        return tf.nn.relu(pre_activation)


In [3]:
# Build the layer: 2 input features mapped to 4 output units
simple_layer = MyDense(in_features=2, out_features=4, name="simple")

# Forward pass on a random batch of four 2-feature rows
x = tf.random.normal(shape=(4, 2))
y = simple_layer(x)

print(y)

2022-09-19 14:32:23.732668: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7 MB memory:  -> device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0004:04:00.0, compute capability: 7.0


tf.Tensor(
[[0.6998929  0.24331883 0.         0.36765662]
 [2.8838756  0.         0.9268994  0.        ]
 [2.2448802  0.9250624  0.         1.3324821 ]
 [0.         1.2756517  0.         0.6899017 ]], shape=(4, 4), dtype=float32)


#### b) Build Method 

It is often convenient to delay creating variables until the input shape is fixed.

In [4]:
class MyDense(tf.keras.layers.Layer):
    """Linear dense layer (no activation) whose weights are created in `build`.

    Only the number of output units is fixed at construction time; the input
    width is read from the shape handed to `build`.

    Args:
        units: Number of output units.
        **kwargs: Forwarded to the base Keras ``Layer`` (e.g. ``name``).
    """

    def __init__(self, units=32, **kwargs):
        super().__init__(**kwargs)
        self.units = units

    def build(self, input_shape):
        """Create `w` and `b`, sizing `w` from the last axis of `input_shape`."""
        fan_in = input_shape[-1]
        kernel_init = tf.random.normal([fan_in, self.units])
        self.w = tf.Variable(kernel_init, name='w')
        self.b = tf.Variable(tf.zeros([self.units]), name='b')

    def call(self, inputs):
        """Forward pass: `inputs @ w + b`."""
        projected = tf.matmul(inputs, self.w)
        return projected + self.b

In [5]:
# Create the layer; only the number of units is given up front
flexible_layer = MyDense(units=4, name="simple")

# Calling the layer on a sample input builds it for 2 input features
x = tf.random.normal(shape=(2, 2))
y = flexible_layer(x)

print(y)

tf.Tensor(
[[-1.6320368   1.3680271  -1.5001991  -0.03979062]
 [-1.4034127   1.7092023  -1.3292414  -0.04464768]], shape=(2, 4), dtype=float32)


In [6]:
# The layer has been called once, so its variables exist and can be inspected

flexible_layer.variables

[<tf.Variable 'simple/w:0' shape=(2, 4) dtype=float32, numpy=
 array([[ 0.9928404 , -1.1255851 ,  0.93421894,  0.02994949],
        [-0.6392404 ,  0.26839444, -0.5679271 , -0.01034955]],
       dtype=float32)>,
 <tf.Variable 'simple/b:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>]

In [7]:
# Each variable is also reachable as an attribute of the layer (here the bias `b`)

flexible_layer.b

<tf.Variable 'simple/b:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>

#### c) Non-trainable weights 

By default, the variables in a layer are trainable, i.e. they will be tracked by the Gradient Tape and updated during backpropagation. However, we can also mark certain weights as non-trainable.

In [8]:
class MyDense(tf.keras.layers.Layer):
    """Linear dense layer with a trainable kernel and a frozen bias.

    The weights are created in `build`. `w` is marked trainable, while `b` is
    created with ``trainable=False`` and is therefore reported under
    `non_trainable_weights`.

    Args:
        units: Number of output units.
        **kwargs: Forwarded to the base Keras ``Layer`` (e.g. ``name``).
    """

    def __init__(self, units=32, **kwargs):
        super().__init__(**kwargs)
        self.units = units

    def build(self, input_shape):
        """Create `w` (trainable) and `b` (non-trainable) for the given input shape."""
        fan_in = input_shape[-1]
        kernel_init = tf.random.normal([fan_in, self.units])
        self.w = tf.Variable(kernel_init, name='w', trainable=True)
        self.b = tf.Variable(tf.zeros([self.units]), name='b', trainable=False)

    def call(self, inputs):
        """Forward pass: `inputs @ w + b`."""
        projected = tf.matmul(inputs, self.w)
        return projected + self.b

In [9]:
# Instantiate a 16-unit layer and build it by calling it on a sample input
my_new_layer = MyDense(units=16)

x = tf.random.normal(shape=(2, 2))
y = my_new_layer(x)

y.shape

TensorShape([2, 16])

In [10]:
# `weights` lists every variable of the layer; `non_trainable_weights` only the frozen ones
print("weights:", len(my_new_layer.weights))
print("non-trainable weights:", len(my_new_layer.non_trainable_weights))


# The bias `b` was created with trainable=False, so it is not included in the
# trainable weights and shows up under the non-trainable weights instead:
print("\n trainable_weights:", my_new_layer.trainable_weights)
print("\n non trainable_weights:", my_new_layer.non_trainable_weights)

weights: 2
non-trainable weights: 1

 trainable_weights: [<tf.Variable 'my_dense/w:0' shape=(2, 16) dtype=float32, numpy=
array([[ 0.90057856, -1.0036141 , -0.58286494,  1.1879911 , -0.36154723,
         2.402001  ,  1.1370946 , -0.5823032 , -1.6684176 , -0.95081204,
        -1.5145642 , -0.8617373 , -0.8604067 ,  1.1418787 , -0.52168334,
         0.17226969],
       [ 1.6129915 , -0.2245607 , -0.18759921,  0.3975187 , -0.06355886,
         0.8081994 ,  0.59818935, -1.4164699 ,  1.6456181 , -0.06233479,
         0.12785867, -0.11070318,  2.0580926 ,  1.5896587 , -1.7733927 ,
         0.28407833]], dtype=float32)>]

 non trainable_weights: [<tf.Variable 'my_dense/b:0' shape=(16,) dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)>]


#### d) training arg in call()  

In [11]:
class CustomDropout(tf.keras.layers.Layer):
    """Dropout layer that is only active when called with a truthy `training`.

    Args:
        rate: Drop rate handed to `tf.nn.dropout`.
        **kwargs: Forwarded to the base Keras ``Layer`` (e.g. ``name``).
    """

    def __init__(self, rate, **kwargs):
        super().__init__(**kwargs)
        self.rate = rate

    def call(self, inputs, training=None):
        """Apply dropout while training; otherwise return `inputs` unchanged."""
        # Inference (or an unspecified mode) is the identity.
        if not training:
            return inputs
        return tf.nn.dropout(inputs, rate=self.rate)

In [12]:
# Demonstrate how the `training` flag switches the layer's behaviour.
dropout = CustomDropout(rate=0.5)

x = tf.random.normal((2,2))
print('input: ', x)


# During training: entries are randomly zeroed by tf.nn.dropout
output_during_training = dropout(x, training=True)
print('\n output_during_training: ', output_during_training)


# During inference: the input is passed through unchanged
output_during_inference = dropout(x, training=False)
print('\n output_during_inference: ', output_during_inference)

input:  tf.Tensor(
[[ 1.0559696  -0.7131017 ]
 [ 0.77184045  1.237681  ]], shape=(2, 2), dtype=float32)

 output_during_training:  tf.Tensor(
[[ 0.        -1.4262034]
 [ 1.5436809  2.475362 ]], shape=(2, 2), dtype=float32)

 output_during_inference:  tf.Tensor(
[[ 1.0559696  -0.7131017 ]
 [ 0.77184045  1.237681  ]], shape=(2, 2), dtype=float32)


#### e) Recursively composable

It is also possible to compose a layer out of other layers. The outer layer will automatically track the weights of the inner layers.

In [13]:
# Let's assume we are reusing the MyDense class
# with a `build` method that we defined above.


class MLPBlock(tf.keras.layers.Layer):
    """Three stacked `MyDense` layers (32, 32 and 1 units) with ReLU in between.

    The inner layers are stored as attributes, so the block tracks their
    weights automatically.

    Args:
        **kwargs: Forwarded to the base Keras ``Layer`` (e.g. ``name``), for
            consistency with the other custom layers in this notebook.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.dense_1 = MyDense(32)
        self.dense_2 = MyDense(32)
        self.dense_3 = MyDense(1)

    def call(self, inputs):
        """Forward pass: dense -> ReLU -> dense -> ReLU -> dense (no final activation)."""
        x = self.dense_1(inputs)
        x = tf.nn.relu(x)
        x = self.dense_2(x)
        x = tf.nn.relu(x)
        return self.dense_3(x)


mlp = MLPBlock()
y = mlp(tf.ones(shape=(3, 64)))  # The first call to the `mlp` will create the weights
# The block tracks the variables of its three inner layers: 3 x (w, b) = 6 weights.
print("weights:", len(mlp.weights))
# NOTE: this count depends on which `MyDense` definition is live; with the variant
# whose bias is created with trainable=False, only the three `w` are trainable.
print("trainable weights:", len(mlp.trainable_weights))
print("y.shape: ", y.shape)

weights: 6
trainable weights: 3
y.shape:  (3, 1)


## 2. Defining Models: Three Levels of abstraction

Given a set of (either predefined or custom-defined) layers, we can begin composing them into a DAG to define a model. A `tf.keras.Model` is similar to a `tf.keras.layers.Layer`, except that models come with extra functionality that makes them easy to train, evaluate, load, save, and even train on multiple machines.

#### a) Sequential

A Sequential model is appropriate for a plain stack of layers where each layer has exactly one input tensor and one output tensor.

In [14]:
# There are two ways to define a sequential model:

# 1. Either as a list of layers
#    (this first model only illustrates the syntax; `model` is rebound below)

model = tf.keras.Sequential(
    [
        tf.keras.layers.InputLayer(input_shape=(4,)),
        tf.keras.layers.Dense(32),
        tf.keras.layers.ReLU(),
        tf.keras.layers.Dense(16),
        tf.keras.layers.ReLU(),
        tf.keras.layers.Dense(1)
    ]
)


# 2. Or instantiate a Sequential Model and add layers by calling the .add() method on it
#    (same architecture as above: 4 inputs -> 32 -> 16 -> 1, ReLU between the dense layers)
model = tf.keras.Sequential()
model.add(tf.keras.layers.InputLayer(input_shape=(4,)))
model.add(tf.keras.layers.Dense(32))
model.add(tf.keras.layers.ReLU())
model.add(tf.keras.layers.Dense(16))
model.add(tf.keras.layers.ReLU())
model.add(tf.keras.layers.Dense(1))

In [15]:
# Now we can call the model on an Input Tensor
# (a batch of 16 rows with the 4 features declared by the InputLayer)
x = tf.ones((16, 4))
y = model(x)

print(y.shape)

(16, 1)


In [16]:
# We can call the summary method to display a layer-by-layer table of
# output shapes and parameter counts
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 32)                160       
                                                                 
 re_lu_2 (ReLU)              (None, 32)                0         
                                                                 
 dense_4 (Dense)             (None, 16)                528       
                                                                 
 re_lu_3 (ReLU)              (None, 16)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 17        
                                                                 
Total params: 705
Trainable params: 705
Non-trainable params: 0
_________________________________________________________________


#### b) Functional API 

The __Functional API__ is more flexible than Sequential, and comes in especially handy when the model has a non-linear topology, shared layers, and/or multiple inputs or outputs.

First, let's redefine the above model using the Functional API.

In [17]:
# Symbolic input: batches of 4-feature rows
inputs = tf.keras.Input(shape=(4,))

# Each layer is instantiated and immediately called on the previous tensor,
# which wires up the graph step by step
x = tf.keras.layers.Dense(32)(inputs)
x = tf.keras.layers.ReLU()(x)
x = tf.keras.layers.Dense(16)(x)
x = tf.keras.layers.ReLU()(x)

outputs = tf.keras.layers.Dense(1)(x)


# The model is defined by its input and output tensors
model = tf.keras.Model(inputs=inputs, outputs=outputs, name="functional_model")

In [18]:
# Same layers and parameter counts as the Sequential version, plus an explicit InputLayer row
model.summary()

Model: "functional_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 4)]               0         
                                                                 
 dense_6 (Dense)             (None, 32)                160       
                                                                 
 re_lu_4 (ReLU)              (None, 32)                0         
                                                                 
 dense_7 (Dense)             (None, 16)                528       
                                                                 
 re_lu_5 (ReLU)              (None, 16)                0         
                                                                 
 dense_8 (Dense)             (None, 1)                 17        
                                                                 
Total params: 705
Trainable params: 705
Non-traina

With the Functional API, it's easy to define more complex topologies. Let's define a model with multiple inputs and outputs.

Let's say we want a model that takes in a few weather data variables on any given day to predict temperature and humidity for the same day:

Inputs:

- Pressure
- Precipitation
- Clouds
- Wind


Outputs:

- Temperature
- Humidity

In [19]:
# Now let's build this model

# One scalar input per weather variable; the names identify the inputs in the model
pressure_input = tf.keras.layers.Input(shape=(1,), name='pressure')
precipitation_input = tf.keras.layers.Input(shape=(1,), name='precipitation')
clouds_input = tf.keras.layers.Input(shape=(1,), name='clouds')
wind_input = tf.keras.layers.Input(shape=(1,), name='wind')


# Let's pass pressure and precipitation through one stack of dense layers, and clouds and wind through another
x = tf.keras.layers.concatenate([pressure_input, precipitation_input])
x = tf.keras.layers.Dense(units=32, activation='relu')(x)
x = tf.keras.layers.Dense(units=16, activation='relu')(x)


y = tf.keras.layers.concatenate([clouds_input, wind_input])
y = tf.keras.layers.Dense(units=32, activation='relu')(y)
y = tf.keras.layers.Dense(units=16, activation='relu')(y)


# Let's merge the two branches and send the result through a few more layers
z = tf.keras.layers.concatenate([x,y])
z = tf.keras.layers.Dense(units=32, activation='relu')(z)
z = tf.keras.layers.Dense(units=16, activation='relu')(z)

# Finally split again into two outputs
# (both heads are single-unit dense layers without an activation, i.e. regression outputs)
temperature = tf.keras.layers.Dense(units=1, name='temperature')(z)
humidity = tf.keras.layers.Dense(units=1, name='humidity')(z)


multiple_inp_model = tf.keras.Model(inputs=[pressure_input, precipitation_input, clouds_input, wind_input], 
                       outputs=[temperature, humidity], name="multi_input_output_model")

In [20]:
# We can print the summary, but for a branching graph the table
# (with its "Connected to" column) is hard to read
multiple_inp_model.summary()

Model: "multi_input_output_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 pressure (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 precipitation (InputLayer)     [(None, 1)]          0           []                               
                                                                                                  
 clouds (InputLayer)            [(None, 1)]          0           []                               
                                                                                                  
 wind (InputLayer)              [(None, 1)]          0           []                               
                                                                           

In [21]:
# Luckily we can also plot the model
# (requires the `pydot` package and a graphviz installation)
tf.keras.utils.plot_model(multiple_inp_model, show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


#### c) Subclassing 

In [22]:
class FCN(tf.keras.Model):
    """Subclassed fully connected network: Dense(32) -> Dense(16) -> Dense(1).

    A single `ReLU` layer instance is applied after each of the first two
    dense layers; the last dense layer has no activation.
    """

    def __init__(self):
        super().__init__()
        self.dense_1 = tf.keras.layers.Dense(32)
        self.dense_2 = tf.keras.layers.Dense(16)
        self.dense_3 = tf.keras.layers.Dense(1)
        self.relu = tf.keras.layers.ReLU()

    def call(self, inputs):
        """Forward pass through the three dense layers."""
        hidden = self.relu(self.dense_1(inputs))
        hidden = self.relu(self.dense_2(hidden))
        return self.dense_3(hidden)

In [23]:
model = FCN()


# Call the model on an Input Tensor
# (no input shape was declared, so this call is what fixes the input width to 4)
x = tf.ones((16, 4))
y = model(x)

print(y.shape)

(16, 1)


In [24]:
# Print summary
# (layers are listed in the order they were assigned in __init__, not in call order)

model.summary()

Model: "fcn"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            multiple                  160       
                                                                 
 dense_16 (Dense)            multiple                  528       
                                                                 
 dense_17 (Dense)            multiple                  17        
                                                                 
 re_lu_6 (ReLU)              multiple                  0         
                                                                 
Total params: 705
Trainable params: 705
Non-trainable params: 0
_________________________________________________________________


## 3. Training: Three Levels of abstraction

For this exercise, we will fix the model architecture (a small CNN) and train it on the MNIST dataset. 

In [25]:
# Prepare Dataset: load MNIST, rescale the images and one-hot encode the labels

from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Add a trailing channel axis and rescale the pixel values by 1/255
train_images = train_images.reshape((60000, 28, 28, 1)).astype('float32') / 255
test_images = test_images.reshape((10000, 28, 28, 1)).astype('float32') / 255

# Hold out everything after the first 50,000 training samples for validation
val_images, val_labels = train_images[50000:], train_labels[50000:]
train_images, train_labels = train_images[:50000], train_labels[:50000]

# One-hot encode the integer class labels of every split
train_labels, val_labels, test_labels = (
    to_categorical(labels) for labels in (train_labels, val_labels, test_labels)
)

In [26]:
# Define Model
from tensorflow.keras import layers

# 28x28 single-channel images
Input = tf.keras.layers.Input(shape=(28,28,1))

# Convolutional feature extractor: three 3x3 conv layers with 2x2 max-pooling after the first two
x = layers.Conv2D(32, (3, 3), activation='relu')(Input)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Conv2D(64, (3, 3), activation='relu')(x)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Conv2D(64, (3, 3), activation='relu')(x)
# Classifier head: flatten, one hidden dense layer, then a 10-way softmax
x = layers.Flatten()(x)
x = layers.Dense(64, activation='relu')(x)

Output = layers.Dense(10, activation='softmax')(x)

my_CNN = tf.keras.Model(inputs=Input, outputs=Output)
my_CNN.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 28, 28, 1)]       0         
                                                                 
 conv2d (Conv2D)             (None, 26, 26, 32)        320       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 13, 13, 32)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 11, 11, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 5, 5, 64)         0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 3, 3, 64)          36928 

#### a) Model.fit() method 

To use the built-in methods (`Model.fit()`, `Model.evaluate()`, `Model.predict()`), we simply need to specify the
- optimizer
- loss
- metrics

and compile the model.

In [27]:
# Attach the optimizer, the loss function to minimize and the list of metrics to monitor
my_CNN.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[tf.keras.metrics.CategoricalAccuracy()],
)

The `.fit()` method will accept `numpy arrays`, `tf.data.Dataset` objects and `data generators`. Here we will input the MNIST data as a numpy array.

The `.fit()` method can slice the data into batches, and will iterate over the entire dataset for a given number of epochs. Additionally, after each epoch it will evaluate on a hold-out validation set if specified.

In [30]:
# Train for 2 epochs in mini-batches of 64 samples; the validation set is
# passed explicitly instead of being split off by fit()
history = my_CNN.fit(
    train_images,
    train_labels,
    batch_size=64,
    epochs=2,
    validation_data=(val_images, val_labels),
)

2022-09-19 14:33:19.863515: W tensorflow/core/common_runtime/bfc_allocator.cc:462] Allocator (GPU_0_bfc) ran out of memory trying to allocate 149.54MiB (rounded to 156800000)requested by op _EagerConst
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2022-09-19 14:33:19.863571: I tensorflow/core/common_runtime/bfc_allocator.cc:1010] BFCAllocator dump for GPU_0_bfc
2022-09-19 14:33:19.863585: I tensorflow/core/common_runtime/bfc_allocator.cc:1017] Bin (256): 	Total Chunks: 74, Chunks in use: 74. 18.5KiB allocated for chunks. 18.5KiB in use in bin. 3.8KiB client-requested in use in bin.
2022-09-19 14:33:19.863595: I tensorflow/core/common_runtime/bfc_allocator.cc:1017] Bin (512): 	Total Chunks: 3, Chunks in use: 3. 1.5KiB allocated for chunks. 1.5KiB in use in bin. 1.5KiB client-requested in use in bin.
2022-09-19 14:33:19.863604: I 

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

r.cc:1066] InUse at 7ffe3ba00000 of size 1280 next 1
2022-09-19 14:33:19.863781: I tensorflow/core/common_runtime/bfc_allocator.cc:1066] InUse at 7ffe3ba00500 of size 256 next 2
2022-09-19 14:33:19.863790: I tensorflow/core/common_runtime/bfc_allocator.cc:1066] InUse at 7ffe3ba00600 of size 256 next 3
2022-09-19 14:33:19.863796: I tensorflow/core/common_runtime/bfc_allocator.cc:1066] InUse at 7ffe3ba00700 of size 256 next 4
2022-09-19 14:33:19.863803: I tensorflow/core/common_runtime/bfc_allocator.cc:1066] InUse at 7ffe3ba00800 of size 256 next 5
2022-09-19 14:33:19.863810: I tensorflow/core/common_runtime/bfc_allocator.cc:1066] InUse at 7ffe3ba00900 of size 256 next 6
2022-09-19 14:33:19.863817: I tensorflow/core/common_runtime/bfc_allocator.cc:1066] InUse at 7ffe3ba00a00 of size 256 next 7
2022-09-19 14:33:19.863823: I tensorflow/core/common_runtime/bfc_allocator.cc:1066] InUse at 7ffe3ba00b00 of size 256 next 8
2022-09-19 14:33:19.863830: I tensorflow/core/common_runtime/bfc_allocat

The returned history object holds a record of the loss and metric values recorded at the end of each epoch during training:

In [29]:
# Record of the loss and metric values from training (requires the fit() cell above to have succeeded)
history.history

NameError: name 'history' is not defined

After training, we can call the `evaluate` or `predict` methods on a test set.

In [None]:
# Evaluate the compiled loss and metrics on the test split
my_CNN.evaluate(test_images, test_labels)

In [None]:
# Run the model on the test images; verbose=1 shows a progress bar
predictions = my_CNN.predict(test_images, verbose=1)

In [None]:
# presumably (10000, 10): one row per test image, one column per class — confirm after running
predictions.shape

###### What if there are multiple outputs? 

In [None]:
#multiple_inp_model.summary()

In [None]:
multiple_inp_model.compile(

    # Optimizer
    optimizer = tf.keras.optimizers.Adam(),

    # Loss function to minimize: one entry per output, keyed by the name of the
    # output layer. Both heads are single-unit dense layers without an
    # activation, so each one needs a regression loss (a categorical loss is
    # meaningless for a single continuous output). Different losses are used
    # to show that every output can be configured independently.
    loss = {
        'temperature': tf.keras.losses.MeanSquaredError(),
        'humidity': tf.keras.losses.MeanAbsoluteError()
    },

    # List of metrics to monitor, again keyed by output name
    metrics = {
        'temperature': [tf.keras.metrics.MeanAbsoluteError(),],
        'humidity': [tf.keras.metrics.RootMeanSquaredError(),]
    }
)

#### b) Customizing what happens in Model.fit()

To customize what `fit()` does, we just need to override the `train_step(self, data)` method of the `Model` class.

Let's do this with our simple CNN from above.

In [None]:
# Source: https://www.tensorflow.org/guide/keras/customizing_what_happens_in_fit

# Module-level metric objects updated by `CustomModel.train_step` and exposed
# through `CustomModel.metrics`.
loss_tracker = tf.keras.metrics.Mean(name="loss")
accuracy_tracker = tf.keras.metrics.CategoricalAccuracy(name="accuracy")


class CustomModel(tf.keras.Model):
    """`tf.keras.Model` with a hand-written training step.

    The loss (categorical cross-entropy) and the metrics are computed inside
    `train_step`, so `compile()` only needs to be given an optimizer.
    """

    def train_step(self, data):
        """Run one optimization step on a single batch.

        Args:
            data: An `(inputs, one-hot targets)` pair for one batch.

        Returns:
            Dict with the running mean loss (`"loss"`) and the running
            categorical accuracy (`"acc"`).
        """
        x, y = data

        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)  # Forward pass
            # Compute our own loss: one cross-entropy value per sample
            per_sample_loss = tf.keras.losses.categorical_crossentropy(y, y_pred)
            # Differentiate the batch mean. Passing the per-sample vector to
            # `tape.gradient` would differentiate its sum, making the gradient
            # scale grow with the batch size.
            loss = tf.reduce_mean(per_sample_loss)

        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Compute our own metrics; the tracker receives the per-sample values
        # so every sample contributes equally to the reported mean
        loss_tracker.update_state(per_sample_loss)
        accuracy_tracker.update_state(y, y_pred)
        return {"loss": loss_tracker.result(), "acc": accuracy_tracker.result()}

    @property
    def metrics(self):
        """Metric objects that Keras should reset automatically."""
        # We list our `Metric` objects here so that `reset_states()` can be
        # called automatically at the start of each epoch
        # or at the start of `evaluate()`.
        # If you don't implement this property, you have to call
        # `reset_states()` yourself at the time of your choosing.
        return [loss_tracker, accuracy_tracker]

In [None]:
# Define the architecture
# (same layer stack as `my_CNN`: three conv layers, two max-pools, dense head with softmax)
Input = tf.keras.layers.Input(shape=(28,28,1))

x = layers.Conv2D(32, (3, 3), activation='relu')(Input)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Conv2D(64, (3, 3), activation='relu')(x)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Conv2D(64, (3, 3), activation='relu')(x)
x = layers.Flatten()(x)
x = layers.Dense(64, activation='relu')(x)

Output = layers.Dense(10, activation='softmax')(x)




# Reconstruct an instance of our CNN model
# (CustomModel is used exactly like tf.keras.Model; only train_step differs)
my_new_CNN = CustomModel(inputs=Input, outputs=Output)

# Now during compilation we don't need to pass loss or metrics
# because train_step computes both itself
my_new_CNN.compile(optimizer="adam")

# Print summary
my_new_CNN.summary()

In [None]:
# Train 
# (fit() still handles batching and epochs; each batch goes through CustomModel.train_step)
my_new_CNN.fit(
    train_images,
    train_labels,
    batch_size=64,
    epochs=2,)

#### c) Training Loop from scratch

In [None]:
# Define Model
# (a fresh copy of the CNN architecture used above, bound to `model` for the manual training loop)

Input = tf.keras.layers.Input(shape=(28,28,1))

x = layers.Conv2D(32, (3, 3), activation='relu')(Input)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Conv2D(64, (3, 3), activation='relu')(x)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Conv2D(64, (3, 3), activation='relu')(x)
x = layers.Flatten()(x)
x = layers.Dense(64, activation='relu')(x)

# The final layer applies softmax, so the model outputs class probabilities
Output = layers.Dense(10, activation='softmax')(x)

model = tf.keras.Model(inputs=Input, outputs=Output)
model.summary()

In [None]:
# Reshape data to simulate batches: a leading axis indexes the batch,
# the second axis holds the `batch_size` samples of that batch.
# NOTE: this rebinds the dataset names to the batched arrays.

batch_size = 16

image_batches = (-1, batch_size, 28, 28, 1)
label_batches = (-1, batch_size, 10)

train_images, val_images, test_images = (
    images.reshape(image_batches)
    for images in (train_images, val_images, test_images)
)

train_labels, val_labels, test_labels = (
    labels.reshape(label_batches)
    for labels in (train_labels, val_labels, test_labels)
)

In [None]:
# Define Optimizer, Loss functions and Metrics

optimizer = tf.keras.optimizers.Adam()
loss_fn = tf.keras.losses.CategoricalCrossentropy()


# Prepare the metrics.
# Separate accumulators for training and validation so their running
# statistics are never mixed.
train_acc_metric = tf.keras.metrics.CategoricalAccuracy()
val_acc_metric = tf.keras.metrics.CategoricalAccuracy()

In [None]:
# Training Script
# Manual equivalent of fit(): for every epoch, iterate over the training
# batches, apply one optimizer step per batch, then measure validation accuracy.

import time

epochs = 2

for epoch in range(epochs):
    
    print("\nStart of epoch %d" % (epoch,))
    start_time = time.time()
    
    # Reinstantiate datasets (don't have to do this for data generators or tf.data)
    # zip() returns a one-shot iterator, so it has to be recreated every epoch.
    train_dataset = zip(train_images, train_labels)
    val_dataset = zip(val_images, val_labels)
    test_dataset = zip(test_images, test_labels)

    # Iterate over the batches of the dataset.
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        
        # Record the forward pass so the loss can be differentiated.
        # NOTE: `model` ends in a softmax layer, so `logits` actually holds
        # class probabilities rather than raw logits.
        with tf.GradientTape() as tape:
            logits = model(x_batch_train, training=True)
            loss_value = loss_fn(y_batch_train, logits)
        
        # Backward pass and weight update
        grads = tape.gradient(loss_value, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))

        # Update training metric.
        train_acc_metric.update_state(y_batch_train, logits)

        
        # Log every 200 batches.
        if step % 200 == 0:
            print(
                "Training loss (for one batch) at step %d: %.4f"
                % (step, float(loss_value))
            )
            print("Seen so far: %d samples" % ((step + 1) * batch_size))

    
    # Display metrics at the end of each epoch.
    train_acc = train_acc_metric.result()
    print("Training acc over epoch: %.4f" % (float(train_acc),))

    
    # Reset training metrics at the end of each epoch
    train_acc_metric.reset_states()

    
    # Run a validation loop at the end of each epoch.
    # (forward passes only: no gradient tape and no weight updates)
    for x_batch_val, y_batch_val in val_dataset:
        val_logits = model(x_batch_val, training=False)
        # Update val metrics
        val_acc_metric.update_state(y_batch_val, val_logits)
    val_acc = val_acc_metric.result()
    val_acc_metric.reset_states()
    print("Validation acc: %.4f" % (float(val_acc),))
    print("Time taken: %.2fs" % (time.time() - start_time))