# Use a custom training loop to use different learning rates for different layers

* Download the data, 
* Create train/val/test datasets
* Shift & scale
* Use one-hot encoding

In [1]:
import tensorflow as tf
from tensorflow import keras

(X_train_full, y_train_full), (X_test_full, y_test_full) = keras.datasets.mnist.load_data()

In [2]:
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler

boundary = 10000
num_classes = 10

X_train = (X_train_full[:boundary]).astype(np.float32)
y_train = to_categorical(y_train_full[:boundary], num_classes=num_classes)

X_val = (X_train_full[boundary:]).astype(np.float32)
y_val = to_categorical(y_train_full[boundary:].astype(np.int8), num_classes=num_classes)

X_test = (X_test_full).astype(np.float32)
y_test = to_categorical(y_test_full.astype(np.int8), num_classes=num_classes)

X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_val = np.reshape(X_val, (X_val.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

scaler.fit(X_val)
X_val = scaler.transform(X_val)

scaler.fit(X_test)
X_test = scaler.transform(X_test)

print('\nTrain')
print(X_train.shape, X_train.dtype)
print(y_train.shape, y_train.dtype)

print('\nVal')
print(X_val.shape, X_val.dtype)
print(y_val.shape, y_val.dtype)

print('\nTest')
print(X_test.shape, X_test.dtype)
print(y_test.shape, y_test.dtype)

print('\nMean values:')
print(X_train.mean())
print(X_val.mean())
print(X_test.mean())

print('\nStd devs')
print(X_train.std())
print(X_val.std())
print(X_test.std())


Train
(10000, 784) float32
(10000, 10) float32

Val
(50000, 784) float32
(50000, 10) float32

Test
(10000, 784) float32
(10000, 10) float32

Mean values:
2.0484535e-10
1.7590426e-09
-7.0399166e-09

Std devs
0.9265089
0.95498234
0.92306054


## Create a model

In [14]:
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=X_train.shape[1:]),
    keras.layers.Dense(30, activation='elu', kernel_initializer='he_normal'),
    keras.layers.Dense(10, activation='softmax')
])

for layer in [1, 2]:
    print(model.layers[layer].trainable_variables)

[<tf.Variable 'dense_14/kernel:0' shape=(784, 30) dtype=float32, numpy=
array([[-0.06334785, -0.03868385, -0.00731754, ...,  0.01721342,
        -0.00363457, -0.04714373],
       [-0.1105616 ,  0.01088115,  0.08010225, ...,  0.02330122,
        -0.00272126,  0.03687508],
       [ 0.05796933,  0.02085468, -0.00350108, ..., -0.04745568,
         0.07518546,  0.0439621 ],
       ...,
       [-0.00030396, -0.08895139, -0.02215209, ...,  0.10188583,
         0.04243958,  0.03422828],
       [-0.05767525, -0.02718014,  0.07887196, ...,  0.05166608,
        -0.03118842, -0.00030598],
       [-0.03227028, -0.07641676, -0.01380265, ..., -0.08534314,
         0.04366009, -0.00054667]], dtype=float32)>, <tf.Variable 'dense_14/bias:0' shape=(30,) dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>]
[<tf.Variable 'dense_15/kernel:0' shape=(30, 10) dtype=float32, numpy=
array([[-0

## Define a function for getting a batch of data

In [4]:
def random_batch(X, y, batch_size=32):
    idx = np.random.randint(len(X), size=batch_size)
    return X[idx], y[idx]

## Define a function for printing data out during training

In [5]:
def print_status_bar(epoch, iteration, mean_training_loss, metrics):
    print('Epoch = {}, iteration = {}, mean_training_loss = {:.3f}, mean_accuracy={:.3f}'.format(epoch, iteration, mean_training_loss, metrics[0].result()))

## Define hyper-parameters

In [20]:
n_epochs = 2
batch_size = 64
n_steps = len(X_train) // batch_size
optimizer_1 = keras.optimizers.Nadam(lr=0.01)
optimizer_2 = keras.optimizers.Nadam(lr=0.01)
loss_fn = keras.losses.categorical_crossentropy
mean_loss = keras.metrics.Mean()
metrics = [keras.metrics.Accuracy()]

## Build the custom loop

In [36]:
# loop over the epochs
for epoch in range(1, n_epochs + 1):
    print('Epoch', epoch)
    
    # loop over the batches
    for step in range(1, n_steps + 1):
        X_batch, y_batch = random_batch(X_train, y_train)
        
        with tf.GradientTape() as tape:
            y_pred = model(X_batch, training=True)  # Use the model to make predictions on the batch
            loss = tf.reduce_mean(loss_fn(y_batch, y_pred))  # Find the average loss over the batch
            
        gradients = tape.gradient(loss, [model.layers[1].trainable_variables, model.layers[2].trainable_variables])  # Find d(loss)/d(var) for trainable vars
        print(gradients)
        
        optimizer_1.apply_gradients(zip(gradients, model.layers[1].trainable_variables))  # apply a gradient descent step
        optimizer_2.apply_gradients(zip(gradients, model.layers[2].trainable_variables))
        
        # calculate things to print
        ml = mean_loss(loss)
        for metric in metrics:
            metric(y_batch, y_pred)
            
        iteration = step*batch_size
        
        # print out the info
        print_status_bar(epoch, iteration, ml.numpy(), metrics)
            

Epoch 1
[[<tf.Tensor: id=143238, shape=(784, 30), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>, <tf.Tensor: id=143237, shape=(30,), dtype=float32, numpy=
array([-3.01859109e-03, -2.87481374e-03, -3.07617971e-04,  1.51406304e-04,
       -7.32861925e-04, -1.85290230e-06,  2.45796855e-06,  4.35578649e-08,
        1.83739007e-06, -2.05368960e-05,  8.60313321e-06, -9.92809419e-06,
       -2.39732559e-04, -3.45359585e-04,  1.04909588e-03,  2.57871929e-03,
        4.10520798e-03,  9.39877587e-04, -2.62016227e-04,  2.93579709e-04,
        6.48872810e-05, -1.02387403e-05,  1.81811419e-03,  4.99843666e-03,
       -2.03044954e-04,  1.06391135e-05, -6.03130275e-06,  2.54580318e-05,
       -3.41951963e-03,  1.90237397e-03], dtype=float32)>], [<tf.Tensor: id=143235, sha

InvalidArgumentError: Shapes of all inputs must match: values[0].shape = [784,30] != values[1].shape = [30] [Op:Pack] name: x