## Motivation

In this note, we examine the idea presented in section 1.5.5 on the MNIST data.

In [1]:
import numpy as np
import tensorflow as tf
from keras.losses import MSE
from sklearn.metrics import accuracy_score

from utils import get_gradient_loss_fn

2024-03-19 14:12:52.703021: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## The MNIST Data

In [2]:
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()


def _flatten_and_scale(images):
    """Return `images` as float32 rows of length 28*28, divided by 255.

    The division presumably maps 8-bit pixel intensities into [0, 1].
    """
    return images.astype('float32').reshape([-1, 28*28]) / 255.0


# Inputs become flat float32 vectors; labels are converted to float32 as well.
x_train, x_test = _flatten_and_scale(x_train), _flatten_and_scale(x_test)
y_train, y_test = y_train.astype('float32'), y_test.astype('float32')

## Train a Model with Gradient Loss

In [3]:
# Two dense layers: 128 ReLU units followed by a 10-unit softmax output.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(10, activation='softmax'))


def _mse_of_prediction(inputs):
    """Return the MSE between `inputs[1]` and the model's output on `inputs[0]`."""
    features, targets = inputs[0], inputs[1]
    return MSE(targets, model(features))


# Wrap the plain MSE objective with the helper imported from `utils`.
get_gradient_loss = get_gradient_loss_fn(_mse_of_prediction)

In [4]:
optimizer = tf.optimizers.Adam()

@tf.function
def train_step(x, y):
    """Apply one Adam update for the batch `(x, y)` and return the loss.

    Args:
        x: Batch of model inputs, forwarded to `get_gradient_loss`.
        y: Batch of targets paired with `x`.

    Returns:
        The value of `get_gradient_loss((x, y))` for this batch, computed
        before the update is applied.
    """
    with tf.GradientTape() as tape:
        loss = get_gradient_loss((x, y))
    # Update only the trainable weights. `model.variables` would also include
    # any non-trainable state, which should not be handed to the optimizer.
    trainable_vars = model.trainable_variables
    grads = tape.gradient(loss, trainable_vars)
    optimizer.apply_gradients(zip(grads, trainable_vars))
    return loss

In [5]:
# `tf.one_hot` takes integer class indices, but `y_train` was converted to
# float32 when the data was loaded, so cast it back explicitly instead of
# relying on an implicit conversion.
label_indices = tf.cast(y_train, tf.int32)
ds = tf.data.Dataset.from_tensor_slices(
    (x_train, tf.one_hot(label_indices, 10))
)
# Fixed batches of 100 examples, visited in the same order every epoch
# (no shuffling is applied).
ds = ds.batch(100)

In [6]:
# Make 20 passes over the batched training data.
for epoch in range(20):
    # `loss` is overwritten on every step, so the value printed below is the
    # one returned for the final batch of the epoch.
    for batch_x, batch_y in ds:
        loss = train_step(batch_x, batch_y)
    print(epoch, loss)

0 tf.Tensor(0.00022991777, shape=(), dtype=float32)
1 tf.Tensor(0.00016096806, shape=(), dtype=float32)
2 tf.Tensor(0.00013518962, shape=(), dtype=float32)
3 tf.Tensor(0.00011561232, shape=(), dtype=float32)
4 tf.Tensor(0.00011101961, shape=(), dtype=float32)
5 tf.Tensor(0.00010521267, shape=(), dtype=float32)
6 tf.Tensor(0.00011087055, shape=(), dtype=float32)
7 tf.Tensor(0.000108633445, shape=(), dtype=float32)
8 tf.Tensor(0.00010395485, shape=(), dtype=float32)
9 tf.Tensor(9.801582e-05, shape=(), dtype=float32)
10 tf.Tensor(9.600283e-05, shape=(), dtype=float32)
11 tf.Tensor(9.6686e-05, shape=(), dtype=float32)
12 tf.Tensor(8.454644e-05, shape=(), dtype=float32)
13 tf.Tensor(8.760694e-05, shape=(), dtype=float32)
14 tf.Tensor(8.176513e-05, shape=(), dtype=float32)
15 tf.Tensor(8.39288e-05, shape=(), dtype=float32)
16 tf.Tensor(8.118931e-05, shape=(), dtype=float32)
17 tf.Tensor(8.327232e-05, shape=(), dtype=float32)
18 tf.Tensor(9.277199e-05, shape=(), dtype=float32)
19 tf.Tensor(8.

In [7]:
# Predicted class for each test row = index of the largest model output.
test_predictions = tf.argmax(model(x_test), axis=1)
accuracy_score(y_test, test_predictions)

0.9785

## Baseline Model

In [8]:
# Same layer stack as the gradient-loss model, built as a separate network
# so the two runs do not share weights.
baseline_model = tf.keras.models.Sequential()
baseline_model.add(tf.keras.layers.Dense(128, activation='relu'))
baseline_model.add(tf.keras.layers.Dense(10, activation='softmax'))

In [9]:
# Adam optimizer (selected by name), sparse categorical cross-entropy loss,
# and accuracy reported as the training metric.
baseline_loss = tf.keras.losses.SparseCategoricalCrossentropy()
baseline_model.compile(optimizer='adam', loss=baseline_loss, metrics=['accuracy'])

In [10]:
# Train for 20 epochs, passing the test split as validation data.
baseline_fit_options = dict(epochs=20, validation_data=(x_test, y_test))
baseline_model.fit(x_train, y_train, **baseline_fit_options)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7f96a36dc590>

In [11]:
# Predicted class for each test row = index of the largest baseline output.
baseline_predictions = tf.argmax(baseline_model(x_test), axis=1)
accuracy_score(y_test, baseline_predictions)

0.9782

## Conclusion

By simply using the "gradient loss", we obtain a test accuracy (0.9785) that is on par with the baseline (0.9782).