# Chapter 12: Custom Models and Training with TensorFlow

In [164]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    !pip install -U tqdm
except Exception:
    pass

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "deep"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

- so far, we've only used TensorFlow's high-level API, `tf.keras`, but it's gotten us pretty far
- we've built various neural network architectures, including regression and classification nets, Wide & Deep nets, and self-normalizing nets, using all sorts of techniques, such as Batch Normalization, dropout, and learning rate schedules
- in fact, 95% of the use cases you will encounter will not require anything more than `tf.keras` and `tf.data`
---
- but now it's time to dive deeper into TensorFlow and take a look at its lower-level Python API, which will be useful when you need extra control to write custom functions

## A Quick Tour of TensorFlow
- the most popular Deep Learning library, particularly well suited and fine-tuned for large-scale Machine Learning
- developed by Google's Brain team and it powers many of Google's large-scale services
- very similar core to NumPy, but with GPU support
- take some time to browse through TensorFlow's API as you will find that it is quite rich and well documented
- for example, I had no idea `tf.estimator` was one of TensorFlow's *High-Level Deep Learning APIs* (in addition to `tf.keras`)
---
- at the lowest level, each TensorFlow operation (*op*) is implemented using highly efficient C++ code
- many operations have multiple implementations called *kernels*, where each kernel is dedicated to a specific device type (CPUs, GPUs, or even TPUs)
- GPUs dramatically speed up computations by splitting them into many smaller chunks and running them in parallel across many GPU threads
- you can even run your models directly in your browser with *TensorFlow.js*
---
- there's more to TensorFlow than the library
- there's TensorBoard for visualization
- there's TensorFlow Extended (TFX), which is a set of libraries built by Google to productionize TensorFlow projects
- Google's *TensorFlow Hub* provides a way to easily download and reuse pretrained neural networks
- you can also get many neural network architectures, some of them pretrtained, in TensorFlow's model garden

## Tensors and Operations
- TensorFlow's API revolves around *tensors*, which flow from operation to operation, hence the name Tensor*Flow*
- a tensor is very similar to a NumPy `ndarray`: it is usually a multidimensional array, but it can also hold a scalar (a simple value, such as `42`)
---
- you can create a tensor with `tf.constant()`
- for example, here is a tensor representing a matrix with 2 rows and 3 columns of floats: 

In [165]:
tf.constant([[1., 2., 3.], [4., 5., 6.]]) # matrix

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[1., 2., 3.],
       [4., 5., 6.]], dtype=float32)>

In [166]:
tf.constant(42) # scalar

<tf.Tensor: shape=(), dtype=int32, numpy=42>

- similar to an `ndarray`, a `tf.Tensor` has a shape and data type (`dtype`):

In [167]:
t = tf.constant([[1., 2., 3.], [4., 5., 6.]])
t.shape, t.dtype

(TensorShape([2, 3]), tf.float32)

- indexing works much like in NumPy:

In [168]:
t[:, 1:]

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[2., 3.],
       [5., 6.]], dtype=float32)>

In [169]:
t[..., 1, tf.newaxis]

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[2.],
       [5.]], dtype=float32)>

- most importantly, all sorts of tensor operations are available:

In [170]:
t + 10 # added 10 to each float

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[11., 12., 13.],
       [14., 15., 16.]], dtype=float32)>

In [171]:
tf.square(t) # squared each float

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[ 1.,  4.,  9.],
       [16., 25., 36.]], dtype=float32)>

In [172]:
t @ tf.transpose(t) 

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[14., 32.],
       [32., 77.]], dtype=float32)>

- you will find all the basic math operations you need and most operations that you can find in NumPy 
- some functions, however, have a different name than in NumPy
- for example, in TensorFlow, you must write `tf.transpose(t)`; you cannot just write `t.T` like in NumPy
- the reason is that the `tf.transpose()` function does not do exactly the same thing as NumPy's `T` attribute
- in TensorFlow, a new tensor is created with its own copy of the transposed data, while in NumPy, `t.T` is just a transposed view on the same data

## Keras' Low-Level API
- the Keras API has its own low-level API, located in `keras.backend`
- it includes functions like `square()`, `exp()`, and `sqrt()`
- in `tf.keras`, these functions generally just call the corresponding TensorFlow operations
- here is a small example: 

In [173]:
from tensorflow import keras
K = keras.backend
K.square(K.transpose(t)) + 10 # squares then adds 10

<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[11., 26.],
       [14., 35.],
       [19., 46.]], dtype=float32)>

## Tensors and NumPy
- tensors play nice with NumPy, as you can create a tensor from a NumPy array and vice versa
- you can even apply TensorFlow operations to NumPy arrays and NumPy operations to tensors:

In [174]:
a = np.array([2., 4., 5.]) 
tf.constant(a) # ndarry --> tensor

<tf.Tensor: shape=(3,), dtype=float64, numpy=array([2., 4., 5.])>

In [175]:
t.numpy() # tensor --> ndarry

array([[1., 2., 3.],
       [4., 5., 6.]], dtype=float32)

In [176]:
np.array(t) # tensor --> ndarry

array([[1., 2., 3.],
       [4., 5., 6.]], dtype=float32)

In [177]:
tf.square(a) # tensorflow operation on an ndarry

<tf.Tensor: shape=(3,), dtype=float64, numpy=array([ 4., 16., 25.])>

In [178]:
np.square(t) # numpy operation on a tensor

array([[ 1.,  4.,  9.],
       [16., 25., 36.]], dtype=float32)

- notice that NumPy uses 64-bit precision by default, while TensorFlow uses 32-bit
- this is because 32-bit precision is generally more than enough for neural networks, plus it runs faster and uses less RAM
- so when you create a tensor from a NumPy array, make sure to set the `dtype=tf.float32`

## Type Conversions
- type conversions can significantly hurt performance, and they can easily go unnoticed when they are done automatically
- to avoid this, TensorFlow does not perform any type conversions automatically
- for example, you cannot add a float tensor and an integer tensor, and you cannot even add a 32-bit float and a 64-bit float:

In [179]:
try:
    tf.constant(2.0) + tf.constant(40) # cannot perform float tensor + integer tensor
except tf.errors.InvalidArgumentError as ex:
    print(ex)

cannot compute AddV2 as input #1(zero-based) was expected to be a float tensor but is a int32 tensor [Op:AddV2] name: add/


In [180]:
try:
    tf.constant(2.0) + tf.constant(40., dtype=tf.float64) # cannot add tensors of different float dtypes
except tf.errors.InvalidArgumentError as ex:
    print(ex)

cannot compute AddV2 as input #1(zero-based) was expected to be a float tensor but is a double tensor [Op:AddV2] name: add/


- this may be annoying at first, but remember that it's for your own good
- even so, you can use `tf.cast()` when you really need to convert types:

In [181]:
t2 = tf.constant(40., dtype=tf.float64)
tf.constant(2.0) + tf.cast(t2, tf.float32)

<tf.Tensor: shape=(), dtype=float32, numpy=42.0>

## Variables
- the `tf.Tensor` values we've seen so far are immutable, meaning you cannot modify them
- **therefore, we cannot use regular tensors to implement weights in a neural network (as they must be tweaked by backpropagation)**
- plus, other parameters may also need to change over time (momentum optimizer keeps track of past gradients)
- what we need is a `tf.Variable`:

In [182]:
v = tf.Variable([[1., 2., 3.], [4., 5., 6.]])
v

<tf.Variable 'Variable:0' shape=(2, 3) dtype=float32, numpy=
array([[1., 2., 3.],
       [4., 5., 6.]], dtype=float32)>

- a `tf.Variable` acts similar to a `tf.Tensor`: you can perform the same operations with it, it plays nicely with NumPy as well, and it is just as picky with types
- but, a `tf.Variable` can also be modified in place using the `assign()` method 
- you can also modify individual cells (or slices) by using the cell's (or slice's) `assign()` method or by using the `scatter_update()` or `scatter_nd_update()` methods:

In [183]:
v.assign(2 * v)

<tf.Variable 'UnreadVariable' shape=(2, 3) dtype=float32, numpy=
array([[ 2.,  4.,  6.],
       [ 8., 10., 12.]], dtype=float32)>

In [184]:
v[0, 1].assign(42) # v[0, 1] = 1st row, 2nd column

<tf.Variable 'UnreadVariable' shape=(2, 3) dtype=float32, numpy=
array([[ 2., 42.,  6.],
       [ 8., 10., 12.]], dtype=float32)>

In [185]:
v[:, 2].assign([0., 1.]) # all rows, 3rd colums

<tf.Variable 'UnreadVariable' shape=(2, 3) dtype=float32, numpy=
array([[ 2., 42.,  0.],
       [ 8., 10.,  1.]], dtype=float32)>

In [186]:
try:
    v[1] = [7., 8., 9.]
except TypeError as ex:
    print(ex)

'ResourceVariable' object does not support item assignment


In [187]:
v.scatter_nd_update(indices=[[0, 0], [1, 2]],
                    updates=[100., 200.])

<tf.Variable 'UnreadVariable' shape=(2, 3) dtype=float32, numpy=
array([[100.,  42.,   0.],
       [  8.,  10., 200.]], dtype=float32)>

In [188]:
sparse_delta = tf.IndexedSlices(values=[[1., 2., 3.], [4., 5., 6.]],
                                indices=[1, 0])
v.scatter_update(sparse_delta)

<tf.Variable 'UnreadVariable' shape=(2, 3) dtype=float32, numpy=
array([[4., 5., 6.],
       [1., 2., 3.]], dtype=float32)>

- in practice, you will rarely have to create variables manually, since Keras provides an `add_weight()` method that will take care of it for you, as we will see
- moreover, model parameters will generally be updated directly by the optimizers, so you will rarely need to update variables manually

## Other Data Structures: 
- TensorFlow supports several other data structures, including the following:
---
- ***Sparse Tensors*** (`tf.SparseTensor`): efficiently represent tensors containing mostly zeros
- the `tf.sparse` package contains operations for sparse tensors

In [189]:
s = tf.SparseTensor(indices=[[0, 1], [1, 0], [2, 3]],
                    values=[1., 2., 3.],
                    dense_shape=[3, 4])
print(s)

SparseTensor(indices=tf.Tensor(
[[0 1]
 [1 0]
 [2 3]], shape=(3, 2), dtype=int64), values=tf.Tensor([1. 2. 3.], shape=(3,), dtype=float32), dense_shape=tf.Tensor([3 4], shape=(2,), dtype=int64))


In [190]:
tf.sparse.to_dense(s)

<tf.Tensor: shape=(3, 4), dtype=float32, numpy=
array([[0., 1., 0., 0.],
       [2., 0., 0., 0.],
       [0., 0., 0., 3.]], dtype=float32)>

In [191]:
s2 = s * 2.0
s2

<tensorflow.python.framework.sparse_tensor.SparseTensor at 0x21ed8c5ca90>

In [192]:
try:
    s3 = s + 1.
except TypeError as ex:
    print(ex)

unsupported operand type(s) for +: 'SparseTensor' and 'float'


In [193]:
s4 = tf.constant([[10., 20.], [30., 40.], [50., 60.], [70., 80.]])
tf.sparse.sparse_dense_matmul(s, s4)

<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[ 30.,  40.],
       [ 20.,  40.],
       [210., 240.]], dtype=float32)>

In [194]:
s5 = tf.SparseTensor(indices=[[0, 2], [0, 1]],
                     values=[1., 2.],
                     dense_shape=[3, 4])
print(s5)

SparseTensor(indices=tf.Tensor(
[[0 2]
 [0 1]], shape=(2, 2), dtype=int64), values=tf.Tensor([1. 2.], shape=(2,), dtype=float32), dense_shape=tf.Tensor([3 4], shape=(2,), dtype=int64))


In [195]:
try:
    tf.sparse.to_dense(s5)
except tf.errors.InvalidArgumentError as ex:
    print(ex)

indices[1] = [0,1] is out of order. Many sparse ops require sorted indices.
    Use `tf.sparse.reorder` to create a correctly ordered copy.

 [Op:SparseToDense]


In [196]:
s6 = tf.sparse.reorder(s5)
tf.sparse.to_dense(s6)

<tf.Tensor: shape=(3, 4), dtype=float32, numpy=
array([[0., 2., 1., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]], dtype=float32)>

- *Tensor Arrays* (`tf.TensorArray`): are lists of tensors
- they have a fixed size by default, but can optimally be made dynamic
- all tensors they contain must have the same shape and data type

In [197]:
array = tf.TensorArray(dtype=tf.float32, size=3)
# all tensors have the same shape and data type
array = array.write(0, tf.constant([1., 2.]))
array = array.write(1, tf.constant([3., 10.]))
array = array.write(2, tf.constant([5., 7.]))

In [198]:
array.read(1)

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([ 3., 10.], dtype=float32)>

In [199]:
array.stack()

<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[1., 2.],
       [0., 0.],
       [5., 7.]], dtype=float32)>

In [200]:
mean, variance = tf.nn.moments(array.stack(), axes=0)
mean

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([2., 3.], dtype=float32)>

In [201]:
variance

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([4.6666665, 8.666667 ], dtype=float32)>

- ***Ragged Tensors*** (`tf.RaggedTensor`): represent lists of lists of tensors
- every tensor has the same shape and data type
- the `tf.ragged` package contains operations for ragged tensors

In [202]:
p = tf.constant(["Café", "Coffee", "caffè", "咖啡"]) # array of strings

In [203]:
tf.strings.length(p, unit="UTF8_CHAR") # length of each string in array

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([4, 6, 5, 2])>

In [204]:
r = tf.strings.unicode_decode(p, "UTF8") # ragged tensor example
r

<tf.RaggedTensor [[67, 97, 102, 233], [67, 111, 102, 102, 101, 101], [99, 97, 102, 102, 232], [21654, 21857]]>

- this was just preface: now, onto ragged tensors:

In [205]:
print(r[1])

tf.Tensor([ 67 111 102 102 101 101], shape=(6,), dtype=int32)


In [206]:
print(r[1:3]) 

<tf.RaggedTensor [[67, 111, 102, 102, 101, 101], [99, 97, 102, 102, 232]]>


In [207]:
r2 = tf.ragged.constant([[65, 66], [], [67]])
print(tf.concat([r, r2], axis=0))

<tf.RaggedTensor [[67, 97, 102, 233], [67, 111, 102, 102, 101, 101], [99, 97, 102, 102, 232], [21654, 21857], [65, 66], [], [67]]>


In [208]:
r3 = tf.ragged.constant([[68, 69, 70], [71], [], [72, 73]])
print(tf.concat([r, r3], axis=1))

<tf.RaggedTensor [[67, 97, 102, 233, 68, 69, 70], [67, 111, 102, 102, 101, 101, 71], [99, 97, 102, 102, 232], [21654, 21857, 72, 73]]>


In [209]:
tf.strings.unicode_encode(r3, "UTF-8")

<tf.Tensor: shape=(4,), dtype=string, numpy=array([b'DEF', b'G', b'', b'HI'], dtype=object)>

In [210]:
r.to_tensor()

<tf.Tensor: shape=(4, 6), dtype=int32, numpy=
array([[   67,    97,   102,   233,     0,     0],
       [   67,   111,   102,   102,   101,   101],
       [   99,    97,   102,   102,   232,     0],
       [21654, 21857,     0,     0,     0,     0]])>

- ***String Tensors*** are regular tensors of type `tf.string`
- these represent byte strings, not Unicode strings, so if you create a string tensor using a Unicode string (like `"toucan"`), then it will get encoded to UTF-8 automatically (`b"caf\xc3\xa9"`) <-- (byte string)
- alternatively, you can represent Unicode strings using tensors of type `tf.int32`, where each item represents a Unicode code point (`[99, 97, 102, 233]`)

In [211]:
tf.constant(b"hello world")

<tf.Tensor: shape=(), dtype=string, numpy=b'hello world'>

In [212]:
tf.constant("café")

<tf.Tensor: shape=(), dtype=string, numpy=b'caf\xc3\xa9'>

In [213]:
u = tf.constant([ord(c) for c in "café"])
u

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([ 99,  97, 102, 233])>

In [214]:
b = tf.strings.unicode_encode(u, "UTF-8")
tf.strings.length(b, unit="UTF8_CHAR")

<tf.Tensor: shape=(), dtype=int32, numpy=4>

In [215]:
tf.strings.unicode_decode(b, "UTF-8")

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([ 99,  97, 102, 233])>

- ***Sets*** are represented as regular tensors (or sparse tensors)
- for example, `tf.constant([[1, 2], [3, 4]])` represents the two sets `{1, 2}` and `{3, 4}`
- more generally, each set is represented by a vector in the tensor's last axis
- you can manipulate sets using operations in the `tf.sets` package

In [216]:
set1 = tf.constant([[2, 3, 5, 7], [7, 9, 0, 0]])
set2 = tf.constant([[4, 5, 6], [9, 10, 0]])
tf.sparse.to_dense(tf.sets.union(set1, set2))

<tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[ 2,  3,  4,  5,  6,  7],
       [ 0,  7,  9, 10,  0,  0]])>

In [217]:
tf.sparse.to_dense(tf.sets.difference(set1, set2))

<tf.Tensor: shape=(2, 3), dtype=int32, numpy=
array([[2, 3, 7],
       [7, 0, 0]])>

In [218]:
tf.sparse.to_dense(tf.sets.intersection(set1, set2))

<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
array([[5, 0],
       [0, 9]])>

- ***Queues*** store tensors across multiple steps 
- TensorFlow offers various kinds of queues: simple First In, First Out (FIFO) queues (FIFQueue), queues that can prioritize some items (`PriorityQueue`), shuffle their items (`RandomShuffleQueue`), and batch items of different shapes by padding (`PaddingFIFOQueue`)
- these classes are all in the `tf.queue.package`
---
- with tensors, operations, variables, and various data structures at your disposal, you are now ready to customize your models and training algorithms

## Custom Loss Functions
- let's start by creating a custom loss function, which is a simple and common use case
---
- **suppose you want to train a regression model, but your training set is too noisy**
- of course, you begin by trying to clean up your dataset by removing or fixing outliers, but that turns out to be insufficient; the dataset is still noisy
- **this is a good time to use the Huber loss instead of good old MSE**
- the Huber loss is not part of the official Keras API, but it is available in `tf.keras`
- for now, let's pretend it's not there: implementing it is easy as pie
- just create a function that takes the labels and predictions as arguments, and use TensorFlow operations to compute every instance's loss:

In [219]:
# necessary imports
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# training + test set
housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42)

# validation set
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

# scaling datasets
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

In [220]:
def huber_fn(y_true, y_pred):
    error = y_true - y_pred
    is_small_error = tf.abs(error) < 1
    squared_loss = tf.square(error) / 2
    linear_loss  = tf.abs(error) - 0.5
    return tf.where(is_small_error, squared_loss, linear_loss)

- it is also preferable to return a tensor containing one loss per instance, rather than returning the mean loss
- this way, Keras can apply class weights or sample weights when requested
---
- now you can use this loss when you compile the Keras model, then train your model

In [221]:
input_shape = X_train.shape[1:]

model = keras.models.Sequential([
    keras.layers.Dense(30, activation="selu", kernel_initializer="lecun_normal",
                       input_shape=input_shape),
    keras.layers.Dense(1),
])

In [222]:
model.compile(loss=huber_fn, optimizer="nadam", metrics=["mae"])

In [223]:
model.fit(X_train_scaled, y_train, epochs=2,
          validation_data=(X_valid_scaled, y_valid))

Train on 11610 samples, validate on 3870 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x21ed68a0d68>

- and that's it: for each batch during training, Keras will call the `huber_fn()` function to compute the loss and use it to perform a Gradient Descent step

## Saving and Loading Models That Contain Custom Components (custom loss functions continued)
- saving a model containing a custom loss function works fine as Keras saves the name of the function
- whenever you load it, you'll need to provide a dictionary that maps the function name to the actual function
- more generally, when you load a model containing custom objects, you need to map the names to the objects: 

In [224]:
model.save("my_model_with_a_custom_loss.h5")

In [225]:
model = keras.models.load_model("my_model_with_a_custom_loss.h5",
                                custom_objects={"huber_fn": huber_fn}) # mapping the names to the objects

In [226]:
model.fit(X_train_scaled, y_train, epochs=2,
          validation_data=(X_valid_scaled, y_valid)) 

Train on 11610 samples, validate on 3870 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x21ed9095080>

- with the current implementaiton, any error between -1 and 1 is considered "small"
- **however, if you want to set a different threshold, one solution is to create a function that creates a configured loss function**:

In [227]:
# editing huber loss function code to be able to set threshold
def create_huber(threshold=1.0):
    def huber_fn(y_true, y_pred):
        error = y_true - y_pred
        is_small_error = tf.abs(error) < threshold
        squared_loss = tf.square(error) / 2
        linear_loss  = threshold * tf.abs(error) - threshold**2 / 2
        return tf.where(is_small_error, squared_loss, linear_loss)
    return huber_fn

In [228]:
model.compile(loss=create_huber(2.0), optimizer="nadam", metrics=["mae"])

- unfortunately, when you save the model, the `threshold` will not be saved
- this means that you will have to specify the `threshold` value when loading the model:

In [229]:
model.fit(X_train_scaled, y_train, epochs=2,
          validation_data=(X_valid_scaled, y_valid))

Train on 11610 samples, validate on 3870 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x21edb9342b0>

In [230]:
model.save("my_model_with_a_custom_loss_threshold_2.h5")

In [231]:
model = keras.models.load_model("my_model_with_a_custom_loss_threshold_2.h5",
                                custom_objects={"huber_fn": create_huber(2.0)}) # manually setting threshold

In [232]:
model.fit(X_train_scaled, y_train, epochs=2,
          validation_data=(X_valid_scaled, y_valid))

Train on 11610 samples, validate on 3870 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x21edbbb3198>

- you can solve this by creating a subclass of the `keras.losses.Loss` class, and then implementing its `get_config()` method: 

In [233]:
class HuberLoss(keras.losses.Loss):
    def __init__(self, threshold=1.0, **kwargs):
        self.threshold = threshold
        super().__init__(**kwargs)
    def call(self, y_true, y_pred):
        error = y_true - y_pred
        is_small_error = tf.abs(error) < self.threshold
        squared_loss = tf.square(error) / 2
        linear_loss  = self.threshold * tf.abs(error) - self.threshold**2 / 2
        return tf.where(is_small_error, squared_loss, linear_loss)
    def get_config(self):
        base_config = super().get_config()
        return {**base_config, "threshold": self.threshold}

- let's walk through the code above: 
---
- the constructor (`__init__()`) accepts `**kwargs` and passes them to the parent constructor
- the `call()` method takes the labels and predictions, computes all the instance losses, and returns them
- the `get_config()` method returns a dictionary mapping each hyperparameter name to its value
---
- note: putting `*args` and/or `**kwargs` as the last items in your function definition’s argument list allows that function to accept an arbitrary number of arguments and/or keyword arguments
---
- you can then use any instance of this class when you compile the model:

In [234]:
model = keras.models.Sequential([
    keras.layers.Dense(30, activation="selu", kernel_initializer="lecun_normal",
                       input_shape=input_shape),
    keras.layers.Dense(1),
])

In [235]:
model.compile(loss=HuberLoss(2.), optimizer="nadam", metrics=["mae"])

In [236]:
model.fit(X_train_scaled, y_train, epochs=2,
          validation_data=(X_valid_scaled, y_valid))

Train on 11610 samples, validate on 3870 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x21edbcb9c88>

In [237]:
model.save("my_model_with_a_custom_loss_class.h5")

- now, when you save the model, the threshold will be saved along with it
- now, when you load the model, you just need to map the class name to the class itself:

In [238]:
# code doesn't work
# model = keras.models.load_model("my_model_with_a_custom_loss_class.h5", 
#                                 custom_objects={"HuberLoss": HuberLoss})

- that's it for losses
- just as simple are custom activation functions, initializers, regularizers, and constraints

## Custom Activation Functions, Initializers, Regularizers, and Constraints
- most Keras functionalities can be customized in a very similar way
- **generally, you will just need to write a simple function with the appropriate inputs and outputs (as already seen)**
- here are examples of a custom activation function:

In [239]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

In [240]:
# custom activation function
def my_softplus(z): # return value is just tf.nn.softplus(z)
    return tf.math.log(tf.exp(z) + 1.0)

# custom Glorot initializer
def my_glorot_initializer(shape, dtype=tf.float32):
    stddev = tf.sqrt(2. / (shape[0] + shape[1]))
    return tf.random.normal(shape, stddev=stddev, dtype=dtype)

# custom l1 regularizer
def my_l1_regularizer(weights):
    return tf.reduce_sum(tf.abs(0.01 * weights))

# custom constraint that ensures weights are all positive
def my_positive_weights(weights): # return value is just tf.nn.relu(weights)
    return tf.where(weights < 0., tf.zeros_like(weights), weights)

- the arguments depend on the type of custom function
- these custom functions can be used normally:

In [241]:
layer = keras.layers.Dense(1, activation=my_softplus,
                           kernel_initializer=my_glorot_initializer,
                           kernel_regularizer=my_l1_regularizer,
                           kernel_constraint=my_positive_weights)

In [242]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

In [243]:
model = keras.models.Sequential([
    keras.layers.Dense(30, activation="selu", kernel_initializer="lecun_normal",
                       input_shape=input_shape),
    keras.layers.Dense(1, activation=my_softplus,
                       kernel_regularizer=my_l1_regularizer,
                       kernel_constraint=my_positive_weights,
                       kernel_initializer=my_glorot_initializer),
])

In [244]:
model.compile(loss="mse", optimizer="nadam", metrics=["mae"])

In [245]:
model.fit(X_train_scaled, y_train, epochs=2,
          validation_data=(X_valid_scaled, y_valid))

Train on 11610 samples, validate on 3870 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x21edbe61588>

In [246]:
model.save("my_model_with_many_custom_parts.h5") # saving the custom model

In [247]:
model = keras.models.load_model( # loading the custom model
    "my_model_with_many_custom_parts.h5",
    custom_objects={
       "my_l1_regularizer": my_l1_regularizer,
       "my_positive_weights": my_positive_weights,
       "my_glorot_initializer": my_glorot_initializer,
       "my_softplus": my_softplus,
    })

- the `activation` function will be applied to the output of this `Dense` layer (result will be passed to next layer)
- the layer's weights will be initialized using the values returned by the `kernel_initializer`
- at each training step, the weights will be passed to the `kernel_regularizer` to compute the regularization loss
- finally, the `kernel_constraint` function will be called after each training step, and the layer's weights will be replaced by the constrained weights
---
- if a function has hyperparameters that need to be saved along with the model, then you will want to subclass the appropriate class like we did with our custom loss function
- here is a simple class for $l_1$ regularization that saves its `factor` hyperparameter:

In [248]:
class MyL1Regularizer(keras.regularizers.Regularizer): # subclassed the appropriate class
    def __init__(self, factor):
        self.factor = factor
    def __call__(self, weights):
        return tf.reduce_sum(tf.abs(self.factor * weights))
    def get_config(self):
        return {"factor": self.factor}

In [249]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

In [250]:
model = keras.models.Sequential([
    keras.layers.Dense(30, activation="selu", kernel_initializer="lecun_normal",
                       input_shape=input_shape),
    keras.layers.Dense(1, activation=my_softplus,
                       kernel_regularizer=MyL1Regularizer(0.01),
                       kernel_constraint=my_positive_weights,
                       kernel_initializer=my_glorot_initializer),
])

In [251]:
model.compile(loss="mse", optimizer="nadam", metrics=["mae"])

In [252]:
model.fit(X_train_scaled, y_train, epochs=2,
          validation_data=(X_valid_scaled, y_valid))

Train on 11610 samples, validate on 3870 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x21edc12f908>

In [253]:
model.save("my_model_with_many_custom_parts.h5")

In [254]:
model = keras.models.load_model(
    "my_model_with_many_custom_parts.h5",
    custom_objects={
       "MyL1Regularizer": MyL1Regularizer,
       "my_positive_weights": my_positive_weights,
       "my_glorot_initializer": my_glorot_initializer,
       "my_softplus": my_softplus,
    })

### Important Note
- when subclassing the appropriate class, we must:
- implement the `call()` method for losses, layers, activation functions, and models
- implement the `__call__()` method for regularizers, initializers, and constraints

## Custom Metrics
- **losses and metrics are not the same**
- losses (cross entropy) are used by Gradient Descent to *train* a model, so they must be differentiable, and their gradients should not be 0 everywhere
- metrics (accuracy) are used to *evaluate* a model, so they can be non-differentiable or have 0 gradients everywhere
---
- in most cases, defining a custom metric function is similar to defining a custom loss function
- in fact, we can even use the Huber loss function we created earlier as a metric (not a popular choice: MSE or MAE is preferred):

In [255]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

In [256]:
model = keras.models.Sequential([
    keras.layers.Dense(30, activation="selu", kernel_initializer="lecun_normal",
                       input_shape=input_shape),
    keras.layers.Dense(1),
])

In [257]:
model.compile(loss="mse", optimizer="nadam", metrics=[create_huber(2.0)]) # using Huber loss function we created earlier as a metric

In [258]:
model.fit(X_train_scaled, y_train, epochs=2)

Train on 11610 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x21edd3b0208>

- for each batch during training, Keras will compute this metric and keep track of its mean since the start of the epoch
- most of the time, this is exactly what you want, but not always
---
### Streaming Metric
- suppose a model made 5 positive predictions in the 1st batch, 4 of which were correct: that's 80% precision
- the model then made 3 positive predictions in the second batch, but they were all incorrect: that's 0% precision
- if you compute the mean of these precisions, you get 40%, but that is not the model's actual precision over the 2 batches: there were a total of 4 true positives out of 8 total predictions, so the overall precision is really 50% 
- we need an object to keep track of the number of true/false positives and compute their ratio when requested: this is *precisely* what the `keras.metrics.Precision` class does

In [259]:
precision = keras.metrics.Precision()
# passing in labels and predictions 
# 1st batch
precision([0, 1, 1, 1, 0, 1, 0, 1], [1, 1, 0, 1, 0, 1, 0, 1]) # 4/5 positive predictions correct = 80% precision

<tf.Tensor: shape=(), dtype=float32, numpy=0.8>

In [260]:
# 2nd batch
precision([0, 1, 0, 0, 1, 0, 1, 1], [1, 0, 1, 1, 0, 0, 0, 0]) # 0/3 positive predictions correct = 0% precision

<tf.Tensor: shape=(), dtype=float32, numpy=0.5>

- we created a `Precision` object and used it like a function, passing it the labels and the predictions for the first and second batches
- **after the first batch, it returns a precision of 80%, and after the second batch, it returns a precision of 50%**
- this is called a *streaming metric* as it is gradually updated after each batch
--- 
- at any point, we can call the `result()` method to get the current value of this metric
- we can also look at its variables (tracking the number of true/false positives) by using the `variables` attribute, and we can reset these variables using the `reset_states()` method:

In [261]:
precision.result() # result method

<tf.Tensor: shape=(), dtype=float32, numpy=0.5>

In [262]:
precision.variables # variables attribute

[<tf.Variable 'true_positives:0' shape=(1,) dtype=float32, numpy=array([4.], dtype=float32)>,
 <tf.Variable 'false_positives:0' shape=(1,) dtype=float32, numpy=array([4.], dtype=float32)>]

In [263]:
precision.reset_states() # reset_states() method

- if you want to create such a streaming metric, create a subclass of the `keras.metrics.Metric` class
- here is a simple example that keeps track of the toal Huber loss and the number of instances seen so far:

In [264]:
# creating a streaming metric
class HuberMetric(keras.metrics.Metric):
    def __init__(self, threshold=1.0, **kwargs):
        super().__init__(**kwargs) # handles base args (e.g., dtype)
        self.threshold = threshold
        #self.huber_fn = create_huber(threshold) # TODO: investigate why this fails
        self.total = self.add_weight("total", initializer="zeros")
        self.count = self.add_weight("count", initializer="zeros")
    def huber_fn(self, y_true, y_pred): # workaround
        error = y_true - y_pred
        is_small_error = tf.abs(error) < self.threshold
        squared_loss = tf.square(error) / 2
        linear_loss  = self.threshold * tf.abs(error) - self.threshold**2 / 2
        return tf.where(is_small_error, squared_loss, linear_loss)
    def update_state(self, y_true, y_pred, sample_weight=None):
        metric = self.huber_fn(y_true, y_pred)
        self.total.assign_add(tf.reduce_sum(metric))
        self.count.assign_add(tf.cast(tf.size(y_true), tf.float32))
    def result(self):
        return self.total / self.count
    def get_config(self):
        base_config = super().get_config()
        return {**base_config, "threshold": self.threshold}

- we learned that some metrics, like precision, cannot always be averaged over batches, in which the only option is to implement a streaming metric

## Custom Layers
- if a model is a sequence of layers (A, B, C, A, B, C, A, B, C), then you might want to define a custom layer D containing layers A, B, C, so your model would simply be D, D, D
- you may also want to build an architecture with an exotic layer for which TensorFlow does not provide a default implementation
---
- if you want to build a custom layer without any weights, write a function and wrap it in a `keras.layers.Lambda` layer
- the following layer, for example, will apply the exponential function to its inputs:

In [265]:
exponential_layer = keras.layers.Lambda(lambda x: tf.exp(x))

In [266]:
exponential_layer([-1., 0., 1.])

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([0.36787945, 1.        , 2.7182817 ], dtype=float32)>

- this custom layer can be used like any other layer and even as an activation function
- the exponential layer is sometimes used in the output layer of a regression model when the values to predict have very different scales (0.001, 10., 1,000.)
---
- like with everything else, to build a custom stateful layer (a layer with weights), you need to create a subclass of the `keras.layers.Layer` class

In [267]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu", input_shape=input_shape),
    keras.layers.Dense(1),
    exponential_layer # using exponential layer as an output layer
])

model.compile(loss="mse", optimizer="nadam")
model.fit(X_train_scaled, y_train, epochs=5,
          validation_data=(X_valid_scaled, y_valid))
model.evaluate(X_test_scaled, y_test)

Train on 11610 samples, validate on 3870 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


nan

In [268]:
class MyDense(keras.layers.Layer):
    def __init__(self, units, activation=None, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.activation = keras.activations.get(activation)

    def build(self, batch_input_shape):
        self.kernel = self.add_weight(
            name="kernel", shape=[batch_input_shape[-1], self.units],
            initializer="glorot_normal")
        self.bias = self.add_weight(
            name="bias", shape=[self.units], initializer="zeros")
        super().build(batch_input_shape) # must be at the end

    def call(self, X):
        return self.activation(X @ self.kernel + self.bias)

    def compute_output_shape(self, batch_input_shape):
        return tf.TensorShape(batch_input_shape.as_list()[:-1] + [self.units])

    def get_config(self):
        base_config = super().get_config()
        return {**base_config, "units": self.units,
                "activation": keras.activations.serialize(self.activation)}

- you can now use a `MyDense` layer just like any other layer:

In [269]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

In [270]:
model = keras.models.Sequential([
    MyDense(30, activation="relu", input_shape=input_shape),
    MyDense(1)
])

In [271]:
model.compile(loss="mse", optimizer="nadam")
model.fit(X_train_scaled, y_train, epochs=2,
          validation_data=(X_valid_scaled, y_valid))
model.evaluate(X_test_scaled, y_test)

Train on 11610 samples, validate on 3870 samples
Epoch 1/2
Epoch 2/2


0.5359284612559533

In [272]:
model.save("my_model_with_a_custom_layer.h5")

In [273]:
model = keras.models.load_model("my_model_with_a_custom_layer.h5",
                                custom_objects={"MyDense": MyDense})

- to create a layer with multiple inputs, the argument to the `call()` method should be a tuple containing all the inputs
- similarly the argument to the `compute_output_shape()` method should be a tuple containing each input's batch shape
- the following toy layer takes two inputs and returns three outputs:

In [274]:
class MyMultiLayer(keras.layers.Layer):
    def call(self, X):
        X1, X2 = X
        return X1 + X2, X1 * X2

    def compute_output_shape(self, batch_input_shape):
        batch_input_shape1, batch_input_shape2 = batch_input_shape
        return [batch_input_shape1, batch_input_shape2]

- this layer can be used like any other layer, but only using the Functional and Subclassing APIs, not the Sequential API (which only accepts layers with one input and one output)
---
- let's create a layer with different behavior during training and testing (if it uses `Dropout` or `BatchNormalization` layers):

In [275]:
class AddGaussianNoise(keras.layers.Layer):
    def __init__(self, stddev, **kwargs):
        super().__init__(**kwargs)
        self.stddev = stddev

    def call(self, X, training=None):
        if training:
            noise = tf.random.normal(tf.shape(X), stddev=self.stddev)
            return X + noise
        else:
            return X

    def compute_output_shape(self, batch_input_shape):
        return batch_input_shape

## Custom Models
- pretty straightforward: subclass the `keras.Model` class, create layers and variables in the constructor, and implement the `call()` method
---
- in the model we're going to build, the inputs go through a 1st dense layer, then through a *residual block* composed of 2 dense layers and an addition operation, then through this same residual block 3 more times, then through a 2nd residual block, and then through a dense output layer
- **this model is very unrealistic, but it's just to illustrate that you can easily build any kind of model you want, even one that contains loops and skip connections**
- let's 1st creation a `ResidualBlock` layer:

In [276]:
X_new_scaled = X_test_scaled

In [277]:
# residual block composed of 2 dense layers and an addition operation
class ResidualBlock(keras.layers.Layer):
    def __init__(self, n_layers, n_neurons, **kwargs):
        super().__init__(**kwargs)
        self.hidden = [keras.layers.Dense(n_neurons, activation="elu",
                                          kernel_initializer="he_normal")
                       for _ in range(n_layers)]

    def call(self, inputs):
        Z = inputs
        for layer in self.hidden:
            Z = layer(Z)
        return inputs + Z

- next, let's use the Subclassing API to define the model itself:

In [278]:
class ResidualRegressor(keras.models.Model):
    def __init__(self, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.hidden1 = keras.layers.Dense(30, activation="elu",
                                          kernel_initializer="he_normal")
        self.block1 = ResidualBlock(2, 30)
        self.block2 = ResidualBlock(2, 30)
        self.out = keras.layers.Dense(output_dim)

    def call(self, inputs):
        Z = self.hidden1(inputs)
        for _ in range(1 + 3):
            Z = self.block1(Z)
        Z = self.block2(Z)
        return self.out(Z)

- we create the layers in the constructor and use them in the `call()` method
- this model can be used like any other model
- if you want to be able to save the model and load it, you must implement `get_config()` method (as we did earlier) in both the `ResidualBlock` class and the `ResidualRegressor` class

In [279]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

In [280]:
model = ResidualRegressor(1)
model.compile(loss="mse", optimizer="nadam")
history = model.fit(X_train_scaled, y_train, epochs=5)
score = model.evaluate(X_test_scaled, y_test)
y_pred = model.predict(X_new_scaled)

Train on 11610 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Losses and Metrics Based on Model Internals
- there will be times when you want to define losses based on other parts of your model, such as the weights or activations of its hidden layers
- this may be useful for regularization purposes or to monitor some internal aspect of your model
---
- to define a custom loss based on model internals, compute it based on any part of the model, then pass the result to the `add_loss()` method
- let's build a custom MLP model composed of a stack of five hidden layers plus an output layer:

In [281]:
class ReconstructingRegressor(keras.models.Model):
    # constructor creates a DNN with 5 dense layers and one dense output layer
    def __init__(self, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.hidden = [keras.layers.Dense(30, activation="selu",
                                          kernel_initializer="lecun_normal")
                       for _ in range(5)]
        self.out = keras.layers.Dense(output_dim)
        # TODO: check https://github.com/tensorflow/tensorflow/issues/26260
        #self.reconstruction_mean = keras.metrics.Mean(name="reconstruction_error")

    def build(self, batch_input_shape): 
        # processes the inputs through all 5 hidden layers, then passes the result through the reconstruction layer, which produces the               reconstruction
        n_inputs = batch_input_shape[-1]
        self.reconstruct = keras.layers.Dense(n_inputs)
        super().build(batch_input_shape)

    def call(self, inputs, training=None):
        # computes the reconstruction loss and adds it to the model's list of losses
        # we scale down the reconstruction loss so it does not dominate the main loss
        # finally, the call() method passes the output of the hidden layers to the output layer and returns its output
        Z = inputs
        for layer in self.hidden:
            Z = layer(Z)
        reconstruction = self.reconstruct(Z)
        recon_loss = tf.reduce_mean(tf.square(reconstruction - inputs))
        self.add_loss(0.05 * recon_loss)
        #if training:
        #    result = self.reconstruction_mean(recon_loss)
        #    self.add_metric(result)
        return self.out(Z)

In [282]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

In [283]:
model = ReconstructingRegressor(1)
model.compile(loss="mse", optimizer="nadam")
history = model.fit(X_train_scaled, y_train, epochs=2)
y_pred = model.predict(X_test_scaled)

Train on 11610 samples
Epoch 1/2
Epoch 2/2


## Computing Gradients Using Autodiff
- to understand how to use autodiff to compute gradients automatically, let's consider a simple toy function:

In [284]:
def f(w1, w2):
    return 3 * w1 ** 2 + 2 * w1 * w2

- let's compute an approximation of each partial derivative by measuring how much the function's output changes when you tweak the corresponding hyperparameter:

In [285]:
w1, w2 = 5, 3
eps = 1e-6
(f(w1 + eps, w2) - f(w1, w2)) / eps

36.000003007075065

In [286]:
(f(w1, w2 + eps) - f(w1, w2)) / eps

10.000000003174137

- this works well and is easy to implement, but you need to call `f()` at least once per parameter, which makes this approach intractable for large neural networks
- instead, we should use autodiff: TensorFlow makes this simple

In [287]:
w1, w2 = tf.Variable(5.), tf.Variable(3.)
with tf.GradientTape() as tape:
    z = f(w1, w2)

gradients = tape.gradient(z, [w1, w2])

- we first define 2 variables, `w1` and `w2`, then we create a `tf.GradientTape` context that will automatically record every operation that involves a variable, and finally we ask this tape to compute the gradients of the result `z` with regard to both variables `[w1, w2]`
- let's take a look at the gradients that TensorFlow computed: 

In [288]:
gradients

[<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>]

- the results are extremely accurate and the `gradient()` method only goes through the recorded computations once (in reverse order), no matter how many variables there are, so it is incredibly efficient
---
- the tape is automatically erased immediately after you call its `gradient()` method, so you will get an exception if you try to call `gradient()` twice:

In [289]:
with tf.GradientTape() as tape:
    z = f(w1, w2)

dz_dw1 = tape.gradient(z, w1)
try:
    dz_dw2 = tape.gradient(z, w2)
except RuntimeError as ex:
    print(ex)

GradientTape.gradient can only be called once on non-persistent tapes.


- to call `gradient()` more than once, make the tape persistent and delete it each time you are done with it to free resources

In [290]:
with tf.GradientTape(persistent=True) as tape: # making tape persistent
    z = f(w1, w2)

dz_dw1 = tape.gradient(z, w1)
dz_dw2 = tape.gradient(z, w2) # works now!
del tape

dz_dw1, dz_dw2

(<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>)

- the tape only tracks operations involving variables: 

In [291]:
c1, c2 = tf.constant(5.), tf.constant(3.)
with tf.GradientTape() as tape:
    z = f(c1, c2)

gradients = tape.gradient(z, [c1, c2])
gradients

[None, None]

- you can force the tape to watch any tensors you like, to record every operation that involves them:

In [292]:
with tf.GradientTape() as tape:
    tape.watch(c1)
    tape.watch(c2)
    z = f(c1, c2)

gradients = tape.gradient(z, [c1, c2])
gradients

[<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>]

- in some cases, you may want to stop gradients from backpropagating through some part of your neural network
- to do this, use the `tf.stop_gradient()` function, which returns the inputs during the forward pass, but does not let gradients through during backpropagation

In [293]:
def f(w1, w2):
    return 3 * w1 ** 2 + tf.stop_gradient(2 * w1 * w2)

with tf.GradientTape() as tape:
    z = f(w1, w2)

tape.gradient(z, [w1, w2])

[<tf.Tensor: shape=(), dtype=float32, numpy=30.0>, None]

- finally, you may run into some numerical issues when computing gradients
- for example, if you compute the gradients of the `my_softplus()` function for large inputs, the result will be NaN:

In [294]:
x = tf.Variable(100.) # large input
with tf.GradientTape() as tape:
    z = my_softplus(x)

tape.gradient(z, [x])

[<tf.Tensor: shape=(), dtype=float32, numpy=nan>]

## Custom Training Loops
- sometimes the `fit()` method may not be flexible enough for what you need to do
- for example, the Wide & Deep paper uses two different optimizers: one for the wide path and the other for the deep path
- the `fit()` method, however, only uses one optimizer (the one that we specify when compiling the model), so implementing this paper requires writing your own custom loop 
---
- you may also write custom training loops to simply feel more confident that they do precisely what you intend for them to do
- however, writing custom functions will make your code longer, more error-prone, and harder to maintain
- so, unless you really need the extra flexibility, just stick with the `fit()` method
---
- let's build a simple model (no need to compile it as we will handle the training loop manually): 

In [295]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

In [296]:
l2_reg = keras.regularizers.l2(0.05)
model = keras.models.Sequential([
    keras.layers.Dense(30, activation="elu", kernel_initializer="he_normal",
                       kernel_regularizer=l2_reg),
    keras.layers.Dense(1, kernel_regularizer=l2_reg)
])

- next, let's create a function that will randomly sample a batch of instances from the training set:

In [297]:
def random_batch(X, y, batch_size=32):
    idx = np.random.randint(len(X), size=batch_size)
    return X[idx], y[idx]

- let's also define a function that will display the training status, including the number of steps, the total number of steps, the mean loss since the start of the epoch, and other metrics:

In [298]:
def print_status_bar(iteration, total, loss, metrics=None):
    metrics = " - ".join(["{}: {:.4f}".format(m.name, m.result())
                         for m in [loss] + (metrics or [])])
    end = "" if iteration < total else "\n"
    print("\r{}/{} - ".format(iteration, total) + metrics,
          end=end)

- okay, let's start by defining some hyperparameters and choosing the optimizer, the loss function, and the metrics:

In [299]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

In [300]:
n_epochs = 5
batch_size = 32
n_steps = len(X_train) // batch_size
optimizer = keras.optimizers.Nadam(lr=0.01)
loss_fn = keras.losses.mean_squared_error
mean_loss = keras.metrics.Mean()
metrics = [keras.metrics.MeanAbsoluteError()]

- now, let's build the custom training loop:

In [301]:
# beautiful custom training loop
for epoch in range(1, n_epochs + 1):
    print("Epoch {}/{}".format(epoch, n_epochs))
    for step in range(1, n_steps + 1):
        X_batch, y_batch = random_batch(X_train_scaled, y_train)
        with tf.GradientTape() as tape:
            y_pred = model(X_batch)
            main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
            loss = tf.add_n([main_loss] + model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        for variable in model.variables:
            if variable.constraint is not None:
                variable.assign(variable.constraint(variable))
        mean_loss(loss)
        for metric in metrics:
            metric(y_batch, y_pred)
        print_status_bar(step * batch_size, len(y_train), mean_loss, metrics)
    print_status_bar(len(y_train), len(y_train), mean_loss, metrics)
    for metric in [mean_loss] + metrics:
        metric.reset_states()

Epoch 1/5


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

11610/11610 - mean: 1.3955 - mean_absolute_error: 0.5722
Epoch 2/5
11610/11610 - mean: 0.6774 - mean_absolute_error: 0.5280
Epoch 3/5
11610/11610 - mean: 0.6351 - mean_absolute_error: 0.5177
Epoch 4/5
11610/11610 - mean: 0.6384 - mean_absolute_error: 0.5181
Epoch 5/5
11610/11610 - mean: 0.6440 - mean_absolute_error: 0.5222


- most importantly, this training loop does not handle layers that behave differently during training (`Dropout` or `BatchNormalization`)
- to handle these, you need to call the model with `training=True` and make sure it propagates this to every layer that needs it

## TensorFlow Functions and Graphs
- graphs are very simple to implement in TensorFlow
- let's start with a trivial function that computes the cube of its input: 

In [302]:
def cube(x):
    return x ** 3

In [303]:
cube(2)

8

In [304]:
cube(tf.constant(2.0))

<tf.Tensor: shape=(), dtype=float32, numpy=8.0>

- let's use the `tf.function()` to convert this python function to a *TensorFlow Function*
- the TF Function can be used exactly like the original Python function and it will return the same result

In [305]:
tf_cube = tf.function(cube) # converting normal python function to a tensorflow function
tf_cube

<tensorflow.python.eager.def_function.Function at 0x21edc336860>

In [306]:
tf_cube(2)

<tf.Tensor: shape=(), dtype=int32, numpy=8>

In [307]:
tf_cube(tf.constant(2.0))

<tf.Tensor: shape=(), dtype=float32, numpy=8.0>

- alternatively, we could have just used `tf.function` as a decorator (which is actually more common):

In [308]:
@tf.function # decorator
def tf_cube(x):
    return x ** 3

In [309]:
tf_cube(2)

<tf.Tensor: shape=(), dtype=int32, numpy=8>

- the original bython function is still available via the TF Function's `python_function` attribute, in case you ever need it: 

In [310]:
tf_cube.python_function(2)

8

- under the hood, the `tf.function()` analyzed the computations performed by the `cube()` function and **generated an equivalent computation graph**
---
- TF Functions are usually much faster than their Python equivalents: so when you want to boost a Python function, just transform it into a TF Function
---
- moreover, when you write a custom loss function, a custom metric, a custom layer, or any other custom function and use it in a Keras model (as we did throughout this chapter), Keras automatically converts your function into a TF function (so no need to use `tf.Function()`)

## AutoGraph and Tracing
- to generate graphs, TensorFlow first analyzes the Python function's source code to capture all the control flow statements (such as `for` loops) --> this first step is called *AutoGraph*
- after this initial analyzation, AutoGraph outputs an upgraded version of that function in which all the control flow statements are replaced by the appropriate TensorFlow operations
- next, TensorFlow calls this "upgraded" function, but instead of passing the argument, it passes a *symbolic tensor* (a tensor without any actual value)

## Chapter Summary
- we started with a brief overview of TensorFlow
- then we looked at TensorFlow's low-level API, including tensors, operations, variables, and even special data structures
- we then used these tools to customize almost every component in `tf.keras`
- finally, we looked at how TensorFlow Functions can boost performance and how graphs are generated using AutoGraph and tracing

## Exercises
---
- 1) *How would you describe TensorFlow in a short sentence? What are its main features? Can you name other popular Deep Learning libraries?*
---
- TensorFlow is an open-source library for numerical computation that is particularly well suited and fine-tuned for large-scale Machine Learning. Its core is similar to NumPy, but it also features GPU support, support for distributed computing, computation graph analysis, plenty of optimization capabilities, and several powerful APIs such as `tf.keras`, `tf.data`, `tf.image`, `tf.signal`, and more. Other popular Deep Learning libraries include PyTorch, MXNet, Microsoft Cognitive Toolkit, Theano, Caffe2, and Chainer. 
---
- 2) *Is TensorFlow a drop-in replacement for NumPy? What are the main differences between the two?*
---
- Although TensorFlow offers most of the functionalities provided by NumPy, it is not a drop-in replacement.This is because the names of the functions are not always the same and some functions do not behave in the exact same way. Lastly, NumPy arrays are mutable, whereas TensorFlow tensors are not (but you can use a `tf.Variable` if you need a mutable object).
---
- 3) *Do you get the same result with `tf.range(10)` and `tf.constant(np.arange(10))`?*
---

In [311]:
tf.range(10)

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])>

In [312]:
tf.constant(np.arange(10))

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])>

- Yes, both `tf.range(10)` and `tf.constant(np.arange(10))` return a one-dimensional tensor containing the integers 0 to 9. Keep in mind, however, that TensorFlow defaults to 32 bits, while NumPy defaults to 64 bits.
---
- 4) *Can you name six other data structures available in TensorFlow, beyond regular tensors?*
---
- In addition to regular TensorFlow tensors, TensorFlow offers several other data structures, including sparse tensors, tensor arrays, ragged tensors, queues, string tensors, and sets. The last two are actually represented as regular tensors, but TensorFlow provides special functions to manipulate them (in `tf.strings` and `tf.sets`).
---
- 5) *A custom loss function can be defined by writing a function or by subclassing the `keras.losses.Loss` class. When would you use each option?*
---
- When defining a custom loss function, you can generally just implement it as a regular Python function. However, if your custom loss function must support hyperparameters, then you should subclass the `keras.losses.Loss` class and implement the `__init__()` and `call()` methods. If you want the loss function's hyperparameters to be saved along with the model, then you must also implement the `get_config()` method. 
---
- 6) *Similarly, a custom metric can be defined in a function or a subclass of `keras.metrics.Metric`. When should you use each option?*
---
- Much like loss functions, most metrics can simply be defined as regular Python functions. But again, if you want your custom metric to support hyperparameters, then you should subclass the `keras.metrics.Metric` class. Lastly, if you want the state to be saved along with the model, then you should implement the `get_config()` method. 
---
- 7) *When should you create a custom layer versus a custom model?*
---
- It is important to distinguish the internal components of your model (layers) from the model itself (the object you will train). The former should subclass the `keras.layers.Layer` class, while the latter should subclass the `keras.models.Model` class. 
---
- 8) *What are some use cases that require writing your own custom training loop?*
---
- Writing your own custom training loop is very advanced and can introduce many errors, so you should only do it if absolutely necessary. Keras provides several tools to customize training without having to write an entire custom training loop: callbacks, custom regularizers, custom constraints, custom losses, and so on. However, if you want to use different optimizers for different parts of your neural network, like in the Wide & Deep paper, then writing a custom training loop would be appropriate. A custom training loop can also be useful when debugging, or when trying to understand exactly how training works.
---
- 9) *Can custom Keras components contain arbitrary Python code, or must they be convertible to TF Functions?*
---
- Custom Keras components should be convertible to TF Functions, so they should stick to TF operations. If you absolutely need to include arbitrary Python code in a custom component, you can either wrap it in a `tf.py_function()` operation or set `dynamic=True` when creating the custom layer or model.
---
- 10) *When would you need to create a dynamic Keras model? How do you do that? Why not make all your models dynamic?*
---
- Creating a dynamic Keras model can be useful for debugging or if you want to include arbitrary Python code in your model. However, making a model dynamic prevents Keras from using any of TensorFlow's graph features, so it will slow down training and inference, and you will not have the possibility to export the computation graph, which will severely hinder your model's portability. 
---
- 11) *Implement a custom layer that performs Layer Normalization:*

_Exercise a): The `build()` method should define two trainable weights *α* and *β*, both of shape `input_shape[-1:]` and data type `tf.float32`. *α* should be initialized with 1s, and *β* with 0s._

---

_Exercise b): The `call()` method should compute the mean_ μ _and standard deviation_ σ _of each instance's features. For this, you can use `tf.nn.moments(inputs, axes=-1, keepdims=True)`, which returns the mean μ and the variance σ<sup>2</sup> of all instances (compute the square root of the variance to get the standard deviation). Then the function should compute and return *α*⊗(*X* - μ)/(σ + ε) + *β*, where ⊗ represents itemwise multiplication (`*`) and ε is a smoothing term (small constant to avoid division by zero, e.g., 0.001)._

In [313]:
class LayerNormalization(keras.layers.Layer):
    def __init__(self, eps=0.001, **kwargs):
        super().__init__(**kwargs)
        self.eps = eps

    def build(self, batch_input_shape):
        self.alpha = self.add_weight( # trainable weight 1
            name="alpha", shape=batch_input_shape[-1:],
            initializer="ones")
        self.beta = self.add_weight( # trainable weight 2
            name="beta", shape=batch_input_shape[-1:],
            initializer="zeros")
        super().build(batch_input_shape) # must be at the end

    def call(self, X):
        mean, variance = tf.nn.moments(X, axes=-1, keepdims=True)
        return self.alpha * (X - mean) / (tf.sqrt(variance + self.eps)) + self.beta

    def compute_output_shape(self, batch_input_shape):
        return batch_input_shape

    def get_config(self):
        base_config = super().get_config()
        return {**base_config, "eps": self.eps}

_Exercise c): Ensure that your custom layer produces the same (or very nearly the same) output as the `keras.layers.LayerNormalization` layer._

In [314]:
X = X_train.astype(np.float32)

custom_layer_norm = LayerNormalization()
keras_layer_norm = keras.layers.LayerNormalization()

tf.reduce_mean(keras.losses.mean_absolute_error(
    keras_layer_norm(X), custom_layer_norm(X)))

<tf.Tensor: shape=(), dtype=float32, numpy=4.817434e-08>

- such a small difference: our custom layer works fine

---
- 12) *Train a model using a custom training loop to tackle the Fashion MNIST dataset:*
---

_Exercise a): Display the epoch, iteration, mean training loss, and mean accuracy over each epoch (updated at each iteration), as well as the validation loss and accuracy at the end of each epoch._

In [315]:
(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()
X_train_full = X_train_full.astype(np.float32) / 255.
X_valid, X_train = X_train_full[:5000], X_train_full[5000:]
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]
X_test = X_test.astype(np.float32) / 255.

In [316]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

In [317]:
X_train.shape

(55000, 28, 28)

In [318]:
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(10, activation="softmax"),
])

In [319]:
# defining our hyperparameters, optimizer, and metrics
n_epochs = 5
batch_size = 32
n_steps = len(X_train) // batch_size
optimizer = keras.optimizers.Nadam(lr=0.01)
loss_fn = keras.losses.sparse_categorical_crossentropy
mean_loss = keras.metrics.Mean()
metrics = [keras.metrics.SparseCategoricalAccuracy()]

In [320]:
with trange(1, n_epochs + 1, desc="All epochs") as epochs:
    for epoch in epochs:
        with trange(1, n_steps + 1, desc="Epoch {}/{}".format(epoch, n_epochs)) as steps:
            for step in steps:
                X_batch, y_batch = random_batch(X_train, y_train)
                with tf.GradientTape() as tape:
                    y_pred = model(X_batch)
                    main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
                    loss = tf.add_n([main_loss] + model.losses)
                gradients = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(gradients, model.trainable_variables))
                for variable in model.variables:
                    if variable.constraint is not None:
                        variable.assign(variable.constraint(variable))                    
                status = OrderedDict()
                mean_loss(loss)
                status["loss"] = mean_loss.result().numpy()
                for metric in metrics:
                    metric(y_batch, y_pred)
                    status[metric.name] = metric.result().numpy()
                steps.set_postfix(status)
            y_pred = model(X_valid)
            status["val_loss"] = np.mean(loss_fn(y_valid, y_pred))
            status["val_accuracy"] = np.mean(keras.metrics.sparse_categorical_accuracy(
                tf.constant(y_valid, dtype=np.float32), y_pred))
            steps.set_postfix(status)
        for metric in [mean_loss] + metrics:
            metric.reset_states()

NameError: name 'trange' is not defined