### https://www.tensorflow.org/guide/autodiff

In [1]:
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

In [2]:
# Source of differentiation: a scalar tf.Variable.
x = tf.Variable(3.0)

# Operations executed inside the `with` block are recorded on the tape so that
# y can be differentiated with respect to x afterwards.
with tf.GradientTape() as tape:
    y = x**2

In [3]:
# dy = 2x * dx, i.e. dy/dx = 2x = 6.0 at x = 3.0
dy_dx = tape.gradient(y, x)
# Convert the scalar gradient tensor to a plain NumPy value for display.
dy_dx.numpy()

6.0

In [4]:
# Parameters of a tiny linear model: a (3, 2) weight matrix drawn from a normal
# distribution (no seed is set, so the values differ between runs) and a zero
# bias of length 2.
w = tf.Variable(tf.random.normal((3, 2)), name='w')
b = tf.Variable(tf.zeros(2, dtype=tf.float32), name='b')
# A single input row, given as a plain nested Python list.
x = [[1., 2., 3.]]

In [5]:
# Inspect the randomly initialised weight variable.
w

<tf.Variable 'w:0' shape=(3, 2) dtype=float32, numpy=
array([[-0.06563916,  2.0368328 ],
       [-0.2923617 ,  0.96828437],
       [ 1.3369893 ,  0.26643306]], dtype=float32)>

In [6]:
# Inspect the zero-initialised bias variable.
b

<tf.Variable 'b:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)>

In [7]:
# Forward pass of the linear model, computed outside any tape (nothing is recorded here).
y = x @ w + b
y

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[3.3601685, 4.772705 ]], dtype=float32)>

In [8]:
# Element-wise square of the model output, shown as a NumPy array.
(y**2).numpy()

array([[11.290732, 22.778715]], dtype=float32)

In [9]:
# Mean of the squared outputs computed with NumPy, as a reference value for the loss below.
(y**2).numpy().mean()

17.034723

In [10]:
# The same mean of squares computed with a TensorFlow op; the result is a scalar tensor.
loss = tf.reduce_mean(y**2)
loss

<tf.Tensor: shape=(), dtype=float32, numpy=17.034723>

In [11]:
# persistent=True keeps the recording alive so that tape.gradient can be called
# several times (the following cells query this tape repeatedly).
with tf.GradientTape(persistent=True) as tape:
    # Forward pass and loss must be computed inside the tape to be recorded.
    y = x @ w + b
    loss = tf.reduce_mean(y**2)

In [12]:
# One tape.gradient call per source; each gradient has the shape of its source
# ((3, 2) for w and (2,) for b).
dloss_dw = tape.gradient(loss, w)
dloss_db = tape.gradient(loss, b)

dloss_dw, dloss_db

(<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
 array([[ 3.359375 ,  4.7734375],
        [ 6.71875  ,  9.546875 ],
        [10.078125 , 14.3203125]], dtype=float32)>,
 <tf.Tensor: shape=(2,), dtype=float32, numpy=array([3.3601685, 4.772705 ], dtype=float32)>)

In [13]:
# Passing a list of sources returns the gradients as a list in the same order.
[dl_dw, dl_db] = tape.gradient(loss, [w, b])

# Display the gradients computed in this cell rather than the variables left
# over from the previous cell.
dl_dw, dl_db

(<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
 array([[ 3.359375 ,  4.7734375],
        [ 6.71875  ,  9.546875 ],
        [10.078125 , 14.3203125]], dtype=float32)>,
 <tf.Tensor: shape=(2,), dtype=float32, numpy=array([3.3601685, 4.772705 ], dtype=float32)>)

In [14]:
# Sources may also be passed as a dict; the gradients come back in a structure
# indexed by the same keys.
my_vars = {
    'w': w,
    'b': b
}

grad = tape.gradient(loss, my_vars)
# Look up the bias gradient by its key.
grad['b']

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([3.3601685, 4.772705 ], dtype=float32)>

In [15]:
# Differentiating with respect to a constant tensor instead of a variable.
x = tf.constant(3.0)
with tf.GradientTape() as tape:
  # Explicitly ask the tape to track the constant x.
  tape.watch(x)
  y = x**2

# dy = 2x * dx
dy_dx = tape.gradient(y, x)
dy_dx

<tf.Tensor: shape=(), dtype=float32, numpy=6.0>

In [16]:
# Print the gradient as a plain number.
print(dy_dx.numpy())

6.0


### If you have many sources a gradient for each is computed!

In [17]:
# Four candidate sources; only the first one yields a gradient below.
# A trainable variable
x0 = tf.Variable(3.0, name='x0')
# Not trainable
x1 = tf.Variable(3.0, name='x1', trainable=False)
# Not a Variable: A variable + tensor returns a tensor.
x2 = tf.Variable(2.0, name='x2') + 1.0
# Not a variable
x3 = tf.constant(3.0, name='x3')

with tf.GradientTape() as tape:
  # Note that x3 does not take part in y at all.
  y = (x0**2) + (x1**2) + (x2**2)

# One entry per requested source, in the same order as the list.
grad = tape.gradient(y, [x0, x1, x2, x3])

for g in grad:
  print(g)

tf.Tensor(6.0, shape=(), dtype=float32)
None
None
None


### If you ask for the gradient of multiple targets, or of a non-scalar target, the gradients of the targets are SUMMED: you get a single gradient per source (a scalar here, because the source `x` is a scalar)

In [18]:
x = tf.Variable(2.0)
# Persistent tape because two separate gradient calls follow.
with tf.GradientTape(persistent=True) as tape:
  y0 = x**2
  y1 = 1 / x

# dy0/dx = 2x = 4.0 and dy1/dx = -1/x**2 = -0.25 at x = 2.0
print(tape.gradient(y0, x).numpy())
print(tape.gradient(y1, x).numpy())

4.0
-0.25


Thus, if you ask for the gradient of multiple targets, the result for each source is:

- The gradient of the sum of the targets, or equivalently
- The sum of the gradients of each target.

In [19]:
x = tf.Variable(2.0)
with tf.GradientTape() as tape:
  y0 = x**2
  y1 = 1 / x

# Both targets are passed in one call: the result is the sum of the individual
# gradients, 4.0 + (-0.25) = 3.75.
print(tape.gradient({'y0': y0, 'y1': y1}, x).numpy())

3.75


In [20]:
# Similarly, if the target(s) are not scalar the gradient of the sum is calculated.
x = tf.Variable(2.)

with tf.GradientTape() as tape:
  # y has two elements: 3x and 4x.
  y = x * [3., 4.]

# d(3x + 4x)/dx = 7.0
print(tape.gradient(y, x).numpy())

7.0


If you need a separate gradient for each item, refer to **Jacobians.** (https://www.tensorflow.org/guide/advanced_autodiff#jacobians)

### Control flow

https://www.tensorflow.org/guide/autodiff#control_flow

Because a gradient tape records operations as they are executed, Python control flow is naturally handled (for example, if and while statements).

Here a different variable is used on each branch of an if. The gradient only connects to the variable that was used:

In [21]:
# x only decides which branch runs; v0 and v1 are the candidate sources.
x = tf.constant(1.0)

v0 = tf.Variable(2.0)
v1 = tf.Variable(2.0)

with tf.GradientTape(persistent=True) as tape:
  tape.watch(x)
  # Plain Python `if`: only the branch that actually executes is recorded.
  if x > 0.0:
    result = v0
  else:
    result = v1**2

dv0, dv1 = tape.gradient(result, [v0, v1])

# x > 0, so result is v0: dv0 is 1.0 and dv1 (the branch not taken) is None.
print(dv0)
print(dv1)

tf.Tensor(1.0, shape=(), dtype=float32)
None


In [22]:
# Reuses the persistent tape from the previous cell.
dx = tape.gradient(result, x)

# None: x only steered the Python `if`; result (= v0) was never computed from x.
print(dx)

None


### Getting a gradient of None

https://www.tensorflow.org/guide/autodiff#getting_a_gradient_of_none

When a target is not connected to a source you will get a gradient of None.

In [23]:
# Here z is obviously not connected to x: it is computed from y only.
x = tf.Variable(2.)
y = tf.Variable(3.)

with tf.GradientTape() as tape:
  z = y * y
# Target and source are unconnected, so the gradient is None.
print(tape.gradient(z, x))

None


### a) Replaced a variable with a tensor:

One common error is to inadvertently replace a tf.Variable with a tf.Tensor, instead of using Variable.assign to update the tf.Variable

### Variable + Tensor = Tensor !!

In [24]:
x = tf.Variable(2.0)

# A freshly created tf.Variable is a ResourceVariable.
type(x)

tensorflow.python.ops.resource_variable_ops.ResourceVariable

In [25]:
# The addition returns a new tensor and rebinds the name x to it; x is no longer a Variable.
x = x+1
type(x)

tensorflow.python.framework.ops.EagerTensor

In [27]:
x = tf.Variable(2.0)

# A fresh tape is created in every iteration.
for epoch in range(2):
  with tf.GradientTape() as tape:
    x = x+1 #x is a tensor now
    y = x+5

# The print sits after the loop, so it uses the tape, x and y of the last
# iteration only; by then x entered the tape as a plain tensor and the result is None.
print(tape.gradient(y, x))

None


In [55]:
x = tf.Variable(2.0)

# Same experiment with a persistent tape and a quadratic target.
for epoch in range(2):
  with tf.GradientTape(persistent=True) as tape:
    x = x+1 #x is a tensor now
    y = x**2

# Evaluated after the loop (last iteration's tape): still None, so
# persistent=True does not change the outcome.
print(tape.gradient(y, x))

None


In [28]:
x = tf.Variable(2.0)

# x is never reassigned here, so it stays a Variable in every iteration.
for epoch in range(2):
  with tf.GradientTape() as tape:
    y = x+5

# d(x + 5)/dx = 1.0
print(tape.gradient(y, x))

tf.Tensor(1.0, shape=(), dtype=float32)


### b) Did calculations outside of TensorFlow

The tape can't record the gradient path if the calculation exits TensorFlow.

In [32]:
# Demonstrates that routing a value through NumPy breaks the gradient path.
x = tf.Variable([[1.0, 2.0],
                 [3.0, 4.0]], dtype=tf.float32)

with tf.GradientTape() as tape:
    x2 = x**2 # x2 is a tensor

    # This step is calculated with NumPy
    y = np.mean(x2, axis=0)

    # Like most ops, reduce_mean will cast the NumPy array to a constant tensor
    # using `tf.convert_to_tensor`.
    y = tf.reduce_mean(y, axis=0)

# None: the NumPy step is invisible to the tape, so y is not connected to x.
print(tape.gradient(y, x))

None


In [29]:
# Step-by-step type check of the computation above, outside any tape.
x = tf.Variable([[1.0, 2.0],
                 [3.0, 4.0]], dtype=tf.float32)
x2 = x**2

# x2 is an EagerTensor while x is still a ResourceVariable.
type(x2), type(x)

(tensorflow.python.framework.ops.EagerTensor,
 tensorflow.python.ops.resource_variable_ops.ResourceVariable)

In [30]:
# np.mean returns a NumPy array, i.e. the value has left TensorFlow.
y = np.mean(x2, axis=0)

type(y)

numpy.ndarray

In [31]:
# Feeding the NumPy array into a TF op yields a tensor again.
y = tf.reduce_mean(y, axis=0)

type(y)

tensorflow.python.framework.ops.EagerTensor

In [33]:
# Variant of the NumPy experiment: NumPy is applied to the Variable itself.
x = tf.Variable([[1.0, 2.0],
                 [3.0, 4.0]], dtype=tf.float32)

with tf.GradientTape() as tape:
    # The squaring step is deliberately skipped in this variant:
    #x2 = x**2 # x2 is a tensor

    # This step is calculated with NumPy
    y = np.mean(x, axis=0)

    # Like most ops, reduce_mean will cast the NumPy array to a constant tensor
    # using `tf.convert_to_tensor`.
    y = tf.reduce_mean(y, axis=0)

# Still None: the path from x to y goes through NumPy.
print(tape.gradient(y, x))

None


In [36]:
# Type check for the variant above, starting from a fresh Variable.
x = tf.Variable([[1.0, 2.0],
                 [3.0, 4.0]], dtype=tf.float32)

type(x)

tensorflow.python.ops.resource_variable_ops.ResourceVariable

In [37]:
# Applying np.mean directly to the Variable also returns a NumPy array.
y = np.mean(x, axis=0)

type(y)

numpy.ndarray

In [38]:
# The TF op turns the NumPy array back into a tensor.
y = tf.reduce_mean(y, axis=0)

type(y)

tensorflow.python.framework.ops.EagerTensor

In [39]:
# Control experiment: the whole computation stays inside TensorFlow.
x = tf.Variable([[1.0, 2.0],
                 [3.0, 4.0]], dtype=tf.float32)

with tf.GradientTape() as tape:
    x2 = x**2 # x2 is a tensor

    # The NumPy step from the earlier cells is disabled here:
    #y = np.mean(x, axis=0)

    # reduce_mean now receives the tensor x2 directly, so no NumPy array is involved.
    y = tf.reduce_mean(x2, axis=0)

# The gradient exists: d(mean over axis 0 of x**2)/dx = 2x / 2 = x.
print(tape.gradient(y, x))

tf.Tensor(
[[1. 2.]
 [3. 4.]], shape=(2, 2), dtype=float32)


In [40]:
x = tf.Variable([[1.0, 2.0],
                 [3.0, 4.0]], dtype=tf.float32)

# Results are bound to new names, so x itself is not replaced.
x2 = x**2

x3 = x + 1

# x stays a ResourceVariable; x2 and x3 are EagerTensors.
type(x), type(x2), type(x3)

(tensorflow.python.ops.resource_variable_ops.ResourceVariable,
 tensorflow.python.framework.ops.EagerTensor,
 tensorflow.python.framework.ops.EagerTensor)

### But 'x' remains a Variable! That's why dy_dx works!

In [42]:
x = tf.Variable([[1.0, 2.0],
                 [3.0, 4.0]], dtype=tf.float32)

# Persistent tape because two gradients are requested below.
with tf.GradientTape(persistent=True) as tape:
    x2 = x**2 # x2 is a tensor
    x3 = x+1

    # The NumPy step from the earlier cells is disabled here:
    #y = np.mean(x, axis=0)

    # Both reductions operate on tensors, so everything stays on the tape.
    y1 = tf.reduce_mean(x2, axis=0)
    y2 = tf.reduce_mean(x3, axis=0)

# d(mean over axis 0 of x**2)/dx = x and d(mean over axis 0 of (x + 1))/dx = 0.5
print(tape.gradient(y1, x))
print(tape.gradient(y2, x))

tf.Tensor(
[[1. 2.]
 [3. 4.]], shape=(2, 2), dtype=float32)
tf.Tensor(
[[0.5 0.5]
 [0.5 0.5]], shape=(2, 2), dtype=float32)


In [43]:
x = tf.Variable([[1.0, 2.0],
                 [3.0, 4.0]], dtype=tf.float32)

# Persistent tape because two gradients are requested below.
with tf.GradientTape(persistent=True) as tape:
    x2 = x**2 # x2 is a tensor
    x3 = x+1

    # The NumPy step from the earlier cells is disabled here:
    #y = np.mean(x, axis=0)

    # No reduction this time: the targets are element-wise functions of x.
    y1 = x2+2
    y2 = x3+3

# d(x**2 + 2)/dx = 2x and d(x + 1 + 3)/dx = 1
print(tape.gradient(y1, x))
print(tape.gradient(y2, x))

tf.Tensor(
[[2. 4.]
 [6. 8.]], shape=(2, 2), dtype=float32)
tf.Tensor(
[[1. 1.]
 [1. 1.]], shape=(2, 2), dtype=float32)


### But 'x' remains a Variable!

### c) Took gradients through an integer or string

Integers and strings are not differentiable. If a calculation path uses these data types there will be no gradient.

Nobody expects strings to be differentiable, but it's easy to accidentally create an int constant or variable if you don't specify the dtype.

In [44]:
# No dtype is given and the literal is an int, so this constant is an integer tensor.
x = tf.constant(10)

with tf.GradientTape() as g:
  g.watch(x)
  y = x * x

# None: integers are not differentiable, even though x is watched.
print(g.gradient(y, x))

None


In [45]:
# A float literal gives a float32 constant.
x = tf.constant(10.0)

with tf.GradientTape() as g:
  g.watch(x)
  y = x * x

# dy/dx = 2x = 20.0
print(g.gradient(y, x))

tf.Tensor(20.0, shape=(), dtype=float32)


In [46]:
# An int literal with an explicit float dtype is also differentiable.
x = tf.constant(10, dtype=tf.float32)

with tf.GradientTape() as g:
  g.watch(x)
  y = x * x

# dy/dx = 2x = 20.0
print(g.gradient(y, x))

tf.Tensor(20.0, shape=(), dtype=float32)


### d) Took gradients through a stateful object

## I DONT UNDERSTAND THIS!!!

State stops gradients. When you read from a stateful object, the tape can only observe the current state, not the history that led to it.

A tf.Tensor is immutable. You can't change a tensor once it's created. It has a value, but no state. All the operations discussed so far are also stateless: the output of a tf.matmul only depends on its inputs.

A tf.Variable has internal state—its value. When you use the variable, the state is read. It's normal to calculate a gradient with respect to a variable, but the variable's state blocks gradient calculations from going farther back. For example:

In [76]:
x0 = tf.Variable(3.0)
x1 = tf.Variable(0.0)
x2 = tf.Variable(5.0)

with tf.GradientTape(persistent=True) as tape:
    # Update x1 = x1 + x0.
    x1.assign_add(x0)
    # The tape starts recording from x1.
    y = x1**2   # y = (x1 + x0)**2

    # Unlike the in-place assign_add above, this rebinds the name x2 to a tensor.
    x2 = x2 + 1
    y2 = x2**2

# Only the first gradient fails: the state update hides x0's contribution.
print(tape.gradient(y, x0))   # None, although mathematically dy/dx0 = 2*(x1 + x0)
# 2 * x1 = 6.0 (x1 holds 3.0 after the assign_add)
print(tape.gradient(y, x1))
# 2 * x2 = 12.0, taken with respect to the tensor that x2 now names
print(tape.gradient(y2, x2))

None
tf.Tensor(6.0, shape=(), dtype=float32)
tf.Tensor(12.0, shape=(), dtype=float32)


In [57]:
x2 = tf.Variable(5.0)

with tf.GradientTape(persistent=True) as tape:
    # Rebinds x2 to the tensor 6.0 produced from the Variable.
    x2 = x2 + 1
    y2 = x2**2

# Gradient with respect to the tensor x2: 2 * 6.0 = 12.0
print(tape.gradient(y2, x2))

tf.Tensor(12.0, shape=(), dtype=float32)


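## WHY is there a problem if I run another epoch???

In the first epoch `x2` is still a `tf.Variable`, which the tape watches automatically, so `x2 + 1` is recorded and the resulting tensor is connected to `y2`. In the second epoch `x2` is already a plain tensor that the new tape does not watch, so nothing is recorded and the gradient is `None`.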

In [70]:
x2 = tf.Variable(5.0)

# Trace the type of x2 over two iterations, each with its own fresh tape.
for epoch in range(2):
    with tf.GradientTape(persistent=True) as tape:
        print(f'[Epoch {epoch} BEFORE addition] x2={type(x2)}')
        # Rebinds the name x2 to the tensor returned by the addition.
        x2 = x2 + 1
        print(f'[Epoch {epoch} AFTER addition] x2={type(x2)}')
        y2 = x2**2
        # Report the type of y2, the value computed in this cell, instead of
        # the unrelated `y` left over from an earlier cell.
        print(f'y=={type(y2)} at epoch {epoch}')

    # Epoch 0 prints 12.0; epoch 1 prints None because x2 entered that tape
    # as a plain tensor.
    print(tape.gradient(y2, x2))
    print('-'*75)

[Epoch 0 BEFORE addition] x2=<class 'tensorflow.python.ops.resource_variable_ops.ResourceVariable'>
[Epoch 0 AFTER addition] x2=<class 'tensorflow.python.framework.ops.EagerTensor'>
y==<class 'tensorflow.python.framework.ops.EagerTensor'> at epoch 0
tf.Tensor(12.0, shape=(), dtype=float32)
---------------------------------------------------------------------------
[Epoch 1 BEFORE addition] x2=<class 'tensorflow.python.framework.ops.EagerTensor'>
[Epoch 1 AFTER addition] x2=<class 'tensorflow.python.framework.ops.EagerTensor'>
y==<class 'tensorflow.python.framework.ops.EagerTensor'> at epoch 1
None
---------------------------------------------------------------------------


In [71]:
x = tf.constant(3.0)

# x is never reassigned, so both iterations perform the same computation.
for epoch in range(2):
    with tf.GradientTape(persistent=True) as tape:
        print(f'[Epoch {epoch}]')
        # The constant is watched explicitly in every new tape.
        tape.watch(x)
        print(f'type(x)={type(x)}')
        y = x * x
        print(f'type(y)={type(y)}')
        z = y * y
        print(f'type(z)={type(z)}')

    # Gradient with respect to the intermediate y: dz/dy = 2 * y = 18.0
    print(tape.gradient(z, y))

[Epoch 0]
type(x)=<class 'tensorflow.python.framework.ops.EagerTensor'>
type(y)=<class 'tensorflow.python.framework.ops.EagerTensor'>
type(z)=<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(18.0, shape=(), dtype=float32)
[Epoch 1]
type(x)=<class 'tensorflow.python.framework.ops.EagerTensor'>
type(y)=<class 'tensorflow.python.framework.ops.EagerTensor'>
type(z)=<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(18.0, shape=(), dtype=float32)


## MY EXPERIMENTS

In [105]:
# Three stacked Dense layers with ReLU activations (2, 4 and 6 units).
layer1 = tf.keras.layers.Dense(2, activation='relu')
layer2 = tf.keras.layers.Dense(4, activation='relu')
layer3 = tf.keras.layers.Dense(6, activation='relu')
x = tf.constant([[1., 2., 3.]])

def model(x):
    """Run x through layer1 -> layer2 -> layer3 and return the last activation.

    Side effect: every call stores the three layers' `trainable_variables`
    on the function object as `model.trainable_variables`, a list holding one
    list per layer.
    """
    # Forward pass
    y = layer1(x)
    z = layer2(y)
    w = layer3(z)
    
    # Stored as an attribute of the function itself so that it can be read
    # after the call.
    model.trainable_variables = [layer1.trainable_variables, 
                                 layer2.trainable_variables,
                                 layer3.trainable_variables
                                ]
    
    return w

with tf.GradientTape(persistent=True) as tape:
    w = model(x)
    loss = tf.reduce_mean(w**2)

# Names of the variables the tape picked up during the forward pass.
print([var.name for var in tape.watched_variables()])

# Calculate gradients with respect to every trainable variable
# (the result mirrors the nested per-layer list structure of the sources).
grad = tape.gradient(loss, model.trainable_variables)

grad

['dense_18/kernel:0', 'dense_18/bias:0', 'dense_19/kernel:0', 'dense_19/bias:0', 'dense_20/kernel:0', 'dense_20/bias:0']


[[<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
  array([[0.61572266, 0.39233398],
         [1.2314453 , 0.78466797],
         [1.847168  , 1.177002  ]], dtype=float32)>,
  <tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.615712 , 0.3922553], dtype=float32)>],
 [<tf.Tensor: shape=(2, 4), dtype=float32, numpy=
  array([[2.0927715, 0.       , 0.       , 1.994812 ],
         [1.9531488, 0.       , 0.       , 1.8617249]], dtype=float32)>,
  <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.8031466 , 0.        , 0.        , 0.76576364], dtype=float32)>],
 [<tf.Tensor: shape=(4, 6), dtype=float32, numpy=
  array([[1.3563614 , 1.2142563 , 0.        , 0.        , 0.84202576,
          0.        ],
         [0.        , 0.        , 0.        , 0.        , 0.        ,
          0.        ],
         [0.        , 0.        , 0.        , 0.        , 0.        ,
          0.        ],
         [0.6641536 , 0.59457064, 0.        , 0.        , 0.41230488,
          0.        ]], dtype=floa

In [104]:
# Same three-layer stack, this time without the wrapper function.
layer1 = tf.keras.layers.Dense(2, activation='relu')
layer2 = tf.keras.layers.Dense(4, activation='relu')
layer3 = tf.keras.layers.Dense(6, activation='relu')
x = tf.constant([[1., 2., 3.]])


# Persistent tape because three separate gradient calls follow.
with tf.GradientTape(persistent=True) as tape:
    # Forward pass
    y = layer1(x)
    z = layer2(y)
    w = layer3(z)
    loss = tf.reduce_mean(w**2)

# Calculate gradients with respect to every trainable variable, one layer at a time
grad1 = tape.gradient(loss, layer1.trainable_variables)
grad2 = tape.gradient(loss, layer2.trainable_variables)
grad3 = tape.gradient(loss, layer3.trainable_variables)

grad1, grad2, grad3

([<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
  array([[-0.00325584,  0.03123474],
         [-0.00651169,  0.06246948],
         [-0.00976753,  0.09370422]], dtype=float32)>,
  <tf.Tensor: shape=(2,), dtype=float32, numpy=array([-0.00325643,  0.03123997], dtype=float32)>],
 [<tf.Tensor: shape=(2, 4), dtype=float32, numpy=
  array([[0.        , 0.        , 0.24695492, 0.        ],
         [0.        , 0.        , 0.36380506, 0.        ]], dtype=float32)>,
  <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.        , 0.        , 0.08926678, 0.        ], dtype=float32)>],
 [<tf.Tensor: shape=(4, 6), dtype=float32, numpy=
  array([[0.        , 0.        , 0.        , 0.        , 0.        ,
          0.        ],
         [0.        , 0.        , 0.        , 0.        , 0.        ,
          0.        ],
         [0.        , 0.2539487 , 0.        , 0.06801331, 0.        ,
          0.        ],
         [0.        , 0.        , 0.        , 0.        , 0.        ,
          0.    

In [106]:
# List the names of the variables watched by whichever tape was created most recently.
print([var.name for var in tape.watched_variables()])

['dense_18/kernel:0', 'dense_18/bias:0', 'dense_19/kernel:0', 'dense_19/bias:0', 'dense_20/kernel:0', 'dense_20/bias:0']


### If you watch tf.constant then everything works fine

In [110]:
x = tf.constant(3.0)

for epoch in range(2):
    print(f'[Epoch: {epoch}]')
    with tf.GradientTape(persistent=True) as tape:
        #x = x + 1  # or here instead of the end of tape.gradient
        # Watching x explicitly in every new tape is what keeps this working
        # after x is replaced at the end of the iteration.
        tape.watch(x)
        y = x * x + 2*x +1
        w = y * y + x**2 + 3
        z = w * w

    # Use the tape to compute gradients with respect to the source x and to
    # the intermediate values y and w.
    # At x = 3: y = 16, w = 268.
    print(tape.gradient(y, x).numpy())  # dy/dx = 2x + 2
    print(tape.gradient(w, y).numpy())  # dw/dy = 2y (x held fixed)
    print(tape.gradient(w, x).numpy())  # dw/dx = 2y * dy/dx + 2x
    print(tape.gradient(z, w).numpy())  # dz/dw = 2w
    print(tape.gradient(z, y).numpy())  # dz/dy = 2w * 2y

    # Move to the next input value; x is a new tensor from here on.
    x = x + 1

[Epoch: 0]
8.0
32.0
262.0
536.0
17152.0
[Epoch: 1]
10.0
50.0
508.0
1288.0
64400.0


### For variables you must be careful when adding tensors to them

### Variable + tensor = tensor, hence the result is no longer watched automatically by the next tape!

In [111]:
# Intentional failure: the second iteration raises AttributeError.
x = tf.Variable(3.0)

for epoch in range(2):
    print(f'[Epoch: {epoch}]')
    with tf.GradientTape() as tape:
        y = x**2

    # dy = 2x * dx
    dy_dx = tape.gradient(y, x)
    # In the second epoch dy_dx is None, so .numpy() fails here.
    print(dy_dx.numpy())

    # The problem in the second epoch is that x is a tensor now, not a variable.
    x = x + 1

[Epoch: 0]
6.0
[Epoch: 1]


AttributeError: 'NoneType' object has no attribute 'numpy'

### But if you do tape.watch(x) manually then everything works fine

In [112]:
x = tf.Variable(3.0)

for epoch in range(2):
    print(f'[Epoch: {epoch}]')
    with tf.GradientTape() as tape:
        # Explicit watch: keeps the gradient available once x has become a tensor.
        tape.watch(x)
        y = x**2

    # dy = 2x * dx
    dy_dx = tape.gradient(y, x)
    print(dy_dx.numpy())

    # x is rebound to a tensor here, exactly as in the failing cell.
    x = x + 1

[Epoch: 0]
6.0
[Epoch: 1]
8.0


### Instead you could just do x.assign_add(1)

In [188]:
x = tf.Variable(3.0)

for epoch in range(3):
    print(f'[Epoch: {epoch}]')
    with tf.GradientTape() as tape:
        y = x**2

    # dy = 2x * dx
    dy_dx = tape.gradient(y, x)
    print(dy_dx.numpy())

    # Update the Variable in place instead of rebinding the name to a tensor:
    #x = x + 1
    x.assign_add(1)

[Epoch: 0]
6.0
[Epoch: 1]
8.0
[Epoch: 2]
10.0


### Variable state prevents gradient from going back

In [191]:
x0 = tf.Variable(3.0)
x1 = tf.Variable(2.0)

with tf.GradientTape(persistent=True) as tape:
    # Update x1 = x1 + x0.
    # Optional debugging aid (disabled):
    #     print('Before assign')
    #     print([var.name for var in tape.watched_variables()])
    x1.assign_add(x0)
    #     print('After assign')
    #     print([var.name for var in tape.watched_variables()])
    # The tape starts recording from x1.
    y = x1**2   # y = (x1 + x0)**2

# 2 * x1 = 10.0 (x1 holds 5.0 after the assign_add)
print(tape.gradient(y, x1))
# This one doesn't work: it prints None although dy/dx0 = 2*(x1 + x0) mathematically.
print(tape.gradient(y, x0))

tf.Tensor(10.0, shape=(), dtype=float32)
None


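### If you do x = x + 1 on a Variable it will result in an error in the second iteration
### But *WHY* doesn't it result in an error in the first iteration?

In the first iteration `x` enters the tape as a `tf.Variable`, which is watched automatically, so `x + 1` is recorded and the new tensor `x` is connected to `y`. In the second iteration `x` enters the tape as a plain tensor that is not watched, so `x + 1` and `x**2` are not recorded and the gradient is `None`.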

In [179]:
# Intentional failure: the second iteration raises AttributeError.
x = tf.Variable(3.0)

for epoch in range(2):
    print(f'[Epoch: {epoch}]')
    with tf.GradientTape() as tape:
        # Type of x on entry: a Variable in epoch 0, a tensor in epoch 1.
        print(f'type(x)={type(x).__name__}')
        x = x + 1
        y = x**2
        print(f'type(y)={type(y).__name__}')

    print(f'type(x)={type(x).__name__}')
    # Gradient with respect to the tensor x created inside this iteration's tape.
    dy_dx = tape.gradient(y, x)
    # dy_dx is None in epoch 1, so .numpy() fails there.
    print('gradient dy/dx=', dy_dx.numpy())

[Epoch: 0]
type(x)=ResourceVariable
type(y)=EagerTensor
type(x)=EagerTensor
gradient dy/dx= 8.0
[Epoch: 1]
type(x)=EagerTensor
type(y)=EagerTensor
type(x)=EagerTensor


AttributeError: 'NoneType' object has no attribute 'numpy'

In [156]:
# Intentional failure: the second iteration raises AttributeError.
x = tf.Variable(3.0)

for epoch in range(2):
    print(f'[Epoch: {epoch}]')
    with tf.GradientTape() as tape:
        print(f'type(x)={type(x).__name__}')
        y = x**2
        print(f'type(y)={type(y).__name__}')

    # dy = 2x * dx
    dy_dx = tape.gradient(y, x)
    # dy_dx is None in epoch 1, so .numpy() fails there.
    print('gradient dy/dx=', dy_dx.numpy())

    # The problem in the second epoch is that x is a tensor now, not a variable.
    x = x + 1
    print(f'After addition...')
    print(f'type(x)={type(x).__name__}')
    print('='*30)

[Epoch: 0]
type(x)=ResourceVariable
type(y)=EagerTensor
gradient dy/dx= 6.0
After addition...
type(x)=EagerTensor
[Epoch: 1]
type(x)=EagerTensor
type(y)=EagerTensor


AttributeError: 'NoneType' object has no attribute 'numpy'

### However if you use a separate variable then that's fine. 
### I think it is because x remains a Variable!

In [181]:
x = tf.Variable(3.0)

for epoch in range(2):
    print(f'[Epoch: {epoch}]')
    with tf.GradientTape(persistent=True) as tape:
        print(f'type(x)={type(x).__name__}')
        # The sum is bound to a new name, so x itself stays a Variable.
        x1 = x + 1
        y = x1**2
        print(f'type(y)={type(y).__name__}')

    print(f'type(x)={type(x).__name__}')
    # Gradient with respect to the Variable x.
    dy_dx = tape.gradient(y, x)
    print('gradient dy/dx=', dy_dx.numpy())
    # Gradient with respect to the intermediate tensor x1; labelled as such so
    # the two printed lines can be told apart.
    dy_dx1 = tape.gradient(y, x1)
    print('gradient dy/dx1=', dy_dx1.numpy())

[Epoch: 0]
type(x)=ResourceVariable
type(y)=EagerTensor
type(x)=ResourceVariable
gradient dy/dx= 8.0
gradient dy/dx= 8.0
[Epoch: 1]
type(x)=ResourceVariable
type(y)=EagerTensor
type(x)=ResourceVariable
gradient dy/dx= 8.0
gradient dy/dx= 8.0


### I think this doesn't produce an error because x0 remains a Variable.
### But what about x1??

### If you change x1 as follows: x1 = x1 +1 then you get error on second iteration

In [187]:
x0 = tf.Variable(3.0)
x1 = tf.Variable(2.0)

for epoch in range(2):
    print('')
    print(f'Epoch: {epoch}')
    with tf.GradientTape(persistent=True) as tape:
        # x2 is a new tensor built from x1 and x0.
        x2 = x1 + x0 + 1

        # Rebinds x1 to a tensor: from the second iteration on, x1 enters the
        # tape as a plain, unwatched tensor.
        x1 = x1 + 1

        y = x1**1.5 + x2

        print(f'type(x0)={type(x0).__name__}')
        print(f'type(x1)={type(x1).__name__}')
        # Label corrected: this line reports the type of x2, not of x1.
        print(f'type(x2)={type(x2).__name__}')
        print(f'type(y)={type(y).__name__}')

    print(tape.gradient(y, x0))   # x0 remains a Variable, so this always works
    print(tape.gradient(y, x1))   # x1 is a tensor: a value in epoch 0, None in epoch 1
    print(tape.gradient(y, x2))   # x2 is a tensor too, but it is still connected to y


Epoch: 0
type(x0)=ResourceVariable
type(x1)=EagerTensor
type(x1)=EagerTensor
type(y)=EagerTensor
tf.Tensor(1.0, shape=(), dtype=float32)
tf.Tensor(2.598076, shape=(), dtype=float32)
tf.Tensor(1.0, shape=(), dtype=float32)

Epoch: 1
type(x0)=ResourceVariable
type(x1)=EagerTensor
type(x1)=EagerTensor
type(y)=EagerTensor
tf.Tensor(1.0, shape=(), dtype=float32)
None
tf.Tensor(1.0, shape=(), dtype=float32)


### e) No gradient registered

Some `tf.Operations` (https://www.tensorflow.org/api_docs/python/tf/Operation) are registered as being non-differentiable and will return None. Others have no gradient registered.

The `tf.raw_ops` (https://www.tensorflow.org/api_docs/python/tf/raw_ops) page shows which low-level ops have gradients registered.

If you attempt to take a gradient through a float op that has no gradient registered the tape will throw an error instead of silently returning None. This way you know something has gone wrong.

For example, the tf.image.adjust_contrast function wraps raw_ops.AdjustContrastv2, which could have a gradient but the gradient is not implemented:

In [None]:
image = tf.Variable([[[0.5, 0.0, 0.0]]])
delta = tf.Variable(0.1)

with tf.GradientTape() as tape:
  # Per the accompanying text, this op has no gradient implemented.
  new_image = tf.image.adjust_contrast(image, delta)

# The gradient call is expected to raise LookupError, which is caught and printed.
try:
  print(tape.gradient(new_image, [image, delta]))
  assert False   # This should not happen.
except LookupError as e:
  print(f'{type(e).__name__}: {e}')

If you need to differentiate through this op, you'll either need to implement the gradient and register it (using `tf.RegisterGradient` https://www.tensorflow.org/api_docs/python/tf/RegisterGradient) or re-implement the function using other ops.

### Zeros instead of None

In some cases it would be convenient to get 0 instead of None for unconnected gradients. You can decide what to return when you have unconnected gradients using the unconnected_gradients argument:

In [192]:
x = tf.Variable([2., 2.])
y = tf.Variable(3.)

with tf.GradientTape() as tape:
  # z depends on y only, so x is unconnected.
  z = y**2
# With UnconnectedGradients.ZERO the result is zeros shaped like x instead of None.
print(tape.gradient(z, x, unconnected_gradients=tf.UnconnectedGradients.ZERO))

tf.Tensor([0. 0.], shape=(2,), dtype=float32)


### https://www.tensorflow.org/api_docs/python/tf/GradientTape

### BE CAREFUL!
Note that when using models you should ensure that your variables exist when using watch_accessed_variables=False. Otherwise it's quite easy to make your first iteration not have any gradients:

In [None]:
a = tf.keras.layers.Dense(32)
b = tf.keras.layers.Dense(32)
# A small dummy batch; the original snippet used `inputs` without defining it,
# which raises NameError on a fresh kernel.
inputs = tf.ones((1, 8))

with tf.GradientTape(watch_accessed_variables=False) as tape:
  tape.watch(a.variables)  # Since `a.build` has not been called at this point
                           # `a.variables` will return an empty list and the
                           # tape will not be watching anything.
  result = b(a(inputs))

# The result of this computation will be a list of `None`s since a's variables
# are not being watched. Printed so the pitfall is visible in the output.
print(tape.gradient(result, a.variables))