# 1. autodiff in tensorflow

In [3]:
"""
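TensorFlow uses tf.GradientTape as a context manager that records ("watches") the operations executed inside it; the recording is what makes automatic differentiation possible.
Trainable variables: watched automatically;
Tensors: must be watched manually by calling the context manager's `watch` method.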
"""
import tensorflow as tf
import tensorflow.keras.layers as layers
import numpy as np

# Plain (constant) tensor: it is not tracked automatically, so it has to be
# registered on the tape with `watch` before it takes part in the computation.
x = tf.constant(3.0)
with tf.GradientTape() as g:
    g.watch(x)
    y = x * x

# Only the forward pass has to be recorded; differentiate once the tape is closed.
dy_dx = g.gradient(y, x)
print(f"y=x^2, dy/dx=2x={dy_dx}")

y=x^2, dy/dx=2x=6.0


In [5]:
# A tf.Variable is tracked by the tape on its own, so no `watch` call is made here.
w = tf.Variable(5.0)
with tf.GradientTape() as tape:
    z = w ** 3

# d(w^3)/dw = 3 * w^2, evaluated at w = 5.
dz_dw = tape.gradient(z, w)
print(f"dz/dw={dz_dw}")

dz/dw=75.0


In [8]:
 """支持高阶导"""
# Nested tapes: the inner tape (gg) yields dy/dx, while the outer tape (g)
# records that gradient computation so it can be differentiated a second time.
x = tf.Variable(5.0)
with tf.GradientTape() as g:
    with tf.GradientTape() as gg:
        gg.watch(x)
        y = x * x
    # Must run while g is still recording, otherwise g could not differentiate dy_dx.
    dy_dx = gg.gradient(y, x)  # dy_dx = 2 * x
    print(f"dy/dx=2x={dy_dx}")

d2y_dx2 = g.gradient(dy_dx, x)  # d2y_dx2 = 2
print(f"d2y/dx2={d2y_dx2}")

dy/dx=2x=10.0
d2y/dx2=2.0


# 2. gradient in nn.layers

### 2.1 dense (fully connected)

In [31]:
"""
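A dense layer computes y = x @ w + b; this cell differentiates only the matrix product y = x @ w (no bias term b is used).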
"""
# Inputs of the matrix product y = x @ w: x has shape (2, 2), w has shape (2, 1).
x = tf.Variable([[1.0, 1.0], [2.0, 2.0]])
w = tf.Variable([[2.0,], [3.0,]])

# persistent=True allows more than one gradient() call on the same tape.
with tf.GradientTape(persistent=True) as tape:
    y = tf.matmul(x, w)
print(y)

# Differentiate outside the recording context: calling gradient() inside the
# `with` block of a persistent tape keeps the tape recording the backward
# computation as well, which TensorFlow flags as inefficient.
# y is not a scalar, so each result below is the gradient of sum(y).
dy_dw = tape.gradient(y, w)  # column sums of x, shaped like w
print(f"dy/dw={dy_dw}")
dy_dx = tape.gradient(y, x)  # w^T repeated for every row of x
print(f"dy/dx={dy_dx}")

# Drop the reference so the resources held by the persistent tape are released.
del tape

tf.Tensor(
[[ 5.]
 [10.]], shape=(2, 1), dtype=float32)
dy/dw=[[3.]
 [3.]]
dy/dx=[[2. 3.]
 [2. 3.]]


### 2.2 pooling layers

In [38]:
"""
max pooling (1D)
"""
x = tf.Variable(
    [
      [ [1,3,4,2,5,2],
        [4,2,5,6,8,3],
        [5,3,7,8,9,3] ]
    ],
    dtype=tf.float32
) # batch_size * seq_len * feature_dim, i.e. shape (1, 3, 6)
max_pool = layers.MaxPooling1D(pool_size=3, strides=1, padding='valid')
# Build with the real input shape (1, 3, 6) instead of a hand-written tuple
# that can drift out of sync with x.
max_pool.build(input_shape=tuple(x.shape))

with tf.GradientTape() as tape:
    # x is a tf.Variable, so the tape tracks it without an explicit watch();
    # the pooling layer has no variables of its own that would need watching.
    y = max_pool(x)

# pool_size equals seq_len here, so there is a single window per feature column.
print(f"x --max_pool_1d--> {y}")
# The gradient is 1 at the position of each window maximum and 0 elsewhere.
dy_dx = tape.gradient(y, x)
print(f"dy/dx=\n{dy_dx}")

x --max_pool_1d--> [[[5. 3. 7. 8. 9. 3.]]]
dy/dx=
[[[0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1.]
  [1. 0. 1. 1. 1. 0.]]]


In [28]:
"""
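The gradient of a max pooling layer (dy/dx) can be obtained by initializing an all-zero matrix with the same shape as the inputs and setting to 1 the positions (argmax) at which the maximum was taken.
If the pooling windows do not overlap, the maximum positions can also be found with argmax and one-hot encoded as shown below.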
"""
# For the single sample x[0], find along axis 0 (seq_len) the index of the
# maximum of every feature column.
argmax_x = np.argmax(x[0], axis=0)
print(argmax_x)

# Pick one identity row per feature column, then transpose so the result is
# laid out like x[0]: a 1 marks the position of each column's maximum.
seq_len = x.shape[1]
one_hot_argmax_x = np.eye(seq_len)[argmax_x].T
print(one_hot_argmax_x)

[2 0 2 2 2 1]
[[0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [1. 0. 1. 1. 1. 0.]]


In [39]:
"""
mean pooling (1D)
"""
x = tf.Variable(
    [
      [ [1,3,4,2,5,2],
        [4,2,5,6,8,3],
        [5,3,7,8,9,3] ]
    ],
    dtype=tf.float32
) # batch_size * seq_len * feature_dim, i.e. shape (1, 3, 6)
mean_pool = layers.AveragePooling1D(pool_size=3, strides=1, padding='valid')
# Build with the real input shape (1, 3, 6) instead of a hand-written tuple
# that can drift out of sync with x.
mean_pool.build(input_shape=tuple(x.shape))

with tf.GradientTape() as tape:
    # x is a tf.Variable, so the tape tracks it without an explicit watch().
    # No layer variables are watched: the pooling layer has none, and this cell
    # must not depend on a layer object created in a different cell.
    y = mean_pool(x)

print(f"x --mean_pool_1d--> {y}")
dy_dx = tape.gradient(y, x)
print(f"dy/dx=\n{dy_dx}")
# Unlike max pooling, mean pooling distributes the gradient evenly over all
# positions inside the pooling window (1 / pool_size each).

x --mean_pool_1d--> [[[3.3333333 2.6666667 5.3333335 5.3333335 7.3333335 2.6666667]]]
dy/dx=
[[[0.33333334 0.33333334 0.33333334 0.33333334 0.33333334 0.33333334]
  [0.33333334 0.33333334 0.33333334 0.33333334 0.33333334 0.33333334]
  [0.33333334 0.33333334 0.33333334 0.33333334 0.33333334 0.33333334]]]
