In [None]:
import tensorflow as tf
import numpy as np
from tensorflow import keras
import time

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

housing = fetch_california_housing()
# 20,640 samples in total; each sample is described by 8 attributes, with the house price as the target. All attribute values are numeric.
# Target variable: average house value
# Input variables (features): average income, average housing age, average rooms, average bedrooms, population, average occupancy, latitude and longitude

# First split: hold out a test set. The target is reshaped into a column vector (-1, 1).
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42)
# Second split: carve a validation set out of the remaining training data.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

# Fit the scaler on the training split only, then reuse the same fitted scaler
# to transform the validation and test splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Shape of a single sample: every dimension of X_train after the first one.
input_shape = X_train.shape[1:]

### 使用自动微分计算梯度

In [None]:
def f(w1, w2):
    """Toy function used to demonstrate differentiation: 3*w1**2 + 2*w1*w2.

    Its partial derivatives are 6*w1 + 2*w2 (w.r.t. w1) and 2*w1 (w.r.t. w2).
    Works on anything supporting `*`, `**` and `+` (plain numbers or tensors).
    """
    quadratic_term = 3 * w1 ** 2
    cross_term = 2 * w1 * w2
    return quadratic_term + cross_term

#### 通过在调整相应参数时测量函数输出的变化来计算每个偏导的近似值

每个参数至少要额外调用一次 f()（即多计算一次 f(w1, w2)），对于参数数量庞大的大型神经网络来说计算代价太高，而且得到的只是近似值。

In [None]:
w1, w2 = 5, 3
eps = 1e-6  # small step used for the finite-difference approximation

(f(w1 + eps, w2) - f(w1, w2)) / eps  # gradient w.r.t. w1 (approximates the partial derivative 6*w1 + 2*w2)

36.000003007075065

In [None]:
(f(w1, w2 + eps) - f(w1, w2)) / eps  # gradient w.r.t. w2 (approximates the partial derivative 2*w1)

10.000000003174137

### 自动微分

结果准确（精度仅受浮点误差影响）

无论有多少变量，gradient（）都只经历一次已经记录的计算（反向模式）

#### tf.GradientTape():上下文，自动记录其涉及变量的每个操作。
默认只监控由tf.Variable创建的trainable=True属性的变量。

In [None]:
# Use tf.Variable objects so that the tape tracks them without an explicit watch().
w1, w2 = tf.Variable(5.), tf.Variable(3.)
with tf.GradientTape() as tape:
    z = f(w1, w2)  # operations executed inside the context are recorded on the tape

#### gradient()：求数值的梯度函数

In [None]:
gradients = tape.gradient(z, [w1, w2],unconnected_gradients='zero')
# Compute the gradient of z with respect to the two variables [w1, w2]
gradients
# Partial derivative w.r.t. w1: 6*w1 + 2*w2 = 6*5. + 2*3. = 36.
# Partial derivative w.r.t. w2: 2*w1 = 2*5. = 10.

[<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>]

#### 调用tape的gradient（）方法后tape会立即被自动擦除

In [None]:
# Calling gradient() twice on the same non-persistent tape raises an error
with tf.GradientTape() as tape:
    z = f(w1, w2)
    
dz_dw1 = tape.gradient(z, w1,unconnected_gradients='zero')
print(dz_dw1)
try:
    # Second call on the already-consumed tape: this is the call expected to fail.
    dz_dw2 = tape.gradient(z, w2,unconnected_gradients='zero')
except RuntimeError as ex:
    # The error is caught and printed on purpose: it is the point of this demonstration.
    print(ex)

tf.Tensor(36.0, shape=(), dtype=float32)
A non-persistent GradientTape can only be used to compute one set of gradients (or jacobians)


#### 设置tape属性persistent=True可使得tape有持久性，但需在每次使用完该tape后将其删除以释放资源

In [None]:
# persistent=True lets gradient() be called more than once on the same tape.
with tf.GradientTape(persistent=True) as tape:
    z = f(w1, w2)

dz_dw1 = tape.gradient(z, w1,unconnected_gradients='zero')
dz_dw2 = tape.gradient(z, w2,unconnected_gradients='zero')

# A persistent tape is not erased automatically; delete it once done to release its resources.
del tape

In [None]:
# Display the two partial derivatives obtained from the persistent tape.
(dz_dw1,dz_dw2)

(<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>)

#### tape仅跟踪涉及Variable变量的操作，针对tf.Variable变量以外的其他张量计算z的梯度时，默认结果为None（下面的示例设置了unconnected_gradients='zero'，因此返回的是0）

In [None]:
c1, c2 = tf.constant(5.), tf.constant(3.)  # create constant tensors (similar to NumPy ndarrays)
with tf.GradientTape() as tape:
    z = f(c1, c2)  # c1 and c2 are not variables, so the tape does not track them here

# The constants are unconnected from the tape's point of view;
# unconnected_gradients='zero' makes gradient() return zeros for them instead of None.
gradients = tape.gradient(z, [c1, c2],unconnected_gradients='zero')
gradients

[<tf.Tensor: shape=(), dtype=float32, numpy=0.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.0>]

#### watch()可强制tape观察任何tensor，记录涉及他们的所有操作（可针对这些张量计算梯度，就像它们是变量一样）

当需要实现正则化损失，以便在input变化不大时惩罚那些变化很大的激活时，损失将基于激活相对于输入的梯度而定。因此需要tape去观察不是变量的input。

In [None]:
with tf.GradientTape() as tape:
    # Explicitly ask the tape to track the constant tensors, as if they were variables.
    tape.watch(c1)
    tape.watch(c2)
    z = f(c1, c2)

gradients = tape.gradient(z, [c1, c2],unconnected_gradients='zero')
gradients

[<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>]

#### 计算向量的梯度(如包含多个损失的向量）时，tf将计算向量和的梯度

In [None]:
# Record three outputs that all depend on w1 and w2.
with tf.GradientTape() as tape:
    z1 = f(w1, w2 + 2.)
    z2 = f(w1, w2 + 5.)
    z3 = f(w1, w2 + 7.)

# Passing a list of targets yields the gradient of their sum (one value per source variable).
tape.gradient([z1, z2, z3], [w1, w2],unconnected_gradients='zero')

[<tf.Tensor: shape=(), dtype=float32, numpy=136.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=30.0>]

#### tf.stack()：张量堆叠。分别计算每个z的梯度，堆叠后沿axis=0求和，结果与上面直接对[z1, z2, z3]求梯度相同

In [None]:
# The tape must be persistent because gradient() is called once per output below.
with tf.GradientTape(persistent=True) as tape:
    z1 = f(w1, w2 + 2.)
    z2 = f(w1, w2 + 5.)
    z3 = f(w1, w2 + 7.)

# Compute the gradient of each output separately, stack the three [dw1, dw2] pairs,
# then sum over the stacking axis (axis=0).
tf.reduce_sum(tf.stack([tape.gradient(z, [w1, w2],unconnected_gradients='zero') 
                        for z in (z1, z2, z3)]), axis=0)

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([136.,  30.], dtype=float32)>

In [None]:
# Delete the persistent tape now that it is no longer needed.
del tape

#### 计算向量的梯度时tf将计算向量和的梯度，需要获得单独梯度时需调用tape的jacobian（）:使用磁带上下文中记录的操作计算jacobian

In [None]:
# 2x2 constant input; it is not a variable, so it must be watched explicitly below.
x = tf.constant([[4, 2],[1, 3]], dtype=tf.dtypes.float32) 
  
with tf.GradientTape() as gfg:
    gfg.watch(x) 
    y = x * x * x 
# Jacobian of y with respect to x, computed from the operations recorded on the tape.
res  = gfg.jacobian(y, x)  
print("res:",res)

res: tf.Tensor(
[[[[48.  0.]
   [ 0.  0.]]

  [[ 0. 12.]
   [ 0.  0.]]]


 [[[ 0.  0.]
   [ 3.  0.]]

  [[ 0.  0.]
   [ 0. 27.]]]], shape=(2, 2, 2, 2), dtype=float32)


In [None]:
# Compute the second-order derivatives (Hessian)
# The outer tape is persistent because gradient() is called on it once per first-order derivative.
with tf.GradientTape(persistent=True) as hessian_tape:
    with tf.GradientTape() as jacobian_tape:
        z = f(w1, w2)
    # Computed inside the outer context so that the outer tape records this computation.
    jacobians = jacobian_tape.gradient(z, [w1, w2],unconnected_gradients='zero')
hessians = [hessian_tape.gradient(jacobian, [w1, w2],unconnected_gradients='zero')
            for jacobian in jacobians]
del hessian_tape

In [None]:
jacobians
# Partial derivative w.r.t. w1: 6*w1 + 2*w2 = 6*5. + 2*3. = 36.
# Partial derivative w.r.t. w2: 2*w1 = 2*5. = 10.

[<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>]

In [None]:
hessians
# dz/dw1 = 6*w1 + 2*w2; its derivative w.r.t. w1 (second derivative) is 6., and w.r.t. w2 is 2.
# dz/dw2 = 2*w1 = 2*5. = 10.; its derivative w.r.t. w1 is 2.

[[<tf.Tensor: shape=(), dtype=float32, numpy=6.0>,
  <tf.Tensor: shape=(), dtype=float32, numpy=2.0>],
 [<tf.Tensor: shape=(), dtype=float32, numpy=2.0>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.0>]]

#### tf.stop_gradient（）：用于阻止梯度在神经网络的某些部分反向传播

在前向传递过程中返回其输入，在反向传播期间不让梯度通过（作用类似于常量）

In [None]:
def f(w1, w2):
    """Same value as 3*w1**2 + 2*w1*w2, but the cross term is wrapped in tf.stop_gradient.

    The forward result is unchanged; during backpropagation no gradient flows
    through the 2*w1*w2 term, so it behaves like a constant.
    """
    quadratic_term = 3 * w1 ** 2
    blocked_term = tf.stop_gradient(2 * w1 * w2)
    return quadratic_term + blocked_term

with tf.GradientTape() as tape:
    z = f(w1, w2)

tape.gradient(z, [w1, w2],unconnected_gradients='zero')  # [30., 0.]; the second entry is 0 rather than None because of unconnected_gradients='zero'
# Partial derivative w.r.t. w1: only 3*w1**2 contributes, so 6*w1 = 6*5. = 30.
# Partial derivative w.r.t. w2: w2 only appears inside stop_gradient, so nothing flows back to it

[<tf.Tensor: shape=(), dtype=float32, numpy=30.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.0>]

用较大的输入值计算my_softplus（）的梯度时结果为NaN：由于浮点数溢出，tf.exp(z)变为无穷大，autodiff在计算该函数的梯度时最终得到的是无穷除以无穷。

In [None]:
def my_softplus(z):
    """Naive softplus, log(exp(z) + 1).

    The return value is just tf.nn.softplus(z), written out by hand.
    """
    exp_z = tf.exp(z)
    return tf.math.log(exp_z + 1.0)

# Scalar variable with a large value, used to probe the gradient of my_softplus.
x = tf.Variable(100.)
with tf.GradientTape() as tape:
    z = my_softplus(x)

tape.gradient(z, [x])

[<tf.Tensor: shape=(), dtype=float32, numpy=nan>]

In [None]:
# Same experiment as with the scalar, but with a one-element vector variable.
x = tf.Variable([100.])
with tf.GradientTape() as tape:
    z = my_softplus(x)

tape.gradient(z, [x])

[<tf.Tensor: shape=(1,), dtype=float32, numpy=array([nan], dtype=float32)>]

In [None]:
# With large inputs the gradient is now correct, but the main output still overflows.
# tf.custom_gradient overrides how the gradient of the softplus is computed: the
# decorated function returns both its normal output and a function computing the derivative.
@tf.custom_gradient
def my_better_softplus(z):
    """Softplus with a hand-written gradient.

    Returns a pair: the forward value log(exp(z) + 1), and a function that maps
    the incoming gradient to the gradient with respect to z.
    """
    exp_z = tf.exp(z)

    def stable_softplus_gradient(upstream):
        print(upstream)  # 1.
        # Derivative of softplus, written as 1 / (1 + 1 / exp(z)) so it is numerically stable.
        return upstream / (1 + 1 / exp_z)

    # First element: the softplus value (it overflows for large z);
    # second element: the improved gradient function (the derivative of softplus).
    forward_value = tf.math.log(exp_z + 1)
    return forward_value, stable_softplus_gradient

# Evaluate the custom-gradient softplus and its gradient at a very large input.
x = tf.Variable([1000.])
with tf.GradientTape() as tape:
    z = my_better_softplus(x)

# Display both the forward value and the gradient.
z, tape.gradient(z, [x])

tf.Tensor([1.], shape=(1,), dtype=float32)


(<tf.Tensor: shape=(1,), dtype=float32, numpy=array([inf], dtype=float32)>,
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>])

In [None]:
# Return the input itself when the input is large.
def my_better_softplus(z, threshold=30.):
    """Softplus that returns z directly for large inputs, with a finite gradient.

    Args:
        z: input tensor or variable.
        threshold: inputs strictly greater than this value are returned unchanged
            instead of going through log(exp(z) + 1). Defaults to 30.

    Returns:
        A tensor equal to z where z > threshold, and log(exp(z) + 1) elsewhere.

    A single tf.where(z > threshold, z, log(exp(z) + 1)) gives the right forward
    value but a NaN gradient for large z: the unselected branch is still
    differentiated, and its zero upstream gradient gets multiplied by the infinite
    exp(z). Replacing the large entries by 0 before they reach exp() keeps every
    intermediate value finite, so the gradient of the unselected branch is a clean 0.
    """
    is_large = z > threshold
    # Harmless stand-in (0) wherever the log/exp branch will not be selected anyway.
    safe_z = tf.where(is_large, tf.zeros_like(z), z)
    return tf.where(is_large, z, tf.math.log(tf.exp(safe_z) + 1.))

# Evaluate the tf.where-based softplus and its gradient at a very large input.
x = tf.Variable([1000.])
with tf.GradientTape() as tape:
    z = my_better_softplus(x)

# Display both the forward value and the gradient.
z, tape.gradient(z, [x])

(<tf.Tensor: shape=(1,), dtype=float32, numpy=array([1000.], dtype=float32)>,
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([nan], dtype=float32)>])