In [3]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Report the interpreter and library versions this notebook was executed with.
print(tf.__version__)
print(sys.version_info)
for module in (mpl, np, pd, sklearn, tf, keras):
    print(f"{module.__name__} {module.__version__}")

2.2.0
sys.version_info(major=3, minor=6, micro=9, releaselevel='final', serial=0)
matplotlib 3.3.4
numpy 1.19.5
pandas 1.1.5
sklearn 0.24.2
tensorflow 2.2.0
tensorflow.keras 2.3.0-tf


# 手动实现近似求导

In [4]:
def f(x):
    """Evaluate the quadratic 3*x**2 + 2*x - 1 at ``x``."""
    quadratic_term = 3. * x ** 2
    linear_term = 2. * x
    # Same left-to-right order as a single expression: (quadratic + linear) - 1.
    return quadratic_term + linear_term - 1
# Numerically approximate the derivative of f at x.
def approximate_derivative(f, x, eps=1e-4):
    """Estimate f'(x) with a symmetric (central) finite difference.

    The function is sampled at ``x + eps`` and ``x - eps`` and the slope of
    the line through those two nearby points is returned.

    Args:
        f: Callable of one argument.
        x: Point at which to estimate the derivative.
        eps: Half-width of the sampling interval around ``x``.

    Returns:
        ``(f(x + eps) - f(x - eps)) / (2 * eps)``.
    """
    forward = f(x + eps)
    backward = f(x - eps)
    return (forward - backward) / (2. * eps)

# Analytically f'(x) = 6x + 2, so the estimate at x = 1 should be close to 8.
print(approximate_derivative(f, x=1.))

7.999999999994678


In [5]:
def g(x1, x2):
    """Evaluate (x1 + 5) * x2**2."""
    shifted = x1 + 5
    squared = x2 ** 2
    return shifted * squared
# Numerically approximate both partial derivatives of g at the point (x1, x2).
def approximate_gradient(g, x1, x2, eps=1e-3):
    """Estimate the gradient of a two-argument function at ``(x1, x2)``.

    Each partial derivative is obtained by holding the other argument fixed
    and passing the resulting one-argument function to
    ``approximate_derivative``.

    Args:
        g: Callable of two arguments.
        x1: First coordinate of the evaluation point.
        x2: Second coordinate of the evaluation point.
        eps: Step size forwarded to ``approximate_derivative``.

    Returns:
        A tuple ``(dg_dx1, dg_dx2)``.
    """
    def along_x1(value):
        # x2 is held fixed; only the first argument varies.
        return g(value, x2)

    def along_x2(value):
        # x1 is held fixed; only the second argument varies.
        return g(x1, value)

    return (
        approximate_derivative(along_x1, x1, eps),
        approximate_derivative(along_x2, x2, eps),
    )

# Analytically dg/dx1 = x2**2 = 9 and dg/dx2 = 2 * (x1 + 5) * x2 = 42 at (2, 3).
print(approximate_gradient(g, x1=2., x2=3.))
    

(8.999999999993236, 41.999999999994486)


# tensorflow提供的接口求导

In [21]:
# A default (non-persistent) GradientTape supports only one gradient() call.
# Gradients are taken with respect to tf.Variable objects here; trainable
# variables are watched by the tape automatically.
x1, x2 = tf.Variable(2.0), tf.Variable(3.0)
# Unlike an ordinary context manager, the tape stays usable after the `with`
# block ends; its resources are released by the first gradient() call.
# g(x1, x2) = (x1 + 5) * (x2 ** 2)
with tf.GradientTape() as tape:
    z = g(x1, x2)
# Partial derivative of z with respect to x1.
dz_x1 = tape.gradient(z, x1)
print(dz_x1, x1, sep="\n")
# A second gradient() call on the same tape raises RuntimeError;
# catch it and show the message instead of aborting the cell.
try:
    dz_x2 = tape.gradient(z, x2)
except RuntimeError as err:
    print(err)

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=2.0>
tf.Tensor(9.0, shape=(), dtype=float32)
<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=2.0>
GradientTape.gradient can only be called once on non-persistent tapes.


In [23]:
x1, x2 = tf.Variable(2.0), tf.Variable(3.0)
# persistent=True keeps the tape usable after its first gradient() call.
with tf.GradientTape(persistent=True) as tape:
    z = g(x1, x2)

dz_x1 = tape.gradient(z, x1)
dz_x2 = tape.gradient(z, x2)
print(dz_x1, dz_x2, sep="\n")
# A persistent tape is not released automatically; drop the reference by hand.
del tape

tf.Tensor(9.0, shape=(), dtype=float32)
tf.Tensor(42.0, shape=(), dtype=float32)


In [26]:
# Compute several partial derivatives at once by passing a list of sources.
x1, x2 = tf.Variable(2.0), tf.Variable(3.0)
with tf.GradientTape() as tape:
    z = g(x1, x2)
# Handy for differentiating a loss with respect to many weights in one call.
dz_x1x2 = tape.gradient(z, sources=[x1, x2])

# The result is a list with one gradient per source, in the same order.
print(dz_x1x2, type(dz_x1x2), sep="\n")

[<tf.Tensor: shape=(), dtype=float32, numpy=9.0>, <tf.Tensor: shape=(), dtype=float32, numpy=42.0>]
<class 'list'>


In [9]:
# Several formulas can be recorded inside one tape context; the gradient is
# then requested for whichever recorded result is of interest.
x1, x2 = tf.Variable(2.0), tf.Variable(3.0)
with tf.GradientTape() as tape:
    z = g(x1, x2)
    z1 = z + 3

# z1 differs from z only by an added constant, so dz1/dx1 equals dz/dx1.
dz1_x1 = tape.gradient(z1, x1)

print(dz1_x1)

tf.Tensor(9.0, shape=(), dtype=float32)


In [29]:
# Differentiate three target functions with respect to a single variable.
x = tf.Variable(5.0)
with tf.GradientTape() as tape:
    z1 = 3 * x
    z2 = x ** 2
    z3 = x ** 3
# With a list of targets the individual derivatives are summed:
# 3 + 2*5 + 3*5**2 = 3 + 10 + 75 = 88.
tape.gradient(target=[z1, z2, z3], sources=x)

<tf.Tensor: shape=(), dtype=float32, numpy=88.0>

In [31]:
# Second-order derivatives of g(x1, x2) = (x1 + 5) * (x2 ** 2) via nested tapes.
x1 = tf.Variable(2.0)
x2 = tf.Variable(3.0)
# The outer tape serves one gradient() call per first-order gradient (two in
# total), so it must be persistent. The inner tape serves exactly one
# gradient() call, so it is left non-persistent and frees its resources as
# soon as that call returns.
with tf.GradientTape(persistent=True) as outer_tape:
    with tf.GradientTape() as inner_tape:
        z = g(x1, x2)
    # Computed inside the outer context so the outer tape records the
    # first-order gradient computation and can differentiate it again.
    # inner_grads is a list: [dz/dx1, dz/dx2].
    inner_grads = inner_tape.gradient(z, [x1, x2])
outer_grads = [outer_tape.gradient(inner_grad, [x1, x2])
               for inner_grad in inner_grads]
print(outer_grads)
# Remove the tape references; only the persistent outer tape actually needs
# this in order to release what it recorded.
del inner_tape
del outer_tape
# Result layout: [[d2z/dx1dx1, d2z/dx1dx2], [d2z/dx2dx1, d2z/dx2dx2]].
# The first entry is None because dz/dx1 = x2 ** 2 does not depend on x1.

[[None, <tf.Tensor: shape=(), dtype=float32, numpy=6.0>], [<tf.Tensor: shape=(), dtype=float32, numpy=6.0>, <tf.Tensor: shape=(), dtype=float32, numpy=14.0>]]


In [34]:
# Hand-written gradient descent on f.
learning_rate = 0.1
# Arbitrary starting point; x plays the role a weight w would play in a model
# (where the initial value would typically come from e.g. Glorot uniform).
x = tf.Variable(-1.0)

for _ in range(100):
    with tf.GradientTape() as tape:
        z = f(x)
    dz_dx = tape.gradient(z, x)
    # Update x in place: x <- x - learning_rate * df/dx.
    x.assign_sub(dz_dx * learning_rate)
# x ends up at the minimiser of the quadratic, -b / (2a) = -2 / 6 = -1/3,
# i.e. the point where the "loss" f is smallest.
print(x)

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=-0.33333334>


In [15]:
# Combine GradientTape with a Keras optimizer.
learning_rate = 0.1
x = tf.Variable(2.0)
# An optimizer encapsulates the rule that turns gradients into variable updates.
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias
# that newer Keras releases warn about or reject.
optimizer = keras.optimizers.SGD(learning_rate=learning_rate)

for _ in range(100):
    with tf.GradientTape() as tape:
        z = f(x)
    dz_dx = tape.gradient(z, x)
    # apply_gradients expects an iterable of (gradient, variable) pairs, hence
    # the list wrapping a tuple. The optimizer applies the learning rate
    # itself; for plain SGD the update is x -= learning_rate * dz_dx, and the
    # same call works unchanged for tensor-valued variables.
    optimizer.apply_gradients([(dz_dx, x)])
print(x)
# Optimizers differ in how they turn gradients into updates (for example
# momentum or per-parameter adaptive step sizes), not only in how the
# learning rate evolves.

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=-0.3333333>


In [16]:
# zip pairs up items positionally; materialise the pairs as a list of tuples.
[*zip((1, 2, 3), (4, 5, 6))]

[(1, 4), (2, 5), (3, 6)]