In [1]:
import numpy as np
import h5py
import matplotlib.pyplot as plt
from testCases import *
from dnn_utils import *

%matplotlib inline

plt.rcParams["figure.figsize"] = (5.0, 4.0)

plt.rcParams["image.interpolation"] = "nearset"

plt.rcParams["image.cmap"] = "gray"

np.random.seed(1)


In [2]:

train_x_orig, train_y_orig, test_x_orig, test_y_orig, classes = load_data()

train_x = train_x_orig.reshape(train_x_orig.shape[0],-1).T/225

train_y = train_y_orig

test_x = test_x_orig.reshape(test_x_orig.shape[0], -1).T /225

test_y = test_y_orig

In [3]:
def initialize(dim_info):
    
    np.random.seed(1)
    
    parameters = {}
    
    dim_len = len(dim_info)
    
    for l in range(1, dim_len):
        
        parameters["W"+str(l)] = np.random.randn(dim_info[l], dim_info[l-1])*np.sqrt(2/dim_info[l - 1]) 
        
        
        # 将w进行简单归一化处理
        
        parameters["b"+str(l)] = np.zeros((dim_info[l], 1))
        
        
    return parameters

    
    

In [4]:
def line_forward(A, W, b):

    Z = np.dot(W, A) + b
    
    forward_Z_parameters = (A, W, b)
    
    return Z, forward_Z_parameters


In [5]:
def line_activate(A_prev, W, b, activation):
    
    Z, forward_Z_parameters = line_forward(A_prev, W, b)
    
    if activation == "sigmoid":
        A = sigmoid(Z)
    
    else:
        activation == "rule"
        A = relu(Z)
        
    forward_activation_parameters = (forward_Z_parameters, Z)
    
    return A, forward_activation_parameters
 

In [6]:
def line_model(X, parameters):
    
    forward_model_cache = []
    
    A = X
    
    layer_deep = len(parameters)//2
    
    for l in range(1,layer_deep): #  对网络层进行遍历 
        A_prev = A
        
        A, activation_parameters = line_activate(A_prev, parameters['W'+str(l)], parameters['b'+str(l)], activation = "rule")
        # 输入层与隐藏层使用rule激活函数
        
        forward_model_cache.append(activation_parameters)
        
    Al, activation_parameters = line_activate(A, parameters["W"+str(layer_deep)], parameters["b"+str(layer_deep)], activation = 'sigmoid')
    # 输出层的Al参数
    
    forward_model_cache.append(activation_parameters)
    
    return Al, forward_model_cache
        
        
        
        

<span class="burk">正向传播使用droupout</span>

In [7]:
def line_model_droupout(X, parameters, dropout_num, seed, keep_dropout):
    
    forward_model_cache = []
    
    random_dropout_cache = []
    
    layer_deep = len(parameters)//2
    
    A = X
    
    
    for l in range(1, layer_deep):
        A_prev = A
        
        A, activation_parameters = line_activate(A_prev, parameters["W"+str(l)], parameters["b"+str(l)],activation = 'rule')
        
        forward_model_cache.append(activation_parameters)
        
        if keep_dropout =="True":

            np.random.seed(seed+1)

            random_del = np.random.rand(A.shape[0], A.shape[1])
           
            zero_random = random_del < dropout_num # 失活百分比
            

            result_num = A * zero_random

            A = result_num / dropout_num # 使失活后的期望值与没失活之前的期望值保持一致
            
            random_dropout_cache.append(zero_random) # 将失活的比值储存 后面进行反向传播时再调用
        else: 
            continue
        
    Al, activation_parameters = line_activate(A, parameters["W"+str(layer_deep)], parameters["b"+str(layer_deep)], activation = 'sigmoid')
    
    forward_model_cache.append(activation_parameters)
    
    # 输出层不使用droupout失活 
    
    return Al, forward_model_cache, random_dropout_cache



In [8]:
def computer_cost(Al, Y):
    
    m = Y.shape[1]
    
    cost = (-1/m ) * np.sum(np.multiply(Y, np.log(Al)) + np.multiply(1 - Y, np.log(1 - Al)))
    
    return cost


In [9]:
def back_forward(dz, forward_Z_parameters):
    
    a_prev, W, b = forward_Z_parameters
    
    m = a_prev.shape[1]
    
    dW = np.dot(dz, forward_Z_parameters[0].T)/m
    
    db = np.sum(dz, axis=1, keepdims = True )/ m
    
    dA = np.dot(forward_Z_parameters[1].T, dz)
    
    return dA, dW, db



In [10]:
def line_activation_backward(dA, forward_model_cache, activation):
    
    linear_caches, activation_cache = forward_model_cache
    
    if activation == "relu":
        dz = relu_backward(dA, activation_cache)
        
    else:
        activation == "sigmoid"
        dz = sigmoid_backward(dA, activation_cache)
        
    da_prev, dw, db = back_forward(dz, linear_caches)
    # 根据这一步得出来的dz 算出上一网络层的偏导数da 与当前网络层的偏导数dw， db
    
    return da_prev, dw, db

        
        
        

In [11]:
def line_model_backward(Al, y, forward_model_cache ):
    
    grades = {}
    
    l = len(forward_model_cache)
    
    m = y.shape[1]
    
    y = y.reshape(Al.shape)
    
    dal = -(np.divide(y, Al) - np.divide(1-y, 1-Al))
    
    last_layer_parameters = forward_model_cache[-1]
    
    grades["dA"+str(l-1)], grades["dW"+str(l)], grades["db"+str(l)] = line_activation_backward(dal, last_layer_parameters, activation='sigmoid')

    # 根据da得出倒数第二层的dA,跟倒数第一层的dW，db
    
    
    for i in reversed(range(1, l)):
        grades["dA"+str(i - 1)],grades['dW'+str(i)], grades["db"+str(i)] = line_activation_backward(grades["dA"+str(i)], forward_model_cache[i-1], activation = 'relu')
        
        
    return grades

    
    

<span class="burk"># 反向失活函数</span>

In [12]:
def line_modle_backward_droupout(Al, y, forward_model_cache, dropout_cache, keep_dropout, drop_num = 0.5):
    
    grades = {}
    
    l = len(forward_model_cache)

    m = y.shape[1]
    
    y = y.reshape(Al.shape)
    
    # dal = -(np.divide(y, Al)) - np.divide(1-y, 1-Al)
    
    dal = -(np.divide(y, Al) - np.divide(1-y, 1-Al))
    
    last_layer_parameters = forward_model_cache[-1]
    
    grades["dA"+str(l - 1)], grades["dW"+ str(l)], grades['db'+str(l)] = line_activation_backward(dal, last_layer_parameters, activation='sigmoid')
    
    # if keep_dropout =="True":
    
        # grades["dA"+str(l - 1)] = grades['dA'+str(l - 1)] * dropout_cache[l - 1]

        # grades["dA"+str(l - 1)] = grades["dA"+str(l - 1)] / droup_num
        
        # 隐藏层最后一层梯度

    for i in reversed(range(1, l)):
        grades['dA'+ str(i - 1)], grades["dW"+str(i)], grades["db"+str(i)] = line_activation_backward(grades['dA'+ str(i)], forward_model_cache[i - 1], activation ='relu' )
        
        if keep_dropout =="True" and i != 1: 
            # 这里只对隐藏层进行反向的dropout梯度计算 在前向传播时也只对隐藏层进行dropout
        
            grades['dA'+ str(i - 1)] = grades["dA"+str(i - 1)] * dropout_cache[i - 2]

            grades["dA"+ str(i - 1)] = grades['dA'+str(i - 1)] / drop_num # 使期望值保持一致
            
        else:
            continue
    
     
    return grades 
    

In [13]:
def updata_parameters(parameters, grads, learning_rate):
    
    L = len(parameters)//2

    for i in range(1,L+1):
        
        parameters["W"+ str(i)] = parameters["W"+ str(i)] - grads['dW'+str(i)]*learning_rate
        
        
        parameters["b"+str(i)] = parameters["b"+str(i)] - grads['db'+str(i)]*learning_rate
    
        
    return parameters
            
    
    
    

<span class="burk">Aadm梯度下降</span>

In [14]:
def updata_parameters_Adam(parameters, grads, learning_rate, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8,  keep_Adam = "True"):
    
    # 这里使用Aadm梯度下降是要注意是对更新函数时的梯度进行Aadm修正
    
    L = len(parameters) // 2
    
    v_corrected = {}
    
    s_corrected = {}
    
    if keep_Adam == "True":
    
        for l in range(L):
            print('Adam')
            
            v['dW'+str(l + 1)] = beta1 * grads["dW"+str(l + 1)] + (1 - beta1) * grads["dW"+str(l + 1)]
            v['db'+str(l + 1)] = beta1 * grads["db"+str(l + 1)] + (1 - beta1) * grads["db"+str(l + 1)]
            
            v_corrected["dW"+str(l + 1)] = v["dW"+str(l + 1)] / np.power(1 - beta1, 2) # 修正函数
            v_corrected["db"+str(l + 1)] = v["dW"+str(l + 1)] / np.power(1 - beta1, 2)
            
            s['dW'+str(l + 1)] = beta2 * grads['dW'+str(l + 1)] + (1 - beta2) * np.power(grads["dW"+str(l + 1)], 2) # 进行RSM 后面与指数加平均的区别是后面是乘梯度的平方
            s['db'+str(l + 1)] = beta2 * grads["db"+str(l + 1)] + (1 - beta2) * np.power(grads["db"+str(l + 1)], 2)
            
            s_corrected['dW'+str(l + 1)] = s["dW"+str(l + 1)] / np.power(1 - beta2, 2)
            s_corrected['db'+str(l + 1)] = s['db'+str(l + 1)] / np.power(1 - beta2, 2)
            
            parameters["W"+str(l + 1)] = parameters['dW'+str(l + 1)] - learning_rate * v_corrected["dW"+str(l + 1)] / np.sqrt(s_corrected["dW"+str(l + 1)] + epsilon)
            parameters['b'+str(l + 1)] = parameters["db"+str(l + 1)] - learning_rate * v_corrected['db'+str(l + 1)] / np.sqrt(s_corrected["db"+str(l + 1)] + epsilon)
    else:
        keep_Adam == 'False'
        for l in range(L):
            parameters["W"+str(l + 1)] = parameters["W"+str(l + 1)] - learning_rate * grads["dW"+str(l + 1)]
            parameters['b'+str(l + 1)] = parameters["b"+str(l + 1)] - learning_rate * grads["db"+str(l + 1)]

    return parameters


In [15]:
def dnn_model(x,
              y,
              dim_info,
              learning_rate = 0.0075,
              num_iterations = 100,
              print_cost = True,
              dropout_num = 1,
              keep_dropout = False,
              decay_rate = 0.5,
              keep_Adam = "True",
              beta1 = 0.9,
              beta2 = 0.999,
              epsilon = 1e-8):

    # decay_rate 动量梯度下降超参数

    np.random.seed(1)
    costs = []
    m = y.shape[1]

    parameters = initialize(dim_info)

    seed = 0

    for i in range(0, num_iterations):

        # Al, forward_model_cache = line_model(x, parameters)
        seed += 1

        Al, forward_model_cache, dropout_cache = line_model_droupout(
            x, parameters, dropout_num, seed, keep_dropout)

        cost = computer_cost(Al, y)

        # grades = line_model_backward(Al, y, forward_model_cache) # 进行反向传播

        grades = line_modle_backward_droupout(Al, y, forward_model_cache,
                                              dropout_cache, keep_dropout,
                                              dropout_num)
        
        # 动态梯度下降

        if i % 500 == 0:

            epoch_num = i // 500
            print('epoch_num:', epoch_num)

            learning_rate = 1 / (1 + decay_rate * epoch_num) * learning_rate

            print('learning_rate:', learning_rate)

        # parameters = updata_parameters(parameters, grades, learning_rate)
        
        parameters = updata_parameters_Adam(parameters, grades, learning_rate, beta1 = 0.9, beta2 = 0.09, keep_Adam = "False", epsilon = 1e-8)

        if i % 100 == 0:
            if print_cost and i > 0:
                print('训练%i次后的成本是: %f' % (i, cost))
                # print(dropout_cache[1])
            costs.append(cost)

    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations(per tens)')
    plt.title('Learning rate =' + str(learning_rate))

    plt.show()

    return parameters

In [16]:
dim_info = [12288, 30, 8, 6, 1]

ßß
parameters = dnn_model(train_x, train_y, dim_info, learning_rate=0.0075, num_iterations = 4000, print_cost = True, dropout_num = 0.7, keep_dropout = "True", decay_rate = 0.5, beta1 = 0.9, beta2 = 0.99, keep_Adam = 'True', epsilon = 1e-8)





epoch_num: 0
learning_rate: 0.0075
训练100次后的成本是: 0.673286
训练200次后的成本是: 0.651721
训练300次后的成本是: 0.602577
训练400次后的成本是: 0.587147
epoch_num: 1
learning_rate: 0.004999999999999999
训练500次后的成本是: 0.516014
训练600次后的成本是: 0.463296
训练700次后的成本是: 0.441060


KeyboardInterrupt: 

In [None]:
def predict(X, parameters):
    m = X.shape[1]
    
    n = len(parameters) //2
    
    p = np.zeros((1,m))
    
    probas, caches = line_model(X, parameters) # 这里要记住在实际预测时要把droupout关闭在进行预测
    
    for i in range(0, probas.shape[1]):
        if probas[0, i] > 0.5:
            p[0, i] = 1
        else:
            p[0, i] = 0
            
    return p
    
    

In [None]:
pred_train = predict(train_x, parameters)

print('预测集的准确率是:'+str(np.sum((pred_train == train_y) / train_x.shape[1])))


In [None]:
pred_test = predict(test_x, parameters)

print('测试集的准确率是:'+str(np.sum((pred_test == test_y) / test_x.shape[1])))