In [66]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

In [67]:
df = pd.read_csv("cardio_train.csv", delimiter=";", index_col=0)
df.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [68]:
pcs = MinMaxScaler()
variaveis_continuas = ["age", "height", "weight", "ap_hi", "ap_lo", "cholesterol"]
df[variaveis_continuas] = pcs.fit(df[variaveis_continuas]).transform(df[variaveis_continuas])

In [69]:
df.gender.value_counts()

1    45530
2    24470
Name: gender, dtype: int64

In [70]:
df.gender = df.gender.apply(lambda genero: 0 if genero == 2 else genero)

In [6]:
df.columns

Index(['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol',
       'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')

In [71]:
x, y = df.iloc[:,:-1].to_numpy(), df.iloc[:,-1].to_numpy()

In [72]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
X_train = X_train.T
X_test = X_test.T
y_train = y_train.reshape((1, X_train.shape[1]))
y_test = y_test.reshape((1, X_test.shape[1]))

### Cada layer terá uma matriz de pesos W e um vetor bias b associado. A dimensão da matriz W é definida com a quantidade de linhas sendo igual a quantidade de nodos no layer e a quantidade colunas como a quantidade de inputs do layer. O vetor bias terá um bias para cada nodo layer

In [73]:
def get_parametros_iniciais(tamanho_layers):
    parametros = []
    for indice in range(1,len(tamanho_layers)):
        parametros.append([np.random.randn(tamanho_layers[indice], tamanho_layers[indice-1]) * (2/np.sqrt(tamanho_layers[indice-1])),
                           np.zeros((tamanho_layers[indice], 1))])
    return parametros
        

### Definição das funções de ativação utilizadas: sigmóide e relu

In [74]:
def sigmoid(Z):
    """
    Implements the sigmoid activation in numpy
    
    Arguments:
    Z -- numpy array of any shape
    
    Returns:
    A -- output of sigmoid(z), same shape as Z
    cache -- returns Z as well, useful during backpropagation
    """
    
    A = 1/(1+np.exp(-Z))
    
    return A

def relu(Z):
    """
    Implement the RELU function.

    Arguments:
    Z -- Output of the linear layer, of any shape

    Returns:
    A -- Post-activation parameter, of the same shape as Z
    cache -- a python dictionary containing "A" ; stored for computing the backward pass efficiently
    """
    
    A = np.maximum(0,Z)

    return A

### Esquema forward propagation
<img src="imagens/model_architecture_kiank.png" style="width:600px;height:300px;">

In [183]:
def forward_propagation(X, parametros):
    cache = []
    A = X
    quantidade_layers = len(parametros)
    for indice_layer in range(0, quantidade_layers-1): # foward propagation até o ultimo layer antes do layer output
        #print(indice_layer)
        W = parametros[indice_layer][0]
        b = parametros[indice_layer][1]
        print(W.shape, A.shape, b.shape)
        Z = np.dot(W, A) + b
        cache.append((A, Z, W, b))
        A = relu(Z)
    W, b = parametros[indice_layer+1] # como é um problema de classificao, o último layer deve obrigatoriamente ter
                                      #  a funcao sigmoide como funcao de ativação
    
    Z = np.dot(W, A) 
    print(W.shape, A.shape,Z.shape, b.shape)
    Z = Z.T + b
    cache.append((A, Z, W, b))
    A = sigmoid(Z.T)
    
    return A, cache

### A função custo usada aqui é definida como: $$-\frac{1}{m} \sum\limits_{i = 1}^{m} (y^{(i)}\log\left(a^{[L] (i)}\right) + (1-y^{(i)})\log\left(1- a^{[L](i)}\right)) \tag{7}$$

In [83]:
def get_custo(A, Y):    
    m = Y.shape[1]
    if 0 in A or 1 in a:
        print("zerro")
    custo = (1./m) * (-np.dot(Y,np.log(A).T) - np.dot(1-Y, np.log(1-A).T))
    custo = float(np.squeeze(custo))
    return custo

In [59]:
tamanho_layers = [len(df.columns)-1, 6, 9, 5, 1]

In [60]:
parametros = get_parametros_iniciais(tamanho_layers)

In [61]:
A, cache = forward_propagation(X_train, parametros)

In [84]:
def derivada_relu(dA, Z):
    dZ = np.array(dA, copy=True) 
    dZ[Z <= 0] = 0
    return dZ

def derivada_sigmoide(dA, Z):
    s = 1/(1+np.exp(-Z))
    dZ = dA * s * (1-s)    
    return dZ

In [85]:
def backward_propagation(A, cache, Y):
    L = len(cache)
    m = A.shape[1]
    Y = Y.reshape(A.shape)
    gradientes = []
    
    dA = - (np.divide(Y, A) - np.divide(1 - Y, 1 - A)) # derivada do custo em função de A
    A_prev, Z, W, b = cache[-1]
    dZ = derivada_sigmoide(dA, Z)

    dA_layer_antetior = np.dot(W.T, dZ)
    dW = np.dot(dZ, A_prev.T) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    gradientes.append([dW, db])
    
    
    for l in (range(L-1))[::-1]:
        dA = dA_layer_antetior
        A_prev, Z, W, b = cache[l]
        dZ = derivada_relu(dA, Z)

        dA_layer_antetior = np.dot(W.T, dZ)
        dW = np.dot(dZ, A_prev.T) / m
        db = np.sum(dZ, axis=1, keepdims=True) / m
        gradientes.append([dW, db])
        
    return gradientes[::-1]


In [18]:
a = get_parametros_iniciais([4,3,2,1])

In [86]:
def get_parametros_atualizados(parametros, gradientes, taxa_aprendizado=0.05):
    L = len(parametros) 
    for l in range(L):
        parametros[l][0] -= taxa_aprendizado * gradientes[l][0]
        parametros[l][1] -= taxa_aprendizado * gradientes[l][1] 
    return parametros
    

In [110]:
def fit(X, Y, taxa_aprenzido, iteracoes, tamanhos_layers):
    global gradientes
    parametros = get_parametros_iniciais(tamanho_layers)
    custos = []
    for iteracao in range(iteracoes):
        A, cache = forward_propagation(X, parametros)
        custo = get_custo(A, Y)
        custos.append(custo)
        if iteracao % 500 == 0:
            print("Iteração: {} | Custo: {}".format(iteracao, custo))
        gradientes = backward_propagation(A, cache, Y)
        parametros = get_parametros_atualizados(parametros, gradientes)
    
    return parametros, custos        

In [94]:
parametros[0][0].flattern()

AttributeError: 'numpy.ndarray' object has no attribute 'flattern'

In [108]:
parametro = parametros[0]

In [109]:
gradientes = []

In [116]:
lista_parametros = np.concatenate([np.concatenate([parametro[0].flatten(), parametro[1].flatten()]) 
                                       for parametro in parametros])

In [117]:
tamanho_layers

[11, 10, 10, 8, 8, 6, 6, 4, 4, 2, 1]

In [175]:
def converte_array_em_parametros(array, tamanho_layers):
    parametros = []
    for index in range(1, len(tamanho_layers)):
        w_shape_linear = tamanho_layers[index]*tamanho_layers[index-1]
        w = array[:w_shape_linear].reshape((tamanho_layers[index], tamanho_layers[index-1]))
        b = array[w_shape_linear:w_shape_linear + tamanho_layers[index]].reshape((tamanho_layers[index], 1))
        parametros.append([w, b])
        array = array[w_shape_linear + tamanho_layers[index]:]
    return parametros

In [164]:
parametros = get_parametros_iniciais(tamanho_layers)

In [165]:
flat_parametros = np.concatenate([np.concatenate([parametro[0].flatten(), parametro[1].flatten()]) 
                                       for parametro in parametros])

In [166]:
for index in range(len(parametros)):
    print(np.array_equal(converte_array_em_parametros(flat_parametros, tamanho_layers)[index][0], parametros[index][0])
          and np.array_equal(converte_array_em_parametros(flat_parametros, tamanho_layers)[index][1], parametros[index][1]))

True
True
True
True
True
True
True
True
True
True


In [177]:
def verifica_backpropagation(parametros, gradientes, tamanho_layers, X, Y, epsilon=1e-7):
    parameters_values = np.concatenate([np.concatenate([parametro[0].flatten(), parametro[1].flatten()]) 
                                       for parametro in parametros])
    grads =  np.concatenate([np.concatenate([gradiente[0].flatten(), gradiente[1].flatten()]) 
                                       for gradiente in gradientes])
    
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))
    
    for i in range(num_parameters):
        
        # Compute J_plus[i]. Inputs: "parameters_values, epsilon". Output = "J_plus[i]".
        # "_" is used because the function you have to outputs two parameters but we only care about the first one
        ### START CODE HERE ### (approx. 3 lines)
        thetaplus =  np.copy(parameters_values)                                       # Step 1
        thetaplus[i] = thetaplus[i] + epsilon                                   # Step 2
        a = converte_array_em_parametros(thetaplus, tamanho_layers)
        J_plus[i], _ =  forward_propagation(X, a)  # Step 3
        J_plus[i] = get_custo(J_plus[i], Y)
        ### END CODE HERE ###
        
        # Compute J_minus[i]. Inputs: "parameters_values, epsilon". Output = "J_minus[i]".
        ### START CODE HERE ### (approx. 3 lines)
        thetaminus = np.copy(parameters_values)                                       # Step 1
        thetaminus[i] = thetaminus[i] - epsilon                                 # Step 2       
        b = converte_array_em_parametros(thetaminus, tamanho_layers)
        J_minus[i], _ = forward_propagation(X, b) # Step 3
        J_minus[i] = get_custo(J_minus[i], Y)
        ### END CODE HERE ###
        
        # Compute gradapprox[i]
        ### START CODE HERE ### (approx. 1 line)
        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)
        ### END CODE HERE ###
    
    # Compare gradapprox to backward propagation gradients by computing difference.
    ### START CODE HERE ### (approx. 1 line)
    numerator = np.linalg.norm(grad - gradapprox)                                     # Step 1'
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)                   # Step 2'
    difference = numerator / denominator                                              # Step 3'
    ### END CODE HERE ###

    if difference > 1e-7:
        print("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")
    
    return difference
    

In [184]:
verifica_backpropagation(parametros, gradientes, tamanho_layers, X_train, y_train)

(10, 11) (11, 56000) (10, 1)
(10, 10) (10, 56000) (10, 1)
(8, 10) (10, 56000) (8, 1)
(8, 8) (8, 56000) (8, 1)
(6, 8) (8, 56000) (6, 1)
(6, 6) (6, 56000) (6, 1)
(4, 6) (6, 56000) (4, 1)
(4, 4) (4, 56000) (4, 1)
(2, 4) (4, 56000) (2, 1)
(1, 2) (2, 56000) (1, 56000) (1, 1)


ValueError: could not broadcast input array from shape (56000) into shape (1)

In [115]:
 np.concatenate([np.concatenate([gradiente[0].flatten(), gradiente[1].flatten()]) 
                                       for gradiente in gradientes])

array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00, -1.24914528e-04,  1.07478068e-04,
        2.01755116e-07, -3.32213752e-05, -1.97849694e-06, -3.67136953e-07,
       -1.44192628e-04, -2.15693662e-04, -3.22384185e-05,  3.41651912e-06,
       -2.86979477e-06,  1.71696855e-05,  8.45778446e-06,  1.26625248e-05,
        1.07165480e-05,  1.21786666e-06,  4.86934933e-07,  6.69325320e-05,
        2.85295258e-05,  2.69479076e-07,  4.19465265e-08,  3.73852588e-06,
        2.79933113e-04, -2.70490618e-05,  9.72444105e-05,  3.00878039e-05,
        4.84994422e-06,  4.76921485e-06,  1.39620595e-04, -2.27108575e-05,
        0.00000000e+00, -

In [111]:
tamanho_layers = [len(df.columns)-1, 10, 10, 8, 8, 6, 6, 4, 4, 2, 1]
parametros, custos = fit(X_train, y_train, 0.05, 200000, tamanho_layers)

Iteração: 0 | Custo: 0.6931606018309444


KeyboardInterrupt: 