In [1]:
import numpy as np
from datetime import datetime

##### Ancillary Functions:
We will start by writing support functions:

In [2]:
def softmax(Z):
    '''
    Computes the column-wise softmax of the input.

    Each column of Z is treated as one sample's vector of class scores.
    The column maximum is subtracted before exponentiating: this leaves the
    result mathematically unchanged but keeps np.exp from overflowing to inf
    (which would turn the output into nan) when the scores are large.

    Inputs:
    Z = np.array of shape (n_classes, n_samples)

    Outputs:
    S = np.array of shape (n_classes, n_samples), every column sums to 1
    cache = Z, handed back unchanged for use in the backward pass
    '''

    # Shift so that the largest score in every column is 0.
    shifted = Z - np.max(Z, axis=0, keepdims=True)

    x_exp = np.exp(shifted)
    sum_x_exp = np.sum(x_exp, axis=0, keepdims=True)
    assert(sum_x_exp.shape == (1, Z.shape[1]))

    S = x_exp / sum_x_exp
    assert(S.shape == Z.shape)
    cache = Z

    return S, cache

In [3]:
def softmax_backward(dA, cache):
    '''
    Back-propagates a gradient through the column-wise softmax.

    Inputs:
    dA = gradient of the cost w.r.t. the softmax output,
         np.array of shape (n_classes, n_samples)
    cache = Z, the pre-activation values the softmax was applied to,
            same shape as dA

    Outputs:
    dZ = gradient of the cost w.r.t. Z, same shape as Z
    '''

    Z = cache

    # Recompute the softmax output from the cached scores. The column maximum
    # is subtracted first so that np.exp cannot overflow for large scores.
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_shifted = np.exp(shifted)
    s = exp_shifted / np.sum(exp_shifted, axis=0, keepdims=True)

    # Softmax couples all classes of one sample:
    #     d s_i / d Z_j = s_i * (delta_ij - s_j)
    # so the chain rule needs the full Jacobian, not only its diagonal:
    #     dZ_j = s_j * (dA_j - sum_i dA_i * s_i)
    weighted_sum = np.sum(dA * s, axis=0, keepdims=True)
    dZ = s * (dA - weighted_sum)

    return dZ

In [5]:
def cost_function(AL, Y):
    '''
    Mean negative log-likelihood of the labels under the predictions.

    Inputs:
    AL = predicted values, np.array with samples along axis 1
    Y = label array of the same shape as AL

    Outputs:
    cost = -(1/m) * sum over samples of log(sum over rows of AL * Y),
           where m = AL.shape[1]
    '''

    assert(AL.shape == Y.shape)

    n_samples = AL.shape[1]

    # With one-hot columns in Y this picks out, for every sample, the
    # prediction that belongs to the labelled row.
    selected = np.sum(AL * Y, axis=0, keepdims=True)
    log_likelihood = np.log(selected)

    return (-1 / n_samples) * np.sum(log_likelihood)

In [6]:
def relu(Z):
    '''
    Rectified linear unit, applied element-wise.

    Inputs:
    Z = pre-activation values

    Outputs:
    A = element-wise maximum of 0 and Z
    cache = Z itself, handed back for use in the backward pass
    '''

    return np.maximum(0, Z), Z

In [7]:
def sigmoid(Z):
    '''
    Logistic sigmoid, applied element-wise.

    Inputs:
    Z = pre-activation values

    Outputs:
    A = 1 / (1 + exp(-Z))
    cache = Z itself, handed back for use in the backward pass
    '''

    denominator = 1 + np.exp(-Z)

    return 1 / denominator, Z

In [65]:
def init_params(layer_dims):
    '''
    Creates the initial weights and biases of a fully connected network.

    Weights are drawn from a standard normal distribution and scaled by
    sqrt(2 / fan_in), where fan_in = layer_dims[i-1] is the number of inputs
    feeding layer i (He initialization, the usual choice for ReLU layers).
    Biases start at zero.

    The draw uses NumPy's global random state; call np.random.seed(...)
    beforehand if a reproducible initialization is needed.

    Inputs:
    layer_dims = sequence of layer sizes, input size first

    Outputs:
    params = dict with keys 'W1', 'b1', ..., 'W<n>', 'b<n>' where
             W<i>.shape = (layer_dims[i], layer_dims[i-1]) and
             b<i>.shape = (layer_dims[i], 1)
    '''

    params = {}

    for i in range(1, len(layer_dims)):
        fan_in = layer_dims[i-1]
        fan_out = layer_dims[i]

        # Scale by the number of incoming connections so the variance of the
        # pre-activations stays roughly constant from layer to layer.
        params['W' + str(i)] = np.random.randn(fan_out, fan_in) * np.sqrt(2 / fan_in)
        params['b' + str(i)] = np.zeros((fan_out, 1))

    return params

In [8]:
def linear_forward(A, W, b):
    '''
    Computes the linear part of a layer, Z = W . A + b, from the layer's
    parameters and the previous layer's activation.

    Inputs:
    A = activation from the previous layer, A.shape = (n_(l-1), m)
    W = weights of layer l, W.shape = (n_l, n_(l-1))
    b = bias of layer l, b.shape = (n_l, 1)

    Outputs:
    Z = linear output of the layer, Z.shape = (n_l, m)
    cache = tuple (A, W, b), kept for use in back propagation
    '''

    weighted_inputs = np.dot(W, A)
    Z = weighted_inputs + b  # b is broadcast across the m sample columns

    return Z, (A, W, b)

In [10]:
def linear_backward(dZ, cache):
    '''
    Back-propagates a gradient through the linear step Z = W . A_prev + b.

    Inputs:
    dZ = gradient of the cost w.r.t. Z
    cache = tuple (A_prev, W, b) as stored by linear_forward

    Outputs:
    dA_prev = W.T . dZ, same shape as A_prev
    dW = dZ . A_prev.T, same shape as W
    db = dZ summed over axis 1 (the sample axis), same shape as b
    '''

    A_prev, W, b = cache

    grad_W = np.dot(dZ, A_prev.T)
    grad_b = np.sum(dZ, axis=1, keepdims=True)
    grad_A_prev = np.dot(W.T, dZ)

    # Every gradient must have the shape of the quantity it belongs to.
    assert (grad_A_prev.shape == A_prev.shape)
    assert (grad_W.shape == W.shape)
    assert (grad_b.shape == b.shape)

    return grad_A_prev, grad_W, grad_b
    

In [11]:
def relu_backward(dA, cache):
    '''
    Back-propagates a gradient through the ReLU activation.

    Inputs:
    dA = gradient of the cost w.r.t. the ReLU output
    cache = Z, the pre-activation values stored by relu; same shape as dA

    Outputs:
    dZ = copy of dA with entries set to 0 wherever Z <= 0
    '''

    Z = cache

    # Compare shape with shape (not with the array itself) so that the check
    # evaluates to a single boolean.
    assert(dA.shape == Z.shape)

    # Work on a copy so the caller's dA is left untouched.
    dZ = np.array(dA, copy=True)
    dZ[Z <= 0] = 0

    return dZ

In [12]:
def sigmoid_backward(dA, cache):
    '''
    Back-propagates a gradient through the sigmoid activation.

    Inputs:
    dA = gradient of the cost w.r.t. the sigmoid output
    cache = Z, the pre-activation values the sigmoid was applied to

    Outputs:
    dZ = dA * sigmoid(Z) * (1 - sigmoid(Z))
    '''

    Z = cache

    sig = 1/(1+np.exp(-Z))
    complement = 1-sig

    return dA * sig * complement

In [13]:
def activation_forward(A_prev, W, b, activation):
    '''
    Forward step of one layer: linear function followed by an activation.

    Inputs:
    A_prev = activation from the previous layer
    W = weights of the layer
    b = bias of the layer
    activation = name of the activation: 'relu', 'sigmoid' or 'softmax'

    Outputs:
    A = output of the activation function
    cache = (lin_cache, activ_cache) = ((A_prev, W, b), Z)

    Raises:
    ValueError if activation is not one of the supported names
    '''

    # Reject unsupported names up front instead of failing later on an
    # unassigned local variable.
    if activation not in ('relu', 'sigmoid', 'softmax'):
        raise ValueError('Unknown activation: ' + repr(activation))

    Z, lin_cache = linear_forward(A_prev, W, b)

    if activation == 'relu':
        A, activ_cache = relu(Z)
    elif activation == 'sigmoid':
        A, activ_cache = sigmoid(Z)
    else:  # 'softmax'
        A, activ_cache = softmax(Z)

    cache = lin_cache, activ_cache # cache = ((A_prev, W, b), Z)

    return A, cache

In [14]:
def activation_backward(dA, cache, activation):
    '''
    Backward step of one layer: through the activation, then through the
    linear function.

    Inputs:
    dA = gradient of the cost w.r.t. the layer's activation output
    cache = (lin_cache, activ_cache) = ((A_prev, W, b), Z) from the forward pass
    activation = name of the activation: 'relu', 'sigmoid' or 'softmax'

    Outputs:
    dA_prev = gradient w.r.t. the previous layer's activation
    dW = gradient w.r.t. the layer's weights
    db = gradient w.r.t. the layer's bias

    Raises:
    ValueError if activation is not one of the supported names
    '''

    # Reject unsupported names up front instead of failing later on an
    # unassigned local variable.
    if activation not in ('relu', 'sigmoid', 'softmax'):
        raise ValueError('Unknown activation: ' + repr(activation))

    lin_cache, activ_cache = cache # lin_cache = (A, W, b), activ_cache = Z

    if activation == 'relu':
        dZ = relu_backward(dA, activ_cache)
    elif activation == 'sigmoid':
        dZ = sigmoid_backward(dA, activ_cache)
    else:  # 'softmax'
        dZ = softmax_backward(dA, activ_cache)

    dA_prev, dW, db = linear_backward(dZ, lin_cache)

    return dA_prev, dW, db
    

In [15]:
def L_forward(X, parameters):
    '''
    Full forward pass: [linear -> relu] for every layer but the last,
    then linear -> softmax for the last layer.

    Inputs:
    X = input data, fed to the first layer
    parameters = dict with keys 'W1', 'b1', ..., 'W<n>', 'b<n>'

    Outputs:
    AL = output of the final softmax layer
    caches = list with one (lin_cache, activ_cache) entry per layer,
             in layer order
    '''

    caches = []
    n_layers = len(parameters)//2  # one W and one b per layer

    A = X

    # hidden layers
    for layer in range(1, n_layers):
        A, layer_cache = activation_forward(A,
                                            parameters['W' + str(layer)],
                                            parameters['b' + str(layer)],
                                            activation='relu')
        caches.append(layer_cache)

    # output layer
    AL, layer_cache = activation_forward(A,
                                         parameters['W' + str(n_layers)],
                                         parameters['b' + str(n_layers)],
                                         activation='softmax')
    caches.append(layer_cache)

    return AL, caches

In [16]:
def L_backward(AL, Y, caches):
    '''
    Full backward pass for the [linear -> relu] * (L-1) -> linear -> softmax
    network, trained with the cost computed by cost_function.

    Inputs:
    AL = softmax output of the forward pass, shape (n_classes, m)
    Y = one-hot labels, same shape as AL (every column must sum to 1)
    caches = list of (lin_cache, activ_cache) per layer, as built by L_forward

    Outputs:
    grads = dict with keys 'dW1', 'db1', ..., 'dW<L>', 'db<L>'
    '''

    grads = {}
    L = len(caches) # number of layers
    m = Y.shape[1] # number of samples

    # Output layer: softmax and the log-likelihood cost are differentiated
    # together. Chaining dAL = -(1/m) * Y / AL through the full softmax
    # Jacobian reduces to (AL - Y) / m when the columns of Y sum to 1; using
    # that closed form avoids dividing by AL, which can underflow to 0.
    lin_cache, _ = caches[L-1]
    dZL = (AL - Y) / m
    dA_prev, dW, db = linear_backward(dZL, lin_cache)
    grads['dW' + str(L)] = dW
    grads['db' + str(L)] = db

    # Hidden layers, from the last hidden layer back to the first.
    for i in reversed(range(L-1)):
        current_cache = caches[i]
        dA_prev, dW, db = activation_backward(dA_prev, current_cache, activation='relu')
        grads['dW' + str(i+1)] = dW
        grads['db' + str(i+1)] = db

    return grads

In [74]:
def update_parameters(parameters, grads, learning_rate):
    '''
    Applies one gradient-descent step to every weight and bias.

    Inputs:
    parameters = dict with keys 'W1', 'b1', ..., 'W<n>', 'b<n>'
    grads = dict with the matching keys 'dW1', 'db1', ..., 'dW<n>', 'db<n>'
    learning_rate = step size multiplying each gradient

    Outputs:
    parameters = the same dict object that was passed in, with every entry
                 rebound to (old value - learning_rate * gradient)
    '''

    n_layers = len(parameters) // 2

    for layer in range(1, n_layers + 1):
        for prefix in ('W', 'b'):
            key = prefix + str(layer)
            parameters[key] = parameters[key] - learning_rate * grads['d' + key]

    return parameters

In [18]:
def predict(X, parameters):
    '''
    Predicts a class for every sample and returns it one-hot encoded.

    Inputs:
    X = input data with samples along axis 1
    parameters = dict of trained weights and biases, as used by L_forward

    Outputs:
    one_hot = array with the shape and dtype of the network output; in every
              column the row with the largest output is 1 and all other
              rows are 0
    '''

    AL, _ = L_forward(X, parameters)

    # Row index of the most probable class for every sample (column).
    winners = np.argmax(AL, axis=0)

    # Pair each winning row with its column index and set those entries in
    # a single indexing operation.
    one_hot = np.zeros_like(AL)
    one_hot[winners, np.arange(AL.shape[1])] = 1

    return one_hot

In [46]:
def L_layer_model(X, Y, layer_dims, learning_rate=0.05, num_iterations=60, print_cost=True):
    '''
    Trains the network with full-batch gradient descent.

    Inputs:
    X = training inputs, passed to L_forward
    Y = training labels, same shape as the network output
    layer_dims = sequence of layer sizes, passed to init_params
    learning_rate = step size for update_parameters
    num_iterations = number of gradient-descent steps
    print_cost = if True, print the cost after the first iteration and after
                 every 100th iteration

    Outputs:
    parameters = dict of trained weights and biases
    '''

    costs = []  # cost of every iteration; collected here but not returned

    parameters = init_params(layer_dims)

    for step in range(1, num_iterations + 1):

        # forward pass and cost for the current parameters
        AL, caches = L_forward(X, parameters)
        cost = cost_function(AL, Y)
        costs.append(cost)

        # backward pass, then one gradient-descent step
        grads = L_backward(AL, Y, caches)
        parameters = update_parameters(parameters, grads, learning_rate)

        # the reported cost was measured before this step's update
        is_report_step = step == 1 or step % 100 == 0
        if print_cost and is_report_step:
            print('Cost after ' + str(step) + ' iterations is ' + str(cost) + ' at ' + str(datetime.now()))

    return parameters

### We can train our model now:

In [20]:
# Load the training inputs and labels from .npy files in the working directory.
# NOTE(review): the following cells treat axis 1 as the sample axis
# (m = X.shape[1]) -- confirm the files are stored with samples as columns.
X = np.load('train_x.npy')
Y = np.load('train_y.npy')

In [21]:
# Standardize every feature (row of X) to zero mean and unit variance.
m = X.shape[1]  # number of samples (columns of X)
mu = (1/m) * np.sum(X, axis=1, keepdims=True)
X = X - mu

# sigma is the per-feature standard deviation: the square root of the mean
# squared deviation. Dividing by the variance instead would not yield
# unit-variance features.
sigma = np.sqrt((1/m) * np.sum(X**2, axis=1, keepdims=True))

# A constant feature is all zeros after centering; divide it by 1 rather than
# by 0 so that it stays zero instead of becoming nan.
sigma[sigma == 0] = 1

X = X/sigma

In [17]:
# Layer sizes, input first: init_params creates one (W, b) pair for every
# consecutive pair of entries, so this describes 54 inputs, four hidden layers
# (70, 50, 30, 20 units) and 7 outputs.
layer_dims = (54, 70, 50, 30, 20, 7)

In [None]:
# Train the network on X and Y; the returned dict holds the learned weights
# and biases. With print_cost left at its default, progress is printed after
# the first iteration and after every 100th.
params = L_layer_model(X, Y, layer_dims, learning_rate =0.1, num_iterations=1000)

In [31]:
def accuracy(Y_hat, Y):
    '''
    Fraction of samples whose predicted column equals the label column
    in every entry.

    Inputs:
    Y_hat = predictions, 2-D np.array with samples along axis 1
    Y = labels, same shape as Y_hat

    Outputs:
    fraction (float) of columns where Y_hat and Y agree completely
    '''

    assert(Y_hat.shape == Y.shape)

    n_samples = Y.shape[1] # number of data entries

    # A sample counts as correct only if all of its rows match.
    column_matches = np.all(Y_hat == Y, axis=0)
    n_correct = int(np.count_nonzero(column_matches))

    return n_correct / n_samples
    

In [32]:
# One-hot predictions of the trained network for the training inputs.
out = predict(X, params)
# squeeze removes any length-1 axes; with more than one class and more than
# one sample the shape is unchanged.
out = out.squeeze()

In [None]:
# Accuracy on the training data: fraction of columns of `out` that match the
# corresponding column of Y exactly.
accuracy(out, Y)

### Test the neuronex on a different set:

In [21]:
# Load a second dataset. NOTE: this rebinds X and Y, so the first dataset's
# arrays are no longer reachable under these names.
X = np.load('abc_train_X.npy')
Y = np.load('abc_train_Y.npy')

In [None]:
# Display the shapes of the newly loaded arrays.
X.shape, Y.shape

In [32]:
# Build a two-row one-hot label matrix from the first row of Y:
# row 1 marks the samples where Y[0, i] == 1, row 0 marks all other samples.
# The number of columns is taken from Y itself rather than hard-coded.
n_samples = Y.shape[1]
is_positive = Y[0, :] == 1

Z = np.zeros((2, n_samples))
Z[1, is_positive] = 1
Z[0, ~is_positive] = 1

In [54]:
# Layer sizes for the second dataset, input first: 2 inputs, hidden layers of
# 10 and 5 units, and 2 outputs (one per row of the one-hot labels Z).
layer_dims = (2, 10, 5, 2)

In [None]:
# Train on the second dataset, using the one-hot matrix Z as labels.
# NOTE: this rebinds params, replacing the parameters of the first model.
# NOTE(review): unlike the first run, X is used here without the mean/scale
# normalization step -- confirm this is intended.
params = L_layer_model(X, Z, layer_dims, learning_rate =0.01, num_iterations=100)