# Build my own deep learning functions

## Purpose

Learn by practicing. Notations follow Andrew Ng's Coursera deep learning course.

### Build the forward_prop function
Note: If there is error while importing python modules while running this notebook in vscode, make sure the both the vscode python interpreter and ipython kernal are both set properly. 

### Model structure
The model consist of L-1 relu layers and one sigmoid layer.

In [11]:
import numpy as np
import h5py
import matplotlib.pyplot as plt

%matplotlib inline
np.random.seed(1) 
# TODO: with certain seeds, e.g. seed=1, the cost generates NAN


In [12]:
# Below function taken from course assignments
def load_data():
    train_dataset = h5py.File('datasets/train_catvnoncat.h5', "r")
    train_set_x_orig = np.array(train_dataset["train_set_x"][:])
    train_set_y_orig = np.array(train_dataset["train_set_y"][:])

    test_dataset = h5py.File('datasets/test_catvnoncat.h5', "r")
    test_set_x_orig = np.array(test_dataset["test_set_x"][:])
    test_set_y_orig = np.array(test_dataset["test_set_y"][:])

    classes = np.array(test_dataset["list_classes"][:])
    
    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))
    
    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes

In [13]:
def relu(Z):
    '''
    Z: Input to the activate function

    A: Output of the relu function
    '''
    A = np.maximum(0, Z)
    assert(np.min(A) >= 0.0)
    return A


In [14]:
def forward_prop(X, Y, parameters):
    '''
    # TODO: What's the row # variable?
    X: Input data. (n0???, m). n0: feature #; m: # of examples
    Y: Labels. (1, m)
    parameters: The model parameters

    cost: The return
    cache: The intermediate values
    '''
    L = len(parameters) // 2
    Xl = X
    m = Y.shape[1]
    caches = {}
    for l in range(1, L):
        # print('l=', l)
        W = parameters['W'+str(l)]
        b = parameters['b'+str(l)]

        assert(W.shape[0] == b.shape[0])
        Z = np.dot(W, Xl) + b
        A = relu(Z)
        Xl = A
        caches['A'+str(l)] = A

    ZL = np.dot(parameters['W'+str(L)], Xl) + parameters['b'+str(L)]
    AL = 1 / (1 + np.exp(-ZL)) # sigmoid
    # print('ZL=', ZL)
    # print('AL=', AL)
    print('### debug ###: AL.max(), AL.min()=', AL.max(), AL.min())

    J = - np.dot(Y, np.log(AL).T) - np.dot(1 - Y, np.log(1 - AL).T)
    cost = np.squeeze(np.sum(J)) / m
    assert(cost.shape == ())

    return cost, caches


In [15]:
# load data and pre-processing
train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes = load_data()
# print('train_set_x_orig.shape=', train_set_x_orig.shape)

# plt.imshow(train_set_x_orig[7])
# plt.show()

train_set_x_flatten = train_set_x_orig.reshape(train_set_x_orig.shape[0], -1).T
test_set_x_flatten = test_set_x_orig.reshape(test_set_x_orig.shape[0], -1).T

# print('train_set_x_flatten.shape=', train_set_x_flatten.shape)
# print('test_set_x_flatten.shape=', test_set_x_flatten.shape)
# print(train_set_x_flatten)
# print(train_set_x_flatten.max())

train_set_x = train_set_x_flatten / 255.0


### Initialization
Initialize the parameters

### Lesson learned
When the weights are not initailized small, aka, without *0.01, the cost computation gives lots of NAN because the output are either too small or too large.

In [16]:
def model_init(layer_dims):
    parameters = {}
    for i in range(1, len(layer_dims)):
        parameters['W'+str(i)] = np.random.rand(layer_dims[i], layer_dims[i-1]) * 0.01 # the down-scaling is important
        parameters['b'+str(i)] = np.zeros((layer_dims[i], 1))
        # print(parameters['W'+str(i)].shape, parameters['b'+str(i)].shape)

    return parameters


### Back prop
The back propogation is to calculate the partial derivatives of the cost w.r.t. all the weights and biases. So that the learning algorithem can nudge the weights and biases to reduce the cost by a tiny bit. 

#### Notations
Following cousera's notation, Z1, Z2, etc. are the linear combination of the input with biases for layers 1, 2, and so on. A1, A2, etc. are the activation outputs for layers 1, 2, and so on.

$dZ_1=\frac{d cost}{d Z1}$

In [17]:
def back_prop(Y, parameters, caches):
    '''
    X: Input data. (n0???, m). n0: feature #; m: # of examples
    Y: Labels. (1, m)
    parameters: The model parameters

    grads: return the gradients
    '''
    grads = {}
    print('parameters[''W3''].shape=', parameters['W3'].shape)
    print('parameters[''b3''].shape=', parameters['b3'].shape)
    
    # The 3rd layer is a sigmoid layer
    grads['dZ3'] = caches['A3'] - Y
    grads['dW3'] = parameters['W3'] * grads['dZ3']
    grads['db3'] = grads['dZ3']

    # The 2nd layer is a RELU layer
    grads['dZ2'] = np.ones_like(caches['Z2']) * (caches['Z2'] > 0.0)
    print('caches[Z2]=', caches['Z2'])
    print('grads[Z2]=', grads['Z2'])
    grads['dW2'] = parameters['W2']
    
    # The 1st layer is a RELU layer
    grads['dZ1'] = np.ones_like(caches['Z1']) * (caches['Z1'] > 0.0)
    print('caches[Z1]=', caches['Z1'])
    print('grads[Z1]=', grads['Z1'])

    # m = 
    # dAl = 1/m

    # assert(len(grads) == ())

    return grads


### The overall model
Here is the overall learning model with hyper-parameters

In [18]:
def train_model(X, Y, layer_dims, number_of_iterations = 5, learning_rate = 0.01):
    # model initialization
    parameters = model_init(layer_dims)

    # results
    costs = np.zeros((number_of_iterations, 1))

    for i in range(number_of_iterations):
        
        costs[i], caches = forward_prop(train_set_x, train_set_y_orig, parameters)
        # print('len(train_set_y_orig)=', len(train_set_y_orig))
        # print('train_set_y_orig.shape', train_set_y_orig.shape)
        # print(tmp)
        
        grads = back_prop(train_set_y_orig, parameters, caches)

        parameters = parameters - grads * learning_rate

        print('i=', i)

    return parameters


In [19]:
layer_dims = [train_set_x.shape[0], 8, 4, 1]
model = train_model(train_set_x, train_set_y_orig, layer_dims, 2)
# predict(model, test_set_x_orig, test_set_y_orig)

### debug ###: AL.max(), AL.min()= 0.5064198968292074 0.500488435708394
parameters[W3].shape= (1, 4)
parameters[b3].shape= (1, 1)


KeyError: 'A3'

In [None]:
gate = np.random.rand(5, 1)
test_relu_grad = np.ones_like(gate)
test_relu_grad = test_relu_grad * (gate > 0.5)
print('test_relu_grad=', test_relu_grad)
print('gate=', gate)
print('test_relu_grad=', test_relu_grad)


test_relu_grad= [[1.]
 [1.]
 [1.]
 [1.]
 [1.]]
gate= [[0.93339344]
 [0.52109101]
 [0.96513247]
 [0.62705207]
 [0.75136111]]
test_relu_grad= [[1.]
 [1.]
 [1.]
 [1.]
 [1.]]
