**End to end neural net practice**

###Objectives:
1.   Difference in results with different initialization values i.e random, zeros and He initialization.
2.   Add regularization to reduce overfitting
3.   Optimize gradient descent using momentum, RMS prop and ADAM



In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

%matplotlib inline
plt.rcParams['figure.figsize'] = (7.0, 4.0)
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

###Functions

In [2]:
def load_data():
  np.random.seed(1)
  m = 4000 # number of examples
  N = int(m/2) # number of points per class
  D = 2 # dimensionality
  X = np.zeros((m,D)) # data matrix where each row is a single example
  Y = np.zeros((m,1), dtype='uint8') # labels vector (0 for red, 1 for blue)
  a = 4 # maximum ray of the flower

  for j in range(2):
      ix = range(N*j,N*(j+1))
      t = np.linspace(j*3.12,(j+1)*3.12,N) + np.random.randn(N)*0.2 # theta
      r = a*np.sin(4*t) + np.random.randn(N)*0.2 # radius
      X[ix] = np.c_[r*np.sin(t), r*np.cos(t)]
      Y[ix] = j

  # X = X.T
  # Y = Y.T

  return train_test_split(X, Y, test_size=0.2, random_state=42)

def initialize_parameters(layer_dims, type="random"):
  L = len(layer_dims)
  parameters = {}
  for l in range(1, L):
    if type == 'random':
      parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * 10
    elif type == 'zeros':
      parameters['W' + str(l)] = np.zeros((layer_dims[l], layer_dims[l-1]))
    elif type == "he":
      parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * np.sqrt(2/layer_dims[l-1])
    parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
  return parameters

def sigmoid(Z):
  A = 1/(1 + np.exp(-Z))
  cache = A
  return A, cache

def relu(Z):
  A = np.maximum(0, Z)
  cache = A
  return A, cache

def relu_backwards(dA, cache):
  Z = cache
  dZ = np.array(dA, copy=True)
  # When z <= 0, you should set dz to 0 as well.
  dZ[Z <= 0] = 0
  return dZ

def sigmoid_backwards(dA, cache):
  Z = cache
  s = 1/(1+np.exp(-Z))
  dZ = dA * s * (1-s)
  return dZ

def calculate_cost(AL, Y):
  m = AL.shape[0]
  J = (1/m)* (-np.dot(Y, np.log(AL).T) - np.dot((1-Y), np.log(1 - AL).T))
  return np.squeeze(J)

def update_parameters(parameters, grads, learning_rate=0.01):
  L = len(parameters)//2
  for l in range(1, L):
    parameters['W' +  str(l)] = parameters['W' + str(l)] - learning_rate * grads['dW' + str(l)]
    parameters['b' +  str(l)] = parameters['b' + str(l)] - learning_rate * grads['db' + str(l)]
  return parameters


###Forward pass

In [3]:
def linear_forward(W, b, A):
  Z = np.dot(W, A) + b
  cache = Z
  return Z, cache

def linear_forward_activation(W, b, A, activation_function):
  Z, linear_cache = linear_forward(W, b, A)
  if activation_function == 'relu':
    A, activation_cache = relu(Z)
  elif activation_function == 'sigmoid':
    A, activation_cache = sigmoid(Z)
  cache = (linear_cache, activation_cache)
  return A, cache


def forward_pass(X, parameters):
  L = len(parameters) // 2
  caches = []
  A = X
  for l in range(1, L):
    A_prev = A
    A, cache = linear_forward_activation(parameters['W' + str(l)], parameters['b' + str(l)], A_prev, activation_function="relu")
    caches.append(cache)

  AL, cache = linear_forward_activation(parameters['W' + str(L)], parameters['b' + str(L)], A, activation_function="sigmoid")
  caches.append(cache)

  return AL, caches

###Backward propagation

In [4]:
def linear_backwards(dZ, cache):
  A_prev, W = cache
  m = A_prev.shape[1]

  dW = 1./m * np.dot(dZ,A_prev.T)
  db = 1./m * np.sum(dZ, axis = 1, keepdims = True)
  dA_prev = np.dot(W.T,dZ)

  return dA_prev, dW, db


def linear_backward_activation_function(dA, cache, activation_function):
  linear_cache, activation_cache = cache
  if activation_function == 'sigmoid':
    dZ = sigmoid_backwards(dA, activation_cache)
    dA_prev, dW, db = linear_backwards(dZ, linear_cache)
  elif activation_function == 'relu':
    dZ = relu_backwards(dA, activation_cache)
    dA_prev, dW, db = linear_backwards(dZ, linear_cache)

  return dA_prev, dW, db


def backward_prop(AL, Y, caches):
  grads = []
  m = AL.shape[1]
  Y = Y.reshape(AL.shape)
  L = len(caches)

  cur_cache = caches[L-1]
  dAL = - (np.divide(Y, AL) - np.divide((1 - Y), (1 - AL)))

  grads['dA' + str(L)], grads['dW' + str(L)], grads['db' + str(L)] = linear_backward_activation_function(dAL, cur_cache, activation_function="sigmoid")

  for l in reversed(range(L-1)):
    cur_cache = caches[l]
    grads['dA' + str(l + 1)], grads['dW' + str(l + 1)], grads['db' + str(l + 1)] = linear_backward_activation_function('dA' + str(l + 2), cur_cache, activation_function="relu")

  return grads



###Model

In [5]:
def nn_model(X, Y, layer_dims, learning_rate=1.2, iterations=1000, initialization="random"):
  costs = []
  parameters = initialize_parameters(layer_dims)
  for iter in range(iterations):
    AL, cache = forward_pass(X, parameters)
    cost = calculate_cost(AL, Y)
    grads = backward_prop(AL, Y, cache)
    parameters = update_parameters(parameters, grads, learning_rate)

    if cost%100 == 0:
      costs.append(cost)

  plt.plot(np.squeeze(cost))
  plt.ylabel('Cost')
  plt.xlabel('Iterations')
  plt.title('Learning Rate')
  plt.show()

  return parameters

###Trigger

In [6]:
X_train, Y_train, X_test, Y_test = load_data()
layer_dims = [X_train.shape[0], 3, 4, 5, 6, 7, 4, 5, Y_train.shape[0]]
parameters = nn_model(X_train, Y_train, layer_dims, learning_rate=1.2, iterations=2000, initialization='zeros')
print(calculate_cost())

  A = 1/(1 + np.exp(-Z))
  J = (1/m)* (-np.dot(Y, np.log(AL).T) - np.dot((1-Y), np.log(1 - AL).T))
  dAL = - (np.divide(Y, AL) - np.divide((1 - Y), (1 - AL)))


ValueError: too many values to unpack (expected 2)