<a href="https://colab.research.google.com/github/pranavkantgaur/Coursera_DL_specialization_from_scratch/blob/master/course1/week4/build_nn_step_by_step.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Objectives
To be able to build and train NNs for various depth and width. This assignment is not tied to any application, therefore it will be evaluated using test cases. The functions developed in this notebook will be used in the next assignment. Target is to build:


*   A 2-layer NN
*   A L-layer NN

Effectively, I will be able to build and train a L-layer fully connected NN, entirely using Numpy.


## Load dataset

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import numpy as np # linear algebra
import matplotlib.pyplot as plt # plotting
import h5py # data loading for hdf5 dataset
from PIL import Image # for loading your images for processing
from scipy import ndimage 
import os


In [0]:
# implementing utility function for loading cat vs non-cat datasets
def load_dataset():
  train_dataset = h5py.File('datasets/train_catvnoncat.h5', "r")
  test_dataset = h5py.File('datasets/test_catvnoncat.h5', "r")
  train_set_x = np.array(train_dataset["train_set_x"][:])
  train_set_y = np.array(train_dataset["train_set_y"][:])
  test_set_x  = np.array(test_dataset["test_set_x"][:])
  test_set_y = np.array(test_dataset["test_set_y"][:])
  classes = np.array(train_dataset["list_classes"][:])

  # lets reshape the arrays
  train_set_y = train_set_y.reshape((1, train_set_y.shape[0]))
  test_set_y = test_set_y.reshape((1, test_set_y.shape[0]))

  return train_set_x, train_set_y, test_set_x, test_set_y, classes

In [0]:
# load training dataset
x_train_images, y_train_images, x_test_images, y_test_images, classes = load_dataset()
# x_train_images: (m, nx, ny, nc)
# y_train_images: (1, m)

In [0]:
# lets inspect the dataset
image_id = 25
plt.imshow(x_train_images[image_id])
print("y = ", y_train_images[0][image_id], "Its a " + classes[y_train_images[0][image_id]].decode("utf-8") + " picture!!")

In [0]:
# lets inspect the shapes of the dataset arrays
print("Training set shape: ", x_train_images.shape) #  m, nx, ny, nc
print("Training set labels shape: ", y_train_images.shape) # 1, m
print("Test set shape: ", x_test_images.shape)
print("Test set labels shape: ", y_test_images.shape)
print("Each image is of shape: ", x_train_images[0].shape)

Training set shape:  (209, 64, 64, 3)
Training set labels shape:  (1, 209)
Test set shape:  (50, 64, 64, 3)
Test set labels shape:  (1, 50)
Each image is of shape:  (64, 64, 3)


In [0]:
# Lets flatten the train and test image dataset for training
x_train_images_flattened = x_train_images.reshape((x_train_images.shape[0], x_train_images.shape[1] * 
                                                   x_train_images.shape[2] * 
                                                   x_train_images.shape[3], 1)) 
x_train_images_flattened = np.squeeze(x_train_images_flattened)
x_train_images_flattened = x_train_images_flattened.T
x_test_images_flattened = x_test_images.reshape((x_test_images.shape[0], x_train_images.shape[1] * 
                                                 x_test_images.shape[2] * 
                                                 x_test_images.shape[3], 1))

x_test_images_flattened = np.squeeze(x_test_images_flattened)
x_test_images_flattened = x_test_images_flattened.T

# lets print
print("Flattened version of train images: ", x_train_images_flattened.shape)
print("Label set for training dataset: ", y_train_images.shape)
print("Flattened version of test images: ", x_test_images_flattened.shape)
print("Label set for test dataset: ", y_test_images.shape)


print("Sanity check after reshaping: ", x_train_images_flattened[0:5, 0])

Flattened version of train images:  (12288, 209)
Label set for training dataset:  (1, 209)
Flattened version of test images:  (12288, 50)
Label set for test dataset:  (1, 50)
Sanity check after reshaping:  [17 31 56 22 33]


In [0]:
# normalization
x_train_images = x_train_images_flattened / 255.0
x_test_images = x_test_images_flattened / 255.0

In [0]:
def sigmoid(z):
  sig = 1 / (1 + np.exp(-z))
  return sig

In [0]:
class TwoLayerNN(object):
  def __init__(self, n_hidden_units):
    # hyperparameters
    self.n_hidden_units = n_hidden_units # len(self.n_hidden_units = number of layers)
    self.n_epochs = 0
    self.lr = 0
    # parameters
    self.w1 = 0
    self.b1 = 0
    self.w2 = 0
    self.b2 = 0


  def forward(self, X):
    # layer 1
    z1 = np.dot(self.w1, X) + self.b1
    a1 = np.tanh(z1)
    
    # layer 2
    z2 = np.dot(self.w2, a1) + self.b2
    a2 = sigmoid(z2)

    return a1, a2

  def initialize_parameters(self, input_size):
    self.w1 = np.random.randn(self.n_hidden_units[0], input_size) * 0.01
    self.b1 = np.zeros((self.n_hidden_units[0], 1))
    self.w2 = np.random.randn(self.n_hidden_units[1], self.n_hidden_units[0]) * 0.01
    self.b2 = np.zeros((self.n_hidden_units[1], 1))
    


  def backward(self, A1, A2, X, Y):
    '''
    Notice that the only difference across layers lies in computation of dZ, 
    rest all can be parameterized on layer ID.
    '''
    m = X.shape[1]

    dZ2 = A2 - Y # output layer
    dw2 = np.dot(dZ2, A1.T) / m
    db2 = np.mean(dZ2)
    
    dZ1 = np.dot(self.w2.T, dZ2) * (1 - np.power(A1, 2))
    dw1 = np.dot(dZ1, X.T) / m
    db1 = np.mean(dZ1)

    return dw1, db1, dw2, db2

  def update_parameters(self, dw1, db1, dw2, db2):
    self.w1 = self.w1 - self.lr * dw1
    self.b1 = self.b1 - self.lr * db1
    self.w2 = self.w2 - self.lr * dw2
    self.b2 = self.b2 - self.lr * db2


  def compute_cost(self, A, Y):
    cost = -float((np.dot(Y, np.log(A).T) + np.dot(1-Y, np.log(1-A).T))) / Y.shape[1]  
    return cost
  
  def train(self, X, Y, lr = 0.001, n_epochs = 1000):
    self.initialize_parameters(X.shape[0]) 
    self.lr = lr
    self.n_epochs = n_epochs

    for i in range(n_epochs):
      A1, A2  = self.forward(X)
      dw1, db1, dw2, db2 = self.backward(A1, A2, X, Y)     
      self.update_parameters(dw1, db1, dw2, db2)
      cost = self.compute_cost(A2, Y)
      if (i % 100 == 0):
        print("Cost at epoch: ", i, "is: ", cost)
    # save final parameters
    try:  
      os.mkdir("params")  
    except OSError as error:  
      print(error)   
    np.save("params/w1.npy", self.w1)    
    np.save("params/b1.npy", self.b1)
    np.save("params/w2.npy", self.w2)
    np.save("params/b2.npy", self.b2)

  def evaluate(self, X_test, Y_test):
    self.w1 = np.load("params/w1.npy")         
    self.b1 = np.load("params/b1.npy")
    self.w2 = np.load("params/w2.npy")
    self.b2 = np.load("params/b2.npy")

    _, A_predict = self.forward(X_test)

    Y_predict = (A_predict > 0.5)
    accuracy = 100 * float(np.dot(Y_test, Y_predict.T) + 
                           np.dot(1-Y_test, (1-Y_predict).T)) / Y_test.shape[1]
    print("Accuracy of trained model: ", accuracy)              
   

In [0]:
twoNN = TwoLayerNN(n_hidden_units = [7, 1])

In [0]:
twoNN.train(x_train_images, y_train_images, lr = 0.0075, n_epochs = 2500)

Cost at epoch:  0 is:  0.6960598883051189
Cost at epoch:  100 is:  0.6536072145590778
Cost at epoch:  200 is:  0.6433395957351822
Cost at epoch:  300 is:  0.6322147317400523
Cost at epoch:  400 is:  0.6090849610699225
Cost at epoch:  500 is:  0.5693893707912853
Cost at epoch:  600 is:  0.5192816899228848
Cost at epoch:  700 is:  0.46402643499119944
Cost at epoch:  800 is:  0.4054218086958313
Cost at epoch:  900 is:  0.3483431158020084
Cost at epoch:  1000 is:  0.3661978135585861
Cost at epoch:  1100 is:  0.31715161465176755
Cost at epoch:  1200 is:  0.27375951665337434
Cost at epoch:  1300 is:  0.23799486072934836
Cost at epoch:  1400 is:  0.20560113547168046
Cost at epoch:  1500 is:  0.17546035130293572
Cost at epoch:  1600 is:  0.14661803198282103
Cost at epoch:  1700 is:  0.12003370522266767
Cost at epoch:  1800 is:  0.10260553857061602
Cost at epoch:  1900 is:  0.09075074425096824
Cost at epoch:  2000 is:  0.08105347562643966
Cost at epoch:  2100 is:  0.07293505112329696
Cost at ep

In [0]:
# train accuracy
twoNN.evaluate(x_train_images, y_train_images)

Accuracy of trained model:  100.0


In [0]:
# test accuracy
twoNN.evaluate(x_test_images, y_test_images)

Accuracy of trained model:  72.0


## Generalizing to L-layer NN
Now we want to generalize the above 2-layer NN to a L-layer NN, the interface should be such that:

* For L=2 and keeping other hyper-parameters same, we should be able to reproduce  the results obtained above. In other words, training a `LLayerNN(n_hidden_units = [7, 1])` network as `lllayerNN.train(x_train_images, y_train_images, lr = 0.0075, n_epochs = 2500)` should result in $72\%$ test and $100\%$ train accuracy.
*   NN should be implemented by stacking layer along both forward and backward directions:
  * forward:  
      * Layers:
        * hidden layers with tanh/relu activation
        * output layer with sigmoid activation
      * Input:
        * activation from last layer
        * weight(and bias) matrix for current layer
      * Output:
        * activation from current layer        
  * backward:
      * Layers:
        * hidden backward propogation layer
        * output bacwkard propogation layer
      * Input:
        * activation from successive layer
        * weight metrix from successive layer
        * gradients from successive layer
      * Output:
        * gradients for current layer, $dw$, $db$ and $dz$.        

In [0]:
# Assumes a binary problem.
class LLayerNN(object):
  def __init__(self, n_hidden_units):
    self.w = []
    self.b = []
    self.dw = []
    self.db = []
    self.a = [] # activations across layers
    self.lr = None
    self.n_hidden_units = n_hidden_units '''total number of layers, 
                                          n_hidden_units + 1'''
    
  def initialize_parameters(self, input_size):
    # layer 0
    self.w.append(np.random.rand(self.n_hidden_units[0], input_size)) * 0.01
    self.b.append(np.zeros((self.n_hidden_units[0], 1)))
    # layer 1 to L 
    n_hidden_layers = len(self.n_hidden_units)
    for i in range(1,  n_hidden_layers): ''' (1 for last hidden layer, 
                                                    1 for output layer)'''
      self.w.append(np.random.randn(self.n_hidden_units[i], 
                                  self.n_hidden_units[i-1]) * 0.01)
      self.b.append(np.zeros(self.n_hidden_units[i], 1))
    # output layer, assuming binary classification
    self.w.append(np.random.rand(1, self.n_hidden_units[n_hidden_layers - 1])) * 0.01
    self.b.append(np.zeros((1, 1)))   
    # w: n_hidden_layers + 1       

  def forward_propogation(self, layer_id, activation = None):
    z = np.dot(self.w[layer_id - 1], self.a[layer_id - 1]) + self.b[layer_id  -1]  # NOT READABLE!!
    if (activation == 'tanh'):
      self.a[layer_id] = np.tanh(z)
    if (activation == 'relu'):
      self.a[layer_id] = np.max(0, z)      
    if (activation == 'sigmoid'):    
      self.a[layer_id] = sigmoid(z)

   def backward_propogation(self, Y, layer_id, activation = None):
     if activation == 'tanh':
       da_dz = (1 - np.power(self.a[layer_id], 2))
     if activation == 'relu':
       da_dz = None      
     if activation == 'sigmoid':
       da_dz = np.dot(self.a[layer_id], 1 - self.a[layer_id]) 
     if (layer_id == L): # output layer
       self.dz[layer_id] = self.a[layer_id] - Y
     else:
       self.dz[layer_id] = np.dot(self.w[l+1].T, self.dz[l+1]) * da_dz       
     self.dw[layer_id] = np.dot(self.dz[l], self.a[l - 1]) / m
     self.db[layer_id] = np.mean(self.dz[l])      
    

    def update_parameters(self, layer_id):
      self.w[layer_id] = self.w[layer_id] - self.lr * self.dw[layer_id]
      self.b[layer_id] = self.b[layer_id] - self.lr * self.db[layer_id]

    def compute_cost(self, Y):
      last_layer_id = len(num_hidden_units + 1)
      cost = -(np.dot(Y, np.log(self.a[last_layer_id])).T) + np.dot(1 - Y, 
             np.log(1 - self.a[last_layer_id]))) / Y.shape[1] '''averaged over 
                                                              all samples'''
      return cost  

    def train(self, X, Y, learning_rate = 0.001, n_epochs = 1000):
      
      self.initialize_parameters() # initializes parameters for all layers.
      
      self.lr = learning_rate
      n_hidden_layers = len(self.n_hidden_units)
      
      self.a.append(X) # activation from input layer, a[0]
      for i in range(n_epochs):
        for l in range(1, n_hidden_layers + 1):
          self.forward_propogation(l, activation = 'relu')
        # for output layer
        self.forward_propogation(n_hidden_layers, activation = 'sigmoid')   
        
        # backprop for the output layer
        self.backward_propogation(n_hidden_layers, activation = 'sigmoid') 
        
        for l in range(n_hidden_layers  - 1 , 0, -1):
          self.backward_propogation(l, activation = 'tanh') 
          self.update_parameters(l) # updates weights for layer 'l'
        cost = self.compute_cost() # ends the epoch

In [0]:
llayernn = LLayerNN()

In [0]:
llayernn.train(x_train_images, y_train_images)

In [0]:
llayernn.evaluate(x_test_images, y_test_images)