# Problem 5.1.2 
Consider binary classification with the case of 2 features and 3 data points (m=3):

Let $X = \begin{bmatrix} 1 & 2 & 4 \\ -2 & -5 & -8 \end{bmatrix}, Y = \begin{bmatrix} 0 & 1 & 0 \end{bmatrix}$

Assume that layer 1 has 2 units and that layer 2 has 1 unit, with parameter matrices:

$W^{[1]} = \begin{bmatrix} 0.5 & 0.5 \\ 0.5 & -0.5 \end{bmatrix}, b^{[1]} = \begin{bmatrix} 0.5 \\ 0.5 \end{bmatrix},  W^{[2]} = \begin{bmatrix} -1 & 1  \end{bmatrix}, b^{[2]} = \begin{bmatrix} -0.1  \end{bmatrix}$

Assume activation functions $f^{[1]}(z)=\log(1+e^{z})$ (softplus) and $f^{[2]}(z)=\frac{1}{1+e^{-z}}$ (sigmoid), and the binary cross-entropy loss function.


In [None]:
import numpy as np

In [None]:
# inputs
# Training data: X holds one sample per column (2 features x 3 samples) and
# Y holds the matching binary labels as a single row.
X = np.array([
    [1, 2, 4],
    [-2, -5, -8],
])
Y = np.array([[0, 1, 0]])

# Initial parameters: layer 1 has 2 units, layer 2 has 1 unit.
W1 = np.array([
    [0.5, 0.5],
    [0.5, -0.5],
])
b1 = np.array([[0.5], [0.5]])
W2 = np.array([[-1, 1]])
b2 = np.array([[-0.1]])

# Number of samples, i.e. the number of columns of X.
m = X.shape[1]

**(a)**	Compute the value of the loss function for the above $W^{[1]}, b^{[1]},  W^{[2]}, b^{[2]}$


In [None]:
# forward propagation
# layer 1: affine map followed by the softplus activation log(1 + e^z)
Z1 = np.dot(W1,X)+b1
# np.logaddexp(0, z) evaluates log(e^0 + e^z) without forming e^z on its own,
# so large entries of Z1 no longer overflow to inf as np.log(1+np.exp(Z1)) would.
A1 = np.logaddexp(0, Z1)
print("Z1: \n{}".format(Z1))
print("A1: \n{}".format(A1))
# layer 2: affine map followed by the sigmoid activation
Z2 = np.dot(W2,A1)+b2
A2 = 1/(1+np.exp(-Z2))
print("Z2: {}".format(Z2))
print("A2: {}".format(A2))
# compute loss: binary cross entropy averaged over the m samples.
# The 1e-16 offset keeps np.log away from log(0) when A2 saturates at 0 or 1.
L = -np.sum(Y*np.log(A2+1e-16)+(1-Y)*np.log(1-A2+1e-16))/m
print("Loss: {}".format(L))

**(b)**	Perform 1  epoch of training using Gradient Descent with learning rate of 0.1 and recompute the loss function with the updated $W^{[1]}, b^{[1]},  W^{[2]}, b^{[2]}$

In [None]:
# back propagation
# Gradient of the averaged binary cross-entropy loss with respect to the
# output activation A2.
dLdA2 = (-1/m) * (Y/A2 - (1-Y)/(1-A2))
print("dLdA2: {}".format(dLdA2))

# layer 2
# Sigmoid derivative expressed through its own output: A2 - A2^2.
dA2dZ2 = A2 - A2*A2
print("dA2dZ2: {}".format(dA2dZ2))

# Chain rule: loss gradient with respect to the pre-activation Z2.
dLdZ2 = np.multiply(dLdA2, dA2dZ2)
print("dLdZ2: {}".format(dLdZ2))

# Parameter gradients: the weight gradient contracts over the sample axis
# through the matrix product; the bias gradient sums over samples explicitly.
grad_W2L = dLdZ2.dot(A1.T)
grad_b2L = dLdZ2.sum(axis=1, keepdims=True)
print("grad_W2L: {}".format(grad_W2L))
print("grad_b2L: {}".format(grad_b2L))

In [None]:
# layer 1
# Push the layer-2 pre-activation gradient back through W2.
dLdA1 = W2.T.dot(dLdZ2)
print("dLdA1: \n{}".format(dLdA1))

# Softplus derivative written in terms of its output A = log(1 + e^z):
# dA/dZ = 1 - exp(-A).
dA1dZ1 = 1 - np.exp(np.negative(A1))
print("dA1dZ1: \n{}".format(dA1dZ1))

# Chain rule: loss gradient with respect to the pre-activation Z1.
dLdZ1 = np.multiply(dLdA1, dA1dZ1)
print("dLdZ1: \n{}".format(dLdZ1))

# Parameter gradients for layer 1.
grad_W1L = dLdZ1.dot(X.T)
grad_b1L = dLdZ1.sum(axis=1, keepdims=True)
print("grad_W1L: \n{}".format(grad_W1L))
print("grad_b1L: \n{}".format(grad_b1L))

In [None]:
# gradient descent epoch 1 - update parameter matrices
# NOTE: this cell rebinds W1, b1, W2 and b2, so running it a second time
# applies a second step using the same (now stale) gradients.
alpha = 0.1  # learning rate

# Step each parameter against its gradient, layer by layer.
W1, b1 = W1 - alpha*grad_W1L, b1 - alpha*grad_b1L
W2, b2 = W2 - alpha*grad_W2L, b2 - alpha*grad_b2L

print("W1 update: \n{}".format(W1))
print("b1 update: \n{}".format(b1))
print("W2 update: {}".format(W2))
print("b2 update: {}".format(b2))

In [None]:
# forward propagation with new W1,b1,W2,b2
# layer 1: affine map followed by the softplus activation log(1 + e^z)
Z1 = np.dot(W1,X)+b1
# np.logaddexp(0, z) evaluates log(e^0 + e^z) without forming e^z on its own,
# so large entries of Z1 no longer overflow to inf as np.log(1+np.exp(Z1)) would.
A1 = np.logaddexp(0, Z1)
print("Z1: \n{}".format(Z1))
print("A1: \n{}".format(A1))
# layer 2: affine map followed by the sigmoid activation
Z2 = np.dot(W2,A1)+b2
A2 = 1/(1+np.exp(-Z2))
print("Z2: {}".format(Z2))
print("A2: {}".format(A2))
# recompute loss: binary cross entropy averaged over the m samples.
# The 1e-16 offset keeps np.log away from log(0) when A2 saturates at 0 or 1.
L = -np.sum(Y*np.log(A2+1e-16)+(1-Y)*np.log(1-A2+1e-16))/m
print("Loss epoch 1: {}".format(L))

**(c)**	Compute the prediction for the input feature matrix X above after 1 epoch of training.

In [None]:
# forward propagation
# layer 1: affine map followed by the softplus activation log(1 + e^z)
Z1 = np.dot(W1,X)+b1
# np.logaddexp(0, z) evaluates log(e^0 + e^z) without forming e^z on its own,
# so large entries of Z1 no longer overflow to inf as np.log(1+np.exp(Z1)) would.
A1 = np.logaddexp(0, Z1)
print("Z1: \n{}".format(Z1))
print("A1: \n{}".format(A1))
# layer 2: affine map followed by the sigmoid activation
Z2 = np.dot(W2,A1)+b2
A2 = 1/(1+np.exp(-Z2))
print("Z2: {}".format(Z2))
print("A2: {}".format(A2))
# prediction: round the sigmoid output to the nearest integer to get a 0/1 label.
# np.round sends an exact 0.5 to the nearest even value, i.e. 0.
Y_pred = np.round(A2)
print("Prediction: {}".format(Y_pred))

**(d)**	Compute the accuracy of the prediction in (c) when compared against the actual Y specified above.

In [None]:
# compute accuracy
# A prediction counts as correct when it matches its label to within 1e-7;
# the mean of the resulting boolean mask is the fraction of correct predictions.
accuracy = (np.abs(Y - Y_pred) < 1e-7).mean()
print("accuracy: {}".format(accuracy))