# Problem 4.7.1 
Consider binary classification using Logistic Regression and the following training data:

$X = \begin{bmatrix} 3 & 1 & -1 \\ 1 & -2 & 2 \end{bmatrix}, Y = \begin{bmatrix} 0 & 1 & 1 \end{bmatrix}$

Assume that 

$W = \begin{bmatrix} W_0 & W_1 \end{bmatrix} = \begin{bmatrix} 2 & -1 \end{bmatrix},      b=-1 $

In [1]:
import numpy as np

In [2]:
X = np.array([[3, 1, -1], [1, -2, 2]])
Y = np.array([[0, 1, 1]])
W = np.array([[2, -1]])
b = np.array([[-1]])
m = X.shape[1]

**(a)** Perform forward propagation using above training data and parameters. 

In [3]:
# forward propagation
Z = np.dot(W,X)+b
A = 1/(1+np.exp(-Z))
print("Z: {}".format(Z))
print("A: {}".format(A))

Z: [[ 4  3 -5]]
A: [[0.98201379 0.95257413 0.00669285]]


**(b)**	Compute the value of the loss function (binary cross entropy) after forward propagation.

In [4]:
# Loss function
L = -np.sum(Y*np.log(A)+(1-Y)*np.log(1-A))/m
print("Loss: {}".format(L))

Loss: 3.0244842093268907


**(c)**	Perform back propagation for the above training data and parameter matrices to determine $\nabla_WL$ and $\nabla_bL$

In [5]:
# back propagation
grad_AL = -1/m*(Y/A - (1-Y)/(1-A))
print("grad_AL: {}".format(grad_AL))
dAdZ = A-np.square(A)
print("dAdZ: {}".format(dAdZ))
grad_ZL = grad_AL*dAdZ
print("grad_ZL: {}".format(grad_ZL))
grad_WL = np.dot(grad_ZL,X.T)
grad_bL = np.sum(grad_ZL,axis=1,keepdims=True)
print("grad_WL: {}".format(grad_WL))
print("grad_bL: {}".format(grad_bL))

grad_AL: [[ 18.53271668  -0.34992902 -49.80438637]]
dAdZ: [[0.01766271 0.04517666 0.00664806]]
grad_ZL: [[ 0.32733793 -0.01580862 -0.33110238]]
grad_WL: [[ 1.29730755 -0.30324959]]
grad_bL: [[-0.01957308]]


**(d)** Perform 1 epoch of training using Gradient Descent with learning rate of 0.1 and recompute the loss function with the updated W,b

In [6]:
alpha = 0.1
# update W and b
W = W - alpha*grad_WL
b = b - alpha*grad_bL
print("update W and b")
print("W epoch 1: {}".format(W))
print("b epoch 1: {}".format(b))
# recompute loss
Z = np.dot(W,X) + b
A = 1/(1+np.exp(-Z))
L = -np.sum(Y*np.log(A)+(1-Y)*np.log(1-A))/m
print("Loss epoch 1: {}".format(L))

update W and b
W epoch 1: [[ 1.87026925 -0.96967504]]
b epoch 1: [[-0.99804269]]
Loss epoch 1: 2.847697779437626


**(e)**	Compute the prediction based on input feature matrix X above and the updated W,b from (d)

In [7]:
# prediction
# forward propagation
Z = np.dot(W,X)+b
A = 1/(1+np.exp(-Z))
Y_pred = np.round(A)
print("A: {}".format(A))
print("predicted Y: {}".format(Y_pred))

A: [[0.97449612 0.94329821 0.00810077]]
predicted Y: [[1. 1. 0.]]


**(f)**	Compute the accuracy of the prediction in (e) when compared against the actual Y specified above.

In [8]:
# compute accuracy
accuracy = np.mean(np.absolute(Y-Y_pred)<1e-7)
print("accuracy: {}".format(accuracy))

accuracy: 0.3333333333333333


**(g)** Compute the derivatives $\frac{\partial L}{\partial W_0}, \frac{\partial L}{\partial W_1}, \frac{\partial L}{\partial b}$ using the centred differences method with $\epsilon=0.1$

In [9]:
eps = 0.1

# estimated dLdW0
print("dLdW0 ****")
W = np.array([[2+eps,-1]])
Z = np.dot(W,X) + b
A = 1/(1+np.exp(-Z))
print("A plus: {}".format(A))
Lossp = -np.sum(Y*np.log(A)+(1-Y)*np.log(1-A))/m
print("Loss plus: {}".format(Lossp))
W = np.array([[2-eps,-1]])
Z = np.dot(W,X) + b
A = 1/(1+np.exp(-Z))
print("A minus: {}".format(A))
Lossm = -np.mean(Y*np.log(A)+(1-Y)*np.log(1-A))
print("Loss minus: {}".format(Lossm))
dLdW0 = (Lossp - Lossm)/2/eps
print("Estimated dL/dW0: {}".format(dLdW0))

dLdW0 ****
A plus: [[0.98663891 0.95697341 0.0060716 ]]
Loss plus: 3.154506978388696
A minus: [[0.97591902 0.94794311 0.00740592]]
Loss minus: 2.895089977671675
Estimated dL/dW0: 1.2970850035851034


In [10]:
# estimated dLdW1
print("dLdW1 ****")
W = np.array([[2,-1+eps]])
Z = np.dot(W,X) + b
A = 1/(1+np.exp(-Z))
print("A plus: {}".format(A))
Lossp = -np.mean(Y*np.log(A)+(1-Y)*np.log(1-A))
print("Loss plus: {}".format(Lossp))
W = np.array([[2,-1-eps]])
Z = np.dot(W,X) + b
A = 1/(1+np.exp(-Z))
print("A minus: {}".format(A))
Lossm = -np.mean(Y*np.log(A)+(1-Y)*np.log(1-A))
print("Loss minus: {}".format(Lossm))
dLdW1 = (Lossp - Lossm)/2/eps
print("Estimated dL/dW1: {}".format(dLdW1))

dLdW1 ****
A plus: [[0.98372886 0.9427815  0.00817843]]
Loss plus: 2.9945125857168264
A minus: [[0.98019772 0.96090787 0.00549699]]
Loss minus: 3.0551299564927556
Estimated dL/dW1: -0.30308685387964607


In [11]:
# estimated dLdb
print("dLdb ****")
W = np.array([[2, -1]])
b = np.array([[-1+eps]])
Z = np.dot(W,X) + b
A = 1/(1+np.exp(-Z))
print("A plus: {}".format(A))
Lossp = -np.mean(Y*np.log(A)+(1-Y)*np.log(1-A))
print("Loss plus: {}".format(Lossp))
b = np.array([[-1-eps]])
Z = np.dot(W,X) + b
A = 1/(1+np.exp(-Z))
print("A minus: {}".format(A))
Lossm = -np.mean(Y*np.log(A)+(1-Y)*np.log(1-A))
print("Loss minus: {}".format(Lossm))
dLdb = (Lossp - Lossm)/2/eps
print("Estimated dL/db: {}".format(dLdb))

dLdb ****
A plus: [[0.9836975  0.95689275 0.00739154]]
Loss plus: 3.022639936446724
A minus: [[0.98015969 0.94784644 0.0060598 ]]
Loss minus: 3.0265602600267143
Estimated dL/db: -0.019601617899951407


**(h)** Focusing on the derivative $\frac{\partial L}{\partial W_0}$, redo the calculation in (f) with $\epsilon=0.02, 0.01, 0.005$. Confirm that the error in the approximate derivative when compared to the actual derivative computed in (b) decreases by a factor of 4 when $\epsilon$ is cut in half.

In [12]:
eps = 0.02

# estimated dLdW0
print("dLdW0, eps = {}".format(eps))
W = np.array([[2+eps,-1]])
b = np.array([[-1]])
Z = np.dot(W,X) + b
A = 1/(1+np.exp(-Z))
Lossp = -np.sum(Y*np.log(A)+(1-Y)*np.log(1-A))/m
W = np.array([[2-eps,-1]])
Z = np.dot(W,X) + b
A = 1/(1+np.exp(-Z))
Lossm = -np.sum(Y*np.log(A)+(1-Y)*np.log(1-A))/m
dLdW0 = (Lossp - Lossm)/2/eps
print("Estimated dL/dW0: {}".format(dLdW0))
error = np.absolute(grad_WL[0,0]-dLdW0)
print("Error: {}".format(error))

dLdW0, eps = 0.02
Estimated dL/dW0: 1.2972962763501683
Error: 1.1272320457944573e-05


In [13]:
eps = 0.01

# estimated dLdW0
print("dLdW0, eps = {}".format(eps))
W = np.array([[2+eps,-1]])
b = np.array([[-1]])
Z = np.dot(W,X) + b
A = 1/(1+np.exp(-Z))
Lossp = -np.sum(Y*np.log(A)+(1-Y)*np.log(1-A))/m
W = np.array([[2-eps,-1]])
Z = np.dot(W,X) + b
A = 1/(1+np.exp(-Z))
Lossm = -np.sum(Y*np.log(A)+(1-Y)*np.log(1-A))/m
dLdW0 = (Lossp - Lossm)/2/eps
print("Estimated dL/dW0: {}".format(dLdW0))
error = np.absolute(grad_WL[0,0]-dLdW0)
print("Error: {}".format(error))

dLdW0, eps = 0.01
Estimated dL/dW0: 1.2973047308642682
Error: 2.8178063580241997e-06


In [14]:
eps = 0.005

# estimated dLdW0
print("dLdW0, eps = {}".format(eps))
W = np.array([[2+eps,-1]])
b = np.array([[-1]])
Z = np.dot(W,X) + b
A = 1/(1+np.exp(-Z))
Lossp = -np.sum(Y*np.log(A)+(1-Y)*np.log(1-A))/m
W = np.array([[2-eps,-1]])
Z = np.dot(W,X) + b
A = 1/(1+np.exp(-Z))
Lossm = -np.sum(Y*np.log(A)+(1-Y)*np.log(1-A))/m
dLdW0 = (Lossp - Lossm)/2/eps
print("Estimated dL/dW0: {}".format(dLdW0))
error = np.absolute(grad_WL[0,0]-dLdW0)
print("Error: {}".format(error))

dLdW0, eps = 0.005
Estimated dL/dW0: 1.2973068442364788
Error: 7.044341474582438e-07


Notice that the error is 1.12e-5 when $\epsilon=0.02$ and error is 2.82e-6 when $\epsilon=0.01$. Hence error is one fourth the value when $\epsilon$ drops by one half. Similarly, error is 2.82e-6 when $\epsilon=0.01$ and 7.04e-7 when $\epsilon=0.005$. Hence again, error is one fourth the value when $\epsilon$ drops by one half.