In this notebook we are going to create an artificial neural network with one hidden layer, and train it to perform a XOR function.

![alt text](https://grez911.github.io/files/ANN_XOR.png)

In [1]:
import numpy as np

In [2]:
# This is helper functions.

def tanh(Z):
  '''Activation function in the hidden layer.'''
  return np.tanh(Z)

def sigmoid(Z):
  '''Activation function in the output layer.'''
  return 1 / (1 + np.exp(-Z))

def cost(A, Y):
  '''A mean squared error based cost.'''
  return 1/2 * np.mean((A - Y)**2)

def linear(W, A, b):
  '''One linear fully connected layer.'''
  return np.dot(W, A) + b

## Initialization (unrolled)
---

In [3]:
np.random.seed(2) # Set a seed so that the results are consistent.

A0 = np.array([[0, 1, 0, 1], [1, 0, 0, 1]])  # Inputs.
print("A0 is:")
print(A0)

Y = np.array([1, 1, 0, 0])  # Outputs. This is a XOR function.
print("Y is:", Y, '\n')

n = len(A0)     # Number of features.
m = len(Y)  # Number of training exmaples.
print(f"n={n}", f"m={m}")

n_h = 2  # Number of hidden perceptrons.
print(f"n_h={n_h}")

n_o = 1  # Number of output perceptrons.
print(f"n_o={n_o}", '\n')

W0 = np.random.randn(n_h, n) * 0.1   # Weights between input and hidden layers.
print("W0 is:")
print(W0)

b0 = np.full((n_h,1), 0.1)    # Biases between input and hidden layer.
print("b0 is:")
print(b0, '\n')

W1 = np.random.randn(n_o, n_h) * 0.1 # Weights between hidden and output layers.
print("W1 is:", W1)

b1 = np.full((n_o,1), 0.1)     # Biases between hidden and output layers.
print("b1 is:", b1, '\n')

Z1 = np.zeros((n_h,m))
A1 = np.zeros((n_h,m))
print("Z1 is:")
print(Z1)
print("A1 is:")
print(A1, '\n')

Z2 = np.zeros((n_o,m))
A2 = np.zeros((n_o,m))
print("Z2 is:", Z2)
print("A2 is:", A2)

A0 is:
[[0 1 0 1]
 [1 0 0 1]]
Y is: [1 1 0 0] 

n=2 m=4
n_h=2
n_o=1 

W0 is:
[[-0.04167578 -0.00562668]
 [-0.21361961  0.16402708]]
b0 is:
[[0.1]
 [0.1]] 

W1 is: [[-0.17934356 -0.08417474]]
b1 is: [[0.1]] 

Z1 is:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]]
A1 is:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]] 

Z2 is: [[0. 0. 0. 0.]]
A2 is: [[0. 0. 0. 0.]]


## Forward pass (unrolled)
---

In [4]:
# Z1 for two perceptrons on a first example:
Z1[0,0] = W0[0,0] * A0[0,0] + W0[0,1] * A0[1,0] + b0[0]
Z1[1,0] = W0[1,0] * A0[0,0] + W0[1,1] * A0[1,0] + b0[1]
print(Z1[0,0])
print(Z1[1,0])

0.09437331727736706
0.26402708084049886


In [5]:
# Z1 for two perceptrons on a second example:
Z1[0,1] = W0[0,0] * A0[0,1] + W0[0,1] * A0[1,1] + b0[0]
Z1[1,1] = W0[1,0] * A0[0,1] + W0[1,1] * A0[1,1] + b0[1]

# Z1 for two perceptrons on a third example:
Z1[0,2] = W0[0,0] * A0[0,2] + W0[0,1] * A0[1,2] + b0[0]
Z1[1,2] = W0[1,0] * A0[0,2] + W0[1,1] * A0[1,2] + b0[1]

# Z1 for two perceptrons on a forth example:
Z1[0,3] = W0[0,0] * A0[0,3] + W0[0,1] * A0[1,3] + b0[0]
Z1[1,3] = W0[1,0] * A0[0,3] + W0[1,1] * A0[1,3] + b0[1]

print("Z1 is:")
print(Z1)

Z1 is:
[[ 0.09437332  0.05832422  0.1         0.05269753]
 [ 0.26402708 -0.11361961  0.1         0.05040747]]


In [6]:
# This also can be done via matrix multiplication:
print(np.dot(W0, A0) + b0)

[[ 0.09437332  0.05832422  0.1         0.05269753]
 [ 0.26402708 -0.11361961  0.1         0.05040747]]


In [7]:
# In the first layer activation function is tanh.
A1 = np.tanh(Z1)
print("A1 is:")
print(A1)

A1 is:
[[ 0.09409414  0.05825817  0.09966799  0.05264881]
 [ 0.25805832 -0.1131332   0.09966799  0.05036482]]


In [8]:
# Z2 on a first example for a first (and only) perceptron.
# It's the same as Z1[0,0], only W0, A0 and b0 changed with W1, A1 and b1.
Z2[0,0] = W1[0,0] * A1[0,0] + W1[0,1] * A1[1,0] + b1[0]
print(Z2[0,0])

0.06140283090901357


In [9]:
# Z2 on a second example:
Z2[0,1] = W1[0,0] * A1[0,1] + W1[0,1] * A1[1,1] + b1[0]

# Z2 on a third example:
Z2[0,2] = W1[0,0] * A1[0,2] + W1[0,1] * A1[1,2] + b1[0]

# Z2 on a forth example:
Z2[0,3] = W1[0,0] * A1[0,3] + W1[0,1] * A1[1,3] + b1[0]

print("Z2 is:", Z2)

Z2 is: [[0.06140283 0.09907473 0.07373566 0.08631833]]


In [10]:
# As in previous layer it can be calculated via matrix multiplication:
print(np.dot(W1, A1) + b1)

[[0.06140283 0.09907473 0.07373566 0.08631833]]


In [11]:
# Output activation function is a sigmoid.
A2 = sigmoid(Z2)
print("A2 is:", A2)

A2 is: [[0.51534589 0.52474844 0.51842557 0.52156619]]


In [12]:
# MSE cost.
J = cost(A2, Y)
print(f"J={J}")

J=0.12519375208599626


## Forward pass (compressed)
---

In [13]:
# Let's extract all formulas from unrolled version of a forward pass
# and write them again:

Z1 = linear(W0, A0, b0)
A1 = tanh(Z1)
Z2 = linear(W1, A1, b1)
A2 = sigmoid(Z2)
J = cost(A2, Y)

## Backpropogation (unrolled)
---

In [14]:
# Let's divide our model by each step.

def from_A2(A2, Y):
  return cost(A2, Y)

def from_Z2(Z2, Y):
  A2 = sigmoid(Z2)
  return from_A2(A2, Y)

def from_A1(W1, A1, b1, Y):
  Z2 = linear(W1, A1, b1)
  return from_Z2(Z2, Y)

def from_Z1(Z1, W1, b1, Y):
  A1 = tanh(Z1)
  return from_A1(W1, A1, b1, Y)

def from_A0(A0, W0, b0, W1, b1, Y):
  Z1 = linear(W0, A0, b0)
  return from_Z1(Z1, W1, b1, Y)

In [15]:
# Check that we have the same cost:
J = from_A0(A0, W0, b0, W1, b1, Y)
print(f"J={J}")

J=0.12519375208599626


In [16]:
# Let's calculate a partial derivative of a from_A2 function with respect to A2
# via derivative definition. It shows how a cost J will change if we change each
# A2 element a little bit.

delta = 10**(-9)
d = np.zeros(A2.shape)
for index, _ in np.ndenumerate(A2):
  A2_ = np.copy(A2)
  A2_[index] += delta
  d[index] = np.round((from_A2(A2_, Y) - from_A2(A2, Y)) / delta, 5)
print(d)

[[-0.12116 -0.11881  0.12961  0.13039]]


In [17]:
# This can be shortened to:
dA2 = (A2 - Y) / m
print(np.round(dA2, 5))

[[-0.12116 -0.11881  0.12961  0.13039]]


In [18]:
# Derivative of a from_Z2 function with respect to Z2. It again shows how
# cost changes if we change Z2.

delta = 10**(-9)
d = np.zeros(A2.shape)
for index, _ in np.ndenumerate(Z2):
  Z2_ = np.copy(Z2)
  Z2_[index] += delta
  d[index] = np.round((from_Z2(Z2_, Y) - from_Z2(Z2, Y)) / delta, 5)
print(d)

[[-0.03026 -0.02963  0.03236  0.03254]]


In [19]:
# It is equal to:
dZ2 = dA2 * sigmoid(Z2) * (1-sigmoid(Z2))
print(np.round(dZ2, 5))

[[-0.03026 -0.02963  0.03236  0.03254]]


In [20]:
# Derivative of a from_A1 function with respect to W1.

delta = 10**(-9)
d = np.zeros(W1.shape)
for index, _ in np.ndenumerate(W1):
  W1_ = np.copy(W1)
  W1_[index] += delta
  d[index] = np.round((from_A1(W1_, A1, b1, Y) - from_A1(W1, A1, b1, Y)) / delta, 5)
print(d)

[[0.00036 0.00041]]


In [21]:
# This is also can be calculated as:

dW1 = np.dot(dZ2, A1.T)
print(np.round(dW1, 5))

[[0.00036 0.00041]]


In [22]:
# Derivative of a from_A1 function with respect to b1.

delta = 10**(-9)
d = np.zeros(b1.shape)
for index, _ in np.ndenumerate(b1):
  b1_ = np.copy(b1)
  b1_[index] += delta
  d[index] = np.round((from_A1(W1, A1, b1_, Y) - from_A1(W1, A1, b1, Y)) / delta, 5)
print(d)

[[0.005]]


In [23]:
# Analytically:

db1 = np.sum(dZ2, keepdims=True)
print(np.round(db1, 5))

[[0.005]]


In [24]:
# Derivative of a from_A1 function with respect to A1.

delta = 10**(-9)
d = np.zeros(A1.shape)
for index, _ in np.ndenumerate(A1):
  A1_ = np.copy(A1)
  A1_[index] += delta
  d[index] = np.round((from_A1(W1, A1_, b1, Y) - from_A1(W1, A1, b1, Y)) / delta, 5)
print(d)

[[ 0.00543  0.00531 -0.0058  -0.00584]
 [ 0.00255  0.00249 -0.00272 -0.00274]]


In [25]:
# Analitical form:

dA1 = np.dot(W1.T, dZ2)
print(np.round(dA1, 5))

[[ 0.00543  0.00531 -0.0058  -0.00584]
 [ 0.00255  0.00249 -0.00272 -0.00274]]


In [26]:
# Derivative of a from_Z1 function with respect to Z1.

delta = 10**(-9)
d = np.zeros(Z1.shape)
for index, _ in np.ndenumerate(Z1):
  Z1_ = np.copy(Z1)
  Z1_[index] += delta
  d[index] = np.round((from_Z1(Z1_, W1, b1, Y) - from_Z1(Z1, W1, b1, Y)) / delta, 5)
print(d)

[[ 0.00538  0.0053  -0.00575 -0.00582]
 [ 0.00238  0.00246 -0.0027  -0.00273]]


In [27]:
# Analitical form:

dZ1 = dA1 * (1 - tanh(A1)**2)
print(np.round(dZ1, 5))

[[ 0.00538  0.0053  -0.00575 -0.00582]
 [ 0.00238  0.00246 -0.0027  -0.00273]]


In [28]:
# Derivative of a from_A0 function with respect to W0.

delta = 10**(-9)
d = np.zeros(W0.shape)
for index, _ in np.ndenumerate(W0):
  W0_ = np.copy(W0)
  W0_[index] += delta
  d[index] = np.round((from_A0(A0, W0_, b0, W1, b1, Y) - from_A0(A0, W0, b0, W1, b1, Y)) / delta, 5)
print(d)

[[-0.00052 -0.00044]
 [-0.00027 -0.00035]]


In [29]:
# Analitical form:

dW0 = np.dot(dZ1, A0.T)
print(np.round(dW0, 5))

[[-0.00052 -0.00044]
 [-0.00027 -0.00035]]


In [30]:
# Derivative of a from_A0 function with respect to b0.

delta = 10**(-9)
d = np.zeros(b0.shape)
for index, _ in np.ndenumerate(b0):
  b0_ = np.copy(b0)
  b0_[index] += delta
  d[index] = np.round((from_A0(A0, W0, b0_, W1, b1, Y) - from_A0(A0, W0, b0, W1, b1, Y)) / delta, 5)
print(d)

[[-0.00089]
 [-0.00059]]


In [31]:
# Analytically:

db0 = np.sum(dZ1, axis=1, keepdims=True)
print(np.round(db0, 5))

[[-0.00089]
 [-0.00058]]


## Backpropogation (compressed)
---

In [32]:
# Again, extract only the analytical formulas from a previous section:

dA2 = (Y - A2) / m
dZ2 = dA2 * sigmoid(Z2) * (1-sigmoid(Z2))
dW1 = np.dot(dZ2, A1.T)
db1 = np.sum(dZ2, keepdims=True)
dA1 = np.dot(W1.T, dZ2)
dZ1 = dA1 * (1 - tanh(A1)**2)
dW0 = np.dot(dZ1, A0.T)
db0 = np.sum(dZ1, axis=1, keepdims=True)

## Full network and gradient descent
---

In [33]:
# Now we are going to implement initialization, forward and backward passes
# into one class.

class ANN:
  '''Artificial Neural Network class.'''
  
  def __init__(self, n, n_h):
    self.n = n      # Number of input features.
    self.n_h = n_h  # Number of hidden units.
    self.n_o = 1    # Number of output units.
    
    np.random.seed(2)
    self.W0 = np.random.randn(n_h, n) * 0.1
    self.b0 = np.full((n_h,1), 0.1)
    
    self.W1 = np.random.randn(self.n_o, n_h) * 0.1
    self.b1 = np.full((self.n_o,1), 0.1)
    
  def forward(self, A0, Y):
    self.Z1 = linear(self.W0, A0, self.b0)
    self.A1 = tanh(self.Z1)
    self.Z2 = linear(self.W1, self.A1, self.b1)
    self.A2 = sigmoid(self.Z2)
    self.J = cost(self.A2, Y)
    
  def backward(self, A0, Y):
    dA2 = (Y - self.A2) / len(Y)
    dZ2 = dA2 * sigmoid(self.Z2) * (1-sigmoid(self.Z2))
    dA1 = np.dot(self.W1.T, dZ2)
    dZ1 = dA1 * (1 - tanh(self.A1)**2)
    
    self.dW1 = np.dot(dZ2, self.A1.T)
    self.db1 = np.sum(dZ2, keepdims=True)
    self.dW0 = np.dot(dZ1, A0.T)
    self.db0 = np.sum(dZ1, axis=1, keepdims=True)

In [34]:
nn = ANN(2, 2)
A0 = np.array([[0, 1, 0, 1], [1, 0, 0, 1]])
Y = np.array([1, 1, 0, 0])

In [35]:
# Create a training loop.

learning_rate = 10

for i in range(1000):
  nn.forward(A0, Y)
  if i % 100 == 0:
    print(nn.J)
  nn.backward(A0, Y)
  nn.W1 += nn.dW1 * learning_rate
  nn.b1 += nn.db1 * learning_rate
  nn.W0 += nn.dW0 * learning_rate
  nn.b0 += nn.db0 * learning_rate

0.12519375208599626
0.044109941906362596
0.0015874322882328462
0.0007888131666023057
0.0005181010814237375
0.00038344681417277733
0.00030337361837437526
0.0002504765312000076
0.00021301226568328868
0.00018512952151354307


In [36]:
# As you can see, predictions of the ANN are close to Y:
print("A2 =", np.round(nn.A2.squeeze(), 3))
print("Y =", Y)

A2 = [0.98  0.98  0.016 0.016]
Y = [1 1 0 0]
