In [1]:
import numpy as np

In [2]:
# define the sigmoid function
def sigmoid(x, derivative=False):
    """Logistic sigmoid 1 / (1 + exp(-x)), or its derivative, elementwise.

    Parameters
    ----------
    x : scalar or numpy array
        Value(s) at which to evaluate the function.
    derivative : bool, optional
        When truthy, return the derivative s * (1 - s), where s is the
        sigmoid of ``x``, instead of the sigmoid itself.

    Returns
    -------
    The sigmoid (or its derivative) of ``x``, with the same shape as ``x``.
    """
    # exp(-logaddexp(0, -x)) equals 1 / (1 + exp(-x)), but logaddexp never
    # forms exp(-x) directly, so large negative x no longer overflows.
    s = np.exp(-np.logaddexp(0, -x))
    # The derivative reuses the single evaluation above instead of
    # computing the sigmoid twice.
    if derivative:
        return s * (1 - s)
    return s

In [3]:
# pin NumPy's global random state so the random draws below repeat on every run
np.random.seed(seed=1)

In [4]:
# learning rate: scales the gradient in each weight update
alpha = 0.1

In [5]:
# number of nodes (sigmoid units) in the hidden layer; the constant bias input
# that feeds the output layer is added separately and is not counted here
num_hidden = 3

In [6]:
# training inputs: one row per example, one column per binary feature
X = np.array([
    (0, 0, 1),
    (0, 1, 1),
    (1, 0, 0),
    (1, 1, 0),
    (1, 0, 1),
    (1, 1, 1),
])

In [7]:
X.shape

(6, 3)

In [8]:
# target outputs, one value per training example
# reshape(-1, 1) arranges the flat list as a column vector (n rows, 1 column)
y = np.array([0, 1, 0, 1, 1, 0]).reshape(-1, 1)

In [9]:
y

array([[0],
       [1],
       [0],
       [1],
       [1],
       [0]])

In [10]:
y.shape

(6, 1)

In [11]:
# random initial weights centred on 0:
# np.random.random draws uniformly from [0, 1), so doubling and subtracting 1
# spreads the values over [-1, 1)
# each weight matrix has one extra row (the "+ 1") that holds the bias weights
hidden_shape = (X.shape[1] + 1, num_hidden)
output_shape = (num_hidden + 1, y.shape[1])
hidden_weights = np.random.random(hidden_shape) * 2 - 1
output_weights = np.random.random(output_shape) * 2 - 1

In [12]:
# how many gradient-descent steps the training loop performs
num_iterations = 10 ** 4

In [13]:
hidden_weights

array([[-0.16595599,  0.44064899, -0.99977125],
       [-0.39533485, -0.70648822, -0.81532281],
       [-0.62747958, -0.30887855, -0.20646505],
       [ 0.07763347, -0.16161097,  0.370439  ]])

In [14]:
output_weights

array([[-0.5910955 ],
       [ 0.75623487],
       [-0.94522481],
       [ 0.34093502]])

In [15]:
# The bias column and the bias-augmented inputs are the same on every
# iteration, so they are built once here rather than inside the loop.
bias_column = np.ones((X.shape[0], 1))
input_layer_outputs = np.hstack((bias_column, X))

# for each iteration of gradient descent (every step uses all rows of X)
for i in range(num_iterations):

    # forward phase
    # the hidden layer applies the sigmoid and gets the constant-1 bias input
    # prepended; the output layer is a plain weighted sum (no activation)
    hidden_layer_outputs = np.hstack((bias_column, sigmoid(np.dot(input_layer_outputs, hidden_weights))))
    output_layer_outputs = np.dot(hidden_layer_outputs, output_weights)

    # backward phase
    # output layer error term
    output_error = output_layer_outputs - y
    # hidden layer error term: sigmoid slope h * (1 - h) times the output
    # error propagated back through the output weights
    # [:, 1:] removes the bias term from the backpropagation
    hidden_error = hidden_layer_outputs[:, 1:] * (1 - hidden_layer_outputs[:, 1:]) * np.dot(output_error, output_weights.T[:, 1:])

    # partial derivatives, one (layer inputs x layer units) slice per example
    hidden_pd = input_layer_outputs[:, :, np.newaxis] * hidden_error[: , np.newaxis, :]
    output_pd = hidden_layer_outputs[:, :, np.newaxis] * output_error[:, np.newaxis, :]

    # average over the examples (axis 0) for total gradients
    total_hidden_gradient = np.average(hidden_pd, axis=0)
    total_output_gradient = np.average(output_pd, axis=0)

    # update weights in place, stepping against the gradient
    hidden_weights -= alpha * total_hidden_gradient
    output_weights -= alpha * total_output_gradient

# NOTE: output_layer_outputs now holds the forward pass computed at the start
# of the last iteration, i.e. before that iteration's weight update.

In [16]:
# show the network outputs for the rows of X, as left behind by the most
# recent forward pass
report_template = "Output After Training: \n{}"
print(report_template.format(output_layer_outputs))

Output After Training: 
[[2.11135662e-04]
 [9.99525588e-01]
 [1.66889680e-04]
 [9.99576185e-01]
 [9.99362960e-01]
 [1.30185107e-03]]


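## Iteration 1

Step-by-step walkthrough of a single gradient-descent iteration. The values below start from the freshly initialized `hidden_weights` and `output_weights` (the ones displayed before the training loop), not from the trained weights.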

In [15]:
# forward phase
# a column of ones is prepended at each layer so that the first row of the
# following weight matrix acts as the bias weight
bias_column = np.ones((X.shape[0], 1))
input_layer_outputs = np.concatenate((bias_column, X), axis=1)
hidden_activations = sigmoid(input_layer_outputs.dot(hidden_weights))
hidden_layer_outputs = np.concatenate((bias_column, hidden_activations), axis=1)
output_layer_outputs = hidden_layer_outputs.dot(output_weights)

In [18]:
input_layer_outputs

array([[1., 0., 0., 1.],
       [1., 0., 1., 1.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 0., 1.],
       [1., 1., 1., 1.]])

In [16]:
np.dot(input_layer_outputs, hidden_weights)

array([[-0.08832252,  0.27903802, -0.62933225],
       [-0.7158021 , -0.02984053, -0.8357973 ],
       [-0.56129085, -0.26583923, -1.81509406],
       [-1.18877042, -0.57471778, -2.02155911],
       [-0.48365738, -0.4274502 , -1.44465506],
       [-1.11113695, -0.73632875, -1.65112011]])

In [17]:
sigmoid(np.dot(input_layer_outputs, hidden_weights))

array([[0.47793371, 0.56931036, 0.34766196],
       [0.32831806, 0.49254042, 0.30242066],
       [0.36324884, 0.43392884, 0.14002359],
       [0.23347892, 0.36014894, 0.11695787],
       [0.38138886, 0.39473536, 0.19082552],
       [0.24765899, 0.32380747, 0.16095762]])

In [19]:
hidden_layer_outputs

array([[1.        , 0.47793371, 0.56931036, 0.34766196],
       [1.        , 0.32831806, 0.49254042, 0.30242066],
       [1.        , 0.36324884, 0.43392884, 0.14002359],
       [1.        , 0.23347892, 0.36014894, 0.11695787],
       [1.        , 0.38138886, 0.39473536, 0.19082552],
       [1.        , 0.24765899, 0.32380747, 0.16095762]])

In [20]:
output_layer_outputs

array([[-0.64926151],
       [-0.70526557],
       [-0.67881543],
       [-0.71507728],
       [-0.6107305 ],
       [-0.6550019 ]])

In [18]:
# backward phase
# error term of the output layer: network output minus target
output_error = output_layer_outputs - y
# error term of the hidden layer: sigmoid slope h * (1 - h) times the output
# error sent back through the output weights; index 0 (the bias unit) is
# sliced off because no error is propagated through it
hidden_activations = hidden_layer_outputs[:, 1:]
sigmoid_slope = hidden_activations * (1 - hidden_activations)
backpropagated_error = output_error.dot(output_weights[1:, :].T)
hidden_error = sigmoid_slope * backpropagated_error

In [25]:
output_error

array([[-0.64926151],
       [-1.70526557],
       [-0.67881543],
       [-1.71507728],
       [-1.6107305 ],
       [-0.6550019 ]])

In [27]:
hidden_layer_outputs[:, 1:]

array([[0.47793371, 0.56931036, 0.34766196],
       [0.32831806, 0.49254042, 0.30242066],
       [0.36324884, 0.43392884, 0.14002359],
       [0.23347892, 0.36014894, 0.11695787],
       [0.38138886, 0.39473536, 0.19082552],
       [0.24765899, 0.32380747, 0.16095762]])

In [30]:
(1 - hidden_layer_outputs[:, 1:])

array([[0.52206629, 0.43068964, 0.65233804],
       [0.67168194, 0.50745958, 0.69757934],
       [0.63675116, 0.56607116, 0.85997641],
       [0.76652108, 0.63985106, 0.88304213],
       [0.61861114, 0.60526464, 0.80917448],
       [0.75234101, 0.67619253, 0.83904238]])

In [19]:
output_weights.T[:, 1:]

array([[ 0.75623487, -0.94522481,  0.34093502]])

In [29]:
np.dot(output_error, output_weights.T[:, 1:])

array([[-0.49099419,  0.61369809, -0.22135598],
       [-1.28958129,  1.61185933, -0.58138475],
       [-0.5133439 ,  0.64163319, -0.23143195],
       [-1.29700125,  1.6211336 , -0.58472991],
       [-1.21809058,  1.52250244, -0.54915444],
       [-0.49533528,  0.61912405, -0.22331309]])

In [26]:
hidden_error

array([[-0.12250947,  0.15047636, -0.05020201],
       [-0.28438532,  0.40287514, -0.12265032],
       [-0.11873599,  0.15760731, -0.02786834],
       [-0.23211979,  0.37357675, -0.06039016],
       [-0.28738581,  0.3637553 , -0.08479556],
       [-0.09229286,  0.13556104, -0.03015849]])

In [31]:
# per-example partial derivatives: for every training row, the outer product
# of a layer's inputs (axis 1) with that layer's error terms (axis 2)
hidden_pd = np.expand_dims(input_layer_outputs, axis=2) * np.expand_dims(hidden_error, axis=1)
output_pd = np.expand_dims(hidden_layer_outputs, axis=2) * np.expand_dims(output_error, axis=1)

In [34]:
input_layer_outputs[:, :, np.newaxis]

array([[[1.],
        [0.],
        [0.],
        [1.]],

       [[1.],
        [0.],
        [1.],
        [1.]],

       [[1.],
        [1.],
        [0.],
        [0.]],

       [[1.],
        [1.],
        [1.],
        [0.]],

       [[1.],
        [1.],
        [0.],
        [1.]],

       [[1.],
        [1.],
        [1.],
        [1.]]])

In [35]:
hidden_error[: , np.newaxis, :]

array([[[-0.12250947,  0.15047636, -0.05020201]],

       [[-0.28438532,  0.40287514, -0.12265032]],

       [[-0.11873599,  0.15760731, -0.02786834]],

       [[-0.23211979,  0.37357675, -0.06039016]],

       [[-0.28738581,  0.3637553 , -0.08479556]],

       [[-0.09229286,  0.13556104, -0.03015849]]])

In [32]:
hidden_pd

array([[[-0.12250947,  0.15047636, -0.05020201],
        [-0.        ,  0.        , -0.        ],
        [-0.        ,  0.        , -0.        ],
        [-0.12250947,  0.15047636, -0.05020201]],

       [[-0.28438532,  0.40287514, -0.12265032],
        [-0.        ,  0.        , -0.        ],
        [-0.28438532,  0.40287514, -0.12265032],
        [-0.28438532,  0.40287514, -0.12265032]],

       [[-0.11873599,  0.15760731, -0.02786834],
        [-0.11873599,  0.15760731, -0.02786834],
        [-0.        ,  0.        , -0.        ],
        [-0.        ,  0.        , -0.        ]],

       [[-0.23211979,  0.37357675, -0.06039016],
        [-0.23211979,  0.37357675, -0.06039016],
        [-0.23211979,  0.37357675, -0.06039016],
        [-0.        ,  0.        , -0.        ]],

       [[-0.28738581,  0.3637553 , -0.08479556],
        [-0.28738581,  0.3637553 , -0.08479556],
        [-0.        ,  0.        , -0.        ],
        [-0.28738581,  0.3637553 , -0.08479556]],

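       [[-0.09229286,  0.13556104, -0.03015849],
        [-0.09229286,  0.13556104, -0.03015849],
        [-0.09229286,  0.13556104, -0.03015849],
        [-0.09229286,  0.13556104, -0.03015849]]])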

In [36]:
hidden_layer_outputs[:, :, np.newaxis] 

array([[[1.        ],
        [0.47793371],
        [0.56931036],
        [0.34766196]],

       [[1.        ],
        [0.32831806],
        [0.49254042],
        [0.30242066]],

       [[1.        ],
        [0.36324884],
        [0.43392884],
        [0.14002359]],

       [[1.        ],
        [0.23347892],
        [0.36014894],
        [0.11695787]],

       [[1.        ],
        [0.38138886],
        [0.39473536],
        [0.19082552]],

       [[1.        ],
        [0.24765899],
        [0.32380747],
        [0.16095762]]])

In [37]:
output_error[:, np.newaxis, :]

array([[[-0.64926151]],

       [[-1.70526557]],

       [[-0.67881543]],

       [[-1.71507728]],

       [[-1.6107305 ]],

       [[-0.6550019 ]]])

In [33]:
output_pd

array([[[-0.64926151],
        [-0.31030396],
        [-0.3696313 ],
        [-0.22572353]],

       [[-1.70526557],
        [-0.55986948],
        [-0.83991222],
        [-0.51570754]],

       [[-0.67881543],
        [-0.24657891],
        [-0.29455759],
        [-0.09505017]],

       [[-1.71507728],
        [-0.40043439],
        [-0.61768326],
        [-0.20059179]],

       [[-1.6107305 ],
        [-0.61431467],
        [-0.63581229],
        [-0.30736849]],

       [[-0.6550019 ],
        [-0.16221711],
        [-0.21209451],
        [-0.10542755]]])

In [40]:
# collapse the per-example derivatives into one gradient per weight by
# taking the mean over the example axis (axis 0)
total_hidden_gradient = hidden_pd.mean(axis=0)
total_output_gradient = output_pd.mean(axis=0)

In [41]:
total_hidden_gradient

array([[-0.18957154,  0.26397532, -0.06267748],
       [-0.12175574,  0.17175007, -0.03386876],
       [-0.10146633,  0.15200216, -0.03553316],
       [-0.13109558,  0.17544464, -0.04796773]])

In [42]:
total_output_gradient

array([[-1.16902536],
       [-0.38228642],
       [-0.49494853],
       [-0.24164484]])

In [None]:
# current output_weights
# array([[-0.5910955 ],
#        [ 0.75623487],
#        [-0.94522481],
#        [ 0.34093502]])

In [43]:
# gradient-descent step: shift each weight matrix in place against its
# gradient, scaled by the learning rate
hidden_weights -= alpha * total_hidden_gradient
output_weights -= alpha * total_output_gradient

In [45]:
hidden_weights

array([[-0.14699884,  0.41425146, -0.9935035 ],
       [-0.38315928, -0.72366323, -0.81193593],
       [-0.61733294, -0.32407876, -0.20291174],
       [ 0.09074303, -0.17915544,  0.37523577]])

In [46]:
output_weights

array([[-0.47419296],
       [ 0.79446351],
       [-0.89572996],
       [ 0.3650995 ]])

In [47]:
# show the network outputs for the rows of X; output_layer_outputs still holds
# the values from this section's forward phase, computed before the weight
# update above
report_template = "Output After Training: \n{}"
print(report_template.format(output_layer_outputs))

Output After Training: 
[[-0.64926151]
 [-0.70526557]
 [-0.67881543]
 [-0.71507728]
 [-0.6107305 ]
 [-0.6550019 ]]


## Iteration 2

Repeats the same walkthrough, starting from the weights produced by the Iteration 1 update.

In [48]:
# forward phase
# a column of ones is prepended at each layer so that the first row of the
# following weight matrix acts as the bias weight
bias_column = np.ones((X.shape[0], 1))
input_layer_outputs = np.concatenate((bias_column, X), axis=1)
hidden_activations = sigmoid(input_layer_outputs.dot(hidden_weights))
hidden_layer_outputs = np.concatenate((bias_column, hidden_activations), axis=1)
output_layer_outputs = hidden_layer_outputs.dot(output_weights)

In [49]:
input_layer_outputs

array([[1., 0., 0., 1.],
       [1., 0., 1., 1.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 0., 1.],
       [1., 1., 1., 1.]])

In [50]:
hidden_layer_outputs

array([[1.        , 0.48593976, 0.55850479, 0.35017553],
       [1.        , 0.33769372, 0.47776898, 0.30551335],
       [1.        , 0.37048001, 0.42325833, 0.14119021],
       [1.        , 0.24094765, 0.34671949, 0.11832889],
       [1.        , 0.39188035, 0.38023116, 0.19306695],
       [1.        , 0.2579314 , 0.30732694, 0.16340403]])

In [51]:
output_layer_outputs

array([[-0.46055212],
       [-0.52231684],
       [-0.5074368 ],
       [-0.55013407],
       [-0.43295411],
       [-0.4848991 ]])

In [52]:
# backward phase
# error term of the output layer: network output minus target
output_error = output_layer_outputs - y
# error term of the hidden layer: sigmoid slope h * (1 - h) times the output
# error sent back through the output weights; index 0 (the bias unit) is
# sliced off because no error is propagated through it
hidden_activations = hidden_layer_outputs[:, 1:]
sigmoid_slope = hidden_activations * (1 - hidden_activations)
backpropagated_error = output_error.dot(output_weights[1:, :].T)
hidden_error = sigmoid_slope * backpropagated_error

In [53]:
output_error

array([[-0.46055212],
       [-1.52231684],
       [-0.5074368 ],
       [-1.55013407],
       [-1.43295411],
       [-0.4848991 ]])

In [54]:
hidden_layer_outputs[:, 1:]

array([[0.48593976, 0.55850479, 0.35017553],
       [0.33769372, 0.47776898, 0.30551335],
       [0.37048001, 0.42325833, 0.14119021],
       [0.24094765, 0.34671949, 0.11832889],
       [0.39188035, 0.38023116, 0.19306695],
       [0.2579314 , 0.30732694, 0.16340403]])

In [55]:
(1 - hidden_layer_outputs[:, 1:])

array([[0.51406024, 0.44149521, 0.64982447],
       [0.66230628, 0.52223102, 0.69448665],
       [0.62951999, 0.57674167, 0.85880979],
       [0.75905235, 0.65328051, 0.88167111],
       [0.60811965, 0.61976884, 0.80693305],
       [0.7420686 , 0.69267306, 0.83659597]])

In [56]:
np.dot(output_error, output_weights.T[:, 1:])

array([[-0.36589185,  0.41253033, -0.16814735],
       [-1.20942519,  1.3635848 , -0.55579712],
       [-0.40314002,  0.45452634, -0.18526492],
       [-1.23152496,  1.38850153, -0.56595318],
       [-1.13842976,  1.28353993, -0.52317084],
       [-0.38523464,  0.43433865, -0.17703642]])

In [57]:
hidden_error

array([[-0.09140063,  0.10172057, -0.03826237],
       [-0.27049601,  0.34022229, -0.11792622],
       [-0.09402216,  0.11095475, -0.0224644 ],
       [-0.22523591,  0.31450266, -0.05904429],
       [-0.27129936,  0.30247315, -0.08150589],
       [-0.07373499,  0.09246075, -0.02420144]])

In [58]:
# per-example partial derivatives: for every training row, the outer product
# of a layer's inputs (axis 1) with that layer's error terms (axis 2)
hidden_pd = np.expand_dims(input_layer_outputs, axis=2) * np.expand_dims(hidden_error, axis=1)
output_pd = np.expand_dims(hidden_layer_outputs, axis=2) * np.expand_dims(output_error, axis=1)

In [59]:
input_layer_outputs[:, :, np.newaxis]

array([[[1.],
        [0.],
        [0.],
        [1.]],

       [[1.],
        [0.],
        [1.],
        [1.]],

       [[1.],
        [1.],
        [0.],
        [0.]],

       [[1.],
        [1.],
        [1.],
        [0.]],

       [[1.],
        [1.],
        [0.],
        [1.]],

       [[1.],
        [1.],
        [1.],
        [1.]]])

In [60]:
hidden_error[: , np.newaxis, :]

array([[[-0.09140063,  0.10172057, -0.03826237]],

       [[-0.27049601,  0.34022229, -0.11792622]],

       [[-0.09402216,  0.11095475, -0.0224644 ]],

       [[-0.22523591,  0.31450266, -0.05904429]],

       [[-0.27129936,  0.30247315, -0.08150589]],

       [[-0.07373499,  0.09246075, -0.02420144]]])

In [61]:
hidden_pd

array([[[-0.09140063,  0.10172057, -0.03826237],
        [-0.        ,  0.        , -0.        ],
        [-0.        ,  0.        , -0.        ],
        [-0.09140063,  0.10172057, -0.03826237]],

       [[-0.27049601,  0.34022229, -0.11792622],
        [-0.        ,  0.        , -0.        ],
        [-0.27049601,  0.34022229, -0.11792622],
        [-0.27049601,  0.34022229, -0.11792622]],

       [[-0.09402216,  0.11095475, -0.0224644 ],
        [-0.09402216,  0.11095475, -0.0224644 ],
        [-0.        ,  0.        , -0.        ],
        [-0.        ,  0.        , -0.        ]],

       [[-0.22523591,  0.31450266, -0.05904429],
        [-0.22523591,  0.31450266, -0.05904429],
        [-0.22523591,  0.31450266, -0.05904429],
        [-0.        ,  0.        , -0.        ]],

       [[-0.27129936,  0.30247315, -0.08150589],
        [-0.27129936,  0.30247315, -0.08150589],
        [-0.        ,  0.        , -0.        ],
        [-0.27129936,  0.30247315, -0.08150589]],

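       [[-0.07373499,  0.09246075, -0.02420144],
        [-0.07373499,  0.09246075, -0.02420144],
        [-0.07373499,  0.09246075, -0.02420144],
        [-0.07373499,  0.09246075, -0.02420144]]])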

In [62]:
hidden_layer_outputs[:, :, np.newaxis] 

array([[[1.        ],
        [0.48593976],
        [0.55850479],
        [0.35017553]],

       [[1.        ],
        [0.33769372],
        [0.47776898],
        [0.30551335]],

       [[1.        ],
        [0.37048001],
        [0.42325833],
        [0.14119021]],

       [[1.        ],
        [0.24094765],
        [0.34671949],
        [0.11832889]],

       [[1.        ],
        [0.39188035],
        [0.38023116],
        [0.19306695]],

       [[1.        ],
        [0.2579314 ],
        [0.30732694],
        [0.16340403]]])

In [63]:
output_error[:, np.newaxis, :]

array([[[-0.46055212]],

       [[-1.52231684]],

       [[-0.5074368 ]],

       [[-1.55013407]],

       [[-1.43295411]],

       [[-0.4848991 ]]])

In [64]:
output_pd

array([[[-0.46055212],
        [-0.22380058],
        [-0.25722056],
        [-0.16127408]],

       [[-1.52231684],
        [-0.51407684],
        [-0.72731577],
        [-0.46508812]],

       [[-0.5074368 ],
        [-0.18799519],
        [-0.21477685],
        [-0.07164511]],

       [[-1.55013407],
        [-0.37350116],
        [-0.5374617 ],
        [-0.18342564]],

       [[-1.43295411],
        [-0.56154656],
        [-0.5448538 ],
        [-0.27665608]],

       [[-0.4848991 ],
        [-0.1250707 ],
        [-0.14902256],
        [-0.07923447]]])

In [65]:
# collapse the per-example derivatives into one gradient per weight by
# taking the mean over the example axis (axis 0)
total_hidden_gradient = hidden_pd.mean(axis=0)
total_output_gradient = output_pd.mean(axis=0)

In [66]:
total_hidden_gradient

array([[-0.17103151,  0.21038903, -0.0572341 ],
       [-0.1107154 ,  0.13673188, -0.03120267],
       [-0.09491115,  0.12453095, -0.03352866],
       [-0.11782183,  0.13947946, -0.04364932]])

In [67]:
total_output_gradient

array([[-0.99304884],
       [-0.33099851],
       [-0.40510854],
       [-0.20622058]])

In [68]:
# current output_weights
# array([[-0.47419296],
#        [ 0.79446351],
#        [-0.89572996],
#        [ 0.3650995 ]])

In [69]:
# gradient-descent step: shift each weight matrix in place against its
# gradient, scaled by the learning rate
hidden_weights -= alpha * total_hidden_gradient
output_weights -= alpha * total_output_gradient

In [70]:
hidden_weights

array([[-0.12989569,  0.39321255, -0.98778009],
       [-0.37208774, -0.73733641, -0.80881567],
       [-0.60784183, -0.33653186, -0.19955887],
       [ 0.10252521, -0.19310338,  0.37960071]])

In [71]:
output_weights

array([[-0.37488808],
       [ 0.82756337],
       [-0.85521911],
       [ 0.38572156]])

In [72]:
# show the network outputs for the rows of X; output_layer_outputs still holds
# the values from this section's forward phase, computed before the weight
# update above
report_template = "Output After Training: \n{}"
print(report_template.format(output_layer_outputs))

Output After Training: 
[[-0.46055212]
 [-0.52231684]
 [-0.5074368 ]
 [-0.55013407]
 [-0.43295411]
 [-0.4848991 ]]
