In [None]:
import numpy as np

# Inputs of the XOR problem: one row per sample, one column per input feature.
X = np.array([
    [0, 0],
    [0, 1],
    [1, 0],
    [1, 1],
])

# Targets as a column vector so they line up row-for-row with X.
y = np.atleast_2d([0, 1, 1, 0]).T

print('X.shape:', X.shape)
print('y.shape:', y.shape)

# Seed NumPy's global generator so the random weight initialisation (and thus
# every number printed below) is reproducible on Restart & Run All.
# Change SEED to explore a different initialisation.
SEED = 42
np.random.seed(SEED)

# Units per layer, input first and output last.
# [2, 2, 1] will also work for the XOR problem presented
LAYERS = [2, 2, 2, 1]
# Learning rate applied to every parameter update.
ETA = .1
# One weight matrix per pair of consecutive layers; filled by initialize_parameters().
THETA = []

# activation, its derivative, and the cost function

In [None]:
def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, applied element-wise.

    Parameters
    ----------
    x : scalar or numpy.ndarray
        Pre-activation value(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Values in [0, 1] with the same shape as ``x``.
    """
    # logaddexp(0, -x) is log(1 + exp(-x)) computed without ever forming
    # exp(-x) directly, so large negative inputs no longer overflow to inf
    # (and no longer emit a RuntimeWarning); exp(-log(1 + exp(-x))) is the sigmoid.
    return np.exp(-np.logaddexp(0, -x))

def sigmoid_prime(x):
    """Derivative of the sigmoid at ``x``: ``sigmoid(x) * (1 - sigmoid(x))``."""
    activation = sigmoid(x)
    return activation * (1 - activation)

def cost(y_hat, y):
    """Mean squared error between predictions and targets.

    Parameters
    ----------
    y_hat : array_like
        Predicted values.
    y : array_like
        Target values, broadcastable against ``y_hat``.

    Returns
    -------
    numpy.float64
        Mean of the squared differences taken over every element.
    """
    # Vectorised: no Python-level loop, and scalars / plain lists are accepted too.
    residual = np.asarray(y_hat) - np.asarray(y)
    return np.mean(np.square(residual))

# forward propagation

In [None]:
def initialize_parameters():
    """Fill the global ``THETA`` with one random weight matrix per layer transition.

    For consecutive layer sizes ``(n_in, n_out)`` in ``LAYERS`` the matrix has
    shape ``(n_out, n_in + 1)``; the extra column holds the bias weights.
    Entries are drawn with ``np.random.rand``.

    Any matrices already in ``THETA`` are discarded, so calling this function
    repeatedly always leaves exactly ``len(LAYERS) - 1`` matrices instead of
    appending a second network after the first.
    """
    # Slice assignment replaces the contents of the existing list object in
    # place, so no ``global`` rebinding is needed.
    THETA[:] = [
        np.random.rand(n_out, n_in + 1)
        for n_in, n_out in zip(LAYERS[:-1], LAYERS[1:])
    ]

In [None]:
def forward_propagation(X, initialize=True):
    """Run the network forward over every row of ``X`` using the global ``THETA``.

    Parameters
    ----------
    X : numpy.ndarray
        2-D input, one sample per row.
    initialize : bool, default True
        When true, ``initialize_parameters()`` is called before the pass.

    Returns
    -------
    A : list of numpy.ndarray
        ``A[0]`` is ``X`` with a leading column of ones; each following entry
        is the sigmoid output of a layer with a leading column of ones, except
        the last entry, which has that column removed.
    Z : list of numpy.ndarray
        Pre-activation values, one entry per matrix in ``THETA``.
    y_hat : numpy.ndarray
        The network output (the same object as ``A[-1]``).
    """
    if initialize:
        initialize_parameters()

    # Every layer sees the same number of rows, so one bias column serves all
    # of them; hstack copies it, so sharing the array is safe.
    bias = np.ones((X.shape[0], 1))

    # adding bias column to the input X
    A = [np.hstack((bias, X))]
    Z = []
    for theta in THETA:
        Z.append(np.matmul(A[-1], theta.T))
        # adding bias column to the output of previous layer
        A.append(np.hstack((bias, sigmoid(Z[-1]))))

    # bias is not needed in the final output
    A[-1] = A[-1][:, 1:]
    y_hat = A[-1]
    return A, Z, y_hat

In [None]:
# Start from an empty parameter list, then run one forward pass that also
# initialises the weights, and report the shape of everything it produced.
THETA = []
A, Z, y_hat = forward_propagation(X, initialize=True)
print('THETA=> \t', [weights.shape for weights in THETA])
print("A=> \t\t", [activation.shape for activation in A])
print("Z=> \t\t", [pre_activation.shape for pre_activation in Z])
print("y_hat=> \t", y_hat.shape)

# backward propagation

In [None]:
def back_propagation(X, y, initialize=True, debug=False, verbose=False):
    """Run one forward pass and one gradient-descent update of the global ``THETA``.

    Parameters
    ----------
    X : numpy.ndarray
        Inputs, one sample per row.
    y : numpy.ndarray
        Targets, compared element-wise against the network output.
    initialize : bool, default True
        Passed through to ``forward_propagation``.
    debug : bool, default False
        When true, the intermediate values are returned.
    verbose : bool, default False
        When true, ``cost(y_hat, y)`` is printed (the cost before the update).

    Returns
    -------
    tuple or None
        ``(A, Z, y_hat, del_, del_theta)`` when ``debug`` is true, else ``None``.
        ``del_`` holds one delta per weight matrix; every delta except the last
        carries a leading bias column.  ``del_theta`` holds the increments that
        were added to ``THETA`` (they already include the ``-ETA`` factor).
    """
    # run a forward pass
    A, Z, y_hat = forward_propagation(X, initialize)

    # delta at the final output; this one has no bias column
    del_ = [(y_hat - y) * sigmoid_prime(Z[-1])]
    if verbose:
        print(cost(y_hat, y))

    # Walk the weight matrices from last to second, pushing delta backwards.
    output_idx = len(THETA) - 1
    for idx in range(output_idx, 0, -1):
        # Only the output delta lacks a bias column; every other delta has one
        # that must be dropped before it is propagated further.
        downstream = del_[0] if idx == output_idx else del_[0][:, 1:]
        propagated = np.matmul(downstream, THETA[idx])
        # the column of ones lines up with the bias column of THETA[idx]
        padded_slope = np.hstack(
            (np.ones((Z[idx-1].shape[0], 1)), sigmoid_prime(Z[idx-1]))
        )
        del_.insert(0, propagated * padded_slope)

    # Parameter increments: the bias column of a hidden delta is stripped so
    # the result matches the shape of the corresponding weight matrix.
    top = len(del_) - 1
    del_theta = [
        -ETA * np.matmul((del_[k] if k == top else del_[k][:, 1:]).T, A[k])
        for k in range(len(del_))
    ]

    # update parameters
    for idx in range(len(THETA)):
        # asserting that the matrix sizes are same
        assert THETA[idx].shape == del_theta[idx].shape
        THETA[idx] = THETA[idx] + del_theta[idx]

    if debug:
        return (A, Z, y_hat, del_, del_theta)


In [None]:
# Fresh parameters, then a single debug-mode backprop step so the intermediate
# values can be inspected.  Note that back_propagation has already added
# del_theta to THETA by the time it returns.
THETA = []

A, Z, y_hat, del_, del_theta = back_propagation(
    X, y, initialize=True, debug=True, verbose=True
)
print('THETA=> \t', [weights.shape for weights in THETA])
print("A=> \t\t", [activation.shape for activation in A])
print("Z=> \t\t", [pre_activation.shape for pre_activation in Z])
print("y_hat=> \t", [y_hat.shape])
print("del_=> \t\t", [delta.shape for delta in del_])
print('del_theta=> \t', [step.shape for step in del_theta])

# gradient checking

In [None]:
# Perturbation added to / subtracted from a single weight when estimating the
# gradient by central finite differences.
epsilon = 0.01
# Which entry of THETA / del_theta is checked; 0 is the matrix applied to the input.
idx = 0

In [None]:
# Convert the backprop increment into the gradient of cost().
# del_theta is -ETA * (sum over samples of delta * activation), while cost()
# is the mean of the squared errors, whose gradient is (2 / y.size) times that
# same sum.  The original divisor -2*ETA was only right because y.size == 4.
print(del_theta[idx] * 2 / (-ETA * y.size))

In [None]:
# Central finite-difference estimate of d cost / d THETA[idx], one weight at a time.
# NOTE: this is evaluated at the current THETA, i.e. after back_propagation has
# already applied its update, whereas del_theta was computed just before that
# update, so the two gradients are expected to agree only approximately.
theta_checked = THETA[idx]  # same array object: writes below are seen by forward_propagation
grad_check = np.zeros(theta_checked.shape)
for i in range(theta_checked.shape[0]):
    for j in range(theta_checked.shape[1]):
        # Remember the exact weight so it can be restored bit-for-bit;
        # undoing "+eps, -2*eps" with "+eps" would leave rounding drift behind.
        original_weight = theta_checked[i, j]

        theta_checked[i, j] = original_weight + epsilon
        J_plus_epsilon = cost(forward_propagation(X, initialize=False)[2], y)

        theta_checked[i, j] = original_weight - epsilon
        J_minus_epsilon = cost(forward_propagation(X, initialize=False)[2], y)

        grad_check[i, j] = (J_plus_epsilon - J_minus_epsilon) / (2 * epsilon)
        theta_checked[i, j] = original_weight
print(grad_check)

In [None]:
# Side-by-side comparison of the analytic and numerical gradients.
# del_theta is -ETA * (sum over samples of delta * activation); cost() is a
# mean of squared errors, so its gradient is (2 / y.size) times that sum.
# The original divisor -2*ETA matched only because y.size == 4.
print("from backprop:")
print(del_theta[idx] * 2 / (-ETA * y.size))
print("from grad check:")
print(grad_check)

# training

In [None]:
# Number of gradient-descent steps and how often the cost is printed.
N_ITERATIONS = 50000
LOG_EVERY = 10000

# Train from scratch: parameters are initialised on the first step only, and
# the cost is reported on step 0 and then every LOG_EVERY steps.
THETA = []
for i in range(N_ITERATIONS):
    back_propagation(
        X,
        y,
        initialize=(i == 0),
        debug=False,
        verbose=(i % LOG_EVERY == 0),
    )

# inference

In [None]:
# Forward pass with the current THETA; initialize=False keeps the existing weights.
A, Z, y_hat = forward_propagation(X, initialize=False)

In [None]:
# Display the network output for each row of X (the targets are [0, 1, 1, 0]).
y_hat