In [11]:
"""
Simplistic implementation of the two-layer neural network.
Training method is stochastic (online) gradient descent with momentum.
As an example it computes XOR for given input.
Some details:
- tanh activation for hidden layer
- sigmoid activation for output layer """
#dependencies
import numpy as np

In [12]:
# variables
n_hidden=10  # neurons in 1 hidden layer 
n_input=10   # input layer 
n_output=10   # output layer
n_sample=300  # no. of sample

# hyperparameter
learning_rate=0.01
momentum=0.9

# seed
np.random.seed(0)

In [13]:
def sigmoid(x):          # activation function 1
    return 1/(1+np.exp(-x))

In [14]:
def tanh_prime(x):      # activation function 2
    return 1- np.tanh(x)**2

In [15]:
# parameters accepted are data input,data target,layer1,layer2,bias layer 1 , bias layer 2
def train(x,t,L1,L2,b1,b2):
    # forward propogation
    a=np.dot(x,L1)+b1    # layer 1
    z=np.tanh(a)
    b=np.dot(z,L2)+b2    #layer 2
    y=sigmoid(b)
    # backward propoagation
    el2=y-t         # error in L2 layer 
    el1=tanh_prime(a)*np.dot(el2,L2)   #error in L1 layer
    dl2=np.outer(z,el2)                # differentiation of error in L2
    dl1=np.outer(x,el1)                # of L1
    # here we are doing classification and thus we use cross entropy as are error/ cost function
    loss=-np.mean ( t * np.log(y) + (1 - t) * np.log(1 - y) )
    return loss,(dl1,dl2,el1,el2)

In [16]:
def predict(x,L1,L2,b1,b2):
    a = np.dot(x, L1) + b1             #passing through 1st hidden layer
    b = np.dot(np.tanh(a), L2) + b2    #passing through 2nd hidden layer
    return (sigmoid(b)>0.5).astype(int)

In [25]:
# assigning initial values to the parameters 
L1 = np.random.normal(scale=0.1, size=(n_input, n_hidden))           #layer 1 
L2 = np.random.normal(scale=0.1, size=(n_hidden, n_output))          #layer 2

b1=np.zeros(n_hidden)          # bias for layer 1
b2=np.zeros(n_output)          # bias for layer 2

# for our convience
parameter=[L1,L2,b1,b2]

# generating random dataset
x=np.random.binomial(1,0.5,(n_sample,n_input))
t=x^1


# now training the dataset on the model created
for epoch in range(100):
    err = []
    upd = [0]*len(parameter)

    for i in range(x.shape[0]):
        loss, grad = train(x[i], t[i], *parameter)

        for j in range(len(parameter)):
            parameter[j] -= upd[j]

        for j in range(len(parameter)):
            upd[j] = learning_rate * grad[j] + momentum * upd[j]

        err.append( loss )

    print("epochs:\t",epoch,"\nloss:\t",np.mean(err))


# testing our model on the test dataset
x = np.random.binomial(1, 0.5, n_input)
print ("XOR prediction:",x)
print (predict(x, *parameter))

epochs:	 0 
loss:	 0.479751776883375
epochs:	 1 
loss:	 0.12218161840381417
epochs:	 2 
loss:	 0.053902778762558716
epochs:	 3 
loss:	 0.031164888095424104
epochs:	 4 
loss:	 0.02173115005649793
epochs:	 5 
loss:	 0.016646473795091963
epochs:	 6 
loss:	 0.013470905246095477
epochs:	 7 
loss:	 0.011289144199811717
epochs:	 8 
loss:	 0.009693600920549758
epochs:	 9 
loss:	 0.008475037117921467
epochs:	 10 
loss:	 0.007514050407517927
epochs:	 11 
loss:	 0.006737139143237239
epochs:	 12 
loss:	 0.0060964508219048886
epochs:	 13 
loss:	 0.0055594476591042195
epochs:	 14 
loss:	 0.00510321268319299
epochs:	 15 
loss:	 0.004711124795634085
epochs:	 16 
loss:	 0.0043708261564136845
epochs:	 17 
loss:	 0.004072931620825234
epochs:	 18 
loss:	 0.0038101822197661285
epochs:	 19 
loss:	 0.003576873798301845
epochs:	 20 
loss:	 0.003368461471862667
epochs:	 21 
loss:	 0.003181279560158236
epochs:	 22 
loss:	 0.0030123392783200234
epochs:	 23 
loss:	 0.002859179976181966
epochs:	 24 
loss:	 0.00271