In [1]:
import numpy as np

In [26]:
X = np.array([ [0,0,1],[0,1,1],[1,0,1],[1,1,1] ])
y = np.array([[0,1,1,0]]).T


In [4]:
X

array([[0, 0, 1],
       [0, 1, 1],
       [1, 0, 1],
       [1, 1, 1]])

In [27]:
syn0 = 2*np.random.random((3,4)) - 1
syn1 = 2*np.random.random((4,1)) - 1


In [6]:
syn0

array([[ 0.87560397,  0.97440574, -0.02816065, -0.10306781],
       [-0.77211275,  0.5681985 , -0.88812913, -0.8911695 ],
       [-0.28865131, -0.01255446,  0.12598176,  0.24277991]])

In [7]:
syn1

array([[ 0.72607692],
       [-0.73595849],
       [ 0.85218147],
       [ 0.67803997]])

In [28]:
for j in xrange(60000):
    l1 = 1/(1+np.exp(-(np.dot(X,syn0))))
    l2 = 1/(1+np.exp(-(np.dot(l1,syn1))))
    l2_delta = (y - l2)*(l2*(1-l2))
    l1_delta = l2_delta.dot(syn1.T) * (l1 * (1-l1))
    syn1 += l1.T.dot(l2_delta)
    syn0 += X.T.dot(l1_delta)

In [2]:
def nonlin(x,deriv=False):
    if(deriv == True):
        return x*(1-x) #x is sigmoid output itself s(x)*(1-s(x))
        #If the sigmoid's output is a variable "out", then the derivative is simply out * (1-out)
    return 1/(1+np.exp(-x))

In [3]:
X = np.array([[0,0,1],
              [0,1,1],
              [1,0,1],
              [1,1,1] ])

In [4]:
nonlin(X)

array([[ 0.5       ,  0.5       ,  0.73105858],
       [ 0.5       ,  0.73105858,  0.73105858],
       [ 0.73105858,  0.5       ,  0.73105858],
       [ 0.73105858,  0.73105858,  0.73105858]])

In [11]:
y = np.array([[0,0,1,1]]).T

In [12]:
np.random.seed(1)

In [13]:
syn0 = 2*np.random.random((3,1)) - 1
#it's a good idea to have a mean of zero in weight initialization. 

In [14]:
syn0

array([[-0.16595599],
       [ 0.44064899],
       [-0.99977125]])

In [23]:
for iter in xrange(10000):
    #forward propogation
    l0 = X
    l1 = nonlin(np.dot(l0,syn0)) #if I add new layer this output will act as input and will also be a new feature
    
    # how much did we miss?
    l1_error = y - l1
    
    # multiply how much we missed by the
    # slope of the sigmoid at the values in l1
    #The Error Weighted Derivative
    #Weight the error with derivative of predicted values to improve weights
    #Better ways available
    l1_delta = l1_error * nonlin(l1,True)
    
    #update weights
    #Transpose Data 3x4 and dot with l1_delta 4x1 to get new weights
    #back propogate error , update weight per observation
    #It computes the weight updates for each weight for each training example, sums them, and updates the weights, all in a simple line
    syn0 += np.dot(l0.T,l1_delta)
    
print "Output After Training:"
print l1  
    
    

Output After Training:
[[ 0.00274054]
 [ 0.00223536]
 [ 0.99817573]
 [ 0.99776324]]


#### What the training below is going to do is amplify that correlation. It's both going to update syn1 to map it to the output, and update syn0 to be better at producing it from the input!
##### Adding layers to neural network creates more features which are correlated to output or feature combined together produce output which is highly correlated to output

In [29]:
X = np.array([[0,0,1],
[0,1,1],
[1,0,1],
[1,1,1]])

In [30]:
y = np.array([[0],
[1],
[1],
[0]])

In [31]:
np.random.seed(1)

In [32]:
# randomly initialize our weights with mean 0
syn0 = 2*np.random.random((3,4)) - 1 #4 set of weights for 3 input features as next layer has 4 neurons, this 4 neuron become new kind of feature
#, i can also create 5,6, this will in turn generae a whole new data/hidden features for processing
syn1 = 2*np.random.random((4,1)) - 1 #can increases this set of weights too but have to add next layer as output is a vector so last layer should have one set of weights, here 4 weights are given as output of previous layer has 4 hidden feature nothing to do with 4 observations

In [None]:
for j in xrange(60000):

    # Feed forward through layers 0, 1, and 2
    l0 = X
    l1 = nonlin(np.dot(l0,syn0))
    l2 = nonlin(np.dot(l1,syn1))

    # how much did we miss the target value?
    l2_error = y - l2
    
    if (j% 10000) == 0:
        print "Error:" + str(np.mean(np.abs(l2_error)))
        
    # in what direction is the target value?
    # were we really sure? if so, don't change too much. thats why derivative it will calculate the delta to change the weight
    l2_delta = l2_error*nonlin(l2,deriv=True)

    # how much did each l1 value contribute to the l2 error (according to the weights)?
    #back propogation
    l1_error = l2_delta.dot(syn1.T)#4x1 . 1x4 = 4x4 propogate error for previous layer
    
    # in what direction is the target l1?
    # were we really sure? if so, don't change too much.
    l1_delta = l1_error * nonlin(l1,deriv=True)

    syn1 += l1.T.dot(l2_delta)
    syn0 += l0.T.dot(l1_delta)


#### Line 43: uses the "confidence weighted error" from l2 to establish an error for l1. To do this, it simply sends the error across the weights from l2 to l1. This gives what you could call a "contribution weighted error" because we learn how much each node value in l1 "contributed" to the error in l2. This step is called "backpropagating" and is the namesake of the algorithm. We then update syn0 using the same steps we did in the 2 layer implementation.