## Neural Networks from Scratch
### Chapter 9

Below is an example of a single neuron with 3 inputs, showing the full forward pass through the neuron followed by a ReLU activation function.

In [1]:
# Three input values, one weight per input, and a single bias for the neuron.
x = [1.0, -2.0, 3.0]
w = [-3.0, -1.0, 2.0]
b = 1.0

In [6]:
# Multiply every input by its matching weight (element-wise products).
xw0, xw1, xw2 = (xi * wi for xi, wi in zip(x, w))
print(xw0, xw1, xw2, b)

-3.0 2.0 6.0 1.0


In [5]:
# Neuron pre-activation: the weighted inputs added together, plus the bias.
z = xw0 + xw1 + xw2 + b
print(z)

6.0


In [7]:
# ReLU activation: a negative pre-activation is replaced by 0.
y = max(z, 0)
print(y)

6.0


The full forward pass can be treated as one big function, as specified below:

$$
ReLU\left(\sum_{i=0}^{2} x_{i}w_{i} + b\right)
$$

We want to know how to adjust each weight $w_{i}$ so that the cost function $C$ is minimized. For that, we can use a gradient descent algorithm, which requires the derivative of the function above with respect to $w_{i}$. By the chain rule, that derivative is:

$$
\frac{\partial}{\partial w_{i}}\left[ReLU\left(\sum_{j=0}^{2} x_{j}w_{j} + b\right)\right] = \frac{\partial ReLU\left(\sum_{j=0}^{2} x_{j}w_{j} + b\right)}{\partial \left(\sum_{j=0}^{2} x_{j}w_{j} + b\right)} \cdot \frac{\partial \left(\sum_{j=0}^{2} x_{j}w_{j} + b\right)}{\partial (x_{i}w_{i})} \cdot \frac{\partial (x_{i}w_{i})}{\partial w_{i}}
$$

Let's assume that our neuron receives a gradient of 1 from the next layer.

In [10]:
# Gradient assumed to arrive from the next layer.
dvalue = 1

The derivative of ReLU with respect to its input $z$ is `1. if z > 0 else 0.`. By the chain rule, we multiply it by the gradient received from the next layer.

In [11]:
# Chain rule: incoming gradient times the ReLU derivative (1 where z > 0, else 0).
drelu_dz = dvalue * (1. if z > 0 else 0.)

In [12]:
# Show the gradient that flows back through the ReLU to z.
print(drelu_dz)

1.0


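Next, we compute the gradient with respect to each weighted input $x_{i}w_{i}$ and the bias $b$. The partial derivative of a sum with respect to any of its addends is 1: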

In [14]:
# The partial derivative of the sum with respect to each addend
# (every x_i * w_i product and the bias) is 1.
dsum_dxw0 = dsum_dxw1 = dsum_dxw2 = dsum_db = 1

# Chain rule: scale each local derivative by the gradient coming back
# through the ReLU.
drelu_dxw0, drelu_dxw1, drelu_dxw2, drelu_db = (
    drelu_dz * dsum for dsum in (dsum_dxw0, dsum_dxw1, dsum_dxw2, dsum_db)
)

print(drelu_dxw0, drelu_dxw1, drelu_dxw2, drelu_db)

1.0 1.0 1.0 1.0


Finally, we compute the gradients with respect to the inputs and the weights:

In [15]:
# d(x_i * w_i) / dx_i = w_i
dmul_dx0, dmul_dx1, dmul_dx2 = w

# d(x_i * w_i) / dw_i = x_i
dmul_dw0, dmul_dw1, dmul_dw2 = x

# Chain rule for the inputs.
drelu_dx0, drelu_dx1, drelu_dx2 = (
    drelu_dxw0 * dmul_dx0,
    drelu_dxw1 * dmul_dx1,
    drelu_dxw2 * dmul_dx2,
)

# Chain rule for the weights.
drelu_dw0, drelu_dw1, drelu_dw2 = (
    drelu_dxw0 * dmul_dw0,
    drelu_dxw1 * dmul_dw1,
    drelu_dxw2 * dmul_dw2,
)

print(drelu_dx0, drelu_dw0, drelu_dx1, drelu_dw1, drelu_dx2, drelu_dw2)

-3.0 1.0 -1.0 -2.0 2.0 3.0


Here are our gradients

In [16]:
# Collect the per-element gradients into one list per parameter group.
dx = [drelu_dx0, drelu_dx1, drelu_dx2]  # gradients with respect to the inputs
dw = [drelu_dw0, drelu_dw1, drelu_dw2]  # gradients with respect to the weights
db = drelu_db  # gradient with respect to the bias

Applying a small fraction (0.001) of the negative gradients to our weights and bias:

In [18]:
# Weights and bias before the update step.
print(*w, b)

-3.0 -1.0 2.0 1.0


In [19]:
# One gradient-descent step: move every parameter against its gradient,
# scaled by 0.001. The list is updated in place (slice assignment).
# Note: re-running this cell applies the step again.
w[:] = [weight + -0.001 * grad for weight, grad in zip(w, dw)]
b += -0.001 * db

In [20]:
# Weights and bias after the update step.
print(*w, b)

-3.001 -0.998 1.997 0.999


And here is another forward pass

In [22]:
# Forward pass repeated with the updated weights and bias.
xw0, xw1, xw2 = (xi * wi for xi, wi in zip(x, w))
print(xw0, xw1, xw2, b)

# Pre-activation of the neuron.
z = xw0 + xw1 + xw2 + b
print(z)

# ReLU activation.
y = max(z, 0)
print(y)

-3.001 1.996 5.9910000000000005 0.999
5.985
5.985


## Backpropagation through a layer of neurons

In [35]:
import numpy as np

# Gradient arriving from the next layer: one value per neuron (3 neurons).
dvalues = np.ones(3)

# One row of 4 weights per neuron, transposed so the array has shape
# (4 inputs, 3 neurons).
weights = np.transpose(
    np.array(
        [
            [0.2, 0.8, -0.5, 1],
            [0.5, -0.91, 0.26, -0.5],
            [-0.26, -0.27, 0.17, 0.87],
        ]
    )
)

In [36]:
# Gradient with respect to input i: every neuron's weight for that input,
# multiplied by that neuron's own incoming gradient, then summed over the
# neurons. (Multiplying the plain weight sum by dvalues[0] is only correct
# when all incoming gradients are equal.)
dx0 = sum(weights[0] * dvalues)
dx1 = sum(weights[1] * dvalues)
dx2 = sum(weights[2] * dvalues)
dx3 = sum(weights[3] * dvalues)

In [38]:
# Gather the four per-input gradients into a single array.
dinputs = np.array([dx0, dx1, dx2, dx3])

In [39]:
# Show the gradient with respect to the inputs.
print(dinputs)

[ 0.44 -0.38 -0.07  1.37]


In [42]:
# Same input gradient as above, computed with a single dot product.
# dvalues is now 2-D (one row per sample); row 0 is the only sample here.
dvalues = np.array([[1.,1.,1.]])
d_inputs = np.dot(dvalues[0], weights.T)
d_inputs

array([ 0.44, -0.38, -0.07,  1.37])

In [45]:
# Batch version: three rows of incoming gradients, one per sample.
# The dot product yields one row of input gradients per sample.
dvalues = np.array([[1.,1.,1.], [2.,2.,2.], [3.,3.,3.]])
d_inputs = np.dot(dvalues, weights.T)
d_inputs

array([[ 0.44, -0.38, -0.07,  1.37],
       [ 0.88, -0.76, -0.14,  2.74],
       [ 1.32, -1.14, -0.21,  4.11]])

In [46]:
import numpy as np

# Incoming gradients: one row per sample, one column per neuron.
dvalues = np.array(
    [
        [1., 1., 1.],
        [2., 2., 2.],
        [3., 3., 3.],
    ]
)

# Three samples with four input features each.
inputs = np.array(
    [
        [1, 2, 3, 2.5],
        [2., 5., -1., 2],
        [-1.5, 2.7, 3.3, -0.8],
    ]
)

# Gradient with respect to the weights: transposed inputs times incoming
# gradients, giving shape (4 inputs, 3 neurons).
dweights = inputs.T.dot(dvalues)
print(dweights)

[[ 0.5  0.5  0.5]
 [20.1 20.1 20.1]
 [10.9 10.9 10.9]
 [ 4.1  4.1  4.1]]


In [48]:
import numpy as np

# Incoming gradients: one row per sample, one column per neuron.
dvalues = np.array(
    [
        [1., 1., 1.],
        [2., 2., 2.],
        [3., 3., 3.],
    ]
)

# One bias per neuron, kept as a row vector (not used by the gradient below).
biases = np.array([[2, 3, 0.5]])

# Gradient with respect to the biases: sum the incoming gradients over the
# samples (axis 0), keeping the row-vector shape.
dbiases = dvalues.sum(axis=0, keepdims=True)
print(dbiases)

[[6. 6. 6.]]


In [49]:
import numpy as np
# Example pre-activation values of a layer (3 samples, 4 neurons).
z = np.array(
    [
        [1, 2, -3, -4],
        [2, -7, -1, 3],
        [-1, 2, 5, -1],
    ]
)

# Example gradients coming back from the next layer.
dvalues = np.array(
    [
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
    ]
)

# Derivative of ReLU: 1 where the input was positive, 0 elsewhere
# (same integer dtype as z).
drelu = (z > 0).astype(z.dtype)
print(drelu)

# Chain rule: scale the local derivative by the incoming gradient, in place.
drelu *= dvalues
print(drelu)

[[1 1 0 0]
 [1 0 0 1]
 [0 1 1 0]]
[[ 1  2  0  0]
 [ 5  0  0  8]
 [ 0 10 11  0]]


In [59]:
import numpy as np

# Gradient handed back by the next layer. For this example it is simply
# an array of increasing values, one row per sample.
dvalues = np.array(
    [
        [1., 1., 1.],
        [2., 2., 2.],
        [3., 3., 3.],
    ]
)

# Three samples, four input features each.
inputs = np.array(
    [
        [1, 2, 3, 2.5],
        [2., 5., -1., 2],
        [-1.5, 2.7, 3.3, -0.8],
    ]
)

# One set of 4 weights per neuron (3 neurons); stored transposed, so the
# resulting array has shape (4 inputs, 3 neurons).
weights = np.array(
    [
        [0.2, 0.8, -0.5, 1],
        [0.5, -0.91, 0.26, -0.5],
        [-0.26, -0.27, 0.17, 0.87],
    ]
).T

# One bias per neuron, as a row vector of shape (1, neurons).
biases = np.array([[2, 3, 0.5]])

# Forward pass: dense layer followed by ReLU.
layer_outputs = inputs.dot(weights) + biases
relu_outputs = np.maximum(0, layer_outputs)
layer_outputs, relu_outputs

(array([[ 4.8  ,  1.21 ,  2.385],
        [ 8.9  , -1.81 ,  0.2  ],
        [ 1.41 ,  1.051,  0.026]]),
 array([[4.8  , 1.21 , 2.385],
        [8.9  , 0.   , 0.2  ],
        [1.41 , 1.051, 0.026]]))

In [61]:
# Start from a copy of the ReLU outputs and zero every position whose
# pre-activation was not positive.
# NOTE(review): the ReLU outputs stand in for the incoming gradient here;
# `dvalues` defined above is not used. Presumably intentional for this
# demonstration - confirm before reusing.
drelu = relu_outputs.copy()
drelu[layer_outputs <= 0] = 0
drelu

array([[4.8  , 1.21 , 2.385],
       [8.9  , 0.   , 0.2  ],
       [1.41 , 1.051, 0.026]])

In [68]:
# Weighted sums of the layer without the biases.
# NOTE(review): the stored output of this cell does not equal
# layer_outputs - biases from the cells above. Its execution count (68) is
# later than that of the weight-update cell (62), so it appears to have been
# produced after `weights` was modified in place. Re-run the notebook in
# order to refresh it.
np.dot(inputs, weights)

array([[ 2.561562  , -1.8269043 ,  1.83359695],
       [ 6.522304  , -4.8328256 , -0.3403356 ],
       [-0.7261887 , -1.98542257, -0.50381082]])

In [62]:
import numpy as np

# Incoming gradients (defined for completeness; the ReLU outputs are used
# as the back-propagated values below).
dvalues = np.array(
    [
        [1., 1., 1.],
        [2., 2., 2.],
        [3., 3., 3.],
    ]
)

# Three samples, four input features each.
inputs = np.array(
    [
        [1, 2, 3, 2.5],
        [2., 5., -1., 2],
        [-1.5, 2.7, 3.3, -0.8],
    ]
)

# Weights stored transposed: shape (4 inputs, 3 neurons).
weights = np.array(
    [
        [0.2, 0.8, -0.5, 1],
        [0.5, -0.91, 0.26, -0.5],
        [-0.26, -0.27, 0.17, 0.87],
    ]
).T

# One bias per neuron, row vector.
biases = np.array([[2, 3, 0.5]])

# Forward pass: dense layer, then ReLU activation.
layer_outputs = inputs.dot(weights) + biases
relu_outputs = np.maximum(0, layer_outputs)

# Values propagated back through the ReLU: zero wherever the
# pre-activation was not positive, the ReLU output elsewhere.
drelu = np.where(layer_outputs <= 0, 0.0, relu_outputs)

# Dense layer backward pass.
dinputs = drelu.dot(weights.T)                     # gradient w.r.t. inputs
dweights = inputs.T.dot(drelu)                     # gradient w.r.t. weights
dbiases = drelu.sum(axis=0, keepdims=True)         # gradient w.r.t. biases

# Update the parameters in place with a step of 0.001 against the gradient.
weights += -0.001 * dweights
biases += -0.001 * dbiases
print(weights)
print(biases)

[[ 0.179515   0.5003665 -0.262746 ]
 [ 0.742093  -0.9152577 -0.2758402]
 [-0.510153   0.2529017  0.1629592]
 [ 0.971328  -0.5021842  0.8636583]]
[[1.98489  2.997739 0.497389]]


In [67]:
import numpy as np

class LayerDense:
    """Fully connected layer: output = inputs . weights + biases.

    Attributes set by the methods:
        weights:  array of shape (n_inputs, n_neurons).
        biases:   row vector of shape (1, n_neurons).
        inputs:   inputs of the last forward pass, as an ndarray.
        output:   result of the last forward pass.
        dweights, dbiases, dinputs: gradients from the last backward pass.
    """

    def __init__(self, n_inputs, n_neurons):
        """Create small random weights and zero biases.

        Args:
            n_inputs: number of input features per sample.
            n_neurons: number of neurons (outputs) of the layer.
        """
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Compute the layer output for a batch of samples.

        Args:
            inputs: array-like of shape (samples, n_inputs).
        """
        # Keep the inputs as an ndarray: backward() needs `.T`, which a
        # plain list (accepted by the dot product) does not have.
        self.inputs = np.asarray(inputs)
        self.output = np.dot(self.inputs, self.weights) + self.biases

    def backward(self, dvalues):
        """Compute gradients of the parameters and of the inputs.

        Must be called after forward().

        Args:
            dvalues: array-like of shape (samples, n_neurons) holding the
                gradient received from the next layer.
        """
        dvalues = np.asarray(dvalues)
        # Gradients on the parameters.
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        # Gradient on the inputs, passed to the previous layer.
        self.dinputs = np.dot(dvalues, self.weights.T)

class ActivationReLU:
    """Rectified linear activation: output = max(0, input), element-wise."""

    def forward(self, inputs):
        """Apply ReLU to the inputs.

        Args:
            inputs: array-like of pre-activation values.
        """
        # Remember the inputs: backward() needs them to know where the
        # activation was inactive (input <= 0).
        self.inputs = np.asarray(inputs)
        self.output = np.maximum(0, self.inputs)

    def backward(self, dvalues):
        """Propagate the gradient through the ReLU.

        Must be called after forward().

        Args:
            dvalues: gradient received from the next layer, same shape as
                the inputs of the forward pass.
        """
        # Work on a copy so the caller's gradient array is not modified.
        self.dinputs = np.array(dvalues)
        # The derivative is 0 wherever the input was not positive.
        self.dinputs[self.inputs <= 0] = 0

class ActivationSoftmax:
    """Softmax activation applied row-wise (one row per sample)."""

    def forward(self, inputs):
        """Compute the softmax of each row.

        Args:
            inputs: 2-D array-like of shape (samples, classes).
        """
        # Subtract the row maximum before exponentiating so the largest
        # exponent is exp(0) = 1.
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)

    def backward(self, dvalues):
        """Propagate the gradient through the softmax.

        Must be called after forward().

        Args:
            dvalues: array-like of shape (samples, classes) with the
                gradient received from the next layer.
        """
        dvalues = np.asarray(dvalues)
        # Floating-point gradients keep their dtype. Any other dtype (for
        # example integers) would truncate the results on assignment, so
        # the dtype of the softmax output is used instead.
        if np.issubdtype(dvalues.dtype, np.inexact):
            result_dtype = dvalues.dtype
        else:
            result_dtype = self.output.dtype
        self.dinputs = np.empty(dvalues.shape, dtype=result_dtype)

        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):
            # Column vector of this sample's softmax outputs.
            single_output = single_output.reshape(-1, 1)
            # Jacobian of softmax: diag(s) - s . s^T
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)
            # Sample-wise gradient: Jacobian times the incoming gradient.
            self.dinputs[index] = np.dot(jacobian_matrix, single_dvalues)


class Loss:
    """Base class for losses; subclasses provide forward(output, y)."""

    def calculate(self, output, y):
        """Return the mean of the per-sample losses.

        Args:
            output: model output passed on to forward().
            y: target values passed on to forward().

        Returns:
            The mean of the values returned by forward().
        """
        return np.mean(self.forward(output, y))

class LossCategoricalCrossentropy(Loss):
    """Categorical cross-entropy loss for class probabilities."""

    def forward(self, y_pred, y_true):
        """Compute the negative log-likelihood of each sample.

        Args:
            y_pred: array-like of shape (samples, classes) with predicted
                probabilities.
            y_true: either 1-D class indices of shape (samples,) or a 2-D
                one-hot array of shape (samples, classes).

        Returns:
            1-D array with one loss value per sample.

        Raises:
            ValueError: if y_true is neither 1-D nor 2-D.
        """
        # Accept plain lists as labels as well as arrays.
        y_true = np.asarray(y_true)
        n_samples = len(y_pred)
        # Clip away exact 0 and 1 so the logarithm below stays finite.
        y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7)

        if y_true.ndim == 1:
            # Sparse labels: pick the probability of the target class.
            correct_confidences = y_pred_clipped[range(n_samples), y_true]
        elif y_true.ndim == 2:
            # One-hot labels: mask out every class but the target one.
            correct_confidences = np.sum(y_pred_clipped*y_true, axis=1)
        else:
            raise ValueError(
                "y_true must be 1-D (class indices) or 2-D (one-hot), "
                f"got {y_true.ndim} dimensions"
            )

        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods

    def backward(self, dvalues, y_true):
        """Compute the gradient of the loss with respect to its inputs.

        The result is stored in self.dinputs.

        Args:
            dvalues: array-like of shape (samples, classes); the predicted
                probabilities the loss was computed on.
            y_true: 1-D class indices or 2-D one-hot labels.
        """
        dvalues = np.asarray(dvalues)
        y_true = np.asarray(y_true)
        samples = len(dvalues)
        labels = len(dvalues[0])

        # Turn sparse labels into one-hot rows.
        if y_true.ndim == 1:
            y_true = np.eye(labels)[y_true]

        # Calculate gradient
        # NOTE(review): dvalues is not clipped here, so a predicted
        # probability of exactly 0 for a target class divides by zero.
        self.dinputs = -y_true / dvalues
        # Normalize gradient
        self.dinputs = self.dinputs / samples