# Simple neural network with numpy only

Using one hidden layer with relU activation and squared error loss for the output.

In [1]:
import numpy as np

## Initialize variables

In [2]:
# N is batch size; D_in is input dimension; H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

In [3]:
x = np.random.randn(N, D_in)
x.shape

(64, 1000)

In [4]:
y = np.random.randn(N, D_out)
y.shape

(64, 10)

In [5]:
# input -> hidden weights
w1 = np.random.randn(D_in, H)
w1.shape

(1000, 100)

In [6]:
# hidden -> output weights
w2 = np.random.randn(H, D_out)
w2.shape

(100, 10)

In [7]:
learning_rate = 1e-6

## Forward propagation

In [8]:
h = x.dot(w1)
h.shape

(64, 100)

In [9]:
h_relu = np.maximum(h, 0)

In [10]:
ypred = h_relu.dot(w2)
ypred.shape

(64, 10)

In [11]:
loss = np.square(ypred - y).sum()

## Backpropagation

Compute gradient of the loss with respect to prediction:

In [12]:
grad_loss_ypred = 2.0 * (ypred - y)
grad_loss_ypred.shape

(64, 10)

Compute gradient of loss with respect to w2:

In [13]:
# gradient of prediction wrt w2 is just h_relu
grad_ypred_w2 = h_relu
# gradient of loss with respect to w2 (chain rule)
grad_loss_w2 = grad_ypred_w2.T.dot(grad_loss_ypred)
grad_loss_w2.shape

(100, 10)

Compute gradient of loss with respect to h_relu:

In [14]:
# gradient of prediction wrt h_relu is just w2
grad_ypred_hrelu = w2
# gradient of loss with respect to  h_relu (chain rule)
grad_loss_hrelu = grad_loss_ypred.dot(grad_ypred_hrelu.T)
grad_loss_hrelu.shape

(64, 100)

Compute gradient of loss with respect to h (pre-activation):

In [15]:
# set gradient to 0 if h<0 
grad_loss_h = grad_loss_hrelu.copy()
grad_loss_h[h < 0] = 0

Compute gradient of loss with respect to w1:

In [16]:
# gradient of pre-activation wrt w1 is just input
grad_h_w1 = x
# gradient of loss with respect to w1 (chain rule)
grad_loss_w1 = grad_h_w1.T.dot(grad_loss_h)
grad_loss_w1.shape

(1000, 100)

Update weights:

In [17]:
w1 -= learning_rate * grad_loss_w1
w2 -= learning_rate * grad_loss_w2

## Training loop

In [20]:
for t in range(500):
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    ypred = h_relu.dot(w2)
    loss = np.square(ypred - y).sum()
    if t % 10 == 0: print("Iteration: {}\t loss:{}".format(t, loss))
    grad_loss_ypred = 2.0 * (ypred - y)
    grad_ypred_w2 = h_relu
    grad_loss_w2 = grad_ypred_w2.T.dot(grad_loss_ypred)
    grad_ypred_hrelu = w2
    grad_loss_hrelu = grad_loss_ypred.dot(grad_ypred_hrelu.T)
    grad_loss_h = grad_loss_hrelu.copy()
    grad_loss_h[h < 0] = 0
    grad_h_w1 = x
    grad_loss_w1 = grad_h_w1.T.dot(grad_loss_h)
    w1 -= learning_rate * grad_loss_w1
    w2 -= learning_rate * grad_loss_w2

Iteration: 0	 loss:1.759570956100218e-20
Iteration: 10	 loss:1.0625794096369956e-20
Iteration: 20	 loss:6.579967656912601e-21
Iteration: 30	 loss:4.193953250918141e-21
Iteration: 40	 loss:2.7717575574265786e-21
Iteration: 50	 loss:1.8972658167704706e-21
Iteration: 60	 loss:1.3477257802523253e-21
Iteration: 70	 loss:9.860068920803912e-22
Iteration: 80	 loss:7.373461041192204e-22
Iteration: 90	 loss:5.664133259862861e-22
Iteration: 100	 loss:4.452531277218781e-22
Iteration: 110	 loss:3.581374626682208e-22
Iteration: 120	 loss:2.943461586389527e-22
Iteration: 130	 loss:2.4514769288430213e-22
Iteration: 140	 loss:2.0754561366015177e-22
Iteration: 150	 loss:1.7764404564351458e-22
Iteration: 160	 loss:1.549942659058507e-22
Iteration: 170	 loss:1.3620412485312384e-22
Iteration: 180	 loss:1.2104159678200468e-22
Iteration: 190	 loss:1.0848448275990592e-22
Iteration: 200	 loss:9.708521201586209e-23
Iteration: 210	 loss:8.915654197952746e-23
Iteration: 220	 loss:8.163680330612299e-23
Iteration: 2