# Implementing a deep neural network with NumPy

In [1]:
import numpy as np

The magic behind deep neural nets is backpropagation.
To show that simple backprop is not rocket science, we are going to build a small neural network (one hidden layer) with NumPy alone, implementing backpropagation ourselves.



In [4]:
# define basic dimensions of our network
n = 64                # number of samples (rows of X and y)
num_features = 1000   # input dimension
hidden_dim = 100      # number of units in the hidden layer
output_dim = 10       # output dimension

# hyperparameters
learning_rate = 1e-6  # step size of each gradient-descent update
num_epochs = 500      # number of passes over the full data set

# reproducibility: the data and the weights below are drawn from NumPy's
# global random generator; seeding it here makes a fresh
# "Restart Kernel -> Run All" produce the same numbers every time
random_seed = 0
np.random.seed(random_seed)

In [5]:
# Synthetic regression problem: inputs and targets are independent
# standard-normal draws (inputs are drawn first, then targets).
X = np.random.standard_normal(size=(n, num_features))   # (n, num_features)
y = np.random.standard_normal(size=(n, output_dim))     # (n, output_dim)
(X.shape, y.shape)

((64, 1000), (64, 10))

In [7]:
# Random standard-normal starting weights for the two layers.
W1 = np.random.standard_normal(size=(num_features, hidden_dim))  # input  -> hidden, (num_features, hidden_dim)
W2 = np.random.standard_normal(size=(hidden_dim, output_dim))    # hidden -> output, (hidden_dim, output_dim)

In [8]:
# Peek at the initial first-layer weights (NumPy abbreviates large arrays with "...").
W1

array([[ 0.36132714,  1.22576587, -0.05194421, ..., -0.36190339,
        -0.43697589, -0.31445682],
       [ 1.71998368,  0.85543911,  0.05228614, ..., -0.16534124,
        -2.70286056,  0.12855479],
       [-0.85330732,  1.17836695,  1.83711836, ..., -0.38646043,
        -0.26723985,  0.60119462],
       ..., 
       [ 0.07949523,  0.34050307, -0.06565425, ...,  0.52930321,
        -0.93189631,  0.99069479],
       [ 0.46378986, -0.53799953, -1.46890349, ..., -0.41189977,
        -1.12399145,  0.18397312],
       [-1.8530992 , -0.62059888,  0.88532707, ..., -1.08826959,
         0.20346044, -0.85422642]])

In [9]:
# Full-batch gradient descent on a two-layer network: linear -> ReLU -> linear.
# Every epoch runs a forward pass, measures the loss, back-propagates the
# gradient by hand, and updates W1 and W2 in place.
for epoch in range(num_epochs):

    # ---------- forward pass ----------
    hidden_pre_activation = np.dot(X, W1)                      # (n, hidden_dim)
    hidden_activation = np.maximum(hidden_pre_activation, 0)   # ReLU
    y_pred = np.dot(hidden_activation, W2)                     # (n, output_dim); linear output, no activation

    # ---------- loss: residual sum of squares ----------
    residual = y_pred - y
    loss = np.square(residual).sum()
    if epoch % 10 == 0:
        print(epoch, loss)

    # ---------- backward pass (chain rule, from the output back to W1) ----------
    # d loss / d y_pred for the squared error, (n, output_dim)
    gradient_y_pred = 2.0 * residual

    # through the output layer: gradient w.r.t. W2 and w.r.t. the hidden activation
    gradient_W2 = np.dot(hidden_activation.T, gradient_y_pred)   # (hidden_dim, output_dim)
    gradient_hidden_activation = np.dot(gradient_y_pred, W2.T)   # (n, hidden_dim)

    # through the ReLU: the gradient is zeroed wherever the pre-activation was negative
    gradient_hidden_pre_activation = np.where(
        hidden_pre_activation < 0, 0.0, gradient_hidden_activation
    )

    # through the hidden layer: gradient w.r.t. W1, (num_features, hidden_dim)
    gradient_W1 = np.dot(X.T, gradient_hidden_pre_activation)

    # ---------- gradient-descent step (in place) ----------
    W2 -= learning_rate * gradient_W2
    W1 -= learning_rate * gradient_W1

0 30048037.4452
10 896577.934518
20 183787.071936
30 61764.3442444
40 25298.1444104
50 11653.5051003
60 5780.83821191
70 3001.77813303
80 1607.61528996
90 879.354733926
100 488.604966362
110 274.881033505
120 156.178788661
130 89.4464644647
140 51.5773915823
150 29.9184870772
160 17.4474409194
170 10.2234272049
180 6.01711616324
190 3.55595032873
200 2.1096265434
210 1.25612406174
220 0.750535691915
230 0.449938832292
240 0.270597814219
250 0.163238898833
260 0.0987686465505
270 0.0599320081536
280 0.0364683442299
290 0.0222510825048
300 0.0136125996377
310 0.00834901087892
320 0.0051336462588
330 0.00316430305399
340 0.00195499806325
350 0.00121066155349
360 0.000751412708868
370 0.000467363286073
380 0.000291308255073
390 0.000181933118592
400 0.000113846961665
410 7.13723269271e-05
420 4.48238245967e-05
430 2.81993631479e-05
440 1.77691888223e-05
450 1.12147850687e-05
460 7.08854770301e-06
470 4.48702256168e-06
480 2.84417845298e-06
490 1.8051952739e-06
