# Implementing a deep neural network with NumPy

In [1]:
import numpy as np

The magic behind deep neural nets is backpropagation.
To show that there is no rocket science behind simple backprop, we are going to build a small neural network (one hidden layer with a ReLU activation) using NumPy alone, implementing backpropagation ourselves.



In [2]:
# define basic dimensions of our network
n = 64                 # number of samples (rows of X and y)
num_features = 1000    # input dimension
hidden_dim = 100       # width of the single hidden layer
output_dim = 10        # output dimension

# hyperparameters
learning_rate = 1e-6
num_epochs = 500

# reproducibility: seed NumPy's global RNG so that the random data and the
# random initial weights drawn further down are identical on every
# "Restart Kernel -> Run All"
random_seed = 0
np.random.seed(random_seed)

In [3]:
# create input data: inputs and targets are both standard-normal noise
X = np.random.standard_normal(size=(n, num_features))   # inputs,  64 * 1000
y = np.random.standard_normal(size=(n, output_dim))     # targets, 64 * 10
X.shape, y.shape

((64, 1000), (64, 10))

In [4]:
# initialize both weight matrices with standard-normal draws (no bias terms)
W1 = np.random.standard_normal(size=(num_features, hidden_dim))   # input -> hidden,  1000 * 100
W2 = np.random.standard_normal(size=(hidden_dim, output_dim))     # hidden -> output, 100 * 10

In [5]:
for epoch in range(num_epochs):

    # ---- Forward pass ----

    # hidden layer: linear map followed by a ReLU
    hidden_pre_activation = np.dot(X, W1)                        # 64 * 100
    hidden_activation = np.maximum(hidden_pre_activation, 0)     # 64 * 100
    # output layer: the prediction is purely linear - no activation applied
    y_pred = np.dot(hidden_activation, W2)                       # 64 * 10

    # ---- Loss: residual sum of squares, printed every 10th epoch ----
    residual = y_pred - y                                        # 64 * 10
    loss = np.square(residual).sum()
    if epoch % 10 == 0:
        print(epoch, loss)

    # ---- Backprop: apply the chain rule from the output back to W1 ----

    # gradient of the squared error w.r.t. the prediction, 64 * 10
    gradient_y_pred = 2.0 * residual

    # push that gradient through the output layer
    gradient_W2 = np.dot(hidden_activation.T, gradient_y_pred)   # w.r.t. W2, 100 * 10
    gradient_hidden_activation = np.dot(gradient_y_pred, W2.T)   # w.r.t. hidden activation, 64 * 100

    # push it through the ReLU: entries whose pre-activation was negative
    # receive no gradient, all other entries pass it on unchanged
    gradient_hidden_pre_activation = np.where(
        hidden_pre_activation < 0, 0.0, gradient_hidden_activation
    )                                                            # 64 * 100

    # finally the gradient w.r.t. W1, 1000 * 100
    gradient_W1 = np.dot(X.T, gradient_hidden_pre_activation)

    # ---- Gradient descent step (weights are updated in place) ----
    W1 -= learning_rate * gradient_W1
    W2 -= learning_rate * gradient_W2

0 36039354.2859
10 1107377.86887
20 223788.831893
30 74448.3927778
40 30588.5287447
50 14548.0852616
60 7640.99248799
70 4311.31890829
80 2564.0782734
90 1585.08298479
100 1008.53639513
110 655.691456916
120 433.35125501
130 290.085479884
140 196.249841286
150 133.904862291
160 91.982364619
170 63.5307245658
180 44.0838019396
190 30.712201033
200 21.4713768333
210 15.0590210191
220 10.6000954248
230 7.48021631279
240 5.29060377226
250 3.74961469033
260 2.66246697974
270 1.8939164217
280 1.34934213375
290 0.962734090346
300 0.68780768503
310 0.49199971179
320 0.352334194884
330 0.252581525497
340 0.181248488809
350 0.130179976371
360 0.093579397641
370 0.0673225793493
380 0.0484688960547
390 0.0349190419537
400 0.0251732575668
410 0.0181584794304
420 0.0131057618799
430 0.00946401928291
440 0.00683754518776
450 0.00494227087858
460 0.00357388643825
470 0.00258542835387
480 0.00187106394385
490 0.00135457027746
