### Numpy Version

In [1]:
import numpy as np

In [2]:
# N = sample size
N = 64

# D_in = number of features
D_in = 1000

# H = Number of hidden layers
H = 100

# D_out = Number of output layers
D_out = 10

In [3]:
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

In [4]:
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

In [5]:
learning_rate = 1e-6

In [6]:
for t in range(500):
    # Forward pass
    z = x.dot(w1)
    z_relu = np.maximum(z, 0)
    y_pred = z_relu.dot(w2)
    
    # Compute the loss
    loss = np.square(y_pred - y).sum()
    print(t, loss)
    
    # Backprop
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = z_relu.T.dot(grad_y_pred)
    grad_z_relu = grad_y_pred.dot(w2.T)
    grad_z = grad_z_relu.copy()
    grad_z[z < 0] = 0
    grad_w1 = x.T.dot(grad_z)
    
    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 34687350.42475495
1 35725145.74962754
2 40253405.9431926
3 39918844.497269765
4 30806125.637332026
5 17520718.69540666
6 8047445.132882288
7 3574962.1051908717
8 1842771.9679809674
9 1161656.1338414147
10 849685.974295649
11 673126.0868919721
12 554185.1008219891
13 465208.45729462826
14 394972.81560685753
15 337972.8671315616
16 291096.8796951584
17 252148.81061594462
18 219640.95215774735
19 192215.50810486532
20 168899.16054512426
21 148954.43838092955
22 131804.66054083395
23 117001.68040527811
24 104152.14083988548
25 92964.07249109633
26 83185.61243893398
27 74608.36495442604
28 67057.58095261239
29 60391.94171370997
30 54493.483417182404
31 49262.747461619554
32 44606.94461933996
33 40454.368653705154
34 36740.82780257509
35 33414.17233283577
36 30427.28457306868
37 27742.045330443427
38 25322.2686602963
39 23139.718729703312
40 21167.543809297702
41 19383.053226705168
42 17766.710482669198
43 16299.827745116057
44 14967.040538942307
45 13755.846063701412
46 12652.419115036897

368 0.00010865761399379165
369 0.00010315227134619033
370 9.792686405326153e-05
371 9.296717827344003e-05
372 8.826010455105771e-05
373 8.379273812660804e-05
374 7.955264512876114e-05
375 7.552692480433402e-05
376 7.170589607130473e-05
377 6.807888655573446e-05
378 6.463598299277823e-05
379 6.136780058969893e-05
380 5.82657094088322e-05
381 5.5321092948251954e-05
382 5.2525685739871716e-05
383 4.9871809336743996e-05
384 4.7352527297681596e-05
385 4.49611751263341e-05
386 4.2691287114879304e-05
387 4.0535962027032754e-05
388 3.848964331982354e-05
389 3.654711575213333e-05
390 3.470317335965169e-05
391 3.2952437854382463e-05
392 3.129082041818147e-05
393 2.9712850415042066e-05
394 2.8214681920224215e-05
395 2.6792322616626756e-05
396 2.54419773539872e-05
397 2.4159916426492493e-05
398 2.2942660597462906e-05
399 2.178689933093445e-05
400 2.068954165877626e-05
401 1.964757737327933e-05
402 1.8658309677068036e-05
403 1.7719191314201095e-05
404 1.682740205119272e-05
405 1.5980574482611113e-0

### PyTorch Version

In [9]:
import torch

In [10]:
dtype = torch.float
device = torch.device('cpu')

In [11]:
# N = sample size
N = 64

# D_in = number of features
D_in = 1000

# H = Number of hidden layers
H = 100

# D_out = Number of output layers
D_out = 10

In [12]:
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

In [13]:
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

In [14]:
learning_rate = 1e-6

In [17]:
for t in range(500):
    # Forward pass
    z = x.mm(w1)
    z_relu = z.clamp(min=0)
    y_pred = z_relu.mm(w2)
    
    # Compute the loss
    loss = (y_pred - y).pow(2).sum().item()
    if t % 100 == 99:
        print(t, loss)
    
    # Backprop
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = z_relu.t().mm(grad_y_pred)
    grad_z_relu = grad_y_pred.mm(w2.T)
    grad_z = grad_z_relu.clone()
    grad_z[z < 0] = 0
    grad_w1 = x.t().mm(grad_z)
    
    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

99 2.8561457838804927e-06
199 2.3970642359927297e-06
299 2.078137413263903e-06
399 1.7613215277378913e-06
499 1.5936099089230993e-06
