In [None]:
import numpy as np

# Two-layer fully connected network (linear -> ReLU -> linear) fitted to random
# data with plain gradient descent. Forward and backward passes are written out
# by hand with NumPy.

# Seed NumPy's global generator so that a fresh "Restart & Run All" reproduces
# the same data, initial weights and loss curve.
SEED = 0
np.random.seed(SEED)

# N: batch size, D_in: input width, H: hidden width, D_out: output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights of the two linear layers (no biases).
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: hidden pre-activation, ReLU, prediction.
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Sum of squared errors over every element of the batch.
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backward pass: chain rule applied layer by layer, from the loss back to
    # the weights.
    grad_y_pred = 2 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0  # ReLU passes no gradient where its input was negative
    grad_w1 = x.T.dot(grad_h)

    # Gradient-descent step, applied to the weight arrays in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

In [6]:
import torch

# Same two-layer network as a NumPy implementation would build, but on PyTorch
# tensors. Gradients are still derived and applied by hand (no autograd).

# Seed PyTorch's default generator so that a fresh "Restart & Run All"
# reproduces the same data, initial weights and loss curve.
SEED = 0
torch.manual_seed(SEED)

dtype = torch.float
device = torch.device('cpu')

# N: batch size, D_in: input width, H: hidden width, D_out: output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialised weights of the two linear layers (no biases).
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: hidden pre-activation, ReLU (clamp at zero), prediction.
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Sum of squared errors, converted to a Python float for printing.
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backward pass: chain rule applied layer by layer, from the loss back to
    # the weights.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0  # ReLU passes no gradient where its input was negative
    grad_w1 = x.t().mm(grad_h)

    # Gradient-descent step, applied to the weight tensors in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 29027772.0
1 24014650.0
2 23267664.0
3 23062336.0
4 21466510.0
5 17602062.0
6 12742699.0
7 8234862.0
8 5047880.0
9 3071486.0
10 1952458.875
11 1322782.25
12 961434.1875
13 741042.0
14 597131.25
15 496043.625
16 420558.75
17 361474.4375
18 313621.65625
19 273934.5
20 240529.046875
21 212084.21875
22 187670.765625
23 166590.9375
24 148306.03125
25 132375.53125
26 118453.8125
27 106253.671875
28 95518.5234375
29 86046.765625
30 77665.4140625
31 70233.4609375
32 63626.7265625
33 57740.0546875
34 52477.765625
35 47770.00390625
36 43560.1328125
37 39781.37890625
38 36381.99609375
39 33325.6640625
40 30563.89453125
41 28065.21875
42 25801.396484375
43 23747.583984375
44 21881.578125
45 20184.353515625
46 18637.98828125
47 17227.013671875
48 15938.564453125
49 14760.087890625
50 13681.7744140625
51 12693.5966796875
52 11787.5380859375
53 10954.7109375
54 10189.197265625
55 9484.4892578125
56 8835.197265625
57 8236.609375
58 7683.908203125
59 7173.3193359375
60 6701.13232421875
61 6264.176757

447 0.0009950874373316765
448 0.0009688494028523564
449 0.0009457521955482662
450 0.0009224653476849198
451 0.0008989466587081552
452 0.0008762314100749791
453 0.0008539531845599413
454 0.0008327242685481906
455 0.0008123394218273461
456 0.0007924484089016914
457 0.0007720642606727779
458 0.000755122397094965
459 0.0007361455936916173
460 0.0007188338204286993
461 0.0007030011038295925
462 0.0006861077272333205
463 0.0006700821686536074
464 0.0006536655710078776
465 0.0006393103976733983
466 0.0006250927108339965
467 0.000610060291364789
468 0.0005969475605525076
469 0.0005820296937599778
470 0.0005692942650057375
471 0.0005572065128944814
472 0.0005449694581329823
473 0.000532620819285512
474 0.0005211801617406309
475 0.0005100520211271942
476 0.0004989581648260355
477 0.0004886059323325753
478 0.00047762319445610046
479 0.0004672808281611651
480 0.00045915861846879125
481 0.00044859902118332684
482 0.00043951021507382393
483 0.0004302644811104983
484 0.0004208571626804769
485 0.00041

In [10]:
import torch

# Two-layer network (linear -> ReLU -> linear) where autograd computes the
# gradients; only the forward pass and the update rule are written by hand.

# Seed PyTorch's default generator so that a fresh "Restart & Run All"
# reproduces the same data, initial weights and loss curve.
SEED = 0
torch.manual_seed(SEED)

dtype = torch.float
device = torch.device('cpu')

# N: batch size, D_in: input width, H: hidden width, D_out: output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets (no gradients needed for these).
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# requires_grad=True asks autograd to accumulate d(loss)/d(w) into w.grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6

for t in range(500):
    # Forward pass in one expression; intermediates are not needed by name
    # because the backward pass is not hand-written here.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Keep the loss as a tensor so that backward() can be called on it.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Populates w1.grad and w2.grad.
    loss.backward()

    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # backward() accumulates into .grad, so reset before the next step.
        w1.grad.zero_()
        w2.grad.zero_()

0 26143940.0
1 23506112.0
2 24184854.0
3 24797768.0
4 23138714.0
5 18483198.0
6 12655846.0
7 7654493.5
8 4398606.5
9 2560730.75
10 1594669.375
11 1083550.5
12 800030.75
13 629162.75
14 516617.90625
15 435977.03125
16 374383.6875
17 325181.8125
18 284707.0625
19 250743.171875
20 221895.265625
21 197200.09375
22 175856.609375
23 157403.453125
24 141294.609375
25 127174.5390625
26 114746.03125
27 103768.3046875
28 94036.015625
29 85392.03125
30 77684.4453125
31 70802.28125
32 64642.7734375
33 59109.50390625
34 54145.484375
35 49666.1875
36 45611.94921875
37 41936.7265625
38 38603.83984375
39 35574.12890625
40 32816.296875
41 30303.259765625
42 28005.984375
43 25906.3515625
44 23985.80078125
45 22224.314453125
46 20608.78515625
47 19128.0703125
48 17765.076171875
49 16511.16015625
50 15355.1416015625
51 14288.8037109375
52 13304.873046875
53 12396.015625
54 11555.5478515625
55 10778.1787109375
56 10058.0400390625
57 9390.833984375
58 8772.7373046875
59 8199.060546875
60 7666.6708984375
61 

470 0.00010722072329372168
471 0.00010550885781412944
472 0.00010314468818251044
473 0.00010133162868442014
474 9.966791549231857e-05
475 9.791506454348564e-05
476 9.584124927641824e-05
477 9.429350757272914e-05
478 9.30301466723904e-05
479 9.093060361919925e-05
480 8.957980026025325e-05
481 8.794652967480943e-05
482 8.648368384456262e-05
483 8.497774979332462e-05
484 8.338389307027683e-05
485 8.209746738430113e-05
486 8.079464896582067e-05
487 7.905002712504938e-05
488 7.788140646880493e-05
489 7.668704347452149e-05
490 7.535025360994041e-05
491 7.417136657750234e-05
492 7.313955575227737e-05
493 7.188008748926222e-05
494 7.048888801364228e-05
495 6.899517757119611e-05
496 6.83342877891846e-05
497 6.723441038047895e-05
498 6.617474718950689e-05
499 6.512318941531703e-05


In [4]:
import numpy as np
import tensorflow as tf

In [6]:
# Two-layer network (linear -> ReLU -> linear) expressed as a TensorFlow
# static graph: the graph is built first, then executed repeatedly inside a
# session.
# NOTE(review): no random seed is set in this cell, so the data and initial
# weights presumably differ between runs -- confirm whether that is intended.
N, D_in, H, D_out = 64, 1000, 100, 10

# Placeholders for the inputs and targets; values are supplied through
# feed_dict when the graph is run.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Weights are Variables so that they keep their values across sess.run calls.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass. The element-wise maximum with zero acts as the ReLU.
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Sum of squared errors.
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Symbolic gradients of the loss with respect to both weight matrices.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Gradient-descent update expressed as assign ops; the weights only change
# when these ops are evaluated in a session.
alpha = 1e-6
new_w1 = w1.assign(w1 - alpha*grad_w1)
new_w2 = w2.assign(w2 - alpha*grad_w2)

with tf.Session() as sess:
    # Run the initializers for w1 and w2 once before training.
    sess.run(tf.global_variables_initializer())
    
    # Random NumPy data bound to the placeholders on every step.
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    
    for t in range(500):
        # Execute the graph many times. Each time it executes we want to bind
        # x_value to x and y_value to y, specified with the feed_dict argument.
        # Each time we execute the graph we want to compute the values for loss,
        # new_w1, and new_w2; the values of these Tensors are returned as numpy
        # arrays.
        loss_value, _, _ = sess.run([loss, new_w1, new_w2], feed_dict={x: x_value, y: y_value})
        print (t, loss_value)

0 37132708.0
1 32257888.0
2 29351472.0
3 24186892.0
4 17209100.0
5 10696995.0
6 6200935.5
7 3612740.0
8 2249776.0
9 1530157.2
10 1129543.5
11 885865.9
12 722838.25
13 604691.4
14 513835.6
15 441412.06
16 382132.9
17 332952.78
18 291699.66
19 256761.11
20 226954.56
21 201311.1
22 179185.5
23 160009.12
24 143297.95
25 128684.836
26 115854.09
27 104553.82
28 94568.77
29 85718.81
30 77846.734
31 70836.086
32 64569.81
33 58967.664
34 53953.6
35 49439.66
36 45369.6
37 41694.875
38 38368.867
39 35359.82
40 32624.457
41 30134.934
42 27863.621
43 25789.031
44 23891.932
45 22154.242
46 20560.547
47 19097.266
48 17752.322
49 16515.104
50 15375.193
51 14324.193
52 13355.088
53 12459.881
54 11631.971
55 10865.858
56 10156.42
57 9498.512
58 8888.463
59 8321.896
60 7795.7476
61 7306.5586
62 6851.2627
63 6427.323
64 6032.2744
65 5663.965
66 5320.552
67 4999.874
68 4700.5293
69 4420.669
70 4158.961
71 3914.1865
72 3685.0737
73 3470.6619
74 3269.9634
75 3081.8657
76 2905.465
77 2739.9846
78 2584.8223
79

In [1]:
import torch

In [2]:
# Two-layer network built from torch.nn modules; the gradient-descent update is
# still applied by hand to every parameter.

# Seed PyTorch's default generator so that a fresh "Restart & Run All"
# reproduces the same data, layer initialisation and loss curve.
SEED = 0
torch.manual_seed(SEED)

# N: batch size, D_in: input width, H: hidden width, D_out: output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# linear -> ReLU -> linear; each Linear layer owns its weight and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out)
)

# reduction='sum' gives the sum of squared errors rather than the mean.
loss_fn = torch.nn.MSELoss(reduction='sum')
alpha = 1e-4  # learning rate

for t in range(500):
    # Forward pass.
    y_pred = model(x)

    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear gradients left over from the previous step, then backpropagate.
    model.zero_grad()
    loss.backward()

    # Manual gradient-descent step; must not be recorded by autograd.
    with torch.no_grad():
        for param in model.parameters():
            param -= alpha * param.grad

0 708.4017944335938
1 652.093505859375
2 603.8182373046875
3 561.7349853515625
4 524.6866455078125
5 491.6518859863281
6 461.9724426269531
7 434.8180847167969
8 409.8164367675781
9 386.77789306640625
10 365.35272216796875
11 345.353759765625
12 326.4742431640625
13 308.5834655761719
14 291.5970153808594
15 275.41607666015625
16 259.9713439941406
17 245.21783447265625
18 231.16578674316406
19 217.84181213378906
20 205.15283203125
21 193.00257873535156
22 181.4163055419922
23 170.44154357910156
24 160.0634765625
25 150.26776123046875
26 141.0020751953125
27 132.25062561035156
28 124.0119400024414
29 116.2523422241211
30 108.94319915771484
31 102.0633316040039
32 95.59586334228516
33 89.53822326660156
34 83.86141967773438
35 78.54115295410156
36 73.5600357055664
37 68.91087341308594
38 64.5558853149414
39 60.48472213745117
40 56.69178009033203
41 53.15340805053711
42 49.85300827026367
43 46.77638244628906
44 43.909175872802734
45 41.23411560058594
46 38.74028015136719
47 36.41707229614258

389 9.505743946647272e-05
390 9.205554670188576e-05
391 8.914907812140882e-05
392 8.633274410385638e-05
393 8.360704669030383e-05
394 8.097414684016258e-05
395 7.8417724580504e-05
396 7.594475755468011e-05
397 7.35479625291191e-05
398 7.123207615222782e-05
399 6.898315768921748e-05
400 6.681447848677635e-05
401 6.470626249210909e-05
402 6.267287244554609e-05
403 6.0697020671796054e-05
404 5.878622687305324e-05
405 5.693937055184506e-05
406 5.514619988389313e-05
407 5.341138967196457e-05
408 5.173284080228768e-05
409 5.0108519644709304e-05
410 4.853274003835395e-05
411 4.7005647502373904e-05
412 4.5531596697401255e-05
413 4.410204928717576e-05
414 4.2717983887996525e-05
415 4.137539872317575e-05
416 4.0078357415040955e-05
417 3.881987868226133e-05
418 3.760275649256073e-05
419 3.6421624827198684e-05
420 3.527947410475463e-05
421 3.417264451854862e-05
422 3.310241299914196e-05
423 3.206653491361067e-05
424 3.1059560569701716e-05
425 3.0088202038314193e-05
426 2.9146349334041588e-05
427 2

In [4]:
# Two-layer network built from torch.nn modules and trained with the Adam
# optimizer from torch.optim instead of a hand-written update.

# Seed PyTorch's default generator so that a fresh "Restart & Run All"
# reproduces the same data, layer initialisation and loss curve.
SEED = 0
torch.manual_seed(SEED)

# N: batch size, D_in: input width, H: hidden width, D_out: output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# linear -> ReLU -> linear; each Linear layer owns its weight and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out)
)

# reduction='sum' gives the sum of squared errors rather than the mean.
loss_fn = torch.nn.MSELoss(reduction='sum')
alpha = 1e-4  # learning rate handed to the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=alpha)

for t in range(500):
    # Forward pass.
    y_pred = model(x)

    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear old gradients, backpropagate, then let the optimizer update every
    # parameter it was given.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 687.9999389648438
1 670.8938598632812
2 654.281005859375
3 638.1936645507812
4 622.59375
5 607.4892578125
6 592.8231811523438
7 578.5949096679688
8 564.7568969726562
9 551.3889770507812
10 538.368896484375
11 525.6524658203125
12 513.2454223632812
13 501.1505432128906
14 489.3999328613281
15 477.96978759765625
16 466.9394836425781
17 456.18853759765625
18 445.655517578125
19 435.3912658691406
20 425.462158203125
21 415.8222351074219
22 406.4775695800781
23 397.4743347167969
24 388.6735534667969
25 380.05902099609375
26 371.62249755859375
27 363.4452209472656
28 355.47454833984375
29 347.6846008300781
30 340.03155517578125
31 332.5684509277344
32 325.2412109375
33 318.0621337890625
34 311.01922607421875
35 304.11865234375
36 297.3783264160156
37 290.7820129394531
38 284.3012390136719
39 277.92431640625
40 271.6582946777344
41 265.5035705566406
42 259.43487548828125
43 253.4685821533203
44 247.61041259765625
45 241.8771514892578
46 236.26406860351562
47 230.74412536621094
48 225.335372

368 0.0001978159707505256
369 0.00018695234030019492
370 0.0001766597997630015
371 0.0001669350895099342
372 0.00015771454491186887
373 0.00014900133828632534
374 0.0001407488016411662
375 0.00013294034579303116
376 0.00012556159344967455
377 0.00011857550998684019
378 0.00011197029380127788
379 0.00010571730672381818
380 9.981406037695706e-05
381 9.421958384336904e-05
382 8.893814811017364e-05
383 8.394036558456719e-05
384 7.921798533061519e-05
385 7.47554367990233e-05
386 7.053207809804007e-05
387 6.654700700892136e-05
388 6.277745706029236e-05
389 5.9218542446615174e-05
390 5.585304825217463e-05
391 5.2673476602649316e-05
392 4.967135100741871e-05
393 4.6834109525661916e-05
394 4.415454532136209e-05
395 4.1627346945460886e-05
396 3.923978147213347e-05
397 3.698349974001758e-05
398 3.485407432890497e-05
399 3.284310514573008e-05
400 3.094557541771792e-05
401 2.915340337494854e-05
402 2.7465644961921498e-05
403 2.5867504518828355e-05
404 2.4364386263187043e-05
405 2.294664409419056e-0