In [4]:
# numpy tensors
import numpy as np

# Network dimensions: x is (N, D_in), the hidden layer has H units and
# y is (N, D_out).
N, D_in, H, D_out = 64, 1000, 100, 10

# Seed NumPy's global RNG so that a fresh "Restart & Run All" reproduces the
# same data, initial weights and loss curve.
SEED = 0
np.random.seed(SEED)

# Random inputs and random regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights of the two bias-free linear layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):

    # forward pass: compute predicted y = relu(x @ w1) @ w2
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # loss: sum of squared errors over every element of y_pred
    loss = np.square(y_pred - y).sum()
    print('Loss at round {}: {}'.format(t, loss))

    # backprop: compute gradients of the loss with respect to w1 and w2
    grad_y_pred = 2.0 * (y_pred - y)  # d(loss)/d(y_pred)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0  # ReLU passes no gradient where its input was negative
    grad_w1 = x.T.dot(grad_h)

    # update weights: one in-place step of plain gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

Loss at round 0: 29522651.409277495
Loss at round 1: 25449746.431576107
Loss at round 2: 26500131.284128733
Loss at round 3: 28522996.953142162
Loss at round 4: 27644948.58401368
Loss at round 5: 22872640.770590372
Loss at round 6: 15437068.846479135
Loss at round 7: 9023717.412381198
Loss at round 8: 4838505.4344838895
Loss at round 9: 2646729.004670292
Loss at round 10: 1567720.9114123741
Loss at round 11: 1038434.6509161
Loss at round 12: 758953.9372948292
Loss at round 13: 595335.7696933404
Loss at round 14: 488018.6993984984
Loss at round 15: 410570.4624497715
Loss at round 16: 350661.94432777877
Loss at round 17: 302369.61716356984
Loss at round 18: 262426.96215828496
Loss at round 19: 228884.3386646831
Loss at round 20: 200441.4548639501
Loss at round 21: 176130.24708663556
Loss at round 22: 155300.70737091827
Loss at round 23: 137316.3621024067
Loss at round 24: 121709.12454715275
Loss at round 25: 108129.22322359771
Loss at round 26: 96283.27105410467
Loss at round 27: 85920.4

In [7]:
# pytorch tensors
import torch

dtype = torch.float
device = torch.device('cpu')

# Network dimensions: x is (N, D_in), the hidden layer has H units and
# y is (N, D_out).
N, D_in, H, D_out = 64, 1000, 100, 10

# Seed PyTorch's global RNG so that a fresh "Restart & Run All" reproduces the
# same data, initial weights and loss curve.
SEED = 0
torch.manual_seed(SEED)

# Random inputs and random regression targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialised weights of the two bias-free linear layers.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):

    # forward pass: compute predicted y = relu(x @ w1) @ w2
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # loss: sum of squared errors, converted to a plain Python float
    loss = (y_pred - y).pow(2).sum().item()
    print('Loss at round {}: {}'.format(t, loss))

    # backprop: compute gradients of the loss with respect to w1 and w2
    grad_y_pred = 2.0 * (y_pred - y)  # d(loss)/d(y_pred)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0  # ReLU passes no gradient where its input was negative
    grad_w1 = x.t().mm(grad_h)

    # update weights: one in-place step of plain gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

Loss at round 0: 38416064.0
Loss at round 1: 43674996.0
Loss at round 2: 55785888.0
Loss at round 3: 60257204.0
Loss at round 4: 45497556.0
Loss at round 5: 21367376.0
Loss at round 6: 7489886.5
Loss at round 7: 2967260.0
Loss at round 8: 1725957.125
Loss at round 9: 1281231.25
Loss at round 10: 1040756.1875
Loss at round 11: 871574.6875
Loss at round 12: 739661.9375
Loss at round 13: 633076.8125
Loss at round 14: 545454.6875
Loss at round 15: 472562.21875
Loss at round 16: 411489.15625
Loss at round 17: 359931.625
Loss at round 18: 316267.5
Loss at round 19: 278990.78125
Loss at round 20: 247084.96875
Loss at round 21: 219626.4375
Loss at round 22: 195827.625
Loss at round 23: 175109.6875
Loss at round 24: 157013.515625
Loss at round 25: 141158.671875
Loss at round 26: 127210.046875
Loss at round 27: 114887.5546875
Loss at round 28: 103982.1640625
Loss at round 29: 94304.171875
Loss at round 30: 85680.4296875
Loss at round 31: 77987.0625
Loss at round 32: 71105.7421875
Loss at round 3

In [9]:
# autograd
dtype = torch.float
device = torch.device('cpu')

# Network dimensions: x is (N, D_in), the hidden layer has H units and
# y is (N, D_out).
N, D_in, H, D_out = 64, 1000, 100, 10

# Seed PyTorch's global RNG so that a fresh "Restart & Run All" reproduces the
# same data, initial weights and loss curve.
SEED = 0
torch.manual_seed(SEED)

# Random inputs and random regression targets (no gradients needed for these).
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights are created with requires_grad=True so that loss.backward()
# populates w1.grad and w2.grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):

    # forward pass: compute predicted y = relu(x @ w1) @ w2
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # loss: sum of squared errors, kept as a tensor so it can be differentiated
    loss = (y_pred - y).pow(2).sum()
    print('Loss at round {}: {}'.format(t, loss.item()))

    # backprop: autograd computes the gradients of the loss with respect to
    # w1 and w2 and accumulates them into w1.grad / w2.grad
    loss.backward()

    # update weights: done under no_grad so the in-place update itself is not
    # recorded by autograd
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # backward() accumulates into .grad, so reset it for the next round
        w1.grad.zero_()
        w2.grad.zero_()

Loss at round 0: 36037248.0
Loss at round 1: 33340364.0
Loss at round 2: 32583728.0
Loss at round 3: 28454918.0
Loss at round 4: 20927150.0
Loss at round 5: 12754392.0
Loss at round 6: 7054277.5
Loss at round 7: 3873779.0
Loss at round 8: 2316627.5
Loss at round 9: 1547989.75
Loss at round 10: 1141359.25
Loss at round 11: 899158.5
Loss at round 12: 737514.75
Loss at round 13: 619393.9375
Loss at round 14: 527929.375
Loss at round 15: 454559.875
Loss at round 16: 394246.25
Loss at round 17: 344067.375
Loss at round 18: 301823.71875
Loss at round 19: 265885.8125
Loss at round 20: 235145.3125
Loss at round 21: 208669.0
Loss at round 22: 185768.328125
Loss at round 23: 165836.625
Loss at round 24: 148445.015625
Loss at round 25: 133192.40625
Loss at round 26: 119779.640625
Loss at round 27: 107942.515625
Loss at round 28: 97460.03125
Loss at round 29: 88166.875
Loss at round 30: 79891.3203125
Loss at round 31: 72504.765625
Loss at round 32: 65903.3125
Loss at round 33: 59981.51953125
Loss 

In [11]:
# define new autograd functions
class MyReLU(torch.autograd.Function):
    """ReLU implemented as a custom autograd Function.

    Use it through ``MyReLU.apply(tensor)``.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` with every negative entry replaced by zero.

        The input is stashed on ``ctx`` because ``backward`` needs it to know
        where the gradient has to be blocked.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient of the loss with respect to the forward input.

        The incoming gradient is passed through unchanged, except that it is
        set to zero wherever the forward input was negative.
        """
        (saved_input,) = ctx.saved_tensors
        # masked_fill builds a new tensor, so grad_output itself is not modified.
        return grad_output.masked_fill(saved_input < 0, 0)

dtype = torch.float
device = torch.device('cpu')

# Network dimensions: x is (N, D_in), the hidden layer has H units and
# y is (N, D_out).
N, D_in, H, D_out = 64, 1000, 100, 10

# Seed PyTorch's global RNG so that a fresh "Restart & Run All" reproduces the
# same data, initial weights and loss curve.
SEED = 0
torch.manual_seed(SEED)

# Random inputs and random regression targets (no gradients needed for these).
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights are created with requires_grad=True so that loss.backward()
# populates w1.grad and w2.grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# A custom autograd Function is invoked through its .apply attribute. The
# alias does not change between iterations, so it is bound once up front.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):

    # forward pass: compute predicted y = MyReLU(x @ w1) @ w2
    y_pred = relu(x.mm(w1)).mm(w2)

    # loss: sum of squared errors, kept as a tensor so it can be differentiated
    loss = (y_pred - y).pow(2).sum()
    print('Loss at round {}: {}'.format(t, loss.item()))

    # backprop: autograd computes the gradients of the loss with respect to
    # w1 and w2, calling MyReLU.backward for the activation
    loss.backward()

    # update weights: done under no_grad so the in-place update itself is not
    # recorded by autograd
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # backward() accumulates into .grad, so reset it for the next round
        w1.grad.zero_()
        w2.grad.zero_()

Loss at round 0: 30528440.0
Loss at round 1: 27093026.0
Loss at round 2: 24774936.0
Loss at round 3: 21011356.0
Loss at round 4: 15933730.0
Loss at round 5: 10799482.0
Loss at round 6: 6849454.5
Loss at round 7: 4267007.5
Loss at round 8: 2747044.25
Loss at round 9: 1874518.5
Loss at round 10: 1366070.5
Loss at round 11: 1053725.875
Loss at round 12: 848473.3125
Loss at round 13: 703854.0
Loss at round 14: 595875.5625
Loss at round 15: 511338.96875
Loss at round 16: 442843.65625
Loss at round 17: 386045.8125
Loss at round 18: 338248.3125
Loss at round 19: 297629.0
Loss at round 20: 262803.78125
Loss at round 21: 232739.265625
Loss at round 22: 206665.828125
Loss at round 23: 183977.8125
Loss at round 24: 164154.203125
Loss at round 25: 146781.53125
Loss at round 26: 131505.296875
Loss at round 27: 118057.1015625
Loss at round 28: 106179.2578125
Loss at round 29: 95652.25
Loss at round 30: 86298.5546875
Loss at round 31: 77976.2734375
Loss at round 32: 70553.3125
Loss at round 33: 63921

In [13]:
# tensorflow static graph
import tensorflow as tf

# NOTE: this cell is written against the TensorFlow 1.x graph/session API
# (tf.placeholder, tf.Session, tf.global_variables_initializer, ...).

# Network dimensions: inputs have D_in features, the hidden layer has H units
# and the targets have D_out features; N rows are fed per step.
N, D_in, H, D_out = 64, 1000, 100, 10

# Seed both TensorFlow (graph-level) and NumPy so that a fresh
# "Restart & Run All" reproduces the same weights, data and loss curve.
SEED = 0
tf.set_random_seed(SEED)

# --- graph definition: nothing is computed until session.run() ---

# Placeholders are filled with concrete data through feed_dict at run time.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Variables persist across session.run() calls, so the weights live in the graph.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# forward pass: y_pred = relu(x @ w1) @ w2
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# loss: sum of squared errors
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# gradients of the loss with respect to w1 and w2
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# One gradient-descent step, expressed as assign ops on the variables.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Group the two assignments into a single op with no output. Running it applies
# both updates without fetching the updated weight tensors back into Python on
# every step, which the loop below never used anyway.
updates = tf.group(new_w1, new_w2)

# --- graph execution ---
with tf.Session() as session:
    session.run(tf.global_variables_initializer())

    np.random.seed(SEED)
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)

    for step in range(500):
        # Fetch the loss and trigger the weight update in the same run call.
        loss_value, _ = session.run([loss, updates], feed_dict={x: x_value, y: y_value})
        print(loss_value)

35343616.0
35549212.0
38577800.0
36940370.0
31306810.0
31954522.0
35456132.0
31064544.0
20708544.0
10526854.0
4671208.0
2108873.0
1118105.9
723874.6
545777.25
447437.75
381676.38
331723.94
291087.62
256949.95
227868.12
202803.6
181043.44
162066.22
145437.16
130798.25
117862.81
106411.45
96238.16
87204.81
79147.5
71939.03
65476.996
59672.07
54447.305
49732.51
45478.97
41637.227
38157.53
35000.703
32133.055
29524.96
27149.648
24983.453
23005.977
21198.84
19546.32
18034.156
16649.08
15378.557
14212.4375
13141.705
12157.591
11253.52
10421.051
9654.412
8948.057
8296.977
7696.837
7143.2163
6631.67
6159.069
5722.2065
5318.153
4944.404
4598.679
4279.1426
3983.0142
3708.6301
3454.196
3218.2007
2999.147
2795.8018
2606.9531
2431.4526
2268.349
2116.7256
1975.7268
1844.5205
1722.5098
1608.9346
1503.1638
1404.6824
1312.8885
1227.3662
1147.6624
1073.3461
1004.0191
939.3587
879.02673
822.8502
770.506
721.6315
675.98083
633.33386
593.4638
556.1984
521.3557
488.76886
458.28793
429.78333
403.10803
378.14