In [1]:
import numpy as np

In [2]:
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

In [7]:
y

array([[ 1.27791750e+00, -7.94074710e-01,  1.10592127e+00,
        -1.23323443e+00,  2.24697263e-01, -1.35561567e+00,
        -7.01613120e-03,  2.38740121e+00,  2.58094413e+00,
        -9.36946176e-01],
       [-3.73907996e-01, -1.10542742e+00,  4.77159821e-01,
        -1.48535658e-01, -8.32909510e-01, -7.30705210e-02,
        -6.13231594e-01,  6.56372318e-01, -5.50527254e-02,
        -6.49896748e-01],
       [ 1.41774039e+00,  2.71833631e-02,  6.43649237e-01,
        -3.55868973e-01, -8.64806547e-01,  6.45908480e-01,
         9.62761192e-01, -1.81879307e+00,  8.13344545e-01,
        -1.07014726e+00],
       [ 2.39794232e-01, -8.27958996e-01,  4.40678400e-01,
        -8.01738416e-01, -2.80788719e-01,  1.19744239e+00,
        -1.15094739e-01,  1.72620010e+00,  2.04570986e+00,
        -4.98226067e-01],
       [-2.08420850e+00,  7.52534207e-01, -4.67624100e-01,
         1.36395222e+00,  1.37568716e+00, -1.78803680e-01,
         8.16607435e-01,  1.21144608e+00,  1.79594359e-01,
         9.

In [3]:
learning_rate = 1e-6

In [4]:
for t in range(500):
    # Forward pass: compute predicted y
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 28552147.742794223
1 22207910.88601175
2 20041268.28801959
3 18702004.97015564
4 16793481.474098325
5 13817879.790463982
6 10394975.683930969
7 7202984.822419647
8 4778788.656764293
9 3127939.98406528
10 2087998.1559043997
11 1447969.2371590778
12 1054064.5968934828
13 804330.2474916442
14 639306.9553982952
15 524716.5536507993
16 441247.53682029736
17 377633.3831378864
18 327271.4773545524
19 286258.50355947344
20 252097.23008945456
21 223186.07207104866
22 198458.6121363447
23 177140.72072790988
24 158620.73317563784
25 142451.3794178653
26 128244.88071422583
27 115732.9657279946
28 104670.96042629375
29 94873.63499387741
30 86164.7569210674
31 78393.78462847433
32 71468.08081736718
33 65286.9327814261
34 59735.3404370129
35 54736.331990703125
36 50245.26738653476
37 46183.95050210702
38 42505.15675483917
39 39166.790523411255
40 36131.71270596617
41 33368.17887531476
42 30851.353740238374
43 28553.803086002692
44 26452.458301828334
45 24528.226781863625
46 22763.808337142047
47 21

430 0.0005043885333637131
431 0.0004852450407921154
432 0.00046682877662498976
433 0.0004491135762155739
434 0.0004320776816998823
435 0.0004156912474796313
436 0.00039993166116054603
437 0.0003847705554287515
438 0.00037019031906754865
439 0.00035616414067432046
440 0.0003426715075819204
441 0.0003296972387402875
442 0.0003172221558416737
443 0.0003052178262505838
444 0.00029366569953491326
445 0.00028255660440429695
446 0.00027186896291112646
447 0.00026158901998585553
448 0.00025169825109948806
449 0.00024218498610859327
450 0.0002330309732146355
451 0.0002242252803955837
452 0.00021575722295590131
453 0.00020760826697749623
454 0.00019976797021874383
455 0.00019222616550819015
456 0.0001849712249212661
457 0.00017799120748612893
458 0.00017127612126896334
459 0.00016481844402121772
460 0.00015860585604520196
461 0.00015262506395431717
462 0.0001468714592332414
463 0.00014133723103479384
464 0.00013601077757318056
465 0.00013088579927645754
466 0.00012595604493376318
467 0.000121212

In [8]:
import torch


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

In [9]:
learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum().item()
    if t % 100 == 99:
        print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

99 423.5332946777344
199 6.648295879364014
299 0.15837790071964264
399 0.004585823975503445
499 0.0003263606340624392


In [31]:
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

In [32]:
learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors; these
    # are exactly the same operations we used to compute the forward pass using
    # Tensors, but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,)
    # loss.item() gets the scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    # An alternative way is to operate on weight.data and weight.grad.data.
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

99 653.2158203125
199 5.762205600738525
299 0.10091760754585266
399 0.002558561507612467
499 0.00021423966973088682
