In [1]:
%matplotlib inline


PyTorch: optim
--------------

A fully-connected ReLU network with one hidden layer, trained to predict y from x
by minimizing squared Euclidean distance.

This implementation uses the nn package from PyTorch to build the network.

Rather than manually updating the weights of the model as we have been doing,
we use the optim package to define an Optimizer that will update the weights
for us. The optim package defines many optimization algorithms that are commonly
used for deep learning, including SGD+momentum, RMSProp, Adam, etc.



In [2]:
import torch

In [4]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [5]:
x = torch.rand(N, D_in)
y = torch.rand(N, D_out)

In [6]:
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

In [10]:
loss_fn = torch.nn. MSELoss(size_average = False)
learning_rate = 1e-4

In [11]:
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [13]:
for t in range(500):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t , loss)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 tensor(194.7169)
1 tensor(174.6191)
2 tensor(156.3337)
3 tensor(139.6699)
4 tensor(124.6605)
5 tensor(111.3596)
6 tensor(99.5801)
7 tensor(89.2902)
8 tensor(80.5968)
9 tensor(73.2933)
10 tensor(67.4785)
11 tensor(63.0612)
12 tensor(60.0138)
13 tensor(58.0896)
14 tensor(57.1175)
15 tensor(56.7823)
16 tensor(56.7613)
17 tensor(56.8038)
18 tensor(56.7832)
19 tensor(56.5842)
20 tensor(56.1849)
21 tensor(55.6042)
22 tensor(54.8863)
23 tensor(54.1047)
24 tensor(53.3245)
25 tensor(52.5706)
26 tensor(51.8441)
27 tensor(51.2038)
28 tensor(50.6543)
29 tensor(50.1780)
30 tensor(49.7564)
31 tensor(49.3920)
32 tensor(49.0472)
33 tensor(48.7078)
34 tensor(48.3497)
35 tensor(47.9708)
36 tensor(47.5855)
37 tensor(47.1985)
38 tensor(46.8320)
39 tensor(46.4827)
40 tensor(46.1389)
41 tensor(45.8065)
42 tensor(45.4857)
43 tensor(45.1581)
44 tensor(44.8167)
45 tensor(44.4698)
46 tensor(44.1190)
47 tensor(43.7794)
48 tensor(43.4383)
49 tensor(43.1100)
50 tensor(42.7899)
51 tensor(42.4770)
52 tensor(42.155

429 tensor(0.1786)
430 tensor(0.1752)
431 tensor(0.1718)
432 tensor(0.1685)
433 tensor(0.1652)
434 tensor(0.1620)
435 tensor(0.1588)
436 tensor(0.1557)
437 tensor(0.1527)
438 tensor(0.1497)
439 tensor(0.1468)
440 tensor(0.1439)
441 tensor(0.1411)
442 tensor(0.1384)
443 tensor(0.1357)
444 tensor(0.1330)
445 tensor(0.1304)
446 tensor(0.1279)
447 tensor(0.1254)
448 tensor(0.1229)
449 tensor(0.1205)
450 tensor(0.1181)
451 tensor(0.1158)
452 tensor(0.1136)
453 tensor(0.1113)
454 tensor(0.1091)
455 tensor(0.1070)
456 tensor(0.1049)
457 tensor(0.1028)
458 tensor(0.1008)
459 tensor(1.00000e-02 *
       9.8839)
460 tensor(1.00000e-02 *
       9.6892)
461 tensor(1.00000e-02 *
       9.4991)
462 tensor(1.00000e-02 *
       9.3121)
463 tensor(1.00000e-02 *
       9.1276)
464 tensor(1.00000e-02 *
       8.9494)
465 tensor(1.00000e-02 *
       8.7716)
466 tensor(1.00000e-02 *
       8.5993)
467 tensor(1.00000e-02 *
       8.4299)
468 tensor(1.00000e-02 *
       8.2632)
469 tensor(1.00000e-02 *
     

In [None]:
import torch

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(size_average=False)

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algoriths. The first argument to the Adam constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers( i.e, not overwritten) whenever .backward()
    # is called. Checkout docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()